#!/usr/bin/perl
#
# fasta_header_and_isoform_fix.pl
#
# Copies a FASTA file while
#   * rewriting every header line to ">" followed by the gene id that
#     find_gene() returns for that header, and
#   * dropping every record (header plus its sequence lines) whose gene id
#     matches one of the known isoform naming patterns in check_isoform().
#
# Usage: fasta_header_and_isoform_fix.pl input_file output_file
#
# find_species() and find_gene() are not defined in this file; they are
# presumably provided by find_species.pl, which is loaded at run time
# (NOTE(review): confirm both subs live in that helper).
#
# Provenance (commit this script was extracted from):
#   From:      elserj
#   Date:      Fri, 21 Apr 2017 22:33:09 +0000 (+0000)
#   Subject:   Add script that will look for known isoform patterns and
#              create a new fasta file without them
#   svn:       path=/; revision=655
#   X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=b65f9ad425a76039084f8747492e91fda7331fe9;p=old-jaiswallab-svn%2F.git
#   Path:      interactome_scripts/fasta_header_and_isoform_fix.pl
#              (new file, mode 100755, index 0000000..e9887b5)

use strict;
use warnings;

# Species of the file currently being processed.  Set by main();
# check_isoform() falls back to it when no species argument is passed.
my $species;

# Run the driver only when executed as a script, so that the subs below can
# be loaded (e.g. by a test) without touching @ARGV or the file system.
main(@ARGV) unless caller;

# main(@args)
#   Entry point.  Expects exactly two arguments: the input FASTA path and the
#   output FASTA path.  Prints a usage message and exits with status 1
#   otherwise.  Dies if the helper file or either FASTA file cannot be opened,
#   or if the output file cannot be flushed on close.
sub main {
    my @args = @_;

    if (@args != 2) {
        print STDERR
            "Usage: fasta_header_and_isoform_fix.pl input_file output_file\n";
        exit 1;
    }
    my ($input_file, $output_file) = @args;

    load_species_helpers();

    open(my $in, '<', $input_file)
        or die "Error: File $input_file can not be opened: $!";
    open(my $out, '>', $output_file)
        or die "Error: File $output_file can not be opened: $!";

    $species = find_species($input_file);

    # State carried from a header line to the sequence lines below it: while
    # it is true, every line up to the next header is skipped.
    my $is_isoform = 0;

    while (my $line = <$in>) {
        chomp $line;
        if ($line =~ /^>/) {
            my $gene = find_gene($line, $species);
            $is_isoform = check_isoform($gene);
            $line       = ">$gene";
        }
        next if $is_isoform;
        print {$out} "$line\n";
    }

    close($in);

    # Buffered write errors only surface at close time, so check it.
    close($out)
        or die "Error: File $output_file could not be written: $!";

    return;
}

# load_species_helpers()
#   Loads find_species.pl from the first location where it exists and returns
#   that path.  Dies with the list of searched locations when it is found in
#   neither, instead of failing later with "Undefined subroutine".
sub load_species_helpers {
    my $home = defined $ENV{HOME} ? $ENV{HOME} : '';
    my @candidates = (
        "$home/scripts/jaiswallab/interactome_scripts/find_species.pl",
        "$home/jaiswallab_svn/interactome_scripts/find_species.pl",
    );

    for my $path (@candidates) {
        next unless -e $path;
        require $path;
        return $path;
    }

    die "Error: find_species.pl not found; looked in:\n  "
        . join("\n  ", @candidates) . "\n";
}

# check_isoform($gene [, $species_name])
#   Returns 1 when $gene looks like a non-primary isoform id, 0 otherwise.
#
#   $gene          gene/transcript id to classify.
#   $species_name  optional; defaults to the species of the file being
#                  processed (the file-level $species).
#
#   Rules, as encoded below:
#     Zea_mays         ids ending in _T02.._T09 or _T10.._T19
#     Theobroma_cacao  ids ending in t2..t9 or t10..t19
#     Jatropha_curcas  ids ending in .20, .30, ... .90 (ids are numbered
#                      .10, .20, .30, ... so the generic rule is not applied)
#     every species except Jatropha_curcas
#                      ids ending in .2 .. .9 or .10 .. .99
#   Anything that matches none of the rules is assumed not to be an isoform.
sub check_isoform {
    my ($gene, $species_name) = @_;

    $species_name = $species unless defined $species_name;
    $species_name = ''       unless defined $species_name;
    return 0 unless defined $gene;

    # Maize gene ids are different.
    if ($species_name eq 'Zea_mays') {
        return 1 if $gene =~ /_T(?:0[2-9]|1[0-9])$/;
    }

    # So are Theobroma ids.
    if ($species_name eq 'Theobroma_cacao') {
        return 1 if $gene =~ /t(?:[2-9]|1[0-9])$/;
    }

    # Jatropha: every suffix is anchored to the end of the id, so an id that
    # merely contains ".30" somewhere in the middle is not flagged.
    if ($species_name eq 'Jatropha_curcas') {
        return $gene =~ /\.[2-9]0$/ ? 1 : 0;
    }

    # All other cases (also reached by maize/Theobroma ids that did not match
    # their species-specific pattern above).
    return 1 if $gene =~ /\.(?:[2-9]|[1-9][0-9])$/;

    # If not found above, assume not an isoform.
    return 0;
}

1;