Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Add script that will look for known isoform patterns and create a new fasta file...
author: elserj <elserj@localhost>
Fri, 21 Apr 2017 22:33:09 +0000 (22:33 +0000)
committer: elserj <elserj@localhost>
Fri, 21 Apr 2017 22:33:09 +0000 (22:33 +0000)
svn path=/; revision=655

interactome_scripts/fasta_header_and_isoform_fix.pl [new file with mode: 0755]

diff --git a/interactome_scripts/fasta_header_and_isoform_fix.pl b/interactome_scripts/fasta_header_and_isoform_fix.pl
new file mode 100755 (executable)
index 0000000..e9887b5
--- /dev/null
@@ -0,0 +1,79 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Load find_species.pl (presumably the source of find_species() and
+# find_gene()) from the first known checkout; fail before touching any file.
+my ($helper) = grep { -e $_ } (
+       "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl",
+       "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl",
+);
+die "Error: find_species.pl not found under $ENV{HOME}\n" unless defined $helper;
+require $helper;
+
+if(@ARGV != 2) {
+       print "Usage: fasta_header_and_isoform_fix.pl input_file output_file\n";
+       exit 1;
+}
+my ($input_file, $output_file) = @ARGV;
+
+# Three-argument open on lexical handles: no character in a file name can
+# be mistaken for an open mode.
+open(my $in_fh, '<', $input_file) or die "Error: File $input_file can not be opened: $!";
+open(my $out_fh, '>', $output_file) or die "Error: File $output_file can not be opened: $!";
+
+my $species = find_species($input_file);   # also read by check_isoform()
+
+# True while inside a record whose header was flagged as an isoform, so the
+# sequence lines that follow such a header are dropped along with it.
+my $is_isoform = 0;
+
+while(my $line = <$in_fh>) {
+       chomp $line;
+       if ($line =~ /^>/) {
+               my $gene = find_gene($line,$species);
+               $is_isoform = check_isoform($gene);
+               $line = ">" . $gene;
+       }
+       next if ($is_isoform);
+       print {$out_fh} "$line\n";
+}
+close($in_fh);
+close($out_fh) or die "Error: File $output_file could not be written: $!";
+
+# Decide whether a gene id names a secondary isoform that should be skipped.
+#
+# Argument: the gene id to classify. Also reads the file-level $species.
+# Returns the string "1" for an isoform and "0" otherwise.
+#
+# Id endings treated as isoforms:
+#   Zea_mays         _T02 .. _T19
+#   Theobroma_cacao  t2 .. t19
+#   Jatropha_curcas  .20, .30, ... .90 (Jatropha ids are .10, .20, .30, ...)
+#   every species except Jatropha (maize and cacao included):
+#                    .2 .. .9 and .10 .. .99
+sub check_isoform {
+       my ($gene) = @_;
+
+       # Nothing to match against, so it cannot be a known isoform.
+       return "0" unless defined $gene;
+
+       # Species whose ids carry their own transcript suffix.
+       if ($species eq "Zea_mays") {
+               return "1" if $gene =~ /_T(?:0[2-9]|1[0-9])$/;
+       }
+       elsif ($species eq "Theobroma_cacao") {
+               return "1" if $gene =~ /t(?:[2-9]|1[0-9])$/;
+       }
+
+       if ($species eq "Jatropha_curcas") {
+               # Anchored at the end of the id: an unanchored \.30 would also
+               # match in the middle of an id such as X.305.
+               return $gene =~ /\.[2-9]0$/ ? "1" : "0";
+       }
+
+       # Generic rule, reached by maize and cacao ids that did not match
+       # their own suffix above.
+       return $gene =~ /\.(?:[2-9]|[1-9][0-9])$/ ? "1" : "0";
+}