From 305fcbdcdbe3497b48be5f81a1e26f719701bd16 Mon Sep 17 00:00:00 2001
From: elserj <elserj@localhost>
Date: Mon, 27 Apr 2015 23:33:30 +0000
Subject: [PATCH] New script that creates a fasta file with no isoforms.  To be
 used in CAFE pipeline.

svn path=/; revision=622
---
 .../make_fasta_from_fasta_with_no_isoforms.pl | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100755 interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl
diff --git a/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl b/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl
new file mode 100755
index 0000000..ad93f80
--- /dev/null
+++ b/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+##  Written by Justin Elser
+#  10/20/14
+#
+#  Takes a fasta file and a list of genes to ignore and creates 
+#    a new fasta file with the longest isoform only (if known)
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+}
+
+if ($#ARGV != 1) {
+		print "usage: make_fasta_from_fasta_with_no_isoforms.pl input_fasta output_fasta\n";
+		exit;
+}
+
+my $infile = $ARGV[0];
+my $outfile = $ARGV[1];
+
+my $species = find_species($infile);
+
+
+# Read in the fasta file and put in a hash
+my $line_is_header = 0; 
+my %gene_hash; 
+ 
+my $gene_id = "";
+my $prev_gene_id = "";
+ 
+my $seq;
+
+open(INFILE, "$infile") or die "Error opening input file!\n";
+
+ 
+while(my $line = <INFILE>) {
+        # match the > at beginning of lines
+        if ($line =~ /^>/) {  
+        		chomp ($line);
+               # $line =~ s/>//; # remove the > from the line
+                $prev_gene_id = $gene_id; 
+                $line_is_header = 1; 
+                $gene_id = find_gene($line,$species);
+        } else {
+                $seq .= $line;
+                $line_is_header = 0;
+        }
+ 
+        if ($line_is_header) {
+                if ($prev_gene_id ne "") {
+                        # Since we are now on the next gene, we need to
+                        # put the previous gene in the hash
+                        
+                        if($prev_gene_id =~ /\.[1-9][0-9]$/ || $prev_gene_id =~/\.[2-9]$/) {
+                        		$seq = "";
+                        		next;
+                        }
+                        
+                        if($species eq "Zea_mays" && $prev_gene_id =~ /\_T[0-9][02-9]$/) {
+                        		$seq = "";
+                        		next;
+                        }
+                        
+                        
+						$gene_hash{$prev_gene_id} = $seq;
+ 
+                        # IMPORTANT: reset $seq to empty for the next gene    
+                        $seq = "";
+                }
+        }
+}
+
+$gene_hash{$gene_id} = $seq;
+
+close(INFILE);
+
+open(OUTPUT, ">$outfile");
+foreach my $gene (keys %gene_hash) {
+		print OUTPUT ">$species|$gene\n$gene_hash{$gene}";
+}
-- 
2.34.1