From 305fcbdcdbe3497b48be5f81a1e26f719701bd16 Mon Sep 17 00:00:00 2001 From: elserj Date: Mon, 27 Apr 2015 23:33:30 +0000 Subject: [PATCH] New script that creates a fasta file with no isoforms. To be used in CAFE pipeline. svn path=/; revision=622 --- .../make_fasta_from_fasta_with_no_isoforms.pl | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100755 interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl diff --git a/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl b/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl new file mode 100755 index 0000000..ad93f80 --- /dev/null +++ b/interactome_scripts/make_fasta_from_fasta_with_no_isoforms.pl @@ -0,0 +1,85 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +## Written by Justin Elser +# 10/20/14 +# +# Takes a fasta file and a list of genes to ignore and creates +# a new fasta file with the longest isoform only (if known) + +if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") { + require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl"; +}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") { + require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl"; +} + +if ($#ARGV != 1) { + print "usage: make_fasta_from_fasta_with_no_isoforms.pl input_fasta output_fasta\n"; + exit; +} + +my $infile = $ARGV[0]; +my $outfile = $ARGV[1]; + +my $species = find_species($infile); + + +# Read in the fasta file and put in a hash +my $line_is_header = 0; +my %gene_hash; + +my $gene_id = ""; +my $prev_gene_id = ""; + +my $seq; + +open(INFILE, "$infile") or die "Error opening input file!\n"; + + +while(my $line = ) { + # match the > at beginning of lines + if ($line =~ /^>/) { + chomp ($line); + # $line =~ s/>//; # remove the > from the line + $prev_gene_id = $gene_id; + $line_is_header = 1; + $gene_id = find_gene($line,$species); + } else { + $seq .= $line; + $line_is_header = 0; + } + + if ($line_is_header) { + if ($prev_gene_id ne "") { + # Since we are now on the next gene, we need to + # put the previous gene in the hash + + if($prev_gene_id =~ /\.[1-9][0-9]$/ || $prev_gene_id =~/\.[2-9]$/) { + $seq = ""; + next; + } + + if($species eq "Zea_mays" && $prev_gene_id =~ /\_T[0-9][02-9]$/) { + $seq = ""; + next; + } + + + $gene_hash{$prev_gene_id} = $seq; + + # IMPORTANT: reset $seq to empty for the next gene + $seq = ""; + } + } +} + +$gene_hash{$gene_id} = $seq; + +close(INFILE); + +open(OUTPUT, ">$outfile"); +foreach my $gene (keys %gene_hash) { + print OUTPUT ">$species|$gene\n$gene_hash{$gene}"; +} -- 2.34.1