#!/usr/bin/perl
#
# remove_transposable_elements.pl
#
# Copies records from a FASTA file to an output file, keeping only those
# whose ID appears in the first list (IPR_genes) and does NOT appear in the
# second list (genes_to_remove).
#
# usage: remove_transposable_elements.pl IPR_genes genes_to_remove fasta_file output_file
#
#   IPR_genes        text file, one gene ID per line: IDs eligible to be kept
#   genes_to_remove  text file, one gene ID per line: IDs to drop
#   fasta_file       input FASTA; every "_1" is stripped from each header
#                    before the header is compared against the two lists
#   output_file      filtered FASTA (overwritten if it already exists)
#
# Provenance (from the commit that introduced this script):
#   commit  873830c2727a2b8cbae126cf82bdfacac07e60d5
#   author  elserj
#   date    Thu, 10 Mar 2011 22:52:55 +0000
#   path    interactome_scripts/remove_transposable_elements.pl
#   svn     path=/; revision=89
#   message Added script that will take the list of protein coding genes and
#           repeated genes from IPRscan and removes them from a fasta file

use strict;
use warnings;

# Run only when executed as a script, so the subs below can also be loaded
# (require/do) without side effects.
main(@ARGV) unless caller;

# main(@args)
# Entry point. Expects exactly four arguments (see usage above); prints the
# usage line to STDERR and exits with status 1 otherwise. Dies with a
# message if any file cannot be opened or closed.
sub main {
    my @args = @_;

    if (@args != 4) {
        print STDERR "usage: remove_transposable_elements.pl IPR_genes genes_to_remove fasta_file output_file\n";
        exit 1;
    }
    my ($ipr_file, $remove_file, $fasta_file, $out_file) = @args;

    my $is_ipr_gene    = read_id_set($ipr_file);
    my $is_removed     = read_id_set($remove_file);
    my ($order, $seqs) = read_fasta($fasta_file);

    write_filtered_fasta($out_file, $order, $seqs, $is_ipr_gene, $is_removed);
    return;
}

# read_id_set($path)
# Reads a one-ID-per-line text file and returns a hash ref whose keys are
# the IDs. Line endings (LF or CRLF) are removed; blank lines are skipped.
# No other trimming is done, so IDs must match the FASTA headers exactly.
sub read_id_set {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "Cannot open '$path' for reading: $!\n";

    my %is_listed;
    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;
        next if $line eq '';    # an empty ID would match header-less data
        $is_listed{$line} = 1;
    }
    close $fh or die "Error closing '$path': $!\n";

    return \%is_listed;
}

# read_fasta($path)
# Reads a FASTA file. Returns two values:
#   - array ref of record IDs in order of first appearance
#   - hash ref mapping each ID to its sequence text (line endings kept)
# Records that share an ID (after "_1" removal) are concatenated. A header
# that is not followed by any line produces no record.
sub read_fasta {
    my ($path) = @_;

    open my $fh, '<', $path
        or die "Cannot open '$path' for reading: $!\n";

    my @order;
    my %seq_for;
    my $current_id;

    while (my $line = <$fh>) {
        if ($line =~ /^>/) {
            $line =~ s/\r?\n\z//;
            $line =~ s/^>//;
            # NOTE(review): this removes EVERY "_1" in the header, not only a
            # trailing suffix (e.g. "gene_10" becomes "gene0"). Kept as-is
            # because the ID lists presumably rely on the same convention --
            # confirm before tightening the pattern.
            $line =~ s/_1//g;
            $current_id = $line;
            next;
        }

        # Sequence data before the first header belongs to no record.
        next unless defined $current_id;

        if (!exists $seq_for{$current_id}) {
            push @order, $current_id;
            $seq_for{$current_id} = '';
        }
        $seq_for{$current_id} .= $line;
    }
    close $fh or die "Error closing '$path': $!\n";

    return (\@order, \%seq_for);
}

# write_filtered_fasta($path, $order, $seq_for, $keep, $drop)
# Writes to $path, in the order given by @$order, every record whose ID is a
# key of %$keep and not a key of %$drop. Each record is ">ID\n" followed by
# its sequence text; a newline is appended when the sequence text does not
# already end with one, so consecutive records never run together.
sub write_filtered_fasta {
    my ($path, $order, $seq_for, $keep, $drop) = @_;

    open my $out, '>', $path
        or die "Cannot open '$path' for writing: $!\n";

    for my $gene (@{$order}) {
        next unless exists $keep->{$gene};
        next if exists $drop->{$gene};

        my $sequence = $seq_for->{$gene};
        $sequence .= "\n" unless $sequence =~ /\n\z/;

        print {$out} ">$gene\n", $sequence;
    }

    # Buffered write errors only surface at close time.
    close $out or die "Error closing '$path': $!\n";
    return;
}