From 88fd902757880cabe55ece2549aa303d7e2ad2f8 Mon Sep 17 00:00:00 2001 From: elserj Date: Thu, 19 Feb 2015 23:06:01 +0000 Subject: [PATCH] Script that will use supercluster table to generate fasta file with only longest (or random) genes from each cluster. Useful for running bio-hal svn path=/; revision=620 --- .../make_fasta_with_no_paralogs.pl | 209 ++++++++++++++++++ 1 file changed, 209 insertions(+) create mode 100755 interactome_scripts/make_fasta_with_no_paralogs.pl diff --git a/interactome_scripts/make_fasta_with_no_paralogs.pl b/interactome_scripts/make_fasta_with_no_paralogs.pl new file mode 100755 index 0000000..add9c2e --- /dev/null +++ b/interactome_scripts/make_fasta_with_no_paralogs.pl @@ -0,0 +1,209 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts"; + +use DbiFloret; + +if ($#ARGV != 1) { + print "usage: make_fasta_with_no_paralogs.pl species_list output_file_suffix\n"; + exit; +} + +my $species_file = $ARGV[0]; +open(species_file, $species_file); +my @species; +while() { + my $line = $_; + chomp $line; + push(@species, $line); +} +close(species_file); + + +print "credentials and DB for super cluster table\n"; +my $dbh_super = DbiFloret::dbconnect; + +print "credentials and DB for protein sequences\n"; +my $dbh_sequences = DbiFloret::dbconnect; + +# get the genes for each species into a hash based on the supercluster id +my $sth_get_supers = $dbh_super->prepare("select * from super_clust where species like ?"); + +# create hash to store all genes +my %super_hash; + +foreach my $species (@species) { + print "building super hash for species $species\n"; + $sth_get_supers->execute($species); + while (my @line = $sth_get_supers->fetchrow_array) { + my ($super_id, $species, $gene) = @line; + if(defined($super_hash{$super_id})) { + my $oldhash = $super_hash{$super_id}; + $oldhash->{'species'} = "$oldhash->{'species'}\t$species"; + $oldhash->{'gene'} = "$oldhash->{'gene'}\t$gene"; + $super_hash{$super_id} = $oldhash; + }else{ + $super_hash{$super_id}->{'species'} = $species; + $super_hash{$super_id}->{'gene'} = $gene; + } + } +} + +# create hash to store gene sequences. Key will be species+super_id +my %gene_hash; + +my $counter = 0; +my $gene_hash_counter = keys %super_hash; + +foreach my $key (keys %super_hash) { + + if($counter % 1000 == 0) { + print "finished $counter of $gene_hash_counter\n"; + } + $counter ++; + + my @species_array = split ("\t", $super_hash{$key}->{'species'}); + my @gene_array = split ("\t", $super_hash{$key}->{'gene'}); + my $count = @gene_array; + + for (my $i=0; $i<$count; $i++) { + my $species = $species_array[$i]; + my $gene = $gene_array[$i]; + + my $safe_species_table = $dbh_sequences->quote_identifier($species); + my $sth_get_sequence = $dbh_sequences->prepare("select sequence from $safe_species_table where gene_id like ?"); + +# $gene =~ s/\_T01$//; +# $gene =~ s/T0(\d\d)$/0$1/; + + $sth_get_sequence->execute($gene); + + + while (my @line = $sth_get_sequence->fetchrow_array) { + my $line = $line[0]; + + my $gene_hash_key = "$species" . "zzz" . "$key"; + if(defined($gene_hash{$gene_hash_key})) { + my $oldhash = $gene_hash{$gene_hash_key}; + $oldhash->{'gene'} = "$oldhash->{'gene'}\t$gene"; + $oldhash->{'sequence'} = "$oldhash->{'sequence'}\t$line"; + my $len = length($line); + $oldhash->{'length'} = "$oldhash->{'length'}\t$len"; + $gene_hash{$gene_hash_key} = $oldhash; + }else{ + $gene_hash{$gene_hash_key}->{'gene'} = $gene; + $gene_hash{$gene_hash_key}->{'sequence'} = $line; + $gene_hash{$gene_hash_key}->{'length'} = length($line); + } + + } + } +} + +foreach my $key (keys %gene_hash) { + # get species and super_id from key + $key =~ /([\w\.]+)zzz(\w+)/; + my $species = $1; + my $super_id = $2; + + my @gene_array = split("\t", $gene_hash{$key}->{'gene'}); + my @sequence_array = split("\t", $gene_hash{$key}->{'sequence'}); + my @length_array = split("\t", $gene_hash{$key}->{'length'}); + + my $count = @gene_array; + + my $longest_gene; + my $longest_length=0; + + # get the longest gene in each species cluster + for (my $i=0; $i<$count; $i++) { + my $length = $length_array[$i]; + if($length > $longest_length) { + $longest_length = $length; + $longest_gene = $i; + } + } + + my $species_longest_filename = "$species" . "_longest_" . "$ARGV[1]"; + open(SPECIESLONGFH, ">>$species_longest_filename"); + print SPECIESLONGFH ">$gene_array[$longest_gene]\n"; + for (my $pos=0; $pos < $longest_length; $pos +=80) { + print SPECIESLONGFH substr($sequence_array[$longest_gene], $pos, 80), "\n"; + } + close(SPECIESLONGFH); + + + # do the same for a random gene in each species cluster +# my $rand = int(rand($count)); +# my $species_rand_filename = "$species" . "_random_" . "$ARGV[1]"; +# open(SPECIESRANDFH, ">>$species_rand_filename"); +# print SPECIESRANDFH ">$gene_array[$rand]\n"; +# for (my $pos=0; $pos < length($sequence_array[$rand]); $pos +=80) { +# print SPECIESRANDFH substr($sequence_array[$rand], $pos, 80), "\n"; +# } +# close(SPECIESRANDFH); +} + +# Comment this if you want the smaller sets. +exit; + +my %species_hash; +my $number_genes = 5000; # number of genes to have in each shortened file + +# Repeat it but just do the first 10,000 sequences for each species +foreach my $key (keys %gene_hash) { + # get species and super_id from key + $key =~ /(\w+)zzz(\w+)/; + my $species = $1; + my $super_id = $2; + + if(!defined($species_hash{$species})) { + $species_hash{$species} = 0; + }else{ + $species_hash{$species}++; + } + + if($species_hash{$species} < $number_genes) { + + my @gene_array = split("\t", $gene_hash{$key}->{'gene'}); + my @sequence_array = split("\t", $gene_hash{$key}->{'sequence'}); + my @length_array = split("\t", $gene_hash{$key}->{'length'}); + + my $count = @gene_array; + + my $longest_gene; + my $longest_length=0; + + # get the longest gene in each species cluster + for (my $i=0; $i<$count; $i++) { + my $length = $length_array[$i]; + if($length > $longest_length) { + $longest_length = $length; + $longest_gene = $i; + } + } + + my $species_longest_filename = "$species" . "_longest_" . "$number_genes" . "_$ARGV[1]"; + open(SPECIESLONGFH, ">>$species_longest_filename"); + print SPECIESLONGFH ">$gene_array[$longest_gene]\n"; + for (my $pos=0; $pos < $longest_length; $pos +=80) { + print SPECIESLONGFH substr($sequence_array[$longest_gene], $pos, 80), "\n"; + } + close(SPECIESLONGFH); + + + # do the same for a random gene in each species cluster + my $rand = int(rand($count)); + my $species_rand_filename = "$species" . "_random_" . "$number_genes" . "_$ARGV[1]"; + open(SPECIESRANDFH, ">>$species_rand_filename"); + print SPECIESRANDFH ">$gene_array[$rand]\n"; + for (my $pos=0; $pos < length($sequence_array[$rand]); $pos +=80) { + print SPECIESRANDFH substr($sequence_array[$rand], $pos, 80), "\n"; + } + close(SPECIESRANDFH); + } +} + -- 2.34.1