From 88fd902757880cabe55ece2549aa303d7e2ad2f8 Mon Sep 17 00:00:00 2001
From: elserj <elserj@localhost>
Date: Thu, 19 Feb 2015 23:06:01 +0000
Subject: [PATCH] Script that will use supercluster table to generate fasta
 file with only longest (or random) genes from each cluster.  Useful for
 running bio-hal

svn path=/; revision=620
---
 .../make_fasta_with_no_paralogs.pl            | 209 ++++++++++++++++++
 1 file changed, 209 insertions(+)
 create mode 100755 interactome_scripts/make_fasta_with_no_paralogs.pl

diff --git a/interactome_scripts/make_fasta_with_no_paralogs.pl b/interactome_scripts/make_fasta_with_no_paralogs.pl
new file mode 100755
index 0000000..add9c2e
--- /dev/null
+++ b/interactome_scripts/make_fasta_with_no_paralogs.pl
@@ -0,0 +1,209 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+if ($#ARGV != 1) {
+	print "usage: make_fasta_with_no_paralogs.pl species_list output_file_suffix\n";
+	exit;
+}
+
+my $species_file = $ARGV[0];
+open(species_file, $species_file);
+my @species;
+while(<species_file>) {
+	my $line = $_;
+	chomp $line;
+	push(@species, $line);
+}
+close(species_file);
+
+
+print "credentials and DB for super cluster table\n";
+my $dbh_super = DbiFloret::dbconnect;
+
+print "credentials and DB for protein sequences\n";
+my $dbh_sequences = DbiFloret::dbconnect;
+
+# get the genes for each species into a hash based on the supercluster id
+my $sth_get_supers = $dbh_super->prepare("select * from super_clust where species like ?");
+
+# create hash to store all genes
+my %super_hash;
+
+foreach my $species (@species) {
+	print "building super hash for species $species\n";
+	$sth_get_supers->execute($species);
+	while (my @line = $sth_get_supers->fetchrow_array) {
+		my ($super_id, $species, $gene) = @line;
+		if(defined($super_hash{$super_id})) {
+			my $oldhash = $super_hash{$super_id};
+			$oldhash->{'species'} = "$oldhash->{'species'}\t$species";
+			$oldhash->{'gene'} = "$oldhash->{'gene'}\t$gene";
+			$super_hash{$super_id} = $oldhash;
+		}else{
+			$super_hash{$super_id}->{'species'} = $species;
+			$super_hash{$super_id}->{'gene'} = $gene;
+		}
+	}
+}
+
+# create hash to store gene sequences.  Key will be species+super_id
+my %gene_hash;
+
+my $counter = 0;
+my $gene_hash_counter = keys %super_hash;
+
+foreach my $key (keys %super_hash) {
+
+	if($counter % 1000 == 0) {
+		print "finished $counter of $gene_hash_counter\n";
+	}
+	$counter ++;
+
+	my @species_array = split ("\t", $super_hash{$key}->{'species'});
+	my @gene_array = split ("\t", $super_hash{$key}->{'gene'});
+	my $count = @gene_array;
+	
+	for (my $i=0; $i<$count; $i++) {
+		my $species = $species_array[$i];
+		my $gene = $gene_array[$i];
+		
+		my $safe_species_table = $dbh_sequences->quote_identifier($species);
+		my $sth_get_sequence = $dbh_sequences->prepare("select sequence from $safe_species_table where gene_id like ?");
+		
+#		$gene =~ s/\_T01$//;
+#		$gene =~ s/T0(\d\d)$/0$1/;
+		
+		$sth_get_sequence->execute($gene);
+		
+	
+		while (my @line = $sth_get_sequence->fetchrow_array) {
+			my $line = $line[0];
+			
+			my $gene_hash_key = "$species" . "zzz" . "$key";
+			if(defined($gene_hash{$gene_hash_key})) {
+				my $oldhash = $gene_hash{$gene_hash_key};
+				$oldhash->{'gene'} = "$oldhash->{'gene'}\t$gene";
+				$oldhash->{'sequence'} = "$oldhash->{'sequence'}\t$line";
+				my $len = length($line);
+				$oldhash->{'length'} = "$oldhash->{'length'}\t$len";
+				$gene_hash{$gene_hash_key} = $oldhash;
+			}else{
+				$gene_hash{$gene_hash_key}->{'gene'} = $gene;
+				$gene_hash{$gene_hash_key}->{'sequence'} = $line;
+				$gene_hash{$gene_hash_key}->{'length'} = length($line);
+			}
+			
+		}
+	}
+}
+		
+foreach my $key (keys %gene_hash) {
+	# get species and super_id from key
+	$key =~ /([\w\.]+)zzz(\w+)/;
+	my $species = $1;
+	my $super_id = $2;
+
+	my @gene_array = split("\t", $gene_hash{$key}->{'gene'});
+	my @sequence_array = split("\t", $gene_hash{$key}->{'sequence'});
+	my @length_array = split("\t", $gene_hash{$key}->{'length'});
+	
+	my $count = @gene_array;
+	
+	my $longest_gene;
+	my $longest_length=0;
+	
+	# get the longest gene in each species cluster
+	for (my $i=0; $i<$count; $i++) {
+		my $length = $length_array[$i];
+		if($length > $longest_length) {
+			$longest_length = $length;
+			$longest_gene = $i;
+		}
+	}
+
+	my $species_longest_filename = "$species" . "_longest_" . "$ARGV[1]";
+	open(SPECIESLONGFH, ">>$species_longest_filename");
+	print SPECIESLONGFH ">$gene_array[$longest_gene]\n";
+	for (my $pos=0; $pos < $longest_length; $pos +=80) {
+		print SPECIESLONGFH substr($sequence_array[$longest_gene], $pos, 80), "\n";
+	}
+	close(SPECIESLONGFH);
+	
+	
+	# do the same for a random gene in each species cluster
+#	my $rand = int(rand($count));
+#	my $species_rand_filename = "$species" . "_random_" . "$ARGV[1]";
+#	open(SPECIESRANDFH, ">>$species_rand_filename");
+#	print SPECIESRANDFH ">$gene_array[$rand]\n";
+#	for (my $pos=0; $pos < length($sequence_array[$rand]); $pos +=80) {
+#		print SPECIESRANDFH substr($sequence_array[$rand], $pos, 80), "\n";
+#	}
+#	close(SPECIESRANDFH);
+}
+
+# Comment this if you want the smaller sets.
+exit;
+
+my %species_hash;
+my $number_genes = 5000;  # number of genes to have in each shortened file
+
+# Repeat it but just do the first 10,000 sequences for each species
+foreach my $key (keys %gene_hash) {
+	# get species and super_id from key
+	$key =~ /(\w+)zzz(\w+)/;
+	my $species = $1;
+	my $super_id = $2;
+	
+	if(!defined($species_hash{$species})) {
+		$species_hash{$species} = 0;
+	}else{
+		$species_hash{$species}++;
+	}
+
+	if($species_hash{$species} < $number_genes) {
+	
+		my @gene_array = split("\t", $gene_hash{$key}->{'gene'});
+		my @sequence_array = split("\t", $gene_hash{$key}->{'sequence'});
+		my @length_array = split("\t", $gene_hash{$key}->{'length'});
+		
+		my $count = @gene_array;
+		
+		my $longest_gene;
+		my $longest_length=0;
+		
+		# get the longest gene in each species cluster
+		for (my $i=0; $i<$count; $i++) {
+			my $length = $length_array[$i];
+			if($length > $longest_length) {
+				$longest_length = $length;
+				$longest_gene = $i;
+			}
+		}
+		
+		my $species_longest_filename = "$species" . "_longest_" . "$number_genes" . "_$ARGV[1]";
+		open(SPECIESLONGFH, ">>$species_longest_filename");
+		print SPECIESLONGFH ">$gene_array[$longest_gene]\n";
+		for (my $pos=0; $pos < $longest_length; $pos +=80) {
+			print SPECIESLONGFH substr($sequence_array[$longest_gene], $pos, 80), "\n";
+		}
+		close(SPECIESLONGFH);
+		
+		
+		# do the same for a random gene in each species cluster
+		my $rand = int(rand($count));
+		my $species_rand_filename = "$species" . "_random_" . "$number_genes" . "_$ARGV[1]";
+		open(SPECIESRANDFH, ">>$species_rand_filename");
+		print SPECIESRANDFH ">$gene_array[$rand]\n";
+		for (my $pos=0; $pos < length($sequence_array[$rand]); $pos +=80) {
+			print SPECIESRANDFH substr($sequence_array[$rand], $pos, 80), "\n";
+		}
+		close(SPECIESRANDFH);
+	}
+}
+
-- 
2.34.1