From: elserj <elserj@localhost>
Date: Fri, 9 Dec 2016 17:27:32 +0000 (+0000)
Subject: Add weighted version of supercluster program
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=a141dc7062f6322c34575bbcab3d3d8c7dbb773a;p=old-jaiswallab-svn%2F.git

Add weighted version of supercluster program

svn path=/; revision=651
---

diff --git a/interactome_scripts/supercluster_weighted.pl b/interactome_scripts/supercluster_weighted.pl
new file mode 100755
index 0000000..12e3444
--- /dev/null
+++ b/interactome_scripts/supercluster_weighted.pl
@@ -0,0 +1,297 @@
+#!/usr/bin/perl
+
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09                #
+#                                                           #
+#  Initial version seemed to work but assumed that only     #
+#   matching the first cluster was necessary.               #
+#                                                           #
+#  Version 0.9                                              #
+#      initial version                                      #
+#  Version 0.91                                             #
+#      attempt to fix the assumption problem by using the   #
+#        most popular rather than the first one found       #
+#  Version 0.92 (started 3/17/10)                           #
+#      reblast questionable clusters to get accurate        #
+#        assignment of cluster
+#                                                           #
+#############################################################
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::ReadKey;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}elsif(-e "$ENV{HOME}/bin/find_species.pl") {
+	require "$ENV{HOME}/bin/find_species.pl";
+}
+
+# check for arguments and explain usage
+if ($#ARGV !=0) {
+	print "usage: supercluster.pl species_list_file\n";
+	exit;
+}
+
+my $spec_file = $ARGV[0];
+
+my $dbh = DbiFloret::dbconnect;
+	
+print "Minimum paralogy score: ";
+chomp(my $min_score = <STDIN>);
+	
+if($min_score eq "") {
+	$min_score = 0;
+}
+
+my @species;
+my %genus_weight;
+
+open(SPECFILE, "$spec_file");
+
+while(<SPECFILE>) {
+		my $line = $_;
+		chomp $line;
+		push (@species, $line);
+		my ($genus, $sub_species) = split("_", $line);
+		if(defined($genus_weight{$genus})){
+				$genus_weight{$genus}++;
+		}else{
+				$genus_weight{$genus} = 1;
+		}
+}
+
+	
+#### Note to self ####
+## To not have multiple values in mysql table, use insert ignore instead of insert.  Or use replace.
+### also note that for insert ignore to work, must have a "unique" field
+
+# make the new table to hold the super clusters
+my $super_table = "super_clust_weighted";
+my $safe_super_table = $dbh->quote_identifier($super_table);
+
+$dbh->do("drop table if exists $safe_super_table");
+$dbh->do("CREATE TABLE $safe_super_table (
+	`super_id` INT( 11 ) NOT NULL ,
+	`species` VARCHAR( 255 ) NOT NULL ,
+	`gene` VARCHAR( 255 ) NOT NULL ,
+	UNIQUE ( `gene` )
+	) TYPE = MYISAM");
+	
+	
+my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+
+my $super_id = 0; #initialize the super cluster id
+my %super_hash;
+my %big_count_hash;
+	
+my $num_species = @species;
+
+for (my $i=0; $i<$num_species-1; $i++) {
+	print "Working on species: $species[$i]\n";
+	
+	for (my $j=$i+1; $j<$num_species; $j++) {
+		# hashes to store the paralogs and orthologs
+		my (%species_1_gene_hash, %species_2_gene_hash);
+
+		# make sure the species are listed in alphabetical order to get correct table names
+		#  since I have fixed the species naming, this should be unnecessary now.  
+		#    Confirmed with no output from print statement.
+		my $species_1 = $species[$i];
+		my $species_2 = $species[$j];
+	#	if ($species_1 lt $species_2) {
+	#		$species_1 = $species_1;
+	#		$species_2 = $species_2;
+	#	} else {
+	#		my $spec_temp = $species_1;
+	#		$species_1 = $species_2;
+	#		$species_2 = $spec_temp;
+	#		print "non optimal order\n";
+	#	}
+		
+		
+	
+		# create the statement handler to pull the species-pair data
+		my $spec_table = "$species_1" . "_" . "$species_2";
+		my $safe_species_table = $dbh->quote_identifier($spec_table);
+		my $sth = $dbh->prepare("select * from $safe_species_table");
+	
+		my $id_prev = "";
+	
+		my $rv = $sth->execute();
+		
+		# error handling, make sure the table exists
+		if (!$rv) {
+			print "database table doesn't exist\n";
+			next;
+		}
+	
+		# fill the hashes with data from species-pair tables
+		while (my @line = $sth->fetchrow_array()) {
+			my ($id, $bit_score, $spec, $score, $gene) = @line;
+			# skip if paralogy score is not higher than the minimum set above
+			next if($score < $min_score);
+			if ($id ne $id_prev) {
+				if ($spec eq $species_1) {
+					$species_1_gene_hash{$id} = $gene;
+				}else {
+					$species_2_gene_hash{$id} = $gene;
+				}
+			}else {
+				if ($spec eq $species_1) {
+					if(defined($species_1_gene_hash{$id})) {
+						$species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene";
+					} else {
+						$species_1_gene_hash{$id} = $gene;
+					}
+				}else {
+					if(defined($species_2_gene_hash{$id})) {
+						$species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene";
+					} else {
+						$species_2_gene_hash{$id} = $gene;
+					}
+				}
+			}
+			$id_prev = $id;
+		}
+		$sth->finish();
+
+		# Each key defines a species pair cluster
+		foreach my $key (keys %species_1_gene_hash) {
+			if(defined($species_2_gene_hash{$key})) {
+				my (@spec_1_array, @spec_2_array);
+				@spec_1_array = split " ", $species_1_gene_hash{$key};
+				@spec_2_array = split " ", $species_2_gene_hash{$key};
+
+				# set up a hash to keep track of the counts for each cluster id to determine the most popular one
+				my %count_hash;
+				# if the gene is already in a cluster, use its id #
+				my $super_temp_id;
+				
+				foreach my $super_gene (@spec_1_array) {
+					# get the genus from the species name
+					my ($genus, $sub_species) = split("_", $species_1);
+					# if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+					if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+						# gene already has a super id that has been counted, add one to the count for that id
+						if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+						# gene has a new super id, start the count for that one
+						}else{
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+						}
+					}
+				}
+				
+				foreach my $super_gene (@spec_2_array) {
+					# get the genus from the species name
+					my ($genus, $sub_species) = split("_", $species_2);
+					# see above stanza
+					if(defined($super_hash{$super_gene})) {
+						if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+						}else{
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+						}
+					}
+				}
+				
+				# This section does the voting for which super id to use for the cluster for this species-pair
+				#  
+				
+				my $prev_count = 0;
+				foreach my $count_key (keys %count_hash) {
+					if($count_hash{$count_key} > $prev_count){
+						$super_temp_id = $count_key;
+						$prev_count = $count_hash{$count_key};
+					# find the non-unanimous clusters
+					}elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+						#print "$key\t$count_key\t$count_hash{$count_key}\n";
+					}
+				}
+				
+
+				
+				# if none of the genes are in a cluster already, get a new id #
+				if(!defined($super_temp_id)) {
+					$super_id++;
+					$super_temp_id = $super_id;
+				}
+				
+				# build the hash and put the entries in the database
+				foreach my $super_gene (@spec_1_array) {
+					if(defined($big_count_hash{$super_gene})) {
+						$big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+					}else{
+						$big_count_hash{$super_gene} = $super_temp_id;
+					}
+					$super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+					$super_hash{$super_gene}->{'species'} = $species_1;
+					#print "$super_gene\t$super_temp_id\n";
+					#$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
+				}
+				foreach my $super_gene (@spec_2_array) {
+					if(defined($big_count_hash{$super_gene})) {
+						$big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+					}else{
+						$big_count_hash{$super_gene} = $super_temp_id;
+					}
+					$super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+					$super_hash{$super_gene}->{'species'} = $species_2;
+				#	print "$super_gene\t$super_temp_id\n";
+					#$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+					
+				}
+			
+			}
+		}
+		
+	}
+}
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+	my @count_array = split("\t", $big_count_hash{$gene});
+	my %small_count_hash;
+	my $gene_cluster;
+	
+	foreach my $count_id (@count_array) {
+		if(defined($small_count_hash{$count_id})) {
+			$small_count_hash{$count_id}++;
+		}else{
+			$small_count_hash{$count_id} = 1;
+		}
+	}
+	my $prev_count = 0;
+	foreach my $small_count_key (keys %small_count_hash) {
+		if($small_count_hash{$small_count_key} > $prev_count) {
+			$gene_cluster = $small_count_key;
+			$prev_count = $small_count_hash{$small_count_key};
+		}
+	}
+	$super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+	my $species = $super_hash{$gene}->{'species'};
+	$insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+	
+	if($loop_counter % 10000 == 0) {
+		print "finished $loop_counter of $gene_count_total\n";
+	}
+	$loop_counter++;
+}
+print "finished\n";
+
+
+
+