From: elserj <elserj@localhost>
Date: Mon, 5 Apr 2010 20:12:23 +0000 (+0000)
Subject: Updated version of supercluster to do majority voting
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=41cf2acca4cfa1332936e8ce6ba3ff0035eb42f7;p=old-jaiswallab-svn%2F.git

Updated version of supercluster to do majority voting

svn path=/; revision=14
---

diff --git a/interactome_scripts/supercluster.pl b/interactome_scripts/supercluster.pl
index 5a5a1f8..bef30d9 100755
--- a/interactome_scripts/supercluster.pl
+++ b/interactome_scripts/supercluster.pl
@@ -1,5 +1,24 @@
 #!/usr/bin/perl
 
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09                #
+#                                                           #
+#  Initial version seemed to work but assumed that only     #
+#   matching the first cluster was necessary.               #
+#                                                           #
+#  Version 0.9                                              #
+#      initial version                                      #
+#  Version 0.91                                             #
+#      attempt to fix the assumption problem by using the   #
+#        most popular rather than the first one found       #
+#  Version 0.92 (started 3/17/10)                           #
+#      reblast questionable clusters to get accurate        #
+#        assignment of cluster
+#                                                           #
+#############################################################
+
 use warnings;
 use strict;
 
@@ -19,10 +38,18 @@ my $screen = Term::Screen::ReadLine->new();
 	# ask for password, replace character presses with stars
 	$screen->at(1,0)->puts("Password: ");
 	my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+	
+	# Ask for the minimum score for the paralogy cutoff
+	$screen->at(2,0)->puts("Minimum paralogy score: ");
+	my $min_score = $screen->readline(ROW => 2, COL => 24);
 
-	$screen->at(2,0);
+	$screen->at(3,0);
 	undef $screen;
 	
+if($min_score eq "") {
+	$min_score = 0;
+}
+	
 my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
 	{ RaiseError=> 1, AutoCommit=>1 }
 	) or die "Failed to connect to database: $DBI::errstr";
@@ -43,24 +70,34 @@ $dbh->do("CREATE TABLE $safe_super_table (
 	`gene` VARCHAR( 255 ) NOT NULL ,
 	UNIQUE ( `gene` )
 	) TYPE = MYISAM");
+	
+	
+#$dbh->do("CREATE TABLE $safe_super_table (
+#	`super_id` INT( 11 ) NOT NULL ,
+#	`gene` VARCHAR( 255 ) NOT NULL ,
+#	UNIQUE ( `gene` )
+#	) TYPE = MYISAM");
+	
 my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+#my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, gene) values (?,?)");
 
 my $super_id = 0; #initialize the super cluster id
 my %super_hash;
+my %big_count_hash;
 	
-#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
-#my @species = ("Maize", "Oryza_sativa", "Ath");
-my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera");
+#my @species = ("Ath", "Brachy", "Maize", "Oryza_sativa", "Sorghum");
+my @species = ("Ath", "Fragaria", "Oryza_sativa", "Vitis");
+#my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis");
 my $num_species = @species;
 
-my %species_hash;
-
 for (my $i=0; $i<$num_species-1; $i++) {
 	for (my $j=$i+1; $j<$num_species; $j++) {
 		# hashes to store the paralogs and orthologs
 		my (%species_1_gene_hash, %species_2_gene_hash);
 
 		# make sure the species are listed in alphabetical order to get correct table names
+		#  since I have fixed the species naming, this should be unnecessary now.  
+		#    Confirmed with no output from print statement.
 		my $species_1 = $species[$i];
 		my $species_2 = $species[$j];
 		if ($species_1 lt $species_2) {
@@ -70,25 +107,12 @@ for (my $i=0; $i<$num_species-1; $i++) {
 			my $spec_temp = $species_1;
 			$species_1 = $species_2;
 			$species_2 = $spec_temp;
+			print "non optimal order\n";
 		}
 		
-		if ($species_1 eq "Glycine") {
-			$species_1 = "Soy";
-		}
-		
-		if ($species_2 eq "Glycine") {
-			$species_2 = "Soy";
-		}
-		
-		if ($species_1 eq "Vitis_vinifera") {
-			$species_1 = "Grape";
-		}
-		
-		if ($species_2 eq "Vitis_vinifera") {
-			$species_2 = "Grape";
-		}
 		
 	
+		# create the statement handler to pull the species-pair data
 		my $spec_table = "$species_1" . "_" . "$species_2";
 		my $safe_species_table = $dbh->quote_identifier($spec_table);
 		my $sth = $dbh->prepare("select * from $safe_species_table");
@@ -99,11 +123,15 @@ for (my $i=0; $i<$num_species-1; $i++) {
 		
 		# error handling, make sure the table exists
 		if (!$rv) {
+			print "database table doesn't exist\n";
 			next;
 		}
 	
+		# fill the hashes with data from species-pair tables
 		while (my @line = $sth->fetchrow_array()) {
 			my ($id, $bit_score, $spec, $score, $gene) = @line;
+			# skip if paralogy score is not higher than the minimum set above
+			next if($score < $min_score);
 			if ($id ne $id_prev) {
 				if ($spec eq $species_1) {
 					$species_1_gene_hash{$id} = $gene;
@@ -135,40 +163,82 @@ for (my $i=0; $i<$num_species-1; $i++) {
 				my (@spec_1_array, @spec_2_array);
 				@spec_1_array = split " ", $species_1_gene_hash{$key};
 				@spec_2_array = split " ", $species_2_gene_hash{$key};
-				
+
+				# set up a hash to keep track of the counts for each cluster id to determine the most popular one
+				my %count_hash;
 				# if the gene is already in a cluster, use its id #
 				my $super_temp_id;
 				
 				foreach my $super_gene (@spec_1_array) {
-					if(defined($super_hash{$super_gene})) {
-						$super_temp_id = $super_hash{$super_gene};
-						last;
+					# if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+					if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+						# gene already has a super id that has been counted, add one to the count for that id
+						if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+						# gene has a new super id, start the count for that one
+						}else{
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
+						}
 					}
 				}
 				
-				if(!defined($super_temp_id)) {
-					foreach my $super_gene (@spec_2_array) {
-						if(defined($super_hash{$super_gene})) {
-							$super_temp_id = $super_hash{$super_gene};
-							last;
+				foreach my $super_gene (@spec_2_array) {
+					# see above stanza
+					if(defined($super_hash{$super_gene})) {
+						if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+						}else{
+							$count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
 						}
 					}
 				}
 				
+				# This section does the voting for which super id to use for the cluster for this species-pair
+				#  
+				
+				my $prev_count = 0;
+				foreach my $count_key (keys %count_hash) {
+					if($count_hash{$count_key} > $prev_count){
+						$super_temp_id = $count_key;
+						$prev_count = $count_hash{$count_key};
+					# find the non-unanimous clusters
+					}elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+						#print "$key\t$count_key\t$count_hash{$count_key}\n";
+					}
+				}
+				
+
+				
 				# if none of the genes are in a cluster already, get a new id #
 				if(!defined($super_temp_id)) {
-					++$super_id;
+					$super_id++;
 					$super_temp_id = $super_id;
 				}
 				
 				# build the hash and put the entries in the database
 				foreach my $super_gene (@spec_1_array) {
-					$super_hash{$super_gene} = $super_temp_id;
-					$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+					if(defined($big_count_hash{$super_gene})) {
+						$big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+					}else{
+						$big_count_hash{$super_gene} = $super_temp_id;
+					}
+					$super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+					$super_hash{$super_gene}->{'species'} = $species_1;
+					#print "$super_gene\t$super_temp_id\n";
+					#$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
 				}
 				foreach my $super_gene (@spec_2_array) {
-					$super_hash{$super_gene} = $super_temp_id;
-					$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+					if(defined($big_count_hash{$super_gene})) {
+						$big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+					}else{
+						$big_count_hash{$super_gene} = $super_temp_id;
+					}
+					$super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+					$super_hash{$super_gene}->{'species'} = $species_2;
+				#	print "$super_gene\t$super_temp_id\n";
+					#$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+					
 				}
 			
 			}
@@ -176,5 +246,41 @@ for (my $i=0; $i<$num_species-1; $i++) {
 		
 	}
 }
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+	my @count_array = split("\t", $big_count_hash{$gene});
+	my %small_count_hash;
+	my $gene_cluster;
+	
+	foreach my $count_id (@count_array) {
+		if(defined($small_count_hash{$count_id})) {
+			$small_count_hash{$count_id}++;
+		}else{
+			$small_count_hash{$count_id} = 1;
+		}
+	}
+	my $prev_count = 0;
+	foreach my $small_count_key (keys %small_count_hash) {
+		if($small_count_hash{$small_count_key} > $prev_count) {
+			$gene_cluster = $small_count_key;
+			$prev_count = $small_count_hash{$small_count_key};
+		}
+	}
+	$super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+	my $species = $super_hash{$gene}->{'species'};
+	$insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+	
+	if($loop_counter % 10000 == 0) {
+		print "finished $loop_counter of $gene_count_total\n";
+	}
+	$loop_counter++;
+}
+print "finished\n";
+
+