Updated version of supercluster to do majority voting

author elserj <elserj@localhost>

Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)

committer elserj <elserj@localhost>

Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)
author elserj <elserj@localhost>
Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)
committer elserj <elserj@localhost>
Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)
diff --git a/interactome_scripts/supercluster.pl b/interactome_scripts/supercluster.pl

index 5a5a1f88f2ce5fae8866dd49176965b9823cc43c..bef30d95615e042897c3af6c6a8b6913fa8b00f0 100755 (executable)
--- a/interactome_scripts/supercluster.pl
+++ b/interactome_scripts/supercluster.pl
@@ -1,5 +1,24 @@
  #!/usr/bin/perl
  
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09                #
+#                                                           #
+#  Initial version seemed to work but assumed that only     #
+#   matching the first cluster was necessary.               #
+#                                                           #
+#  Version 0.9                                              #
+#      initial version                                      #
+#  Version 0.91                                             #
+#      attempt to fix the assumption problem by using the   #
+#        most popular rather than the first one found       #
+#  Version 0.92 (started 3/17/10)                           #
+#      reblast questionable clusters to get accurate        #
+#        assignment of cluster
+#                                                           #
+#############################################################
+
  use warnings;
  use strict;
  
@@ -19,10 +38,18 @@ my $screen = Term::Screen::ReadLine->new();
         # ask for password, replace character presses with stars
         $screen->at(1,0)->puts("Password: ");
         my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+       
+       # Ask for the minimum score for the paralogy cutoff
+       $screen->at(2,0)->puts("Minimum paralogy score: ");
+       my $min_score = $screen->readline(ROW => 2, COL => 24);
  
-       $screen->at(2,0);
+       $screen->at(3,0);
         undef $screen;
         
+if($min_score eq "") {
+       $min_score = 0;
+}
+       
  my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
         { RaiseError=> 1, AutoCommit=>1 }
         ) or die "Failed to connect to database: $DBI::errstr";
@@ -43,24 +70,34 @@ $dbh->do("CREATE TABLE $safe_super_table (
         `gene` VARCHAR( 255 ) NOT NULL ,
         UNIQUE ( `gene` )
         ) TYPE = MYISAM");
+       
+       
+#$dbh->do("CREATE TABLE $safe_super_table (
+#      `super_id` INT( 11 ) NOT NULL ,
+#      `gene` VARCHAR( 255 ) NOT NULL ,
+#      UNIQUE ( `gene` )
+#      ) TYPE = MYISAM");
+       
  my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+#my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, gene) values (?,?)");
  
  my $super_id = 0; #initialize the super cluster id
  my %super_hash;
+my %big_count_hash;
         
-#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
-#my @species = ("Maize", "Oryza_sativa", "Ath");
-my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera");
+#my @species = ("Ath", "Brachy", "Maize", "Oryza_sativa", "Sorghum");
+my @species = ("Ath", "Fragaria", "Oryza_sativa", "Vitis");
+#my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis");
  my $num_species = @species;
  
-my %species_hash;
-
  for (my $i=0; $i<$num_species-1; $i++) {
         for (my $j=$i+1; $j<$num_species; $j++) {
                 # hashes to store the paralogs and orthologs
                 my (%species_1_gene_hash, %species_2_gene_hash);
  
                 # make sure the species are listed in alphabetical order to get correct table names
+               #  since I have fixed the species naming, this should be unnecessary now.  
+               #    Confirmed with no output from print statement.
                 my $species_1 = $species[$i];
                 my $species_2 = $species[$j];
                 if ($species_1 lt $species_2) {
@@ -70,25 +107,12 @@ for (my $i=0; $i<$num_species-1; $i++) {
                         my $spec_temp = $species_1;
                         $species_1 = $species_2;
                         $species_2 = $spec_temp;
+                       print "non optimal order\n";
                 }
                 
-               if ($species_1 eq "Glycine") {
-                       $species_1 = "Soy";
-               }
-               
-               if ($species_2 eq "Glycine") {
-                       $species_2 = "Soy";
-               }
-               
-               if ($species_1 eq "Vitis_vinifera") {
-                       $species_1 = "Grape";
-               }
-               
-               if ($species_2 eq "Vitis_vinifera") {
-                       $species_2 = "Grape";
-               }
                 
         
+               # create the statement handler to pull the species-pair data
                 my $spec_table = "$species_1" . "_" . "$species_2";
                 my $safe_species_table = $dbh->quote_identifier($spec_table);
                 my $sth = $dbh->prepare("select * from $safe_species_table");
@@ -99,11 +123,15 @@ for (my $i=0; $i<$num_species-1; $i++) {
                 
                 # error handling, make sure the table exists
                 if (!$rv) {
+                       print "database table doesn't exist\n";
                         next;
                 }
         
+               # fill the hashes with data from species-pair tables
                 while (my @line = $sth->fetchrow_array()) {
                         my ($id, $bit_score, $spec, $score, $gene) = @line;
+                       # skip if paralogy score is not higher than the minimum set above
+                       next if($score < $min_score);
                         if ($id ne $id_prev) {
                                 if ($spec eq $species_1) {
                                         $species_1_gene_hash{$id} = $gene;
@@ -135,40 +163,82 @@ for (my $i=0; $i<$num_species-1; $i++) {
                                 my (@spec_1_array, @spec_2_array);
                                 @spec_1_array = split " ", $species_1_gene_hash{$key};
                                 @spec_2_array = split " ", $species_2_gene_hash{$key};
-                               
+
+                               # set up a hash to keep track of the counts for each cluster id to determine the most popular one
+                               my %count_hash;
                                 # if the gene is already in a cluster, use its id #
                                 my $super_temp_id;
                                 
                                 foreach my $super_gene (@spec_1_array) {
-                                       if(defined($super_hash{$super_gene})) {
-                                               $super_temp_id = $super_hash{$super_gene};
-                                               last;
+                                       # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+                                       if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+                                               # gene already has a super id that has been counted, add one to the count for that id
+                                               if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+                                               # gene has a new super id, start the count for that one
+                                               }else{
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
+                                               }
                                         }
                                 }
                                 
-                               if(!defined($super_temp_id)) {
-                                       foreach my $super_gene (@spec_2_array) {
-                                               if(defined($super_hash{$super_gene})) {
-                                                       $super_temp_id = $super_hash{$super_gene};
-                                                       last;
+                               foreach my $super_gene (@spec_2_array) {
+                                       # see above stanza
+                                       if(defined($super_hash{$super_gene})) {
+                                               if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+                                               }else{
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
                                                 }
                                         }
                                 }
                                 
+                               # This section does the voting for which super id to use for the cluster for this species-pair
+                               #  
+                               
+                               my $prev_count = 0;
+                               foreach my $count_key (keys %count_hash) {
+                                       if($count_hash{$count_key} > $prev_count){
+                                               $super_temp_id = $count_key;
+                                               $prev_count = $count_hash{$count_key};
+                                       # find the non-unanimous clusters
+                                       }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+                                               #print "$key\t$count_key\t$count_hash{$count_key}\n";
+                                       }
+                               }
+                               
+
+                               
                                 # if none of the genes are in a cluster already, get a new id #
                                 if(!defined($super_temp_id)) {
-                                       ++$super_id;
+                                       $super_id++;
                                         $super_temp_id = $super_id;
                                 }
                                 
                                 # build the hash and put the entries in the database
                                 foreach my $super_gene (@spec_1_array) {
-                                       $super_hash{$super_gene} = $super_temp_id;
-                                       $insert_sth->execute($super_temp_id,$species_1,$super_gene);
+                                       if(defined($big_count_hash{$super_gene})) {
+                                               $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+                                       }else{
+                                               $big_count_hash{$super_gene} = $super_temp_id;
+                                       }
+                                       $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+                                       $super_hash{$super_gene}->{'species'} = $species_1;
+                                       #print "$super_gene\t$super_temp_id\n";
+                                       #$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
                                 }
                                 foreach my $super_gene (@spec_2_array) {
-                                       $super_hash{$super_gene} = $super_temp_id;
-                                       $insert_sth->execute($super_temp_id,$species_2,$super_gene);
+                                       if(defined($big_count_hash{$super_gene})) {
+                                               $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+                                       }else{
+                                               $big_count_hash{$super_gene} = $super_temp_id;
+                                       }
+                                       $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+                                       $super_hash{$super_gene}->{'species'} = $species_2;
+                               #       print "$super_gene\t$super_temp_id\n";
+                                       #$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+                                       
                                 }
                         
                         }
@@ -176,5 +246,41 @@ for (my $i=0; $i<$num_species-1; $i++) {
                 
         }
  }
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+       my @count_array = split("\t", $big_count_hash{$gene});
+       my %small_count_hash;
+       my $gene_cluster;
+       
+       foreach my $count_id (@count_array) {
+               if(defined($small_count_hash{$count_id})) {
+                       $small_count_hash{$count_id}++;
+               }else{
+                       $small_count_hash{$count_id} = 1;
+               }
+       }
+       my $prev_count = 0;
+       foreach my $small_count_key (keys %small_count_hash) {
+               if($small_count_hash{$small_count_key} > $prev_count) {
+                       $gene_cluster = $small_count_key;
+                       $prev_count = $small_count_hash{$small_count_key};
+               }
+       }
+       $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+       my $species = $super_hash{$gene}->{'species'};
+       $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+       
+       if($loop_counter % 10000 == 0) {
+               print "finished $loop_counter of $gene_count_total\n";
+       }
+       $loop_counter++;
+}
+print "finished\n";
+
+
author	elserj <elserj@localhost>
	Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)
committer	elserj <elserj@localhost>
	Mon, 5 Apr 2010 20:12:23 +0000 (20:12 +0000)