Add weighted version of supercluster program

author elserj <elserj@localhost>

Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)

committer elserj <elserj@localhost>

Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)
author elserj <elserj@localhost>
Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)
committer elserj <elserj@localhost>
Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)
diff --git a/interactome_scripts/supercluster_weighted.pl b/interactome_scripts/supercluster_weighted.pl

new file mode 100755 (executable)

index 0000000..12e3444
--- /dev/null
+++ b/interactome_scripts/supercluster_weighted.pl
@@ -0,0 +1,297 @@
+#!/usr/bin/perl
+
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09                #
+#                                                           #
+#  Initial version seemed to work but assumed that only     #
+#   matching the first cluster was necessary.               #
+#                                                           #
+#  Version 0.9                                              #
+#      initial version                                      #
+#  Version 0.91                                             #
+#      attempt to fix the assumption problem by using the   #
+#        most popular rather than the first one found       #
+#  Version 0.92 (started 3/17/10)                           #
+#      reblast questionable clusters to get accurate        #
+#        assignment of cluster
+#                                                           #
+#############################################################
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::ReadKey;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+       require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}elsif(-e "$ENV{HOME}/bin/find_species.pl") {
+       require "$ENV{HOME}/bin/find_species.pl";
+}
+
+# check for arguments and explain usage
+if ($#ARGV !=0) {
+       print "usage: supercluster.pl species_list_file\n";
+       exit;
+}
+
+my $spec_file = $ARGV[0];
+
+my $dbh = DbiFloret::dbconnect;
+       
+print "Minimum paralogy score: ";
+chomp(my $min_score = <STDIN>);
+       
+if($min_score eq "") {
+       $min_score = 0;
+}
+
+my @species;
+my %genus_weight;
+
+open(SPECFILE, "$spec_file");
+
+while(<SPECFILE>) {
+               my $line = $_;
+               chomp $line;
+               push (@species, $line);
+               my ($genus, $sub_species) = split("_", $line);
+               if(defined($genus_weight{$genus})){
+                               $genus_weight{$genus}++;
+               }else{
+                               $genus_weight{$genus} = 1;
+               }
+}
+
+       
+#### Note to self ####
+## To not have multiple values in mysql table, use insert ignore instead of insert.  Or use replace.
+### also note that for insert ignore to work, must have a "unique" field
+
+# make the new table to hold the super clusters
+my $super_table = "super_clust_weighted";
+my $safe_super_table = $dbh->quote_identifier($super_table);
+
+$dbh->do("drop table if exists $safe_super_table");
+$dbh->do("CREATE TABLE $safe_super_table (
+       `super_id` INT( 11 ) NOT NULL ,
+       `species` VARCHAR( 255 ) NOT NULL ,
+       `gene` VARCHAR( 255 ) NOT NULL ,
+       UNIQUE ( `gene` )
+       ) TYPE = MYISAM");
+       
+       
+my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+
+my $super_id = 0; #initialize the super cluster id
+my %super_hash;
+my %big_count_hash;
+       
+my $num_species = @species;
+
+for (my $i=0; $i<$num_species-1; $i++) {
+       print "Working on species: $species[$i]\n";
+       
+       for (my $j=$i+1; $j<$num_species; $j++) {
+               # hashes to store the paralogs and orthologs
+               my (%species_1_gene_hash, %species_2_gene_hash);
+
+               # make sure the species are listed in alphabetical order to get correct table names
+               #  since I have fixed the species naming, this should be unnecessary now.  
+               #    Confirmed with no output from print statement.
+               my $species_1 = $species[$i];
+               my $species_2 = $species[$j];
+       #       if ($species_1 lt $species_2) {
+       #               $species_1 = $species_1;
+       #               $species_2 = $species_2;
+       #       } else {
+       #               my $spec_temp = $species_1;
+       #               $species_1 = $species_2;
+       #               $species_2 = $spec_temp;
+       #               print "non optimal order\n";
+       #       }
+               
+               
+       
+               # create the statement handler to pull the species-pair data
+               my $spec_table = "$species_1" . "_" . "$species_2";
+               my $safe_species_table = $dbh->quote_identifier($spec_table);
+               my $sth = $dbh->prepare("select * from $safe_species_table");
+       
+               my $id_prev = "";
+       
+               my $rv = $sth->execute();
+               
+               # error handling, make sure the table exists
+               if (!$rv) {
+                       print "database table doesn't exist\n";
+                       next;
+               }
+       
+               # fill the hashes with data from species-pair tables
+               while (my @line = $sth->fetchrow_array()) {
+                       my ($id, $bit_score, $spec, $score, $gene) = @line;
+                       # skip if paralogy score is not higher than the minimum set above
+                       next if($score < $min_score);
+                       if ($id ne $id_prev) {
+                               if ($spec eq $species_1) {
+                                       $species_1_gene_hash{$id} = $gene;
+                               }else {
+                                       $species_2_gene_hash{$id} = $gene;
+                               }
+                       }else {
+                               if ($spec eq $species_1) {
+                                       if(defined($species_1_gene_hash{$id})) {
+                                               $species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene";
+                                       } else {
+                                               $species_1_gene_hash{$id} = $gene;
+                                       }
+                               }else {
+                                       if(defined($species_2_gene_hash{$id})) {
+                                               $species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene";
+                                       } else {
+                                               $species_2_gene_hash{$id} = $gene;
+                                       }
+                               }
+                       }
+                       $id_prev = $id;
+               }
+               $sth->finish();
+
+               # Each key defines a species pair cluster
+               foreach my $key (keys %species_1_gene_hash) {
+                       if(defined($species_2_gene_hash{$key})) {
+                               my (@spec_1_array, @spec_2_array);
+                               @spec_1_array = split " ", $species_1_gene_hash{$key};
+                               @spec_2_array = split " ", $species_2_gene_hash{$key};
+
+                               # set up a hash to keep track of the counts for each cluster id to determine the most popular one
+                               my %count_hash;
+                               # if the gene is already in a cluster, use its id #
+                               my $super_temp_id;
+                               
+                               foreach my $super_gene (@spec_1_array) {
+                                       # get the genus from the species name
+                                       my ($genus, $sub_species) = split("_", $species_1);
+                                       # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+                                       if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+                                               # gene already has a super id that has been counted, add one to the count for that id
+                                               if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+                                               # gene has a new super id, start the count for that one
+                                               }else{
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+                                               }
+                                       }
+                               }
+                               
+                               foreach my $super_gene (@spec_2_array) {
+                                       # get the genus from the species name
+                                       my ($genus, $sub_species) = split("_", $species_2);
+                                       # see above stanza
+                                       if(defined($super_hash{$super_gene})) {
+                                               if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+                                               }else{
+                                                       $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+                                               }
+                                       }
+                               }
+                               
+                               # This section does the voting for which super id to use for the cluster for this species-pair
+                               #  
+                               
+                               my $prev_count = 0;
+                               foreach my $count_key (keys %count_hash) {
+                                       if($count_hash{$count_key} > $prev_count){
+                                               $super_temp_id = $count_key;
+                                               $prev_count = $count_hash{$count_key};
+                                       # find the non-unanimous clusters
+                                       }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+                                               #print "$key\t$count_key\t$count_hash{$count_key}\n";
+                                       }
+                               }
+                               
+
+                               
+                               # if none of the genes are in a cluster already, get a new id #
+                               if(!defined($super_temp_id)) {
+                                       $super_id++;
+                                       $super_temp_id = $super_id;
+                               }
+                               
+                               # build the hash and put the entries in the database
+                               foreach my $super_gene (@spec_1_array) {
+                                       if(defined($big_count_hash{$super_gene})) {
+                                               $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+                                       }else{
+                                               $big_count_hash{$super_gene} = $super_temp_id;
+                                       }
+                                       $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+                                       $super_hash{$super_gene}->{'species'} = $species_1;
+                                       #print "$super_gene\t$super_temp_id\n";
+                                       #$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
+                               }
+                               foreach my $super_gene (@spec_2_array) {
+                                       if(defined($big_count_hash{$super_gene})) {
+                                               $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+                                       }else{
+                                               $big_count_hash{$super_gene} = $super_temp_id;
+                                       }
+                                       $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+                                       $super_hash{$super_gene}->{'species'} = $species_2;
+                               #       print "$super_gene\t$super_temp_id\n";
+                                       #$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+                                       
+                               }
+                       
+                       }
+               }
+               
+       }
+}
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+       my @count_array = split("\t", $big_count_hash{$gene});
+       my %small_count_hash;
+       my $gene_cluster;
+       
+       foreach my $count_id (@count_array) {
+               if(defined($small_count_hash{$count_id})) {
+                       $small_count_hash{$count_id}++;
+               }else{
+                       $small_count_hash{$count_id} = 1;
+               }
+       }
+       my $prev_count = 0;
+       foreach my $small_count_key (keys %small_count_hash) {
+               if($small_count_hash{$small_count_key} > $prev_count) {
+                       $gene_cluster = $small_count_key;
+                       $prev_count = $small_count_hash{$small_count_key};
+               }
+       }
+       $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+       my $species = $super_hash{$gene}->{'species'};
+       $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+       
+       if($loop_counter % 10000 == 0) {
+               print "finished $loop_counter of $gene_count_total\n";
+       }
+       $loop_counter++;
+}
+print "finished\n";
+
+
+
+
author	elserj <elserj@localhost>
	Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)
committer	elserj <elserj@localhost>
	Fri, 9 Dec 2016 17:27:32 +0000 (17:27 +0000)