--- /dev/null
+#!/usr/bin/perl
+
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09 #
+# #
+# Initial version seemed to work but assumed that only #
+# matching the first cluster was necessary. #
+# #
+# Version 0.9 #
+# initial version #
+# Version 0.91 #
+# attempt to fix the assumption problem by using the #
+# most popular rather than the first one found #
+# Version 0.92 (started 3/17/10) #
+# reblast questionable clusters to get accurate #
+# assignment of cluster
+# #
+#############################################################
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::ReadKey;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+ require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}elsif(-e "$ENV{HOME}/bin/find_species.pl") {
+ require "$ENV{HOME}/bin/find_species.pl";
+}
+
+# check for arguments and explain usage
+if ($#ARGV !=0) {
+ print "usage: supercluster.pl species_list_file\n";
+ exit;
+}
+
+my $spec_file = $ARGV[0];
+
+my $dbh = DbiFloret::dbconnect;
+
+print "Minimum paralogy score: ";
+chomp(my $min_score = <STDIN>);
+
+if($min_score eq "") {
+ $min_score = 0;
+}
+
+my @species;
+my %genus_weight;
+
+open(SPECFILE, "$spec_file");
+
+while(<SPECFILE>) {
+ my $line = $_;
+ chomp $line;
+ push (@species, $line);
+ my ($genus, $sub_species) = split("_", $line);
+ if(defined($genus_weight{$genus})){
+ $genus_weight{$genus}++;
+ }else{
+ $genus_weight{$genus} = 1;
+ }
+}
+
+
+#### Note to self ####
+## To not have multiple values in mysql table, use insert ignore instead of insert. Or use replace.
+### also note that for insert ignore to work, must have a "unique" field
+
+# make the new table to hold the super clusters
+my $super_table = "super_clust_weighted";
+my $safe_super_table = $dbh->quote_identifier($super_table);
+
+$dbh->do("drop table if exists $safe_super_table");
+$dbh->do("CREATE TABLE $safe_super_table (
+ `super_id` INT( 11 ) NOT NULL ,
+ `species` VARCHAR( 255 ) NOT NULL ,
+ `gene` VARCHAR( 255 ) NOT NULL ,
+ UNIQUE ( `gene` )
+ ) TYPE = MYISAM");
+
+
+my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+
+my $super_id = 0; #initialize the super cluster id
+my %super_hash;
+my %big_count_hash;
+
+my $num_species = @species;
+
+for (my $i=0; $i<$num_species-1; $i++) {
+ print "Working on species: $species[$i]\n";
+
+ for (my $j=$i+1; $j<$num_species; $j++) {
+ # hashes to store the paralogs and orthologs
+ my (%species_1_gene_hash, %species_2_gene_hash);
+
+ # make sure the species are listed in alphabetical order to get correct table names
+ # since I have fixed the species naming, this should be unnecessary now.
+ # Confirmed with no output from print statement.
+ my $species_1 = $species[$i];
+ my $species_2 = $species[$j];
+ # if ($species_1 lt $species_2) {
+ # $species_1 = $species_1;
+ # $species_2 = $species_2;
+ # } else {
+ # my $spec_temp = $species_1;
+ # $species_1 = $species_2;
+ # $species_2 = $spec_temp;
+ # print "non optimal order\n";
+ # }
+
+
+
+ # create the statement handler to pull the species-pair data
+ my $spec_table = "$species_1" . "_" . "$species_2";
+ my $safe_species_table = $dbh->quote_identifier($spec_table);
+ my $sth = $dbh->prepare("select * from $safe_species_table");
+
+ my $id_prev = "";
+
+ my $rv = $sth->execute();
+
+ # error handling, make sure the table exists
+ if (!$rv) {
+ print "database table doesn't exist\n";
+ next;
+ }
+
+ # fill the hashes with data from species-pair tables
+ while (my @line = $sth->fetchrow_array()) {
+ my ($id, $bit_score, $spec, $score, $gene) = @line;
+ # skip if paralogy score is not higher than the minimum set above
+ next if($score < $min_score);
+ if ($id ne $id_prev) {
+ if ($spec eq $species_1) {
+ $species_1_gene_hash{$id} = $gene;
+ }else {
+ $species_2_gene_hash{$id} = $gene;
+ }
+ }else {
+ if ($spec eq $species_1) {
+ if(defined($species_1_gene_hash{$id})) {
+ $species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene";
+ } else {
+ $species_1_gene_hash{$id} = $gene;
+ }
+ }else {
+ if(defined($species_2_gene_hash{$id})) {
+ $species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene";
+ } else {
+ $species_2_gene_hash{$id} = $gene;
+ }
+ }
+ }
+ $id_prev = $id;
+ }
+ $sth->finish();
+
+ # Each key defines a species pair cluster
+ foreach my $key (keys %species_1_gene_hash) {
+ if(defined($species_2_gene_hash{$key})) {
+ my (@spec_1_array, @spec_2_array);
+ @spec_1_array = split " ", $species_1_gene_hash{$key};
+ @spec_2_array = split " ", $species_2_gene_hash{$key};
+
+ # set up a hash to keep track of the counts for each cluster id to determine the most popular one
+ my %count_hash;
+ # if the gene is already in a cluster, use its id #
+ my $super_temp_id;
+
+ foreach my $super_gene (@spec_1_array) {
+ # get the genus from the species name
+ my ($genus, $sub_species) = split("_", $species_1);
+ # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+ if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+ # gene already has a super id that has been counted, add one to the count for that id
+ if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+ # gene has a new super id, start the count for that one
+ }else{
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+ }
+ }
+ }
+
+ foreach my $super_gene (@spec_2_array) {
+ # get the genus from the species name
+ my ($genus, $sub_species) = split("_", $species_2);
+ # see above stanza
+ if(defined($super_hash{$super_gene})) {
+ if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus});
+ }else{
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus});
+ }
+ }
+ }
+
+ # This section does the voting for which super id to use for the cluster for this species-pair
+ #
+
+ my $prev_count = 0;
+ foreach my $count_key (keys %count_hash) {
+ if($count_hash{$count_key} > $prev_count){
+ $super_temp_id = $count_key;
+ $prev_count = $count_hash{$count_key};
+ # find the non-unanimous clusters
+ }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+ #print "$key\t$count_key\t$count_hash{$count_key}\n";
+ }
+ }
+
+
+
+ # if none of the genes are in a cluster already, get a new id #
+ if(!defined($super_temp_id)) {
+ $super_id++;
+ $super_temp_id = $super_id;
+ }
+
+ # build the hash and put the entries in the database
+ foreach my $super_gene (@spec_1_array) {
+ if(defined($big_count_hash{$super_gene})) {
+ $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+ }else{
+ $big_count_hash{$super_gene} = $super_temp_id;
+ }
+ $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+ $super_hash{$super_gene}->{'species'} = $species_1;
+ #print "$super_gene\t$super_temp_id\n";
+ #$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
+ }
+ foreach my $super_gene (@spec_2_array) {
+ if(defined($big_count_hash{$super_gene})) {
+ $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+ }else{
+ $big_count_hash{$super_gene} = $super_temp_id;
+ }
+ $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+ $super_hash{$super_gene}->{'species'} = $species_2;
+ # print "$super_gene\t$super_temp_id\n";
+ #$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+
+ }
+
+ }
+ }
+
+ }
+}
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+ my @count_array = split("\t", $big_count_hash{$gene});
+ my %small_count_hash;
+ my $gene_cluster;
+
+ foreach my $count_id (@count_array) {
+ if(defined($small_count_hash{$count_id})) {
+ $small_count_hash{$count_id}++;
+ }else{
+ $small_count_hash{$count_id} = 1;
+ }
+ }
+ my $prev_count = 0;
+ foreach my $small_count_key (keys %small_count_hash) {
+ if($small_count_hash{$small_count_key} > $prev_count) {
+ $gene_cluster = $small_count_key;
+ $prev_count = $small_count_hash{$small_count_key};
+ }
+ }
+ $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+ my $species = $super_hash{$gene}->{'species'};
+ $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+
+ if($loop_counter % 10000 == 0) {
+ print "finished $loop_counter of $gene_count_total\n";
+ }
+ $loop_counter++;
+}
+print "finished\n";
+
+
+
+