From: elserj Date: Fri, 9 Dec 2016 17:27:32 +0000 (+0000) Subject: Add weighted version of supercluster program X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=a141dc7062f6322c34575bbcab3d3d8c7dbb773a;p=old-jaiswallab-svn%2F.git Add weighted version of supercluster program svn path=/; revision=651 --- diff --git a/interactome_scripts/supercluster_weighted.pl b/interactome_scripts/supercluster_weighted.pl new file mode 100755 index 0000000..12e3444 --- /dev/null +++ b/interactome_scripts/supercluster_weighted.pl @@ -0,0 +1,297 @@ +#!/usr/bin/perl + + + +############################################################# +# Written by Justin Elser, BPP, OSU 12/15/09 # +# # +# Initial version seemed to work but assumed that only # +# matching the first cluster was necessary. # +# # +# Version 0.9 # +# initial version # +# Version 0.91 # +# attempt to fix the assumption problem by using the # +# most popular rather than the first one found # +# Version 0.92 (started 3/17/10) # +# reblast questionable clusters to get accurate # +# assignment of cluster +# # +############################################################# + +use warnings; +use strict; + +use DBI; +use Term::ReadKey; + +use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts"; + +use DbiFloret; + +if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") { + require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl"; +}elsif(-e "$ENV{HOME}/bin/find_species.pl") { + require "$ENV{HOME}/bin/find_species.pl"; +} + +# check for arguments and explain usage +if ($#ARGV !=0) { + print "usage: supercluster.pl species_list_file\n"; + exit; +} + +my $spec_file = $ARGV[0]; + +my $dbh = DbiFloret::dbconnect; + +print "Minimum paralogy score: "; +chomp(my $min_score = ); + +if($min_score eq "") { + $min_score = 0; +} + +my @species; +my %genus_weight; + +open(SPECFILE, "$spec_file"); + +while() { + my $line = $_; + chomp $line; + push (@species, $line); + my ($genus, $sub_species) = split("_", $line); + if(defined($genus_weight{$genus})){ + $genus_weight{$genus}++; + }else{ + $genus_weight{$genus} = 1; + } +} + + +#### Note to self #### +## To not have multiple values in mysql table, use insert ignore instead of insert. Or use replace. +### also note that for insert ignore to work, must have a "unique" field + +# make the new table to hold the super clusters +my $super_table = "super_clust_weighted"; +my $safe_super_table = $dbh->quote_identifier($super_table); + +$dbh->do("drop table if exists $safe_super_table"); +$dbh->do("CREATE TABLE $safe_super_table ( + `super_id` INT( 11 ) NOT NULL , + `species` VARCHAR( 255 ) NOT NULL , + `gene` VARCHAR( 255 ) NOT NULL , + UNIQUE ( `gene` ) + ) TYPE = MYISAM"); + + +my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)"); + +my $super_id = 0; #initialize the super cluster id +my %super_hash; +my %big_count_hash; + +my $num_species = @species; + +for (my $i=0; $i<$num_species-1; $i++) { + print "Working on species: $species[$i]\n"; + + for (my $j=$i+1; $j<$num_species; $j++) { + # hashes to store the paralogs and orthologs + my (%species_1_gene_hash, %species_2_gene_hash); + + # make sure the species are listed in alphabetical order to get correct table names + # since I have fixed the species naming, this should be unnecessary now. + # Confirmed with no output from print statement. + my $species_1 = $species[$i]; + my $species_2 = $species[$j]; + # if ($species_1 lt $species_2) { + # $species_1 = $species_1; + # $species_2 = $species_2; + # } else { + # my $spec_temp = $species_1; + # $species_1 = $species_2; + # $species_2 = $spec_temp; + # print "non optimal order\n"; + # } + + + + # create the statement handler to pull the species-pair data + my $spec_table = "$species_1" . "_" . "$species_2"; + my $safe_species_table = $dbh->quote_identifier($spec_table); + my $sth = $dbh->prepare("select * from $safe_species_table"); + + my $id_prev = ""; + + my $rv = $sth->execute(); + + # error handling, make sure the table exists + if (!$rv) { + print "database table doesn't exist\n"; + next; + } + + # fill the hashes with data from species-pair tables + while (my @line = $sth->fetchrow_array()) { + my ($id, $bit_score, $spec, $score, $gene) = @line; + # skip if paralogy score is not higher than the minimum set above + next if($score < $min_score); + if ($id ne $id_prev) { + if ($spec eq $species_1) { + $species_1_gene_hash{$id} = $gene; + }else { + $species_2_gene_hash{$id} = $gene; + } + }else { + if ($spec eq $species_1) { + if(defined($species_1_gene_hash{$id})) { + $species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene"; + } else { + $species_1_gene_hash{$id} = $gene; + } + }else { + if(defined($species_2_gene_hash{$id})) { + $species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene"; + } else { + $species_2_gene_hash{$id} = $gene; + } + } + } + $id_prev = $id; + } + $sth->finish(); + + # Each key defines a species pair cluster + foreach my $key (keys %species_1_gene_hash) { + if(defined($species_2_gene_hash{$key})) { + my (@spec_1_array, @spec_2_array); + @spec_1_array = split " ", $species_1_gene_hash{$key}; + @spec_2_array = split " ", $species_2_gene_hash{$key}; + + # set up a hash to keep track of the counts for each cluster id to determine the most popular one + my %count_hash; + # if the gene is already in a cluster, use its id # + my $super_temp_id; + + foreach my $super_gene (@spec_1_array) { + # get the genus from the species name + my ($genus, $sub_species) = split("_", $species_1); + # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on + if(defined($super_hash{$super_gene}->{'sup_clust_id'})) { + # gene already has a super id that has been counted, add one to the count for that id + if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) { + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus}); + # gene has a new super id, start the count for that one + }else{ + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus}); + } + } + } + + foreach my $super_gene (@spec_2_array) { + # get the genus from the species name + my ($genus, $sub_species) = split("_", $species_2); + # see above stanza + if(defined($super_hash{$super_gene})) { + if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) { + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} += 1/($genus_weight{$genus}); + }else{ + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1/($genus_weight{$genus}); + } + } + } + + # This section does the voting for which super id to use for the cluster for this species-pair + # + + my $prev_count = 0; + foreach my $count_key (keys %count_hash) { + if($count_hash{$count_key} > $prev_count){ + $super_temp_id = $count_key; + $prev_count = $count_hash{$count_key}; + # find the non-unanimous clusters + }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) { + #print "$key\t$count_key\t$count_hash{$count_key}\n"; + } + } + + + + # if none of the genes are in a cluster already, get a new id # + if(!defined($super_temp_id)) { + $super_id++; + $super_temp_id = $super_id; + } + + # build the hash and put the entries in the database + foreach my $super_gene (@spec_1_array) { + if(defined($big_count_hash{$super_gene})) { + $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id"; + }else{ + $big_count_hash{$super_gene} = $super_temp_id; + } + $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id; + $super_hash{$super_gene}->{'species'} = $species_1; + #print "$super_gene\t$super_temp_id\n"; + #$insert_sth->execute($super_temp_id,$species_1,$super_gene); + + } + foreach my $super_gene (@spec_2_array) { + if(defined($big_count_hash{$super_gene})) { + $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id"; + }else{ + $big_count_hash{$super_gene} = $super_temp_id; + } + $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id; + $super_hash{$super_gene}->{'species'} = $species_2; + # print "$super_gene\t$super_temp_id\n"; + #$insert_sth->execute($super_temp_id,$species_2,$super_gene); + + } + + } + } + + } +} +print "Done creating the hashes, now loading into the database\n"; + +my $gene_count_total = keys %big_count_hash; +my $loop_counter = 0; + +foreach my $gene (keys %big_count_hash) { + my @count_array = split("\t", $big_count_hash{$gene}); + my %small_count_hash; + my $gene_cluster; + + foreach my $count_id (@count_array) { + if(defined($small_count_hash{$count_id})) { + $small_count_hash{$count_id}++; + }else{ + $small_count_hash{$count_id} = 1; + } + } + my $prev_count = 0; + foreach my $small_count_key (keys %small_count_hash) { + if($small_count_hash{$small_count_key} > $prev_count) { + $gene_cluster = $small_count_key; + $prev_count = $small_count_hash{$small_count_key}; + } + } + $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster; + my $species = $super_hash{$gene}->{'species'}; + $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene); + + if($loop_counter % 10000 == 0) { + print "finished $loop_counter of $gene_count_total\n"; + } + $loop_counter++; +} +print "finished\n"; + + + +