From: elserj Date: Mon, 5 Apr 2010 20:12:23 +0000 (+0000) Subject: Updated version of supercluster to do majority voting X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=41cf2acca4cfa1332936e8ce6ba3ff0035eb42f7;p=old-jaiswallab-svn%2F.git Updated version of supercluster to do majority voting svn path=/; revision=14 --- diff --git a/interactome_scripts/supercluster.pl b/interactome_scripts/supercluster.pl index 5a5a1f8..bef30d9 100755 --- a/interactome_scripts/supercluster.pl +++ b/interactome_scripts/supercluster.pl @@ -1,5 +1,24 @@ #!/usr/bin/perl + + +############################################################# +# Written by Justin Elser, BPP, OSU 12/15/09 # +# # +# Initial version seemed to work but assumed that only # +# matching the first cluster was necessary. # +# # +# Version 0.9 # +# initial version # +# Version 0.91 # +# attempt to fix the assumption problem by using the # +# most popular rather than the first one found # +# Version 0.92 (started 3/17/10) # +# reblast questionable clusters to get accurate # +# assignment of cluster +# # +############################################################# + use warnings; use strict; @@ -19,10 +38,18 @@ my $screen = Term::Screen::ReadLine->new(); # ask for password, replace character presses with stars $screen->at(1,0)->puts("Password: "); my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1); + + # Ask for the minimum score for the paralogy cutoff + $screen->at(2,0)->puts("Minimum paralogy score: "); + my $min_score = $screen->readline(ROW => 2, COL => 24); - $screen->at(2,0); + $screen->at(3,0); undef $screen; +if($min_score eq "") { + $min_score = 0; +} + my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password, { RaiseError=> 1, AutoCommit=>1 } ) or die "Failed to connect to database: $DBI::errstr"; @@ -43,24 +70,34 @@ $dbh->do("CREATE TABLE $safe_super_table ( `gene` VARCHAR( 255 ) NOT NULL , UNIQUE ( `gene` ) ) TYPE = MYISAM"); + + +#$dbh->do("CREATE TABLE $safe_super_table ( +# `super_id` INT( 11 ) NOT NULL , +# `gene` VARCHAR( 255 ) NOT NULL , +# UNIQUE ( `gene` ) +# ) TYPE = MYISAM"); + my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)"); +#my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, gene) values (?,?)"); my $super_id = 0; #initialize the super cluster id my %super_hash; +my %big_count_hash; -#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy"); -#my @species = ("Maize", "Oryza_sativa", "Ath"); -my @species = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera"); +#my @species = ("Ath", "Brachy", "Maize", "Oryza_sativa", "Sorghum"); +my @species = ("Ath", "Fragaria", "Oryza_sativa", "Vitis"); +#my @species = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis"); my $num_species = @species; -my %species_hash; - for (my $i=0; $i<$num_species-1; $i++) { for (my $j=$i+1; $j<$num_species; $j++) { # hashes to store the paralogs and orthologs my (%species_1_gene_hash, %species_2_gene_hash); # make sure the species are listed in alphabetical order to get correct table names + # since I have fixed the species naming, this should be unnecessary now. + # Confirmed with no output from print statement. my $species_1 = $species[$i]; my $species_2 = $species[$j]; if ($species_1 lt $species_2) { @@ -70,25 +107,12 @@ for (my $i=0; $i<$num_species-1; $i++) { my $spec_temp = $species_1; $species_1 = $species_2; $species_2 = $spec_temp; + print "non optimal order\n"; } - if ($species_1 eq "Glycine") { - $species_1 = "Soy"; - } - - if ($species_2 eq "Glycine") { - $species_2 = "Soy"; - } - - if ($species_1 eq "Vitis_vinifera") { - $species_1 = "Grape"; - } - - if ($species_2 eq "Vitis_vinifera") { - $species_2 = "Grape"; - } + # create the statement handler to pull the species-pair data my $spec_table = "$species_1" . "_" . "$species_2"; my $safe_species_table = $dbh->quote_identifier($spec_table); my $sth = $dbh->prepare("select * from $safe_species_table"); @@ -99,11 +123,15 @@ for (my $i=0; $i<$num_species-1; $i++) { # error handling, make sure the table exists if (!$rv) { + print "database table doesn't exist\n"; next; } + # fill the hashes with data from species-pair tables while (my @line = $sth->fetchrow_array()) { my ($id, $bit_score, $spec, $score, $gene) = @line; + # skip if paralogy score is not higher than the minimum set above + next if($score < $min_score); if ($id ne $id_prev) { if ($spec eq $species_1) { $species_1_gene_hash{$id} = $gene; @@ -135,40 +163,82 @@ for (my $i=0; $i<$num_species-1; $i++) { my (@spec_1_array, @spec_2_array); @spec_1_array = split " ", $species_1_gene_hash{$key}; @spec_2_array = split " ", $species_2_gene_hash{$key}; - + + # set up a hash to keep track of the counts for each cluster id to determine the most popular one + my %count_hash; # if the gene is already in a cluster, use its id # my $super_temp_id; foreach my $super_gene (@spec_1_array) { - if(defined($super_hash{$super_gene})) { - $super_temp_id = $super_hash{$super_gene}; - last; + # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on + if(defined($super_hash{$super_gene}->{'sup_clust_id'})) { + # gene already has a super id that has been counted, add one to the count for that id + if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) { + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++; + # gene has a new super id, start the count for that one + }else{ + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1; + } } } - if(!defined($super_temp_id)) { - foreach my $super_gene (@spec_2_array) { - if(defined($super_hash{$super_gene})) { - $super_temp_id = $super_hash{$super_gene}; - last; + foreach my $super_gene (@spec_2_array) { + # see above stanza + if(defined($super_hash{$super_gene})) { + if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) { + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++; + }else{ + $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1; } } } + # This section does the voting for which super id to use for the cluster for this species-pair + # + + my $prev_count = 0; + foreach my $count_key (keys %count_hash) { + if($count_hash{$count_key} > $prev_count){ + $super_temp_id = $count_key; + $prev_count = $count_hash{$count_key}; + # find the non-unanimous clusters + }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) { + #print "$key\t$count_key\t$count_hash{$count_key}\n"; + } + } + + + # if none of the genes are in a cluster already, get a new id # if(!defined($super_temp_id)) { - ++$super_id; + $super_id++; $super_temp_id = $super_id; } # build the hash and put the entries in the database foreach my $super_gene (@spec_1_array) { - $super_hash{$super_gene} = $super_temp_id; - $insert_sth->execute($super_temp_id,$species_1,$super_gene); + if(defined($big_count_hash{$super_gene})) { + $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id"; + }else{ + $big_count_hash{$super_gene} = $super_temp_id; + } + $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id; + $super_hash{$super_gene}->{'species'} = $species_1; + #print "$super_gene\t$super_temp_id\n"; + #$insert_sth->execute($super_temp_id,$species_1,$super_gene); + } foreach my $super_gene (@spec_2_array) { - $super_hash{$super_gene} = $super_temp_id; - $insert_sth->execute($super_temp_id,$species_2,$super_gene); + if(defined($big_count_hash{$super_gene})) { + $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id"; + }else{ + $big_count_hash{$super_gene} = $super_temp_id; + } + $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id; + $super_hash{$super_gene}->{'species'} = $species_2; + # print "$super_gene\t$super_temp_id\n"; + #$insert_sth->execute($super_temp_id,$species_2,$super_gene); + } } @@ -176,5 +246,41 @@ for (my $i=0; $i<$num_species-1; $i++) { } } +print "Done creating the hashes, now loading into the database\n"; + +my $gene_count_total = keys %big_count_hash; +my $loop_counter = 0; + +foreach my $gene (keys %big_count_hash) { + my @count_array = split("\t", $big_count_hash{$gene}); + my %small_count_hash; + my $gene_cluster; + + foreach my $count_id (@count_array) { + if(defined($small_count_hash{$count_id})) { + $small_count_hash{$count_id}++; + }else{ + $small_count_hash{$count_id} = 1; + } + } + my $prev_count = 0; + foreach my $small_count_key (keys %small_count_hash) { + if($small_count_hash{$small_count_key} > $prev_count) { + $gene_cluster = $small_count_key; + $prev_count = $small_count_hash{$small_count_key}; + } + } + $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster; + my $species = $super_hash{$gene}->{'species'}; + $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene); + + if($loop_counter % 10000 == 0) { + print "finished $loop_counter of $gene_count_total\n"; + } + $loop_counter++; +} +print "finished\n"; + +