#!/usr/bin/perl
+
+
+#############################################################
+# Written by Justin Elser, BPP, OSU 12/15/09 #
+# #
+# Initial version seemed to work but assumed that only #
+# matching the first cluster was necessary. #
+# #
+# Version 0.9 #
+# initial version #
+# Version 0.91 #
+# attempt to fix the assumption problem by using the #
+# most popular rather than the first one found #
+# Version 0.92 (started 3/17/10) #
+# reblast questionable clusters to get accurate #
+# assignment of cluster
+# #
+#############################################################
+
use warnings;
use strict;
# ask for password, replace character presses with stars
$screen->at(1,0)->puts("Password: ");
my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+ # Ask for the minimum score for the paralogy cutoff
+ $screen->at(2,0)->puts("Minimum paralogy score: ");
+ my $min_score = $screen->readline(ROW => 2, COL => 24);
- $screen->at(2,0);
+ $screen->at(3,0);
undef $screen;
+if($min_score eq "") {
+ $min_score = 0;
+}
+
my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
{ RaiseError=> 1, AutoCommit=>1 }
) or die "Failed to connect to database: $DBI::errstr";
`gene` VARCHAR( 255 ) NOT NULL ,
UNIQUE ( `gene` )
) TYPE = MYISAM");
+
+
+#$dbh->do("CREATE TABLE $safe_super_table (
+# `super_id` INT( 11 ) NOT NULL ,
+# `gene` VARCHAR( 255 ) NOT NULL ,
+# UNIQUE ( `gene` )
+# ) TYPE = MYISAM");
+
my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+#my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, gene) values (?,?)");
my $super_id = 0; #initialize the super cluster id
my %super_hash;
+my %big_count_hash;
-#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
-#my @species = ("Maize", "Oryza_sativa", "Ath");
-my @species = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera");
+#my @species = ("Ath", "Brachy", "Maize", "Oryza_sativa", "Sorghum");
+my @species = ("Ath", "Fragaria", "Oryza_sativa", "Vitis");
+#my @species = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis");
my $num_species = @species;
-my %species_hash;
-
for (my $i=0; $i<$num_species-1; $i++) {
for (my $j=$i+1; $j<$num_species; $j++) {
# hashes to store the paralogs and orthologs
my (%species_1_gene_hash, %species_2_gene_hash);
# make sure the species are listed in alphabetical order to get correct table names
+ # since I have fixed the species naming, this should be unnecessary now.
+ # Confirmed with no output from print statement.
my $species_1 = $species[$i];
my $species_2 = $species[$j];
if ($species_1 lt $species_2) {
my $spec_temp = $species_1;
$species_1 = $species_2;
$species_2 = $spec_temp;
+ print "non optimal order\n";
}
- if ($species_1 eq "Glycine") {
- $species_1 = "Soy";
- }
-
- if ($species_2 eq "Glycine") {
- $species_2 = "Soy";
- }
-
- if ($species_1 eq "Vitis_vinifera") {
- $species_1 = "Grape";
- }
-
- if ($species_2 eq "Vitis_vinifera") {
- $species_2 = "Grape";
- }
+ # create the statement handler to pull the species-pair data
my $spec_table = "$species_1" . "_" . "$species_2";
my $safe_species_table = $dbh->quote_identifier($spec_table);
my $sth = $dbh->prepare("select * from $safe_species_table");
# error handling, make sure the table exists
if (!$rv) {
+ print "database table doesn't exist\n";
next;
}
+ # fill the hashes with data from species-pair tables
while (my @line = $sth->fetchrow_array()) {
my ($id, $bit_score, $spec, $score, $gene) = @line;
+ # skip if paralogy score is not higher than the minimum set above
+ next if($score < $min_score);
if ($id ne $id_prev) {
if ($spec eq $species_1) {
$species_1_gene_hash{$id} = $gene;
my (@spec_1_array, @spec_2_array);
@spec_1_array = split " ", $species_1_gene_hash{$key};
@spec_2_array = split " ", $species_2_gene_hash{$key};
-
+
+ # set up a hash to keep track of the counts for each cluster id to determine the most popular one
+ my %count_hash;
# if the gene is already in a cluster, use its id #
my $super_temp_id;
foreach my $super_gene (@spec_1_array) {
- if(defined($super_hash{$super_gene})) {
- $super_temp_id = $super_hash{$super_gene};
- last;
+ # if the gene already has a super id associated to it from an earlier pairing, add it to the list to vote on
+ if(defined($super_hash{$super_gene}->{'sup_clust_id'})) {
+ # gene already has a super id that has been counted, add one to the count for that id
+ if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+ # gene has a new super id, start the count for that one
+ }else{
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
+ }
}
}
- if(!defined($super_temp_id)) {
- foreach my $super_gene (@spec_2_array) {
- if(defined($super_hash{$super_gene})) {
- $super_temp_id = $super_hash{$super_gene};
- last;
+ foreach my $super_gene (@spec_2_array) {
+ # see above stanza
+ if(defined($super_hash{$super_gene})) {
+ if(defined($count_hash{$super_hash{$super_gene}->{'sup_clust_id'}})) {
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}}++;
+ }else{
+ $count_hash{$super_hash{$super_gene}->{'sup_clust_id'}} = 1;
}
}
}
+ # This section does the voting for which super id to use for the cluster for this species-pair
+ #
+
+ my $prev_count = 0;
+ foreach my $count_key (keys %count_hash) {
+ if($count_hash{$count_key} > $prev_count){
+ $super_temp_id = $count_key;
+ $prev_count = $count_hash{$count_key};
+ # find the non-unanimous clusters
+ }elsif($count_hash{$count_key} < $prev_count && $prev_count != 0) {
+ #print "$key\t$count_key\t$count_hash{$count_key}\n";
+ }
+ }
+
+
+
# if none of the genes are in a cluster already, get a new id #
if(!defined($super_temp_id)) {
- ++$super_id;
+ $super_id++;
$super_temp_id = $super_id;
}
# build the hash and put the entries in the database
foreach my $super_gene (@spec_1_array) {
- $super_hash{$super_gene} = $super_temp_id;
- $insert_sth->execute($super_temp_id,$species_1,$super_gene);
+ if(defined($big_count_hash{$super_gene})) {
+ $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+ }else{
+ $big_count_hash{$super_gene} = $super_temp_id;
+ }
+ $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+ $super_hash{$super_gene}->{'species'} = $species_1;
+ #print "$super_gene\t$super_temp_id\n";
+ #$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+
}
foreach my $super_gene (@spec_2_array) {
- $super_hash{$super_gene} = $super_temp_id;
- $insert_sth->execute($super_temp_id,$species_2,$super_gene);
+ if(defined($big_count_hash{$super_gene})) {
+ $big_count_hash{$super_gene} = "$big_count_hash{$super_gene}\t$super_temp_id";
+ }else{
+ $big_count_hash{$super_gene} = $super_temp_id;
+ }
+ $super_hash{$super_gene}->{'sup_clust_id'} = $super_temp_id;
+ $super_hash{$super_gene}->{'species'} = $species_2;
+ # print "$super_gene\t$super_temp_id\n";
+ #$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+
}
}
}
}
+print "Done creating the hashes, now loading into the database\n";
+
+my $gene_count_total = keys %big_count_hash;
+my $loop_counter = 0;
+
+foreach my $gene (keys %big_count_hash) {
+ my @count_array = split("\t", $big_count_hash{$gene});
+ my %small_count_hash;
+ my $gene_cluster;
+
+ foreach my $count_id (@count_array) {
+ if(defined($small_count_hash{$count_id})) {
+ $small_count_hash{$count_id}++;
+ }else{
+ $small_count_hash{$count_id} = 1;
+ }
+ }
+ my $prev_count = 0;
+ foreach my $small_count_key (keys %small_count_hash) {
+ if($small_count_hash{$small_count_key} > $prev_count) {
+ $gene_cluster = $small_count_key;
+ $prev_count = $small_count_hash{$small_count_key};
+ }
+ }
+ $super_hash{$gene}->{'sup_clust_id'} = $gene_cluster;
+ my $species = $super_hash{$gene}->{'species'};
+ $insert_sth->execute($super_hash{$gene}->{'sup_clust_id'},$species,$gene);
+
+ if($loop_counter % 10000 == 0) {
+ print "finished $loop_counter of $gene_count_total\n";
+ }
+ $loop_counter++;
+}
+print "finished\n";
+
+