Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be seen immediately.
updated versions of flowering_interaction and inparanoid_output_parse, added superclu...
author elserj <elserj@localhost>
Fri, 6 Nov 2009 18:36:24 +0000 (18:36 +0000)
committer elserj <elserj@localhost>
Fri, 6 Nov 2009 18:36:24 +0000 (18:36 +0000)
svn path=/; revision=11

interactome_scripts/flowering_interaction.pl
interactome_scripts/inparanoid_orthologs.pl [new file with mode: 0755]
interactome_scripts/inparanoid_output_parse.pl
interactome_scripts/supercluster.pl [new file with mode: 0755]

index 6b1d643abbcc3812bd12172bbcbc01c87ec541ee..703efec86768f1286a1508cd35c7b7b111bee56b 100755 (executable)
@@ -1,3 +1,281 @@
 #!/usr/bin/perl
 
+#########################################################
+#  Written by Justin Elser 4/29/2009                    #
+#                                                      #
+#      This program takes the genes known to interact  #
+#      in the long day flowering process in            #
+#      Arabidopsis and predicts the similar short day  #
+#      interaction in rice                             #
+#                                                      #
+#      Uses the orthologs first, then paralogs         #
+#                                                      #
+#########################################################
+
+
+use strict;
+use warnings;
+
+# Exactly five command-line arguments are required; anything else prints
+# the usage line and stops.
+unless (scalar(@ARGV) == 5) {
+       print "usage: flowering_interaction.pl input_file interaction_network_file ortholog_file paralog_file output_file_prefix\n";
+       exit;
+}
+
+# Positional arguments: four input files followed by the output prefix.
+my ($in_file, $inter_file, $ortho_file, $para_file, $out_prefix) = @ARGV;
+
+# Every output file name is derived from the shared prefix.
+my $out_file       = "${out_prefix}.sif";
+my $out_file_inter = "${out_prefix}_inter.sif";
+my $out_file_ortho = "${out_prefix}_ortho.sif";
+my $out_file_para  = "${out_prefix}_para.sif";
+
+# %in_hash maps each gene id (second column of the input list) to its
+# gene name (first column).
+my %in_hash;
+
+
+# Three-argument open with a lexical handle: the file name can never be
+# mistaken for an open mode or a piped command, and $! reports the reason
+# when the open fails.
+open (my $input_fh, '<', $in_file) || die "Error: file '$in_file' can not be opened: $!\n";
+while(my $entry = <$input_fh>) {
+       # strip off newline characters (handles both \n and \r\n files)
+       $entry =~ s/\r//g;
+       $entry =~ s/\n//g;
+       # split the columns into 2 separate variables (using either multiple consecutive spaces or a tab)
+       my ($gene_name, $gene_id) = split /\s{2,}|\t/, $entry;
+       # blank lines and single-column lines carry no gene id; skip them
+       # instead of storing an undefined/empty hash key
+       next if (!defined($gene_id) || $gene_id eq "");
+       $in_hash{$gene_id} = $gene_name;
+}
+close ($input_fh);
+
+# %inter_hash maps an upper-cased gene id to a hash ref with the keys
+#      'map_gene'   - the gene id itself
+#      'type'       - tab-joined interaction types, one entry per partner
+#      'inter_gene' - tab-joined partner gene ids, parallel to 'type'
+my %inter_hash;
+# Three-argument open with a lexical handle; $! reports why an open failed.
+open (my $interaction_fh, '<', $inter_file) || die "Error: file '$inter_file' can not be opened: $!\n";
+while(my $entry = <$interaction_fh>) {
+       # strip off newline characters
+       $entry =~ s/\r//g;
+       $entry =~ s/\n//g;
+       # split the columns into 3 separate variables
+       my ($gene_1, $inter_type, $gene_2) = split ("\t", $entry);
+       # blank or truncated lines have no second gene; skip them rather
+       # than recording an interaction with an undefined partner
+       next if (!defined($gene_2));
+       # change gene ids to all caps (for matching)
+       $gene_1 =~ tr/a-z/A-Z/;
+       $gene_2 =~ tr/a-z/A-Z/;
+
+       # because it doesn't matter which way the interaction goes, record it
+       # in both directions: first keyed by gene_1, then keyed by gene_2
+       foreach my $pair ([$gene_1, $gene_2], [$gene_2, $gene_1]) {
+               my ($map_gene, $partner) = @$pair;
+               if(defined($inter_hash{$map_gene})){
+                       # gene already in hash, just add to its lists
+                       my $record = $inter_hash{$map_gene};
+                       $record->{'type'} = "$record->{'type'}\t$inter_type";
+                       $record->{'inter_gene'} = "$record->{'inter_gene'}\t$partner";
+               }else{
+                       # new gene, create hash element
+                       $inter_hash{$map_gene} = {
+                               'map_gene'   => $map_gene,
+                               'type'       => $inter_type,
+                               'inter_gene' => $partner,
+                       };
+               }
+       }
+}
+close($interaction_fh);
+
+
+
+# minimum %identity an ortholog pair must reach to be kept
+my $min_ident=0;
+
+# %ortho_hash maps an upper-cased gene id (TAIR-G suffix removed) to a hash
+# ref with the keys
+#      'gene'  - the gene id itself
+#      'ortho' - tab-joined ids of all accepted orthologs
+my %ortho_hash;
+
+# Three-argument open with a lexical handle; $! reports why an open failed.
+open (my $ortholog_fh, '<', $ortho_file) || die "Error: file '$ortho_file' can not be opened: $!\n";
+while(my $entry = <$ortholog_fh>) {
+       # strip off newline characters
+       $entry =~ s/\r//g;
+       $entry =~ s/\n//g;
+       # split the columns into 3 separate variables
+       my ($gene_id, $ortho_id, $ident_id) = split("\t", $entry);
+       # a line without an identity column can never pass the threshold
+       # check below; skip it up front to avoid undefined-value warnings
+       next if (!defined($ident_id));
+       # change gene ids to all caps
+       $gene_id =~ tr/a-z/A-Z/;
+       $ortho_id =~ tr/a-z/A-Z/;
+       # strip off the TAIR-G suffix on ids
+       $gene_id =~ s/\-TAIR-G//g;
+
+       # keep the pair only if the identity column contains a digit and is
+       # at or above the minimum defined above
+       if($ident_id =~ /\d/ && $ident_id>=$min_ident){
+               if(defined($ortho_hash{$gene_id})){
+                       # gene_id already found, add the new ortholog to its list
+                       $ortho_hash{$gene_id}->{'ortho'} = "$ortho_hash{$gene_id}->{'ortho'}\t$ortho_id";
+               }else{
+                       # new gene_id, create hash element
+                       $ortho_hash{$gene_id} = {
+                               'ortho' => $ortho_id,
+                               'gene'  => $gene_id,
+                       };
+               }
+       }
+}
+close ($ortholog_fh);
+
+# %para_hash maps the ASCII-smaller gene of each paralog pair to a hash ref
+# with the keys
+#      'gene1' - that (smaller) gene id
+#      'gene2' - tab-joined ids of the larger genes paired with it
+my %para_hash;
+# Three-argument open with a lexical handle; $! reports why an open failed.
+open(my $paralog_fh, '<', $para_file) || die "Error: file '$para_file' can not be opened: $!\n";
+
+my $min_ident_para = 0; # set the threshold for paralog identities
+while(my $entry = <$paralog_fh>) {
+       # strip off newline characters
+       $entry =~ s/\r//g;
+       $entry =~ s/\n//g;
+       # split the columns into 3 separate variables
+       my ($gene_1, $gene_2, $ident) = split("\t", $entry);
+       # skip lines whose identity column is missing, empty or not a plain number
+       next if(!defined($ident));
+       # NOTE(review): this matches a literal capital "D", not the non-digit
+       # class \D - confirm which one was intended before changing it
+       next if($ident =~ /D/);
+       next if($ident eq "");
+       next if($ident =~ /\%/);
+       # change gene ids to all caps
+       $gene_1 =~ tr/a-z/A-Z/;
+       $gene_2 =~ tr/a-z/A-Z/;
+
+       # put the genes in ASCII order to help remove dupes
+       ($gene_1, $gene_2) = ($gene_2, $gene_1) if($gene_1 gt $gene_2);
+
+       # only use paralogs at or above the identity threshold
+       if($ident>=$min_ident_para) {
+               # most genes will show up many times, so append to the list kept for the key
+               if(defined($para_hash{$gene_1})) {
+                       $para_hash{$gene_1}->{'gene2'} = "$para_hash{$gene_1}->{'gene2'}\t$gene_2";
+               }else{
+                       $para_hash{$gene_1} = {
+                               'gene1' => $gene_1,
+                               'gene2' => $gene_2,
+                       };
+               }
+       }
+}
+
+close($paralog_fh);
+
+# open output files for writing; stop right away if one of them cannot be
+# created, otherwise every later print to it would be silently lost
+open(output_file, '>', $out_file) || die "Error: file '$out_file' can not be opened for writing: $!\n";
+open(output_file_inter, '>', $out_file_inter) || die "Error: file '$out_file_inter' can not be opened for writing: $!\n";
+open(output_file_ortho, '>', $out_file_ortho) || die "Error: file '$out_file_ortho' can not be opened for writing: $!\n";
+open(output_file_para, '>', $out_file_para) || die "Error: file '$out_file_para' can not be opened for writing: $!\n";
+
+
+
+# Main prediction pass.
+#   For every gene id in %in_hash that has an entry in %inter_hash:
+#     1. write each of its interactions to output_file_inter
+#     2. for each interaction, pair every ortholog of one gene with every
+#        ortholog of the other and write the pairs to output_file
+#     3. for each ortholog pair, pair the paralogs of both orthologs and
+#        write those pairs to output_file as well
+#   The ortholog and paralog relations themselves go to output_file_ortho
+#   and output_file_para.  The same line can be written many times.
+#
+# initialize the counters
+my $Ath_inter_count = 0;
+my $rice_ortho_count = 0;
+my $rice_para_count = 0;
+
+foreach my $key (keys %in_hash) {
+       # gene name from the input list (not used further in this loop)
+       my $name = $in_hash{$key};
+       
+       # find all interactions with the genes in %in_hash
+       if(defined($inter_hash{$key})) {
+               
+               # partner ids and interaction types are stored as parallel tab-joined lists
+               my @inter_array = split ("\t", $inter_hash{$key}->{'inter_gene'});
+               my @inter_type_array = split ("\t", $inter_hash{$key}->{'type'});
+               my $inter_index = @inter_type_array;
+               for (my $i=0; $i<$inter_index; $i++) {
+                       my $in_gene_1 = $key;
+                       my $in_gene_2 = $inter_array[$i];
+                       # add one to the number of Ath interactions counted
+                       $Ath_inter_count++;
+                       # output the interactions to file
+                       print output_file_inter "$in_gene_1\t$inter_type_array[$i]\t$in_gene_2\n";
+                       
+                       # make the genes be listed in ASCII order to make sorting out dupes easier
+                       if ($in_gene_1 gt $in_gene_2) { # note that this should not be used if order matters for interaction type
+                               my $temp = $in_gene_1;
+                               $in_gene_1 = $in_gene_2;
+                               $in_gene_2 = $temp;
+                       }
+                       
+                       # next step is to find all the orthologs for both genes in the interaction
+                       # (nothing is predicted unless both genes have at least one ortholog)
+                       if(exists($ortho_hash{$in_gene_2})) {
+                               my @ortho_gene_2_array = split ("\t", $ortho_hash{$in_gene_2}->{'ortho'});
+                               foreach my $ortho_gene_2 (@ortho_gene_2_array) {
+                                       # output orthologs to file
+                                       print output_file_ortho "$in_gene_2\tortho\t$ortho_gene_2\n";
+                                       
+                                       if(exists($ortho_hash{$in_gene_1})) {
+                                               my @ortho_gene_1_array = split ("\t", $ortho_hash{$in_gene_1}->{'ortho'});
+                                               foreach my $ortho_gene_1 (@ortho_gene_1_array) {
+                                                       # the predicted pair is labelled "ortho"; the original interaction type is not carried over
+                                                       #print output_file "$ortho_gene_1\t$inter_type_array[$i]\t$ortho_gene_2\n";
+                                                       print output_file "$ortho_gene_1\tortho\t$ortho_gene_2\n";
+                                                       # add one to the number of orthologs counted
+                                                       $rice_ortho_count++;
+                                                       # output the orthologs to file
+                                                       print output_file_ortho "$in_gene_1\tortho\t$ortho_gene_1\n";
+                                                       
+                                                       # now find all paralogs to the orthologs found
+                                                       # NOTE(review): %para_hash is keyed only by the ASCII-smaller gene of
+                                                       # each paralog pair, so paralogs that sort before the ortholog are not
+                                                       # found by these lookups - confirm this is intended
+                                                       if(exists($para_hash{$ortho_gene_1})) {
+                                                               my @para_gene_1_array = split ("\t", $para_hash{$ortho_gene_1}->{'gene2'});
+                                                               foreach my $para_gene_1 (@para_gene_1_array) {
+                                                                       # output the paralogs to file
+                                                                       print output_file_para "$ortho_gene_1\tpara\t$para_gene_1\n";
+                                                                       
+                                                                       if(exists($para_hash{$ortho_gene_2})) {
+                                                                               my @para_gene_2_array = split ("\t", $para_hash{$ortho_gene_2}->{'gene2'});
+                                                                               foreach my $para_gene_2 (@para_gene_2_array) {
+                                                                                       # add one to the number of paralogs counted
+                                                                                       $rice_para_count++;
+                                                                                       #print output_file "$para_gene_1\t$inter_type_array[$i]\t$para_gene_2\n";
+                                                                                       print output_file "$para_gene_1\tpara\t$para_gene_2\n";
+                                                                                       # output the paralogs to file
+                                                                                       print output_file_para "$ortho_gene_2\tpara\t$para_gene_2\n";
+
+                                                                               }
+                                                                       }
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       
+               }
+       }
+}
+
+# The combined network file must be flushed and closed before sort reads it.
+# Buffered write errors only surface at close, so check it.
+close(output_file) || die "Error: file '$out_file' could not be written: $!\n";
+print "Ath_inter_count = $Ath_inter_count\nrice_ortho_count = $rice_ortho_count\nrice_para_count = $rice_para_count\n";
+print "Now removing duplicates\n";
+# sort the file and get rid of duplicates
+#   "sort -u" does the work of "sort | uniq", and "-o" lets sort write the
+#   result back onto its own input, so no temporary file or mv is needed.
+#   The list form of system bypasses the shell, so a prefix containing
+#   spaces or shell metacharacters is passed through unchanged.
+system("sort", "-u", "-o", $out_file, "--", $out_file) == 0
+       || die "Error: removing duplicates from '$out_file' failed\n";
+
+
+close(output_file_inter) || die "Error: file '$out_file_inter' could not be written: $!\n";
+close(output_file_ortho) || die "Error: file '$out_file_ortho' could not be written: $!\n";
+close(output_file_para) || die "Error: file '$out_file_para' could not be written: $!\n";
+
+
+
 
diff --git a/interactome_scripts/inparanoid_orthologs.pl b/interactome_scripts/inparanoid_orthologs.pl
new file mode 100755 (executable)
index 0000000..194a427
--- /dev/null
@@ -0,0 +1,99 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::Screen::ReadLine;
+
+# Exactly one argument is expected: the file the ortholog pairs are written to.
+if ($#ARGV != 0) {
+       print "usage: inparanoid_orthologs.pl output_file\n";
+       exit;
+}
+# Three-argument open: the file name can never be read as an open mode or a
+# piped command, and $! reports the reason when the open fails.
+open(outfile, '>', $ARGV[0]) || die "Error: file '$ARGV[0]' can not be opened: $!\n";
+
+# Prompt for database credentials on the terminal, then define the database
+# handle ($dbh) to be used by the rest of the script.
+
+my $screen = Term::Screen::ReadLine->new();
+       # clear the screen
+       $screen->clrscr;
+       # ask for username
+       $screen->at(0,0)->puts("Username: ");
+       my $username = $screen->readline(ROW => 0, COL=>11);
+
+       # ask for password, replace character presses with stars
+       $screen->at(1,0)->puts("Password: ");
+       my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+       # presumably moves the cursor below the two prompts - confirm against Term::Screen
+       $screen->at(2,0);
+       # the screen object is not needed after the prompts
+       undef $screen;
+       
+# connect to the inparanoid_data MySQL database with the credentials entered above;
+# RaiseError => 1 asks DBI to die on database errors
+my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
+       { RaiseError=> 1, AutoCommit=>1 }
+       ) or die "Failed to connect to database: $DBI::errstr";
+       
+# species whose pairwise tables are read
+my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
+my $num_species = @species;
+
+# visit every unordered pair of species exactly once
+foreach my $i (0 .. $num_species-2) {
+       foreach my $j ($i+1 .. $num_species-1) {
+               # cluster id => space-joined gene list, one hash per species of the pair
+               my (%species_1_gene_hash, %species_2_gene_hash);
+
+               # table names list the alphabetically smaller species first
+               my ($species_1, $species_2) = ($species[$i], $species[$j]);
+               unless ($species_1 lt $species_2) {
+                       ($species_1, $species_2) = ($species_2, $species_1);
+               }
+
+               my $safe_species_table = $dbh->quote_identifier(join("_", $species_1, $species_2));
+               my $sth = $dbh->prepare("select * from $safe_species_table");
+               $sth->execute();
+
+               # id of the previously fetched row, used to detect a change of cluster
+               my $id_prev = "";
+               while (my ($id, $bit_score, $spec, $score, $gene) = $sth->fetchrow_array()) {
+                       # choose the hash that belongs to the species named in this row
+                       my $genes_for = ($spec eq $species_1) ? \%species_1_gene_hash : \%species_2_gene_hash;
+                       if ($id eq $id_prev && defined($genes_for->{$id})) {
+                               # same cluster as the previous row: extend its gene list
+                               $genes_for->{$id} = "$genes_for->{$id} $gene";
+                       } else {
+                               # new cluster id, or first gene of this species in the cluster
+                               $genes_for->{$id} = $gene;
+                       }
+                       $id_prev = $id;
+               }
+
+               # write one line per cluster that has genes from both species
+               foreach my $key (keys %species_1_gene_hash) {
+                       next unless defined($species_2_gene_hash{$key});
+                       print outfile "$species_1_gene_hash{$key}\t$species_2_gene_hash{$key}\n";
+               }
+               $sth->finish();
+       }
+}
+
+close(outfile);
+
index 3528d9738bbaa168eb4fc6c1ed0e8d4bddc6fecd..8f64b9ab6bcd1f635a96b9f242b1fd14781bb4db 100755 (executable)
@@ -1,5 +1,18 @@
 #!/usr/bin/perl
 
+###############################################################
+#  Justin Elser  (elserj@science.oregonstate.edu)             #
+#        Parses the output from an inparanoid run and         #
+#           enters it into a database                         #
+#                                                             #
+#  Version 1.00 - September 2009                              #
+#     Seems to work fine                                      #
+#  Version 1.01 - Nov 4 '09                                   #
+#     Added support for strawberry                            #
+#                                                             #
+###############################################################
+
+
 use strict;
 use warnings;
 
@@ -90,6 +103,10 @@ foreach my $input_file (@files) {
        
                $clust_id = "$species_1"."___".$species_2."___".$clust_id;
                $species = find_species($species);
+               my $gene = find_gene($gene_header,$species);
+               
+               # skip isomers that are not .1
+               next if ($gene =~ /\.[2-9]$/);
        
                if(!defined($id_hash{$clust_id})) {
                        if ($clust_id ne $clust_id_prev) {
@@ -104,7 +121,7 @@ foreach my $input_file (@files) {
                        $id = $id_hash{$clust_id};
                }
        
-               my $gene = find_gene($gene_header,$species);
+               
                $sth->execute($id, $bit_score, $species, $score, $gene);
        
                $clust_id_prev = $clust_id;
@@ -132,6 +149,8 @@ sub find_species {
                $temp = "Danio";
        }elsif ($temp =~ /E\_coli/) {
                $temp = "E_coli";
+       }elsif ($temp =~ /Fragaria/) {
+               $temp = "Fragaria";
        }elsif ($temp =~ /Glycine/) {
                $temp = "Soy";
        }elsif ($temp =~ /Homo\_sapiens/) {
@@ -171,22 +190,25 @@ sub find_gene {
        my $species = $_[1];
        my $gene;
        if ($species eq "Ath") {
-               my ($name,$gene_id,$chrom,$isomer) = split("\|", $gene_header);
+               my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header);
                $gene = $isomer;
        }elsif ($species eq "Brachy") {
                $gene = $gene_header;
        }elsif ($species eq "C_elegans") {
-               my ($gene_id,$temp) = split("\|", $gene_header);
+               my ($gene_id,$temp) = split(/\|/, $gene_header);
                $gene = $gene_id; #???
        }elsif ($species eq "Chlamy") {
-               my ($name,$locus_id,$scaff_id,$temp) = split("\|",$gene_header);
+               my ($name,$locus_id,$scaff_id,$temp) = split(/\|/,$gene_header);
                $gene = $locus_id; #???
        }elsif ($species eq "Danio") {
                $gene = $gene_header;
        }elsif ($species eq "E_coli") {
                $gene = $gene_header; #???
+       }elsif ($species eq "Fragaria") {
+               my ($gene_id, $mrna_id, $method, $length) = split(/\|/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Soy") {
-               my ($name,$locus_id,$scaff_id,$isomer) = split("\|",$gene_header);
+               my ($name,$locus_id,$scaff_id,$isomer) = split(/\|/,$gene_header);
                $gene = $isomer;
        }elsif ($species eq "Human") {
                $gene = $gene_header;
@@ -197,29 +219,29 @@ sub find_gene {
        }elsif ($species eq "Neurospora") {
                $gene = $gene_header;
        }elsif ($species eq "Oryza_sativa") {
-               my ($isomer,$temp,$type) = split("\|",$gene_header);
+               my ($isomer,$temp,$type) = split(/\|/,$gene_header);
                $gene = $isomer;
        }elsif ($species eq "Physcomitreall") {
-               my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+               my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
                $gene = $prot_id; #???
        }elsif ($species eq "Poplar") {
-               my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+               my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
                $gene = $prot_id; #???
        }elsif ($species eq "Sacc_cerevisiae") {
                $gene = $gene_header;
        }elsif ($species eq "Sacc_pombe") {
                $gene = $gene_header;
        }elsif ($species eq "Selaginella") {
-               my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+               my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
                $gene = $prot_id; #???
        }elsif ($species eq "Sorghum") {
-               my ($name,$locus_id,$scaff_id,$prot_id) = split("\|",$gene_header);
+               my ($name,$locus_id,$scaff_id,$prot_id) = split(/\|/,$gene_header);
                $gene = $prot_id; #???
        }elsif ($species eq "Synechosystis") {
                my ($gene_id,$type,$temp) = split(" ",$gene_header);
                $gene = $gene_id; #???
        }elsif ($species eq "Grape") {
-               my ($name,$gene_id,$chrom_id,$id) = split("\|",$gene_header);
+               my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header);
                $gene = $gene_id; #???
        }else {
                die "Error: Gene id can not be found!";
diff --git a/interactome_scripts/supercluster.pl b/interactome_scripts/supercluster.pl
new file mode 100755 (executable)
index 0000000..5a5a1f8
--- /dev/null
@@ -0,0 +1,180 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::Screen::ReadLine;
+
+
+# Prompt for database credentials on the terminal, then define the database
+# handle ($dbh) to be used by the rest of the script.
+
+my $screen = Term::Screen::ReadLine->new();
+       # clear the screen
+       $screen->clrscr;
+       # ask for username
+       $screen->at(0,0)->puts("Username: ");
+       my $username = $screen->readline(ROW => 0, COL=>11);
+
+       # ask for password, replace character presses with stars
+       $screen->at(1,0)->puts("Password: ");
+       my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+       # presumably moves the cursor below the two prompts - confirm against Term::Screen
+       $screen->at(2,0);
+       # the screen object is not needed after the prompts
+       undef $screen;
+       
+# connect to the inparanoid_data MySQL database with the credentials entered above;
+# RaiseError => 1 asks DBI to die on database errors
+my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
+       { RaiseError=> 1, AutoCommit=>1 }
+       ) or die "Failed to connect to database: $DBI::errstr";
+       
+       
+#### Note to self ####
+## To avoid storing the same value more than once in a mysql table, use
+## "insert ignore" instead of "insert", or use "replace".
+### Also note that "insert ignore" only works when the table has a "unique" field.
+
+# (re)create the table that holds the super clusters; any previous contents are dropped
+my $super_table = "super_clust";
+my $safe_super_table = $dbh->quote_identifier($super_table);
+
+$dbh->do("drop table if exists $safe_super_table");
+# ENGINE is the current spelling of the storage-engine option; the older
+# TYPE keyword is rejected by MySQL 5.5 and later.
+$dbh->do("CREATE TABLE $safe_super_table (
+       `super_id` INT( 11 ) NOT NULL ,
+       `species` VARCHAR( 255 ) NOT NULL ,
+       `gene` VARCHAR( 255 ) NOT NULL ,
+       UNIQUE ( `gene` )
+       ) ENGINE = MYISAM");
+# "insert ignore" together with UNIQUE(gene) keeps only the first row stored for a gene
+my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)");
+
+my $super_id = 0; #initialize the super cluster id
+# gene id => super cluster id assigned so far
+my %super_hash;
+       
+#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
+#my @species = ("Maize", "Oryza_sativa", "Ath");
+# species whose pairwise cluster tables are merged into super clusters
+my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera");
+my $num_species = @species;
+
+# NOTE(review): declared but never read or written in this script
+my %species_hash;
+
+# MySQL error number for "table doesn't exist" (ER_NO_SUCH_TABLE)
+my $no_such_table = 1146;
+
+# visit every unordered pair of species exactly once
+for (my $i=0; $i<$num_species-1; $i++) {
+       for (my $j=$i+1; $j<$num_species; $j++) {
+               # cluster id => space-joined gene list, one hash per species of the pair
+               my (%species_1_gene_hash, %species_2_gene_hash);
+
+               # make sure the species are listed in alphabetical order to get correct table names
+               my $species_1 = $species[$i];
+               my $species_2 = $species[$j];
+               unless ($species_1 lt $species_2) {
+                       ($species_1, $species_2) = ($species_2, $species_1);
+               }
+
+               # two species are stored in the database under a different name
+               # NOTE(review): the renaming happens after the alphabetical ordering,
+               # so e.g. Glycine/Human yields "Soy_Human" - confirm this matches the
+               # real table names
+               if ($species_1 eq "Glycine") {
+                       $species_1 = "Soy";
+               }
+
+               if ($species_2 eq "Glycine") {
+                       $species_2 = "Soy";
+               }
+
+               if ($species_1 eq "Vitis_vinifera") {
+                       $species_1 = "Grape";
+               }
+
+               if ($species_2 eq "Vitis_vinifera") {
+                       $species_2 = "Grape";
+               }
+
+
+               my $spec_table = "$species_1" . "_" . "$species_2";
+               my $safe_species_table = $dbh->quote_identifier($spec_table);
+
+               # error handling, make sure the table exists
+               #   The handle is created with RaiseError => 1, so a failing
+               #   prepare/execute dies instead of returning false.  Trap the
+               #   exception here so a missing pair table is skipped as intended.
+               my $sth = eval {
+                       my $handle = $dbh->prepare("select * from $safe_species_table");
+                       $handle->execute();
+                       $handle;
+               };
+               if (!defined($sth)) {
+                       # only a missing table is expected; anything else stays fatal
+                       next if (defined($DBI::err) && $DBI::err == $no_such_table);
+                       die $@;
+               }
+
+               # id of the previously fetched row, used to detect a change of cluster
+               my $id_prev = "";
+
+               while (my @line = $sth->fetchrow_array()) {
+                       my ($id, $bit_score, $spec, $score, $gene) = @line;
+                       # choose the hash that belongs to the species named in this row
+                       my $genes_for = ($spec eq $species_1) ? \%species_1_gene_hash : \%species_2_gene_hash;
+                       if ($id eq $id_prev && defined($genes_for->{$id})) {
+                               # same cluster as the previous row: extend its gene list
+                               $genes_for->{$id} = "$genes_for->{$id} $gene";
+                       } else {
+                               # new cluster id, or first gene of this species in the cluster
+                               $genes_for->{$id} = $gene;
+                       }
+                       $id_prev = $id;
+               }
+               $sth->finish();
+
+               # Each key defines a species pair cluster
+               foreach my $key (keys %species_1_gene_hash) {
+                       if(defined($species_2_gene_hash{$key})) {
+                               my (@spec_1_array, @spec_2_array);
+                               @spec_1_array = split " ", $species_1_gene_hash{$key};
+                               @spec_2_array = split " ", $species_2_gene_hash{$key};
+
+                               # if the gene is already in a cluster, use its id #
+                               #   species 1 genes are checked first, then species 2 genes;
+                               #   the first id found wins
+                               # NOTE(review): when the genes of this pair cluster already
+                               # belong to two different super clusters, the members of the
+                               # other super cluster are not relabelled, and "insert ignore"
+                               # keeps the super_id first stored for a gene - confirm that
+                               # this is acceptable
+                               my $super_temp_id;
+
+                               foreach my $super_gene (@spec_1_array, @spec_2_array) {
+                                       if(defined($super_hash{$super_gene})) {
+                                               $super_temp_id = $super_hash{$super_gene};
+                                               last;
+                                       }
+                               }
+
+                               # if none of the genes are in a cluster already, get a new id #
+                               if(!defined($super_temp_id)) {
+                                       ++$super_id;
+                                       $super_temp_id = $super_id;
+                               }
+
+                               # build the hash and put the entries in the database
+                               foreach my $super_gene (@spec_1_array) {
+                                       $super_hash{$super_gene} = $super_temp_id;
+                                       $insert_sth->execute($super_temp_id,$species_1,$super_gene);
+                               }
+                               foreach my $super_gene (@spec_2_array) {
+                                       $super_hash{$super_gene} = $super_temp_id;
+                                       $insert_sth->execute($super_temp_id,$species_2,$super_gene);
+                               }
+
+                       }
+               }
+
+       }
+}
+
+