From 18c3a522dfe3d9238d2533857065b886763f6e7e Mon Sep 17 00:00:00 2001
From: elserj <elserj@localhost>
Date: Fri, 6 Nov 2009 18:36:24 +0000
Subject: [PATCH] updated versions of flowering_interaction and
 inparanoid_output_parse, added supercluster and inparanoid_orthologs scripts

svn path=/; revision=11
---
 interactome_scripts/flowering_interaction.pl  | 278 ++++++++++++++++++
 interactome_scripts/inparanoid_orthologs.pl   |  99 +++++++
 .../inparanoid_output_parse.pl                |  44 ++-
 interactome_scripts/supercluster.pl           | 180 ++++++++++++
 4 files changed, 590 insertions(+), 11 deletions(-)
 create mode 100755 interactome_scripts/inparanoid_orthologs.pl
 create mode 100755 interactome_scripts/supercluster.pl

diff --git a/interactome_scripts/flowering_interaction.pl b/interactome_scripts/flowering_interaction.pl
index 6b1d643..703efec 100755
--- a/interactome_scripts/flowering_interaction.pl
+++ b/interactome_scripts/flowering_interaction.pl
@@ -1,3 +1,281 @@
 #!/usr/bin/perl
 
+#########################################################
+#  Written by Justin Elser 4/29/2009                    #
+#                					#
+#	This program takes the genes known to interact	#
+#	in the long day flowering process in 		#
+#	Arabidopsis and predicts the similar short day	#
+#	interaction in rice				#
+#							#
+#	Uses the orthologs first, then paralogs		#
+#							#
+#########################################################
+
+
+use strict;
+use warnings;
+
+# print usage
+if ($#ARGV !=4) {
+	print "usage: flowering_interaction.pl input_file interaction_network_file ortholog_file paralog_file output_file_prefix\n";
+	exit;
+}
+
+my $in_file = $ARGV[0];
+my $inter_file = $ARGV[1];
+my $ortho_file = $ARGV[2];
+my $para_file = $ARGV[3];
+my $out_file = $ARGV[4] . ".sif";
+my $out_file_inter = $ARGV[4] . "_inter.sif";
+my $out_file_ortho = $ARGV[4] . "_ortho.sif";
+my $out_file_para = $ARGV[4] . "_para.sif";
+
+my %in_hash;
+
+
+# Read the input gene list (gene_name <sep> gene_id per line) into %in_hash, keyed by gene_id
+open (my $input_fh, '<', $in_file) || die "Error: file '$in_file' can not be opened \n";
+while(my $entry = <$input_fh>) {
+	$entry =~ s/[\r\n]//g;	# strip off newline characters (DOS and Unix)
+	# split the columns into 2 separate variables (using either multiple consecutive spaces or a tab)
+	my ($gene_name, $gene_id) = split /\s{2,}|\t/, $entry;
+	# skip blank or single-column lines; they would otherwise be stored under an undef key
+	next unless defined $gene_id;
+	$in_hash{$gene_id} = $gene_name;
+}
+close ($input_fh);
+
+# Read the interaction network (gene_1 <tab> type <tab> gene_2) into %inter_hash, indexed by both genes
+open (my $interaction_fh, '<', $inter_file) || die "Error: file '$inter_file' can not be opened\n";
+my %inter_hash;
+while(my $entry = <$interaction_fh>) {
+	$entry =~ s/[\r\n]//g;	# strip off newline characters (DOS and Unix)
+	# split the columns into 3 separate variables
+	my ($gene_1, $inter_type, $gene_2) = split ("\t", $entry);
+	# skip blank or truncated lines; they would otherwise register an interaction with an empty gene id
+	next unless defined $gene_2;
+	# change gene ids to all caps (for matching)
+	$gene_1 =~ tr/a-z/A-Z/;
+	$gene_2 =~ tr/a-z/A-Z/;
+	
+	# because it doesn't matter which way the interaction goes, need to do the interaction in both directions
+	#      ie. as gene_1 as the key then as gene_2 as the key
+	if(defined($inter_hash{$gene_1})){
+		# if gene already in hash, just add to the tab-separated lists
+		my $oldhash = $inter_hash{$gene_1};
+		$oldhash->{'type'} = "$oldhash->{'type'}\t$inter_type";
+		$oldhash->{'inter_gene'} = "$oldhash->{'inter_gene'}\t$gene_2";
+		$inter_hash{$gene_1} = $oldhash;
+	}else{
+		# if new gene, create hash element
+		my %hash;
+		$hash{'map_gene'} = $gene_1;
+		$hash{'type'} = $inter_type;
+		$hash{'inter_gene'} = $gene_2;
+		$inter_hash{$gene_1} = \%hash;
+	}
+	
+	if(defined($inter_hash{$gene_2})){
+		# if gene already in hash, just add to the tab-separated lists
+		my $oldhash = $inter_hash{$gene_2};
+		$oldhash->{'type'} = "$oldhash->{'type'}\t$inter_type";
+		$oldhash->{'inter_gene'} = "$oldhash->{'inter_gene'}\t$gene_1";
+		$inter_hash{$gene_2} = $oldhash;
+	}else{
+		# if new gene, create hash element
+		my %hash;
+		$hash{'map_gene'} = $gene_2;
+		$hash{'type'} = $inter_type;
+		$hash{'inter_gene'} = $gene_1;
+		$inter_hash{$gene_2} = \%hash;
+	}
+}
+close($interaction_fh);
+
+
+
+my $min_ident=0;
+
+my %ortho_hash;
+#my @ortho_array;
+
+# Read the ortholog table (gene_id <tab> ortho_id <tab> %identity) into %ortho_hash, keyed by gene_id
+open (my $ortholog_fh, '<', $ortho_file) || die "Error: file '$ortho_file' can not be opened\n";
+while(my $entry = <$ortholog_fh>) {
+	$entry =~ s/[\r\n]//g;	# strip off newline characters (DOS and Unix)
+	# split the columns into 3 separate variables
+	my ($gene_id, $ortho_id, $ident_id) = split("\t", $entry);
+	# skip blank or truncated lines (no identity column) instead of warning on undef values
+	next unless defined $ident_id;
+	# change gene ids to all caps
+	$gene_id =~ tr/a-z/A-Z/;
+	$ortho_id =~ tr/a-z/A-Z/;
+	# strip off the TAIR-G suffix on ids
+	$gene_id =~ s/\-TAIR-G//g;
+	
+	# check to make sure the %identity is above the minimum defined above
+	if($ident_id =~ /\d/ && $ident_id>=$min_ident){
+		# if gene_id already found, add the new ortholog to hash
+		if(defined($ortho_hash{$gene_id})){
+			my $oldhash = $ortho_hash{$gene_id};
+			$oldhash->{'ortho'} = "$oldhash->{'ortho'}\t$ortho_id";
+			$ortho_hash{$gene_id} = $oldhash;
+		# if new gene_id, create hash element
+		}else{
+			my %hash;
+			$hash{'ortho'} = $ortho_id;
+			$hash{'gene'} = $gene_id;
+			$ortho_hash{$gene_id} = \%hash;
+			#push @ortho_array, $gene_id;
+		}
+	}
+}
+close ($ortholog_fh);
+
+my %para_hash;
+open(my $paralog_fh, '<', $para_file) || die "Error: file '$para_file' can not be opened\n";
+	# Read the paralog table (gene_1 <tab> gene_2 <tab> identity) into %para_hash, keyed by the ASCII-smaller gene
+	my $min_ident_para = 0; # set the threshold for paralog identities
+	while(<$paralog_fh>) {
+		my $entry = $_;
+		# strip off newline characters
+		$entry =~ s/\r//g;
+		$entry =~ s/\n//g;
+		# split the columns into 3 separate variables
+		my ($gene_1, $gene_2, $ident) = split("\t", $entry);
+		# skip if $ident is missing, empty, contains a capital 'D' or contains a '%' sign
+		next if(!defined($ident));
+		next if($ident =~ /D/); # NOTE(review): matches a literal 'D' only; confirm \D (non-digit) was not intended
+		next if($ident eq "");
+		next if($ident =~ /\%/);
+		# change gene ids to all caps
+		$gene_1 =~ tr/a-z/A-Z/;
+		$gene_2 =~ tr/a-z/A-Z/;
+		
+		# put the genes in ASCII order to help remove dupes
+		if($gene_1 gt $gene_2) {
+			my $temp = $gene_1;
+			$gene_1 = $gene_2;
+			$gene_2 = $temp;
+		}
+		
+		# only use paralogs whose identity reaches the threshold set above
+		if($ident>=$min_ident_para) {
+			# most genes will show up many times, so append to the tab-separated list for the hash key
+			if(defined($para_hash{$gene_1})) {
+				my $oldhash = $para_hash{$gene_1};
+				$oldhash->{'gene2'} = "$oldhash->{'gene2'}\t$gene_2";
+				$para_hash{$gene_1} = $oldhash;
+				#print "$oldhash\n";
+			}else{
+				my %hash;
+				$hash{'gene1'} = $gene_1;
+				$hash{'gene2'} = $gene_2;
+				$para_hash{$gene_1} = \%hash;
+			}
+		}
+		
+	}
+
+	close($paralog_fh);
+
+# open output files for writing; stop immediately if any of them cannot be created
+open(output_file, '>', $out_file) || die "Error: file '$out_file' can not be opened for writing\n";
+open(output_file_inter, '>', $out_file_inter) || die "Error: file '$out_file_inter' can not be opened for writing\n";
+open(output_file_ortho, '>', $out_file_ortho) || die "Error: file '$out_file_ortho' can not be opened for writing\n";
+open(output_file_para, '>', $out_file_para) || die "Error: file '$out_file_para' can not be opened for writing\n";
+
+
+
+# initialize the counters
+my $Ath_inter_count = 0;
+my $rice_ortho_count = 0;
+my $rice_para_count = 0;
+
+foreach my $key (keys %in_hash) {
+	my $name = $in_hash{$key}; # NOTE(review): $name is not used anywhere in this loop
+	
+	# for each input gene, walk every interaction recorded for it in %inter_hash
+	if(defined($inter_hash{$key})) {
+		
+		my @inter_array = split ("\t", $inter_hash{$key}->{'inter_gene'});
+		my @inter_type_array = split ("\t", $inter_hash{$key}->{'type'});
+		my $inter_index = @inter_type_array;
+		for (my $i=0; $i<$inter_index; $i++) {
+			my $in_gene_1 = $key;
+			my $in_gene_2 = $inter_array[$i];
+			# add one to the number of Ath interactions counted
+			$Ath_inter_count++;
+			# output the interaction to the _inter file (before the two genes are re-ordered below)
+			print output_file_inter "$in_gene_1\t$inter_type_array[$i]\t$in_gene_2\n";
+			
+			# make the genes be listed in ASCII order to make sorting out dupes easier
+			if ($in_gene_1 gt $in_gene_2) { # note that this should not be used if order matters for interaction type
+				my $temp = $in_gene_1;
+				$in_gene_1 = $in_gene_2;
+				$in_gene_2 = $temp;
+			}
+			
+			# next step is to find all the orthologs for both genes; nothing is predicted unless both genes have orthologs
+			if(exists($ortho_hash{$in_gene_2})) {
+				my @ortho_gene_2_array = split ("\t", $ortho_hash{$in_gene_2}->{'ortho'});
+				foreach my $ortho_gene_2 (@ortho_gene_2_array) {
+					# output orthologs to file
+					print output_file_ortho "$in_gene_2\tortho\t$ortho_gene_2\n";
+					
+					if(exists($ortho_hash{$in_gene_1})) {
+						my @ortho_gene_1_array = split ("\t", $ortho_hash{$in_gene_1}->{'ortho'});
+						foreach my $ortho_gene_1 (@ortho_gene_1_array) {
+							#print output_file "$ortho_gene_1\t$inter_type_array[$i]\t$ortho_gene_2\n";
+							print output_file "$ortho_gene_1\tortho\t$ortho_gene_2\n";
+							# add one to the number of ortholog pairs counted
+							$rice_ortho_count++;
+							# output the orthologs to file (repeated once per ortholog of gene 2, so this file can contain duplicates)
+							print output_file_ortho "$in_gene_1\tortho\t$ortho_gene_1\n";
+							
+							# now find all paralogs to the orthologs found; %para_hash is keyed only by the ASCII-smaller gene of each pair
+							if(exists($para_hash{$ortho_gene_1})) {
+								my @para_gene_1_array = split ("\t", $para_hash{$ortho_gene_1}->{'gene2'});
+								foreach my $para_gene_1 (@para_gene_1_array) {
+									# output the paralogs to file
+									print output_file_para "$ortho_gene_1\tpara\t$para_gene_1\n";
+									
+									if(exists($para_hash{$ortho_gene_2})) {
+										my @para_gene_2_array = split ("\t", $para_hash{$ortho_gene_2}->{'gene2'});
+										foreach my $para_gene_2 (@para_gene_2_array) {
+											# add one to the number of paralog pairs counted
+											$rice_para_count++;
+											#print output_file "$para_gene_1\t$inter_type_array[$i]\t$para_gene_2\n";
+											print output_file "$para_gene_1\tpara\t$para_gene_2\n";
+											# output the paralogs to file
+											print output_file_para "$ortho_gene_2\tpara\t$para_gene_2\n";
+
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+			
+		}
+	}
+}
+
+close(output_file);
+print "Ath_inter_count = $Ath_inter_count\nrice_ortho_count = $rice_ortho_count\nrice_para_count = $rice_para_count\n";
+print "Now removing duplicates\n";
+# sort the file in place and get rid of duplicates ("sort -u -o FILE FILE" may safely overwrite its own input)
+#   Note: list-form system bypasses the shell, so unusual characters in the output prefix cannot break the command
+system("sort", "-u", "-o", $out_file, "--", $out_file) == 0 || warn "Warning: could not sort '$out_file' to remove duplicates\n";
+
+
+close(output_file_inter);
+close(output_file_ortho);
+close(output_file_para);
+
+
+
 
diff --git a/interactome_scripts/inparanoid_orthologs.pl b/interactome_scripts/inparanoid_orthologs.pl
new file mode 100755
index 0000000..194a427
--- /dev/null
+++ b/interactome_scripts/inparanoid_orthologs.pl
@@ -0,0 +1,99 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::Screen::ReadLine;
+
+if ($#ARGV != 0) { # exactly one argument (the output file) is required
+	print "usage: inparanoid_orthologs.pl output_file\n";
+	exit;
+}
+open(outfile, '>', $ARGV[0]) || die "Error: file '$ARGV[0]' can not be opened\n"; # 3-arg open: the name is never parsed as a mode
+
+# define the database handle to be used 
+
+my $screen = Term::Screen::ReadLine->new();
+	# clear the screen
+	$screen->clrscr;
+	# ask for username
+	$screen->at(0,0)->puts("Username: ");
+	my $username = $screen->readline(ROW => 0, COL=>11);
+
+	# ask for password, replace character presses with stars
+	$screen->at(1,0)->puts("Password: ");
+	my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+	$screen->at(2,0);
+	undef $screen;
+	
+my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
+	{ RaiseError=> 1, AutoCommit=>1 }
+	) or die "Failed to connect to database: $DBI::errstr";
+	
+my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
+my $num_species = @species;
+
+for (my $i=0; $i<$num_species-1; $i++) {
+	for (my $j=$i+1; $j<$num_species; $j++) {
+		# hashes to store the paralogs and orthologs
+		my (%species_1_gene_hash, %species_2_gene_hash);
+
+		# make sure the species are listed in alphabetical order to get correct table names
+		my $species_1 = $species[$i];
+		my $species_2 = $species[$j];
+		if ($species_1 lt $species_2) {
+			$species_1 = $species_1;
+			$species_2 = $species_2;
+		} else {
+			my $spec_temp = $species_1;
+			$species_1 = $species_2;
+			$species_2 = $spec_temp;
+		}
+	
+		my $spec_table = "$species_1" . "_" . "$species_2";
+		my $safe_species_table = $dbh->quote_identifier($spec_table);
+		my $sth = $dbh->prepare("select * from $safe_species_table");
+	
+		my $id_prev = "";
+	
+		$sth->execute();
+	
+		while (my @line = $sth->fetchrow_array()) {
+			my ($id, $bit_score, $spec, $score, $gene) = @line;
+			if ($id ne $id_prev) {
+				if ($spec eq $species_1) {
+					$species_1_gene_hash{$id} = $gene;
+				}else {
+					$species_2_gene_hash{$id} = $gene;
+				}
+			}else {
+				if ($spec eq $species_1) {
+					if(defined($species_1_gene_hash{$id})) {
+						$species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene";
+					} else {
+						$species_1_gene_hash{$id} = $gene;
+					}
+				}else {
+					if(defined($species_2_gene_hash{$id})) {
+						$species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene";
+					} else {
+						$species_2_gene_hash{$id} = $gene;
+					}
+				}
+			}
+			$id_prev = $id;
+		}
+		
+		foreach my $key (keys %species_1_gene_hash) {
+			if(defined($species_2_gene_hash{$key})) {
+				print outfile "$species_1_gene_hash{$key}\t$species_2_gene_hash{$key}\n";
+			}
+		}
+		$sth->finish();
+	}
+}
+
+close(outfile);
+
diff --git a/interactome_scripts/inparanoid_output_parse.pl b/interactome_scripts/inparanoid_output_parse.pl
index 3528d97..8f64b9a 100755
--- a/interactome_scripts/inparanoid_output_parse.pl
+++ b/interactome_scripts/inparanoid_output_parse.pl
@@ -1,5 +1,18 @@
 #!/usr/bin/perl
 
+###############################################################
+#  Justin Elser  (elserj@science.oregonstate.edu)             #
+#        Parses the output from an inparanoid run and         #
+#           enters it into a database                         #
+#                                                             #
+#  Version 1.00 - September 2009                              #
+#     Seems to work fine                                      #
+#  Version 1.01 - Nov 4 '09                                   #
+#     Added support for strawberry                            #
+#                                                             #
+###############################################################
+
+
 use strict;
 use warnings;
 
@@ -90,6 +103,10 @@ foreach my $input_file (@files) {
 	
 		$clust_id = "$species_1"."___".$species_2."___".$clust_id;
 		$species = find_species($species);
+		my $gene = find_gene($gene_header,$species);
+		
+		# skip isomers other than .1 (only single-digit suffixes .2 through .9 are matched)
+		next if ($gene =~ /\.[2-9]$/);
 	
 		if(!defined($id_hash{$clust_id})) {
 			if ($clust_id ne $clust_id_prev) {
@@ -104,7 +121,7 @@ foreach my $input_file (@files) {
 			$id = $id_hash{$clust_id};
 		}
 	
-		my $gene = find_gene($gene_header,$species);
+		
 		$sth->execute($id, $bit_score, $species, $score, $gene);
 	
 		$clust_id_prev = $clust_id;
@@ -132,6 +149,8 @@ sub find_species {
 		$temp = "Danio";
 	}elsif ($temp =~ /E\_coli/) {
 		$temp = "E_coli";
+	}elsif ($temp =~ /Fragaria/) {
+		$temp = "Fragaria";
 	}elsif ($temp =~ /Glycine/) {
 		$temp = "Soy";
 	}elsif ($temp =~ /Homo\_sapiens/) {
@@ -171,22 +190,25 @@ sub find_gene {
 	my $species = $_[1];
 	my $gene;
 	if ($species eq "Ath") {
-		my ($name,$gene_id,$chrom,$isomer) = split("\|", $gene_header);
+		my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header);
 		$gene = $isomer;
 	}elsif ($species eq "Brachy") {
 		$gene = $gene_header;
 	}elsif ($species eq "C_elegans") {
-		my ($gene_id,$temp) = split("\|", $gene_header);
+		my ($gene_id,$temp) = split(/\|/, $gene_header);
 		$gene = $gene_id; #???
 	}elsif ($species eq "Chlamy") {
-		my ($name,$locus_id,$scaff_id,$temp) = split("\|",$gene_header);
+		my ($name,$locus_id,$scaff_id,$temp) = split(/\|/,$gene_header);
 		$gene = $locus_id; #???
 	}elsif ($species eq "Danio") {
 		$gene = $gene_header;
 	}elsif ($species eq "E_coli") {
 		$gene = $gene_header; #???
+	}elsif ($species eq "Fragaria") {
+		my ($gene_id, $mrna_id, $method, $length) = split(/\|/, $gene_header);
+		$gene = $gene_id;
 	}elsif ($species eq "Soy") {
-		my ($name,$locus_id,$scaff_id,$isomer) = split("\|",$gene_header);
+		my ($name,$locus_id,$scaff_id,$isomer) = split(/\|/,$gene_header);
 		$gene = $isomer;
 	}elsif ($species eq "Human") {
 		$gene = $gene_header;
@@ -197,29 +219,29 @@ sub find_gene {
 	}elsif ($species eq "Neurospora") {
 		$gene = $gene_header;
 	}elsif ($species eq "Oryza_sativa") {
-		my ($isomer,$temp,$type) = split("\|",$gene_header);
+		my ($isomer,$temp,$type) = split(/\|/,$gene_header);
 		$gene = $isomer;
 	}elsif ($species eq "Physcomitreall") {
-		my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+		my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
 		$gene = $prot_id; #???
 	}elsif ($species eq "Poplar") {
-		my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+		my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
 		$gene = $prot_id; #???
 	}elsif ($species eq "Sacc_cerevisiae") {
 		$gene = $gene_header;
 	}elsif ($species eq "Sacc_pombe") {
 		$gene = $gene_header;
 	}elsif ($species eq "Selaginella") {
-		my ($name,$locus_id,$chrom_id,$prot_id) = split("\|",$gene_header);
+		my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
 		$gene = $prot_id; #???
 	}elsif ($species eq "Sorghum") {
-		my ($name,$locus_id,$scaff_id,$prot_id) = split("\|",$gene_header);
+		my ($name,$locus_id,$scaff_id,$prot_id) = split(/\|/,$gene_header);
 		$gene = $prot_id; #???
 	}elsif ($species eq "Synechosystis") {
 		my ($gene_id,$type,$temp) = split(" ",$gene_header);
 		$gene = $gene_id; #???
 	}elsif ($species eq "Grape") {
-		my ($name,$gene_id,$chrom_id,$id) = split("\|",$gene_header);
+		my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header);
 		$gene = $gene_id; #???
 	}else {
 		die "Error: Gene id can not be found!";
diff --git a/interactome_scripts/supercluster.pl b/interactome_scripts/supercluster.pl
new file mode 100755
index 0000000..5a5a1f8
--- /dev/null
+++ b/interactome_scripts/supercluster.pl
@@ -0,0 +1,180 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+use DBI;
+use Term::Screen::ReadLine;
+
+
+# define the database handle to be used 
+
+my $screen = Term::Screen::ReadLine->new();
+	# clear the screen
+	$screen->clrscr;
+	# ask for username
+	$screen->at(0,0)->puts("Username: ");
+	my $username = $screen->readline(ROW => 0, COL=>11);
+
+	# ask for password, replace character presses with stars
+	$screen->at(1,0)->puts("Password: ");
+	my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+	$screen->at(2,0);
+	undef $screen;
+	
+my $dbh = DBI->connect('DBI:mysql:inparanoid_data;host=floret.cgrb.oregonstate.edu', $username, $password,
+	{ RaiseError=> 1, AutoCommit=>1 }
+	) or die "Failed to connect to database: $DBI::errstr";
+	
+	
+#### Note to self ####
+## To not have multiple values in mysql table, use insert ignore instead of insert.  Or use replace.
+### also note that for insert ignore to work, must have a "unique" field
+
+# make the new table to hold the super clusters (ENGINE replaces the TYPE keyword, which MySQL 5.5+ rejects)
+my $super_table = "super_clust";
+my $safe_super_table = $dbh->quote_identifier($super_table);
+
+$dbh->do("drop table if exists $safe_super_table"); # start from an empty table on every run
+$dbh->do("CREATE TABLE $safe_super_table (
+	`super_id` INT( 11 ) NOT NULL ,
+	`species` VARCHAR( 255 ) NOT NULL ,
+	`gene` VARCHAR( 255 ) NOT NULL ,
+	UNIQUE ( `gene` )
+	) ENGINE = MYISAM");
+my $insert_sth = $dbh->prepare("insert ignore into $safe_super_table (super_id, species, gene) values (?,?,?)"); # the UNIQUE key on gene makes "ignore" skip repeats
+
+my $super_id = 0; #initialize the super cluster id
+my %super_hash;
+	
+#my @species = ("Maize", "Oryza_sativa", "Ath", "Sorghum", "Brachy");
+#my @species = ("Maize", "Oryza_sativa", "Ath");
+my @species  = ("Ath", "Brachy", "C_elegans", "Chlamy", "Danio", "E_coli", "Fragaria", "Glycine", "Human", "Maize", "Mouse", "Neurospora", "Oryza_sativa", "Physcomitreall", "Poplar", "Sacc_cerevisiae", "Sacc_pombe", "Selaginella", "Sorghum", "Synechosystis", "Vitis_vinifera");
+my $num_species = @species;
+
+my %species_hash;
+
+for (my $i=0; $i<$num_species-1; $i++) {
+	for (my $j=$i+1; $j<$num_species; $j++) {
+		# hashes to store the paralogs and orthologs
+		my (%species_1_gene_hash, %species_2_gene_hash);
+
+		# make sure the species are listed in alphabetical order to get correct table names
+		my $species_1 = $species[$i];
+		my $species_2 = $species[$j];
+		if ($species_1 lt $species_2) {
+			$species_1 = $species_1;
+			$species_2 = $species_2;
+		} else {
+			my $spec_temp = $species_1;
+			$species_1 = $species_2;
+			$species_2 = $spec_temp;
+		}
+		
+		if ($species_1 eq "Glycine") {
+			$species_1 = "Soy";
+		}
+		
+		if ($species_2 eq "Glycine") {
+			$species_2 = "Soy";
+		}
+		
+		if ($species_1 eq "Vitis_vinifera") {
+			$species_1 = "Grape";
+		}
+		
+		if ($species_2 eq "Vitis_vinifera") {
+			$species_2 = "Grape";
+		}
+		
+	
+		my $spec_table = "$species_1" . "_" . "$species_2";
+		my $safe_species_table = $dbh->quote_identifier($spec_table);
+		my $sth = $dbh->prepare("select * from $safe_species_table");
+	
+		my $id_prev = "";
+	
+		# error handling, make sure the table exists: the connection uses RaiseError, so a
+		# failed execute() dies; eval traps that and leaves $rv undefined so the pair is skipped
+		my $rv = eval { $sth->execute() };
+		if (!$rv) {
+			next;
+		}
+	
+		while (my @line = $sth->fetchrow_array()) {
+			my ($id, $bit_score, $spec, $score, $gene) = @line;
+			if ($id ne $id_prev) {
+				if ($spec eq $species_1) {
+					$species_1_gene_hash{$id} = $gene;
+				}else {
+					$species_2_gene_hash{$id} = $gene;
+				}
+			}else {
+				if ($spec eq $species_1) {
+					if(defined($species_1_gene_hash{$id})) {
+						$species_1_gene_hash{$id} = "$species_1_gene_hash{$id} $gene";
+					} else {
+						$species_1_gene_hash{$id} = $gene;
+					}
+				}else {
+					if(defined($species_2_gene_hash{$id})) {
+						$species_2_gene_hash{$id} = "$species_2_gene_hash{$id} $gene";
+					} else {
+						$species_2_gene_hash{$id} = $gene;
+					}
+				}
+			}
+			$id_prev = $id;
+		}
+		$sth->finish();
+
+		# Each key defines a species pair cluster
+		foreach my $key (keys %species_1_gene_hash) {
+			if(defined($species_2_gene_hash{$key})) {
+				my (@spec_1_array, @spec_2_array);
+				@spec_1_array = split " ", $species_1_gene_hash{$key};
+				@spec_2_array = split " ", $species_2_gene_hash{$key};
+				
+				# if the gene is already in a cluster, use its id #
+				my $super_temp_id;
+				
+				foreach my $super_gene (@spec_1_array) {
+					if(defined($super_hash{$super_gene})) {
+						$super_temp_id = $super_hash{$super_gene};
+						last;
+					}
+				}
+				
+				if(!defined($super_temp_id)) {
+					foreach my $super_gene (@spec_2_array) {
+						if(defined($super_hash{$super_gene})) {
+							$super_temp_id = $super_hash{$super_gene};
+							last;
+						}
+					}
+				}
+				
+				# if none of the genes are in a cluster already, get a new id #
+				if(!defined($super_temp_id)) {
+					++$super_id;
+					$super_temp_id = $super_id;
+				}
+				
+				# build the hash and put the entries in the database
+				foreach my $super_gene (@spec_1_array) {
+					$super_hash{$super_gene} = $super_temp_id;
+					$insert_sth->execute($super_temp_id,$species_1,$super_gene);
+				}
+				foreach my $super_gene (@spec_2_array) {
+					$super_hash{$super_gene} = $super_temp_id;
+					$insert_sth->execute($super_temp_id,$species_2,$super_gene);
+				}
+			
+			}
+		}
+		
+	}
+}
+
+
-- 
2.34.1