--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+# Connect to the database through the lab's DbiFloret helper module.
+my $dbh = DbiFloret::dbconnect;
+
+# Species whose clusters are compared. The order of this list also fixes the
+# order of the species names inside each Venn-set label built further down.
+my @species_array = ("Vitis_vinifera", "Eucalyptus_grandis", "Arabidopsis_thaliana", "Populus_trichocarpa");
+my $array_size = @species_array;
+
+my %gene_hash;     # gene     => { id => super_id, species => species }
+my %super_hash;    # super_id => { species => tab-joined list, gene => tab-joined list }
+
+# One placeholder per species, so the filter always follows @species_array
+# (whatever its length) and the names are bound rather than pasted into the SQL.
+my $species_placeholders = join(", ", ("?") x @species_array);
+my $sth = $dbh->prepare("select * from super_clust where `species` in ($species_placeholders)")
+    or die "could not prepare super_clust query: " . (defined $dbh->errstr ? $dbh->errstr : 'unknown error') . "\n";
+
+my $rv = $sth->execute(@species_array);
+
+# A failed execute is fatal: nothing below can work without the rows.
+# (The previous `next` here was outside any loop and would itself have died
+# with an unrelated "Can't next outside a loop block" message.)
+if (!$rv) {
+    my $db_error = $sth->errstr;
+    $db_error = 'unknown error' unless defined $db_error;
+    die "could not query super_clust: $db_error\n";
+}
+
+# Fold every fetched row into the two lookup tables:
+#   %super_hash: super_id => { species => tab-joined species, gene => tab-joined genes }
+#   %gene_hash:  gene     => { id => super_id, species => species }
+# NOTE(review): the query is `select *`, so this relies on the table's columns
+# being ordered super_id, species, gene -- confirm against the schema.
+while (my ($super_id, $species, $gene) = $sth->fetchrow_array()) {
+    my $cluster = $super_hash{$super_id};
+
+    if (defined $cluster) {
+        # cluster already seen: append this row to its tab-separated lists
+        $cluster->{'species'} = $cluster->{'species'} . "\t" . $species;
+        $cluster->{'gene'}    = $cluster->{'gene'} . "\t" . $gene;
+    }
+    else {
+        # first row for this cluster
+        $super_hash{$super_id} = { 'species' => $species, 'gene' => $gene };
+    }
+
+    # a gene must belong to exactly one cluster id, so a repeat is fatal
+    die "gene found with multiple id's\t$gene" if defined $gene_hash{$gene};
+
+    $gene_hash{$gene} = { 'id' => $super_id, 'species' => $species };
+}
+
+# create the hash that is keyed by the cluster ids and has values given by the Venn set eg. Grape-Fragaria
+# The value is the tab-joined list of the species from @species_array that occur
+# in the cluster, always in @species_array order, so clusters with the same
+# species membership get the same label.
+my %clust_hash;
+foreach my $super_id (keys %super_hash) {
+    my $species_in_cluster = $super_hash{$super_id}->{'species'};
+
+    # \Q...\E keeps the species name a literal string inside the pattern.
+    # This is still a substring test against the tab-joined species list.
+    my @venn_members = grep { $species_in_cluster =~ /\Q$_\E/ } @species_array;
+
+    # Each cluster id is visited exactly once, so there is never an existing
+    # entry to extend; a cluster with no matching species gets no entry.
+    $clust_hash{$super_id} = join("\t", @venn_members) if @venn_members;
+}
+
+# Invert %clust_hash: group clusters by their Venn-set label.
+#   %count_hash: Venn set => { genes => tab-joined genes of all its clusters,
+#                              ids   => tab-joined cluster ids }
+my %count_hash;
+foreach my $clust_id (keys %clust_hash) {
+    my $venn_set      = $clust_hash{$clust_id};
+    my $cluster_genes = $super_hash{$clust_id}->{'gene'};
+    my $set_entry     = $count_hash{$venn_set};
+
+    if (defined $set_entry) {
+        # Venn set already present: extend both tab-separated lists
+        $set_entry->{'genes'} = $set_entry->{'genes'} . "\t" . $cluster_genes;
+        $set_entry->{'ids'}   = $set_entry->{'ids'} . "\t" . $clust_id;
+    }
+    else {
+        # first cluster seen for this Venn set
+        $count_hash{$venn_set} = { 'genes' => $cluster_genes, 'ids' => $clust_id };
+    }
+}
+
+# print "\nTotal number of genes for each species\n";
+#
+# for (my $k = 0; $k<$array_size; $k++) {
+ # my $gene_count = 0;
+ # foreach my $super_id (keys %super_hash) {
+ # while($super_hash{$super_id}->{'species'} =~ /$species_array[$k]/g) {
+ # $gene_count++;
+ # }
+ # }
+ # print "$species_array[$k]\t$gene_count\n";
+# }
+#print "\n\nCluster counts and gene counts for each Venn set\n";
+# For every Venn set: print "<set>\t<cluster count>\t<gene count>" to STDOUT and
+# write the set's genes (gene, species, cluster id) to a file named genes_<set>.
+print "\n\n";
+foreach my $key (keys %count_hash) {
+    my @gene_array = split ("\t", $count_hash{$key}->{'genes'});
+    my $gene_array_size = @gene_array;
+    #foreach my $gene (@gene_array) {
+    #    print "$key\t$gene\n";
+    #}
+    my @id_array = split("\t", $count_hash{$key}->{'ids'});
+    my $id_array_size = @id_array;
+    #print "\n\n\n\n";
+    #foreach my $id (@id_array) {
+    #    print "$key\t$id\n";
+    #}
+    print "$key\t$id_array_size\t$gene_array_size\n";
+
+    # create the file to output the genes involved in each Venn set
+    # the tabs separating species in the set label become "__" in the file name
+    (my $file_label = $key) =~ s/\t/__/g;
+    my $out_path = "genes_$file_label";
+
+    # three-argument open with a lexical handle; fail loudly rather than
+    # silently printing to an unopened handle
+    open(my $out_fh, '>', $out_path)
+        or die "cannot open $out_path for writing: $!\n";
+    foreach my $gene (@gene_array) {
+        print {$out_fh} "$gene\t$gene_hash{$gene}->{'species'}\t$gene_hash{$gene}->{'id'}\n";
+    }
+    # buffered write errors only surface at close
+    close($out_fh)
+        or die "cannot close $out_path: $!\n";
+}
+
+# Per-species summary: number of distinct clusters and number of gene rows.
+# The species is compared with `=` (as in the main cluster query) rather than
+# LIKE, because the "_" in every species name is a single-character wildcard
+# under LIKE and could count rows the main query never selected.
+my $sth_get_species_cluster_count=$dbh->prepare("select count(distinct super_id) from super_clust where species = ?");
+my $sth_get_species_gene_count=$dbh->prepare("select count(gene) from super_clust where species = ?");
+
+foreach my $species (@species_array) {
+    $sth_get_species_cluster_count->execute($species)
+        or die "cluster count query failed for $species: " . (defined $sth_get_species_cluster_count->errstr ? $sth_get_species_cluster_count->errstr : 'unknown error') . "\n";
+    my ($species_count) = $sth_get_species_cluster_count->fetchrow_array();
+
+    $sth_get_species_gene_count->execute($species)
+        or die "gene count query failed for $species: " . (defined $sth_get_species_gene_count->errstr ? $sth_get_species_gene_count->errstr : 'unknown error') . "\n";
+    my ($species_gene_count) = $sth_get_species_gene_count->fetchrow_array();
+
+    print "\nFor species $species, there were $species_count unique clusters with $species_gene_count genes\n";
+}
+
+# Table-wide totals: distinct cluster ids and distinct species in super_clust.
+# Both are single-value queries, so they are run through the same
+# prepare / execute / fetch sequence, one after the other.
+my ($tot_count, $tot_species_count) = map {
+    my $sth_total = $dbh->prepare($_);
+    $sth_total->execute();
+    scalar $sth_total->fetchrow_array();
+} (
+    "select count(distinct super_id) from super_clust",
+    "select count(distinct species) from super_clust",
+);
+print "\nTotal number of clusters across all species was $tot_count across $tot_species_count species\n";
+
+
+# Number of clusters that contain at least one of the species in @species_array.
+my $tot_count_clusters = scalar keys %clust_hash;
+print "\nTotal number of clusters among the $array_size species = $tot_count_clusters\n";
+
+
+
+
+
+
+
+
+
+