Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Add Venn diagram creation program to svn
author: elserj <elserj@localhost>
Thu, 26 Jan 2012 00:11:32 +0000 (00:11 +0000)
committer: elserj <elserj@localhost>
Thu, 26 Jan 2012 00:11:32 +0000 (00:11 +0000)
svn path=/; revision=278

interactome_scripts/build_inparanoid_venn.pl [new file with mode: 0755]

diff --git a/interactome_scripts/build_inparanoid_venn.pl b/interactome_scripts/build_inparanoid_venn.pl
new file mode 100755 (executable)
index 0000000..1aed242
--- /dev/null
@@ -0,0 +1,165 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+# Connect through the project-local DbiFloret helper.
+my $dbh = DbiFloret::dbconnect;
+
+# Species whose clusters are compared in the Venn diagram.
+my @species_array = ("Vitis_vinifera", "Eucalyptus_grandis", "Arabidopsis_thaliana", "Populus_trichocarpa");
+my $array_size = @species_array;
+
+# %gene_hash:  gene        -> { id => cluster id, species => species name }
+# %super_hash: cluster id  -> { species => tab-joined species, gene => tab-joined genes }
+my %gene_hash;
+my %super_hash;
+
+# One placeholder per species, so the query follows @species_array instead of
+# hard-coding four indices and interpolating the values into the SQL text.
+my $species_placeholders = join(", ", ("?") x $array_size);
+my $sth = $dbh->prepare("select * from super_clust where `species` in ($species_placeholders)")
+       or die "could not prepare super_clust query: " . (defined($dbh->errstr) ? $dbh->errstr : 'unknown error') . "\n";
+
+my $rv = $sth->execute(@species_array);
+
+# A failed query leaves nothing to process, so stop with the DBI error
+# (a bare "next" is a runtime error outside of a loop).
+if (!$rv) {
+       die "could not query super_clust: " . (defined($sth->errstr) ? $sth->errstr : 'unknown error') . "\n";
+}
+
+# Fill %super_hash (per cluster: tab-joined species and gene lists) and
+# %gene_hash (per gene: its cluster id and species) from the fetched rows.
+# NOTE(review): relies on "select *" returning super_id, species, gene as the
+# first three columns, in that order - confirm against the table definition.
+while (my @row = $sth->fetchrow_array()) {
+       my ($super_id, $species, $gene) = @row;
+
+       my $cluster = $super_hash{$super_id};
+       if (!defined($cluster)) {
+               # first row seen for this cluster id
+               $super_hash{$super_id} = { 'species' => $species, 'gene' => $gene };
+       }
+       else {
+               # later rows extend the tab-separated lists of the existing entry
+               $cluster->{'species'} = $cluster->{'species'} . "\t" . $species;
+               $cluster->{'gene'} = $cluster->{'gene'} . "\t" . $gene;
+       }
+
+       # a gene is expected to belong to a single cluster id, so a repeat is fatal
+       die "gene found with multiple id's\t$gene" if defined($gene_hash{$gene});
+
+       $gene_hash{$gene} = { 'id' => $super_id, 'species' => $species };
+}
+
+# create the hash that is keyed by the cluster ids and has values given by the Venn set,
+# i.e. the tab-joined names of the species (in @species_array order) found in the cluster
+my %clust_hash;
+foreach my $super_id (keys %super_hash) {
+       # Exact-name lookup of the species recorded for this cluster. Splitting the
+       # tab-joined list avoids interpolating species names into a regex, where
+       # metacharacters or one name being a prefix of another would mis-match.
+       my %species_seen = map { $_ => 1 } split(/\t/, $super_hash{$super_id}->{'species'});
+
+       my @venn_members = grep { $species_seen{$_} } @species_array;
+
+       # a cluster containing none of the listed species gets no entry
+       $clust_hash{$super_id} = join("\t", @venn_members) if @venn_members;
+}
+
+# Invert %clust_hash: key by Venn set, and collect for each set the tab-joined
+# genes ('genes') and tab-joined cluster ids ('ids') of its clusters.
+my %count_hash;
+foreach my $clust_id (keys %clust_hash) {
+       my $venn_set = $clust_hash{$clust_id};
+       my $cluster_genes = $super_hash{$clust_id}->{'gene'};
+
+       if (!defined($count_hash{$venn_set})) {
+               # first cluster seen for this Venn set
+               $count_hash{$venn_set} = { 'genes' => $cluster_genes, 'ids' => $clust_id };
+               next;
+       }
+
+       # later clusters extend the tab-separated lists of the set
+       my $set_entry = $count_hash{$venn_set};
+       $set_entry->{'genes'} = $set_entry->{'genes'} . "\t" . $cluster_genes;
+       $set_entry->{'ids'} = $set_entry->{'ids'} . "\t" . $clust_id;
+}
+
+# print "\nTotal number of genes for each species\n";
+# 
+# for (my $k = 0; $k<$array_size; $k++) {
+       # my $gene_count = 0;
+       # foreach my $super_id (keys %super_hash) {
+               # while($super_hash{$super_id}->{'species'} =~ /$species_array[$k]/g) {
+                       # $gene_count++;
+               # }
+       # }
+       # print "$species_array[$k]\t$gene_count\n";
+# }
+#print "\n\nCluster counts and gene counts for each Venn set\n";
+print "\n\n";
+# For every Venn set: print "<set>\t<cluster count>\t<gene count>" and write the
+# genes of the set to a file named genes_<set>. Keys are sorted so the report
+# order is the same from run to run.
+foreach my $key (sort keys %count_hash) {
+       my @gene_array = split ("\t", $count_hash{$key}->{'genes'});
+       my $gene_array_size = @gene_array;
+       #foreach my $gene (@gene_array) {
+       #       print "$key\t$gene\n";
+       #}
+       my @id_array = split("\t", $count_hash{$key}->{'ids'});
+       my $id_array_size = @id_array;
+       #print "\n\n\n\n";
+       #foreach my $id (@id_array) {
+       #       print "$key\t$id\n";
+       #}
+       print "$key\t$id_array_size\t$gene_array_size\n";
+
+       # create the file to output the genes involved in each Venn set
+       (my $file_key = $key) =~ s/\t/__/g; # replace each \t in the filename with two underscores
+       my $file_name = "genes_$file_key";
+
+       # three-argument open on a lexical handle; fail loudly instead of
+       # silently printing to an unopened handle
+       open(my $output_fh, '>', $file_name) or die "cannot open $file_name for writing: $!";
+       foreach my $gene (@gene_array) {
+               print {$output_fh} "$gene\t$gene_hash{$gene}->{'species'}\t$gene_hash{$gene}->{'id'}\n";
+       }
+       # buffered write errors only surface at close
+       close($output_fh) or die "cannot close $file_name: $!";
+}
+
+# Per-species totals. Plain equality is used, matching the main query: with
+# LIKE the "_" in names such as Vitis_vinifera is a single-character wildcard.
+my $sth_get_species_cluster_count=$dbh->prepare("select count(distinct super_id) from super_clust where species = ?");
+my $sth_get_species_gene_count=$dbh->prepare("select count(gene) from super_clust where species = ?");
+
+foreach my $species (@species_array) {
+       # list assignment takes the single count column explicitly
+       $sth_get_species_cluster_count->execute($species);
+       my ($species_count) = $sth_get_species_cluster_count->fetchrow_array();
+       $sth_get_species_gene_count->execute($species);
+       my ($species_gene_count) = $sth_get_species_gene_count->fetchrow_array();
+       print "\nFor species $species, there were $species_count unique clusters with $species_gene_count genes\n";
+}
+
+# Database-wide totals: distinct cluster ids and distinct species in super_clust.
+my $all_clusters_sth = $dbh->prepare("select count(distinct super_id) from super_clust");
+$all_clusters_sth->execute();
+my ($tot_count) = $all_clusters_sth->fetchrow_array();
+
+my $all_species_sth = $dbh->prepare("select count(distinct species) from super_clust");
+$all_species_sth->execute();
+my ($tot_species_count) = $all_species_sth->fetchrow_array();
+print "\nTotal number of clusters across all species was " . $tot_count . " across " . $tot_species_count . " species\n";
+
+
+# Number of clusters that received a Venn set, i.e. the entries of %clust_hash.
+my $tot_count_clusters = scalar(keys %clust_hash);
+print "\nTotal number of clusters among the " . $array_size . " species = " . $tot_count_clusters . "\n";
+
+
+
+
+
+
+
+
+
+