#!/usr/bin/perl

# interactome_scripts/build_inparanoid_venn.pl
#
# Venn diagram creation program.
#
# Reads the rows of the super_clust table (columns super_id, species, gene)
# that belong to the species listed in @species_array, assigns every cluster
# to a Venn set (the tab-joined names of the listed species present in that
# cluster, in @species_array order), and then:
#   * prints one line per Venn set: set name, cluster count, gene count;
#   * writes one file "genes_<set>" per Venn set into the current directory,
#     holding gene, species and cluster id for each gene of the set (tabs in
#     the set name are replaced by "__" in the file name);
#   * prints per-species and overall cluster/gene totals taken from the db.

use strict;
use warnings;

use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";

use DbiFloret;

my $dbh = DbiFloret::dbconnect()
    or die "Could not connect to the database\n";

my @species_array = (
    "Vitis_vinifera",
    "Eucalyptus_grandis",
    "Arabidopsis_thaliana",
    "Populus_trichocarpa",
);
my $array_size = @species_array;

# ---------------------------------------------------------------------------
# Load the cluster rows for the selected species
# ---------------------------------------------------------------------------

# One placeholder per species, so the query follows @species_array whatever
# its length and the names are bound instead of interpolated into the SQL.
my $placeholders = join ', ', ('?') x @species_array;

my $sth = $dbh->prepare(
    "select super_id, species, gene from super_clust "
  . "where `species` in ($placeholders)"
) or die "Could not prepare the super_clust query: " . $dbh->errstr . "\n";

$sth->execute(@species_array)
    or die "Could not query super_clust: " . $sth->errstr . "\n";

# %super_hash: super_id => { species => { name => 1, ... }, genes => [ ... ] }
# %gene_hash : gene     => { id => super_id, species => name }
my %super_hash;
my %gene_hash;

while ( my ( $super_id, $species, $gene ) = $sth->fetchrow_array() ) {

    # A gene must not appear in more than one row / cluster.
    die "gene found with multiple id's\t$gene" if exists $gene_hash{$gene};

    $gene_hash{$gene} = { id => $super_id, species => $species };

    $super_hash{$super_id}{species}{$species} = 1;
    push @{ $super_hash{$super_id}{genes} }, $gene;
}

# ---------------------------------------------------------------------------
# Assign every cluster to its Venn set
# ---------------------------------------------------------------------------

# %clust_hash: super_id      => Venn set name, e.g. "Vitis_vinifera\tPopulus_trichocarpa"
# %count_hash: Venn set name => { ids => [ super_id, ... ], genes => [ gene, ... ] }
my %clust_hash;
my %count_hash;

for my $super_id ( sort keys %super_hash ) {
    my $present = $super_hash{$super_id}{species};

    # Exact name lookup; walking @species_array keeps the set name in a fixed
    # order regardless of the order the rows came back in.
    my $venn_set = join "\t", grep { $present->{$_} } @species_array;

    # A cluster in which none of the listed species is recognised belongs to
    # no Venn set and is left out of the counts.
    if ( $venn_set eq '' ) {
        warn "cluster $super_id matches none of the listed species, skipped\n";
        next;
    }

    $clust_hash{$super_id} = $venn_set;
    push @{ $count_hash{$venn_set}{ids} },   $super_id;
    push @{ $count_hash{$venn_set}{genes} }, @{ $super_hash{$super_id}{genes} };
}

# ---------------------------------------------------------------------------
# Report each Venn set and write its gene file
# ---------------------------------------------------------------------------

print "\n\n";
for my $venn_set ( sort keys %count_hash ) {
    my @id_array   = @{ $count_hash{$venn_set}{ids} };
    my @gene_array = @{ $count_hash{$venn_set}{genes} };

    print join( "\t", $venn_set, scalar @id_array, scalar @gene_array ), "\n";

    # Tabs are awkward in file names, so use "__" between species instead.
    ( my $file_suffix = $venn_set ) =~ s/\t/__/g;
    my $file_name = "genes_$file_suffix";

    open( my $out_fh, '>', $file_name )
        or die "Cannot open $file_name for writing: $!\n";
    for my $gene (@gene_array) {
        print {$out_fh}
            "$gene\t$gene_hash{$gene}{species}\t$gene_hash{$gene}{id}\n";
    }
    # Buffered write errors only surface at close, so check it.
    close($out_fh)
        or die "Cannot close $file_name: $!\n";
}

# ---------------------------------------------------------------------------
# Totals straight from the database
# ---------------------------------------------------------------------------

# "=" rather than LIKE: "_" is a single-character wildcard in LIKE and every
# species name contains one; "=" also matches the comparison used to load the
# rows above.
my $sth_get_species_cluster_count = $dbh->prepare(
    "select count(distinct super_id) from super_clust where species = ?"
) or die "Could not prepare the cluster count query: " . $dbh->errstr . "\n";

my $sth_get_species_gene_count = $dbh->prepare(
    "select count(gene) from super_clust where species = ?"
) or die "Could not prepare the gene count query: " . $dbh->errstr . "\n";

for my $species (@species_array) {

    # selectrow_array accepts an already prepared handle, returns the single
    # row in list context and finishes the handle afterwards.
    my ($species_count) =
        $dbh->selectrow_array( $sth_get_species_cluster_count, undef, $species );
    my ($species_gene_count) =
        $dbh->selectrow_array( $sth_get_species_gene_count, undef, $species );

    print "\nFor species $species, there were $species_count unique clusters with $species_gene_count genes\n";
}

my ($tot_count) = $dbh->selectrow_array(
    "select count(distinct super_id) from super_clust"
);
my ($tot_species_count) = $dbh->selectrow_array(
    "select count(distinct species) from super_clust"
);
print "\nTotal number of clusters across all species was $tot_count across $tot_species_count species\n";

my $tot_count_clusters = keys %clust_hash;
print "\nTotal number of clusters among the $array_size species = $tot_count_clusters\n";

$dbh->disconnect;