Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
Script to find some stats about a species supercluster data
authorelserj <elserj@localhost>
Fri, 29 Oct 2010 22:33:22 +0000 (22:33 +0000)
committerelserj <elserj@localhost>
Fri, 29 Oct 2010 22:33:22 +0000 (22:33 +0000)
svn path=/; revision=81

interactome_scripts/species_db_query.pl [new file with mode: 0755]

diff --git a/interactome_scripts/species_db_query.pl b/interactome_scripts/species_db_query.pl
new file mode 100755 (executable)
index 0000000..0b93ea0
--- /dev/null
@@ -0,0 +1,96 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+#######################################################
+#  species_db_query.pl                                #
+# Written by Justin Elser 10/29/10                    #
+#                                                     #
+#  script takes in a fasta file, then queries the     #
+#    file and super_cluster db table for stats        #
+#                                                     #
+#  10/29/10 -- initial version 0.1                    #
+#                                                     #
+#######################################################
+
+if($#ARGV != 0) {
+       print "usage: species_db_query.pl fasta_file\n";
+       exit;
+}
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+       require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+       use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+       require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+       use lib "$ENV{HOME}/jaiswallab_svn/interactome_scripts";
+}
+
+use DbiFloret;
+
+my $dbh = DbiFloret::dbconnect;
+
+my $in_file = $ARGV[0];
+
+my $species = find_species($in_file);
+
+# find total number of genes in fasta file
+my $gene_tot_count = qx(cat $in_file | grep ">" | wc -l);
+chomp $gene_tot_count;
+
+# find number of unique genes in super_clust table
+my $sth_uniq_genes=$dbh->prepare("SELECT DISTINCT  `gene` 
+                        FROM  `super_clust` 
+                        WHERE species LIKE  ?");
+my $rv1 = $sth_uniq_genes->execute($species);
+if (!$rv1) {
+       next;
+}
+my $uniq_genes = 0;
+while (my @line = $sth_uniq_genes->fetchrow_array()) {
+       $uniq_genes++;
+}
+
+# find number of clusters with at least one gene from the species
+my $sth_species_clusters=$dbh->prepare("select distinct `super_id`
+                                       from `super_clust`
+                                       where species like ?");
+my $rv2 = $sth_species_clusters->execute($species);
+next if (!$rv2);
+my $uniq_clusters = 0;
+while (my @line = $sth_species_clusters->fetchrow_array()) {
+       $uniq_clusters++;
+}
+
+
+# find number of clusters that have paralogs (of the species)
+#   need to put all entries for that species in a hash to then parse through
+my %super_hash;
+
+my $tot_paralogs = 0;
+
+my $sth_get_all_info=$dbh->prepare("select `super_id`,`gene` from `super_clust` where species like ?");
+
+my $rv3 = $sth_get_all_info->execute($species);
+next if (!$rv3);
+while (my @line = $sth_get_all_info->fetchrow_array()) {
+       my ($super_id, $gene) = @line;
+       if(defined($super_hash{$super_id})) {
+               $super_hash{$super_id} = "$super_hash{$super_id}" . "XXXXX" . "$gene";
+               $tot_paralogs++;
+       }else{
+               $super_hash{$super_id} = $gene;
+       }
+}
+
+my $clusters_with_paralogs = 0;
+foreach my $super_id (keys %super_hash) {
+       if($super_hash{$super_id} =~ /XXXXX/) {
+               $clusters_with_paralogs++;
+               # need to add 1 to tot_paralogs for each cluster so that the first one is counted
+               $tot_paralogs++;
+       }
+}
+
+print "Species = $species\ngene count in fasta = $gene_tot_count\nunique genes in superclusters = $uniq_genes\nunique clusters = $uniq_clusters\nnumber of clusters with paralogs = $clusters_with_paralogs\ntotal number of paralogs = $tot_paralogs\n";