--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+#######################################################
+# species_db_query.pl #
+# Written by Justin Elser 10/29/10 #
+# #
+# script takes in a fasta file, then queries the #
+# file and super_cluster db table for stats #
+# #
+# 10/29/10 -- initial version 0.1 #
+# #
+#######################################################
+
+if($#ARGV != 0) {
+ print "usage: species_db_query.pl fasta_file\n";
+ exit;
+}
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+ require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+ use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+ require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+ use lib "$ENV{HOME}/jaiswallab_svn/interactome_scripts";
+}
+
+use DbiFloret;
+
+my $dbh = DbiFloret::dbconnect;
+
+my $in_file = $ARGV[0];
+
+my $species = find_species($in_file);
+
+# find total number of genes in fasta file
+my $gene_tot_count = qx(cat $in_file | grep ">" | wc -l);
+chomp $gene_tot_count;
+
+# find number of unique genes in super_clust table
+my $sth_uniq_genes=$dbh->prepare("SELECT DISTINCT `gene`
+ FROM `super_clust`
+ WHERE species LIKE ?");
+my $rv1 = $sth_uniq_genes->execute($species);
+if (!$rv1) {
+ next;
+}
+my $uniq_genes = 0;
+while (my @line = $sth_uniq_genes->fetchrow_array()) {
+ $uniq_genes++;
+}
+
+# find number of clusters with at least one gene from the species
+my $sth_species_clusters=$dbh->prepare("select distinct `super_id`
+ from `super_clust`
+ where species like ?");
+my $rv2 = $sth_species_clusters->execute($species);
+next if (!$rv2);
+my $uniq_clusters = 0;
+while (my @line = $sth_species_clusters->fetchrow_array()) {
+ $uniq_clusters++;
+}
+
+
+# find number of clusters that have paralogs (of the species)
+# need to put all entries for that species in a hash to then parse through
+my %super_hash;
+
+my $tot_paralogs = 0;
+
+my $sth_get_all_info=$dbh->prepare("select `super_id`,`gene` from `super_clust` where species like ?");
+
+my $rv3 = $sth_get_all_info->execute($species);
+next if (!$rv3);
+while (my @line = $sth_get_all_info->fetchrow_array()) {
+ my ($super_id, $gene) = @line;
+ if(defined($super_hash{$super_id})) {
+ $super_hash{$super_id} = "$super_hash{$super_id}" . "XXXXX" . "$gene";
+ $tot_paralogs++;
+ }else{
+ $super_hash{$super_id} = $gene;
+ }
+}
+
+my $clusters_with_paralogs = 0;
+foreach my $super_id (keys %super_hash) {
+ if($super_hash{$super_id} =~ /XXXXX/) {
+ $clusters_with_paralogs++;
+ # need to add 1 to tot_paralogs for each cluster so that the first one is counted
+ $tot_paralogs++;
+ }
+}
+
+print "Species = $species\ngene count in fasta = $gene_tot_count\nunique genes in superclusters = $uniq_genes\nunique clusters = $uniq_clusters\nnumber of clusters with paralogs = $clusters_with_paralogs\ntotal number of paralogs = $tot_paralogs\n";