From f8ea221d98d6adb876cfd671d8f7cfa13f1818d6 Mon Sep 17 00:00:00 2001
From: elserj <elserj@localhost>
Date: Fri, 29 Oct 2010 22:33:22 +0000
Subject: [PATCH] Script to find some stats about a species supercluster data

svn path=/; revision=81
---
 interactome_scripts/species_db_query.pl | 96 +++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100755 interactome_scripts/species_db_query.pl

diff --git a/interactome_scripts/species_db_query.pl b/interactome_scripts/species_db_query.pl
new file mode 100755
index 0000000..0b93ea0
--- /dev/null
+++ b/interactome_scripts/species_db_query.pl
@@ -0,0 +1,96 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+#######################################################
+#  species_db_query.pl                                #
+# Written by Justin Elser 10/29/10                    #
+#                                                     #
+#  script takes in a fasta file, then queries the     #
+#    file and super_cluster db table for stats        #
+#                                                     #
+#  10/29/10 -- initial version 0.1                    #
+#                                                     #
+#######################################################
+
+if($#ARGV != 0) {
+	print "usage: species_db_query.pl fasta_file\n";
+	exit;
+}
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+	use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+	use lib "$ENV{HOME}/jaiswallab_svn/interactome_scripts";
+}
+
+use DbiFloret;
+
+my $dbh = DbiFloret::dbconnect;
+
+my $in_file = $ARGV[0];
+
+my $species = find_species($in_file);
+
+# find total number of genes in fasta file
+my $gene_tot_count = qx(cat $in_file | grep ">" | wc -l);
+chomp $gene_tot_count;
+
+# find number of unique genes in super_clust table
+my $sth_uniq_genes=$dbh->prepare("SELECT DISTINCT  `gene` 
+			 FROM  `super_clust` 
+			 WHERE species LIKE  ?");
+my $rv1 = $sth_uniq_genes->execute($species);
+if (!$rv1) {
+	next;
+}
+my $uniq_genes = 0;
+while (my @line = $sth_uniq_genes->fetchrow_array()) {
+	$uniq_genes++;
+}
+
+# find number of clusters with at least one gene from the species
+my $sth_species_clusters=$dbh->prepare("select distinct `super_id`
+					from `super_clust`
+					where species like ?");
+my $rv2 = $sth_species_clusters->execute($species);
+next if (!$rv2);
+my $uniq_clusters = 0;
+while (my @line = $sth_species_clusters->fetchrow_array()) {
+	$uniq_clusters++;
+}
+
+
+# find number of clusters that have paralogs (of the species)
+#   need to put all entries for that species in a hash to then parse through
+my %super_hash;
+
+my $tot_paralogs = 0;
+
+my $sth_get_all_info=$dbh->prepare("select `super_id`,`gene` from `super_clust` where species like ?");
+
+my $rv3 = $sth_get_all_info->execute($species);
+next if (!$rv3);
+while (my @line = $sth_get_all_info->fetchrow_array()) {
+	my ($super_id, $gene) = @line;
+	if(defined($super_hash{$super_id})) {
+		$super_hash{$super_id} = "$super_hash{$super_id}" . "XXXXX" . "$gene";
+		$tot_paralogs++;
+	}else{
+		$super_hash{$super_id} = $gene;
+	}
+}
+
+my $clusters_with_paralogs = 0;
+foreach my $super_id (keys %super_hash) {
+	if($super_hash{$super_id} =~ /XXXXX/) {
+		$clusters_with_paralogs++;
+		# need to add 1 to tot_paralogs for each cluster so that the first one is counted
+		$tot_paralogs++;
+	}
+}
+
+print "Species = $species\ngene count in fasta = $gene_tot_count\nunique genes in superclusters = $uniq_genes\nunique clusters = $uniq_clusters\nnumber of clusters with paralogs = $clusters_with_paralogs\ntotal number of paralogs = $tot_paralogs\n";
-- 
2.34.1