From f8ea221d98d6adb876cfd671d8f7cfa13f1818d6 Mon Sep 17 00:00:00 2001 From: elserj Date: Fri, 29 Oct 2010 22:33:22 +0000 Subject: [PATCH] Script to find some stats about a species supercluster data svn path=/; revision=81 --- interactome_scripts/species_db_query.pl | 96 +++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100755 interactome_scripts/species_db_query.pl diff --git a/interactome_scripts/species_db_query.pl b/interactome_scripts/species_db_query.pl new file mode 100755 index 0000000..0b93ea0 --- /dev/null +++ b/interactome_scripts/species_db_query.pl @@ -0,0 +1,96 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +####################################################### +# species_db_query.pl # +# Written by Justin Elser 10/29/10 # +# # +# script takes in a fasta file, then queries the # +# file and super_cluster db table for stats # +# # +# 10/29/10 -- initial version 0.1 # +# # +####################################################### + +if($#ARGV != 0) { + print "usage: species_db_query.pl fasta_file\n"; + exit; +} + +if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") { + require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl"; + use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts"; +}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") { + require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl"; + use lib "$ENV{HOME}/jaiswallab_svn/interactome_scripts"; +} + +use DbiFloret; + +my $dbh = DbiFloret::dbconnect; + +my $in_file = $ARGV[0]; + +my $species = find_species($in_file); + +# find total number of genes in fasta file +my $gene_tot_count = qx(cat $in_file | grep ">" | wc -l); +chomp $gene_tot_count; + +# find number of unique genes in super_clust table +my $sth_uniq_genes=$dbh->prepare("SELECT DISTINCT `gene` + FROM `super_clust` + WHERE species LIKE ?"); +my $rv1 = $sth_uniq_genes->execute($species); +if (!$rv1) { + next; +} +my $uniq_genes = 0; +while (my @line = $sth_uniq_genes->fetchrow_array()) { + $uniq_genes++; +} + +# find number of clusters with at least one gene from the species +my $sth_species_clusters=$dbh->prepare("select distinct `super_id` + from `super_clust` + where species like ?"); +my $rv2 = $sth_species_clusters->execute($species); +next if (!$rv2); +my $uniq_clusters = 0; +while (my @line = $sth_species_clusters->fetchrow_array()) { + $uniq_clusters++; +} + + +# find number of clusters that have paralogs (of the species) +# need to put all entries for that species in a hash to then parse through +my %super_hash; + +my $tot_paralogs = 0; + +my $sth_get_all_info=$dbh->prepare("select `super_id`,`gene` from `super_clust` where species like ?"); + +my $rv3 = $sth_get_all_info->execute($species); +next if (!$rv3); +while (my @line = $sth_get_all_info->fetchrow_array()) { + my ($super_id, $gene) = @line; + if(defined($super_hash{$super_id})) { + $super_hash{$super_id} = "$super_hash{$super_id}" . "XXXXX" . "$gene"; + $tot_paralogs++; + }else{ + $super_hash{$super_id} = $gene; + } +} + +my $clusters_with_paralogs = 0; +foreach my $super_id (keys %super_hash) { + if($super_hash{$super_id} =~ /XXXXX/) { + $clusters_with_paralogs++; + # need to add 1 to tot_paralogs for each cluster so that the first one is counted + $tot_paralogs++; + } +} + +print "Species = $species\ngene count in fasta = $gene_tot_count\nunique genes in superclusters = $uniq_genes\nunique clusters = $uniq_clusters\nnumber of clusters with paralogs = $clusters_with_paralogs\ntotal number of paralogs = $tot_paralogs\n"; -- 2.34.1