From 402ebc00b2ff09b7ff9f6e74dfd728a9089ed589 Mon Sep 17 00:00:00 2001
From: elserj
Date: Wed, 23 Sep 2015 22:29:02 +0000
Subject: [PATCH] Script that will generate the tabulated cluster gene counts from InParanoid. To be used with cafe instead of using MPIBlast/MCL to calculate.

svn path=/; revision=634
---
 interactome_scripts/cafe_from_inparanoid.pl | 100 ++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100755 interactome_scripts/cafe_from_inparanoid.pl

diff --git a/interactome_scripts/cafe_from_inparanoid.pl b/interactome_scripts/cafe_from_inparanoid.pl
new file mode 100755
index 0000000..5c2ab45
--- /dev/null
+++ b/interactome_scripts/cafe_from_inparanoid.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+# cafe_from_inparanoid.pl
+#
+# Generates the tab-delimited table of per-species gene counts for each
+# InParanoid super-cluster, for use as CAFE input (instead of deriving the
+# clusters with MPIBlast/MCL).
+#
+# usage: cafe_from_inparanoid.pl species_list output_file
+#   species_list  text file with one species name per line
+#   output_file   table to write (overwritten if it exists)
+#
+# Output: a header row "Description<TAB>ID<TAB><species...>", then one row per
+# super_id holding the super_id (in both leading columns) and the gene count
+# for every listed species, 0 where the cluster has none.  Clusters in which
+# exactly one listed species is represented are left out.
+
+use strict;
+use warnings;
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+# Pull in find_species.pl from whichever checkout location exists; when neither
+# is present the script carries on without it.
+if (-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+    require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}
+elsif (-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+    require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+}
+
+if (@ARGV != 2) {
+    print STDERR "usage: cafe_from_inparanoid.pl species_list output_file\n";
+    exit 1;
+}
+
+my ($in_file, $out_file) = @ARGV;
+
+# Read the species to report on, one name per line.
+my @species_list;
+
+open(my $in_fh, '<', $in_file)
+    or die "Cannot open species list '$in_file': $!\n";
+while (my $line = <$in_fh>) {
+    chomp $line;
+    $line =~ s/\r$//;            # tolerate CRLF line endings
+    next if $line =~ /^\s*$/;    # a blank line would become an empty-named column
+    push @species_list, $line;
+}
+close($in_fh);
+
+# "in ()" is not valid SQL, so stop before querying with nothing to look up.
+die "No species names found in '$in_file'\n" if !@species_list;
+
+# NOTE(review): the prepare/execute/errstr calls below assume dbconnect returns
+# a DBI database handle -- confirm against DbiFloret.
+my $dbh = DbiFloret::dbconnect();
+
+# One placeholder per species: the names are bound by the driver rather than
+# pasted into the SQL text, so quotes in a name cannot break the query.
+my $placeholders = join(',', ('?') x @species_list);
+
+# set up the mysql query
+# query will only return values where species is from the list supplied
+my $sth = $dbh->prepare("select super_id,species, count(gene) from super_clust group by super_id,species having species in ($placeholders) order by super_id asc;")
+    or die "Cannot prepare cluster query: " . $dbh->errstr . "\n";
+
+$sth->execute(@species_list)
+    or die "Cluster query failed: " . $sth->errstr . "\n";
+
+# hash of hashes: outer key is super_id, inner key is species, value is the
+# gene count
+my %cluster_hash;
+
+while (my ($super_id, $species, $count) = $sth->fetchrow_array()) {
+    $cluster_hash{$super_id}{$species} = $count;
+}
+
+open(my $out_fh, '>', $out_file)
+    or die "Cannot open output file '$out_file': $!\n";
+
+print {$out_fh} join("\t", 'Description', 'ID', @species_list), "\n";
+
+for my $super_id (sort { $a <=> $b } keys %cluster_hash) {
+    # Counts in species_list order; species absent from the cluster count 0.
+    my @counts = map {
+        defined $cluster_hash{$super_id}{$_} ? $cluster_hash{$super_id}{$_} : 0
+    } @species_list;
+
+    # drop any lines (clusters) that only have one species in them
+    my $species_present = grep { $_ != 0 } @counts;
+    next if $species_present == 1;
+
+    print {$out_fh} join("\t", $super_id, $super_id, @counts), "\n";
+}
+
+# Buffered write errors only surface at close, so check it.
+close($out_fh) or die "Cannot finish writing '$out_file': $!\n";
-- 
2.34.1