From 402ebc00b2ff09b7ff9f6e74dfd728a9089ed589 Mon Sep 17 00:00:00 2001
From: elserj <elserj@localhost>
Date: Wed, 23 Sep 2015 22:29:02 +0000
Subject: [PATCH] Script that will generate the tabulated cluster gene counts
 from InParanoid.  To be used with cafe instead of using MPIBlast/MCL to
 calculate.

svn path=/; revision=634
---
 interactome_scripts/cafe_from_inparanoid.pl | 112 ++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100755 interactome_scripts/cafe_from_inparanoid.pl
diff --git a/interactome_scripts/cafe_from_inparanoid.pl b/interactome_scripts/cafe_from_inparanoid.pl
new file mode 100755
index 0000000..5c2ab45
--- /dev/null
+++ b/interactome_scripts/cafe_from_inparanoid.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+if(-e "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+}elsif(-e "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl") {
+	require "$ENV{HOME}/jaiswallab_svn/interactome_scripts/find_species.pl";
+}
+
+
+if(@ARGV != 2) {
+	# a wrong argument count is an error: report on STDERR and exit non-zero
+	print STDERR "usage: cafe_from_inparanoid.pl species_list output_file\n";
+	exit 1;
+}
+
+use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
+
+use DbiFloret;
+
+my $dbh = DbiFloret::dbconnect()
+	or die "Could not connect to the database\n";
+my $in_file = $ARGV[0];
+my $out_file = $ARGV[1];
+
+# Get list of species to check (one name per line; blank lines are skipped)
+my @species_list;
+open(my $in_fh, '<', $in_file) or die "Cannot open species list '$in_file': $!\n";
+while(my $line = <$in_fh>) {
+		chomp $line;
+		# a blank line would otherwise become an empty species name in the query
+		next if $line !~ /\S/;
+		push (@species_list, $line);
+}
+close($in_fh);
+
+
+# Build one "?" placeholder per species so that DBI binds and quotes the names
+# itself; a name containing a quote can then neither break nor alter the SQL.
+if (!@species_list) {
+	die "No species found in '$in_file'\n";
+}
+my $placeholders = join(',', ('?') x @species_list);
+
+# set up the mysql query
+# query will only return values where species is from the list supplied
+my $sth = $dbh->prepare("select super_id,species, count(gene) from super_clust group by super_id,species having species in ($placeholders) order by super_id asc;")
+	or die "Could not prepare cluster count query: " . $dbh->errstr . "\n";
+
+# Abort on a failed query: there is no enclosing loop to skip out of, and
+# without a result set there is nothing to tabulate.
+my $rv = $sth->execute(@species_list);
+if (!$rv) {
+	die "Cluster count query failed: " . $sth->errstr . "\n";
+}
+
+
+
+# Index the result rows as a hash of hashes: the outer key is the super_id,
+# the inner key is the species, and the value is that species' gene count
+my %cluster_hash;
+
+while (my ($cluster_id, $species_name, $gene_count) = $sth->fetchrow_array()) {
+	$cluster_hash{$cluster_id}{$species_name} = $gene_count;
+}
+
+# Write the tab-separated table: a header row, then one row per cluster
+# (super_id) holding the gene count of every listed species.
+# 3-arg open with a lexical handle: the file name is never parsed as a mode.
+open(my $out_fh, '>', $out_file) or die "Cannot open output file '$out_file': $!\n";
+
+print $out_fh "Description\tID";
+# Print species list to output file
+for my $species (@species_list) {
+		print $out_fh "\t$species";
+}
+print $out_fh "\n";
+
+# Loop through all super_ids
+for my $super_id (sort {$a <=> $b} keys %cluster_hash) {
+		# Loop through each species in list, collecting one count per species
+		my @counts;
+		for my $species (@species_list) {
+				my $count = $cluster_hash{$super_id}{$species};
+				# no entry means the query returned no genes for this species here
+				if(!defined($count)) {
+						$count = 0;
+				}
+				push(@counts, $count);
+		}
+
+		# drop any lines (clusters) that only have one species in them;
+		# grep in scalar context counts the species with a non-zero gene count
+		my $match_count = grep { $_ != 0 } @counts;
+		if ($match_count == 1) {
+				next;
+		}
+
+		# the super_id fills both the Description and the ID column
+		print $out_fh join("\t", $super_id, $super_id, @counts), "\n";
+}
+
+# Buffered write errors only surface at close, so check it
+close($out_fh) or die "Error writing output file '$out_file': $!\n";
+
+# All rows are written; release the database handles
+$sth->finish();
+$dbh->disconnect();
+
+
-- 
2.34.1