From: elserj <elserj@localhost>
Date: Tue, 16 Dec 2014 23:33:24 +0000 (+0000)
Subject: Generalize species input so that script doesn't have to be edited each time
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=8f438e289e5536b6cbdae42807156309e5dd9155;p=old-jaiswallab-svn%2F.git

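Generalize species input so that the script doesn't have to be edited for each run: the two species are now taken from the command line (usage: find_ortho_inpara.pl species_1 species_2 input_gene_list output_file). This commit also raises the paralog similarity cutoff from 0.0 to 0.1, stops appending the ".1" suffix to input gene names, matches genes with LIKE instead of "=", and adds the cluster id as the first output column.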

svn path=/; revision=592
---

diff --git a/interactome_scripts/find_ortho_inpara.pl b/interactome_scripts/find_ortho_inpara.pl
index a7876e4..600cd27 100755
--- a/interactome_scripts/find_ortho_inpara.pl
+++ b/interactome_scripts/find_ortho_inpara.pl
@@ -12,8 +12,8 @@
 use strict;
 use warnings;
 
-if($#ARGV != 1) {
-	print "usage: find_ortho_inpara.pl input_gene_list output_file_prefix\n";
+if($#ARGV != 3) {
+	print "usage: find_ortho_inpara.pl species_1 species_2 input_gene_list output_file\n";
 	exit;
 }
 
@@ -23,16 +23,17 @@ use DbiFloret;
 
 my $dbh = DbiFloret::dbconnect;
 
-my @species_array = ("Ath", "Oryza_sativa", "Sorghum", "Maize", "Glycine", "Brachy", "Poplar");
+my @species_array = ($ARGV[0], $ARGV[1]);
+#my @species_array = ("Arabidopsis_thaliana", "Zea_mays");
 #my @species_array = ("Ath", "Maize");
 my $spec_array_size = @species_array;
 
 # similarity score to determine if we should keep the paralogs.
 #  0.0 would keep all paralogs, 1.0 would restrict to strict orthologs
-my $sim_score = 0.0;
+my $sim_score = 0.1;
 
 # read in list of genes from csv file given as argument
-my $in_file = $ARGV[0];
+my $in_file = $ARGV[2];
 
 open (in_file, "$in_file");
 
@@ -43,7 +44,7 @@ while(<in_file>) {
 	chomp $in_gene;	
 	$in_gene =~ s/\s//g;
 	if ($in_gene !~ /\.\d$/) {
-		$in_gene .= ".1"; # add the suffix back so that the gene matches the db
+		#$in_gene .= ".1"; # add the suffix back so that the gene matches the db
 	}
 	push(@in_gene_array, $in_gene);
 	#print "$in_gene\n";
@@ -51,20 +52,23 @@ while(<in_file>) {
 }
 close(in_file);
 
+my %ortho_hash;
 
 for (my $i = 1; $i<$spec_array_size; $i++) {
 	
-	my $out_file = $ARGV[1] . "_$species_array[$i].txt";
+	my $out_file = $ARGV[3]; # . "_$species_array[3].txt";
 	open(out_file, ">$out_file");
 	
 	my $table = $species_array[0] . "_" . $species_array[$i];
-	print out_file "gene\tortho_gene\tscore\n";
+	print out_file "id\tgene\tortho_gene\tscore\n";
+
+	my $safe_table = $dbh->quote_identifier($table);
 	
 	# set up the db query statement
-	my $sth_get_id = $dbh->prepare("select id from $table where gene = ?");
+	my $sth_get_id = $dbh->prepare("select id from $safe_table where gene like ?");
 	
 	# set up db query statement to get the orthologs using the cluster id
-	my $sth_get_ortho = $dbh->prepare("select gene, score from $table where id = ? and species = '$species_array[$i]' and score >= '$sim_score'");
+	my $sth_get_ortho = $dbh->prepare("select gene, score from $safe_table where id = ? and species = '$species_array[1]' and score >= '$sim_score'");
 	
 	foreach my $gene (@in_gene_array) {
 		
@@ -74,6 +78,7 @@ for (my $i = 1; $i<$spec_array_size; $i++) {
 		}
 		
 		while (my $id = $sth_get_id->fetchrow_array()) {
+			#print "$id\n";
 			my $rv2 = $sth_get_ortho->execute($id);
 			if (!$rv2) {
 				next;
@@ -81,11 +86,19 @@ for (my $i = 1; $i<$spec_array_size; $i++) {
 			
 			while (my @line = $sth_get_ortho->fetchrow_array()) {
 				my ($ortho,$score) = @line;
-				print out_file "$gene\t$ortho\t$score\n";
+				print out_file "$id\t$gene\t$ortho\t$score\n";
+				if(defined($ortho_hash{$gene})) {
+						$ortho_hash{$gene} = "$ortho_hash{$gene}\t$ortho";
+				}else{
+						$ortho_hash{$gene} = $ortho;
+				}
 			}
 		}
 	}
-	close(out_file);
 }
 
+#foreach my $key (keys %ortho_hash) {
+#		print out_file "$key\t$ortho_hash{$key}\n";
+#}
 
+close(out_file);