From: elserj <elserj@localhost>
Date: Thu, 28 May 2015 00:08:52 +0000 (+0000)
Subject: Added section to parse OBO files in addition to assoc files for taxon IDs
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=d5cafb3e35eef4554eb39497b9c30c5a65c4f5aa;p=old-jaiswallab-svn%2F.git

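Added section to parse OBO files in addition to assoc files for taxon IDs

The script now takes five arguments instead of three: the assoc file
directory and the OBO file directory are passed explicitly as arguments 2
and 3, which moves the taxonomy slim file and the output OWL file to
positions 4 and 5. Assoc files are globbed as *.assoc inside the given
directory rather than the current directory, and every *.obo file in the
OBO directory is scanned for "Taxonomy_ID:<digits>" to collect additional
taxon IDs.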

svn path=/; revision=630
---

diff --git a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
index a74e3ee..630af2c 100755
--- a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
+++ b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
@@ -15,27 +15,29 @@ use strict;
 use warnings;
 
 # check for arguments and explain usage
-if ($#ARGV !=2) {
-	print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+if ($#ARGV !=4) {
+	print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file assoc_file_directory obo_file_directory taxonomy_slim_file output_OWL_file\n";
 	exit;
 }
 
 my $taxon_file = $ARGV[0];
-my $slim_file = $ARGV[1];
-my $output_file = $ARGV[2];
+my $assoc_dir = $ARGV[1];
+my $obo_dir = $ARGV[2];
+my $slim_file = $ARGV[3];
+my $output_file = $ARGV[4];
 
 my $ncbi_owltools_path = "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
 
-# open the assoc files to be parsed, assume this is all sqltable* files in directory
+# open the assoc files to be parsed
 
-my @files = glob("*.assoc");
+my @assoc_files = glob("$assoc_dir/*.assoc");
 
 my %taxon_hash;
 # taxon id 1 is required, I think
 $taxon_hash{1} = 1;
 
 # get the relevant taxon ids
-foreach my $input_file (@files) {
+foreach my $input_file (@assoc_files) {
 	open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
 	while(<INFILE>){
 			my $line = $_;
@@ -51,6 +53,25 @@ foreach my $input_file (@files) {
 	close(INFILE);
 }
 
+# open the obo files to be parsed to see if there are more taxon ids
+my @obo_files = glob("$obo_dir/*.obo");
+
+foreach my $input_file (@obo_files) {
+	open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
+	while(<INFILE>){
+			my $line = $_;
+			chomp $line;
+			
+			if($line =~ /Taxonomy_ID:(\d+)/) {
+					my $taxon = $1;
+					if(!defined($taxon_hash{$taxon})){
+							$taxon_hash{$taxon} = $taxon;
+					}
+			}
+	}
+	close(INFILE);
+}
+
 
 
 # Read in the full taxon file from ncbi and output only the entries that have matching taxon ids