Added section to parse OBO files in addition to assoc files for taxon IDs

author elserj <elserj@localhost>

Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)

committer elserj <elserj@localhost>

Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)
author elserj <elserj@localhost>
Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)
committer elserj <elserj@localhost>
Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)
diff --git a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl

index a74e3eeefe0b1b5ff200f923181ebc07c4f87601..630af2c9f4aaa99fd05f8cb000f1f3373686f6e0 100755 (executable)
--- a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
+++ b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
@@ -15,27 +15,29 @@ use strict;
  use warnings;
  
  # check for arguments and explain usage
-if ($#ARGV !=2) {
-       print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+if ($#ARGV !=4) {
+       print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file assoc_file_directory obo_file_directory taxonomy_slim_file output_OWL_file\n";
         exit;
  }
  
  my $taxon_file = $ARGV[0];
-my $slim_file = $ARGV[1];
-my $output_file = $ARGV[2];
+my $assoc_dir = $ARGV[1];
+my $obo_dir = $ARGV[2];
+my $slim_file = $ARGV[3];
+my $output_file = $ARGV[4];
  
  my $ncbi_owltools_path = "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
  
-# open the assoc files to be parsed, assume this is all sqltable* files in directory
+# find the assoc files to be parsed (every *.assoc file in the assoc directory)
  
-my @files = glob("*.assoc");
+my @assoc_files = glob("$assoc_dir/*.assoc");
  
  my %taxon_hash;
  # taxon id 1 is required, I think
  $taxon_hash{1} = 1;
  
  # get the relevant taxon ids
-foreach my $input_file (@files) {
+foreach my $input_file (@assoc_files) {
         open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
         while(<INFILE>){
                         my $line = $_;
@@ -51,6 +53,25 @@ foreach my $input_file (@files) {
         close(INFILE);
  }
  
+# Scan every *.obo file in the OBO directory for taxon ids (Taxonomy_ID:<digits>)
+# and add any that are not already in %taxon_hash.
+my @obo_files = glob("$obo_dir/*.obo");
+
+foreach my $input_file (@obo_files) {
+       # three-argument open with a lexical handle: the file name can never be
+       # misread as an open mode, and the handle is scoped to this loop body
+       open(my $obo_fh, '<', $input_file) || die "Error: file '$input_file' can not be opened: $!\n";
+       while (my $line = <$obo_fh>) {
+                       chomp $line;
+                       # /g in a while loop: record every id on the line, not only the first
+                       while ($line =~ /Taxonomy_ID:(\d+)/g) {
+                                       my $taxon = $1;
+                                       $taxon_hash{$taxon} = $taxon if !defined($taxon_hash{$taxon});
+                       }
+       }
+       close($obo_fh);
+}
+
  
  
  # Read in the full taxon file from ncbi and output only the entries that have matching taxon ids
author	elserj <elserj@localhost>
	Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)
committer	elserj <elserj@localhost>
	Thu, 28 May 2015 00:08:52 +0000 (00:08 +0000)