From: elserj
Date: Thu, 28 May 2015 00:08:52 +0000 (+0000)
Subject: Added section to parse OBO files in addition to assoc files for taxon IDs
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=d5cafb3e35eef4554eb39497b9c30c5a65c4f5aa;p=old-jaiswallab-svn%2F.git

Added section to parse OBO files in addition to assoc files for taxon IDs

svn path=/; revision=630
---

diff --git a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
index a74e3ee..630af2c 100755
--- a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
+++ b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
@@ -15,27 +15,29 @@ use strict;
 use warnings;
 # check for arguments and explain usage
-if ($#ARGV !=2) {
-	print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+if ($#ARGV !=4) {
+	print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file assoc_file_directory obo_file_directory taxonomy_slim_file output_OWL_file\n";
 	exit;
 }
 my $taxon_file = $ARGV[0];
-my $slim_file = $ARGV[1];
-my $output_file = $ARGV[2];
+my $assoc_dir = $ARGV[1];
+my $obo_dir = $ARGV[2];
+my $slim_file = $ARGV[3];
+my $output_file = $ARGV[4];
 my $ncbi_owltools_path = "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
-# open the assoc files to be parsed, assume this is all sqltable* files in directory
+# open the assoc files to be parsed
-my @files = glob("*.assoc");
+my @assoc_files = glob("$assoc_dir/*.assoc");
 my %taxon_hash;
 # taxon id 1 is required, I think
 $taxon_hash{1} = 1;
 # get the relevant taxon ids
-foreach my $input_file (@files) {
+foreach my $input_file (@assoc_files) {
 	open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
 	while(<INFILE>){
 		my $line = $_;
@@ -51,6 +53,25 @@ foreach my $input_file (@files) {
 	close(INFILE);
 }
+# open the obo files to be parsed to see if there are more taxon ids
+my @obo_files = glob("$obo_dir/*.obo");
+
+foreach my $input_file (@obo_files) {
+	open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
+	while(<INFILE>){
+		my $line = $_;
+		chomp $line;
+
+		if($line =~ /Taxonomy_ID:(\d+)/) {
+			my $taxon = $1;
+			if(!defined($taxon_hash{$taxon})){
+				$taxon_hash{$taxon} = $taxon;
+			}
+		}
+	}
+	close(INFILE);
+}
+
 # Read in the full taxon file from ncbi and output only the entries that have matching taxon ids