use warnings;
# check for arguments and explain usage
# NOTE(review): this text is a diff of the script, not plain Perl. Lines that
# start with '-' are the old version, lines that start with '+' are the new
# version, and unprefixed lines are common to both.
# $#ARGV is the index of the last command-line argument, so the old test
# (!= 2) requires exactly three arguments and the new test (!= 4) requires
# exactly five. On any other count the usage line is printed to STDOUT and
# the script stops via a bare exit (status 0, not an error status).
-if ($#ARGV !=2) {
- print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+if ($#ARGV !=4) {
+ print "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file assoc_file_directory obo_file_directory taxonomy_slim_file output_OWL_file\n";
exit;
}
# Positional arguments, in the order given by the usage message.
# Old layout ('-' lines): [0] taxonomy file, [1] slim file, [2] output file.
# New layout ('+' lines): two directory arguments are inserted at [1] and [2],
# which moves the slim file to [3] and the output file to [4].
my $taxon_file = $ARGV[0];
-my $slim_file = $ARGV[1];
-my $output_file = $ARGV[2];
+my $assoc_dir = $ARGV[1];
+my $obo_dir = $ARGV[2];
+my $slim_file = $ARGV[3];
+my $output_file = $ARGV[4];
# Hard-coded, machine-specific path to the ncbi2owl jar. It is not referenced
# anywhere in the visible part of the script; presumably it is used further
# down -- TODO confirm.
my $ncbi_owltools_path = "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
-# open the assoc files to be parsed, assume this is all sqltable* files in directory
+# open the assoc files to be parsed
# The old glob matched *.assoc in the current working directory; the new glob
# matches *.assoc inside the directory passed as $assoc_dir.
-my @files = glob("*.assoc");
+my @assoc_files = glob("$assoc_dir/*.assoc");
# Taxon ids collected so far, keyed by id.
my %taxon_hash;
# taxon id 1 is required, I think
$taxon_hash{1} = 1;
# get the relevant taxon ids
-foreach my $input_file (@files) {
+foreach my $input_file (@assoc_files) {
# NOTE(review): two-argument open on the bareword handle INFILE; the script
# dies with the message below if a file cannot be opened.
open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
while(<INFILE>){
my $line = $_;
# NOTE(review): the rest of this loop body is not visible here. Diff context
# appears to have been dropped between the line above and close(INFILE): the
# per-line parsing is missing, and the braces shown do not balance (foreach
# and while both open, only one closes). Consult the full script before
# changing anything in this loop.
close(INFILE);
}
+# open the obo files to be parsed to see if there are more taxon ids
# This whole section is new in this change (every line carries '+').
# It reads each *.obo file found in $obo_dir line by line. A line containing
# "Taxonomy_ID:" immediately followed by digits contributes those digits as a
# taxon id; only the first such match on a line is captured (no /g flag).
# An id whose %taxon_hash entry is already defined is left untouched; a new id
# is stored as id => id.
+my @obo_files = glob("$obo_dir/*.obo");
+
+foreach my $input_file (@obo_files) {
# NOTE(review): same two-argument open / bareword INFILE handle as the assoc
# loop; the script dies if a file cannot be opened.
+ open(INFILE, "$input_file") || die "Error: file '$input_file' can not be opened\n";
+ while(<INFILE>){
+ my $line = $_;
+ chomp $line;
+
+ if($line =~ /Taxonomy_ID:(\d+)/) {
+ my $taxon = $1;
+ if(!defined($taxon_hash{$taxon})){
+ $taxon_hash{$taxon} = $taxon;
+ }
+ }
+ }
+ close(INFILE);
+}
+
# Read in the full taxon file from ncbi and output only the entries that have matching taxon ids