--- /dev/null
+#!/usr/bin/perl
+
+###############################################################
+# Justin Elser (elserj@science.oregonstate.edu) #
+# Parses the assoc files for any taxon ids #
+# used in planteome assoc files. Uses that to #
+# generate small (slim) subset of the ncbi taxonomy #
+# dat file. From there, uses the owltools #
+# ncbi2owl.jar program to generate OWL file #
+# #
+###############################################################
+
+
+use strict;
+use warnings;
+
+# Require exactly three positional arguments; otherwise explain usage on
+# STDERR and exit non-zero so that calling scripts can detect the failure.
+if (@ARGV != 3) {
+    print STDERR "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+    exit 1;
+}
+
+# Positional arguments, in order: the full NCBI taxonomy.dat input, the slim
+# .dat file to write, and the OWL file to generate.
+my ($taxon_file, $slim_file, $output_file) = @ARGV;
+
+# Location of the OWLTools ncbi2owl converter jar (site-specific path).
+my $ncbi_owltools_path = "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
+
+# Collect every taxon id referenced by the association files, which are
+# assumed to be all *.assoc files in the current working directory.
+my @files = glob("*.assoc");
+
+# Keys are the taxon ids whose entries should be kept in the slim file.
+my %taxon_hash;
+# taxon id 1 is required, I think
+$taxon_hash{1} = 1;
+
+# get the relevant taxon ids
+foreach my $input_file (@files) {
+    # Three-argument open: the file name is never interpreted as a mode,
+    # so names beginning with '>' or '|' cannot clobber files or run commands.
+    open(my $in_fh, '<', $input_file)
+        or die "Error: file '$input_file' can not be opened: $!\n";
+    while (my $line = <$in_fh>) {
+        # A single line may carry more than one id (for example a
+        # pipe-separated "taxon:A|taxon:B" field), so record every match
+        # instead of only the first one.
+        while ($line =~ /taxon:(\d+)/g) {
+            $taxon_hash{$1} = $1;
+        }
+    }
+    close($in_fh);
+}
+
+
+
+# Read in the full taxon file from ncbi and output only the entries that have matching taxon ids
+open(my $taxon_fh, '<', $taxon_file)
+    or die "Error: file '$taxon_file' can not be opened: $!\n";
+open(my $slim_fh, '>', $slim_file)
+    or die "Error: file '$slim_file' can not be opened for writing: $!\n";
+
+# True while the lines of the current entry are being copied to the slim file.
+my $keep_section = 0;
+while (my $line = <$taxon_fh>) {
+    chomp $line;
+
+    # An "ID : <number>" line whose id was collected switches copying on.
+    if ($line =~ /^ID\s+:\s(\d+)/ && defined($taxon_hash{$1})) {
+        $keep_section = 1;
+    }
+
+    if ($keep_section) {
+        print {$slim_fh} "$line\n";
+    }
+
+    # A line starting with "//" ends the entry; it is copied (above) and
+    # then copying stops until the next wanted ID line.
+    if ($line =~ /^\/\//) {
+        $keep_section = 0;
+    }
+}
+close($taxon_fh);
+
+# Buffered write errors (e.g. a full disk) only surface at close, and the
+# slim file has to be complete on disk before anything else reads it.
+close($slim_fh)
+    or die "Error: file '$slim_file' can not be closed: $!\n";
+
+
+# Convert the slim taxonomy file to OWL with the ncbi2owl jar. The list form
+# of system() bypasses the shell, so paths containing spaces or shell
+# metacharacters are passed to java unchanged.
+my @ncbi2owl_cmd = ("java", "-Xmx6G", "-jar", $ncbi_owltools_path, $slim_file, $output_file);
+
+# system() returns 0 only when the program ran and exited successfully;
+# anything else (launch failure, non-zero exit, signal) is a fatal error.
+system(@ncbi2owl_cmd) == 0
+    or die "Error: ncbi2owl conversion failed (system() status $?)\n";
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+