#!/usr/bin/perl

###############################################################
# Justin Elser  (elserj@science.oregonstate.edu)              #
# Parses the assoc files for any taxon ids                    #
# used in planteome assoc files.  Uses that to                #
# generate small (slim) subset of the ncbi taxonomy           #
# dat file.  From there, uses the owltools                    #
# ncbi2owl.jar program to generate OWL file                   #
#                                                             #
###############################################################
#
# Provenance: commit 156b1451c71990a800bb3bdef567a7d05131cb20
#   Author:  elserj
#   Date:    Fri, 22 May 2015 21:32:36 +0000
#   Subject: Added script that parses NCBI taxonomy.dat file and creates
#            slim OWL file, to be used by planteome.  (svn revision 629)
#   Path:    interactome_scripts/planteome_ncbi_taxon_slimmer.pl
#
# Usage:
#   planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file
#
# Inputs:
#   - every *.assoc file in the current working directory (scanned for
#     "taxon:<digits>" tokens)
#   - taxonomy.dat_file: the full NCBI taxonomy flat file
# Outputs:
#   - taxonomy_slim_file: only the taxonomy.dat entries whose ID was seen
#   - output_OWL_file: produced by running ncbi2owl.jar on the slim file
#
# Environment:
#   NCBI2OWL_JAR  optional; overrides the default location of ncbi2owl.jar.

use strict;
use warnings;

# check for arguments and explain usage
if (@ARGV != 3) {
    print STDERR "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
    exit 1;
}

my ($taxon_file, $slim_file, $output_file) = @ARGV;

# Location of the converter jar; the historical hard-coded path remains the
# default so existing invocations keep working.
my $ncbi_owltools_path =
    (defined($ENV{NCBI2OWL_JAR}) && length($ENV{NCBI2OWL_JAR}))
    ? $ENV{NCBI2OWL_JAR}
    : "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";

# The assoc files to be parsed: all *.assoc files in the current directory.
my @files = glob("*.assoc");
if (!@files) {
    warn "Warning: no *.assoc files found in the current directory; only taxon id 1 will be kept\n";
}

my $taxon_hash = collect_taxon_ids(@files);
write_slim_taxonomy($taxon_file, $slim_file, $taxon_hash);

# The slim file is fully written and closed at this point, so the converter
# sees the complete data.  List-form system() avoids passing the file names
# through a shell.
my @command = ('java', '-Xmx6G', '-jar', $ncbi_owltools_path, $slim_file, $output_file);
if (system(@command) != 0) {
    if ($? == -1) {
        die "Error: could not run '@command': $!\n";
    }
    die "Error: '@command' failed with exit status " . ($? >> 8) . "\n";
}

# Scan the given assoc files and return a hash ref whose keys are every
# taxon id found in a "taxon:<digits>" token.  Taxon id 1 is always
# included (the original author believed it is required by the converter).
# NOTE(review): parent taxa of the collected ids are not added here.
sub collect_taxon_ids {
    my @assoc_files = @_;

    my %seen = (1 => 1);

    foreach my $input_file (@assoc_files) {
        open(my $in_fh, '<', $input_file)
            or die "Error: file '$input_file' can not be opened: $!\n";

        while (my $line = <$in_fh>) {
            # /g so that every taxon id on the line is recorded, not just
            # the first (e.g. "taxon:A|taxon:B").
            while ($line =~ /taxon:(\d+)/g) {
                $seen{$1} = 1;
            }
        }

        close($in_fh);
    }

    return \%seen;
}

# Copy from $full_path to $slim_path only those taxonomy.dat entries whose
# "ID : <digits>" line names a taxon id present in %$wanted.  An entry runs
# from its ID line through the terminating "//" line (inclusive).
sub write_slim_taxonomy {
    my ($full_path, $slim_path, $wanted) = @_;

    open(my $full_fh, '<', $full_path)
        or die "Error: file '$full_path' can not be opened: $!\n";
    open(my $slim_fh, '>', $slim_path)
        or die "Error: file '$slim_path' can not be opened for writing: $!\n";

    my $keep_section = 0;
    while (my $line = <$full_fh>) {
        chomp $line;

        # Every ID line decides afresh whether the entry is kept, so a
        # missing "//" terminator cannot leak an unwanted entry.
        if ($line =~ /^ID\s+:\s(\d+)/) {
            $keep_section = exists $wanted->{$1} ? 1 : 0;
        }

        print {$slim_fh} "$line\n" if $keep_section;

        # "//" ends the current entry; it is printed above when kept.
        $keep_section = 0 if $line =~ m{^//};
    }

    close($full_fh);
    # Buffered write errors surface at close, so check it.
    close($slim_fh)
        or die "Error: file '$slim_path' could not be written completely: $!\n";

    return;
}