Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Added script that parses NCBI taxonomy.dat file and creates slim OWL file, to be...
author: elserj <elserj@localhost>
Fri, 22 May 2015 21:32:36 +0000 (21:32 +0000)
committer: elserj <elserj@localhost>
Fri, 22 May 2015 21:32:36 +0000 (21:32 +0000)
svn path=/; revision=629

interactome_scripts/planteome_ncbi_taxon_slimmer.pl [new file with mode: 0755]

diff --git a/interactome_scripts/planteome_ncbi_taxon_slimmer.pl b/interactome_scripts/planteome_ncbi_taxon_slimmer.pl
new file mode 100755 (executable)
index 0000000..a74e3ee
--- /dev/null
@@ -0,0 +1,99 @@
+#!/usr/bin/perl
+
+###############################################################
+#  Justin Elser  (elserj@science.oregonstate.edu)             #
+#              Parses the assoc files for any taxon ids              #
+#              used in planteome assoc files.  Uses that to          #
+#              generate small (slim) subset of the ncbi taxonomy     #
+#              dat file.  From there, uses the owltools                          #
+#              ncbi2owl.jar program to generate OWL file             #
+#                                                             #
+###############################################################
+
+
+use strict;
+use warnings;
+
+# Check for arguments and explain usage.
+# Exactly three positional arguments are required. On misuse the usage text
+# goes to STDERR and the script exits with a non-zero status so that calling
+# shells or pipelines can detect the failure.
+if (@ARGV != 3) {
+    print STDERR "usage: planteome_ncbi_taxon_slimmer.pl taxonomy.dat_file taxonomy_slim_file output_OWL_file\n";
+    exit 1;
+}
+
+# Positional arguments:
+#   $taxon_file  - full NCBI taxonomy.dat file (read)
+#   $slim_file   - slimmed taxonomy file (written by this script)
+#   $output_file - OWL file name passed on to ncbi2owl.jar
+my $taxon_file = $ARGV[0];
+my $slim_file = $ARGV[1];
+my $output_file = $ARGV[2];
+
+# Location of the owltools ncbi2owl jar.  The NCBI2OWL_JAR environment
+# variable overrides the built-in default so the script can run on machines
+# where owltools is installed somewhere else.
+my $ncbi_owltools_path = defined $ENV{NCBI2OWL_JAR}
+    ? $ENV{NCBI2OWL_JAR}
+    : "/home/justin/owltools/OWLTools-NCBI/bin/ncbi2owl.jar";
+
+# Collect every NCBI taxon id referenced by the association files, i.e. all
+# *.assoc files in the current working directory.
+
+my @files = glob("*.assoc");
+warn "Warning: no *.assoc files found in the current directory\n" if !@files;
+
+# Keys are the taxon ids to keep; every stored value is defined, so both
+# exists() and defined() lookups work on this hash.
+my %taxon_hash;
+# taxon id 1 is required, I think
+$taxon_hash{1} = 1;
+
+# get the relevant taxon ids
+foreach my $input_file (@files) {
+    # Three-argument open with a lexical handle: the file name can never be
+    # misread as an open mode, and the handle is scoped to this iteration.
+    open(my $assoc_fh, '<', $input_file)
+        or die "Error: file '$input_file' can not be opened: $!\n";
+
+    while (my $line = <$assoc_fh>) {
+        # A line may carry several ids (e.g. "taxon:4530|taxon:9606");
+        # the /g loop records all of them, not just the first.
+        while ($line =~ /taxon:(\d+)/g) {
+            $taxon_hash{$1} = $1;
+        }
+    }
+
+    close($assoc_fh);
+}
+
+
+
+# Read in the full taxon file from ncbi and output only the entries that have
+# matching taxon ids.  A record starts at its "ID : <number>" line and ends at
+# the "//" terminator line; both lines are copied for a kept record.
+open(my $taxon_fh, '<', $taxon_file)
+    or die "Error: file '$taxon_file' can not be opened: $!\n";
+open(my $slim_fh, '>', $slim_file)
+    or die "Error: file '$slim_file' can not be opened for writing: $!\n";
+
+my $keep_section = 0;
+while (my $line = <$taxon_fh>) {
+    chomp $line;
+
+    # Every ID line decides afresh whether its record is wanted, so an
+    # unwanted record is not copied even if the previous record was missing
+    # its "//" terminator.
+    if ($line =~ /^ID\s+:\s(\d+)/) {
+        $keep_section = exists $taxon_hash{$1} ? 1 : 0;
+    }
+
+    if ($keep_section == 1) {
+        print {$slim_fh} "$line\n";
+    }
+
+    # The terminator itself has been copied above (when kept); stop copying
+    # until the next wanted ID line.
+    if ($line =~ /^\/\//) {
+        $keep_section = 0;
+    }
+}
+
+close($taxon_fh);
+# Buffered write errors only surface at close, so check it: the slim file
+# must be complete on disk before another program reads it.
+close($slim_fh)
+    or die "Error: file '$slim_file' could not be written completely: $!\n";
+                               
+                                               
+# Convert the slim taxonomy file to OWL with the owltools ncbi2owl jar.
+# The list form of system() bypasses the shell, so file names containing
+# spaces or shell metacharacters are passed through unchanged.
+my @owl_cmd = ('java', '-Xmx6G', '-jar', $ncbi_owltools_path, $slim_file, $output_file);
+if (system(@owl_cmd) != 0) {
+    # $? is -1 when java could not be started at all; otherwise the exit
+    # status is in the high byte.
+    if ($? == -1) {
+        die "Error: could not run java: $!\n";
+    }
+    die "Error: ncbi2owl.jar failed with exit status " . ($? >> 8) . "\n";
+}
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+                                               
+