From: preecej
Date: Wed, 26 Oct 2011 21:45:24 +0000 (+0000)
Subject: Cleanup...
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=82d8f9ea49d7345d39dc3d89c626691940ca7b53;p=old-jaiswallab-svn%2F.git
Cleanup...
svn path=/; revision=199
---
diff --git a/Personnel/preecej/.gitignore b/Personnel/preecej/.gitignore
deleted file mode 100644
index e69de29..0000000
diff --git a/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/ara_rice_exclusive_sets.pl b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/ara_rice_exclusive_sets.pl
new file mode 100644
index 0000000..75e6ea0
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/ara_rice_exclusive_sets.pl
@@ -0,0 +1,68 @@
+#!/usr/bin/perl -w
+use strict;
+
+system 'clear';
+
+# Report the mapped ChEBI molecules that are listed for Arabidopsis but not
+# for rice, and vice-versa. Each input file is read as one entry per line.
+
+my $dir = "/home/preecej/Documents/projects/reactome/aracyc_to_reactome_conversion/ara_rice_exclusive_sets/";
+my @rice;
+my @ara;
+
+# read the rice list; report the path and OS error if it cannot be opened
+my $rice_path = $dir . "Rice_ReferenceNameToChEBIId.txt";
+open(my $rice_fh, '<', $rice_path) or die "Could not open $rice_path: $!\n";
+while (my $line = <$rice_fh>) {
+    chomp $line;
+    push(@rice, $line);
+}
+close $rice_fh;
+
+# read the Arabidopsis list the same way
+my $ara_path = $dir . "Ara_ReferenceNameToChEBIId.txt";
+open(my $ara_fh, '<', $ara_path) or die "Could not open $ara_path: $!\n";
+while (my $line = <$ara_fh>) {
+    chomp $line;
+    push(@ara, $line);
+}
+close $ara_fh;
+
+my %ara_seen;   # lookup tbl keyed by Arabidopsis entry
+my @rice_only;  # entries exclusive to rice
+# build lookup tbl (hash slice; only the keys matter)
+@ara_seen{@ara} = ();
+
+foreach my $item (@rice) {
+    push(@rice_only, $item) unless exists $ara_seen{$item};
+}
+
+my %rice_seen;  # lookup tbl keyed by rice entry
+my @ara_only;   # entries exclusive to Arabidopsis
+# build lookup tbl
+@rice_seen{@rice} = ();
+
+foreach my $item (@ara) {
+    push(@ara_only, $item) unless exists $rice_seen{$item};
+}
+
+my $exc_rice_count = 0;
+print "-- [EXCLUSIVE RICE MOLECULES] --\n";
+foreach my $item (@rice_only) {
+    $exc_rice_count++;
+    print "$exc_rice_count: $item\n";
+}
+
+#print "$exc_rice_count\n\n";
+
+my $exc_ara_count = 0;
+print "-- [EXCLUSIVE ARABIDOPSIS MOLECULES] --\n";
+foreach my $item (@ara_only) {
+    $exc_ara_count++;
+    print "$exc_ara_count: $item\n";
+}
+
+print "$exc_ara_count\n\n";
+
+# clean up
+exit;
diff --git a/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_pub_authors_to_years.pl b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_pub_authors_to_years.pl
new file mode 100644
index 0000000..8769b10
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_pub_authors_to_years.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl -w
+use strict;
+
+system 'clear';
+
+# temp script: lines containing "(" are treated as AUTHORS elements that
+# mistakenly hold a parenthesized year, and are rewritten as YEAR/int elements.
+
+my $dir = "/home/preecej/Documents/projects/reactome/aracyc_to_reactome_conversion/aracyc_data/";
+my $in_path = $dir . "aracyc_v8_0_biopax-level2_STOIdouble.owl";
+my $out_path = $dir . "aracyc_v8_0_biopax-level2_STOIdouble_AUTHORSYEAR.owl";
+
+# NOTE: the output file is opened in append mode, as in the original script
+open(my $in_file, '<', $in_path) or die "Could not open $in_path: $!\n";
+open(my $out_file, '>>', $out_path) or die "Could not open $out_path: $!\n";
+
+my $i = 0; # line counter (also the limiter for testing)
+
+while (my $line = <$in_file>)
+{
+    $i++;
+    chomp $line;
+
+    # is it a bad author?
+    if ($line =~ /\(/)
+    {
+        print "$i: $line\n";
+        # change the line to a well-formatted year tag
+        $line =~ s/AUTHORS/YEAR/g;
+        $line =~ s/\(//;
+        $line =~ s/\)//;
+        $line =~ s/string/int/;
+        print "$i: $line\n";
+    }
+    print {$out_file} "$line\n";
+
+    #last if $i > 1000; # let's test w/ low numbers for now
+}
+
+close $in_file;
+close $out_file or die "Could not close $out_path: $!\n";
+
+# clean up
+exit;
diff --git a/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_step_interaction_datatypes.pl b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_step_interaction_datatypes.pl
new file mode 100644
index 0000000..13d8e83
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/aracyc_to_reactome_conversion/convert_bad_step_interaction_datatypes.pl
@@ -0,0 +1,45 @@
+#!/usr/bin/perl -w
+use strict;
+
+system 'clear';
+
+# temp script used to fix STEP-INTERACTIONS elements whose target is written
+# as string-typed element text; the datatype attribute and closing tag are
+# rewritten so the target becomes a resource="#..." attribute instead.
+# NOTE(review): the original match pattern is missing from this copy of the
+# script; the condition below is reconstructed from the two substitutions.
+
+my $dir = "/home/preecej/Documents/projects/reactome/aracyc_to_reactome_conversion/aracyc_data/";
+my $in_path = $dir . "aracyc_v8_0_biopax-level2_STOIdouble_AUTHORSYEAR.owl";
+my $out_path = $dir . "aracyc_v8_0_biopax-level2_STOIdouble_AUTHORSYEAR_STEPINT.owl";
+
+# NOTE: the output file is opened in append mode, as in the original script
+open(my $in_file, '<', $in_path) or die "Could not open $in_path: $!\n";
+open(my $out_file, '>>', $out_path) or die "Could not open $out_path: $!\n";
+
+my $i = 0; # line counter (also the limiter for testing)
+
+while (my $line = <$in_file>)
+{
+    $i++;
+    chomp $line;
+
+    # is it a bad step interaction?
+    if ($line =~ m{</bp:STEP-INTERACTIONS>} && $line =~ m{XMLSchema\#string">})
+    {
+        print "$i: $line\n";
+        # change the line to a well-formatted STEP-INTERACTION tag
+        $line =~ s/datatype="http:\/\/www.w3.org\/2001\/XMLSchema#string">/resource="\#/g;
+        $line =~ s/<\/bp:STEP-INTERACTIONS>/"\/>/;
+        print "$i: $line\n";
+    }
+    print {$out_file} "$line\n";
+
+    #last if $i > 1000; # let's test w/ low numbers for now
+}
+
+close $in_file;
+close $out_file or die "Could not close $out_path: $!\n";
+
+# clean up
+exit;
diff --git a/Personnel/preecej/perl_singletons/pathway_gene_swapper.pl b/Personnel/preecej/perl_singletons/pathway_gene_swapper.pl
new file mode 100644
index 0000000..490959f
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/pathway_gene_swapper.pl
@@ -0,0 +1,910 @@
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+Pathway Gene Swapper
+
+=head1 VERSION
+
+0.2
+
+=head1 DESCRIPTION
+
+Swap out one set of genes (or gene representations) for another in an
+existing PathVisio GPML file. Optionally removes literature references.
+
+If multiple replacement genes map to a single original gene,
+multiple PathVisio boxes will be drawn in place of the
+original.
+
+If a group associating multiple gene boxes already existed,
+that group will extend to any replacement genes. If no group
+existed previously, and multiple replacement gene boxes are required,
+a new group will be created.
+
+If an original gene had multiple instances (homologs) displayed on the pathway
+diagram, each instance will be subjected to the replacement process. There
+is a "heat-map" option available to help highlight homolog sets.
+
+There is also an option to read in an extra column of gene symbols, if the
+user wishes to provide their own. Otherwise, the application will
+continue to use the label prefix and auto-numbering suffix settings.
+
+The replacement gene symbols can be prefixed to separate them from the
+original, and an ordinal suffix ('-#') will be added to a group of
+replacement genes.
+
+Any new gene boxes may be painted with a custom color and border, and
+will be stacked and offset for ease of visualization (much like a deck
+of cards).
+
+=head1 FUTURE CHANGES
+
+Add a legend for the heat map (upper right-hand corner).
+
+Add a comment containing the NCBI species id of the new homolog (for
+the purpose of multi-species pathway comparison or host-pathogen
+interaction diagrams).
+
+=head1 USAGE
+
+pathway_gene_swapper.pl -i INPUT_FILE -g GENE_MAPPING_FILE -c CONFIG_FILE -o OUTPUT_FILE [-shLvGd]
+
+=head1 OPTIONS
+
+ -i Name of input GPML file.
+ -g CSV file containing the genes to swap
+ -c config file containing color, label, and placement preferences
+ -o Name of output GPML file.
+ (NOTE: if no path supplied for input files,
+ current working directory is assumed)
+ -s use provided gene symbols instead of config file's LabelPrefix
+ -h apply a heat-map to any multi-mapped set of homologs
+ (NOTE: precludes custom box-coloring for homologs)
+ -L Remove literature references.
+ -v View verbose information
+ -G Display GPML input/output documents
+ -d View debugging information
+
+=head1 DEPENDENCIES and PREREQUISITES
+
+ - Non-standard Perl modules: Switch, Data::Dumper, Graphics::ColorUtils, XML::DOM
+ - The input file must be a valid GPML file
+ - The CSV file must have a single-line column header.
+
+ The first column must have one and only one gene -- the
+ "original" gene.
+
+ The second column may have one and only one gene variant or
+ homolog -- the "replacement" gene(s).
+
+ An optional "gene symbol" column may be placed between the
+ first two columns, if the user would prefer to use a list of
+ symbols for the new homologous genes.
+
+ - The config file may have any or all of the following entries,
+ in addition to the required fields (in any order):
+
+ Source= (required)
+ Database= (required)
+ Version= (required)
+ Title=
+ Author=
+ Maintainer=
+ Organism=
+ CommentPrefix= (will precede back-reference to prior source,
+ default: "Previously labeled as: ")
+ LabelPrefix= (precedes current gene label)
+ BoxBorder= (RRGGBB hex, default: black)
+ BoxColor= (RRGGBB hex, default: white)
+ BoxWidth= (integer, in px, default: 60)
+ X-Offset= (integer, in px, default: 4)
+ Y-Offset= (integer, in px, default: 3)
+
+=head1 AUTHORS
+
+Justin Preece and Mamatha Hanumappa
+ Faculty Research Assistants
+ Jaiswal Lab, Botany & Plant Pathology
+ Oregon State University
+ L
+ L
+
+=cut
+
+# ---------------------------------------------------------------------------
+# modules
+# ---------------------------------------------------------------------------
+
+# general
+use strict;
+use Cwd;
+use Switch;
+use Getopt::Std;
+use Data::Dumper;
+
+# specific
+use Graphics::ColorUtils qw( :gradients );
+use XML::DOM;
+
+# ---------------------------------------------------------------------------
+# declarations
+# ---------------------------------------------------------------------------
+
+# command-line options
+my %opts; # arg options
+my $input_gpml_file; # -i: input GPML file
+my $input_gene_file; # -g: CSV gene mapping file
+my $input_config_file; # -c: config file
+my $output_file; # -o: output GPML file
+my $apply_homolog_heat = 0; # -h: flag to heat-map multi-mapped homolog sets
+my $remove_lit = 0; # flag to remove literature and lit references
+my $use_symbols = 0; # flag to indicate use of provided gene symbols
+my $verbose = 0; # flag for verbose output
+my $doc_mode = 0; # flag for extra GPML doc output
+my $debug = 0; # debugging switch
+
+# global data containers
+my %configs; # configuration settings
+my %swap_genes; # original and swapped genes
+my $gpml_doc; # imported GPML data for manipulation and output
+my %unmapped_genes; # original genes not mapped to homologs
+my %gradient; # blue-red gradient used for opt. heat map box-coloring
+my $max_homolog_count = 0; # count for heat map calibration
+
+$Data::Dumper::Pad = "... "; # Data::Dumper setting: string prefixed to each line of Dumper output
+
+# ---------------------------------------------------------------------------
+=head1 FUNCTIONS
+
+=over
+
+=cut
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+=item B<config>
+Accepts a config file path.
+Reads a configuration file and sets config values.
+Returns a hash with config values set.
+=cut
+# ---------------------------------------------------------------------------
+sub config($)
+{
+    my $config_path = $_[0]; # path to the config file
+    print "Opening configuration file and reading data...\n\n";
+    my %local_config_hash;
+
+    open(my $config_fh, '<', $config_path)
+        or die("Could not open $config_path: $!");
+
+    # each useful line is KEY=VALUE; split on the first '=' only so that a
+    # value may itself contain '=' characters
+    while (my $line = <$config_fh>)
+    {
+        chomp $line;
+        my ($data_field, $data_val) = split(/=/, $line, 2);
+
+        # keep any non-empty value, including a literal 0 (e.g. X-Offset=0)
+        if (defined $data_val && length $data_val)
+        {
+            $local_config_hash{$data_field} = $data_val;
+        }
+    }
+    close($config_fh);
+
+    # check for required settings
+    if (!$local_config_hash{"Source"}) {
+        die("You are required to provide a Source for your new data.\n"
+            . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+    if (!$local_config_hash{"Database"}) {
+        die("You are required to identify a Database for your new data.\n"
+            . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+    if (!$local_config_hash{"Version"}) {
+        die("You are required to provide a Version idenifier for your new data.\n"
+            . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+
+    # set defaults, if none declared (a declared 0 is kept, not overridden)
+    if (!defined $local_config_hash{"CommentPrefix"}) {
+        $local_config_hash{"CommentPrefix"} = "Previously labeled as: ";
+    }
+    if (!defined $local_config_hash{"BoxWidth"}) {
+        $local_config_hash{"BoxWidth"} = 60;
+    }
+    if (!defined $local_config_hash{"X-Offset"}) {
+        $local_config_hash{"X-Offset"} = 4;
+    }
+    if (!defined $local_config_hash{"Y-Offset"}) {
+        $local_config_hash{"Y-Offset"} = 3;
+    }
+
+    return %local_config_hash;
+}
+
+
+# ---------------------------------------------------------------------------
+=item B<init>
+Reads in command-line values, calls for config settings, and begins
+screen output.
+=cut
+# ---------------------------------------------------------------------------
+sub init
+{
+    # Sets the option globals from the command line, prints the run banner,
+    # and loads %configs from the config file.
+
+    # read and set options
+    getopts('i:g:c:o:hLsvGd', \%opts);
+
+    # file options: option letter => [ global to set, description for errors ]
+    my %file_opts = (
+        "i" => [\$input_gpml_file,   "input GPML file (-i)"],
+        "g" => [\$input_gene_file,   "gene mapping file (-g)"],
+        "c" => [\$input_config_file, "config file (-c)"],
+        "o" => [\$output_file,       "output GPML file (-o)"],
+    );
+
+    foreach my $key (sort keys %file_opts) {
+        my ($target_ref, $description) = @{$file_opts{$key}};
+        my $value = $opts{$key};
+
+        # all four files are needed later on; stop before doing any work
+        if (!defined $value || !length $value) {
+            die("You are required to provide the $description.\n"
+                . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n");
+        }
+
+        # a value containing '/' is taken as a path; otherwise the current
+        # working directory is assumed
+        if ($value =~ /\//) {
+            $$target_ref = $value;
+        } else {
+            $$target_ref = getcwd() . "\/$value";
+        }
+    }
+
+    # boolean switches
+    $use_symbols        = 1 if $opts{"s"};
+    $apply_homolog_heat = 1 if $opts{"h"};
+    $remove_lit         = 1 if $opts{"L"};
+    $verbose            = 1 if $opts{"v"};
+    $doc_mode           = 1 if $opts{"G"};
+    $debug              = 1 if $opts{"d"};
+
+    # begin screen output
+    system "clear";
+    print "\n"
+        . "------------------------------------------------------------\n"
+        . "------------------ Pathway Gene Swapper --------------------\n"
+        . "------------------------------------------------------------\n"
+        . "\n"
+        . "Input Files:\n"
+        . " - PathVisio File (GPML): $input_gpml_file\n"
+        . " - Gene List (CSV): $input_gene_file\n"
+        . " - Configuration settings: $input_config_file\n"
+        . "\n"
+        . "Output File: $output_file\n"
+        . "\n"
+        . "Use provided gene symbols? " . ($use_symbols ? "Yes" : "No") . "\n"
+        . "Provide homolog heat-map? " . ($apply_homolog_heat ? "Yes" : "No") . "\n"
+        . "Remove literature references? " . ($remove_lit ? "Yes" : "No") . "\n"
+        . "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n"
+        . "Running in document mode? " . ($doc_mode ? "Yes" : "No") . "\n"
+        . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n"
+        . "\n"
+        . "------------------------------------------------------------\n"
+        . "------------------------------------------------------------\n"
+        . "------------------------------------------------------------\n"
+        . "\n";
+
+    # load the config settings (config() dies if a required one is missing)
+    %configs = config($input_config_file);
+}
+
+
+# ---------------------------------------------------------------------------
+=item B<set_gradient>
+Accepts: The maximum number of homologs for one gene in this data set
+Creates a blue-red gradient and stores it in a hash.
+Returns: A hash of RRGGBB color values, keyed to the number set of
+possible homologs (1..n).
+=cut
+# ---------------------------------------------------------------------------
+sub set_gradient($)
+{
+    my $max_grad = $_[0]; # highest homolog count for one gene in the data set
+    my %blue_red_gradient;
+    # nothing to calibrate against (this also avoids dividing by zero below)
+    return %blue_red_gradient if (!$max_grad || $max_grad < 1);
+
+    # 256-step ramp from pure blue (0,0,255) toward pure red (255,0,0)
+    my @blue_red_array;
+    for (my $i=0; $i<256; $i++) {
+        push @blue_red_array, [$i, 0, (255-$i)];
+    }
+    register_gradient("blue_red",\@blue_red_array);
+
+    # one color per homolog count 1..$max_grad, each position computed directly
+    foreach my $count (1..$max_grad) {
+        my $position = $count / $max_grad;
+        $position = 0.99 if ($position > 0.99); # same upper bound as before
+        my $tmp_hex = grad2rgb("blue_red",$position);
+        $tmp_hex =~ s/\#//;
+        $blue_red_gradient{$count} = $tmp_hex;
+    }
+    return %blue_red_gradient;
+}
+
+
+# ---------------------------------------------------------------------------
+=item B<import_data>
+Reads, parses, and stores gene mapping file and source GPML.
+=cut
+# ---------------------------------------------------------------------------
+sub import_data
+{
+    print "Opening gene mapping file and reading data...\n\n";
+
+    open(my $gene_fh, '<', $input_gene_file) or die("Could not open $input_gene_file: $!");
+
+    # used to generate total counts of each species' gene list; sanity check
+    my $original_gene_count = 0;
+    my $replacement_homolog_count = 0;
+
+    # per-row fields; the symbol stays empty unless a symbol column is in use
+    my $new_symbol = "";
+    my $new_data_item;
+
+    # ignore header
+    my $line = <$gene_fh>;
+
+    while (defined($line = <$gene_fh>))
+    {
+        chomp $line;
+        next if ($line =~ /^\s*$/); # skip blank lines (e.g. at end of file)
+        my @line_ary = split(',',$line);
+        my $orig_data_item = $line_ary[0];
+
+        if ($use_symbols) {
+            if (scalar(@line_ary) != 3) {
+                die("If you specify that your gene-mapping file includes "
+                    . "symbols, then your CSV input file must have three "
+                    . "columns of data (in this order): "
+                    . "old gene, new symbol, new gene\n");
+            }
+            $new_symbol = $line_ary[1];
+            $new_data_item = $line_ary[2];
+        }
+        else
+        {
+            $new_data_item = $line_ary[1];
+        }
+
+        # is this the first row seen for this original gene?
+        if (!exists $swap_genes{$orig_data_item})
+        {
+            $original_gene_count++;
+        }
+        $replacement_homolog_count++; # count this every time
+
+        # add new gene to hash value (array) for old gene hash key
+        push @{$swap_genes{$orig_data_item}},
+            { "symbol"=>$new_symbol, "new_item"=>$new_data_item };
+
+        $new_symbol = ""; # reset for next iter.
+    }
+
+    # determine the original gene(s) with the highest # of homologs in the new
+    # gene set
+    my @most_popular_genes; # identity of the gene(s) with the highest number of homologs
+    foreach my $orig_gene_key (keys %swap_genes)
+    {
+        if ($max_homolog_count < scalar(@{$swap_genes{$orig_gene_key}}))
+        {
+            $max_homolog_count = scalar(@{$swap_genes{$orig_gene_key}});
+            @most_popular_genes = (); # new max; refresh
+            push @most_popular_genes, $orig_gene_key;
+        }
+        else
+        {
+            if ($max_homolog_count == scalar(@{$swap_genes{$orig_gene_key}}))
+            {
+                push @most_popular_genes, $orig_gene_key;
+            }
+        }
+    }
+
+    if ($verbose) # give add'l stats on gene and homolog counts
+    {
+        print "[Total number of original genes and homologs]\n"
+            . "Original gene count: " . $original_gene_count . "\n"
+            . "Replacement homolog count: $replacement_homolog_count\n\n";
+
+        print "[Highest number of homologs per gene]\n";
+        print "Number of homologs: $max_homolog_count\n";
+        print "Gene(s): @most_popular_genes\n\n";
+
+        print "[Number of homologs per original gene]\n";
+        foreach my $orig_gene_key (keys %swap_genes)
+        {
+            print "$orig_gene_key: " . scalar(@{$swap_genes{$orig_gene_key}}) . "\n";
+        }
+        print "\n";
+    }
+
+    close($gene_fh);
+
+    # initialize the blue-red gradient for heat mapping
+    if ($apply_homolog_heat)
+    {
+        %gradient = set_gradient($max_homolog_count);
+    }
+
+    print "Opening GPML pathway file and reading data...\n\n";
+
+    my $parser = XML::DOM::Parser->new;
+    $gpml_doc = $parser->parsefile($input_gpml_file);
+}
+
+
+# ---------------------------------------------------------------------------
+=item B<show_input>
+Spits out the data to make sure you've read in the files correctly.
+Verbose only.
+=cut
+# ---------------------------------------------------------------------------
+sub show_input
+{
+    # Dumps the parsed inputs so the user can confirm they were read correctly.
+    if ($verbose)
+    {
+        # config settings first, then the gene mappings
+        my @sections = (["[Configuration Settings]", \%configs],
+                        ["[Gene Mappings]", \%swap_genes]);
+        foreach my $section (@sections)
+        {
+            print $$section[0] . "\n" . Dumper($$section[1]) . "\n\n" . "\n";
+        }
+    }
+
+    # the source GPML document is only echoed in document mode
+    if ($doc_mode) {
+        print "[Source GPML]\n", $gpml_doc->toString, "\n";
+    }
+}
+
+# ---------------------------------------------------------------------------
+=item B<create_unique_hex_id>
+Accepts a reference to a hash of existing hex ids and a string as
+the "type" of ID (e.g. "Group.GraphId"). The latter is currently for
+documentary purposes only.
+Generates a "random" 5-digit hexadecimal id, checks to see if it
+already exists in the hex list, and if not, adds it to the list
+of hex ids already present in the GPML doc. Otherwise, generates
+another "random" id and repeats the process until a new unique id
+is identified.
+Returns a string containing the new hex id.
+=cut
+# ---------------------------------------------------------------------------
+sub create_unique_hex_id($$)
+{
+    # $_[0]: reference to the hash of ids already in use (updated in place)
+    # $_[1]: label stored as the hash value for the new id (e.g. "Group.GraphId")
+    my $ids_in_use = $_[0];
+    my $id_label = $_[1];
+
+    # NOTE: the more ids that are already taken, the more retries it takes to
+    # find a free one, so this gets inefficient at VERY large scale (the
+    # original author's estimate: 100K genes+). For a few hundred or thousand
+    # genes it should be ok.
+
+    my @first_digit_pool = ('a'..'f');      # first digit is limited to a..f
+    my @other_digit_pool = (0..9,'a'..'f'); # remaining four digits: 0..f
+    my $candidate_id;
+
+    # draw candidates until one is found that is not in use yet
+    do
+    {
+        $candidate_id = $first_digit_pool[rand(6)];
+        foreach my $position (1..4)
+        {
+            $candidate_id .= $other_digit_pool[rand(16)];
+        }
+    }
+    while (exists ${$ids_in_use}{$candidate_id});
+
+    # claim the id so later calls cannot hand it out again
+    ${$ids_in_use}{$candidate_id} = $id_label;
+
+    return $candidate_id;
+}
+
+# ---------------------------------------------------------------------------
+=item B<swap_genes>
+Substitutes gene data.
+=cut
+# ---------------------------------------------------------------------------
+sub swap_genes
+{
+    print "Swapping gene data and making other document modifications...\n\n";
+    # edits the global $gpml_doc in place; leftovers go to %unmapped_genes
+    my $pathway_node = ($gpml_doc->getElementsByTagName("Pathway"))[0];
+
+    # change Pathway header info to config settings
+    if ($configs{"Title"}) {
+        $pathway_node->setAttribute("Name",$configs{"Title"}); }
+    if ($configs{"Author"}) {
+        $pathway_node->setAttribute("Author",$configs{"Author"}); }
+    if ($configs{"Maintainer"}) {
+        $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"}); }
+    if ($configs{"Version"}) {
+        $pathway_node->setAttribute("Version",$configs{"Version"}); }
+    if ($configs{"Organism"}) {
+        $pathway_node->setAttribute("Organism",$configs{"Organism"}); }
+
+    # get all "gene box" data nodes
+    my $data_nodes = $pathway_node->getElementsByTagName("DataNode");
+    # print $data_nodes->getLength . "\n"; # TEST
+
+    # remove all Biopax and BiopaxRef elements and children
+    if ($remove_lit)
+    {
+        print "Removing literature references...\n\n";
+        my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0];
+        # a document with no Biopax element has nothing to remove here
+        $pathway_node->removeChild($biopax_node) if ($biopax_node);
+        for (@$data_nodes)
+        {
+            my $curr_datanode = $_;
+            my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef");
+            for (@$biopaxref_nodes)
+            {
+                # print $_->getTagName . "\n"; # TEST
+                $curr_datanode->removeChild($_);
+            }
+        }
+    }
+
+    # will hold a convenient nested hash of data node references in the gpml doc,
+    # indexed by the id of the gene located in the Xref element for each
+    # node, and sub-indexed by the GraphId of each corresponding node
+    my %data_nodes_by_gene_id;
+
+    if ($verbose) {
+        print "[Original genes]\n";
+    }
+
+    # create a hash of all 5-digit hex ids in the gpml doc (this is the black list)
+    # one list of DataNode.GraphId, Group.GroupId, and Group.GraphId
+    my %existing_hex_ids;
+    my $missing_graph_id_count = 0; # numbers the lookup keys of nodes w/o a GraphId
+    for (@$data_nodes)
+    {
+        # record this node's GraphId, when it has one
+        if ($_->getAttributeNode("GraphId"))
+        {
+            $existing_hex_ids{$_->getAttributeNode("GraphId")->getValue}
+                = $_->getTagName . ".GraphId";
+        }
+
+        # also build a data node hash to make lookup easier in the next section
+        my $curr_xref_id = ($_->getElementsByTagName("Xref"))[0]
+            ->getAttributeNode("ID")->getValue;
+        my $curr_graph_id = $_->getAttribute("GraphId"); # "" when the attribute is absent
+        $curr_graph_id = "no-GraphId-" . ++$missing_graph_id_count if ($curr_graph_id eq "");
+        $curr_xref_id =~ s/\s+$//; # rtrim whitespace
+        $curr_xref_id =~ s/^\s+//; # ltrim whitespace
+
+        if ($verbose) {
+            my $curr_text_label = $_->getAttributeNode('TextLabel')->getValue;
+            $curr_text_label =~ s/^\s+|\s+$//g; # trim whitespace (both ends)
+            print "$curr_text_label\t$curr_xref_id\n";
+        }
+
+        if (length($curr_xref_id) > 0)
+        {
+            #if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST
+            $data_nodes_by_gene_id{$curr_xref_id}{$curr_graph_id} = $_;
+        }
+        else
+        {
+            if ($verbose) {
+                print "WARNING: Found DataNode (TextLabel: "
+                    . $_->getAttributeNode('TextLabel')->getValue . ") "
+                    . "with missing Xref ID.\n";
+            }
+        }
+    }
+    print "\n";
+
+    if ($debug) {
+        print "...\n";
+        foreach my $tmp_gene (keys %data_nodes_by_gene_id) {
+            foreach my $tmp_node (keys %{$data_nodes_by_gene_id{$tmp_gene}}) {
+                print "... $tmp_gene => $tmp_node => $data_nodes_by_gene_id{$tmp_gene}{$tmp_node}\n";
+                #if ($tmp_gene eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST
+            }
+        }
+        print "\n";
+    }
+
+    my $group_nodes = $pathway_node->getElementsByTagName("Group");
+    for (@$group_nodes)
+    {
+        if ($_->getAttributeNode("GroupId"))
+        {
+            $existing_hex_ids{$_->getAttributeNode("GroupId")->getValue}
+                = $_->getTagName . ".GroupId";
+        }
+        if ($_->getAttributeNode("GraphId"))
+        {
+            $existing_hex_ids{$_->getAttributeNode("GraphId")->getValue}
+                = $_->getTagName . ".GraphId";
+        }
+    }
+
+    if ($debug) { print "...\n"
+        . Dumper(\%existing_hex_ids) . "\n\n"; }
+
+    # iterate through gene mappings from csv file
+    foreach my $old_gene (keys %swap_genes)
+    {
+        # print $old_gene . "\n"; # TEST
+
+        # find curr old gene nodes in doc
+        if (exists $data_nodes_by_gene_id{$old_gene})
+        {
+            # iterate through each node by its GraphId
+            foreach my $curr_old_genes_by_hex_id (keys %{$data_nodes_by_gene_id{$old_gene}})
+            {
+                my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}{$curr_old_genes_by_hex_id};
+                # print $curr_old_gene_node . "\n"; # TEST
+
+                # holds list of newly-created nodes, used to replace old node
+                my @new_nodes_map;
+
+                # iterate through new gene replacements
+                for (@{$swap_genes{$old_gene}})
+                {
+                    # copy the curr old gene node
+                    my $new_node = $curr_old_gene_node->cloneNode("deep");
+
+                    # print "[$_]\n$new_node->toString\n\n"; # TEST
+
+                    # add to new nodes ary
+                    push @new_nodes_map, [$new_node, $_];
+                }
+                # print "@new_nodes_map\n"; # TEST
+
+                # if more than one new homolog exists, and the old gene doesn't
+                # already belong to a group, you'll need a new Group for multiple
+                # gene boxes
+                my $new_GroupId;
+
+                if (scalar(@new_nodes_map) > 1)
+                {
+                    # if curr old gene does not belong to a group
+
+                    # print $curr_old_gene_node->toString . "\n"; # TEST
+                    # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST
+
+                    if (!$curr_old_gene_node->getAttribute("GroupRef"))
+                    {
+                        #print "no existing group ref\n"; # TEST
+
+                        # generate a new GroupId and Group.GraphId hex ids not
+                        # already in use
+                        $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId");
+                        # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId");
+                        #print "new group id: $new_GroupId\n"; # TEST
+                        # print "$new_GroupId, $new_Group_GraphId\n"; # TEST
+
+                        # create a new group node
+                        my $new_group = $gpml_doc->createElement("Group");
+                        $new_group->setAttribute("GroupId",$new_GroupId);
+                        #$new_group->setAttribute("GraphId",$new_Group_GraphId);
+                        $new_group->setAttribute("Style","Group");
+
+                        # add to beginning of group nodes
+                        $pathway_node->insertBefore($new_group,${$group_nodes}[0]);
+                    }
+                }
+
+                # flag for determining if there are one or many replacement homologs
+                my $is_first_homolog = 1;
+
+                # makes sure each box is increasingly offset from the original
+                # (in all three dimensions)
+                my $offset_multiplier = 0;
+                my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols
+
+                # for new nodes ary
+                for (@new_nodes_map)
+                {
+                    if (scalar(@new_nodes_map) > 1)
+                    {
+                        $gene_suffix_counter++;
+                    }
+
+                    my $curr_new_node = $$_[0];
+                    my $curr_symbol = ${$$_[1]}{"symbol"};
+                    my $curr_homolog = ${$$_[1]}{"new_item"};
+
+                    #print "$_: $curr_new_node, $curr_symbol, $curr_homolog\n"; # TEST
+                    #print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
+
+                    # update all new nodes w/ attributes...
+
+                    # grab original text label
+                    my $old_label = $curr_new_node->getAttributeNode("TextLabel")->getValue;
+
+                    # rename TextLabel...
+                    if ($use_symbols && length($curr_symbol) > 0) # apply the provided gene symbol
+                    {
+                        $curr_new_node->setAttribute("TextLabel", $curr_symbol);
+                    }
+                    else # prefix (from config), suffix: new '-#' for multiple homologs
+                    {
+                        $curr_new_node->setAttribute("TextLabel",
+                            (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
+                            . $curr_new_node->getAttributeNode("TextLabel")->getValue
+                            . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
+                    }
+
+                    # add new GroupRef if necessary
+                    if ($new_GroupId)
+                    {
+                        $curr_new_node->setAttribute("GroupRef",$new_GroupId);
+                    }
+
+                    # add Comment back-referencing TAIR locus id (use "source" attribute)
+                    # NOTE: order is important in GPML; the Comment tags are first
+                    my $new_comment = $gpml_doc->createElement("Comment");
+                    $new_comment->setAttribute("Source",$configs{"Source"});
+                    $new_comment->addText($configs{"CommentPrefix"} . " $old_gene ($old_label).");
+                    $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes
+
+                    # edit the Xref element
+                    my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0];
+                    $curr_xref->setAttribute("Database",$configs{"Database"});
+                    $curr_xref->setAttribute("ID",$curr_homolog);
+
+                    # change box width and colors (Graphics element)
+                    my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
+                    $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
+
+                    # add "heat" to genes with multiple homologs
+                    if ($apply_homolog_heat && ($gene_suffix_counter > 0))
+                    {
+                        my $heat_color = $gradient{scalar(@new_nodes_map)}; # no entry: keep the cloned fill
+                        $curr_graphics->setAttribute("FillColor", $heat_color) if (defined $heat_color);
+                        $curr_graphics->setAttribute("Color","8888ff");
+                    }
+                    else
+                    {
+                        if ($configs{"BoxColor"}) {
+                            $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
+                        if ($configs{"BoxBorder"}) {
+                            $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
+                    }
+
+                    if ($is_first_homolog)
+                    {
+                        $is_first_homolog = 0; # first homolog complete
+                    }
+                    else # add'l homologs required
+                    {
+                        $offset_multiplier++;
+
+                        # print "that was an add'l homolog, change more attrs...\n"; # TEST
+                        # update add'l nodes w/ special attributes...
+
+                        # generate a new DataNode GraphId not already in use
+                        my $new_GraphId =
+                            create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
+                        # print $new_GraphId . "\n"; # TEST
+                        $curr_new_node->setAttribute("GraphId",$new_GraphId);
+
+                        # decrement the Z-order
+                        $curr_graphics->setAttribute("ZOrder",
+                            $curr_graphics->getAttributeNode("ZOrder")->getValue
+                            - $offset_multiplier);
+                        # stagger the extra boxes by decrementing the coords
+                        $curr_graphics->setAttribute("CenterX",
+                            $curr_graphics->getAttributeNode("CenterX")->getValue
+                            - ($configs{"X-Offset"} * $offset_multiplier));
+                        $curr_graphics->setAttribute("CenterY",
+                            $curr_graphics->getAttributeNode("CenterY")->getValue
+                            - ($configs{"Y-Offset"} * $offset_multiplier));
+                    }
+                }
+                undef $new_GroupId; # clear this out so we can test against its existence next time
+
+                # replace old node w/ new node(s)
+                for (@new_nodes_map) {
+                    # add all the new nodes...
+                    $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
+                }
+                # ...and remove the original node
+                $pathway_node->removeChild($curr_old_gene_node);
+            }
+            # once mapped, remove the old gene so we are left with a list of
+            # unmapped original genes (for show_output())
+            delete($data_nodes_by_gene_id{$old_gene});
+        }
+        else
+        {
+            print "ALERT: Gene identifier $old_gene is not present in this "
+                . "PathVisio GPML document.\n";
+        }
+    }
+    %unmapped_genes = %data_nodes_by_gene_id; # whatever is left over
+}
+
+# ---------------------------------------------------------------------------
+=item B<show_output>
+Displays the transformed data. Verbose only.
+=cut
+# ---------------------------------------------------------------------------
+sub show_output
+{
+    # verbose: list every DataNode left in %unmapped_genes, with a total
+    if ($verbose) {
+        print "[Unmapped original genes]\n";
+        my $unmapped_total = 0;
+        foreach my $gene_id (keys %unmapped_genes)
+        {
+            my $nodes_by_graph_id = $unmapped_genes{$gene_id};
+            foreach my $graph_id (keys %{$nodes_by_graph_id})
+            {
+                my $label_attr = $$nodes_by_graph_id{$graph_id}->getAttributeNode("TextLabel");
+                print $label_attr->getValue . "\t($gene_id)\n";
+                $unmapped_total++;
+            }
+        }
+        print "\nTotal unmapped genes: $unmapped_total\n";
+    }
+
+    # document mode: echo the whole modified GPML document
+    if ($doc_mode) {
+        print "\n[Modified GPML Output]\n", $gpml_doc->toString, "\n";
+    }
+}
+
+# ---------------------------------------------------------------------------
+=item B<export_data>
+Writes the transformed GPML doc out to the specified output file.
+=cut
+# ---------------------------------------------------------------------------
+sub export_data
+{
+    print "\nWriting GPML to output file...\n\n";
+    # ensures utf-8 encoding (for accent marks, etc.)
+    open(my $out_file_handle, ">:utf8", $output_file) or die("Could not open $output_file: $!\n");
+    $gpml_doc->print($out_file_handle);
+    # buffered write errors only surface at close, so check it
+    close($out_file_handle) or die("Could not write $output_file: $!\n");
+}
+
+=back
+=cut
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+init();
+import_data();
+show_input();
+swap_genes();
+show_output();
+export_data();
+# all output is written; free the parsed GPML document and stop
+$gpml_doc->dispose; # cleanup
+exit;
+
+# ---------------------------------------------------------------------------
+# end
+# ---------------------------------------------------------------------------
+
diff --git a/Personnel/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl b/Personnel/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl
new file mode 100755
index 0000000..cf6f4c9
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl
@@ -0,0 +1,428 @@
+#!/usr/bin/perl -w
+use strict;
+
+# ---------------------------------------------------------------------------
+# Reactome - CHEBI Ontology Mapping Script
+#
+# Justin Preece, 10/06/10
+# v1.0: 10/13/10 (svn rev. 66)
+# v1.1: 10/20/10 (svn rev. 70)
+# v1.2: 02/07/11 (svn rev. 86)
+#
+# Purpose: Map CHEBI ontology terms onto the Reactome database.
+#
+# Inputs:
+#
+# CHEBI OBO file (preset)
+#
+# Reactome file (preset, provided by Guanming Wu)
+# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [Cyc]
+# (Row) 923893 S-adenosyl-L-methionine 29908-03-0 C00019 S-ADENOSYLMETHIONINE ** the '-' (dash) symbol will be applied to any empty columns
+#
+# Outputs: tab-del mapping file (reactome_chebi_mapping_complete_sorted.txt)
+#
+# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
+# 923893 15414 CAS 29908-03-0
+# 923893 15414 LIGAND C00019
+# 923893 15414 CompoundTerm S-ADENOSYLMETHIONINE
+# 923893 15414 CompoundSynonym s-AdenosylMethionine
+# 923893 15414 CycTerm S-ADENOSYLMETHIONINE ** optional
+# 923893 15414 CycSynonym s-adenosylmethionine ** optional
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# modules
+# ---------------------------------------------------------------------------
+
+use GO::Parser;
+
+# ---------------------------------------------------------------------------
+# declarations
+# ---------------------------------------------------------------------------
+
+# set paths to data files
+# ($data_path is prepended to every file name below, so the relative ChEBI
+# OBO path is resolved against it)
+my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/";
+my $chebi_obo_file = "../../chebi_v78.obo";
+my $reactome_file = "AraReferenceMolecules.txt";
+my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.2_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
+
+# options
+# $allow_obsolete_terms: when true, obsolete ChEBI terms are mapped as well
+# $allow_cyc:            when true, the Cyc column is loaded and ChEBI names
+#                        are matched against it too
+# $allow_synonyms:       when true, ChEBI synonyms are matched in addition
+#                        to the term names
+my $allow_obsolete_terms = 1;
+my $allow_cyc = 0;
+my $allow_synonyms = 0;
+
+my $ont; # chebi ontology
+
+# lookup tables filled by init(): each key maps to an array ref holding
+# every Reactome id that carries that value
+my %reactome_CompoundName; # reactome Compound Name hash
+my %reactome_CAS; # reactome CAS hash
+my %reactome_LIGAND; # reactome LIGAND hash
+my %reactome_Cyc; # reactome Cyc hash
+
+# one tab-delimited row (ReactomeID, CHEBI, XREF_Type, XREF_ID) per match
+my @map_results = (); # successful mappings between chebi and reactome
+
+
+# ---------------------------------------------------------------------------
+# functions
+# ---------------------------------------------------------------------------
+
+
+# setup chebi parser and reactome data
+# ---------------------------------------------------------------------------
+# Parses the ChEBI OBO file into $ont, then reads the tab-delimited Reactome
+# file (first line = header, skipped) into the lookup hashes %reactome_CAS,
+# %reactome_LIGAND, %reactome_CompoundName and -- only when $allow_cyc is
+# set -- %reactome_Cyc. Columns holding the "-" placeholder are not indexed.
+# Takes no arguments; dies if the Reactome file cannot be opened.
+sub init
+{
+    # init ontology parser and ontology
+    my $parser = GO::Parser->new({handler=>'obj'});
+    $parser->parse($data_path . $chebi_obo_file);
+    $ont = $parser->handler->graph;
+
+    # read reactome file into the lookup hashes
+    my $reactome_path = $data_path . $reactome_file;
+    open(my $reactome_fh, '<', $reactome_path)
+        or die "Cannot open Reactome file '$reactome_path': $!";
+
+    my $line = <$reactome_fh>; # skip the header
+    my $reactome_count = 0;
+
+    while (defined($line = <$reactome_fh>))
+    {
+        chomp $line;
+        $reactome_count++;
+        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
+        my $reactome_id = $reactome_entry[0];
+        my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
+
+        # strips off one leading space, then the "AN " and "A " indefinite articles
+        $compound_name =~ s/^ //;
+        $compound_name =~ s/^AN //;
+        $compound_name =~ s/^A //;
+
+        my $CAS_id = $reactome_entry[2];
+        my $LIGAND_id = $reactome_entry[3];
+        my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
+
+        # There is a possibility that a single CAS, LIGAND, or Cyc
+        # identifier may appear in more than one reactome entry, so each
+        # hash value is an array ref that can hold 1...n ReactomeIDs.
+
+        # --CAS Hash Load--
+        if ($CAS_id ne "-") { # keep those "-" placeholders out
+            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
+        }
+
+        # --LIGAND Hash Load--
+        if ($LIGAND_id ne "-") {
+            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
+        }
+
+        # --CompoundName Hash Load--
+        if ($compound_name ne "-") {
+            push @{$reactome_CompoundName{$compound_name}}, $reactome_id;
+        }
+
+        # --Cyc Hash Load--
+        if ($allow_cyc && $Cyc_term ne "-") {
+            push @{$reactome_Cyc{$Cyc_term}}, $reactome_id;
+        }
+    }
+    close $reactome_fh;
+
+    print "\n[Reactome Stats]",
+        "\nTotal Reactome Entries: $reactome_count\n";
+
+}
+
+
+# spit out some data to make sure you've read in the files correctly
+# ---------------------------------------------------------------------------
+# Debug helper. Prints to STDOUT the ontology node count, every term with its
+# synonyms and xrefs, and then -- per Reactome lookup hash -- each key that
+# is shared by more than one Reactome id. Takes no arguments.
+sub test_inputs
+{
+    # basic ontology info
+    print "[Node Count]: " . $ont->node_count . "\n";
+
+    # get all chebi terms in the ontology
+    my $terms = $ont->get_all_nodes;
+
+    # output contents of parsed ontology
+    foreach my $term (@$terms)
+    {
+        print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
+
+        my $synonyms = $term->synonym_list;
+        foreach my $synonym (@$synonyms) {
+            print $synonym . "\n";
+        }
+
+        print "[XREFS]\n";
+        my $xrefs = $term->dbxref_list;
+        foreach my $xref (@$xrefs) {
+            print $xref->xref_key . ",",
+                $xref->xref_keytype . ",",
+                $xref->xref_dbname . ",",
+                $xref->xref_desc . "\n";
+        }
+        print "\n";
+    }
+
+    # show dupes in reactome hashes - give data to Pankaj;
+    # this is important b/c the duplicates may represent erroneous data in
+    # the Reactome dataset
+    my @dupe_reports = (
+        [ "CAS",          \%reactome_CAS ],
+        [ "LIGAND",       \%reactome_LIGAND ],
+        [ "CompoundName", \%reactome_CompoundName ],
+    );
+    # the Cyc hash is only populated (and so only reported) when enabled
+    push @dupe_reports, [ "Cyc", \%reactome_Cyc ] if $allow_cyc;
+
+    print "\n[Reactome Hashes - Dupes]\n";
+    foreach my $report (@dupe_reports)
+    {
+        my ($label, $ids_by_key) = @$report;
+        print "\n--$label Hash--\n";
+        foreach my $key (keys %$ids_by_key) {
+            my @reactome_ids = @{$ids_by_key->{$key}};
+            print "$key: @reactome_ids\n" if @reactome_ids > 1;
+        }
+    }
+}
+
+
+# map the chebi terms to the reactome entries
+# ---------------------------------------------------------------------------
+# For every usable ChEBI term (accession starts with "CHEBI:"; obsolete terms
+# only when $allow_obsolete_terms is set) the term's xref keys, its name and,
+# when $allow_synonyms is set, its synonyms are looked up in the Reactome
+# hashes filled by init(). Every hit appends one tab-delimited row
+# (ReactomeID, CHEBI accession, XREF type, XREF id) to @map_results.
+# Mapping statistics are printed to STDOUT at the end. Takes no arguments.
+sub perform_map
+{
+    my $chebi_obo_terms = $ont->get_all_nodes;
+
+    # vars for mapping stats
+    my $attempted_mappings = 0;
+    my %attempted  = (CAS => 0, LIGAND => 0, name => 0, synonym => 0);
+    my %successful = (CAS => 0, LIGAND => 0, name => 0, synonym => 0);
+
+    # Shared lookup step: if $key is present in the given Reactome hash,
+    # append one result row per Reactome id stored under it. Returns the
+    # number of rows added (0 when the key is unknown).
+    my $record_matches = sub {
+        my ($ids_by_key, $key, $chebi_acc, $xref_type, $xref_id) = @_;
+        return 0 unless defined $ids_by_key->{$key};
+        my @reactome_ids = @{$ids_by_key->{$key}};
+        foreach my $reactome_id (@reactome_ids) {
+            push @map_results,
+                join("\t", $reactome_id, $chebi_acc, $xref_type, $xref_id);
+        }
+        return scalar @reactome_ids;
+    };
+
+    # loop through each chebi term
+    foreach my $term (@$chebi_obo_terms)
+    {
+        # eliminate "typedef" nodes (non-CHEBI terms)
+        next unless $term->acc =~ m/^CHEBI:/;
+        # skip obsolete terms unless they are explicitly allowed
+        next if $term->is_obsolete && !$allow_obsolete_terms;
+
+        $attempted_mappings++;
+        my $chebi_acc = $term->acc;
+
+        # attempt CHEBI match on CAS and LIGAND ID's; every xref key is
+        # tried against both hashes, so both attempt counters advance
+        my $xrefs = $term->dbxref_list;
+        foreach my $xref (@$xrefs)
+        {
+            my $xref_key = $xref->xref_key;
+
+            $attempted{CAS}++;
+            $successful{CAS} += $record_matches->(
+                \%reactome_CAS, $xref_key, $chebi_acc, "CAS", $xref_key);
+
+            $attempted{LIGAND}++;
+            $successful{LIGAND} += $record_matches->(
+                \%reactome_LIGAND, $xref_key, $chebi_acc, "LIGAND", $xref_key);
+        }
+
+        # attempt CHEBI match on Reactome Compound Names (and optional
+        # Cyc names/synonyms); keys are upper-cased, the reported id keeps
+        # the original case
+        $attempted{name}++;
+        my $term_name = $term->name;
+        my $name_key  = uc $term_name;
+
+        my @name_sources = (
+            [ \%reactome_CompoundName, "CompoundTerm", "CompoundSynonym" ],
+        );
+        push @name_sources, [ \%reactome_Cyc, "CycTerm", "CycSynonym" ]
+            if $allow_cyc;
+
+        foreach my $source (@name_sources)
+        {
+            my ($ids_by_name, $term_type, $synonym_type) = @$source;
+
+            # Cyc term hits are counted as name hits too
+            $successful{name} += $record_matches->(
+                $ids_by_name, $name_key, $chebi_acc, $term_type, $term_name);
+
+            # ...and synonyms (optional)
+            next unless $allow_synonyms;
+            my $synonyms = $term->synonym_list;
+            foreach my $synonym (@$synonyms)
+            {
+                $attempted{synonym}++;
+                $successful{synonym} += $record_matches->(
+                    $ids_by_name, "\U$synonym", $chebi_acc,
+                    $synonym_type, $synonym);
+            }
+        }
+    }
+
+    # send up some stats on the mapping process
+    my $successful_mappings =
+          $successful{CAS}
+        + $successful{LIGAND}
+        + $successful{name}
+        + $successful{synonym};
+
+    print "\n[ChEBI Stats]",
+        "\nNodes in File: " . $ont->node_count,
+        "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
+        "\nTotal Successful Mappings: " . $successful_mappings . "\n";
+
+    print "\n[Mapping Breakdown by Type]",
+        "\nCAS (matches/attempts): ",
+        "$successful{CAS}/$attempted{CAS} ",
+        "(note: can include ChemIDplus and KEGG COMPOUND db duplicates)",
+        "\nLIGAND: ",
+        "$successful{LIGAND}/$attempted{LIGAND}",
+        "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
+        "$successful{name}/$attempted{name}";
+    if ($allow_synonyms)
+    {
+        print "\nTerm Synonyms: ",
+            "$successful{synonym}/$attempted{synonym}";
+    }
+    print "\n\n";
+}
+
+
+# put the results in the mapped output file
+# ---------------------------------------------------------------------------
+# Writes @map_results (preceded by a header row) to $mapped_output_file and
+# derives three more files under $data_path with awk/sort/uniq:
+#   $sorted_output_file   - all rows sorted, exact duplicates removed
+#   $unique_mappings      - the distinct Reactome ids that got a mapping
+#   $sorted_no_match_file - Reactome input rows whose id got no mapping
+# Takes no arguments. Dies if the mapping file cannot be written or if one
+# of the shell pipelines reports a failure.
+sub create_mapfile
+{
+    if (@map_results > 0)
+    {
+        my $mapped_path   = $data_path . $mapped_output_file;
+        my $sorted_path   = $data_path . $sorted_output_file;
+        my $unique_path   = $data_path . $unique_mappings;
+        my $reactome_path = $data_path . $reactome_file;
+        my $no_match_path = $data_path . $sorted_no_match_file;
+
+        # add a header to the results array
+        unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
+
+        # setup output file
+        open(my $output_fh, '>', $mapped_path)
+            or die "Cannot open '$mapped_path' for writing: $!";
+
+        #format results for file output
+        print {$output_fh} "$_\n" foreach @map_results;
+
+        close($output_fh)
+            or die "Error while writing '$mapped_path': $!";
+
+        my @pipelines = (
+            # sort on all cols (keep the header at the top), remove exact dupes
+            q{awk 'NR == 1; NR > 1 {print $0 | "sort"}' }
+                . "$mapped_path | uniq > $sorted_path",
+
+            # unique Reactome entries having a match...
+            q{awk 'NR == 1; NR > 1 {print $1}' }
+                . "$sorted_path | uniq > $unique_path",
+
+            # ...and those without a match. The id column is compared for
+            # exact equality: "grep -vf" would treat every mapped id as a
+            # substring pattern and also drop rows that merely contain it.
+            # FNR > 1 leaves the Reactome header row out.
+            q{awk -F'\t' 'NR == FNR {seen[$1]; next} FNR > 1 && !($1 in seen)' }
+                . "$unique_path $reactome_path | sort > $no_match_path",
+        );
+
+        # each step reads the previous step's output, so stop at the first failure
+        foreach my $pipeline (@pipelines)
+        {
+            system($pipeline) == 0
+                or die "Shell command failed (status $?): $pipeline\n";
+        }
+    } else {
+        print "\n\nSorry, there are no mapped results.\n\n";
+    }
+}
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+# load the inputs, map, then write the result files
+init();
+#test_inputs();
+perform_map();
+create_mapfile();
+
+exit;
diff --git a/Personnel/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl b/Personnel/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl
new file mode 100644
index 0000000..5dda3e7
--- /dev/null
+++ b/Personnel/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+use strict;
+use Data::Dumper;
+
+# Rewrites an association file using gene symbols/names from a CoGe export.
+#
+#   arg 0: comma-separated CoGe genes file; column 0 = gene symbol,
+#          column 2 = gene name, column 8 = gene model id
+#   arg 1: tab-delimited association file; column 1 holds the gene model id
+#
+# For every association row whose gene model id is known, column 2 is set to
+# the gene symbol and column 9 to the gene name. All non-empty rows are
+# written to a new file named after arg 1, with everything from the first
+# "." of the file name replaced by "_named.assoc".
+
+die "Usage: $0 CoGe_genes_file assoc_file\n" unless @ARGV >= 2;
+
+my $CoGe_genes_file_name = $ARGV[0];
+my $assoc_file_name = $ARGV[1];
+
+# read in CoGe file (arg 0), build hash of gene model ids to symbols/gene names
+
+open(my $coge_fh, '<', $CoGe_genes_file_name)
+    or die "Cannot open CoGe genes file '$CoGe_genes_file_name': $!\n";
+
+my %classical_genes_by_gene_model;
+
+# the first line is read and discarded (presumably a header row -- confirm)
+my $line = <$coge_fh>;
+
+while (defined($line = <$coge_fh>))
+{
+    chomp $line;
+
+    # NOTE(review): plain comma split; a quoted field that itself contains
+    # a comma would shift the columns
+    my @curr_line = split(',',$line);
+
+    # rows too short to carry a gene model id cannot be indexed
+    next unless defined $curr_line[8];
+
+    my ($gene_symbol, $gene_name, $gene_model_id) = @curr_line[0, 2, 8];
+    tr/"//d for $gene_symbol, $gene_name, $gene_model_id;
+
+    #print $gene_symbol . "\|" . $gene_name . "\|" . $gene_model_id . "\n";
+
+    # only GRMZM gene model ids are kept
+    if ($gene_model_id =~ /^GRMZM/) {
+        $classical_genes_by_gene_model{$gene_model_id} = [ $gene_symbol, $gene_name ];
+    }
+}
+
+close($coge_fh);
+
+#print Dumper(\%classical_genes_by_gene_model) . "\n\n";
+
+# read in assoc file (arg 1)
+
+open(my $assoc_fh, '<', $assoc_file_name)
+    or die "Cannot open assoc file '$assoc_file_name': $!\n";
+
+# Cut at the first "." of the file name only, so dots in the directory part
+# (e.g. "./data/x.assoc") no longer truncate the whole path.
+my ($out_stem) = $assoc_file_name =~ m{^((?:.*/)?[^/.]*)};
+my $out_file_name = $out_stem . "_named.assoc";
+
+open(my $out_fh, '>', $out_file_name)
+    or die "Cannot open '$out_file_name' for writing: $!\n";
+
+while (defined($line = <$assoc_fh>))
+{
+    chomp $line;
+    next unless length($line) > 0;
+
+    #print $line. "\n";
+
+    # limit -1 keeps trailing empty columns, so untouched rows are
+    # written back with their original column count
+    my @curr_line = split(/\t/, $line, -1);
+
+    # look for each annotation's hashed gene model id
+    my $gene_model_id = $curr_line[1];
+    if (defined $gene_model_id
+        && defined $classical_genes_by_gene_model{$gene_model_id})
+    {
+        # add/replace the appropriate cols
+        my ($gene_symbol, $gene_name) = @{$classical_genes_by_gene_model{$gene_model_id}};
+        $curr_line[2] = $gene_symbol;
+        $curr_line[9] = $gene_name;
+    }
+
+    # output to new assoc file with appended name; columns created by
+    # padding a short row are written as empty strings
+    #print join("\t", @curr_line) . "\n";
+    print {$out_fh} join("\t", map { defined $_ ? $_ : '' } @curr_line) . "\n";
+}
+
+close($assoc_fh);
+close($out_fh)
+    or die "Error while writing '$out_file_name': $!\n";
+exit;
diff --git a/Personnel/preecej/php_singletons/PO_web_service.php b/Personnel/preecej/php_singletons/PO_web_service.php
new file mode 100644
index 0000000..9ba7afe
--- /dev/null
+++ b/Personnel/preecej/php_singletons/PO_web_service.php
@@ -0,0 +1,106 @@
+ 50) { $number_of_terms = 50; }
+
+ /* the search value is mandatory; it is lower-cased before use */
+ if (!isset($_GET['qval']) || !is_string($_GET['qval']) || strlen($_GET['qval']) == 0) {
+     die('Please provide a searchable value');
+ }
+ $qval = strtolower($_GET['qval']);
+
+ /* json is the default when no format is requested */
+ $format = isset($_GET['format']) && is_string($_GET['format']) && strlen($_GET['format']) > 0
+     ? strtolower($_GET['format'])
+     : 'json';
+
+ /* connect to the db */
+ $link = mysql_connect($_SERVER['mysql_host'], $_SERVER['mysql_user'], $_SERVER['mysql_pw']) or die('Cannot connect to the DB');
+ mysql_select_db($_SERVER['mysql_db'],$link) or die('Cannot select the DB');
+
+ /* $qval is placed inside quoted SQL string literals by the queries below,
+    so it is escaped here, once, for this connection */
+ $qval = mysql_real_escape_string($qval, $link);
+
+ /* Dispatch on the requested web-service method ($type).
+    NOTE(review): $field, $qval and $number_of_terms are interpolated
+    directly into the SQL text below; they must already be validated or
+    escaped when execution reaches this point.
+    NOTE(review): both queries LEFT JOIN term_definition but then filter on
+    d.term_definition in the WHERE clause; for a term without a definition
+    that comparison evaluates to NULL, so such terms are not returned. */
+ switch ($type) {
+ case 'autocomplete':
+ /* returns up to $number_of_terms values of t.$field that contain $qval,
+    restricted to non-obsolete plant anatomy / growth-stage terms */
+ /* grab the terms from the db */
+ $query = "SELECT t.$field FROM term t"
+ . " LEFT JOIN term_definition d ON d.term_id = t.id"
+ . " WHERE t.$field LIKE '%$qval%'"
+ . " AND t.term_type in ('plant_anatomy','plant_growth_and_development_stage')"
+ . " AND t.is_obsolete = 0"
+ . " AND UCASE(t.name) NOT LIKE 'OBSOLETE%'"
+ . " AND UCASE(d.term_definition) NOT LIKE 'OBSOLETE%'"
+ . " ORDER BY name LIMIT $number_of_terms";
+ $result = mysql_query($query,$link) or die('Errant query: '.$query);
+
+ /* create one master array of the records */
+ $terms = array();
+ if(mysql_num_rows($result)) {
+ while($term = mysql_fetch_assoc($result)) {
+ $terms[] = array('term'=>$term[$field]);
+ }
+ }
+
+ /* output in necessary format */
+ /* json is the only format implemented; any other value ends the request */
+ if($format == 'json') {
+ header('Content-type: application/json');
+ echo json_encode(array('PO_term_lookup_response'=>$terms));
+ }
+ else {
+ die('Sorry, this request cannot be fulfilled in '.$format.' format.');
+ }
+ break;
+
+ case 'term_detail':
+ /* returns accession, aspect, definition and comment of the single term
+    whose name equals $qval (LIMIT 1) */
+ /* grab the ontology data from the db */
+ $query = "SELECT DISTINCT t.acc as 'acc', t.term_type as 'type', d.term_definition as 'definition', d.term_comment as 'comment'"
+ . " FROM term t"
+ . " LEFT JOIN term_definition d ON d.term_id = t.id"
+ . " WHERE t.name = '$qval'"
+ . " AND t.term_type in ('plant_anatomy','plant_growth_and_development_stage')"
+ . " AND t.is_obsolete = 0"
+ . " AND UCASE(t.name) NOT LIKE 'OBSOLETE%'"
+ . " AND UCASE(d.term_definition) NOT LIKE 'OBSOLETE%'"
+ . " ORDER BY t.name LIMIT 1";
+ $result = mysql_query($query,$link) or die('Errant query: '.$query);
+
+ /* create one master array of the records */
+ $terms = array();
+ if(mysql_num_rows($result)) {
+ while($term = mysql_fetch_assoc($result)) {
+ /* term_type is one of the two values allowed by the query, so anything
+    that is not plant_anatomy is labelled as a growth/development stage */
+ $terms[] = array(
+ 'accession_id'=>$term['acc'],
+ 'aspect'=>$term['type'] == "plant_anatomy" ? "Plant Anatomy" : "Plant Growth and Development Stage",
+ 'definition'=>$term['definition'],
+ 'comment'=>$term['comment']);
+ }
+ }
+ /* output in necessary format */
+ if($format == 'json') {
+ header('Content-type: application/json');
+ echo json_encode(array('PO_term_detail_response'=>$terms));
+ }
+ else {
+ die('Sorry, this request cannot be fulfilled in '.$format.' format.');
+ }
+ break;
+ default:
+ /* any other $type value is rejected */
+ die('Sorry, this web service method is not available.');
+ }
+ /* disconnect from the db */
+ @mysql_close($link);
+}
+else { die('Not authorized.'); }
+?>
+
diff --git a/Personnel/preecej/php_singletons/cmd_line_test.php b/Personnel/preecej/php_singletons/cmd_line_test.php
new file mode 100644
index 0000000..0a58b4e
--- /dev/null
+++ b/Personnel/preecej/php_singletons/cmd_line_test.php
@@ -0,0 +1,14 @@
+load('/home/preecej/Documents/projects/pathvisio/Ath_scratch.gpml');
+
+$entry = $doc->getElementsByTagName("Pathway");
+$author = $entry->item(0)->getAttribute("Author");
+print "Author:$author\n";
+
+print "test over\n";
+
+?>
diff --git a/Personnel/preecej/sql_scripts/Reactome_EC_number_reaction_name_mapping.sql b/Personnel/preecej/sql_scripts/Reactome_EC_number_reaction_name_mapping.sql
new file mode 100644
index 0000000..6f60c73
--- /dev/null
+++ b/Personnel/preecej/sql_scripts/Reactome_EC_number_reaction_name_mapping.sql
@@ -0,0 +1,32 @@
+USE gk_central_042211;
+
+/* query to id fully-qualified EC numbers in poorly-named A.th. reactions:
+   reaction names containing an arrow, paired with cross-reference
+   identifiers that contain no dash */
+SELECT e2n.name,
+       d.identifier
+FROM ReactionlikeEvent AS r
+    INNER JOIN Event_2_crossReference AS e2c ON e2c.DB_ID = r.DB_ID
+    INNER JOIN Event_2_name AS e2n ON e2n.DB_ID = r.DB_ID
+    INNER JOIN DatabaseIdentifier AS d ON d.DB_ID = e2c.crossReference
+    INNER JOIN Event_2_species AS e ON e.DB_ID = r.DB_ID
+    INNER JOIN Species AS s ON s.DB_ID = e.species
+    INNER JOIN Taxon_2_name AS t ON t.DB_ID = s.DB_ID
+WHERE t.name LIKE 'Arabidopsis%'
+  AND d.referenceDatabase = 4
+  AND e2n.name LIKE '%->%'
+  AND d.identifier NOT LIKE '%-%'
+ORDER BY e2n.name;
+
+USE test;
+/* load in PJ's EC nums and names file */
+/* mysqlimport -u root -p -v --local --ignore-lines=12 test ./EC_reaction_names.txt */
+
+/* scratch queries */
+SELECT *
+FROM Taxon_2_name
+WHERE name LIKE '%Ara%';
+
+SELECT *
+FROM Species AS s
+    INNER JOIN Taxon_2_name AS t ON t.DB_ID = s.DB_ID
+WHERE t.name LIKE '%Arabi%';
+
+SELECT * FROM Event_2_crossReference;
+
+SELECT * FROM DatabaseIdentifier;
+
+SELECT * FROM ReferenceDatabase;
\ No newline at end of file
diff --git a/planteome/.gitignore b/planteome/.gitignore
deleted file mode 100644
index e69de29..0000000
diff --git a/planteome/paw/content_pages/Main_Page.wiki b/planteome/paw/content_pages/Main_Page.wiki
new file mode 100644
index 0000000..7e31afe
--- /dev/null
+++ b/planteome/paw/content_pages/Main_Page.wiki
@@ -0,0 +1,37 @@
+__NOTOC__
+=== Welcome to the Planteome Annotation Wiki, a resource for annotated plant genotypes and phenotypes on the semantic web! ===
+
+We host annotations from a variety of published data sources. You are free to browse these annotations and search for specific genes, synonyms, EC numbers, cross-references, and references to publications, as well as many other types of data.
+
+You may also add information of your own. If you would like to create your own annotations or add more data to existing annotations, please click [[Special:UserLogin|here]] to establish a new curator account.
+
+If you already have a curator account, just [[Special:UserLogin|log in]] and begin annotating!
+
+Search for existing annotations (or data therein): (search box)
+
+[[Special:FormEdit/Annotation|Add a new gene annotation.]]
+
+=== What is semantic data, and how can it enhance my research? ===
+
+examples...
+
+=== Our Collaborators ===
+
+* Gramene...
+* The Plant Ontology...
+
+=== Credits and Thanks ===
+
+* Jaiswal Lab (and the Planteome portal)...
+* OSU...
+* POC
+* NSF...
+* etc...
+
+
+
diff --git a/planteome/paw/content_pages/Reference:Plant_Taxa.wiki b/planteome/paw/content_pages/Reference:Plant_Taxa.wiki
new file mode 100644
index 0000000..7555bed
--- /dev/null
+++ b/planteome/paw/content_pages/Reference:Plant_Taxa.wiki
@@ -0,0 +1,34 @@
+=== A list of reference species for use on this wiki ===
+{{#show:{{PAGENAME}}
+ | format=table
+ | mainlabel=-
+ | ?Has Reference Taxon
+}}
+
+{{#set:Has Reference Taxon=Aquilegia coerulea}}
+{{#set:Has Reference Taxon=Arabidopsis lyrata}}
+{{#set:Has Reference Taxon=Arabidopsis thaliana}}
+{{#set:Has Reference Taxon=Brachypodium distachyon}}
+{{#set:Has Reference Taxon=Carica papaya}}
+{{#set:Has Reference Taxon=Chlamydomonas reinhardtii}}
+{{#set:Has Reference Taxon=Citrus clementina}}
+{{#set:Has Reference Taxon=Citrus sinensis}}
+{{#set:Has Reference Taxon=Cucumis sativus}}
+{{#set:Has Reference Taxon=Eucalyptus grandis}}
+{{#set:Has Reference Taxon=Fragaria vesca}}
+{{#set:Has Reference Taxon=Glycine max}}
+{{#set:Has Reference Taxon=Manihot esculenta}}
+{{#set:Has Reference Taxon=Medicago truncatula}}
+{{#set:Has Reference Taxon=Mimulus guttatus}}
+{{#set:Has Reference Taxon=Oryza sativa}}
+{{#set:Has Reference Taxon=Physcomitrella patens}}
+{{#set:Has Reference Taxon=Populus trichocarpa}}
+{{#set:Has Reference Taxon=Prunus persica}}
+{{#set:Has Reference Taxon=Ricinus communis}}
+{{#set:Has Reference Taxon=Selaginella moellendorffii}}
+{{#set:Has Reference Taxon=Setaria italica}}
+{{#set:Has Reference Taxon=Sorghum bicolor}}
+{{#set:Has Reference Taxon=Vitis vinifera}}
+{{#set:Has Reference Taxon=Volvox carteri}}
+{{#set:Has Reference Taxon=Zea mays}}
+
diff --git a/planteome/paw/extensions/DataTransfer_PS/COPYING b/planteome/paw/extensions/DataTransfer_PS/COPYING
new file mode 100644
index 0000000..a865928
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/COPYING
@@ -0,0 +1,348 @@
+The license text below "----" applies to all files within this distribution, other
+than those that are in a directory which contains files named "LICENSE" or
+"COPYING", or a subdirectory thereof. For those files, the license text contained in
+said file overrides any license information contained in directories of smaller depth.
+Alternative licenses are typically used for software that is provided by external
+parties, and merely packaged with the Data Transfer release for convenience.
+----
+
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year>  <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/planteome/paw/extensions/DataTransfer_PS/DataTransfer.php b/planteome/paw/extensions/DataTransfer_PS/DataTransfer.php
new file mode 100644
index 0000000..a5fbcfd
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/DataTransfer.php
@@ -0,0 +1,149 @@
+ __FILE__,
+ 'name' => 'Data Transfer',
+ 'version' => DATA_TRANSFER_VERSION,
+ 'author' => 'Yaron Koren',
+ 'url' => 'http://www.mediawiki.org/wiki/Extension:Data_Transfer',
+ 'descriptionmsg' => 'datatransfer-desc',
+);
+
+###
+# This is the path to your installation of the Data Transfer extension as
+# seen on your local filesystem. Used against some PHP file path
+# issues.
+##
+$dtgIP = dirname( __FILE__ );
+##
+
+// register all special pages and other classes
+// (each special page name is mapped to its class name, and each class
+// name to the file under $dtgIP that it is autoloaded from)
+$wgAutoloadClasses['DTUtils'] = $dtgIP . '/includes/DT_Utils.php';
+$wgSpecialPages['ViewXML'] = 'DTViewXML';
+$wgAutoloadClasses['DTViewXML'] = $dtgIP . '/specials/DT_ViewXML.php';
+$wgSpecialPages['ImportXML'] = 'DTImportXML';
+$wgAutoloadClasses['DTImportXML'] = $dtgIP . '/specials/DT_ImportXML.php';
+$wgSpecialPages['ImportCSV'] = 'DTImportCSV';
+$wgAutoloadClasses['DTImportCSV'] = $dtgIP . '/specials/DT_ImportCSV.php';
+// job class used for the 'dtImport' job type
+$wgJobClasses['dtImport'] = 'DTImportJob';
+$wgAutoloadClasses['DTImportJob'] = $dtgIP . '/includes/DT_ImportJob.php';
+$wgAutoloadClasses['DTXMLParser'] = $dtgIP . '/includes/DT_XMLParser.php';
+// hook handlers; both functions are defined further down in this file
+$wgHooks['AdminLinks'][] = 'dtfAddToAdminLinks';
+$wgHooks['smwInitProperties'][] = 'dtfInitProperties';
+
+###
+# This is the path to your installation of the Data Transfer extension as
+# seen from the web. Change it if required ($wgScriptPath is the
+# path to the base directory of your wiki). No final slash.
+##
+$dtgScriptPath = $wgScriptPath . '/extensions/DataTransfer_PS';
+##
+
+###
+# Permission to import files
+# (the 'datatransferimport' right is granted to the 'sysop' group here)
+###
+$wgGroupPermissions['sysop']['datatransferimport'] = true;
+$wgAvailableRights[] = 'datatransferimport';
+
+// initialize content language
+// (dtfInitContentLanguage() is defined further down in this file)
+require_once($dtgIP . '/languages/DT_Language.php');
+global $wgLanguageCode;
+dtfInitContentLanguage($wgLanguageCode);
+
+// localisation message and alias files for this extension
+$wgExtensionMessagesFiles['DataTransfer'] = $dtgIP . '/languages/DT_Messages.php';
+$wgExtensionAliasesFiles['DataTransfer'] = $dtgIP . '/languages/DT_Aliases.php';
+
+/**********************************************/
+/***** language settings *****/
+/**********************************************/
+
+/**
+ * Set up the global content-language object ($dtgContLang).
+ *
+ * This has to run early, before the user language is known, because the
+ * content language determines the labels of additional namespaces.
+ * Messages, by contrast, can be initialised much later, when they are
+ * actually needed.
+ *
+ * The class name is derived from the language code ('DT_Language' plus
+ * the code with its first letter upper-cased and '-' replaced by '_').
+ * When no such class can be loaded, DT_LanguageEn is used instead.
+ * Does nothing if $dtgContLang is already set.
+ *
+ * @param string $langcode language code of the wiki's content language
+ */
+function dtfInitContentLanguage( $langcode ) {
+	global $dtgIP, $dtgContLang;
+
+	// already initialised - keep the existing object
+	if ( !empty( $dtgContLang ) ) {
+		return;
+	}
+
+	$languageDir = $dtgIP . '/languages/';
+	$className = 'DT_Language' . str_replace( '-', '_', ucfirst( $langcode ) );
+	$classFile = $languageDir . $className . '.php';
+
+	if ( file_exists( $classFile ) ) {
+		include_once( $classFile );
+	}
+
+	// language not supported: fall back to English
+	if ( !class_exists( $className ) ) {
+		$className = 'DT_LanguageEn';
+		include_once( $languageDir . 'DT_LanguageEn.php' );
+	}
+
+	$dtgContLang = new $className();
+}
+
+/**
+ * Set up the global user-language object ($dtgLang).
+ *
+ * This must run after the content language has been initialised,
+ * because the content-language object ($dtgContLang) is used as the
+ * fallback when no class exists for the requested language.
+ * Does nothing if $dtgLang is already set.
+ *
+ * @param string $langcode language code of the user's language
+ */
+function dtfInitUserLanguage( $langcode ) {
+	global $dtgIP, $dtgLang;
+
+	// already initialised - keep the existing object
+	if ( !empty( $dtgLang ) ) {
+		return;
+	}
+
+	$className = 'DT_Language' . str_replace( '-', '_', ucfirst( $langcode ) );
+	$classFile = $dtgIP . '/languages/' . $className . '.php';
+
+	if ( file_exists( $classFile ) ) {
+		include_once( $classFile );
+	}
+
+	if ( class_exists( $className ) ) {
+		$dtgLang = new $className();
+		return;
+	}
+
+	// language not supported: reuse the content-language object
+	global $dtgContLang;
+	$dtgLang = $dtgContLang;
+}
+
+/**********************************************/
+/***** other global helpers *****/
+/**********************************************/
+
+/**
+ * Handler registered above for the 'smwInitProperties' hook: registers
+ * the '_DT_XG' property with SMWPropertyValue, labelled with the
+ * content language's entry for DT_HAS_XML_GROUPING.
+ *
+ * @return bool always true
+ */
+function dtfInitProperties() {
+	global $dtgContLang;
+
+	$propertyLabels = $dtgContLang->getPropertyLabels();
+	$xmlGroupingLabel = $propertyLabels[DT_HAS_XML_GROUPING];
+	SMWPropertyValue::registerProperty( '_DT_XG', '_str', $xmlGroupingLabel, true );
+	// TODO - this should set a "backup" English value as well,
+	// so that the phrase "Has XML grouping" works in all languages
+	return true;
+}
+
+/**
+ * Handler for the 'AdminLinks' hook (the 'AdminLinks' special page is
+ * defined by the Admin Links extension): adds links to this extension's
+ * special pages to the 'main' row of the import/export section.
+ *
+ * @param object $admin_links_tree link tree passed in by the hook
+ * @return bool always true
+ */
+function dtfAddToAdminLinks( $admin_links_tree ) {
+	$section = $admin_links_tree->getSection( wfMsg( 'adminlinks_importexport' ) );
+	$row = $section->getRow( 'main' );
+
+	// same pages, in the same order, as registered in $wgSpecialPages
+	foreach ( array( 'ViewXML', 'ImportXML', 'ImportCSV' ) as $specialPage ) {
+		$row->addItem( ALItem::newFromSpecialPage( $specialPage ) );
+	}
+
+	return true;
+}
diff --git a/planteome/paw/extensions/DataTransfer_PS/INSTALL b/planteome/paw/extensions/DataTransfer_PS/INSTALL
new file mode 100644
index 0000000..16462b6
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/INSTALL
@@ -0,0 +1,31 @@
+[[Data Transfer 0.3.8]]
+
+Contents:
+* Disclaimer
+* Requirements
+* Installation
+* Contact
+
+== Disclaimer ==
+
+For a proper legal disclaimer, see the file "COPYING".
+
+== Requirements ==
+
+The extension can make use of, but does not require, an install of
+Semantic MediaWiki. If Semantic MediaWiki is used, it must be of
+version 1.0 or greater. For more details, see Semantic MediaWiki's
+own installation requirements.
+
+== Installation ==
+
+(1) Extract the archive to obtain the directory "DataTransfer"
+ that contains all relevant files. Copy this directory (or
+ extract/download it) to "[wikipath]/extensions/".
+(2) Insert the following line into the file "[wikipath]/LocalSettings.php":
+ include_once('extensions/DataTransfer/DataTransfer.php');
+
+== Contact ==
+
+If you have any issues or questions, please send them to
+yaron57@gmail.com.
diff --git a/planteome/paw/extensions/DataTransfer_PS/README b/planteome/paw/extensions/DataTransfer_PS/README
new file mode 100644
index 0000000..66652b3
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/README
@@ -0,0 +1,24 @@
+== About ==
+
+Data Transfer is an extension to MediaWiki that both exports XML
+based on the current contents of pages in a wiki, and imports pages
+in both XML format (using the same structure as the XML export) and
+CSV format. Both the XML and CSV formats use template calls, and
+the fields within them, to define the data structure. Any text that
+is not within a template call gets placed into one or more "free
+text" fields.
+
+For more information on Data Transfer, see the extension
+homepage at
+http://www.mediawiki.org/wiki/Extension:Data_Transfer
+
+Notes on installing Data Transfer can be found in the file INSTALL.
+
+== Credits ==
+
+Data Transfer was written by Yaron Koren.
+
+== Contact ==
+
+Comments, questions, suggestions and bug reports should be
+sent to Yaron at yaron57@gmail.com.
diff --git a/planteome/paw/extensions/DataTransfer_PS/data_transfer_0.3.8.tar.gz b/planteome/paw/extensions/DataTransfer_PS/data_transfer_0.3.8.tar.gz
new file mode 100644
index 0000000..8c58be9
Binary files /dev/null and b/planteome/paw/extensions/DataTransfer_PS/data_transfer_0.3.8.tar.gz differ
diff --git a/planteome/paw/extensions/DataTransfer_PS/includes/DT_ImportJob.php b/planteome/paw/extensions/DataTransfer_PS/includes/DT_ImportJob.php
new file mode 100644
index 0000000..3a5e76f
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/includes/DT_ImportJob.php
@@ -0,0 +1,53 @@
+title ) ) {
+ $this->error = "dtImport: Invalid title";
+ wfProfileOut( __METHOD__ );
+ return false;
+ }
+
+ $article = new Article( $this->title );
+ if ( !$article ) {
+ $this->error = 'dtImport: Article not found "' . $this->title->getPrefixedDBkey() . '"';
+ wfProfileOut( __METHOD__ );
+ return false;
+ }
+ $for_pages_that_exist = $this->params['for_pages_that_exist'];
+ if ( $for_pages_that_exist == 'skip' && $this->title->exists() ) {
+ return true;
+ }
+
+ // change global $wgUser variable to the one specified by
+ // the job only for the extent of this import
+ global $wgUser;
+ $actual_user = $wgUser;
+ $wgUser = User::newFromId( $this->params['user_id'] );
+ $text = $this->params['text'];
+ if ( $for_pages_that_exist == 'append' && $this->title->exists() ) {
+ $text = $article->getContent() . "\n" . $text;
+ }
+ $edit_summary = $this->params['edit_summary'];
+ $article->doEdit( $text, $edit_summary );
+ $wgUser = $actual_user;
+ wfProfileOut( __METHOD__ );
+ return true;
+ }
+}
diff --git a/planteome/paw/extensions/DataTransfer_PS/includes/DT_Utils.php b/planteome/paw/extensions/DataTransfer_PS/includes/DT_Utils.php
new file mode 100644
index 0000000..30736b1
--- /dev/null
+++ b/planteome/paw/extensions/DataTransfer_PS/includes/DT_Utils.php
@@ -0,0 +1,89 @@
+