--- /dev/null
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+Plant Semantics Import Transformation Script
+
+=head1 VERSION
+
+0.1 (svn r100)
+
+=head1 DESCRIPTION
+
+Transform external gene annotation data into an XML document readable
+by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
+Also generates appropriate provenance of data based on header of
+import file.
+
+=head1 USAGE
+
+PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE
+
+=head1 USAGE
+
+ -i|--input Name of input CSV or tab-del file.
+ -t|--type Specifies input type of file ('csv' or 'tab')
+ -o|--output Name of output file.
+
+=head1 DEPENDENCIES
+
+Requires that the input files contain two headers: the first will hold
+the provenance information associated with the imported data, and the
+second will specify the template and field names for the annotation
+data.
+
+=head2 Provenance Header Format Example
+
+ [Provenance]
+ Source Date Time Stamp=Apr 2 2008
+ Source Data=Oryzabase
+ Source Version=rel. 10
+ Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
+ Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll
+
+=head2 Data Header Format Example
+
+ Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID]
+
+=head1 AUTHOR
+
+Justin Preece
+ Faculty Research Assistant, Bioinformatics
+ Jaiswal Lab, Botany & Plant Pathology
+ Oregon State University
+ L<mailto:preecej@science.oregonstate.edu>
+
+=cut
+
+# ---------------------------------------------------------------------------
+# modules
+# ---------------------------------------------------------------------------
+
+# general
+use strict;
+use Cwd; qw(cwd);
+
+# specific
+use GO::Parser;
+
+# ---------------------------------------------------------------------------
+# declarations
+# ---------------------------------------------------------------------------
+
+# set paths to data files
+my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/";
+my $chebi_obo_file = "../../chebi_v78.obo";
+my $reactome_file = "AraReferenceMolecules.txt";
+my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.2_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
+
+# options
+my $allow_obsolete_terms = 1;
+my $allow_cyc = 0;
+my $allow_synonyms = 0;
+
+my $ont; # chebi ontology
+
+my %reactome_CompoundName; # reactome Compound Name hash
+my %reactome_CAS; # reactome CAS hash
+my %reactome_LIGAND; # reactome LIGAND hash
+my %reactome_Cyc; # reactome Cyc hash
+
+my @map_results = (); # successful mappings between chebi and reactome
+
+
+# ---------------------------------------------------------------------------
+# functions
+# ---------------------------------------------------------------------------
+
+
+# setup chebi parser and reactome data
+# ---------------------------------------------------------------------------
+sub init
+{
+ # init ontology parser and ontology
+ my $parser = GO::Parser->new({handler=>'obj'});
+ $parser->parse($data_path . $chebi_obo_file);
+ $ont = $parser->handler->graph;
+
+ # read reactome file into 3 separate hashes
+ open(REACTOME_FILE,$data_path . $reactome_file);
+
+ my $line = <REACTOME_FILE>; # skip the header
+ my $reactome_count = 0;
+
+ while (<REACTOME_FILE>)
+ {
+ $line = $_;
+ chomp $line;
+ $reactome_count++;
+ my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+ # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
+ my $reactome_id = $reactome_entry[0];
+ my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
+
+ # strips off "AN " and "A " indefinite articles
+ $compound_name =~ s/^ //;
+ $compound_name =~ s/^AN //;
+ $compound_name =~ s/^A //;
+
+ my $CAS_id = $reactome_entry[2];
+ my $LIGAND_id = $reactome_entry[3];
+ my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
+
+ # There is a possibility that a single CAS, LIGAND, or Cyc
+ # identifier may appear in more than one reactome entry. This
+ # temp array allows each matched hash value to hold more than
+ # one ReactomeID, if necessary.
+
+ # --CAS Hash Load--
+ if ($CAS_id ne "-") { # keep those "-" placeholders out
+ # build the CAS hash; each value may hold 1...n reactome
+ # ids (as an array)
+ push @{$reactome_CAS{$CAS_id}}, $reactome_id;
+ }
+
+ # similarly...
+
+ # --LIGAND Hash Load--
+ if ($LIGAND_id ne "-") {
+ push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
+ }
+
+ # --CompoundName Hash Load--
+ if ($compound_name ne "-") {
+ push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
+ }
+
+ # --Cyc Hash Load--
+ if ($allow_cyc)
+ {
+ if ($Cyc_term ne "-") {
+ push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
+ }
+
+ }
+ }
+ close REACTOME_FILE;
+
+ print "\n[Reactome Stats]",
+ "\nTotal Reactome Entries: $reactome_count\n";
+
+}
+
+
+# spit out some data to make sure you've read in the files correctly
+# ---------------------------------------------------------------------------
+sub test_inputs
+{
+ # basic ontology info
+ print "[Node Count]: " . $ont->node_count . "\n";
+
+ # get all chebi terms in the ontology
+ my $terms = $ont->get_all_nodes;
+
+ # output contents of parsed ontology
+ foreach my $term (@$terms)
+ {
+ print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
+
+ my $synonyms = $term->synonym_list;
+ foreach my $synonym (@$synonyms) {
+ print $synonym . "\n";
+ }
+
+ print "[XREFS]\n";
+ my $xrefs = $term->dbxref_list;
+ foreach my $xref (@$xrefs) {
+ print $xref->xref_key . ",",
+ $xref->xref_keytype . ",",
+ $xref->xref_dbname . ",",
+ $xref->xref_desc . "\n";
+ }
+ print "\n";
+ }
+
+ # show dupes in reactome hashes - give data to Pankaj;
+ # this is important b/c the duplicates may represent erroneous data in
+ # the Reactome dataset
+ my $k; my @v;
+ print "\n[Reactome Hashes - Dupes]\n";
+ print "\n--CAS Hash--\n";
+ for $k (keys %reactome_CAS) {
+ if (@{$reactome_CAS{$k}} > 1) {
+ print "$k: @{$reactome_CAS{$k}}\n";
+ }
+ }
+ print "\n--LIGAND Hash--\n";
+ for $k (keys %reactome_LIGAND) {
+ if (@{$reactome_LIGAND{$k}} > 1) {
+ print "$k: @{$reactome_LIGAND{$k}}\n";
+ }
+ }
+ print "\n--CompoundName Hash--\n";
+ for $k (keys %reactome_CompoundName) {
+ if (@{$reactome_CompoundName{$k}} > 1) {
+ print "$k: @{$reactome_CompoundName{$k}}\n";
+ }
+ }
+ if ($allow_cyc)
+ {
+ print "\n--Cyc Hash--\n";
+ for $k (keys %reactome_Cyc) {
+ if (@{$reactome_Cyc{$k}} > 1) {
+ print "$k: @{$reactome_Cyc{$k}}\n";
+ }
+ }
+ }
+}
+
+
+# map the chebi terms to the reactome entries
+# ---------------------------------------------------------------------------
+sub perform_map
+{
+ my $chebi_obo_terms = $ont->get_all_nodes;
+
+ # vars for mapping stats
+ my $attempted_mappings = 0;
+ my $successful_mappings = 0;
+ my $attempted_CAS_mappings = 0;
+ my $successful_CAS_mappings = 0;
+ my $attempted_LIGAND_mappings = 0;
+ my $successful_LIGAND_mappings = 0;
+ my $attempted_name_mappings = 0;
+ my $successful_name_mappings = 0;
+ my $attempted_synonym_mappings = 0;
+ my $successful_synonym_mappings = 0;
+
+ # loop through each chebi term
+ foreach my $term (@$chebi_obo_terms)
+ {
+ # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
+ # terms and whether to allow them
+ if (($term->acc =~ m/^CHEBI:/)
+ && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
+ {
+ # attempt CHEBI match on CAS and LIGAND ID's
+ $attempted_mappings++;
+ my $xrefs = $term->dbxref_list;
+ foreach my $xref (@$xrefs)
+ {
+ $attempted_CAS_mappings++;
+ $attempted_LIGAND_mappings++;
+
+ # temp-foo to skirt an interpolation problem
+ my $tmp_key = $xref->xref_key;
+
+ if (defined($reactome_CAS{"$tmp_key"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
+ {
+ $successful_CAS_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CAS\t" .
+ $tmp_key);
+ }
+ }
+
+ if (defined($reactome_LIGAND{"$tmp_key"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
+ {
+ $successful_LIGAND_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "LIGAND\t" .
+ $tmp_key);
+ }
+ }
+ }
+
+ # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
+ $attempted_name_mappings++;
+
+ # more temp-foo to skirt said interpolation problem
+ my $tmp_name = uc $term->name;
+
+ # reactome compound names...
+ if (defined($reactome_CompoundName{"$tmp_name"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
+ {
+ $successful_name_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CompoundTerm\t" .
+ $term->name);
+ }
+ }
+ # ...and synonyms (optional)
+ if ($allow_synonyms)
+ {
+ my $synonyms = $term->synonym_list;
+ foreach my $synonym (@$synonyms)
+ {
+ $attempted_synonym_mappings++;
+
+ # yet more temp-foo to skirt interpolation problem
+ my $tmp_syn = "\U$synonym";
+
+ if (defined($reactome_CompoundName{$tmp_syn}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
+ {
+ $successful_synonym_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CompoundSynonym\t" .
+ $synonym);
+ }
+ }
+ }
+ }
+
+ # Cyc names...
+ if ($allow_cyc)
+ {
+ if (defined($reactome_Cyc{"$tmp_name"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
+ {
+ $successful_name_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CycTerm\t" .
+ $term->name);
+ }
+ }
+ # ...and synonyms (optional)
+ if ($allow_synonyms)
+ {
+ my $synonyms = $term->synonym_list;
+ foreach my $synonym (@$synonyms)
+ {
+ $attempted_synonym_mappings++;
+
+ # yet more temp-foo to skirt interpolation problem
+ my $tmp_syn = "\U$synonym";
+
+ if (defined($reactome_Cyc{$tmp_syn}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
+ {
+ $successful_synonym_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CycSynonym\t" .
+ $synonym);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ # send up some stats on the mapping process
+ $successful_mappings =
+ + $successful_CAS_mappings
+ + $successful_LIGAND_mappings
+ + $successful_name_mappings
+ + $successful_synonym_mappings;
+
+ print "\n[ChEBI Stats]",
+ "\nNodes in File: " . $ont->node_count,
+ "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
+ "\nTotal Successful Mappings: " . $successful_mappings . "\n";
+
+ print "\n[Mapping Breakdown by Type]",
+ "\nCAS (matches/attempts): ",
+ "$successful_CAS_mappings/$attempted_CAS_mappings ",
+ "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
+ "\nLIGAND: ",
+ "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
+ "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
+ "$successful_name_mappings/$attempted_name_mappings";
+ if ($allow_synonyms)
+ {
+ print "\nTerm Synonyms: ",
+ "$successful_synonym_mappings/$attempted_synonym_mappings";
+ }
+ print "\n\n";
+}
+
+
+# put the results in the mapped output file
+# ---------------------------------------------------------------------------
+sub create_mapfile
+{
+ if (@map_results > 0)
+ {
+ # add a header to the results array
+ unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
+
+ # setup output file
+ open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
+
+ #format results for file output
+ print OUTPUT_FILE "$_\n" foreach @map_results;
+
+ close OUTPUT_FILE;
+
+ # sort on all cols (keep the header at the top), remove exact dupes
+ system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
+
+ # also produce files listing unique Reactome entries having a match...and those without a match
+ system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
+ system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
+ } else {
+ print "\n\nSorry, there are no mapped results.\n\n";
+ }
+}
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+init;
+#test_inputs;
+perform_map;
+create_mapfile;
+
+exit;
+