#!/usr/bin/perl -w
use strict;
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# Rice Reactome - CHEBI Ontology Mapping Script
#
# Justin Preece, 10/06/10
# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
#
# Inputs:
+#
# CHEBI OBO file (preset)
+#
# Rice Reactome file (preset, provided by YuanMing Wu)
# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [RiceCyc]
# (Row) 923893 S-adenosyl-L-methionine 29908-03-0 C00019 S-ADENOSYLMETHIONINE ** the '-' (dash) symbol will be applied to any empty columns
#
# Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
-# (Header) [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
-# (Row) 923893 15414 CAS 29908-03-0
-# (Row) 923893 15414 LIGAND C00019
-# (Row) 923893 15414 RiceCyc S-ADENOSYLMETHIONINE ** this would be a rare mapping occurrence
-# --------------------------------------------------------------------
+#
+# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
+# 923893 15414 CAS 29908-03-0
+# 923893 15414 LIGAND C00019
+# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE
+# 923893 15414 RiceCycSynonym s-adenosylmethionine
+# ---------------------------------------------------------------------------
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# modules
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
use GO::Parser;
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# declarations
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# set paths to data files
-my $data_path = "/Users/cindypreece/Desktop/jaiswal_lab/projects/reactome/";
+my $data_path = "/home/preecej/Documents/Projects/Reactome/";
my $chebi_obo_file = "chebi_sample.obo";
-my $reactome_file = "RiceReferenceMolecules.txt";
+my $reactome_file = "RiceReferenceMolecules_sample.txt";
my $mapped_output_file = "reactome_chebi_mapping.txt";
+# options
+my $allow_obsolete_terms = 1;
+my $allow_synonyms = 1;
+
my $ont; # chebi ontology
my %reactome_CAS; # rice reactome CAS hash
my @map_results = (); # successful mappings between chebi and reactome
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# functions
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# setup chebi parser and reactome data
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
sub init
{
# init ontology parser and ontology
$parser->parse($data_path . $chebi_obo_file);
$ont = $parser->handler->graph;
-# my $parser = Bio::OntologyIO->new (
-# -format => "obo",
-# -file => $data_path . $chebi_obo_file);
-
- # init ontology
-# $ont = $parser->next_ontology();
-# $parser->close();
-
# read rice reactome file into 3 separate hashes
open(REACTOME_FILE,$data_path . $reactome_file);
my $line = <REACTOME_FILE>; # skip the header
+ my $reactome_count = 0;
while (<REACTOME_FILE>)
{
$line = $_;
chomp $line;
+ $reactome_count++;
my @reactome_entry = split(/\t/, $line); # break up our tab-del line
# load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
my $reactome_id = $reactome_entry[0];
my $CAS_id = $reactome_entry[2];
my $LIGAND_id = $reactome_entry[3];
- my $RiceCyc_term = $reactome_entry[4];
+ my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
# There is a possibility that a single CAS, LIGAND, or RiceCyc
# identifier may appear in more than one reactome entry. This
# one ReactomeID, if necessary.
# --CAS Hash Load--
- if ($CAS_id ne "-") # keep those "-" placeholders out
- {
+ if ($CAS_id ne "-") { # keep those "-" placeholders out
# build the CAS hash; each value may hold 1...n reactome
# ids (as an array)
push @{$reactome_CAS{$CAS_id}}, $reactome_id;
# similarly...
# --LIGAND Hash Load--
- if ($LIGAND_id ne "-")
- {
+ if ($LIGAND_id ne "-") {
push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
}
# --RiceCyc Hash Load--
- if ($RiceCyc_term ne "-")
- {
+ if ($RiceCyc_term ne "-") {
push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
}
}
close REACTOME_FILE;
+
+ print "\n[Reactome Stats]",
+ "\nTotal Oryza Reactome Entries: $reactome_count\n";
+
}
# spit out some data to make sure you've read in the files correctly
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
sub test_inputs
{
- # output new GO version of parsed ontology
- my $terms = $ont->get_all_terms;
- foreach my $term (@$terms) {
+ # basic ontology info
+ print "[Node Count]: " . $ont->node_count . "\n";
+
+ # get all chebi terms in the ontology
+ my $terms = $ont->get_all_nodes;
+
+ # output contents of parsed ontology
+ foreach my $term (@$terms)
+ {
print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
my $synonyms = $term->synonym_list;
print "\n";
}
- # output basic stats on chebi ontology
-# print "\n[Ontology Stats]\n";
-# print "read ontology ",$ont->name()," with ",
-# scalar($ont->get_root_terms)," root terms, and ",
-# scalar($ont->get_all_terms)," total terms, and ",
-# scalar($ont->get_leaf_terms)," leaf terms\n";
-
- # all chebi terms in the ontology
-# print "\n[CHEBI Term List from \$ont]\n";
-# foreach my $term ($ont->get_all_terms) {
-# my @synonyms = $term->get_synonyms;
-# my @xrefs = $term->get_dbxrefs;
-
-# print $term->identifier;
-# print " \|NAME\| ";
-# if (defined($term->name)) {
-# print $term->name;
-# }
-# print " \|SYNONYMS\| ";
-# print "$_," foreach @synonyms;
-# print " \|XREFS\| ";
-# print "$_" foreach @xrefs;
-# foreach my $xref (@xrefs) {
-# print $xref->primary_id;
-# }
-# print "\n\n";
-# }
-
- # show reactome hashes - this is important, give >1 dupes to Pankaj
- # for manual reference
+ # show dupes in reactome hashes - give data to Pankaj;
+ # this is important b/c the duplicates may represent erroneous data in
+ # the Rice Reactome dataset
my $k; my @v;
print "\n[Reactome Hashes - Dupes]\n";
print "\n--CAS Hash--\n";
# map the chebi terms to the reactome entries
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
sub perform_map
{
- my @chebi_obo_terms = $ont->get_all_terms;
- #print $_->identifier . "\n" foreach @chebi_obo_terms;
-
+ my $chebi_obo_terms = $ont->get_all_nodes;
+
+ # vars for mapping stats
+ my $attempted_mappings = 0;
+ my $successful_mappings = 0;
+ my $attempted_CAS_mappings = 0;
+ my $successful_CAS_mappings = 0;
+ my $attempted_LIGAND_mappings = 0;
+ my $successful_LIGAND_mappings = 0;
+ my $attempted_name_mappings = 0;
+ my $successful_name_mappings = 0;
+ my $attempted_synonym_mappings = 0;
+ my $successful_synonym_mappings = 0;
+
# loop through each chebi term
- foreach my $term (@chebi_obo_terms)
- {
- # set locals for matching each term property
- my $term_name;
- if (defined($term->name)) {
- $term_name = $term->name;
- } else {
- $term_name = "";
- }
- my @term_synonyms = $term->get_synonyms;
-
- # attempt CHEBI match on CAS ID
-
- # attempt CHEBI match on LIGAND ID
-
- # attempt CHEBI match on RiceCyc names
- if (defined($reactome_RiceCyc{"\U$term_name"})) {
- push (@map_results, "$reactome_RiceCyc{$term_name}\t",
- "$term->identifier\t",
- "RiceCyc\t",
- $term_name);
- } else { # check the term synonyms, if needed
- foreach my $synonym (@term_synonyms) {
- print "";
+ foreach my $term (@$chebi_obo_terms)
+ {
+ # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
+ # terms and whether to allow them
+ if (($term->acc =~ m/^CHEBI:/)
+ && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
+ {
+ # attempt CHEBI match on CAS and LIGAND ID's
+ $attempted_mappings++;
+ my $xrefs = $term->dbxref_list;
+ foreach my $xref (@$xrefs)
+ {
+ $attempted_CAS_mappings++;
+ $attempted_LIGAND_mappings++;
+
+ # temp-foo to skirt an interpolation problem
+ my $tmp_key = $xref->xref_key;
+
+ if (defined($reactome_CAS{"$tmp_key"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
+ {
+ $successful_CAS_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CAS\t" .
+ $tmp_key);
+ }
+ }
+
+ if (defined($reactome_LIGAND{"$tmp_key"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
+ {
+ $successful_LIGAND_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "LIGAND\t" .
+ $tmp_key);
+ }
+ }
+ }
+
+ # attempt CHEBI match on RiceCyc names...
+ $attempted_name_mappings++;
+
+ # more temp-foo to skirt said interpolation problem
+ my $tmp_name = uc $term->name;
+
+ if (defined($reactome_RiceCyc{"$tmp_name"}))
+ {
+ $successful_name_mappings++;
+ push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" .
+ $term->acc . "\t" .
+ "RiceCycTerm\t" .
+ $term->name);
+ }
+ # ...and synonyms (optional)
+ if ($allow_synonyms)
+ {
+ my $synonyms = $term->synonym_list;
+ foreach my $synonym (@$synonyms)
+ {
+ $attempted_synonym_mappings++;
+
+ # yet more temp-foo to skirt interpolation problem
+ my $tmp_syn = "\U$synonym";
+
+ if (defined($reactome_RiceCyc{$tmp_syn}))
+ {
+ $successful_synonym_mappings++;
+ push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" .
+ $term->acc . "\t" .
+ "RiceCycSynonym\t" .
+ $synonym);
+ }
+ }
}
}
}
+ # send up some stats on the mapping process
+ $successful_mappings =
+ + $successful_CAS_mappings
+ + $successful_LIGAND_mappings
+ + $successful_name_mappings
+ + $successful_synonym_mappings;
+
+ print "\n[ChEBI Stats]",
+ "\nNodes in File: " . $ont->node_count,
+ "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
+ "\nTotal Successful Mappings: " . $successful_mappings . "\n";
+
+ print "\n[Mapping Breakdown by Type]",
+ "\nCAS (matches/attempts): ",
+ "$successful_CAS_mappings/$attempted_CAS_mappings ",
+ "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
+ "\nLIGAND: ",
+ "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
+ "\nTerm Names: ",
+ "$successful_name_mappings/$attempted_name_mappings",
+ "\nTerm Synonyms: ",
+ "$successful_synonym_mappings/$attempted_synonym_mappings\n\n";
}
-# sample format - remove later
-# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
-# 923893 15414 CAS 29908-03-0
-# 923893 15414 LIGAND C00019
-# 923893 15414 RiceCyc S-ADENOSYLMETHIONINE
-
# put the results in the mapped output file
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
sub create_mapfile
{
if (@map_results > 0)
unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
# setup output file
- open(OUTPUT_FILE,">>" . $data_path . $mapped_output_file);
+ open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
#format results for file output
print OUTPUT_FILE "$_\n" foreach @map_results;
}
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
# main
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
init;
-test_inputs;
-#perform_map;
+#test_inputs;
+perform_map;
create_mapfile;
exit;