use strict;
# ---------------------------------------------------------------------------
-# Rice Reactome - CHEBI Ontology Mapping Script
+# Reactome - CHEBI Ontology Mapping Script
#
-# Justin Preece, 10/06q/10
+# Justin Preece, 10/06/10
# v1.0: 10/13/10 (svn rev. 66)
# v1.1: 10/20/10 (svn rev. 70)
-# v2.0: 10/21/10 (svn rev. ?)
+# v1.2: 02/07/11 (svn rev. ?)
#
-# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
+# Purpose: Map CHEBI ontology terms onto the Reactome database.
#
# Inputs:
#
# CHEBI OBO file (preset)
#
-# Rice Reactome file (preset, provided by YuanMing Wu)
-# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [RiceCyc]
+# Reactome file (preset, provided by Guanming Wu)
+# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [Cyc]
# (Row) 923893 S-adenosyl-L-methionine 29908-03-0 C00019 S-ADENOSYLMETHIONINE ** the '-' (dash) symbol will be applied to any empty columns
#
-# Outputs: tab-del mapping file (reactome_chebi_mapping_results_sorted.txt)
+# Outputs: tab-delimited mapping file (named by $sorted_output_file, e.g. 1.2_reactome_chebi_mapping_complete_sorted.txt)
#
# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
# 923893 15414 CAS 29908-03-0
# 923893 15414 LIGAND C00019
# 923893 15414 CompoundTerm S-ADENOSYLMETHIONINE
# 923893 15414 CompoundSynonym s-AdenosylMethionine
-# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE ** optional
-# 923893 15414 RiceCycSynonym s-adenosylmethionine ** optional
+# 923893 15414 CycTerm S-ADENOSYLMETHIONINE ** optional
+# 923893 15414 CycSynonym s-adenosylmethionine ** optional
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# set paths to data files
-my $data_path = "/Users/cindypreece/Desktop/work/Documents/projects/reactome/";
-#my $data_path = "/home/preecej/Documents/projects/reactome/";
-my $chebi_obo_file = "chebi.obo";
-my $reactome_file = "RiceReferenceMolecules.txt";
-my $mapped_output_file = "2.0_reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "2.0_reactome_chebi_mapping_complete_sorted.txt";
-my $unique_mappings = "2.0_reactome_unique_mappings.txt";
-my $sorted_no_match_file = "2.0_reactome_entries_with_no_chebi_match.txt";
+my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/";
+my $chebi_obo_file = "chebi_v75.obo";
+my $reactome_file = "AraReferenceMolecules.txt";
+my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.2_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
# options
my $allow_obsolete_terms = 1;
-my $allow_ricecyc = 0;
+my $allow_cyc = 0;
my $allow_synonyms = 1;
my $ont; # chebi ontology
-my %reactome_CompoundName; # rice reactome Compound Name hash
-my %reactome_CAS; # rice reactome CAS hash
-my %reactome_LIGAND; # rice reactome LIGAND hash
-my %reactome_RiceCyc; # rice reactome RiceCyc hash
+my %reactome_CompoundName; # reactome Compound Name hash
+my %reactome_CAS; # reactome CAS hash
+my %reactome_LIGAND; # reactome LIGAND hash
+my %reactome_Cyc; # reactome Cyc hash
my @map_results = (); # successful mappings between chebi and reactome
$parser->parse($data_path . $chebi_obo_file);
$ont = $parser->handler->graph;
- # read rice reactome file into 3 separate hashes
+ # read reactome file into separate lookup hashes (Compound Name, CAS, LIGAND, and optionally Cyc)
open(REACTOME_FILE,$data_path . $reactome_file);
my $line = <REACTOME_FILE>; # skip the header
$reactome_count++;
my @reactome_entry = split(/\t/, $line); # break up our tab-del line
- # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values
+ # load up this reactome entry's ID, Compound_Name, CAS, LIGAND, and Cyc values (columns 0-4, in that order)
my $reactome_id = $reactome_entry[0];
my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
my $CAS_id = $reactome_entry[2];
my $LIGAND_id = $reactome_entry[3];
- my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
+ my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
- # There is a possibility that a single CAS, LIGAND, or RiceCyc
+ # There is a possibility that a single CAS, LIGAND, or Cyc
# identifier may appear in more than one reactome entry. This
# temp array allows each matched hash value to hold more than
# one ReactomeID, if necessary.
push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
}
- # --RiceCyc Hash Load--
- if ($allow_ricecyc)
+ # --Cyc Hash Load--
+ if ($allow_cyc)
{
- if ($RiceCyc_term ne "-") {
- push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id;
+ if ($Cyc_term ne "-") {
+ push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
}
}
close REACTOME_FILE;
print "\n[Reactome Stats]",
- "\nTotal Oryza Reactome Entries: $reactome_count\n";
+ "\nTotal Reactome Entries: $reactome_count\n";
}
# show dupes in reactome hashes - give data to Pankaj;
# this is important b/c the duplicates may represent erroneous data in
- # the Rice Reactome dataset
+ # the Reactome dataset
my $k; my @v;
print "\n[Reactome Hashes - Dupes]\n";
print "\n--CAS Hash--\n";
print "$k: @{$reactome_CompoundName{$k}}\n";
}
}
- if ($allow_ricecyc)
+ if ($allow_cyc)
{
- print "\n--RiceCyc Hash--\n";
- for $k (keys %reactome_RiceCyc) {
- if (@{$reactome_RiceCyc{$k}} > 1) {
- print "$k: @{$reactome_RiceCyc{$k}}\n";
+ print "\n--Cyc Hash--\n";
+ for $k (keys %reactome_Cyc) {
+ if (@{$reactome_Cyc{$k}} > 1) {
+ print "$k: @{$reactome_Cyc{$k}}\n";
}
}
}
}
}
- # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)...
+ # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
$attempted_name_mappings++;
# more temp-foo to skirt said interpolation problem
}
}
- # RiceCyc names...
- if ($allow_ricecyc)
+ # Cyc names...
+ if ($allow_cyc)
{
- if (defined($reactome_RiceCyc{"$tmp_name"}))
+ if (defined($reactome_Cyc{"$tmp_name"}))
{
- foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}})
+ foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
{
$successful_name_mappings++;
push (@map_results, "$tmp_reactome_id\t" .
$term->acc . "\t" .
- "RiceCycTerm\t" .
+ "CycTerm\t" .
$term->name);
}
}
# yet more temp-foo to skirt interpolation problem
my $tmp_syn = "\U$synonym";
- if (defined($reactome_RiceCyc{$tmp_syn}))
+ if (defined($reactome_Cyc{$tmp_syn}))
{
- foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}})
+ foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
{
$successful_synonym_mappings++;
push (@map_results, "$tmp_reactome_id\t" .
$term->acc . "\t" .
- "RiceCycSynonym\t" .
+ "CycSynonym\t" .
$synonym);
}
}
"(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
"\nLIGAND: ",
"$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
- "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ",
+ "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
"$successful_name_mappings/$attempted_name_mappings";
if ($allow_synonyms)
{