# Rice Reactome - CHEBI Ontology Mapping Script
#
# Justin Preece, 10/06/10
+# v1.0: 10/13/10 (svn rev. 66)
+# v1.1: 10/20/10 (svn rev. 70)
#
# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
#
#
# Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
#
-# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
-# 923893 15414 ReactomeTerm s-AdenosylMethionine
-# 923893 15414 CAS 29908-03-0
-# 923893 15414 LIGAND C00019
-# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE
-# 923893 15414 RiceCycSynonym s-adenosylmethionine
+# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
+# 923893 15414 CAS 29908-03-0
+# 923893 15414 LIGAND C00019
+# 923893 15414 CompoundTerm S-ADENOSYLMETHIONINE
+# 923893 15414 CompoundSynonym s-AdenosylMethionine
+# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE ** optional
+# 923893 15414 RiceCycSynonym s-adenosylmethionine ** optional
# ---------------------------------------------------------------------------
-
# ---------------------------------------------------------------------------
# modules
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
# set paths to data files
-my $data_path = "/home/preecej/Documents/Projects/Reactome/";
+my $data_path = "/home/preecej/Documents/projects/reactome/";
my $chebi_obo_file = "chebi.obo";
my $reactome_file = "RiceReferenceMolecules.txt";
-my $mapped_output_file = "reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "reactome_chebi_mapping_complete_sorted.txt";
+my $mapped_output_file = "1.1_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.1_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.1_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.1_reactome_entries_with_no_chebi_match.txt";
# options
my $allow_obsolete_terms = 1;
+my $allow_ricecyc = 0;
my $allow_synonyms = 1;
my $ont; # chebi ontology
+my %reactome_CompoundName; # rice reactome Compound Name hash
my %reactome_CAS; # rice reactome CAS hash
my %reactome_LIGAND; # rice reactome LIGAND hash
my %reactome_RiceCyc; # rice reactome RiceCyc hash
$reactome_count++;
my @reactome_entry = split(/\t/, $line); # break up our tab-del line
- # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
+ # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values
my $reactome_id = $reactome_entry[0];
+ my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
+
+ # strips off "AN " and "A " indefinite articles
+ $compound_name =~ s/^ //;
+ $compound_name =~ s/^AN //;
+ $compound_name =~ s/^A //;
+
my $CAS_id = $reactome_entry[2];
my $LIGAND_id = $reactome_entry[3];
my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
}
+ # --CompoundName Hash Load--
+ if ($compound_name ne "-") {
+ push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
+ }
+
# --RiceCyc Hash Load--
- if ($RiceCyc_term ne "-") {
- push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
+ if ($allow_ricecyc)
+ {
+ if ($RiceCyc_term ne "-") {
+ push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id;
+ }
+
}
}
close REACTOME_FILE;
print "$k: @{$reactome_LIGAND{$k}}\n";
}
}
- print "\n--RiceCyc Hash--\n";
- for $k (keys %reactome_RiceCyc) {
- if (@{$reactome_RiceCyc{$k}} > 1) {
- print "$k: @{$reactome_RiceCyc{$k}}\n";
+ print "\n--CompoundName Hash--\n";
+ for $k (keys %reactome_CompoundName) {
+ if (@{$reactome_CompoundName{$k}} > 1) {
+ print "$k: @{$reactome_CompoundName{$k}}\n";
+ }
+ }
+ if ($allow_ricecyc)
+ {
+ print "\n--RiceCyc Hash--\n";
+ for $k (keys %reactome_RiceCyc) {
+ if (@{$reactome_RiceCyc{$k}} > 1) {
+ print "$k: @{$reactome_RiceCyc{$k}}\n";
+ }
}
}
}
}
}
- # attempt CHEBI match on RiceCyc names...
+ # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)...
$attempted_name_mappings++;
# more temp-foo to skirt said interpolation problem
my $tmp_name = uc $term->name;
- if (defined($reactome_RiceCyc{"$tmp_name"}))
+ # reactome compound names...
+ if (defined($reactome_CompoundName{"$tmp_name"}))
{
- $successful_name_mappings++;
- push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" .
- $term->acc . "\t" .
- "RiceCycTerm\t" .
- $term->name);
+ foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
+ {
+ $successful_name_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CompoundTerm\t" .
+ $term->name);
+ }
}
# ...and synonyms (optional)
if ($allow_synonyms)
# yet more temp-foo to skirt interpolation problem
my $tmp_syn = "\U$synonym";
- if (defined($reactome_RiceCyc{$tmp_syn}))
+ if (defined($reactome_CompoundName{$tmp_syn}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
+ {
+ $successful_synonym_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "CompoundSynonym\t" .
+ $synonym);
+ }
+ }
+ }
+ }
+
+ # RiceCyc names...
+ if ($allow_ricecyc)
+ {
+ if (defined($reactome_RiceCyc{"$tmp_name"}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}})
+ {
+ $successful_name_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "RiceCycTerm\t" .
+ $term->name);
+ }
+ }
+ # ...and synonyms (optional)
+ if ($allow_synonyms)
+ {
+ my $synonyms = $term->synonym_list;
+ foreach my $synonym (@$synonyms)
{
- $successful_synonym_mappings++;
- push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" .
- $term->acc . "\t" .
- "RiceCycSynonym\t" .
- $synonym);
+ $attempted_synonym_mappings++;
+
+ # yet more temp-foo to skirt interpolation problem
+ my $tmp_syn = "\U$synonym";
+
+ if (defined($reactome_RiceCyc{$tmp_syn}))
+ {
+ foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}})
+ {
+ $successful_synonym_mappings++;
+ push (@map_results, "$tmp_reactome_id\t" .
+ $term->acc . "\t" .
+ "RiceCycSynonym\t" .
+ $synonym);
+ }
+ }
}
}
}
"(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
"\nLIGAND: ",
"$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
- "\nTerm Names: ",
- "$successful_name_mappings/$attempted_name_mappings",
- "\nTerm Synonyms: ",
- "$successful_synonym_mappings/$attempted_synonym_mappings\n\n";
+ "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ",
+ "$successful_name_mappings/$attempted_name_mappings";
+ if ($allow_synonyms)
+ {
+ print "\nTerm Synonyms: ",
+ "$successful_synonym_mappings/$attempted_synonym_mappings";
+ }
+ print "\n\n";
}
close OUTPUT_FILE;
# sort on all cols (keep the header at the top), remove exact dupes
- system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
+ system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
+
+ # also produce files listing unique Reactome entries having a match...and those without a match
+ system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
+ system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
} else {
print "\n\nSorry, there are no mapped results.\n\n";
}