From 2e7daeea2defb596b1c29a2c6a60b3b4a8720980 Mon Sep 17 00:00:00 2001 From: preecej Date: Mon, 7 Feb 2011 23:23:07 +0000 Subject: [PATCH] removed rice references to make generic for other species svn path=/; revision=86 --- .../reactome_chebi_mapping.pl | 93 +++++++++---------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl index 899134a..2daedfd 100755 --- a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl +++ b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl @@ -2,32 +2,32 @@ use strict; # --------------------------------------------------------------------------- -# Rice Reactome - CHEBI Ontology Mapping Script +# Reactome - CHEBI Ontology Mapping Script # -# Justin Preece, 10/06q/10 +# Justin Preece, 10/06/10 # v1.0: 10/13/10 (svn rev. 66) # v1.1: 10/20/10 (svn rev. 70) -# v2.0: 10/21/10 (svn rev. ?) +# v1.2: 02/07/11 (svn rev. ?) # -# Purpose: Map CHEBI ontology terms onto Rice Reactome database. +# Purpose: Map CHEBI ontology terms onto the Reactome database. # # Inputs: # # CHEBI OBO file (preset) # -# Rice Reactome file (preset, provided by YuanMing Wu) -# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [RiceCyc] +# Reactome file (preset, provided by Guanming Wu) +# (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [Cyc] # (Row) 923893 S-adenosyl-L-methionine 29908-03-0 C00019 S-ADENOSYLMETHIONINE ** the '-' (dash) symbol will be applied to any empty columns # -# Outputs: tab-del mapping file (reactome_chebi_mapping_results_sorted.txt) +# Outputs: tab-del mapping file (reactome_chebi_mapping_complete_sorted.txt) # # [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] # 923893 15414 CAS 29908-03-0 # 923893 15414 LIGAND C00019 # 923893 15414 CompoundTerm S-ADENOSYLMETHIONINE # 923893 15414 CompoundSynonym s-AdenosylMethionine -# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE ** optional -# 923893 15414 RiceCycSynonym s-adenosylmethionine ** optional +# 923893 15414 CycTerm S-ADENOSYLMETHIONINE ** optional +# 923893 15414 CycSynonym s-adenosylmethionine ** optional # --------------------------------------------------------------------------- @@ -42,26 +42,25 @@ use GO::Parser; # --------------------------------------------------------------------------- # set paths to data files -my $data_path = "/Users/cindypreece/Desktop/work/Documents/projects/reactome/"; -#my $data_path = "/home/preecej/Documents/projects/reactome/"; -my $chebi_obo_file = "chebi.obo"; -my $reactome_file = "RiceReferenceMolecules.txt"; -my $mapped_output_file = "2.0_reactome_chebi_mapping_complete.txt"; -my $sorted_output_file = "2.0_reactome_chebi_mapping_complete_sorted.txt"; -my $unique_mappings = "2.0_reactome_unique_mappings.txt"; -my $sorted_no_match_file = "2.0_reactome_entries_with_no_chebi_match.txt"; +my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/"; +my $chebi_obo_file = "chebi_v75.obo"; +my $reactome_file = "AraReferenceMolecules.txt"; +my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt"; +my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt"; +my $unique_mappings = "1.2_reactome_unique_mappings.txt"; +my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt"; # options my $allow_obsolete_terms = 1; -my $allow_ricecyc = 0; +my $allow_cyc = 0; my $allow_synonyms = 1; my $ont; # chebi ontology -my %reactome_CompoundName; # rice reactome Compound Name hash -my %reactome_CAS; # rice reactome CAS hash -my %reactome_LIGAND; # rice reactome LIGAND hash -my %reactome_RiceCyc; # rice reactome RiceCyc hash +my %reactome_CompoundName; # reactome Compound Name hash +my %reactome_CAS; # reactome CAS hash +my %reactome_LIGAND; # reactome LIGAND hash +my %reactome_Cyc; # reactome Cyc hash my @map_results = (); # successful mappings between chebi and reactome @@ -80,7 +79,7 @@ sub init $parser->parse($data_path . $chebi_obo_file); $ont = $parser->handler->graph; - # read rice reactome file into 3 separate hashes + # read reactome file into 3 separate hashes open(REACTOME_FILE,$data_path . $reactome_file); my $line = ; # skip the header @@ -93,7 +92,7 @@ sub init $reactome_count++; my @reactome_entry = split(/\t/, $line); # break up our tab-del line - # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values + # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values my $reactome_id = $reactome_entry[0]; my $compound_name = uc $reactome_entry[1]; # for case-insensitivity @@ -104,9 +103,9 @@ sub init my $CAS_id = $reactome_entry[2]; my $LIGAND_id = $reactome_entry[3]; - my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity + my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity - # There is a possibility that a single CAS, LIGAND, or RiceCyc + # There is a possibility that a single CAS, LIGAND, or Cyc # identifier may appear in more than one reactome entry. This # temp array allows each matched hash value to hold more than # one ReactomeID, if necessary. @@ -130,11 +129,11 @@ sub init push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id; } - # --RiceCyc Hash Load-- - if ($allow_ricecyc) + # --Cyc Hash Load-- + if ($allow_cyc) { - if ($RiceCyc_term ne "-") { - push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id; + if ($Cyc_term ne "-") { + push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id; } } @@ -142,7 +141,7 @@ sub init close REACTOME_FILE; print "\n[Reactome Stats]", - "\nTotal Oryza Reactome Entries: $reactome_count\n"; + "\nTotal Reactome Entries: $reactome_count\n"; } @@ -180,7 +179,7 @@ sub test_inputs # show dupes in reactome hashes - give data to Pankaj; # this is important b/c the duplicates may represent erroneous data in - # the Rice Reactome dataset + # the Reactome dataset my $k; my @v; print "\n[Reactome Hashes - Dupes]\n"; print "\n--CAS Hash--\n"; @@ -201,12 +200,12 @@ sub test_inputs print "$k: @{$reactome_CompoundName{$k}}\n"; } } - if ($allow_ricecyc) + if ($allow_cyc) { - print "\n--RiceCyc Hash--\n"; - for $k (keys %reactome_RiceCyc) { - if (@{$reactome_RiceCyc{$k}} > 1) { - print "$k: @{$reactome_RiceCyc{$k}}\n"; + print "\n--Cyc Hash--\n"; + for $k (keys %reactome_Cyc) { + if (@{$reactome_Cyc{$k}} > 1) { + print "$k: @{$reactome_Cyc{$k}}\n"; } } } @@ -275,7 +274,7 @@ sub perform_map } } - # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)... + # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)... $attempted_name_mappings++; # more temp-foo to skirt said interpolation problem @@ -318,17 +317,17 @@ sub perform_map } } - # RiceCyc names... - if ($allow_ricecyc) + # Cyc names... + if ($allow_cyc) { - if (defined($reactome_RiceCyc{"$tmp_name"})) + if (defined($reactome_Cyc{"$tmp_name"})) { - foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}}) + foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}}) { $successful_name_mappings++; push (@map_results, "$tmp_reactome_id\t" . $term->acc . "\t" . - "RiceCycTerm\t" . + "CycTerm\t" . $term->name); } } @@ -343,14 +342,14 @@ sub perform_map # yet more temp-foo to skirt interpolation problem my $tmp_syn = "\U$synonym"; - if (defined($reactome_RiceCyc{$tmp_syn})) + if (defined($reactome_Cyc{$tmp_syn})) { - foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}}) + foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}}) { $successful_synonym_mappings++; push (@map_results, "$tmp_reactome_id\t" . $term->acc . "\t" . - "RiceCycSynonym\t" . + "CycSynonym\t" . $synonym); } } @@ -377,7 +376,7 @@ sub perform_map "(note: can include ChemIDplus and KEGG COMPUND db duplicates)", "\nLIGAND: ", "$successful_LIGAND_mappings/$attempted_LIGAND_mappings", - "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ", + "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ", "$successful_name_mappings/$attempted_name_mappings"; if ($allow_synonyms) { -- 2.34.1