From: preecej Date: Thu, 21 Oct 2010 01:02:59 +0000 (+0000) Subject: - fixed bug that ignored multiple reactome id's in RiceCyc name during mapping ouput X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=d6652565cecc015c29d2bab47fbf69fc055fbbf0;p=old-jaiswallab-svn%2F.git - fixed bug that ignored multiple reactome id's in RiceCyc name during mapping ouput - added ability to map on Reactome Compound Names - made mapping on RiceCyc Names optional svn path=/; revision=70 --- diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl index ef41798..6159dd5 100755 --- a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl +++ b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl @@ -5,6 +5,8 @@ use strict; # Rice Reactome - CHEBI Ontology Mapping Script # # Justin Preece, 10/06/10 +# v1.0: 10/13/10 (svn rev. 66) +# v1.1: 10/20/10 (svn rev. 70) # # Purpose: Map CHEBI ontology terms onto Rice Reactome database. # @@ -18,16 +20,16 @@ use strict; # # Outputs: tab-del mapping file (reactome_chebi_mapping.txt) # -# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] -# 923893 15414 ReactomeTerm s-AdenosylMethionine -# 923893 15414 CAS 29908-03-0 -# 923893 15414 LIGAND C00019 -# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE -# 923893 15414 RiceCycSynonym s-adenosylmethionine +# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] +# 923893 15414 CAS 29908-03-0 +# 923893 15414 LIGAND C00019 +# 923893 15414 CompoundTerm S-ADENOSYLMETHIONINE +# 923893 15414 CompoundSynonym s-AdenosylMethionine +# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE ** optional +# 923893 15414 RiceCycSynonym s-adenosylmethionine ** optional # --------------------------------------------------------------------------- - # --------------------------------------------------------------------------- # modules # --------------------------------------------------------------------------- @@ -39,18 +41,22 @@ use GO::Parser; # --------------------------------------------------------------------------- # set paths to data files -my $data_path = "/home/preecej/Documents/Projects/Reactome/"; +my $data_path = "/home/preecej/Documents/projects/reactome/"; my $chebi_obo_file = "chebi.obo"; my $reactome_file = "RiceReferenceMolecules.txt"; -my $mapped_output_file = "reactome_chebi_mapping_complete.txt"; -my $sorted_output_file = "reactome_chebi_mapping_complete_sorted.txt"; +my $mapped_output_file = "1.1_reactome_chebi_mapping_complete.txt"; +my $sorted_output_file = "1.1_reactome_chebi_mapping_complete_sorted.txt"; +my $unique_mappings = "1.1_reactome_unique_mappings.txt"; +my $sorted_no_match_file = "1.1_reactome_entries_with_no_chebi_match.txt"; # options my $allow_obsolete_terms = 1; +my $allow_ricecyc = 0; my $allow_synonyms = 1; my $ont; # chebi ontology +my %reactome_CompoundName; # rice reactome Compound Name hash my %reactome_CAS; # rice reactome CAS hash my %reactome_LIGAND; # rice reactome LIGAND hash my %reactome_RiceCyc; # rice reactome RiceCyc hash @@ -85,8 +91,15 @@ sub init $reactome_count++; my @reactome_entry = split(/\t/, $line); # break up our tab-del line - # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values + # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values my $reactome_id = $reactome_entry[0]; + my $compound_name = uc $reactome_entry[1]; # for case-insensitivity + + # strips off "AN " and "A " indefinite articles + $compound_name =~ s/^ //; + $compound_name =~ s/^AN //; + $compound_name =~ s/^A //; + my $CAS_id = $reactome_entry[2]; my $LIGAND_id = $reactome_entry[3]; my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity @@ -110,9 +123,18 @@ sub init push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id; } + # --CompoundName Hash Load-- + if ($compound_name ne "-") { + push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id; + } + # --RiceCyc Hash Load-- - if ($RiceCyc_term ne "-") { - push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id; + if ($allow_ricecyc) + { + if ($RiceCyc_term ne "-") { + push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id; + } + } } close REACTOME_FILE; @@ -171,10 +193,19 @@ sub test_inputs print "$k: @{$reactome_LIGAND{$k}}\n"; } } - print "\n--RiceCyc Hash--\n"; - for $k (keys %reactome_RiceCyc) { - if (@{$reactome_RiceCyc{$k}} > 1) { - print "$k: @{$reactome_RiceCyc{$k}}\n"; + print "\n--CompoundName Hash--\n"; + for $k (keys %reactome_CompoundName) { + if (@{$reactome_CompoundName{$k}} > 1) { + print "$k: @{$reactome_CompoundName{$k}}\n"; + } + } + if ($allow_ricecyc) + { + print "\n--RiceCyc Hash--\n"; + for $k (keys %reactome_RiceCyc) { + if (@{$reactome_RiceCyc{$k}} > 1) { + print "$k: @{$reactome_RiceCyc{$k}}\n"; + } } } } @@ -242,19 +273,23 @@ sub perform_map } } - # attempt CHEBI match on RiceCyc names... + # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)... $attempted_name_mappings++; # more temp-foo to skirt said interpolation problem my $tmp_name = uc $term->name; - if (defined($reactome_RiceCyc{"$tmp_name"})) + # reactome compound names... + if (defined($reactome_CompoundName{"$tmp_name"})) { - $successful_name_mappings++; - push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" . - $term->acc . "\t" . - "RiceCycTerm\t" . - $term->name); + foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}}) + { + $successful_name_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CompoundTerm\t" . + $term->name); + } } # ...and synonyms (optional) if ($allow_synonyms) @@ -267,13 +302,56 @@ sub perform_map # yet more temp-foo to skirt interpolation problem my $tmp_syn = "\U$synonym"; - if (defined($reactome_RiceCyc{$tmp_syn})) + if (defined($reactome_CompoundName{$tmp_syn})) + { + foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}}) + { + $successful_synonym_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CompoundSynonym\t" . + $synonym); + } + } + } + } + + # RiceCyc names... + if ($allow_ricecyc) + { + if (defined($reactome_RiceCyc{"$tmp_name"})) + { + foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}}) + { + $successful_name_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "RiceCycTerm\t" . + $term->name); + } + } + # ...and synonyms (optional) + if ($allow_synonyms) + { + my $synonyms = $term->synonym_list; + foreach my $synonym (@$synonyms) { - $successful_synonym_mappings++; - push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" . - $term->acc . "\t" . - "RiceCycSynonym\t" . - $synonym); + $attempted_synonym_mappings++; + + # yet more temp-foo to skirt interpolation problem + my $tmp_syn = "\U$synonym"; + + if (defined($reactome_RiceCyc{$tmp_syn})) + { + foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}}) + { + $successful_synonym_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "RiceCycSynonym\t" . + $synonym); + } + } } } } @@ -297,10 +375,14 @@ sub perform_map "(note: can include ChemIDplus and KEGG COMPUND db duplicates)", "\nLIGAND: ", "$successful_LIGAND_mappings/$attempted_LIGAND_mappings", - "\nTerm Names: ", - "$successful_name_mappings/$attempted_name_mappings", - "\nTerm Synonyms: ", - "$successful_synonym_mappings/$attempted_synonym_mappings\n\n"; + "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ", + "$successful_name_mappings/$attempted_name_mappings"; + if ($allow_synonyms) + { + print "\nTerm Synonyms: ", + "$successful_synonym_mappings/$attempted_synonym_mappings"; + } + print "\n\n"; } @@ -322,7 +404,11 @@ sub create_mapfile close OUTPUT_FILE; # sort on all cols (keep the header at the top), remove exact dupes - system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file"; + system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file"; + + # also produce files listing unique Reactome entries having a match...and those without a match + system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings"; + system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file"; } else { print "\n\nSorry, there are no mapped results.\n\n"; }