Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
- fixed bug that ignored multiple reactome id's in RiceCyc name during mapping ouput
authorpreecej <preecej@localhost>
Thu, 21 Oct 2010 01:02:59 +0000 (01:02 +0000)
committerpreecej <preecej@localhost>
Thu, 21 Oct 2010 01:02:59 +0000 (01:02 +0000)
- added ability to map on Reactome Compound Names
- made mapping on RiceCyc Names optional

svn path=/; revision=70

preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl

index ef41798873f48dbe4e56f2c6c99bbd68d81b605d..6159dd53b72b2488aa7cd539e164c04916316120 100755 (executable)
@@ -5,6 +5,8 @@ use strict;
 # Rice Reactome - CHEBI Ontology Mapping Script
 #
 # Justin Preece, 10/06/10
+#   v1.0: 10/13/10 (svn rev. 66)
+#   v1.1: 10/20/10 (svn rev. 70)
 #
 # Purpose: Map CHEBI ontology terms onto Rice Reactome database.
 #
@@ -18,16 +20,16 @@ use strict;
 #
 # Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
 #
-#   [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
-#   923893          15414      ReactomeTerm   s-AdenosylMethionine
-#   923893          15414      CAS            29908-03-0
-#   923893          15414      LIGAND         C00019
-#   923893          15414      RiceCycTerm    S-ADENOSYLMETHIONINE
-#   923893          15414      RiceCycSynonym s-adenosylmethionine
+#   [ReactomeID]    [CHEBI]    [XREF_Type]      [XREF_ID]       
+#   923893          15414      CAS              29908-03-0
+#   923893          15414      LIGAND           C00019
+#   923893          15414      CompoundTerm     S-ADENOSYLMETHIONINE
+#   923893          15414      CompoundSynonym  s-AdenosylMethionine
+#   923893          15414      RiceCycTerm      S-ADENOSYLMETHIONINE    ** optional
+#   923893          15414      RiceCycSynonym   s-adenosylmethionine    ** optional
 # ---------------------------------------------------------------------------
 
 
-
 # ---------------------------------------------------------------------------
 # modules
 # ---------------------------------------------------------------------------
@@ -39,18 +41,22 @@ use GO::Parser;
 # ---------------------------------------------------------------------------
 
 # set paths to data files
-my $data_path = "/home/preecej/Documents/Projects/Reactome/";
+my $data_path = "/home/preecej/Documents/projects/reactome/";
 my $chebi_obo_file = "chebi.obo";
 my $reactome_file = "RiceReferenceMolecules.txt";
-my $mapped_output_file = "reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "reactome_chebi_mapping_complete_sorted.txt";
+my $mapped_output_file = "1.1_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.1_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.1_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.1_reactome_entries_with_no_chebi_match.txt";
 
 # options
 my $allow_obsolete_terms = 1;
+my $allow_ricecyc = 0;
 my $allow_synonyms = 1;
 
 my $ont; # chebi ontology
 
+my %reactome_CompoundName; # rice reactome Compound Name hash
 my %reactome_CAS; # rice reactome CAS hash
 my %reactome_LIGAND; # rice reactome LIGAND hash
 my %reactome_RiceCyc; # rice reactome RiceCyc hash
@@ -85,8 +91,15 @@ sub init
         $reactome_count++;
         my @reactome_entry = split(/\t/, $line); # break up our tab-del line
 
-        # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
+        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values
         my $reactome_id = $reactome_entry[0];
+        my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
+
+        # strips off "AN " and "A " indefinite articles
+        $compound_name =~ s/^ //;
+        $compound_name =~ s/^AN //;
+        $compound_name =~ s/^A //;
+
         my $CAS_id = $reactome_entry[2];
         my $LIGAND_id = $reactome_entry[3];
         my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
@@ -110,9 +123,18 @@ sub init
             push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
         }
 
+        # --CompoundName Hash Load--
+        if ($compound_name ne "-") {
+            push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
+        }
+        
         # --RiceCyc Hash Load--
-        if ($RiceCyc_term ne "-") {
-            push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
+        if ($allow_ricecyc)
+        {
+            if ($RiceCyc_term ne "-") {
+                push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id;
+            }
+
         }
     }
     close REACTOME_FILE;
@@ -171,10 +193,19 @@ sub test_inputs
             print "$k: @{$reactome_LIGAND{$k}}\n";
         }
     }
-    print "\n--RiceCyc Hash--\n";
-    for $k (keys %reactome_RiceCyc) {
-        if (@{$reactome_RiceCyc{$k}} > 1) {
-            print "$k: @{$reactome_RiceCyc{$k}}\n";
+    print "\n--CompoundName Hash--\n";
+    for $k (keys %reactome_CompoundName) {
+        if (@{$reactome_CompoundName{$k}} > 1) {
+            print "$k: @{$reactome_CompoundName{$k}}\n";
+        }
+    }
+    if ($allow_ricecyc)
+    {
+        print "\n--RiceCyc Hash--\n";
+        for $k (keys %reactome_RiceCyc) {
+            if (@{$reactome_RiceCyc{$k}} > 1) {
+                print "$k: @{$reactome_RiceCyc{$k}}\n";
+            }
         }
     }
 }
@@ -242,19 +273,23 @@ sub perform_map
                 }
             }
                 
-            # attempt CHEBI match on RiceCyc names...
+            # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)...
             $attempted_name_mappings++;
 
             # more temp-foo to skirt said interpolation problem 
             my $tmp_name = uc $term->name; 
 
-            if (defined($reactome_RiceCyc{"$tmp_name"}))
+            # reactome compound names...
+            if (defined($reactome_CompoundName{"$tmp_name"}))
             {
-                $successful_name_mappings++;
-                push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" .
-                    $term->acc . "\t" . 
-                    "RiceCycTerm\t" .
-                    $term->name);
+                foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
+                {
+                    $successful_name_mappings++;
+                    push (@map_results, "$tmp_reactome_id\t" .
+                        $term->acc . "\t" . 
+                        "CompoundTerm\t" .
+                        $term->name);
+                }
             }
             # ...and synonyms (optional)
             if ($allow_synonyms)
@@ -267,13 +302,56 @@ sub perform_map
                     # yet more temp-foo to skirt interpolation problem 
                     my $tmp_syn = "\U$synonym"; 
 
-                    if (defined($reactome_RiceCyc{$tmp_syn}))
+                    if (defined($reactome_CompoundName{$tmp_syn}))
+                    {
+                        foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
+                        {
+                            $successful_synonym_mappings++;
+                            push (@map_results, "$tmp_reactome_id\t" .
+                                $term->acc . "\t" .
+                                "CompoundSynonym\t" .
+                                $synonym);
+                        }
+                    }
+                }
+            }
+
+            # RiceCyc names...
+            if ($allow_ricecyc)
+            {
+                if (defined($reactome_RiceCyc{"$tmp_name"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}})
+                    {
+                        $successful_name_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "RiceCycTerm\t" .
+                            $term->name);
+                    }
+                }
+                # ...and synonyms (optional)
+                if ($allow_synonyms)
+                {
+                    my $synonyms = $term->synonym_list;
+                    foreach my $synonym (@$synonyms)
                     {
-                        $successful_synonym_mappings++;
-                        push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" .
-                            $term->acc . "\t" .
-                            "RiceCycSynonym\t" .
-                            $synonym);
+                        $attempted_synonym_mappings++;
+    
+                        # yet more temp-foo to skirt interpolation problem 
+                        my $tmp_syn = "\U$synonym"; 
+    
+                        if (defined($reactome_RiceCyc{$tmp_syn}))
+                        {
+                            foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}})
+                            {
+                                $successful_synonym_mappings++;
+                                push (@map_results, "$tmp_reactome_id\t" .
+                                    $term->acc . "\t" .
+                                    "RiceCycSynonym\t" .
+                                    $synonym);
+                            }
+                        }
                     }
                 }
             }
@@ -297,10 +375,14 @@ sub perform_map
             "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
         "\nLIGAND: ",
             "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
-        "\nTerm Names: ",
-            "$successful_name_mappings/$attempted_name_mappings",
-        "\nTerm Synonyms: ",
-            "$successful_synonym_mappings/$attempted_synonym_mappings\n\n";
+        "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ",
+            "$successful_name_mappings/$attempted_name_mappings";
+    if ($allow_synonyms)
+    {
+        print "\nTerm Synonyms: ",
+            "$successful_synonym_mappings/$attempted_synonym_mappings";
+    }
+    print "\n\n";
 }
 
 
@@ -322,7 +404,11 @@ sub create_mapfile
         close OUTPUT_FILE;
         
         # sort on all cols (keep the header at the top), remove exact dupes
-        system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";        
+        system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
+        
+        # also produce files listing unique Reactome entries having a match...and those without a match
+        system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
+        system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
     } else {
         print "\n\nSorry, there are no mapped results.\n\n";
     }