Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
removed rice references to make generic for other species
author: preecej <preecej@localhost>
Mon, 7 Feb 2011 23:23:07 +0000 (23:23 +0000)
committer: preecej <preecej@localhost>
Mon, 7 Feb 2011 23:23:07 +0000 (23:23 +0000)
svn path=/; revision=86

preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl

index 899134add8ade88b74472b9cefa07c274b934529..2daedfd9c678eb5aede21d3c9b549020943c1b60 100755 (executable)
@@ -2,32 +2,32 @@
 use strict;
 
 # ---------------------------------------------------------------------------
-# Rice Reactome - CHEBI Ontology Mapping Script
+# Reactome - CHEBI Ontology Mapping Script
 #
-# Justin Preece, 10/06q/10
+# Justin Preece, 10/06/10
 #   v1.0: 10/13/10 (svn rev. 66)
 #   v1.1: 10/20/10 (svn rev. 70)
-#   v2.0: 10/21/10 (svn rev. ?)
+#   v1.2: 02/07/11 (svn rev. ?)
 #
-# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
+# Purpose: Map CHEBI ontology terms onto the Reactome database.
 #
 # Inputs:
 #
 #   CHEBI OBO file (preset)
 #
-#   Rice Reactome file (preset, provided by YuanMing Wu)
-#   (Header)    [ReactomeID]    [Compound_Name]            [CAS]           [LIGAND]    [RiceCyc]
+#   Reactome file (preset, provided by Guanming Wu)
+#   (Header)    [ReactomeID]    [Compound_Name]            [CAS]           [LIGAND]    [Cyc]
 #   (Row)       923893          S-adenosyl-L-methionine    29908-03-0      C00019      S-ADENOSYLMETHIONINE        ** the '-' (dash) symbol will be applied to any empty columns
 #
-# Outputs: tab-del mapping file (reactome_chebi_mapping_results_sorted.txt)
+# Outputs: tab-del mapping file (reactome_chebi_mapping_complete_sorted.txt)
 #
 #   [ReactomeID]    [CHEBI]    [XREF_Type]      [XREF_ID]       
 #   923893          15414      CAS              29908-03-0
 #   923893          15414      LIGAND           C00019
 #   923893          15414      CompoundTerm     S-ADENOSYLMETHIONINE
 #   923893          15414      CompoundSynonym  s-AdenosylMethionine
-#   923893          15414      RiceCycTerm      S-ADENOSYLMETHIONINE    ** optional
-#   923893          15414      RiceCycSynonym   s-adenosylmethionine    ** optional
+#   923893          15414      CycTerm      S-ADENOSYLMETHIONINE    ** optional
+#   923893          15414      CycSynonym   s-adenosylmethionine    ** optional
 # ---------------------------------------------------------------------------
 
 
@@ -42,26 +42,25 @@ use GO::Parser;
 # ---------------------------------------------------------------------------
 
 # set paths to data files
-my $data_path = "/Users/cindypreece/Desktop/work/Documents/projects/reactome/";
-#my $data_path = "/home/preecej/Documents/projects/reactome/";
-my $chebi_obo_file = "chebi.obo";
-my $reactome_file = "RiceReferenceMolecules.txt";
-my $mapped_output_file = "2.0_reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "2.0_reactome_chebi_mapping_complete_sorted.txt";
-my $unique_mappings = "2.0_reactome_unique_mappings.txt";
-my $sorted_no_match_file = "2.0_reactome_entries_with_no_chebi_match.txt";
+my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/";
+my $chebi_obo_file = "chebi_v75.obo";
+my $reactome_file = "AraReferenceMolecules.txt";
+my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.2_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
 
 # options
 my $allow_obsolete_terms = 1;
-my $allow_ricecyc = 0;
+my $allow_cyc = 0;
 my $allow_synonyms = 1;
 
 my $ont; # chebi ontology
 
-my %reactome_CompoundName; # rice reactome Compound Name hash
-my %reactome_CAS; # rice reactome CAS hash
-my %reactome_LIGAND; # rice reactome LIGAND hash
-my %reactome_RiceCyc; # rice reactome RiceCyc hash
+my %reactome_CompoundName; # reactome Compound Name hash
+my %reactome_CAS; # reactome CAS hash
+my %reactome_LIGAND; # reactome LIGAND hash
+my %reactome_Cyc; # reactome Cyc hash
 
 my @map_results = (); # successful mappings between chebi and reactome
 
@@ -80,7 +79,7 @@ sub init
     $parser->parse($data_path . $chebi_obo_file);
     $ont = $parser->handler->graph;
       
-    # read rice reactome file into 3 separate hashes
+    # read reactome file into 3 separate hashes
     open(REACTOME_FILE,$data_path . $reactome_file);
 
     my $line = <REACTOME_FILE>; # skip the header
@@ -93,7 +92,7 @@ sub init
         $reactome_count++;
         my @reactome_entry = split(/\t/, $line); # break up our tab-del line
 
-        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and RiceCyc values
+        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
         my $reactome_id = $reactome_entry[0];
         my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
 
@@ -104,9 +103,9 @@ sub init
 
         my $CAS_id = $reactome_entry[2];
         my $LIGAND_id = $reactome_entry[3];
-        my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
+        my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
 
-        # There is a possibility that a single CAS, LIGAND, or RiceCyc
+        # There is a possibility that a single CAS, LIGAND, or Cyc
         # identifier may appear in more than one reactome entry. This
         # temp array allows each matched hash value to hold more than 
         # one ReactomeID, if necessary.
@@ -130,11 +129,11 @@ sub init
             push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
         }
         
-        # --RiceCyc Hash Load--
-        if ($allow_ricecyc)
+        # --Cyc Hash Load--
+        if ($allow_cyc)
         {
-            if ($RiceCyc_term ne "-") {
-                push @{$reactome_RiceCyc{"$RiceCyc_term"}}, $reactome_id;
+            if ($Cyc_term ne "-") {
+                push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
             }
 
         }
@@ -142,7 +141,7 @@ sub init
     close REACTOME_FILE;
     
     print "\n[Reactome Stats]",
-        "\nTotal Oryza Reactome Entries: $reactome_count\n";
+        "\nTotal Reactome Entries: $reactome_count\n";
 
 }
 
@@ -180,7 +179,7 @@ sub test_inputs
 
     # show dupes in reactome hashes - give data to Pankaj;
     # this is important b/c the duplicates may represent erroneous data in
-    # the Rice Reactome dataset
+    # the Reactome dataset
     my $k; my @v;
     print "\n[Reactome Hashes - Dupes]\n";
     print "\n--CAS Hash--\n";
@@ -201,12 +200,12 @@ sub test_inputs
             print "$k: @{$reactome_CompoundName{$k}}\n";
         }
     }
-    if ($allow_ricecyc)
+    if ($allow_cyc)
     {
-        print "\n--RiceCyc Hash--\n";
-        for $k (keys %reactome_RiceCyc) {
-            if (@{$reactome_RiceCyc{$k}} > 1) {
-                print "$k: @{$reactome_RiceCyc{$k}}\n";
+        print "\n--Cyc Hash--\n";
+        for $k (keys %reactome_Cyc) {
+            if (@{$reactome_Cyc{$k}} > 1) {
+                print "$k: @{$reactome_Cyc{$k}}\n";
             }
         }
     }
@@ -275,7 +274,7 @@ sub perform_map
                 }
             }
                 
-            # attempt CHEBI match on Reactome Compound Names (and optional RiceCyc names/synonyms)...
+            # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
             $attempted_name_mappings++;
 
             # more temp-foo to skirt said interpolation problem 
@@ -318,17 +317,17 @@ sub perform_map
                 }
             }
 
-            # RiceCyc names...
-            if ($allow_ricecyc)
+            # Cyc names...
+            if ($allow_cyc)
             {
-                if (defined($reactome_RiceCyc{"$tmp_name"}))
+                if (defined($reactome_Cyc{"$tmp_name"}))
                 {
-                    foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_name}})
+                    foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
                     {
                         $successful_name_mappings++;
                         push (@map_results, "$tmp_reactome_id\t" .
                             $term->acc . "\t" . 
-                            "RiceCycTerm\t" .
+                            "CycTerm\t" .
                             $term->name);
                     }
                 }
@@ -343,14 +342,14 @@ sub perform_map
                         # yet more temp-foo to skirt interpolation problem 
                         my $tmp_syn = "\U$synonym"; 
     
-                        if (defined($reactome_RiceCyc{$tmp_syn}))
+                        if (defined($reactome_Cyc{$tmp_syn}))
                         {
-                            foreach my $tmp_reactome_id (@{$reactome_RiceCyc{$tmp_syn}})
+                            foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
                             {
                                 $successful_synonym_mappings++;
                                 push (@map_results, "$tmp_reactome_id\t" .
                                     $term->acc . "\t" .
-                                    "RiceCycSynonym\t" .
+                                    "CycSynonym\t" .
                                     $synonym);
                             }
                         }
@@ -377,7 +376,7 @@ sub perform_map
             "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
         "\nLIGAND: ",
             "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
-        "\nTerm Names " . ($allow_ricecyc ? "includes RiceCyc terms and synonyms" : "") . ": ",
+        "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
             "$successful_name_mappings/$attempted_name_mappings";
     if ($allow_synonyms)
     {