Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
Added code for all mappings, statistical counters, and STDOUT; tested with
authorpreecej <preecej@localhost>
Tue, 12 Oct 2010 00:30:43 +0000 (00:30 +0000)
committerpreecej <preecej@localhost>
Tue, 12 Oct 2010 00:30:43 +0000 (00:30 +0000)
small data set that contained several intentional overlaps between
reactome and chebi terms

svn path=/; revision=65

preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl

index 4ef2a36b9bf5e234153933d2e2e71109bd6bcf77..bb44688064b3b157a5583d338cfdd322a0b2e98b 100755 (executable)
@@ -1,7 +1,7 @@
 #!/usr/bin/perl -w
 use strict;
 
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 # Rice Reactome - CHEBI Ontology Mapping Script
 #
 # Justin Preece, 10/06/10
@@ -9,35 +9,43 @@ use strict;
 # Purpose: Map CHEBI ontology terms onto Rice Reactome database.
 #
 # Inputs:
+#
 #   CHEBI OBO file (preset)
+#
 #   Rice Reactome file (preset, provided by YuanMing Wu)
 #   (Header)    [ReactomeID]    [Compound_Name]            [CAS]           [LIGAND]    [RiceCyc]
 #   (Row)       923893          S-adenosyl-L-methionine    29908-03-0      C00019      S-ADENOSYLMETHIONINE        ** the '-' (dash) symbol will be applied to any empty columns
 #
 # Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
-#   (Header)    [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
-#   (Row)       923893          15414      CAS            29908-03-0
-#   (Row)       923893          15414      LIGAND         C00019
-#   (Row)       923893          15414      RiceCyc        S-ADENOSYLMETHIONINE      ** this would be a rare mapping occurrence
-# --------------------------------------------------------------------
+#
+#   [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
+#   923893          15414      CAS            29908-03-0
+#   923893          15414      LIGAND         C00019
+#   923893          15414      RiceCycTerm    S-ADENOSYLMETHIONINE
+#   923893          15414      RiceCycSynonym s-adenosylmethionine
+# ---------------------------------------------------------------------------
 
 
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 # modules
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 use GO::Parser;
 
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 # declarations
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 # set paths to data files
-my $data_path = "/Users/cindypreece/Desktop/jaiswal_lab/projects/reactome/";
+my $data_path = "/home/preecej/Documents/Projects/Reactome/";
 my $chebi_obo_file = "chebi_sample.obo";
-my $reactome_file = "RiceReferenceMolecules.txt";
+my $reactome_file = "RiceReferenceMolecules_sample.txt";
 my $mapped_output_file = "reactome_chebi_mapping.txt";
 
+# options
+my $allow_obsolete_terms = 1;
+my $allow_synonyms = 1;
+
 my $ont; # chebi ontology
 
 my %reactome_CAS; # rice reactome CAS hash
@@ -47,13 +55,13 @@ my %reactome_RiceCyc; # rice reactome RiceCyc hash
 my @map_results = (); # successful mappings between chebi and reactome
 
 
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 # functions
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 
 # setup chebi parser and reactome data
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 sub init
 {
     # init ontology parser and ontology
@@ -61,30 +69,24 @@ sub init
     $parser->parse($data_path . $chebi_obo_file);
     $ont = $parser->handler->graph;
       
-#    my $parser = Bio::OntologyIO->new (
-#        -format => "obo",
-#        -file => $data_path . $chebi_obo_file);
-
-    # init ontology
-#    $ont = $parser->next_ontology();
-#    $parser->close();
-
     # read rice reactome file into 3 separate hashes
     open(REACTOME_FILE,$data_path . $reactome_file);
 
     my $line = <REACTOME_FILE>; # skip the header
+    my $reactome_count = 0;
     
     while (<REACTOME_FILE>)
     {
         $line = $_;
         chomp $line;
+        $reactome_count++;
         my @reactome_entry = split(/\t/, $line); # break up our tab-del line
 
         # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
         my $reactome_id = $reactome_entry[0];
         my $CAS_id = $reactome_entry[2];
         my $LIGAND_id = $reactome_entry[3];
-        my $RiceCyc_term = $reactome_entry[4];
+        my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity
 
         # There is a possibility that a single CAS, LIGAND, or RiceCyc
         # identifier may appear in more than one reactome entry. This
@@ -92,8 +94,7 @@ sub init
         # one ReactomeID, if necessary.
         
         # --CAS Hash Load--
-        if ($CAS_id ne "-") # keep those "-" placeholders out
-        {
+        if ($CAS_id ne "-") { # keep those "-" placeholders out
             # build the CAS hash; each value may hold 1...n reactome
             # ids (as an array)
             push @{$reactome_CAS{$CAS_id}}, $reactome_id;
@@ -102,28 +103,36 @@ sub init
         # similarly...
 
         # --LIGAND Hash Load--
-        if ($LIGAND_id ne "-")
-        {
+        if ($LIGAND_id ne "-") {
             push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
         }
 
         # --RiceCyc Hash Load--
-        if ($RiceCyc_term ne "-")
-        {
+        if ($RiceCyc_term ne "-") {
             push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
         }
     }
     close REACTOME_FILE;
+    
+    print "\n[Reactome Stats]",
+        "\nTotal Oryza Reactome Entries: $reactome_count\n";
+
 }
 
 
 # spit out some data to make sure you've read in the files correctly
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 sub test_inputs
 {
-       # output new GO version of parsed ontology
-       my $terms = $ont->get_all_terms;
-       foreach my $term (@$terms) {
+    # basic ontology info
+    print "[Node Count]: " . $ont->node_count . "\n"; 
+
+    # get all chebi terms in the ontology
+       my $terms = $ont->get_all_nodes;
+       
+       # output contents of parsed ontology
+       foreach my $term (@$terms)
+       {
                print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
 
                my $synonyms = $term->synonym_list;
@@ -142,36 +151,9 @@ sub test_inputs
                print "\n";
        }
 
-    # output basic stats on chebi ontology
-#    print "\n[Ontology Stats]\n";
-#    print "read ontology ",$ont->name()," with ",
-#        scalar($ont->get_root_terms)," root terms, and ",
-#        scalar($ont->get_all_terms)," total terms, and ",
-#        scalar($ont->get_leaf_terms)," leaf terms\n";
-
-    # all chebi terms in the ontology
-#    print "\n[CHEBI Term List from \$ont]\n";
-#    foreach my $term ($ont->get_all_terms) {
-#        my @synonyms = $term->get_synonyms;
-#        my @xrefs = $term->get_dbxrefs;
-
-#        print $term->identifier;
-#        print " \|NAME\| ";
-#        if (defined($term->name)) {
-#            print $term->name;
-#        }
-#        print " \|SYNONYMS\| ";
-#        print "$_," foreach @synonyms;
-#        print " \|XREFS\| ";
-#        print "$_" foreach @xrefs;
-#        foreach my $xref (@xrefs) {
-#            print $xref->primary_id;
-#        }
-#        print "\n\n";
-#    }
-
-    # show reactome hashes - this is important, give >1 dupes to Pankaj
-    # for manual reference
+    # show dupes in reactome hashes - give data to Pankaj;
+    # this is important b/c the duplicates may represent erroneous data in
+    # the Rice Reactome dataset
     my $k; my @v;
     print "\n[Reactome Hashes - Dupes]\n";
     print "\n--CAS Hash--\n";
@@ -196,51 +178,131 @@ sub test_inputs
 
 
 # map the chebi terms to the reactome entries
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 sub perform_map
 {
-    my @chebi_obo_terms = $ont->get_all_terms;
-    #print $_->identifier . "\n" foreach @chebi_obo_terms;
-
+    my $chebi_obo_terms = $ont->get_all_nodes;
+    
+    # vars for mapping stats
+    my $attempted_mappings = 0;
+    my $successful_mappings = 0;
+    my $attempted_CAS_mappings = 0;
+    my $successful_CAS_mappings = 0;
+    my $attempted_LIGAND_mappings = 0;
+    my $successful_LIGAND_mappings = 0;
+    my $attempted_name_mappings = 0;
+    my $successful_name_mappings = 0;
+    my $attempted_synonym_mappings = 0;
+    my $successful_synonym_mappings = 0;
+    
     # loop through each chebi term
-    foreach my $term (@chebi_obo_terms)
-    {
-        # set locals for matching each term property
-        my $term_name;
-        if (defined($term->name)) {
-            $term_name = $term->name;
-        } else {
-            $term_name = "";
-        }
-        my @term_synonyms = $term->get_synonyms;
-
-        # attempt CHEBI match on CAS ID
-
-        # attempt CHEBI match on LIGAND ID
-        
-        # attempt CHEBI match on RiceCyc names
-        if (defined($reactome_RiceCyc{"\U$term_name"})) {
-            push (@map_results, "$reactome_RiceCyc{$term_name}\t",
-                "$term->identifier\t",
-                "RiceCyc\t",
-                $term_name);
-        } else { # check the term synonyms, if needed
-            foreach my $synonym (@term_synonyms) {
-                print "";
+       foreach my $term (@$chebi_obo_terms)
+       {
+           # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
+           # terms and whether to allow them
+        if (($term->acc =~ m/^CHEBI:/)
+            && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
+        {
+            # attempt CHEBI match on CAS and LIGAND ID's
+            $attempted_mappings++;
+            my $xrefs = $term->dbxref_list;
+            foreach my $xref (@$xrefs)
+            {
+                $attempted_CAS_mappings++;
+                $attempted_LIGAND_mappings++;
+
+                # temp-foo to skirt an interpolation problem 
+                my $tmp_key = $xref->xref_key;
+
+                if (defined($reactome_CAS{"$tmp_key"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
+                    {
+                        $successful_CAS_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "CAS\t" .
+                            $tmp_key);
+                    }
+                }
+
+                if (defined($reactome_LIGAND{"$tmp_key"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
+                    {
+                        $successful_LIGAND_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "LIGAND\t" .
+                            $tmp_key);
+                    }
+                }
+            }
+                
+            # attempt CHEBI match on RiceCyc names...
+            $attempted_name_mappings++;
+
+            # more temp-foo to skirt said interpolation problem 
+            my $tmp_name = uc $term->name; 
+
+            if (defined($reactome_RiceCyc{"$tmp_name"}))
+            {
+                $successful_name_mappings++;
+                push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" .
+                    $term->acc . "\t" . 
+                    "RiceCycTerm\t" .
+                    $term->name);
+            }
+            # ...and synonyms (optional)
+            if ($allow_synonyms)
+            {
+                my $synonyms = $term->synonym_list;
+                foreach my $synonym (@$synonyms)
+                {
+                    $attempted_synonym_mappings++;
+
+                    # yet more temp-foo to skirt interpolation problem 
+                    my $tmp_syn = "\U$synonym"; 
+
+                    if (defined($reactome_RiceCyc{$tmp_syn}))
+                    {
+                        $successful_synonym_mappings++;
+                        push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" .
+                            $term->acc . "\t" .
+                            "RiceCycSynonym\t" .
+                            $synonym);
+                    }
+                }
             }
         }
     }
+    # send up some stats on the mapping process
+    $successful_mappings =
+        + $successful_CAS_mappings
+        + $successful_LIGAND_mappings
+        + $successful_name_mappings
+        + $successful_synonym_mappings;
+        
+    print "\n[ChEBI Stats]",
+        "\nNodes in File: " . $ont->node_count,
+        "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
+        "\nTotal Successful Mappings: " . $successful_mappings . "\n";
+
+    print "\n[Mapping Breakdown by Type]",
+        "\nCAS (matches/attempts): ",
+            "$successful_CAS_mappings/$attempted_CAS_mappings ",
+            "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
+        "\nLIGAND: ",
+            "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
+        "\nTerm Names: ",
+            "$successful_name_mappings/$attempted_name_mappings",
+        "\nTerm Synonyms: ",
+            "$successful_synonym_mappings/$attempted_synonym_mappings\n\n";
 }
 
-# sample format - remove later
-#   [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
-#   923893          15414      CAS            29908-03-0
-#   923893          15414      LIGAND         C00019
-#   923893          15414      RiceCyc        S-ADENOSYLMETHIONINE
-
 
 # put the results in the mapped output file
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 sub create_mapfile
 {
     if (@map_results > 0)
@@ -249,7 +311,7 @@ sub create_mapfile
         unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
         
         # setup output file
-        open(OUTPUT_FILE,">>" . $data_path . $mapped_output_file);
+        open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
     
         #format results for file output
         print OUTPUT_FILE "$_\n" foreach @map_results;
@@ -261,13 +323,13 @@ sub create_mapfile
 }
 
 
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 # main
-# --------------------------------------------------------------------
+# ---------------------------------------------------------------------------
 
 init;
-test_inputs;
-#perform_map;
+#test_inputs;
+perform_map;
 create_mapfile;
 
 exit;