Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
copying
authorpreecej <preecej@localhost>
Sat, 9 Oct 2010 07:21:33 +0000 (07:21 +0000)
committerpreecej <preecej@localhost>
Sat, 9 Oct 2010 07:21:33 +0000 (07:21 +0000)
svn path=/; revision=62

preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl [new file with mode: 0644]
preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl~ [new file with mode: 0644]

diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl
new file mode 100644 (file)
index 0000000..ec31169
--- /dev/null
@@ -0,0 +1,250 @@
+#!/usr/bin/perl -w
+use strict;
+
+# SVN test from jedit on Mac
+
+# --------------------------------------------------------------------
+# Rice Reactome - CHEBI Ontology Mapping Script
+#
+# Justin Preece, 10/06/10
+#
+# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
+#
+# Inputs:
+#   CHEBI OBO file (preset)
+#   Rice Reactome file (preset, provided by YuanMing Wu)
+#   (Header)    [ReactomeID]    [Compound_Name]            [CAS]           [LIGAND]    [RiceCyc]
+#   (Row)       923893          S-adenosyl-L-methionine    29908-03-0      C00019      S-ADENOSYLMETHIONINE        ** the '-' (dash) symbol will be applied to any empty columns
+#
+# Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
+#   (Header)    [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
+#   (Row)       923893          15414      CAS            29908-03-0
+#   (Row)       923893          15414      LIGAND         C00019
+#   (Row)       923893          15414      RiceCyc        S-ADENOSYLMETHIONINE      ** this would be a rare mapping occurrence
+# --------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------
+# modules
+# --------------------------------------------------------------------
+
+use Bio::OntologyIO;
+
+# --------------------------------------------------------------------
+# declarations
+# --------------------------------------------------------------------
+
+# locations of the input and output data files
+my $data_path = '/home/preecej/Documents/Projects/Reactome/';
+my $chebi_obo_file = 'chebi_sample.obo';
+my $reactome_file = 'RiceReferenceMolecules_sample.txt';
+my $mapped_output_file = 'reactome_chebi_mapping.txt';
+
+# chebi ontology (assigned in init)
+my $ont;
+
+# rice reactome lookups, one hash per identifier type (CAS, LIGAND, RiceCyc)
+my (%reactome_CAS, %reactome_LIGAND, %reactome_RiceCyc);
+
+# successful mappings between chebi and reactome
+my @map_results;
+
+
+# --------------------------------------------------------------------
+# functions
+# --------------------------------------------------------------------
+
+
+# setup chebi parser and reactome data
+# --------------------------------------------------------------------
+sub init
+{
+    # Loads both inputs into file-level state:
+    #   $ont               - the CHEBI ontology parsed from the OBO file
+    #   %reactome_CAS      - CAS id       => array ref of ReactomeIDs
+    #   %reactome_LIGAND   - LIGAND id    => array ref of ReactomeIDs
+    #   %reactome_RiceCyc  - RiceCyc term (upper-cased) => array ref of
+    #                        ReactomeIDs
+    # Takes no arguments. Dies if no ontology can be read or the reactome
+    # file cannot be opened.
+
+    # init ontology parser
+    my $parser = Bio::OntologyIO->new (
+        -format => "obo",
+        -file => $data_path . $chebi_obo_file);
+
+    # init ontology
+    $ont = $parser->next_ontology();
+    $parser->close();
+    defined($ont)
+        or die "No ontology could be read from $data_path$chebi_obo_file\n";
+
+    # read rice reactome file into 3 separate hashes; a lexical handle,
+    # 3-arg open and an explicit check make a missing file fail loudly
+    # instead of silently producing an empty mapping
+    my $reactome_path = $data_path . $reactome_file;
+    open(my $reactome_fh, "<", $reactome_path)
+        or die "Cannot open $reactome_path for reading: $!\n";
+
+    my $header = <$reactome_fh>; # skip the header
+
+    while (my $line = <$reactome_fh>)
+    {
+        $line =~ s/\r?\n\z//;       # strip Unix or DOS line endings
+        next if $line =~ /^\s*$/;   # ignore blank lines
+
+        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+        # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
+        my $reactome_id = $reactome_entry[0];
+        next unless defined($reactome_id) && length($reactome_id);
+        my ($CAS_id, $LIGAND_id, $RiceCyc_term) = @reactome_entry[2 .. 4];
+
+        # split() drops trailing empty fields, so a short row leaves some
+        # of these undefined; treat undefined or empty columns exactly
+        # like the "-" placeholder
+        foreach my $field ($CAS_id, $LIGAND_id, $RiceCyc_term)
+        {
+            $field = "-" unless defined($field) && length($field);
+        }
+
+        # There is a possibility that a single CAS, LIGAND, or RiceCyc
+        # identifier may appear in more than one reactome entry, so each
+        # hash value is an array holding 1...n ReactomeIDs.
+
+        # --CAS Hash Load--
+        if ($CAS_id ne "-") # keep those "-" placeholders out
+        {
+            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
+        }
+
+        # --LIGAND Hash Load--
+        if ($LIGAND_id ne "-")
+        {
+            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
+        }
+
+        # --RiceCyc Hash Load-- (upper-cased key for case-insensitive lookup)
+        if ($RiceCyc_term ne "-")
+        {
+            push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
+        }
+    }
+    close($reactome_fh);
+}
+
+
+# spit out some data to make sure you've read in the files correctly
+# --------------------------------------------------------------------
+sub test_inputs
+{
+    # Debugging aid: prints ontology statistics, every CHEBI term with its
+    # name, synonyms and xrefs, and the contents of the three reactome
+    # hashes to STDOUT. Takes no arguments.
+
+    # basic stats on chebi ontology
+    my @stats = (
+        "read ontology ", $ont->name(), " with ",
+        scalar($ont->get_root_terms), " root terms, and ",
+        scalar($ont->get_all_terms), " total terms, and ",
+        scalar($ont->get_leaf_terms), " leaf terms\n",
+    );
+    print "\n[Ontology Stats]\n";
+    print @stats;
+
+    # all chebi terms in the ontology
+    print "\n[CHEBI Term List from \$ont]\n";
+    for my $chebi_term ($ont->get_all_terms)
+    {
+        my @syn_list = $chebi_term->get_synonyms;
+        my @xref_list = $chebi_term->get_dbxrefs;
+
+        print $chebi_term->identifier, " \|NAME\| ";
+        print $chebi_term->name if defined($chebi_term->name);
+
+        print " \|SYNONYMS\| ";
+        for my $syn (@syn_list)
+        {
+            print "$syn,";
+        }
+
+        # each xref is printed as-is first, then by its primary id
+        print " \|XREFS\| ";
+        for my $xref (@xref_list)
+        {
+            print "$xref";
+        }
+        for my $xref (@xref_list)
+        {
+            print $xref->primary_id;
+        }
+        print "\n\n";
+    }
+
+    # show reactome hashes - this is important, give >1 dupes to Pankaj
+    # for manual reference (every key is listed here, not only those
+    # holding more than one ReactomeID)
+    my @hash_dumps = (
+        [ "\n--CAS Hash--\n",     \%reactome_CAS ],
+        [ "\n--LIGAND Hash--\n",  \%reactome_LIGAND ],
+        [ "\n--RiceCyc Hash--\n", \%reactome_RiceCyc ],
+    );
+    print "\n[Reactome Hashes]\n";
+    for my $dump (@hash_dumps)
+    {
+        my ($banner, $lookup) = @{$dump};
+        print $banner;
+        for my $key (keys %{$lookup})
+        {
+            print "$key: @{$lookup->{$key}}\n";
+        }
+    }
+}
+
+
+# map the chebi terms to the reactome entries
+# --------------------------------------------------------------------
+sub perform_map
+{
+    # Walks every term of the CHEBI ontology in $ont and, for each term
+    # whose name matches a RiceCyc term (case-insensitively), appends one
+    # finished tab-delimited row per matching ReactomeID to @map_results:
+    #   ReactomeID <tab> CHEBI identifier <tab> "RiceCyc" <tab> term name
+    # Takes no arguments.
+    #
+    # NOTE(review): the sample output in the file header shows a bare
+    # number (15414) in the CHEBI column; the identifier is written here
+    # exactly as the term returns it - confirm whether any prefix needs
+    # to be stripped.
+
+    # loop through each chebi term
+    foreach my $term ($ont->get_all_terms)
+    {
+        # TODO: attempt CHEBI match on CAS ID
+
+        # TODO: attempt CHEBI match on LIGAND ID
+
+        # attempt CHEBI match on RiceCyc names
+        my $term_name = $term->name;
+        next unless defined($term_name) && length($term_name);
+
+        # the hash keys were upper-cased on load, so the lookup (not just
+        # the existence check) must use the upper-cased name
+        my $reactome_ids = $reactome_RiceCyc{uc $term_name};
+        if (!defined($reactome_ids))
+        {
+            # TODO: check the term synonyms, if needed
+            next;
+        }
+
+        # method calls are not interpolated inside strings, so fetch the
+        # identifier into a scalar before building the row
+        my $chebi_id = $term->identifier;
+        $chebi_id = "" unless defined($chebi_id);
+
+        # one output row per reactome entry sharing this RiceCyc term
+        foreach my $reactome_id (@{$reactome_ids})
+        {
+            push(@map_results,
+                join("\t", $reactome_id, $chebi_id, "RiceCyc", $term_name));
+        }
+    }
+}
+
+# sample format - remove later
+#   [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
+#   923893          15414      CAS            29908-03-0
+#   923893          15414      LIGAND         C00019
+#   923893          15414      RiceCyc        S-ADENOSYLMETHIONINE
+
+
+# put the results in the mapped output file
+# --------------------------------------------------------------------
+sub create_mapfile
+{
+    # Appends every entry of @map_results, one per line, to the mapping
+    # file ($data_path . $mapped_output_file). The column header is
+    # written only when the file is new or empty, so repeated runs do not
+    # embed header lines in the middle of the data. @map_results itself
+    # is left unmodified. Prints a notice to STDOUT when there is nothing
+    # to write. Dies if the file cannot be opened or closed.
+    #
+    # NOTE(review): append mode is kept from the original, so re-running
+    # the script accumulates rows - confirm that is intended.
+    if (@map_results > 0)
+    {
+        my $output_path = $data_path . $mapped_output_file;
+
+        # decide before opening: ">>" creates the file if it is missing
+        my $needs_header = !(-s $output_path);
+
+        # setup output file
+        open(my $output_fh, ">>", $output_path)
+            or die "Cannot open $output_path for appending: $!\n";
+
+        if ($needs_header)
+        {
+            print $output_fh "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID\n";
+        }
+
+        # format results for file output
+        print $output_fh "$_\n" foreach @map_results;
+
+        # buffered write errors only surface at close
+        close($output_fh)
+            or die "Cannot close $output_path: $!\n";
+    } else {
+        print "\n\nSorry, there are no mapped results.\n\n";
+    }
+}
+
+
+# --------------------------------------------------------------------
+# main
+# --------------------------------------------------------------------
+
+# load the inputs, build the mapping, then write it out
+init();
+#test_inputs();    # uncomment to dump the loaded inputs for inspection
+perform_map();
+create_mapfile();
+
+exit 0;
diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl~ b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping-HEAD.pl~
new file mode 100644 (file)
index 0000000..5d01cd7
--- /dev/null
@@ -0,0 +1,248 @@
+#!/usr/bin/perl -w
+use strict;
+
+# --------------------------------------------------------------------
+# Rice Reactome - CHEBI Ontology Mapping Script
+#
+# Justin Preece, 10/06/10
+#
+# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
+#
+# Inputs:
+#   CHEBI OBO file (preset)
+#   Rice Reactome file (preset, provided by YuanMing Wu)
+#   (Header)    [ReactomeID]    [Compound_Name]            [CAS]           [LIGAND]    [RiceCyc]
+#   (Row)       923893          S-adenosyl-L-methionine    29908-03-0      C00019      S-ADENOSYLMETHIONINE        ** the '-' (dash) symbol will be applied to any empty columns
+#
+# Outputs: tab-del mapping file (reactome_chebi_mapping.txt)
+#   (Header)    [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
+#   (Row)       923893          15414      CAS            29908-03-0
+#   (Row)       923893          15414      LIGAND         C00019
+#   (Row)       923893          15414      RiceCyc        S-ADENOSYLMETHIONINE      ** this would be a rare mapping occurrence
+# --------------------------------------------------------------------
+
+
+# --------------------------------------------------------------------
+# modules
+# --------------------------------------------------------------------
+
+use Bio::OntologyIO;
+
+# --------------------------------------------------------------------
+# declarations
+# --------------------------------------------------------------------
+
+# locations of the input and output data files
+my $data_path = '/home/preecej/Documents/Projects/Reactome/';
+my $chebi_obo_file = 'chebi_sample.obo';
+my $reactome_file = 'RiceReferenceMolecules_sample.txt';
+my $mapped_output_file = 'reactome_chebi_mapping.txt';
+
+# chebi ontology (assigned in init)
+my $ont;
+
+# rice reactome lookups, one hash per identifier type (CAS, LIGAND, RiceCyc)
+my (%reactome_CAS, %reactome_LIGAND, %reactome_RiceCyc);
+
+# successful mappings between chebi and reactome
+my @map_results;
+
+
+# --------------------------------------------------------------------
+# functions
+# --------------------------------------------------------------------
+
+
+# setup chebi parser and reactome data
+# --------------------------------------------------------------------
+sub init
+{
+    # Loads both inputs into file-level state:
+    #   $ont               - the CHEBI ontology parsed from the OBO file
+    #   %reactome_CAS      - CAS id       => array ref of ReactomeIDs
+    #   %reactome_LIGAND   - LIGAND id    => array ref of ReactomeIDs
+    #   %reactome_RiceCyc  - RiceCyc term (upper-cased) => array ref of
+    #                        ReactomeIDs
+    # Takes no arguments. Dies if no ontology can be read or the reactome
+    # file cannot be opened.
+
+    # init ontology parser
+    my $parser = Bio::OntologyIO->new (
+        -format => "obo",
+        -file => $data_path . $chebi_obo_file);
+
+    # init ontology
+    $ont = $parser->next_ontology();
+    $parser->close();
+    defined($ont)
+        or die "No ontology could be read from $data_path$chebi_obo_file\n";
+
+    # read rice reactome file into 3 separate hashes; a lexical handle,
+    # 3-arg open and an explicit check make a missing file fail loudly
+    # instead of silently producing an empty mapping
+    my $reactome_path = $data_path . $reactome_file;
+    open(my $reactome_fh, "<", $reactome_path)
+        or die "Cannot open $reactome_path for reading: $!\n";
+
+    my $header = <$reactome_fh>; # skip the header
+
+    while (my $line = <$reactome_fh>)
+    {
+        $line =~ s/\r?\n\z//;       # strip Unix or DOS line endings
+        next if $line =~ /^\s*$/;   # ignore blank lines
+
+        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+        # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values
+        my $reactome_id = $reactome_entry[0];
+        next unless defined($reactome_id) && length($reactome_id);
+        my ($CAS_id, $LIGAND_id, $RiceCyc_term) = @reactome_entry[2 .. 4];
+
+        # split() drops trailing empty fields, so a short row leaves some
+        # of these undefined; treat undefined or empty columns exactly
+        # like the "-" placeholder
+        foreach my $field ($CAS_id, $LIGAND_id, $RiceCyc_term)
+        {
+            $field = "-" unless defined($field) && length($field);
+        }
+
+        # There is a possibility that a single CAS, LIGAND, or RiceCyc
+        # identifier may appear in more than one reactome entry, so each
+        # hash value is an array holding 1...n ReactomeIDs.
+
+        # --CAS Hash Load--
+        if ($CAS_id ne "-") # keep those "-" placeholders out
+        {
+            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
+        }
+
+        # --LIGAND Hash Load--
+        if ($LIGAND_id ne "-")
+        {
+            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
+        }
+
+        # --RiceCyc Hash Load-- (upper-cased key for case-insensitive lookup)
+        if ($RiceCyc_term ne "-")
+        {
+            push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id;
+        }
+    }
+    close($reactome_fh);
+}
+
+
+# spit out some data to make sure you've read in the files correctly
+# --------------------------------------------------------------------
+sub test_inputs
+{
+    # Debugging aid: prints ontology statistics, every CHEBI term with its
+    # name, synonyms and xrefs, and the contents of the three reactome
+    # hashes to STDOUT. Takes no arguments.
+
+    # basic stats on chebi ontology
+    my @stats = (
+        "read ontology ", $ont->name(), " with ",
+        scalar($ont->get_root_terms), " root terms, and ",
+        scalar($ont->get_all_terms), " total terms, and ",
+        scalar($ont->get_leaf_terms), " leaf terms\n",
+    );
+    print "\n[Ontology Stats]\n";
+    print @stats;
+
+    # all chebi terms in the ontology
+    print "\n[CHEBI Term List from \$ont]\n";
+    for my $chebi_term ($ont->get_all_terms)
+    {
+        my @syn_list = $chebi_term->get_synonyms;
+        my @xref_list = $chebi_term->get_dbxrefs;
+
+        print $chebi_term->identifier, " \|NAME\| ";
+        print $chebi_term->name if defined($chebi_term->name);
+
+        print " \|SYNONYMS\| ";
+        for my $syn (@syn_list)
+        {
+            print "$syn,";
+        }
+
+        # each xref is printed as-is first, then by its primary id
+        print " \|XREFS\| ";
+        for my $xref (@xref_list)
+        {
+            print "$xref";
+        }
+        for my $xref (@xref_list)
+        {
+            print $xref->primary_id;
+        }
+        print "\n\n";
+    }
+
+    # show reactome hashes - this is important, give >1 dupes to Pankaj
+    # for manual reference (every key is listed here, not only those
+    # holding more than one ReactomeID)
+    my @hash_dumps = (
+        [ "\n--CAS Hash--\n",     \%reactome_CAS ],
+        [ "\n--LIGAND Hash--\n",  \%reactome_LIGAND ],
+        [ "\n--RiceCyc Hash--\n", \%reactome_RiceCyc ],
+    );
+    print "\n[Reactome Hashes]\n";
+    for my $dump (@hash_dumps)
+    {
+        my ($banner, $lookup) = @{$dump};
+        print $banner;
+        for my $key (keys %{$lookup})
+        {
+            print "$key: @{$lookup->{$key}}\n";
+        }
+    }
+}
+
+
+# map the chebi terms to the reactome entries
+# --------------------------------------------------------------------
+sub perform_map
+{
+    # Walks every term of the CHEBI ontology in $ont and, for each term
+    # whose name matches a RiceCyc term (case-insensitively), appends one
+    # finished tab-delimited row per matching ReactomeID to @map_results:
+    #   ReactomeID <tab> CHEBI identifier <tab> "RiceCyc" <tab> term name
+    # Takes no arguments.
+    #
+    # NOTE(review): the sample output in the file header shows a bare
+    # number (15414) in the CHEBI column; the identifier is written here
+    # exactly as the term returns it - confirm whether any prefix needs
+    # to be stripped.
+
+    # loop through each chebi term
+    foreach my $term ($ont->get_all_terms)
+    {
+        # TODO: attempt CHEBI match on CAS ID
+
+        # TODO: attempt CHEBI match on LIGAND ID
+
+        # attempt CHEBI match on RiceCyc names
+        my $term_name = $term->name;
+        next unless defined($term_name) && length($term_name);
+
+        # the hash keys were upper-cased on load, so the lookup (not just
+        # the existence check) must use the upper-cased name
+        my $reactome_ids = $reactome_RiceCyc{uc $term_name};
+        if (!defined($reactome_ids))
+        {
+            # TODO: check the term synonyms, if needed
+            next;
+        }
+
+        # method calls are not interpolated inside strings, so fetch the
+        # identifier into a scalar before building the row
+        my $chebi_id = $term->identifier;
+        $chebi_id = "" unless defined($chebi_id);
+
+        # one output row per reactome entry sharing this RiceCyc term
+        foreach my $reactome_id (@{$reactome_ids})
+        {
+            push(@map_results,
+                join("\t", $reactome_id, $chebi_id, "RiceCyc", $term_name));
+        }
+    }
+}
+
+# sample format - remove later
+#   [ReactomeID]    [CHEBI]    [XREF_Type]    [XREF_ID]       
+#   923893          15414      CAS            29908-03-0
+#   923893          15414      LIGAND         C00019
+#   923893          15414      RiceCyc        S-ADENOSYLMETHIONINE
+
+
+# put the results in the mapped output file
+# --------------------------------------------------------------------
+sub create_mapfile
+{
+    # Appends every entry of @map_results, one per line, to the mapping
+    # file ($data_path . $mapped_output_file). The column header is
+    # written only when the file is new or empty, so repeated runs do not
+    # embed header lines in the middle of the data. @map_results itself
+    # is left unmodified. Prints a notice to STDOUT when there is nothing
+    # to write. Dies if the file cannot be opened or closed.
+    #
+    # NOTE(review): append mode is kept from the original, so re-running
+    # the script accumulates rows - confirm that is intended.
+    if (@map_results > 0)
+    {
+        my $output_path = $data_path . $mapped_output_file;
+
+        # decide before opening: ">>" creates the file if it is missing
+        my $needs_header = !(-s $output_path);
+
+        # setup output file
+        open(my $output_fh, ">>", $output_path)
+            or die "Cannot open $output_path for appending: $!\n";
+
+        if ($needs_header)
+        {
+            print $output_fh "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID\n";
+        }
+
+        # format results for file output
+        print $output_fh "$_\n" foreach @map_results;
+
+        # buffered write errors only surface at close
+        close($output_fh)
+            or die "Cannot close $output_path: $!\n";
+    } else {
+        print "\n\nSorry, there are no mapped results.\n\n";
+    }
+}
+
+
+# --------------------------------------------------------------------
+# main
+# --------------------------------------------------------------------
+
+# load the inputs, build the mapping, then write it out
+init();
+#test_inputs();    # uncomment to dump the loaded inputs for inspection
+perform_map();
+create_mapfile();
+
+exit 0;