Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
added individual hashes for each input type; attempted population of CAS
author preecej <preecej@localhost>
Fri, 8 Oct 2010 00:51:38 +0000 (00:51 +0000)
committer preecej <preecej@localhost>
Fri, 8 Oct 2010 00:51:38 +0000 (00:51 +0000)
input hash; stubbed out main mapping algo

svn path=/; revision=59

preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl

index 85e201b1059f5a61d52bdbe1adbff531bf33435c..cc12279e096b46b2a9105ae3a327568d1c626935 100755 (executable)
@@ -21,12 +21,14 @@ use strict;
 #   (Row)       923893          15414      RiceCyc        S-ADENOSYLMETHIONINE      ** this would be a rare mapping occurrence; only if CAS and LIGAND mappings are not available
 # --------------------------------------------------------------------
 
+
 # --------------------------------------------------------------------
 # modules
 # --------------------------------------------------------------------
 
 use Bio::OntologyIO;
 
+
 # --------------------------------------------------------------------
 # declarations
 # --------------------------------------------------------------------
@@ -37,74 +39,137 @@ my $chebi_obo_file = "chebi_sample.obo";
 my $reactome_file = "RiceReferenceMolecules_sample.txt";
 my $mapped_output_file = "reactome_chebi_mapping.txt";
 
-my $parser; # chebi ontology parser
-my @reactome_ref_molecules; # rice reactomes
+my $ont; # chebi ontology
+
+my %reactome_CAS; # rice reactome CAS hash
+my %reactome_LIGAND; # rice reactome LIGAND hash
+my %reactome_RiceCyc; # rice reactome RiceCyc hash
+
+my @map_results; # successful mappings between chebi and reactome
+
 
 # --------------------------------------------------------------------
 # functions
 # --------------------------------------------------------------------
 
+
 # setup chebi parser and reactome data
 # --------------------------------------------------------------------
 sub init
 {
-    # initialize ontology parser
-    $parser = Bio::OntologyIO->new (
+    # init ontology parser
+    my $parser = Bio::OntologyIO->new (
         -format => "obo",
         -file => $data_path . $chebi_obo_file);
+
+    # init ontology
+    $ont = $parser->next_ontology();
+    $parser->close();
+
+    # read rice reactome file into 3 separate hashes
+    open(REACTOME_FILE,$data_path . $reactome_file);
+
+    my $line = <REACTOME_FILE>; # skip the header
     
-    # read chebi file into array
-    #open(CHEBI_OBO_FILE,$data_path . $chebi_obo_file);
-    #my @chebi_obo_terms = <CHEBI_OBO_FILE>;
-    #chomp(@chebi_obo_terms);
-    #close CHEBI_OBO_FILE;
+    while (<REACTOME_FILE>)
+    {
+        $line = $_;
+        chomp $line;
+        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+        # There is a possibility that a single CAS, LIGAND, or RiceCyc
+        # identifier may appear in more than one reactome entry. This
+        # temp array allows each hash match to hold more than one
+        # ReactomeID, if necessary.
+        my @temp_ary = ();
+        
+        # --CAS Hash Load--
+        # if this reactome entry is already in the hash, just 
+        # append it to the hash's array
+        if(defined($reactome_CAS{$reactome_entry[2]}))
+        {
+            $reactome_CAS{push(@_,$reactome_entry[2])} = $reactome_entry[0];
+        # otherwise, just add the reactome entry to the hash
+        } else {
+            @temp_ary = (); # clear it for re-use
+            if ($reactome_entry[2] != "-") # keep those "-" placeholders out
+            {
+                $reactome_CAS{$temp_ary[$reactome_entry[2]]}
+                    = $reactome_entry[0];
+            }
+        }
+    }
+
+    close REACTOME_FILE;
     
     # read rice reactome file into array
-    open(REACTOME_FILE,$data_path . $reactome_file);
-    @reactome_ref_molecules = <REACTOME_FILE>;
-    chomp(@reactome_ref_molecules);
-    close REACTOME_FILE;
+    #open(REACTOME_FILE,$data_path . $reactome_file);
+    #@reactome_ref_molecules = <REACTOME_FILE>;
+    #chomp(@reactome_ref_molecules);
+    #close REACTOME_FILE;
 }
 
+
 # spit out some data to make sure you've read in the files correctly
 # --------------------------------------------------------------------
 sub test_inputs
 {
     # output basic stats on chebi ontology
-    while (my $ont = $parser->next_ontology()) {
-        print "read ontology ",$ont->name()," with ",
-            scalar($ont->get_root_terms)," root terms, and ",
-            scalar($ont->get_all_terms)," total terms, and ",
-            scalar($ont->get_leaf_terms)," leaf terms\n";
-    }
-
-    # test - show arrays
-    # print "$_\n" foreach @chebi_obo_terms;
-    # print "$_\n" foreach @reactome_ref_molecules;
+    print "\n[Ontology Stats]\n";
+    print "read ontology ",$ont->name()," with ",
+        scalar($ont->get_root_terms)," root terms, and ",
+        scalar($ont->get_all_terms)," total terms, and ",
+        scalar($ont->get_leaf_terms)," leaf terms\n";
+    
+    # show reactome hashes
+    print "\n[Reactome Hashes]\n";
+    print "\n--CAS Hash--\n";
+    print "$_\n" foreach %reactome_CAS;
+    print "\n";
 }
 
-# map the chebi terms to the reactome entries
+
+# map the chebi terms to the reactome entries (brute-force algo)
 # --------------------------------------------------------------------
 sub perform_map
+{
+    # iterate ontology; use brute-force matching to map
+    my @chebi_obo_terms = $ont->get_all_terms();
+
+    #print $_->identifier() . "\n" foreach @chebi_obo_terms;
+
+    # --pseudo--
+    #   loop through each chebi term (~33K)
+    #       if match chebi to reactome hashes on CAS, LIGAND, and/or RiceCyc name;
+    #           write tab-del mapping string directly to results array for EACH type of match
+        
+    foreach my $term (@chebi_obo_terms) {
+        print $term->identifier() . "\n";
+    }
+}
+
+
+# put the results in the mapped output file
+# --------------------------------------------------------------------
+sub create_mapfile
 {
     # setup output file
     open(OUTPUT_FILE,">>" . $data_path . $mapped_output_file);
 
-    # do brute-force matching here
-
-    # actually print out matches here
-    #print OUTPUT_FILE "$_\n" foreach @reactome_ref_molecules;
+    #format results for file output
+    print OUTPUT_FILE "$_\n" foreach @map_results;
     
-    # cleanup
     close OUTPUT_FILE;
 }
 
+
 # --------------------------------------------------------------------
 # main
 # --------------------------------------------------------------------
 
 init;
 test_inputs;
-# perform_map;
+perform_map;
+create_mapfile;
 
 exit;