From: preecej Date: Fri, 8 Oct 2010 00:51:38 +0000 (+0000) Subject: added individual hashes for each input type; attempted population of CAS X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=51585089436efff85c1b06932da56130b31c1167;p=old-jaiswallab-svn%2F.git added individual hashes for each input type; attempted population of CAS input hash; stubbed out main mapping algo svn path=/; revision=59 --- diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl index 85e201b..cc12279 100755 --- a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl +++ b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl @@ -21,12 +21,14 @@ use strict; # (Row) 923893 15414 RiceCyc S-ADENOSYLMETHIONINE ** this would be a rare mapping occurrence; only if CAS and LIGAND mappings are not available # -------------------------------------------------------------------- + # -------------------------------------------------------------------- # modules # -------------------------------------------------------------------- use Bio::OntologyIO; + # -------------------------------------------------------------------- # declarations # -------------------------------------------------------------------- @@ -37,74 +39,137 @@ my $chebi_obo_file = "chebi_sample.obo"; my $reactome_file = "RiceReferenceMolecules_sample.txt"; my $mapped_output_file = "reactome_chebi_mapping.txt"; -my $parser; # chebi ontology parser -my @reactome_ref_molecules; # rice reactomes +my $ont; # chebi ontology + +my %reactome_CAS; # rice reactome CAS hash +my %reactome_LIGAND; # rice reactome LIGAND hash +my %reactome_RiceCyc; # rice reactome RiceCyc hash + +my @map_results; # successful mappings between chebi and reactome + # -------------------------------------------------------------------- # functions # -------------------------------------------------------------------- + # setup chebi parser and reactome data # -------------------------------------------------------------------- sub init { - # initialize ontology parser - $parser = Bio::OntologyIO->new ( + # init ontology parser + my $parser = Bio::OntologyIO->new ( -format => "obo", -file => $data_path . $chebi_obo_file); + + # init ontology + $ont = $parser->next_ontology(); + $parser->close(); + + # read rice reactome file into 3 separate hashes + open(REACTOME_FILE,$data_path . $reactome_file); + + my $line = ; # skip the header - # read chebi file into array - #open(CHEBI_OBO_FILE,$data_path . $chebi_obo_file); - #my @chebi_obo_terms = ; - #chomp(@chebi_obo_terms); - #close CHEBI_OBO_FILE; + while () + { + $line = $_; + chomp $line; + my @reactome_entry = split(/\t/, $line); # break up our tab-del line + + # There is a possibility that a single CAS, LIGAND, or RiceCyc + # identifier may appear in more than one reactome entry. This + # temp array allows each hash match to hold more than + # ReactomeID, if necessary. + my @temp_ary = (); + + # --CAS Hash Load-- + # if this reactome entry is already in the hash, just + # append it to the hash's array + if(defined($reactome_CAS{$reactome_entry[2]})) + { + $reactome_CAS{push(@_,$reactome_entry[2])} = $reactome_entry[0]; + # otherwise, just add the reactome entry to the hash + } else { + @temp_ary = (); # clear it for re-use + if ($reactome_entry[2] != "-") # keep those "-" placeholders out + { + $reactome_CAS{$temp_ary[$reactome_entry[2]]} + = $reactome_entry[0]; + } + } + } + + close REACTOME_FILE; # read rice reactome file into array - open(REACTOME_FILE,$data_path . $reactome_file); - @reactome_ref_molecules = ; - chomp(@reactome_ref_molecules); - close REACTOME_FILE; + #open(REACTOME_FILE,$data_path . $reactome_file); + #@reactome_ref_molecules = ; + #chomp(@reactome_ref_molecules); + #close REACTOME_FILE; } + # spit out some data to make sure you've read in the files correctly # -------------------------------------------------------------------- sub test_inputs { # output basic stats on chebi ontology - while (my $ont = $parser->next_ontology()) { - print "read ontology ",$ont->name()," with ", - scalar($ont->get_root_terms)," root terms, and ", - scalar($ont->get_all_terms)," total terms, and ", - scalar($ont->get_leaf_terms)," leaf terms\n"; - } - - # test - show arrays - # print "$_\n" foreach @chebi_obo_terms; - # print "$_\n" foreach @reactome_ref_molecules; + print "\n[Ontology Stats]\n"; + print "read ontology ",$ont->name()," with ", + scalar($ont->get_root_terms)," root terms, and ", + scalar($ont->get_all_terms)," total terms, and ", + scalar($ont->get_leaf_terms)," leaf terms\n"; + + # show reactome hashes + print "\n[Reactome Hashes]\n"; + print "\n--CAS Hash--\n"; + print "$_\n" foreach %reactome_CAS; + print "\n"; } -# map the chebi terms to the reactome entries + +# map the chebi terms to the reactome entries (brute-force algo) # -------------------------------------------------------------------- sub perform_map +{ + # iterate ontology; use brute-force matching to map + my @chebi_obo_terms = $ont->get_all_terms(); + + #print $_->identifier() . "\n" foreach @chebi_obo_terms; + + # --pseudo-- + # loop through each chebi term (~33K) + # if match chebi to reactome hashes on CAS, LIGAND, and/or RiceCyc name; + # write tab-del mapping string directly to results array for EACH type of match + + foreach my $term (@chebi_obo_terms) { + print $term->identifier() . "\n"; + } +} + + +# put the results in the mapped output file +# -------------------------------------------------------------------- +sub create_mapfile { # setup output file open(OUTPUT_FILE,">>" . $data_path . $mapped_output_file); - # do brute-force matching here - - # actually print out matches here - #print OUTPUT_FILE "$_\n" foreach @reactome_ref_molecules; + #format results for file output + print OUTPUT_FILE "$_\n" foreach @map_results; - # cleanup close OUTPUT_FILE; } + # -------------------------------------------------------------------- # main # -------------------------------------------------------------------- init; test_inputs; -# perform_map; +perform_map; +create_mapfile; exit;