From: preecej Date: Tue, 12 Oct 2010 00:30:43 +0000 (+0000) Subject: Added code for all mappings, statistical counters, and STDOUT; tested with X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=58f6fe3f2ee62afb8d13e1776119bd70b1a3643b;p=old-jaiswallab-svn%2F.git Added code for all mappings, statistical counters, and STDOUT; tested with small data set that contained several intentional overlaps between reactome and chebi terms svn path=/; revision=65 --- diff --git a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl index 4ef2a36..bb44688 100755 --- a/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl +++ b/preecej/perl_singletons/reactome_chebi_mapping/reactome_chebi_mapping.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl -w use strict; -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # Rice Reactome - CHEBI Ontology Mapping Script # # Justin Preece, 10/06/10 @@ -9,35 +9,43 @@ use strict; # Purpose: Map CHEBI ontology terms onto Rice Reactome database. # # Inputs: +# # CHEBI OBO file (preset) +# # Rice Reactome file (preset, provided by YuanMing Wu) # (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [RiceCyc] # (Row) 923893 S-adenosyl-L-methionine 29908-03-0 C00019 S-ADENOSYLMETHIONINE ** the '-' (dash) symbol will be applied to any empty columns # # Outputs: tab-del mapping file (reactome_chebi_mapping.txt) -# (Header) [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] -# (Row) 923893 15414 CAS 29908-03-0 -# (Row) 923893 15414 LIGAND C00019 -# (Row) 923893 15414 RiceCyc S-ADENOSYLMETHIONINE ** this would be a rare mapping occurrence -# -------------------------------------------------------------------- +# +# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] +# 923893 15414 CAS 29908-03-0 +# 923893 15414 LIGAND C00019 +# 923893 15414 RiceCycTerm S-ADENOSYLMETHIONINE +# 923893 15414 RiceCycSynonym s-adenosylmethionine +# --------------------------------------------------------------------------- -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # modules -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- use GO::Parser; -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # declarations -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # set paths to data files -my $data_path = "/Users/cindypreece/Desktop/jaiswal_lab/projects/reactome/"; +my $data_path = "/home/preecej/Documents/Projects/Reactome/"; my $chebi_obo_file = "chebi_sample.obo"; -my $reactome_file = "RiceReferenceMolecules.txt"; +my $reactome_file = "RiceReferenceMolecules_sample.txt"; my $mapped_output_file = "reactome_chebi_mapping.txt"; +# options +my $allow_obsolete_terms = 1; +my $allow_synonyms = 1; + my $ont; # chebi ontology my %reactome_CAS; # rice reactome CAS hash @@ -47,13 +55,13 @@ my %reactome_RiceCyc; # rice reactome RiceCyc hash my @map_results = (); # successful mappings between chebi and reactome -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # functions -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # setup chebi parser and reactome data -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- sub init { # init ontology parser and ontology @@ -61,30 +69,24 @@ sub init $parser->parse($data_path . $chebi_obo_file); $ont = $parser->handler->graph; -# my $parser = Bio::OntologyIO->new ( -# -format => "obo", -# -file => $data_path . $chebi_obo_file); - - # init ontology -# $ont = $parser->next_ontology(); -# $parser->close(); - # read rice reactome file into 3 separate hashes open(REACTOME_FILE,$data_path . $reactome_file); my $line = ; # skip the header + my $reactome_count = 0; while () { $line = $_; chomp $line; + $reactome_count++; my @reactome_entry = split(/\t/, $line); # break up our tab-del line # load up this reactome entry's ID, CAS, LIGAND, and RiceCyc values my $reactome_id = $reactome_entry[0]; my $CAS_id = $reactome_entry[2]; my $LIGAND_id = $reactome_entry[3]; - my $RiceCyc_term = $reactome_entry[4]; + my $RiceCyc_term = uc $reactome_entry[4]; # for case-insensitivity # There is a possibility that a single CAS, LIGAND, or RiceCyc # identifier may appear in more than one reactome entry. This @@ -92,8 +94,7 @@ sub init # one ReactomeID, if necessary. # --CAS Hash Load-- - if ($CAS_id ne "-") # keep those "-" placeholders out - { + if ($CAS_id ne "-") { # keep those "-" placeholders out # build the CAS hash; each value may hold 1...n reactome # ids (as an array) push @{$reactome_CAS{$CAS_id}}, $reactome_id; @@ -102,28 +103,36 @@ sub init # similarly... # --LIGAND Hash Load-- - if ($LIGAND_id ne "-") - { + if ($LIGAND_id ne "-") { push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id; } # --RiceCyc Hash Load-- - if ($RiceCyc_term ne "-") - { + if ($RiceCyc_term ne "-") { push @{$reactome_RiceCyc{"\U$RiceCyc_term"}}, $reactome_id; } } close REACTOME_FILE; + + print "\n[Reactome Stats]", + "\nTotal Oryza Reactome Entries: $reactome_count\n"; + } # spit out some data to make sure you've read in the files correctly -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- sub test_inputs { - # output new GO version of parsed ontology - my $terms = $ont->get_all_terms; - foreach my $term (@$terms) { + # basic ontology info + print "[Node Count]: " . $ont->node_count . "\n"; + + # get all chebi terms in the ontology + my $terms = $ont->get_all_nodes; + + # output contents of parsed ontology + foreach my $term (@$terms) + { print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n"; my $synonyms = $term->synonym_list; @@ -142,36 +151,9 @@ sub test_inputs print "\n"; } - # output basic stats on chebi ontology -# print "\n[Ontology Stats]\n"; -# print "read ontology ",$ont->name()," with ", -# scalar($ont->get_root_terms)," root terms, and ", -# scalar($ont->get_all_terms)," total terms, and ", -# scalar($ont->get_leaf_terms)," leaf terms\n"; - - # all chebi terms in the ontology -# print "\n[CHEBI Term List from \$ont]\n"; -# foreach my $term ($ont->get_all_terms) { -# my @synonyms = $term->get_synonyms; -# my @xrefs = $term->get_dbxrefs; - -# print $term->identifier; -# print " \|NAME\| "; -# if (defined($term->name)) { -# print $term->name; -# } -# print " \|SYNONYMS\| "; -# print "$_," foreach @synonyms; -# print " \|XREFS\| "; -# print "$_" foreach @xrefs; -# foreach my $xref (@xrefs) { -# print $xref->primary_id; -# } -# print "\n\n"; -# } - - # show reactome hashes - this is important, give >1 dupes to Pankaj - # for manual reference + # show dupes in reactome hashes - give data to Pankaj; + # this is important b/c the duplicates may represent erroneous data in + # the Rice Reactome dataset my $k; my @v; print "\n[Reactome Hashes - Dupes]\n"; print "\n--CAS Hash--\n"; @@ -196,51 +178,131 @@ sub test_inputs # map the chebi terms to the reactome entries -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- sub perform_map { - my @chebi_obo_terms = $ont->get_all_terms; - #print $_->identifier . "\n" foreach @chebi_obo_terms; - + my $chebi_obo_terms = $ont->get_all_nodes; + + # vars for mapping stats + my $attempted_mappings = 0; + my $successful_mappings = 0; + my $attempted_CAS_mappings = 0; + my $successful_CAS_mappings = 0; + my $attempted_LIGAND_mappings = 0; + my $successful_LIGAND_mappings = 0; + my $attempted_name_mappings = 0; + my $successful_name_mappings = 0; + my $attempted_synonym_mappings = 0; + my $successful_synonym_mappings = 0; + # loop through each chebi term - foreach my $term (@chebi_obo_terms) - { - # set locals for matching each term property - my $term_name; - if (defined($term->name)) { - $term_name = $term->name; - } else { - $term_name = ""; - } - my @term_synonyms = $term->get_synonyms; - - # attempt CHEBI match on CAS ID - - # attempt CHEBI match on LIGAND ID - - # attempt CHEBI match on RiceCyc names - if (defined($reactome_RiceCyc{"\U$term_name"})) { - push (@map_results, "$reactome_RiceCyc{$term_name}\t", - "$term->identifier\t", - "RiceCyc\t", - $term_name); - } else { # check the term synonyms, if needed - foreach my $synonym (@term_synonyms) { - print ""; + foreach my $term (@$chebi_obo_terms) + { + # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete + # terms and whether to allow them + if (($term->acc =~ m/^CHEBI:/) + && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms))) + { + # attempt CHEBI match on CAS and LIGAND ID's + $attempted_mappings++; + my $xrefs = $term->dbxref_list; + foreach my $xref (@$xrefs) + { + $attempted_CAS_mappings++; + $attempted_LIGAND_mappings++; + + # temp-foo to skirt an interpolation problem + my $tmp_key = $xref->xref_key; + + if (defined($reactome_CAS{"$tmp_key"})) + { + foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}}) + { + $successful_CAS_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CAS\t" . + $tmp_key); + } + } + + if (defined($reactome_LIGAND{"$tmp_key"})) + { + foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}}) + { + $successful_LIGAND_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "LIGAND\t" . + $tmp_key); + } + } + } + + # attempt CHEBI match on RiceCyc names... + $attempted_name_mappings++; + + # more temp-foo to skirt said interpolation problem + my $tmp_name = uc $term->name; + + if (defined($reactome_RiceCyc{"$tmp_name"})) + { + $successful_name_mappings++; + push (@map_results, "@{$reactome_RiceCyc{$tmp_name}}\t" . + $term->acc . "\t" . + "RiceCycTerm\t" . + $term->name); + } + # ...and synonyms (optional) + if ($allow_synonyms) + { + my $synonyms = $term->synonym_list; + foreach my $synonym (@$synonyms) + { + $attempted_synonym_mappings++; + + # yet more temp-foo to skirt interpolation problem + my $tmp_syn = "\U$synonym"; + + if (defined($reactome_RiceCyc{$tmp_syn})) + { + $successful_synonym_mappings++; + push (@map_results, "@{$reactome_RiceCyc{$tmp_syn}}\t" . + $term->acc . "\t" . + "RiceCycSynonym\t" . + $synonym); + } + } } } } + # send up some stats on the mapping process + $successful_mappings = + + $successful_CAS_mappings + + $successful_LIGAND_mappings + + $successful_name_mappings + + $successful_synonym_mappings; + + print "\n[ChEBI Stats]", + "\nNodes in File: " . $ont->node_count, + "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings, + "\nTotal Successful Mappings: " . $successful_mappings . "\n"; + + print "\n[Mapping Breakdown by Type]", + "\nCAS (matches/attempts): ", + "$successful_CAS_mappings/$attempted_CAS_mappings ", + "(note: can include ChemIDplus and KEGG COMPUND db duplicates)", + "\nLIGAND: ", + "$successful_LIGAND_mappings/$attempted_LIGAND_mappings", + "\nTerm Names: ", + "$successful_name_mappings/$attempted_name_mappings", + "\nTerm Synonyms: ", + "$successful_synonym_mappings/$attempted_synonym_mappings\n\n"; } -# sample format - remove later -# [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID] -# 923893 15414 CAS 29908-03-0 -# 923893 15414 LIGAND C00019 -# 923893 15414 RiceCyc S-ADENOSYLMETHIONINE - # put the results in the mapped output file -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- sub create_mapfile { if (@map_results > 0) @@ -249,7 +311,7 @@ sub create_mapfile unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID"); # setup output file - open(OUTPUT_FILE,">>" . $data_path . $mapped_output_file); + open(OUTPUT_FILE,">" . $data_path . $mapped_output_file); #format results for file output print OUTPUT_FILE "$_\n" foreach @map_results; @@ -261,13 +323,13 @@ sub create_mapfile } -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- # main -# -------------------------------------------------------------------- +# --------------------------------------------------------------------------- init; -test_inputs; -#perform_map; +#test_inputs; +perform_map; create_mapfile; exit;