#!/usr/bin/perl
use strict;
use warnings;

# --------------------------------------------------------------------
# Rice Reactome - CHEBI Ontology Mapping Script
#
# Justin Preece, 10/06/10
#
# Purpose: Map CHEBI ontology terms onto Rice Reactome database.
#
# Inputs:
#   CHEBI OBO file (preset)
#   Rice Reactome file (preset, provided by YuanMing Wu), tab-delimited:
#     (Header) [ReactomeID] [Compound_Name] [CAS] [LIGAND] [RiceCyc]
#     (Row)    923893  S-adenosyl-L-methionine  29908-03-0  C00019  S-ADENOSYLMETHIONINE
#     ** the '-' (dash) symbol is used as a placeholder in any empty column
#
# Outputs: tab-delimited mapping file (reactome_chebi_mapping.txt)
#     (Header) [ReactomeID] [CHEBI] [XREF_Type] [XREF_ID]
#     (Row)    923893  15414  CAS      29908-03-0
#     (Row)    923893  15414  LIGAND   C00019
#     (Row)    923893  15414  RiceCyc  S-ADENOSYLMETHIONINE
#     ** a RiceCyc match is expected to be a rare mapping occurrence
#
# Status: only the RiceCyc name match is implemented; the CAS, LIGAND
# and synonym matches are still TODO (see perform_map).
# --------------------------------------------------------------------


# --------------------------------------------------------------------
# modules
# --------------------------------------------------------------------

use Bio::OntologyIO;

# --------------------------------------------------------------------
# declarations
# --------------------------------------------------------------------

# set paths to data files
my $data_path = "/home/preecej/Documents/Projects/Reactome/";
my $chebi_obo_file = "chebi_sample.obo";
my $reactome_file = "RiceReferenceMolecules_sample.txt";
my $mapped_output_file = "reactome_chebi_mapping.txt";

my $ont; # chebi ontology

# Each hash maps one identifier to a reference to an array of the
# Reactome IDs that carry it (one identifier may occur in several rows).
my %reactome_CAS;     # rice reactome CAS hash
my %reactome_LIGAND;  # rice reactome LIGAND hash
my %reactome_RiceCyc; # rice reactome RiceCyc hash (keys are upper-cased)

my @map_results = (); # successful mappings: one tab-joined row per element


# --------------------------------------------------------------------
# functions
# --------------------------------------------------------------------


# true when a Reactome column holds a real value, i.e. it is neither
# missing (short row), empty, nor the "-" placeholder
# --------------------------------------------------------------------
sub _has_value
{
    my ($field) = @_;
    return defined($field) && $field ne "" && $field ne "-";
}


# setup chebi parser and reactome data
#
# Loads the CHEBI ontology into $ont and fills %reactome_CAS,
# %reactome_LIGAND and %reactome_RiceCyc from the Reactome file.
# Dies if the ontology or the Reactome file cannot be read.
# --------------------------------------------------------------------
sub init
{
    # init ontology parser
    my $obo_path = $data_path . $chebi_obo_file;
    my $parser = Bio::OntologyIO->new (
        -format => "obo",
        -file => $obo_path);

    # init ontology
    $ont = $parser->next_ontology();
    $parser->close();
    die "No ontology could be read from '$obo_path'\n" unless defined $ont;

    # read rice reactome file into 3 separate hashes
    my $reactome_path = $data_path . $reactome_file;
    open(my $reactome_fh, '<', $reactome_path)
        or die "Cannot open '$reactome_path' for reading: $!\n";

    my $header = <$reactome_fh>; # skip the header

    while (my $line = <$reactome_fh>)
    {
        chomp $line;
        next if $line eq ""; # ignore blank lines

        # break up our tab-del line; the second column (Compound_Name)
        # is not used for mapping
        my ($reactome_id, undef, $CAS_id, $LIGAND_id, $RiceCyc_term)
            = split(/\t/, $line);
        next unless _has_value($reactome_id);

        # A single CAS, LIGAND, or RiceCyc identifier may appear in more
        # than one reactome entry, so every hash value is an array that
        # can hold 1...n reactome ids.

        # --CAS Hash Load--
        if (_has_value($CAS_id)) {
            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
        }

        # --LIGAND Hash Load--
        if (_has_value($LIGAND_id)) {
            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
        }

        # --RiceCyc Hash Load-- (upper-cased for case-insensitive matching)
        if (_has_value($RiceCyc_term)) {
            push @{$reactome_RiceCyc{uc $RiceCyc_term}}, $reactome_id;
        }
    }
    close($reactome_fh)
        or die "Cannot close '$reactome_path': $!\n";
    return;
}


# spit out some data to make sure you've read in the files correctly
#
# Optional argument: a true value restricts the Reactome hash dump to
# identifiers shared by more than one Reactome ID (the duplicates that
# need manual review). By default every identifier is shown.
# --------------------------------------------------------------------
sub test_inputs
{
    my ($dupes_only) = @_;

    # count in list context so we get the number of terms regardless of
    # what the accessors return in scalar context
    my $root_count = () = $ont->get_root_terms;
    my $all_count  = () = $ont->get_all_terms;
    my $leaf_count = () = $ont->get_leaf_terms;

    # output basic stats on chebi ontology
    print "\n[Ontology Stats]\n";
    print "read ontology ",$ont->name()," with ",
        $root_count," root terms, and ",
        $all_count," total terms, and ",
        $leaf_count," leaf terms\n";

    # all chebi terms in the ontology
    print "\n[CHEBI Term List from \$ont]\n";
    foreach my $term ($ont->get_all_terms) {
        my @synonyms = $term->get_synonyms;
        my @xrefs = $term->get_dbxrefs;

        print $term->identifier;
        print " |NAME| ";
        if (defined($term->name)) {
            print $term->name;
        }
        print " |SYNONYMS| ";
        print "$_," foreach @synonyms;
        print " |XREFS| ";
        print "$_" foreach @xrefs;
        foreach my $xref (@xrefs) {
            print $xref->primary_id;
        }
        print "\n\n";
    }

    # show reactome hashes; identifiers with more than one Reactome ID
    # are the important ones for manual reference
    print "\n[Reactome Hashes]\n";
    my @hash_dumps = (
        [ "CAS",     \%reactome_CAS ],
        [ "LIGAND",  \%reactome_LIGAND ],
        [ "RiceCyc", \%reactome_RiceCyc ],
    );
    foreach my $dump (@hash_dumps) {
        my ($label, $ids_for) = @{$dump};
        print "\n--$label Hash--\n";
        foreach my $key (sort keys %{$ids_for}) {
            my @reactome_ids = @{ $ids_for->{$key} };
            next if $dupes_only && @reactome_ids < 2;
            print "$key: @reactome_ids\n";
        }
    }
    return;
}


# map the chebi terms to the reactome entries
#
# Appends one tab-joined row (ReactomeID, CHEBI id, XREF_Type, XREF_ID)
# to @map_results for every Reactome ID whose RiceCyc term equals a
# CHEBI term name, compared case-insensitively.
# --------------------------------------------------------------------
sub perform_map
{
    # loop through each chebi term
    foreach my $term ($ont->get_all_terms)
    {
        my $term_name = $term->name;

        # a term without a name cannot match any RiceCyc key
        next if !defined($term_name) || $term_name eq "";

        # TODO: attempt CHEBI match on CAS ID

        # TODO: attempt CHEBI match on LIGAND ID

        # attempt CHEBI match on RiceCyc names; the hash keys were
        # upper-cased on load, so look up with the upper-cased name
        my $reactome_ids = $reactome_RiceCyc{uc $term_name};
        if (defined $reactome_ids) {
            my $chebi_id = $term->identifier;
            $chebi_id = "" unless defined $chebi_id;

            # one output row per Reactome ID sharing this RiceCyc term
            foreach my $reactome_id (@{$reactome_ids}) {
                push @map_results,
                    join("\t", $reactome_id, $chebi_id, "RiceCyc", $term_name);
            }
        }
        # TODO: when the name does not match, check the term synonyms
        # ($term->get_synonyms) against the RiceCyc names as well
    }
    return;
}


# put the results in the mapped output file
#
# Rows are appended to the output file; the header line is written only
# when the file is new or empty, so repeated runs do not scatter header
# lines through the data. @map_results itself is left untouched.
# --------------------------------------------------------------------
sub create_mapfile
{
    if (!@map_results) {
        print "\n\nSorry, there are no mapped results.\n\n";
        return;
    }

    # setup output file
    my $output_path = $data_path . $mapped_output_file;
    my $needs_header = !(-s $output_path); # missing or zero-length file
    open(my $output_fh, '>>', $output_path)
        or die "Cannot open '$output_path' for appending: $!\n";

    print {$output_fh} "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID\n"
        if $needs_header;

    # format results for file output
    print {$output_fh} "$_\n" foreach @map_results;

    # buffered write errors only surface at close, so check it
    close($output_fh)
        or die "Cannot close '$output_path': $!\n";
    return;
}


# --------------------------------------------------------------------
# main
# --------------------------------------------------------------------

init();
#test_inputs();
perform_map();
create_mapfile();

exit;