From 9968ea3fd5b58dec1d1550943acd8f3ceb2aeef1 Mon Sep 17 00:00:00 2001 From: preecej Date: Wed, 25 May 2011 19:54:01 +0000 Subject: [PATCH] svn path=/; revision=100 --- .../semantic_wiki/PS_TransformForImport.pl | 458 ++++++++++++++++++ 1 file changed, 458 insertions(+) create mode 100644 preecej/semantic_wiki/PS_TransformForImport.pl diff --git a/preecej/semantic_wiki/PS_TransformForImport.pl b/preecej/semantic_wiki/PS_TransformForImport.pl new file mode 100644 index 0000000..b8a91bd --- /dev/null +++ b/preecej/semantic_wiki/PS_TransformForImport.pl @@ -0,0 +1,458 @@ +#!/usr/bin/perl -w + +=head1 NAME + +Plant Semantics Import Transformation Script + +=head1 VERSION + +0.1 (svn r100) + +=head1 DESCRIPTION + +Transform external gene annotation data into an XML document readable +by the MediaWiki extension DataTransfer (Special:ImportXML) feature. +Also generates appropriate provenance of data based on header of +import file. + +=head1 USAGE + +PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE + +=head1 USAGE + + -i|--input Name of input CSV or tab-del file. + -t|--type Specifies input type of file ('csv' or 'tab') + -o|--output Name of output file. + +=head1 DEPENDENCIES + +Requires that the input files contain two headers: the first will hold +the provenance information associated with the imported data, and the +second will specify the template and field names for the annotation +data. + +=head2 Provenance Header Format Example + + [Provenance] + Source Date Time Stamp=Apr 2 2008 + Source Data=Oryzabase + Source Version=rel. 10 + Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/ + Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll + +=head2 Data Header Format Example + + Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID] + +=head1 AUTHOR + +Justin Preece + Faculty Research Assistant, Bioinformatics + Jaiswal Lab, Botany & Plant Pathology + Oregon State University + L + +=cut + +# --------------------------------------------------------------------------- +# modules +# --------------------------------------------------------------------------- + +# general +use strict; +use Cwd; qw(cwd); + +# specific +use GO::Parser; + +# --------------------------------------------------------------------------- +# declarations +# --------------------------------------------------------------------------- + +# set paths to data files +my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/"; +my $chebi_obo_file = "../../chebi_v78.obo"; +my $reactome_file = "AraReferenceMolecules.txt"; +my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt"; +my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt"; +my $unique_mappings = "1.2_reactome_unique_mappings.txt"; +my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt"; + +# options +my $allow_obsolete_terms = 1; +my $allow_cyc = 0; +my $allow_synonyms = 0; + +my $ont; # chebi ontology + +my %reactome_CompoundName; # reactome Compound Name hash +my %reactome_CAS; # reactome CAS hash +my %reactome_LIGAND; # reactome LIGAND hash +my %reactome_Cyc; # reactome Cyc hash + +my @map_results = (); # successful mappings between chebi and reactome + + +# --------------------------------------------------------------------------- +# functions +# --------------------------------------------------------------------------- + + +# setup chebi parser and reactome data +# --------------------------------------------------------------------------- +sub init +{ + # init ontology parser and ontology + my $parser = GO::Parser->new({handler=>'obj'}); + $parser->parse($data_path . $chebi_obo_file); + $ont = $parser->handler->graph; + + # read reactome file into 3 separate hashes + open(REACTOME_FILE,$data_path . $reactome_file); + + my $line = ; # skip the header + my $reactome_count = 0; + + while () + { + $line = $_; + chomp $line; + $reactome_count++; + my @reactome_entry = split(/\t/, $line); # break up our tab-del line + + # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values + my $reactome_id = $reactome_entry[0]; + my $compound_name = uc $reactome_entry[1]; # for case-insensitivity + + # strips off "AN " and "A " indefinite articles + $compound_name =~ s/^ //; + $compound_name =~ s/^AN //; + $compound_name =~ s/^A //; + + my $CAS_id = $reactome_entry[2]; + my $LIGAND_id = $reactome_entry[3]; + my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity + + # There is a possibility that a single CAS, LIGAND, or Cyc + # identifier may appear in more than one reactome entry. This + # temp array allows each matched hash value to hold more than + # one ReactomeID, if necessary. + + # --CAS Hash Load-- + if ($CAS_id ne "-") { # keep those "-" placeholders out + # build the CAS hash; each value may hold 1...n reactome + # ids (as an array) + push @{$reactome_CAS{$CAS_id}}, $reactome_id; + } + + # similarly... + + # --LIGAND Hash Load-- + if ($LIGAND_id ne "-") { + push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id; + } + + # --CompoundName Hash Load-- + if ($compound_name ne "-") { + push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id; + } + + # --Cyc Hash Load-- + if ($allow_cyc) + { + if ($Cyc_term ne "-") { + push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id; + } + + } + } + close REACTOME_FILE; + + print "\n[Reactome Stats]", + "\nTotal Reactome Entries: $reactome_count\n"; + +} + + +# spit out some data to make sure you've read in the files correctly +# --------------------------------------------------------------------------- +sub test_inputs +{ + # basic ontology info + print "[Node Count]: " . $ont->node_count . "\n"; + + # get all chebi terms in the ontology + my $terms = $ont->get_all_nodes; + + # output contents of parsed ontology + foreach my $term (@$terms) + { + print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n"; + + my $synonyms = $term->synonym_list; + foreach my $synonym (@$synonyms) { + print $synonym . "\n"; + } + + print "[XREFS]\n"; + my $xrefs = $term->dbxref_list; + foreach my $xref (@$xrefs) { + print $xref->xref_key . ",", + $xref->xref_keytype . ",", + $xref->xref_dbname . ",", + $xref->xref_desc . "\n"; + } + print "\n"; + } + + # show dupes in reactome hashes - give data to Pankaj; + # this is important b/c the duplicates may represent erroneous data in + # the Reactome dataset + my $k; my @v; + print "\n[Reactome Hashes - Dupes]\n"; + print "\n--CAS Hash--\n"; + for $k (keys %reactome_CAS) { + if (@{$reactome_CAS{$k}} > 1) { + print "$k: @{$reactome_CAS{$k}}\n"; + } + } + print "\n--LIGAND Hash--\n"; + for $k (keys %reactome_LIGAND) { + if (@{$reactome_LIGAND{$k}} > 1) { + print "$k: @{$reactome_LIGAND{$k}}\n"; + } + } + print "\n--CompoundName Hash--\n"; + for $k (keys %reactome_CompoundName) { + if (@{$reactome_CompoundName{$k}} > 1) { + print "$k: @{$reactome_CompoundName{$k}}\n"; + } + } + if ($allow_cyc) + { + print "\n--Cyc Hash--\n"; + for $k (keys %reactome_Cyc) { + if (@{$reactome_Cyc{$k}} > 1) { + print "$k: @{$reactome_Cyc{$k}}\n"; + } + } + } +} + + +# map the chebi terms to the reactome entries +# --------------------------------------------------------------------------- +sub perform_map +{ + my $chebi_obo_terms = $ont->get_all_nodes; + + # vars for mapping stats + my $attempted_mappings = 0; + my $successful_mappings = 0; + my $attempted_CAS_mappings = 0; + my $successful_CAS_mappings = 0; + my $attempted_LIGAND_mappings = 0; + my $successful_LIGAND_mappings = 0; + my $attempted_name_mappings = 0; + my $successful_name_mappings = 0; + my $attempted_synonym_mappings = 0; + my $successful_synonym_mappings = 0; + + # loop through each chebi term + foreach my $term (@$chebi_obo_terms) + { + # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete + # terms and whether to allow them + if (($term->acc =~ m/^CHEBI:/) + && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms))) + { + # attempt CHEBI match on CAS and LIGAND ID's + $attempted_mappings++; + my $xrefs = $term->dbxref_list; + foreach my $xref (@$xrefs) + { + $attempted_CAS_mappings++; + $attempted_LIGAND_mappings++; + + # temp-foo to skirt an interpolation problem + my $tmp_key = $xref->xref_key; + + if (defined($reactome_CAS{"$tmp_key"})) + { + foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}}) + { + $successful_CAS_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CAS\t" . + $tmp_key); + } + } + + if (defined($reactome_LIGAND{"$tmp_key"})) + { + foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}}) + { + $successful_LIGAND_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "LIGAND\t" . + $tmp_key); + } + } + } + + # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)... + $attempted_name_mappings++; + + # more temp-foo to skirt said interpolation problem + my $tmp_name = uc $term->name; + + # reactome compound names... + if (defined($reactome_CompoundName{"$tmp_name"})) + { + foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}}) + { + $successful_name_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CompoundTerm\t" . + $term->name); + } + } + # ...and synonyms (optional) + if ($allow_synonyms) + { + my $synonyms = $term->synonym_list; + foreach my $synonym (@$synonyms) + { + $attempted_synonym_mappings++; + + # yet more temp-foo to skirt interpolation problem + my $tmp_syn = "\U$synonym"; + + if (defined($reactome_CompoundName{$tmp_syn})) + { + foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}}) + { + $successful_synonym_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CompoundSynonym\t" . + $synonym); + } + } + } + } + + # Cyc names... + if ($allow_cyc) + { + if (defined($reactome_Cyc{"$tmp_name"})) + { + foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}}) + { + $successful_name_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CycTerm\t" . + $term->name); + } + } + # ...and synonyms (optional) + if ($allow_synonyms) + { + my $synonyms = $term->synonym_list; + foreach my $synonym (@$synonyms) + { + $attempted_synonym_mappings++; + + # yet more temp-foo to skirt interpolation problem + my $tmp_syn = "\U$synonym"; + + if (defined($reactome_Cyc{$tmp_syn})) + { + foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}}) + { + $successful_synonym_mappings++; + push (@map_results, "$tmp_reactome_id\t" . + $term->acc . "\t" . + "CycSynonym\t" . + $synonym); + } + } + } + } + } + } + } + # send up some stats on the mapping process + $successful_mappings = + + $successful_CAS_mappings + + $successful_LIGAND_mappings + + $successful_name_mappings + + $successful_synonym_mappings; + + print "\n[ChEBI Stats]", + "\nNodes in File: " . $ont->node_count, + "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings, + "\nTotal Successful Mappings: " . $successful_mappings . "\n"; + + print "\n[Mapping Breakdown by Type]", + "\nCAS (matches/attempts): ", + "$successful_CAS_mappings/$attempted_CAS_mappings ", + "(note: can include ChemIDplus and KEGG COMPUND db duplicates)", + "\nLIGAND: ", + "$successful_LIGAND_mappings/$attempted_LIGAND_mappings", + "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ", + "$successful_name_mappings/$attempted_name_mappings"; + if ($allow_synonyms) + { + print "\nTerm Synonyms: ", + "$successful_synonym_mappings/$attempted_synonym_mappings"; + } + print "\n\n"; +} + + +# put the results in the mapped output file +# --------------------------------------------------------------------------- +sub create_mapfile +{ + if (@map_results > 0) + { + # add a header to the results array + unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID"); + + # setup output file + open(OUTPUT_FILE,">" . $data_path . $mapped_output_file); + + #format results for file output + print OUTPUT_FILE "$_\n" foreach @map_results; + + close OUTPUT_FILE; + + # sort on all cols (keep the header at the top), remove exact dupes + system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file"; + + # also produce files listing unique Reactome entries having a match...and those without a match + system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings"; + system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file"; + } else { + print "\n\nSorry, there are no mapped results.\n\n"; + } +} + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + +init; +#test_inputs; +perform_map; +create_mapfile; + +exit; + -- 2.34.1