From 0991fd023cd6d6c90009dffcd58a00d8e0f27d2a Mon Sep 17 00:00:00 2001 From: preecej Date: Thu, 26 May 2011 04:07:22 +0000 Subject: [PATCH] Initial pseudo-coding of PS import script svn path=/; revision=101 --- .../semantic_wiki/PS_TransformForImport.pl | 418 +++--------------- 1 file changed, 55 insertions(+), 363 deletions(-) diff --git a/preecej/semantic_wiki/PS_TransformForImport.pl b/preecej/semantic_wiki/PS_TransformForImport.pl index b8a91bd..9878e5b 100644 --- a/preecej/semantic_wiki/PS_TransformForImport.pl +++ b/preecej/semantic_wiki/PS_TransformForImport.pl @@ -17,13 +17,13 @@ import file. =head1 USAGE -PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE +PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -=head1 USAGE +=head1 OPTIONS - -i|--input Name of input CSV or tab-del file. - -t|--type Specifies input type of file ('csv' or 'tab') - -o|--output Name of output file. + -i Name of input CSV or tab-del file. + -t Specifies input type of file ('csv' or 'tab') + -o Name of output file. =head1 DEPENDENCIES @@ -39,11 +39,17 @@ data. Source Data=Oryzabase Source Version=rel. 10 Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/ - Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll + Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/... -=head2 Data Header Format Example +=head2 Data Header Format Example (field separator may also be a tab) - Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID] + [Data] + Template=Gene Annotation + Gene Name,Gene Annotation,Gene Symbol,... + val1,val2,val3,... + " + " + " =head1 AUTHOR @@ -61,117 +67,56 @@ Justin Preece # general use strict; -use Cwd; qw(cwd); +use Cwd; +use Switch; +use Getopt::Std; # specific -use GO::Parser; +use XML::Smart; # --------------------------------------------------------------------------- # declarations # --------------------------------------------------------------------------- -# set paths to data files -my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/"; -my $chebi_obo_file = "../../chebi_v78.obo"; -my $reactome_file = "AraReferenceMolecules.txt"; -my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt"; -my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt"; -my $unique_mappings = "1.2_reactome_unique_mappings.txt"; -my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt"; - -# options -my $allow_obsolete_terms = 1; -my $allow_cyc = 0; -my $allow_synonyms = 0; - -my $ont; # chebi ontology +# read and set options +my %opts; # arg options +getopts('i:t:o:', \%opts); -my %reactome_CompoundName; # reactome Compound Name hash -my %reactome_CAS; # reactome CAS hash -my %reactome_LIGAND; # reactome LIGAND hash -my %reactome_Cyc; # reactome Cyc hash - -my @map_results = (); # successful mappings between chebi and reactome +# set paths to data files +my $path = getcwd() . "/"; +my $input_file = $path; +my $output_file = $path; +my $file_type; + +foreach my $key (%opts) { + my $value = $opts{$key}; + switch ($key) { + case "i" { $input_file = $input_file . $value; } + case "t" { $file_type = $value; } + case "o" { $output_file = $output_file . $value; } + } +} +print "$input_file\n"; +print "$output_file\n"; +print "$file_type\n"; # --------------------------------------------------------------------------- # functions # --------------------------------------------------------------------------- - -# setup chebi parser and reactome data -# --------------------------------------------------------------------------- -sub init +sub import_data { - # init ontology parser and ontology - my $parser = GO::Parser->new({handler=>'obj'}); - $parser->parse($data_path . $chebi_obo_file); - $ont = $parser->handler->graph; - - # read reactome file into 3 separate hashes - open(REACTOME_FILE,$data_path . $reactome_file); - - my $line = ; # skip the header - my $reactome_count = 0; + # open file - while () - { - $line = $_; - chomp $line; - $reactome_count++; - my @reactome_entry = split(/\t/, $line); # break up our tab-del line - - # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values - my $reactome_id = $reactome_entry[0]; - my $compound_name = uc $reactome_entry[1]; # for case-insensitivity - - # strips off "AN " and "A " indefinite articles - $compound_name =~ s/^ //; - $compound_name =~ s/^AN //; - $compound_name =~ s/^A //; - - my $CAS_id = $reactome_entry[2]; - my $LIGAND_id = $reactome_entry[3]; - my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity - - # There is a possibility that a single CAS, LIGAND, or Cyc - # identifier may appear in more than one reactome entry. This - # temp array allows each matched hash value to hold more than - # one ReactomeID, if necessary. - - # --CAS Hash Load-- - if ($CAS_id ne "-") { # keep those "-" placeholders out - # build the CAS hash; each value may hold 1...n reactome - # ids (as an array) - push @{$reactome_CAS{$CAS_id}}, $reactome_id; - } - - # similarly... - - # --LIGAND Hash Load-- - if ($LIGAND_id ne "-") { - push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id; - } - - # --CompoundName Hash Load-- - if ($compound_name ne "-") { - push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id; - } - - # --Cyc Hash Load-- - if ($allow_cyc) - { - if ($Cyc_term ne "-") { - push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id; - } - - } - } - close REACTOME_FILE; + # read in provenance header + + # read in data template + + # read in data fields (columns) + + # loop through data rows and add all fields to a named hash - print "\n[Reactome Stats]", - "\nTotal Reactome Entries: $reactome_count\n"; - } @@ -179,269 +124,17 @@ sub init # --------------------------------------------------------------------------- sub test_inputs { - # basic ontology info - print "[Node Count]: " . $ont->node_count . "\n"; - - # get all chebi terms in the ontology - my $terms = $ont->get_all_nodes; - - # output contents of parsed ontology - foreach my $term (@$terms) - { - print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n"; - - my $synonyms = $term->synonym_list; - foreach my $synonym (@$synonyms) { - print $synonym . "\n"; - } - - print "[XREFS]\n"; - my $xrefs = $term->dbxref_list; - foreach my $xref (@$xrefs) { - print $xref->xref_key . ",", - $xref->xref_keytype . ",", - $xref->xref_dbname . ",", - $xref->xref_desc . "\n"; - } - print "\n"; - } - - # show dupes in reactome hashes - give data to Pankaj; - # this is important b/c the duplicates may represent erroneous data in - # the Reactome dataset - my $k; my @v; - print "\n[Reactome Hashes - Dupes]\n"; - print "\n--CAS Hash--\n"; - for $k (keys %reactome_CAS) { - if (@{$reactome_CAS{$k}} > 1) { - print "$k: @{$reactome_CAS{$k}}\n"; - } - } - print "\n--LIGAND Hash--\n"; - for $k (keys %reactome_LIGAND) { - if (@{$reactome_LIGAND{$k}} > 1) { - print "$k: @{$reactome_LIGAND{$k}}\n"; - } - } - print "\n--CompoundName Hash--\n"; - for $k (keys %reactome_CompoundName) { - if (@{$reactome_CompoundName{$k}} > 1) { - print "$k: @{$reactome_CompoundName{$k}}\n"; - } - } - if ($allow_cyc) - { - print "\n--Cyc Hash--\n"; - for $k (keys %reactome_Cyc) { - if (@{$reactome_Cyc{$k}} > 1) { - print "$k: @{$reactome_Cyc{$k}}\n"; - } - } - } + # print out your hash } -# map the chebi terms to the reactome entries # --------------------------------------------------------------------------- -sub perform_map +sub write_xml { - my $chebi_obo_terms = $ont->get_all_nodes; - - # vars for mapping stats - my $attempted_mappings = 0; - my $successful_mappings = 0; - my $attempted_CAS_mappings = 0; - my $successful_CAS_mappings = 0; - my $attempted_LIGAND_mappings = 0; - my $successful_LIGAND_mappings = 0; - my $attempted_name_mappings = 0; - my $successful_name_mappings = 0; - my $attempted_synonym_mappings = 0; - my $successful_synonym_mappings = 0; + # loop through the hash and build annotation data and provenance xml docs + # simultaneously - # loop through each chebi term - foreach my $term (@$chebi_obo_terms) - { - # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete - # terms and whether to allow them - if (($term->acc =~ m/^CHEBI:/) - && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms))) - { - # attempt CHEBI match on CAS and LIGAND ID's - $attempted_mappings++; - my $xrefs = $term->dbxref_list; - foreach my $xref (@$xrefs) - { - $attempted_CAS_mappings++; - $attempted_LIGAND_mappings++; - - # temp-foo to skirt an interpolation problem - my $tmp_key = $xref->xref_key; - - if (defined($reactome_CAS{"$tmp_key"})) - { - foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}}) - { - $successful_CAS_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "CAS\t" . - $tmp_key); - } - } - - if (defined($reactome_LIGAND{"$tmp_key"})) - { - foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}}) - { - $successful_LIGAND_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "LIGAND\t" . - $tmp_key); - } - } - } - - # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)... - $attempted_name_mappings++; - - # more temp-foo to skirt said interpolation problem - my $tmp_name = uc $term->name; - - # reactome compound names... - if (defined($reactome_CompoundName{"$tmp_name"})) - { - foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}}) - { - $successful_name_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "CompoundTerm\t" . - $term->name); - } - } - # ...and synonyms (optional) - if ($allow_synonyms) - { - my $synonyms = $term->synonym_list; - foreach my $synonym (@$synonyms) - { - $attempted_synonym_mappings++; - - # yet more temp-foo to skirt interpolation problem - my $tmp_syn = "\U$synonym"; - - if (defined($reactome_CompoundName{$tmp_syn})) - { - foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}}) - { - $successful_synonym_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "CompoundSynonym\t" . - $synonym); - } - } - } - } - - # Cyc names... - if ($allow_cyc) - { - if (defined($reactome_Cyc{"$tmp_name"})) - { - foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}}) - { - $successful_name_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "CycTerm\t" . - $term->name); - } - } - # ...and synonyms (optional) - if ($allow_synonyms) - { - my $synonyms = $term->synonym_list; - foreach my $synonym (@$synonyms) - { - $attempted_synonym_mappings++; - - # yet more temp-foo to skirt interpolation problem - my $tmp_syn = "\U$synonym"; - - if (defined($reactome_Cyc{$tmp_syn})) - { - foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}}) - { - $successful_synonym_mappings++; - push (@map_results, "$tmp_reactome_id\t" . - $term->acc . "\t" . - "CycSynonym\t" . - $synonym); - } - } - } - } - } - } - } - # send up some stats on the mapping process - $successful_mappings = - + $successful_CAS_mappings - + $successful_LIGAND_mappings - + $successful_name_mappings - + $successful_synonym_mappings; - - print "\n[ChEBI Stats]", - "\nNodes in File: " . $ont->node_count, - "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings, - "\nTotal Successful Mappings: " . $successful_mappings . "\n"; - - print "\n[Mapping Breakdown by Type]", - "\nCAS (matches/attempts): ", - "$successful_CAS_mappings/$attempted_CAS_mappings ", - "(note: can include ChemIDplus and KEGG COMPUND db duplicates)", - "\nLIGAND: ", - "$successful_LIGAND_mappings/$attempted_LIGAND_mappings", - "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ", - "$successful_name_mappings/$attempted_name_mappings"; - if ($allow_synonyms) - { - print "\nTerm Synonyms: ", - "$successful_synonym_mappings/$attempted_synonym_mappings"; - } - print "\n\n"; -} - - -# put the results in the mapped output file -# --------------------------------------------------------------------------- -sub create_mapfile -{ - if (@map_results > 0) - { - # add a header to the results array - unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID"); - - # setup output file - open(OUTPUT_FILE,">" . $data_path . $mapped_output_file); - - #format results for file output - print OUTPUT_FILE "$_\n" foreach @map_results; - - close OUTPUT_FILE; - - # sort on all cols (keep the header at the top), remove exact dupes - system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file"; - - # also produce files listing unique Reactome entries having a match...and those without a match - system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings"; - system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file"; - } else { - print "\n\nSorry, there are no mapped results.\n\n"; - } + # write xml docs to a single ImportXML file } @@ -449,10 +142,9 @@ sub create_mapfile # main # --------------------------------------------------------------------------- -init; -#test_inputs; -perform_map; -create_mapfile; +import_data; +test_inputs; +write_xml; exit; -- 2.34.1