=head1 USAGE
-PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE
+PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE
-=head1 USAGE
+=head1 OPTIONS
- -i|--input Name of input CSV or tab-del file.
- -t|--type Specifies input type of file ('csv' or 'tab')
- -o|--output Name of output file.
+ -i Name of input CSV or tab-del file.
+ -t Specifies input type of file ('csv' or 'tab')
+ -o Name of output file.
=head1 DEPENDENCIES
Source Data=Oryzabase
Source Version=rel. 10
Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
- Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll
+ Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/...
-=head2 Data Header Format Example
+=head2 Data Header Format Example (field separator may also be a tab)
- Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID]
+ [Data]
+ Template=Gene Annotation
+ Gene Name,Gene Annotation,Gene Symbol,...
+ val1,val2,val3,...
+ "
+ "
+ "
=head1 AUTHOR
# general
use strict;
-use Cwd; qw(cwd);
+use Cwd;
+use Switch;
+use Getopt::Std;
# specific
-use GO::Parser;
+use XML::Smart;
# ---------------------------------------------------------------------------
# declarations
# ---------------------------------------------------------------------------
-# set paths to data files
-my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/";
-my $chebi_obo_file = "../../chebi_v78.obo";
-my $reactome_file = "AraReferenceMolecules.txt";
-my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
-my $unique_mappings = "1.2_reactome_unique_mappings.txt";
-my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
-
-# options
-my $allow_obsolete_terms = 1;
-my $allow_cyc = 0;
-my $allow_synonyms = 0;
-
-my $ont; # chebi ontology
+# read and set options
+my %opts; # arg options
+getopts('i:t:o:', \%opts);
-my %reactome_CompoundName; # reactome Compound Name hash
-my %reactome_CAS; # reactome CAS hash
-my %reactome_LIGAND; # reactome LIGAND hash
-my %reactome_Cyc; # reactome Cyc hash
-
-my @map_results = (); # successful mappings between chebi and reactome
+# set paths to data files
+my $path = getcwd() . "/";
+my $input_file = $path;
+my $output_file = $path;
+my $file_type;
+
+foreach my $key (%opts) {
+ my $value = $opts{$key};
+ switch ($key) {
+ case "i" { $input_file = $input_file . $value; }
+ case "t" { $file_type = $value; }
+ case "o" { $output_file = $output_file . $value; }
+ }
+}
+print "$input_file\n";
+print "$output_file\n";
+print "$file_type\n";
# ---------------------------------------------------------------------------
# functions
# ---------------------------------------------------------------------------
-
-# setup chebi parser and reactome data
-# ---------------------------------------------------------------------------
-sub init
+sub import_data
{
- # init ontology parser and ontology
- my $parser = GO::Parser->new({handler=>'obj'});
- $parser->parse($data_path . $chebi_obo_file);
- $ont = $parser->handler->graph;
-
- # read reactome file into 3 separate hashes
- open(REACTOME_FILE,$data_path . $reactome_file);
-
- my $line = <REACTOME_FILE>; # skip the header
- my $reactome_count = 0;
+ # open file
- while (<REACTOME_FILE>)
- {
- $line = $_;
- chomp $line;
- $reactome_count++;
- my @reactome_entry = split(/\t/, $line); # break up our tab-del line
-
- # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
- my $reactome_id = $reactome_entry[0];
- my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
-
- # strips off "AN " and "A " indefinite articles
- $compound_name =~ s/^ //;
- $compound_name =~ s/^AN //;
- $compound_name =~ s/^A //;
-
- my $CAS_id = $reactome_entry[2];
- my $LIGAND_id = $reactome_entry[3];
- my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
-
- # There is a possibility that a single CAS, LIGAND, or Cyc
- # identifier may appear in more than one reactome entry. This
- # temp array allows each matched hash value to hold more than
- # one ReactomeID, if necessary.
-
- # --CAS Hash Load--
- if ($CAS_id ne "-") { # keep those "-" placeholders out
- # build the CAS hash; each value may hold 1...n reactome
- # ids (as an array)
- push @{$reactome_CAS{$CAS_id}}, $reactome_id;
- }
-
- # similarly...
-
- # --LIGAND Hash Load--
- if ($LIGAND_id ne "-") {
- push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
- }
-
- # --CompoundName Hash Load--
- if ($compound_name ne "-") {
- push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
- }
-
- # --Cyc Hash Load--
- if ($allow_cyc)
- {
- if ($Cyc_term ne "-") {
- push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
- }
-
- }
- }
- close REACTOME_FILE;
+ # read in provenance header
+
+ # read in data template
+
+ # read in data fields (columns)
+
+ # loop through data rows and add all fields to a named hash
- print "\n[Reactome Stats]",
- "\nTotal Reactome Entries: $reactome_count\n";
-
}
# ---------------------------------------------------------------------------
sub test_inputs
{
- # basic ontology info
- print "[Node Count]: " . $ont->node_count . "\n";
-
- # get all chebi terms in the ontology
- my $terms = $ont->get_all_nodes;
-
- # output contents of parsed ontology
- foreach my $term (@$terms)
- {
- print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
-
- my $synonyms = $term->synonym_list;
- foreach my $synonym (@$synonyms) {
- print $synonym . "\n";
- }
-
- print "[XREFS]\n";
- my $xrefs = $term->dbxref_list;
- foreach my $xref (@$xrefs) {
- print $xref->xref_key . ",",
- $xref->xref_keytype . ",",
- $xref->xref_dbname . ",",
- $xref->xref_desc . "\n";
- }
- print "\n";
- }
-
- # show dupes in reactome hashes - give data to Pankaj;
- # this is important b/c the duplicates may represent erroneous data in
- # the Reactome dataset
- my $k; my @v;
- print "\n[Reactome Hashes - Dupes]\n";
- print "\n--CAS Hash--\n";
- for $k (keys %reactome_CAS) {
- if (@{$reactome_CAS{$k}} > 1) {
- print "$k: @{$reactome_CAS{$k}}\n";
- }
- }
- print "\n--LIGAND Hash--\n";
- for $k (keys %reactome_LIGAND) {
- if (@{$reactome_LIGAND{$k}} > 1) {
- print "$k: @{$reactome_LIGAND{$k}}\n";
- }
- }
- print "\n--CompoundName Hash--\n";
- for $k (keys %reactome_CompoundName) {
- if (@{$reactome_CompoundName{$k}} > 1) {
- print "$k: @{$reactome_CompoundName{$k}}\n";
- }
- }
- if ($allow_cyc)
- {
- print "\n--Cyc Hash--\n";
- for $k (keys %reactome_Cyc) {
- if (@{$reactome_Cyc{$k}} > 1) {
- print "$k: @{$reactome_Cyc{$k}}\n";
- }
- }
- }
+ # print out your hash
}
-# map the chebi terms to the reactome entries
# ---------------------------------------------------------------------------
-sub perform_map
+sub write_xml
{
- my $chebi_obo_terms = $ont->get_all_nodes;
-
- # vars for mapping stats
- my $attempted_mappings = 0;
- my $successful_mappings = 0;
- my $attempted_CAS_mappings = 0;
- my $successful_CAS_mappings = 0;
- my $attempted_LIGAND_mappings = 0;
- my $successful_LIGAND_mappings = 0;
- my $attempted_name_mappings = 0;
- my $successful_name_mappings = 0;
- my $attempted_synonym_mappings = 0;
- my $successful_synonym_mappings = 0;
+ # loop through the hash and build annotation data and provenance xml docs
+ # simultaneously
- # loop through each chebi term
- foreach my $term (@$chebi_obo_terms)
- {
- # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
- # terms and whether to allow them
- if (($term->acc =~ m/^CHEBI:/)
- && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
- {
- # attempt CHEBI match on CAS and LIGAND ID's
- $attempted_mappings++;
- my $xrefs = $term->dbxref_list;
- foreach my $xref (@$xrefs)
- {
- $attempted_CAS_mappings++;
- $attempted_LIGAND_mappings++;
-
- # temp-foo to skirt an interpolation problem
- my $tmp_key = $xref->xref_key;
-
- if (defined($reactome_CAS{"$tmp_key"}))
- {
- foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
- {
- $successful_CAS_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "CAS\t" .
- $tmp_key);
- }
- }
-
- if (defined($reactome_LIGAND{"$tmp_key"}))
- {
- foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
- {
- $successful_LIGAND_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "LIGAND\t" .
- $tmp_key);
- }
- }
- }
-
- # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
- $attempted_name_mappings++;
-
- # more temp-foo to skirt said interpolation problem
- my $tmp_name = uc $term->name;
-
- # reactome compound names...
- if (defined($reactome_CompoundName{"$tmp_name"}))
- {
- foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
- {
- $successful_name_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "CompoundTerm\t" .
- $term->name);
- }
- }
- # ...and synonyms (optional)
- if ($allow_synonyms)
- {
- my $synonyms = $term->synonym_list;
- foreach my $synonym (@$synonyms)
- {
- $attempted_synonym_mappings++;
-
- # yet more temp-foo to skirt interpolation problem
- my $tmp_syn = "\U$synonym";
-
- if (defined($reactome_CompoundName{$tmp_syn}))
- {
- foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
- {
- $successful_synonym_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "CompoundSynonym\t" .
- $synonym);
- }
- }
- }
- }
-
- # Cyc names...
- if ($allow_cyc)
- {
- if (defined($reactome_Cyc{"$tmp_name"}))
- {
- foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
- {
- $successful_name_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "CycTerm\t" .
- $term->name);
- }
- }
- # ...and synonyms (optional)
- if ($allow_synonyms)
- {
- my $synonyms = $term->synonym_list;
- foreach my $synonym (@$synonyms)
- {
- $attempted_synonym_mappings++;
-
- # yet more temp-foo to skirt interpolation problem
- my $tmp_syn = "\U$synonym";
-
- if (defined($reactome_Cyc{$tmp_syn}))
- {
- foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
- {
- $successful_synonym_mappings++;
- push (@map_results, "$tmp_reactome_id\t" .
- $term->acc . "\t" .
- "CycSynonym\t" .
- $synonym);
- }
- }
- }
- }
- }
- }
- }
- # send up some stats on the mapping process
- $successful_mappings =
- + $successful_CAS_mappings
- + $successful_LIGAND_mappings
- + $successful_name_mappings
- + $successful_synonym_mappings;
-
- print "\n[ChEBI Stats]",
- "\nNodes in File: " . $ont->node_count,
- "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
- "\nTotal Successful Mappings: " . $successful_mappings . "\n";
-
- print "\n[Mapping Breakdown by Type]",
- "\nCAS (matches/attempts): ",
- "$successful_CAS_mappings/$attempted_CAS_mappings ",
- "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
- "\nLIGAND: ",
- "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
- "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
- "$successful_name_mappings/$attempted_name_mappings";
- if ($allow_synonyms)
- {
- print "\nTerm Synonyms: ",
- "$successful_synonym_mappings/$attempted_synonym_mappings";
- }
- print "\n\n";
-}
-
-
-# put the results in the mapped output file
-# ---------------------------------------------------------------------------
-sub create_mapfile
-{
- if (@map_results > 0)
- {
- # add a header to the results array
- unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
-
- # setup output file
- open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
-
- #format results for file output
- print OUTPUT_FILE "$_\n" foreach @map_results;
-
- close OUTPUT_FILE;
-
- # sort on all cols (keep the header at the top), remove exact dupes
- system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
-
- # also produce files listing unique Reactome entries having a match...and those without a match
- system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
- system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
- } else {
- print "\n\nSorry, there are no mapped results.\n\n";
- }
+ # write xml docs to a single ImportXML file
}
# main
# ---------------------------------------------------------------------------
-init;
-#test_inputs;
-perform_map;
-create_mapfile;
+import_data;
+test_inputs;
+write_xml;
exit;