From 0991fd023cd6d6c90009dffcd58a00d8e0f27d2a Mon Sep 17 00:00:00 2001
From: preecej <preecej@localhost>
Date: Thu, 26 May 2011 04:07:22 +0000
Subject: [PATCH] Initial pseudo-coding of PS import script

svn path=/; revision=101
---
 .../semantic_wiki/PS_TransformForImport.pl    | 418 +++---------------
 1 file changed, 55 insertions(+), 363 deletions(-)

diff --git a/preecej/semantic_wiki/PS_TransformForImport.pl b/preecej/semantic_wiki/PS_TransformForImport.pl
index b8a91bd..9878e5b 100644
--- a/preecej/semantic_wiki/PS_TransformForImport.pl
+++ b/preecej/semantic_wiki/PS_TransformForImport.pl
@@ -17,13 +17,13 @@ import file.
 
 =head1 USAGE
 
-PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE
+PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE
 
-=head1 USAGE
+=head1 OPTIONS
 
-    -i|--input  Name of input CSV or tab-del file.
-    -t|--type   Specifies input type of file ('csv' or 'tab')
-    -o|--output Name of output file.
+    -i  Name of input CSV or tab-del file.
+    -t  Specifies input type of file ('csv' or 'tab')
+    -o  Name of output file.
 
 =head1 DEPENDENCIES
 
@@ -39,11 +39,17 @@ data.
     Source Data=Oryzabase
     Source Version=rel. 10
     Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
-    Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll
+    Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/...
 
-=head2 Data Header Format Example
+=head2 Data Header Format Example (field separator may also be a tab)
 
-    Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID]
+    [Data]
+    Template=Gene Annotation
+    Gene Name,Gene Annotation,Gene Symbol,...
+    val1,val2,val3,...
+    "
+    "
+    "
 
 =head1 AUTHOR
 
@@ -61,117 +67,56 @@ Justin Preece
 
 # general
 use strict;
-use Cwd; qw(cwd);
+use Cwd;
+use Switch;
+use Getopt::Std;
 
 # specific
-use GO::Parser;
+use XML::Smart;
 
 # ---------------------------------------------------------------------------
 # declarations
 # ---------------------------------------------------------------------------
 
-# set paths to data files
-my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/";
-my $chebi_obo_file = "../../chebi_v78.obo";
-my $reactome_file = "AraReferenceMolecules.txt";
-my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
-my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
-my $unique_mappings = "1.2_reactome_unique_mappings.txt";
-my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
-
-# options
-my $allow_obsolete_terms = 1;
-my $allow_cyc = 0;
-my $allow_synonyms = 0;
-
-my $ont; # chebi ontology
+# read and set options
+my %opts; # arg options
+getopts('i:t:o:', \%opts);
 
-my %reactome_CompoundName; # reactome Compound Name hash
-my %reactome_CAS; # reactome CAS hash
-my %reactome_LIGAND; # reactome LIGAND hash
-my %reactome_Cyc; # reactome Cyc hash
-
-my @map_results = (); # successful mappings between chebi and reactome
+# set paths to data files
+my $path = getcwd() . "/";
+my $input_file = $path;
+my $output_file = $path;
+my $file_type;
+
+foreach my $key (%opts) { 
+    my $value = $opts{$key};
+    switch ($key) {
+        case "i" { $input_file = $input_file . $value; }
+        case "t" { $file_type = $value; }
+        case "o" { $output_file = $output_file . $value; }
+    }
+}
 
+print "$input_file\n";
+print "$output_file\n";
+print "$file_type\n";
 
 # ---------------------------------------------------------------------------
 # functions
 # ---------------------------------------------------------------------------
 
-
-# setup chebi parser and reactome data
-# ---------------------------------------------------------------------------
-sub init
+sub import_data
 {
-    # init ontology parser and ontology
-    my $parser = GO::Parser->new({handler=>'obj'});
-    $parser->parse($data_path . $chebi_obo_file);
-    $ont = $parser->handler->graph;
-      
-    # read reactome file into 3 separate hashes
-    open(REACTOME_FILE,$data_path . $reactome_file);
-
-    my $line = <REACTOME_FILE>; # skip the header
-    my $reactome_count = 0;
+    # open file
     
-    while (<REACTOME_FILE>)
-    {
-        $line = $_;
-        chomp $line;
-        $reactome_count++;
-        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
-
-        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
-        my $reactome_id = $reactome_entry[0];
-        my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
-
-        # strips off "AN " and "A " indefinite articles
-        $compound_name =~ s/^ //;
-        $compound_name =~ s/^AN //;
-        $compound_name =~ s/^A //;
-
-        my $CAS_id = $reactome_entry[2];
-        my $LIGAND_id = $reactome_entry[3];
-        my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
-
-        # There is a possibility that a single CAS, LIGAND, or Cyc
-        # identifier may appear in more than one reactome entry. This
-        # temp array allows each matched hash value to hold more than 
-        # one ReactomeID, if necessary.
-        
-        # --CAS Hash Load--
-        if ($CAS_id ne "-") { # keep those "-" placeholders out
-            # build the CAS hash; each value may hold 1...n reactome
-            # ids (as an array)
-            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
-        }
-
-        # similarly...
-
-        # --LIGAND Hash Load--
-        if ($LIGAND_id ne "-") {
-            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
-        }
-
-        # --CompoundName Hash Load--
-        if ($compound_name ne "-") {
-            push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
-        }
-        
-        # --Cyc Hash Load--
-        if ($allow_cyc)
-        {
-            if ($Cyc_term ne "-") {
-                push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
-            }
-
-        }
-    }
-    close REACTOME_FILE;
+    # read in provenance header
+    
+    # read in data template
+    
+    # read in data fields (columns)
+    
+    # loop through data rows and add all fields to a named hash
     
-    print "\n[Reactome Stats]",
-        "\nTotal Reactome Entries: $reactome_count\n";
-
 }
 
 
@@ -179,269 +124,17 @@ sub init
 # ---------------------------------------------------------------------------
 sub test_inputs
 {
-    # basic ontology info
-    print "[Node Count]: " . $ont->node_count . "\n"; 
-
-    # get all chebi terms in the ontology
-	my $terms = $ont->get_all_nodes;
-	
-	# output contents of parsed ontology
-	foreach my $term (@$terms)
-	{
-		print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
-
-		my $synonyms = $term->synonym_list;
-		foreach my $synonym (@$synonyms) {
-			print $synonym . "\n";
-		}
-		
-		print "[XREFS]\n";
-		my $xrefs = $term->dbxref_list;
-		foreach my $xref (@$xrefs) {
-			print $xref->xref_key . ",",
-				$xref->xref_keytype . ",",
-				$xref->xref_dbname . ",",
-				$xref->xref_desc . "\n";
-		}
-		print "\n";
-	}
-
-    # show dupes in reactome hashes - give data to Pankaj;
-    # this is important b/c the duplicates may represent erroneous data in
-    # the Reactome dataset
-    my $k; my @v;
-    print "\n[Reactome Hashes - Dupes]\n";
-    print "\n--CAS Hash--\n";
-    for $k (keys %reactome_CAS) {
-        if (@{$reactome_CAS{$k}} > 1) {
-            print "$k: @{$reactome_CAS{$k}}\n";
-        }
-    }
-    print "\n--LIGAND Hash--\n";
-    for $k (keys %reactome_LIGAND) {
-        if (@{$reactome_LIGAND{$k}} > 1) {
-            print "$k: @{$reactome_LIGAND{$k}}\n";
-        }
-    }
-    print "\n--CompoundName Hash--\n";
-    for $k (keys %reactome_CompoundName) {
-        if (@{$reactome_CompoundName{$k}} > 1) {
-            print "$k: @{$reactome_CompoundName{$k}}\n";
-        }
-    }
-    if ($allow_cyc)
-    {
-        print "\n--Cyc Hash--\n";
-        for $k (keys %reactome_Cyc) {
-            if (@{$reactome_Cyc{$k}} > 1) {
-                print "$k: @{$reactome_Cyc{$k}}\n";
-            }
-        }
-    }
+    # print out your hash
 }
 
 
-# map the chebi terms to the reactome entries
 # ---------------------------------------------------------------------------
-sub perform_map
+sub write_xml
 {
-    my $chebi_obo_terms = $ont->get_all_nodes;
-    
-    # vars for mapping stats
-    my $attempted_mappings = 0;
-    my $successful_mappings = 0;
-    my $attempted_CAS_mappings = 0;
-    my $successful_CAS_mappings = 0;
-    my $attempted_LIGAND_mappings = 0;
-    my $successful_LIGAND_mappings = 0;
-    my $attempted_name_mappings = 0;
-    my $successful_name_mappings = 0;
-    my $attempted_synonym_mappings = 0;
-    my $successful_synonym_mappings = 0;
+    # loop through the hash and build annotation data and provenance xml docs
+    # simultaneously
     
-    # loop through each chebi term
-	foreach my $term (@$chebi_obo_terms)
-	{
-	    # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
-	    # terms and whether to allow them
-        if (($term->acc =~ m/^CHEBI:/)
-            && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
-        {
-            # attempt CHEBI match on CAS and LIGAND ID's
-            $attempted_mappings++;
-            my $xrefs = $term->dbxref_list;
-            foreach my $xref (@$xrefs)
-            {
-                $attempted_CAS_mappings++;
-                $attempted_LIGAND_mappings++;
-
-                # temp-foo to skirt an interpolation problem
-                my $tmp_key = $xref->xref_key;
-
-                if (defined($reactome_CAS{"$tmp_key"}))
-                {
-                    foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
-                    {
-                        $successful_CAS_mappings++;
-                        push (@map_results, "$tmp_reactome_id\t" .
-                            $term->acc . "\t" . 
-                            "CAS\t" .
-                            $tmp_key);
-                    }
-                }
-
-                if (defined($reactome_LIGAND{"$tmp_key"}))
-                {
-                    foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
-                    {
-                        $successful_LIGAND_mappings++;
-                        push (@map_results, "$tmp_reactome_id\t" .
-                            $term->acc . "\t" . 
-                            "LIGAND\t" .
-                            $tmp_key);
-                    }
-                }
-            }
-                
-            # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
-            $attempted_name_mappings++;
-
-            # more temp-foo to skirt said interpolation problem 
-            my $tmp_name = uc $term->name; 
-
-            # reactome compound names...
-            if (defined($reactome_CompoundName{"$tmp_name"}))
-            {
-                foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
-                {
-                    $successful_name_mappings++;
-                    push (@map_results, "$tmp_reactome_id\t" .
-                        $term->acc . "\t" . 
-                        "CompoundTerm\t" .
-                        $term->name);
-                }
-            }
-            # ...and synonyms (optional)
-            if ($allow_synonyms)
-            {
-                my $synonyms = $term->synonym_list;
-                foreach my $synonym (@$synonyms)
-                {
-                    $attempted_synonym_mappings++;
-
-                    # yet more temp-foo to skirt interpolation problem 
-                    my $tmp_syn = "\U$synonym"; 
-
-                    if (defined($reactome_CompoundName{$tmp_syn}))
-                    {
-                        foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
-                        {
-                            $successful_synonym_mappings++;
-                            push (@map_results, "$tmp_reactome_id\t" .
-                                $term->acc . "\t" .
-                                "CompoundSynonym\t" .
-                                $synonym);
-                        }
-                    }
-                }
-            }
-
-            # Cyc names...
-            if ($allow_cyc)
-            {
-                if (defined($reactome_Cyc{"$tmp_name"}))
-                {
-                    foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
-                    {
-                        $successful_name_mappings++;
-                        push (@map_results, "$tmp_reactome_id\t" .
-                            $term->acc . "\t" . 
-                            "CycTerm\t" .
-                            $term->name);
-                    }
-                }
-                # ...and synonyms (optional)
-                if ($allow_synonyms)
-                {
-                    my $synonyms = $term->synonym_list;
-                    foreach my $synonym (@$synonyms)
-                    {
-                        $attempted_synonym_mappings++;
-    
-                        # yet more temp-foo to skirt interpolation problem 
-                        my $tmp_syn = "\U$synonym"; 
-    
-                        if (defined($reactome_Cyc{$tmp_syn}))
-                        {
-                            foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
-                            {
-                                $successful_synonym_mappings++;
-                                push (@map_results, "$tmp_reactome_id\t" .
-                                    $term->acc . "\t" .
-                                    "CycSynonym\t" .
-                                    $synonym);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-    # send up some stats on the mapping process
-    $successful_mappings =
-        + $successful_CAS_mappings
-        + $successful_LIGAND_mappings
-        + $successful_name_mappings
-        + $successful_synonym_mappings;
-        
-    print "\n[ChEBI Stats]",
-        "\nNodes in File: " . $ont->node_count,
-        "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
-        "\nTotal Successful Mappings: " . $successful_mappings . "\n";
-
-    print "\n[Mapping Breakdown by Type]",
-        "\nCAS (matches/attempts): ",
-            "$successful_CAS_mappings/$attempted_CAS_mappings ",
-            "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
-        "\nLIGAND: ",
-            "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
-        "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
-            "$successful_name_mappings/$attempted_name_mappings";
-    if ($allow_synonyms)
-    {
-        print "\nTerm Synonyms: ",
-            "$successful_synonym_mappings/$attempted_synonym_mappings";
-    }
-    print "\n\n";
-}
-
-
-# put the results in the mapped output file
-# ---------------------------------------------------------------------------
-sub create_mapfile
-{
-    if (@map_results > 0)
-    {
-        # add a header to the results array
-        unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
-        
-        # setup output file
-        open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
-    
-        #format results for file output
-        print OUTPUT_FILE "$_\n" foreach @map_results;
-        
-        close OUTPUT_FILE;
-        
-        # sort on all cols (keep the header at the top), remove exact dupes
-        system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
-        
-        # also produce files listing unique Reactome entries having a match...and those without a match
-        system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
-        system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
-    } else {
-        print "\n\nSorry, there are no mapped results.\n\n";
-    }
+    # write xml docs to a single ImportXML file
 }
 
 
@@ -449,10 +142,9 @@ sub create_mapfile
 # main
 # ---------------------------------------------------------------------------
 
-init;
-#test_inputs;
-perform_map;
-create_mapfile;
+import_data;
+test_inputs;
+write_xml;
 
 exit;
 
-- 
2.34.1