From 9968ea3fd5b58dec1d1550943acd8f3ceb2aeef1 Mon Sep 17 00:00:00 2001
From: preecej <preecej@localhost>
Date: Wed, 25 May 2011 19:54:01 +0000
Subject: [PATCH] svn path=/; revision=100

---
 .../semantic_wiki/PS_TransformForImport.pl    | 458 ++++++++++++++++++
 1 file changed, 458 insertions(+)
 create mode 100644 preecej/semantic_wiki/PS_TransformForImport.pl

diff --git a/preecej/semantic_wiki/PS_TransformForImport.pl b/preecej/semantic_wiki/PS_TransformForImport.pl
new file mode 100644
index 0000000..b8a91bd
--- /dev/null
+++ b/preecej/semantic_wiki/PS_TransformForImport.pl
@@ -0,0 +1,458 @@
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+Plant Semantics Import Transformation Script
+
+=head1 VERSION
+
+0.1 (svn r100)
+
+=head1 DESCRIPTION
+
+Transform external gene annotation data into an XML document readable
+by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
+Also generates appropriate provenance of data based on header of 
+import file.
+
+=head1 USAGE
+
+PS_TransformForImport.pl -i INPUT_FILE -o OUTPUT_FILE
+
+=head1 USAGE
+
+    -i|--input  Name of input CSV or tab-del file.
+    -t|--type   Specifies input type of file ('csv' or 'tab')
+    -o|--output Name of output file.
+
+=head1 DEPENDENCIES
+
+Requires that the input files contain two headers: the first will hold 
+the provenance information associated with the imported data, and the 
+second will specify the template and field names for the annotation 
+data.
+
+=head2 Provenance Header Format Example
+
+    [Provenance]
+    Source Date Time Stamp=Apr 2 2008
+    Source Data=Oryzabase
+    Source Version=rel. 10
+    Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
+    Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/downloadAboutGeneAction.do?fileCategory=geneListAll
+
+=head2 Data Header Format Example
+
+    Gene[Accession ID],Gene[Gene Symbol],Reference_Publication[Species Name],Reference_Publication[Species ID]
+
+=head1 AUTHOR
+
+Justin Preece
+    Faculty Research Assistant, Bioinformatics
+    Jaiswal Lab, Botany & Plant Pathology
+    Oregon State University
+    L<mailto:preecej@science.oregonstate.edu>
+
+=cut
+
+# ---------------------------------------------------------------------------
+# modules
+# ---------------------------------------------------------------------------
+
+# general
+use strict;
+use Cwd; qw(cwd);
+
+# specific
+use GO::Parser;
+
+# ---------------------------------------------------------------------------
+# declarations
+# ---------------------------------------------------------------------------
+
+# set paths to data files
+my $data_path = "/home/preecej/Documents/projects/reactome/reactome_to_chebi_mapping/AraCyc/gk_central_041811/no_synonyms/";
+my $chebi_obo_file = "../../chebi_v78.obo";
+my $reactome_file = "AraReferenceMolecules.txt";
+my $mapped_output_file = "1.2_reactome_chebi_mapping_complete.txt";
+my $sorted_output_file = "1.2_reactome_chebi_mapping_complete_sorted.txt";
+my $unique_mappings = "1.2_reactome_unique_mappings.txt";
+my $sorted_no_match_file = "1.2_reactome_entries_with_no_chebi_match.txt";
+
+# options
+my $allow_obsolete_terms = 1;
+my $allow_cyc = 0;
+my $allow_synonyms = 0;
+
+my $ont; # chebi ontology
+
+my %reactome_CompoundName; # reactome Compound Name hash
+my %reactome_CAS; # reactome CAS hash
+my %reactome_LIGAND; # reactome LIGAND hash
+my %reactome_Cyc; # reactome Cyc hash
+
+my @map_results = (); # successful mappings between chebi and reactome
+
+
+# ---------------------------------------------------------------------------
+# functions
+# ---------------------------------------------------------------------------
+
+
+# setup chebi parser and reactome data
+# ---------------------------------------------------------------------------
+sub init
+{
+    # init ontology parser and ontology
+    my $parser = GO::Parser->new({handler=>'obj'});
+    $parser->parse($data_path . $chebi_obo_file);
+    $ont = $parser->handler->graph;
+      
+    # read reactome file into 3 separate hashes
+    open(REACTOME_FILE,$data_path . $reactome_file);
+
+    my $line = <REACTOME_FILE>; # skip the header
+    my $reactome_count = 0;
+    
+    while (<REACTOME_FILE>)
+    {
+        $line = $_;
+        chomp $line;
+        $reactome_count++;
+        my @reactome_entry = split(/\t/, $line); # break up our tab-del line
+
+        # load up this reactome entry's Compound_Name, ID, CAS, LIGAND, and Cyc values
+        my $reactome_id = $reactome_entry[0];
+        my $compound_name = uc $reactome_entry[1]; # for case-insensitivity
+
+        # strips off "AN " and "A " indefinite articles
+        $compound_name =~ s/^ //;
+        $compound_name =~ s/^AN //;
+        $compound_name =~ s/^A //;
+
+        my $CAS_id = $reactome_entry[2];
+        my $LIGAND_id = $reactome_entry[3];
+        my $Cyc_term = uc $reactome_entry[4]; # for case-insensitivity
+
+        # There is a possibility that a single CAS, LIGAND, or Cyc
+        # identifier may appear in more than one reactome entry. This
+        # temp array allows each matched hash value to hold more than 
+        # one ReactomeID, if necessary.
+        
+        # --CAS Hash Load--
+        if ($CAS_id ne "-") { # keep those "-" placeholders out
+            # build the CAS hash; each value may hold 1...n reactome
+            # ids (as an array)
+            push @{$reactome_CAS{$CAS_id}}, $reactome_id;
+        }
+
+        # similarly...
+
+        # --LIGAND Hash Load--
+        if ($LIGAND_id ne "-") {
+            push @{$reactome_LIGAND{$LIGAND_id}}, $reactome_id;
+        }
+
+        # --CompoundName Hash Load--
+        if ($compound_name ne "-") {
+            push @{$reactome_CompoundName{"$compound_name"}}, $reactome_id;
+        }
+        
+        # --Cyc Hash Load--
+        if ($allow_cyc)
+        {
+            if ($Cyc_term ne "-") {
+                push @{$reactome_Cyc{"$Cyc_term"}}, $reactome_id;
+            }
+
+        }
+    }
+    close REACTOME_FILE;
+    
+    print "\n[Reactome Stats]",
+        "\nTotal Reactome Entries: $reactome_count\n";
+
+}
+
+
+# spit out some data to make sure you've read in the files correctly
+# ---------------------------------------------------------------------------
+sub test_inputs
+{
+    # basic ontology info
+    print "[Node Count]: " . $ont->node_count . "\n"; 
+
+    # get all chebi terms in the ontology
+	my $terms = $ont->get_all_nodes;
+	
+	# output contents of parsed ontology
+	foreach my $term (@$terms)
+	{
+		print "\n" . $term->acc . " " . $term->name . "\n[SYNONYMS]\n";
+
+		my $synonyms = $term->synonym_list;
+		foreach my $synonym (@$synonyms) {
+			print $synonym . "\n";
+		}
+		
+		print "[XREFS]\n";
+		my $xrefs = $term->dbxref_list;
+		foreach my $xref (@$xrefs) {
+			print $xref->xref_key . ",",
+				$xref->xref_keytype . ",",
+				$xref->xref_dbname . ",",
+				$xref->xref_desc . "\n";
+		}
+		print "\n";
+	}
+
+    # show dupes in reactome hashes - give data to Pankaj;
+    # this is important b/c the duplicates may represent erroneous data in
+    # the Reactome dataset
+    my $k; my @v;
+    print "\n[Reactome Hashes - Dupes]\n";
+    print "\n--CAS Hash--\n";
+    for $k (keys %reactome_CAS) {
+        if (@{$reactome_CAS{$k}} > 1) {
+            print "$k: @{$reactome_CAS{$k}}\n";
+        }
+    }
+    print "\n--LIGAND Hash--\n";
+    for $k (keys %reactome_LIGAND) {
+        if (@{$reactome_LIGAND{$k}} > 1) {
+            print "$k: @{$reactome_LIGAND{$k}}\n";
+        }
+    }
+    print "\n--CompoundName Hash--\n";
+    for $k (keys %reactome_CompoundName) {
+        if (@{$reactome_CompoundName{$k}} > 1) {
+            print "$k: @{$reactome_CompoundName{$k}}\n";
+        }
+    }
+    if ($allow_cyc)
+    {
+        print "\n--Cyc Hash--\n";
+        for $k (keys %reactome_Cyc) {
+            if (@{$reactome_Cyc{$k}} > 1) {
+                print "$k: @{$reactome_Cyc{$k}}\n";
+            }
+        }
+    }
+}
+
+
+# map the chebi terms to the reactome entries
+# ---------------------------------------------------------------------------
+sub perform_map
+{
+    my $chebi_obo_terms = $ont->get_all_nodes;
+    
+    # vars for mapping stats
+    my $attempted_mappings = 0;
+    my $successful_mappings = 0;
+    my $attempted_CAS_mappings = 0;
+    my $successful_CAS_mappings = 0;
+    my $attempted_LIGAND_mappings = 0;
+    my $successful_LIGAND_mappings = 0;
+    my $attempted_name_mappings = 0;
+    my $successful_name_mappings = 0;
+    my $attempted_synonym_mappings = 0;
+    my $successful_synonym_mappings = 0;
+    
+    # loop through each chebi term
+	foreach my $term (@$chebi_obo_terms)
+	{
+	    # eliminate "typedef" nodes (non-CHEBI terms), also check for obsolete
+	    # terms and whether to allow them
+        if (($term->acc =~ m/^CHEBI:/)
+            && (!$term->is_obsolete || ($term->is_obsolete && $allow_obsolete_terms)))
+        {
+            # attempt CHEBI match on CAS and LIGAND ID's
+            $attempted_mappings++;
+            my $xrefs = $term->dbxref_list;
+            foreach my $xref (@$xrefs)
+            {
+                $attempted_CAS_mappings++;
+                $attempted_LIGAND_mappings++;
+
+                # temp-foo to skirt an interpolation problem
+                my $tmp_key = $xref->xref_key;
+
+                if (defined($reactome_CAS{"$tmp_key"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_CAS{$tmp_key}})
+                    {
+                        $successful_CAS_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "CAS\t" .
+                            $tmp_key);
+                    }
+                }
+
+                if (defined($reactome_LIGAND{"$tmp_key"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_LIGAND{$tmp_key}})
+                    {
+                        $successful_LIGAND_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "LIGAND\t" .
+                            $tmp_key);
+                    }
+                }
+            }
+                
+            # attempt CHEBI match on Reactome Compound Names (and optional Cyc names/synonyms)...
+            $attempted_name_mappings++;
+
+            # more temp-foo to skirt said interpolation problem 
+            my $tmp_name = uc $term->name; 
+
+            # reactome compound names...
+            if (defined($reactome_CompoundName{"$tmp_name"}))
+            {
+                foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_name}})
+                {
+                    $successful_name_mappings++;
+                    push (@map_results, "$tmp_reactome_id\t" .
+                        $term->acc . "\t" . 
+                        "CompoundTerm\t" .
+                        $term->name);
+                }
+            }
+            # ...and synonyms (optional)
+            if ($allow_synonyms)
+            {
+                my $synonyms = $term->synonym_list;
+                foreach my $synonym (@$synonyms)
+                {
+                    $attempted_synonym_mappings++;
+
+                    # yet more temp-foo to skirt interpolation problem 
+                    my $tmp_syn = "\U$synonym"; 
+
+                    if (defined($reactome_CompoundName{$tmp_syn}))
+                    {
+                        foreach my $tmp_reactome_id (@{$reactome_CompoundName{$tmp_syn}})
+                        {
+                            $successful_synonym_mappings++;
+                            push (@map_results, "$tmp_reactome_id\t" .
+                                $term->acc . "\t" .
+                                "CompoundSynonym\t" .
+                                $synonym);
+                        }
+                    }
+                }
+            }
+
+            # Cyc names...
+            if ($allow_cyc)
+            {
+                if (defined($reactome_Cyc{"$tmp_name"}))
+                {
+                    foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_name}})
+                    {
+                        $successful_name_mappings++;
+                        push (@map_results, "$tmp_reactome_id\t" .
+                            $term->acc . "\t" . 
+                            "CycTerm\t" .
+                            $term->name);
+                    }
+                }
+                # ...and synonyms (optional)
+                if ($allow_synonyms)
+                {
+                    my $synonyms = $term->synonym_list;
+                    foreach my $synonym (@$synonyms)
+                    {
+                        $attempted_synonym_mappings++;
+    
+                        # yet more temp-foo to skirt interpolation problem 
+                        my $tmp_syn = "\U$synonym"; 
+    
+                        if (defined($reactome_Cyc{$tmp_syn}))
+                        {
+                            foreach my $tmp_reactome_id (@{$reactome_Cyc{$tmp_syn}})
+                            {
+                                $successful_synonym_mappings++;
+                                push (@map_results, "$tmp_reactome_id\t" .
+                                    $term->acc . "\t" .
+                                    "CycSynonym\t" .
+                                    $synonym);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    # send up some stats on the mapping process
+    $successful_mappings =
+        + $successful_CAS_mappings
+        + $successful_LIGAND_mappings
+        + $successful_name_mappings
+        + $successful_synonym_mappings;
+        
+    print "\n[ChEBI Stats]",
+        "\nNodes in File: " . $ont->node_count,
+        "\nTotal Attempted Mappings (with usable ChEBI terms): " . $attempted_mappings,
+        "\nTotal Successful Mappings: " . $successful_mappings . "\n";
+
+    print "\n[Mapping Breakdown by Type]",
+        "\nCAS (matches/attempts): ",
+            "$successful_CAS_mappings/$attempted_CAS_mappings ",
+            "(note: can include ChemIDplus and KEGG COMPUND db duplicates)",
+        "\nLIGAND: ",
+            "$successful_LIGAND_mappings/$attempted_LIGAND_mappings",
+        "\nTerm Names " . ($allow_cyc ? "includes Cyc terms and synonyms" : "") . ": ",
+            "$successful_name_mappings/$attempted_name_mappings";
+    if ($allow_synonyms)
+    {
+        print "\nTerm Synonyms: ",
+            "$successful_synonym_mappings/$attempted_synonym_mappings";
+    }
+    print "\n\n";
+}
+
+
+# put the results in the mapped output file
+# ---------------------------------------------------------------------------
+sub create_mapfile
+{
+    if (@map_results > 0)
+    {
+        # add a header to the results array
+        unshift (@map_results, "ReactomeID\tCHEBI\tXREF_Type\tXREF_ID");
+        
+        # setup output file
+        open(OUTPUT_FILE,">" . $data_path . $mapped_output_file);
+    
+        #format results for file output
+        print OUTPUT_FILE "$_\n" foreach @map_results;
+        
+        close OUTPUT_FILE;
+        
+        # sort on all cols (keep the header at the top), remove exact dupes
+        system "awk 'NR == 1; NR > 1 {print \$0 | \"sort\"}' $data_path$mapped_output_file | uniq > $data_path$sorted_output_file";
+        
+        # also produce files listing unique Reactome entries having a match...and those without a match
+        system "awk 'NR == 1; NR > 1 {print \$1}' $data_path$sorted_output_file | uniq > $data_path$unique_mappings";
+        system "cat $data_path$reactome_file | grep -vf $data_path$unique_mappings | sort > $data_path$sorted_no_match_file";
+    } else {
+        print "\n\nSorry, there are no mapped results.\n\n";
+    }
+}
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+init;
+#test_inputs;
+perform_map;
+create_mapfile;
+
+exit;
+
-- 
2.34.1