From 6a3dac2291f9d659e9da76a9a5d1bb229cdd5cdc Mon Sep 17 00:00:00 2001
From: preecej <preecej@localhost>
Date: Thu, 25 Aug 2011 00:23:16 +0000
Subject: [PATCH] In progress; Revised ontology info, ontology references,
 evidence, evidence codes, and gene synonyms. Generated XML pages for all
 except Publications. That's next.

svn path=/; revision=170
---
 .../semantic_wiki/paw_TransformForImport.pl   | 205 ++++++++++--------
 1 file changed, 116 insertions(+), 89 deletions(-)

diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl
index df3be4f..40c2495 100644
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -129,8 +129,8 @@ my @field_data;
 my $xml; # represents the xml doc
 
 # gaf
-my %aspects;
-my $ontology_name;
+my %ontology_info; # ontology proper names and aspect names, keyed by ontology abbreviation (GO, PO)
+my $curr_ontology_type; # ontology abbreviation taken from the term id of the GAF line being read
 my %annotation_universals; # holds values assumed not to vary across the file
 my %annotations; # keyed on Gene Symbol
 
@@ -252,72 +252,83 @@ sub import_generic
     }
 }
 
-# add new ontology reference data
+# Add or aggregate ontology reference data for one GAF row: aspect, evidence
+# codes, evidence, and PMID publications, stored under gene symbol -> term id.
 # ---------------------------------------------------------------------------
-sub add_ontology_ref_data($)
+sub add_ontology_ref_data($$)
 {
     my %ref_data = %{$_[0]};
+    my $ont_type = $_[1]; # ontology abbreviation; key into %ontology_info
+
     my @evidence = split('\|', $ref_data{"with_or_from"});
     my @publications = split('\|', $ref_data{"db_reference"});
+    # keep only PMID references; collect them separately, never splice mid-loop
+    my @pmid_publications;
     for (@publications) {
         if (!(substr($_,0,5) eq "PMID:")) {
-            splice(@publications,$_,1);
+            next;
         }
+        push @pmid_publications, $_;
     }
 
-    # TODO: can you eliminate this if..else by compressing the hash aggregation?
-    
-    # if this is the first annotation for this current gene symbol and term id, add it
-    if (!exists $annotations{$ref_data{"db_object_symbol"}}
-            {"Ontological References"}
-                {$ref_data{"term_id"}})
+    # aspect (assumes only one, allows overwrite)
+    $annotations{$ref_data{"db_object_symbol"}}
+        {"Ontological References"}
+            {$ref_data{"term_id"}}
+            {"Aspect"} = $ontology_info{$ont_type}
+                            {"aspects"}
+                                {uc($ref_data{"aspect"})};
+
+    # evidence codes
+    $annotations{$ref_data{"db_object_symbol"}}
+        {"Ontological References"}
+            {$ref_data{"term_id"}}
+                {"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+
+    # evidence
+    for (@evidence)
     {
-        # print "*** NEW REF: $ref_data{'term_id'} ***\n"; # TEST
-        
-        # add new ontology data
-        my $annotation_ontology_ref = {
-            "Aspect" => $aspects{uc($ref_data{"aspect"})}, # assumes only one
-            "Evidence Codes" => { $ref_data{"evidence_code"} => "" }
-            };
-
-        # add Evidence
-        for (@evidence) {
-            $$annotation_ontology_ref{"Evidence"}{$_} = "";
-        }
-            
-        # add Publication data
-        for (@publications) {
-            $$annotation_ontology_ref{"Publications"}{$_} = ""; }
-            
         $annotations{$ref_data{"db_object_symbol"}}
-            {"Ontological References"} 
-                {$ref_data{"term_id"}} = $annotation_ontology_ref;
-
-        if ($debug) { print "...<DEBUG: \%annotation_ontology_ref>\n"
-            . Dumper($annotation_ontology_ref) . "\n\n"; }
+            {"Ontological References"}
+                {$ref_data{"term_id"}}
+                    {"Evidence"}{$_} = "";
     }
-    else # aggregate add'l evidence codes, evidence, and publications for the existing term id
+
+    # publications (PMID references only)
+    for (@pmid_publications)
     {
-        # print "*** ADD'L REF: $ref_data{'term_id'} ***\n";  # TEST
-        
         $annotations{$ref_data{"db_object_symbol"}}
             {"Ontological References"}
                 {$ref_data{"term_id"}}
-                {"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+                    {"Publications"}{$_} = "";
+    }
+}
 
-        for (@evidence) {
-            $annotations{$ref_data{"db_object_symbol"}}
-                {"Ontological References"}
-                    {$ref_data{"term_id"}}
-                        {"Evidence"}{$_} = "";
+# Populate %ontology_info: for each supported ontology abbreviation, its proper
+# name and a map of aspect codes to aspect names (read during GAF import/output)
+# ---------------------------------------------------------------------------
+sub initialize_ontology
+{
+    %ontology_info = (
+        GO => {
+            name => "Gene Ontology",
+            aspects => {
+                P => "Biological Process",
+                C => "Cellular Component",
+                F => "Molecular Function"
+            }
+        },
+        PO => {
+            name => "Plant Ontology",
+            aspects => {
+                A => "Plant Anatomy",
+                G => "Plant Growth and Development Stage"
+            }
         }
-
-        for (@publications) {
-            $annotations{$ref_data{"db_object_symbol"}}
-                {"Ontological References"}
-                    {$ref_data{"term_id"}}
-                        {"Publications"}{$_} = ""; }
-    }
+    );
+    
+    if ($debug) { print "...<DEBUG: \%ontology_info>\n"
+    . Dumper(\%ontology_info) . "\n\n"; }
 }
 
 # read, parse, and store GAF annotations
@@ -353,9 +364,15 @@ sub import_gaf
 	#                                                               "Evidence" => % strings ("with_or_from" )
 	#                                                               "Publications" => % PMID's from "db:reference", used to create separate Pub pages
     
+	# set up a hash of ontology types and aspects to be referenced during data import
+	initialize_ontology();
+
     # loop through data rows and build hashed annotation data structure
     my $count = 0;
     
+    # regex for locating a useable accession id from a locus (species-specific)
+    my $locus_finder_expr;  
+    
     while (<INPUT_FILE>)
     {
         $count++;
@@ -396,30 +413,19 @@ sub import_gaf
             
             if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
                 . Dumper(\%annotation_universals) . "\n\n"; }
-    
-            # TODO: set all ontology aspects in a multi-hash (group by ontology)
-            # and keep a flag to monitor the currently-needed ontology
-            
-            # identify what kind of ontology term we're dealing with and set our 
-            # aspects and ontology name accordingly
-            switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
-                case 'GO' { # Gene Ontology
-                    %aspects = (
-                        P => "Biological Process",
-                        C => "Cellular Component",
-                        F => "Molecular Function"
-                    );
-                    $ontology_name = "Gene Ontology";
-                }
-                case 'PO' { # Plant Ontology
-                    %aspects = (
-                        A => "Plant Anatomy",
-                        G => "Plant Growth and Development Stage"
-                    );
-                    $ontology_name = "Plant Ontology";
+
+            # set species-specific values (compiled regex, shared by grep and =~ below)
+            switch ($annotation_universals{"Species ID"}) {
+                case "NCBI:3702" { $locus_finder_expr = qr/[Aa][Tt].[Gg]/; }
+                else {
+                    die($annotation_universals{"Species ID"}
+                        . " is not a valid NCBI taxon ID.\n");
                 }
             }
         }
+        
+        # set the ontology for the current line
+        $curr_ontology_type = uc((split(':',$curr_line_hash{"term_id"}))[0]);
 
         # check to see if Gene Symbol hash key exists (for grouping)
         # if not, add the new Gene Symbol and its associated props
@@ -435,7 +441,7 @@ sub import_gaf
                 . Dumper(\@synonyms) . "\n\n"; }
             
             # find the gene locus, if it is listed (first "AT.G")
-            my @loci = grep /[Aa][Tt].[Gg]/, @synonyms;
+            my @loci = grep { $_ =~ $locus_finder_expr } @synonyms;
 
             if ($debug) { print "...<DEBUG: \@loci>\n"
                 . Dumper(\@loci) . "\n\n"; }
@@ -448,18 +454,26 @@ sub import_gaf
             }
             else # no match; attempt to use the Gene Symbol instead
             {
-                if ($curr_line_hash{"db_object_symbol"} =~ /[Aa][Tt].[Gg]/)
+                if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr)
                 {
-                    # the split drops the variant/allele signifier
-                    $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0];
+                    # splitting on a literal dot drops the variant/allele signifier, if present
+                    $locus = (split(/\./,$curr_line_hash{"db_object_symbol"}))[0];
                 }
+                else # no match; attempt to use the Gene Name instead
+                {
+                    if ($curr_line_hash{"db_object_name"} =~ $locus_finder_expr)
+                    {
+                        $locus = (split(/\./,$curr_line_hash{"db_object_name"}))[0];
+                    }
+                }
             }
+            
             # chromosome = third char in locus, if it exists
             my $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
             
             # set some sort of pseudo-unique value as the accession id, 
             # in order of succession: locus, then symbol
-            # (note: this is dangerous; a stable identifier is preferred)
+            # (NOTE: this is dangerous; a stable identifier is preferred)
             my $accession_id = 
                 $locus ? $locus : $curr_line_hash{"db_object_symbol"};
             
@@ -481,7 +495,7 @@ sub import_gaf
             # add new gene annotation and assign props
             $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
 
-            add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)            
+            add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add the first ontology reference (every time)            
         }
         # that Gene Symbol does exist, so we just need to roll-up multi-line 
         # annotation information, like gene synonyms and ontology refs
@@ -494,7 +508,7 @@ sub import_gaf
 			    $annotations{$curr_line_hash{"db_object_symbol"}}
 			        {"Gene Synonyms"}{$_} = ""; }
 
-            add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data            
+            add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add add'l ontology reference data            
         }
     }
     if ($debug) { print "...<DEBUG: \%annotations>\n"
@@ -705,7 +719,7 @@ sub transform_generic
 # ---------------------------------------------------------------------------
 sub transform_gaf
 {
-    # define temaplates and their fields for Provenance-generation
+    # define templates and their fields for Provenance-generation
     my $template_field_map = {
         Annotation => [
             'Species Name',
@@ -724,7 +738,8 @@ sub transform_gaf
             'Ontology',
             'Term ID',
             'Aspect',
-            'Evidence Code'
+            'Evidence Code',
+            'Evidence'
             ]
         };
     
@@ -778,37 +793,49 @@ sub transform_gaf
             $writer->endTag("Page");
             
             # create gene synonyms page
-            if (length($annotations{$annotation}{'Gene Synonyms'}) > 0)
+            if (scalar keys (%{$annotations{$annotation}{'Gene Synonyms'}}) > 0)
             {
                 $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Gene Synonyms");
                     $writer->startTag("Template",Name=>"Gene_Synonyms");
                         $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                     $writer->endTag("Template");
     
-                    foreach (split('\|',$annotations{$annotation}{'Gene Synonyms'}))
+                    foreach my $synonym (sort keys %{$annotations{$annotation}{'Gene Synonyms'}}) # sorted: stable page order
                     {
                         $writer->startTag("Template",Name=>"Gene_Synonym_Repeater");
-                            $writer->dataElement("Field", $_, Name=>"Gene Synonym");    
+                            $writer->dataElement("Field", $synonym, Name=>"Gene Synonym");
                         $writer->endTag("Template");
                     }
                 $writer->endTag("Page");
             }
             
             # create ont refs page
-            if (scalar(@{$annotations{$annotation}{"Ontological Reference"}}) > 0)
+            if (scalar keys (%{$annotations{$annotation}{"Ontological References"}}) > 0)
             {
                 $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Ontologies");
                     $writer->startTag("Template",Name=>"Ontological_References");
                         $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                     $writer->endTag("Template");
     
-                    foreach (@{$annotations{$annotation}{"Ontological Reference"}})
+                    foreach my $ont_term (sort keys %{$annotations{$annotation}{"Ontological References"}})
                     {
                         $writer->startTag("Template",Name=>"Ontological_Reference_Repeater");
-                            $writer->dataElement("Field", $ontology_name, Name=>"Ontology");
-                            $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID");
-                            $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect");
-                            $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code");
+                            # look up the ontology from this term's own prefix (e.g. "GO"), not from
+                            # $curr_ontology_type, which only reflects the last GAF line imported
+                            my $term_ontology = uc((split(':',$ont_term))[0]);
+                            my $ont_ref = $annotations{$annotation}{"Ontological References"}{$ont_term};
+                            $writer->dataElement("Field", $ontology_info{$term_ontology}{"name"}, Name=>"Ontology");
+                            $writer->dataElement("Field", $ont_term, Name=>"Term ID");
+                            $writer->dataElement("Field", $$ont_ref{"Aspect"}, Name=>"Aspect");
+
+                            # evidence codes and evidence are stored as hash keys; sort for stable output
+                            $writer->dataElement("Field",
+                                join(', ', sort keys %{$$ont_ref{"Evidence Codes"}}),
+                                Name=>"Evidence Code");
+                            $writer->dataElement("Field",
+                                join(', ', sort keys %{$$ont_ref{"Evidence"}}),
+                                Name=>"Evidence");
+
                         $writer->endTag("Template");
                     }
                 $writer->endTag("Page");
@@ -820,6 +847,8 @@ sub transform_gaf
                     $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                 $writer->endTag("Template");
                 
+                    # TODO: make further use of template map here to create provenance for SIO's
+                    
                     foreach (@{@$template_field_map{"Annotation"}})
                     {
                         $writer->startTag("Template",Name=>"Provenance_Repeater");
@@ -829,7 +858,6 @@ sub transform_gaf
                             $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source");
                         $writer->endTag("Template");
                     }
-                
             $writer->endTag("Page");
             
             $annot_title_count++;
@@ -875,7 +903,6 @@ sub show_output
 init;
 import_data;
 if ($verbose) { show_input; }
-exit(0); # TEST
 write_xml();
 if ($verbose) { show_output; }
 
-- 
2.34.1