In progress; Revised ontology info, ontology references, evidence, evidence codes, ...

author preecej <preecej@localhost>

Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)

committer preecej <preecej@localhost>

Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)
author preecej <preecej@localhost>
Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)
committer preecej <preecej@localhost>
Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)
diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl

index df3be4fca727221ba1c161329bdcb9370ff9f405..40c24959a0e7105ca52bec6bd65172ab9ee2c14b 100644 (file)
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -129,8 +129,8 @@ my @field_data;
  my $xml; # represents the xml doc
  
  # gaf
-my %aspects;
-my $ontology_name;
+my %ontology_info; # holds ontology names and aspects, keyed by abbreviation 
+my $curr_ontology_type; # flag to track the proper ontology to reference, line by line
  my %annotation_universals; # holds values assumed not to vary across the file
  my %annotations; # keyed on Gene Symbol
  
@@ -252,72 +252,83 @@ sub import_generic
      }
  }
  
-# add new ontology reference data
+# Add new ontology reference data. Aggregates add'l evidence codes, evidence, 
+# and publications for the existing term id.
  # ---------------------------------------------------------------------------
-sub add_ontology_ref_data($)
+sub add_ontology_ref_data($$)
  {
      my %ref_data = %{$_[0]};
+    my $ont_type = $_[1];
+ 
      my @evidence = split('\|', $ref_data{"with_or_from"});
      my @publications = split('\|', $ref_data{"db_reference"});
+ 
+    my $i = 0;
      for (@publications) {
          if (!(substr($_,0,5) eq "PMID:")) {
-            splice(@publications,$_,1);
+            splice(@publications,$i,1);
          }
+        $i++;
      }
  
-    # TODO: can you eliminate this if..else by compressing the hash aggregation?
-    
-    # if this is the first annotation for this current gene symbol and term id, add it
-    if (!exists $annotations{$ref_data{"db_object_symbol"}}
-            {"Ontological References"}
-                {$ref_data{"term_id"}})
+    # aspect (assumes only one, allows overwrite)
+    $annotations{$ref_data{"db_object_symbol"}}
+        {"Ontological References"} 
+            {$ref_data{"term_id"}}
+            {"Aspect"} = $ontology_info{$ont_type}
+                            {"aspects"}
+                                {uc($ref_data{"aspect"})};
+
+    # evidence codes
+    $annotations{$ref_data{"db_object_symbol"}}
+        {"Ontological References"}
+            {$ref_data{"term_id"}}
+                {"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+
+    # evidence
+    for (@evidence)
      {
-        # print "*** NEW REF: $ref_data{'term_id'} ***\n"; # TEST
-        
-        # add new ontology data
-        my $annotation_ontology_ref = {
-            "Aspect" => $aspects{uc($ref_data{"aspect"})}, # assumes only one
-            "Evidence Codes" => { $ref_data{"evidence_code"} => "" }
-            };
-
-        # add Evidence
-        for (@evidence) {
-            $$annotation_ontology_ref{"Evidence"}{$_} = "";
-        }
-            
-        # add Publication data
-        for (@publications) {
-            $$annotation_ontology_ref{"Publications"}{$_} = ""; }
-            
          $annotations{$ref_data{"db_object_symbol"}}
-            {"Ontological References"} 
-                {$ref_data{"term_id"}} = $annotation_ontology_ref;
-
-        if ($debug) { print "...<DEBUG: \%annotation_ontology_ref>\n"
-            . Dumper($annotation_ontology_ref) . "\n\n"; }
+            {"Ontological References"}
+                {$ref_data{"term_id"}}
+                    {"Evidence"}{$_} = "";
      }
-    else # aggregate add'l evidence codes, evidence, and publications for the existing term id
+
+    # publications
+    for (@publications)
      {
-        # print "*** ADD'L REF: $ref_data{'term_id'} ***\n";  # TEST
-        
          $annotations{$ref_data{"db_object_symbol"}}
              {"Ontological References"}
                  {$ref_data{"term_id"}}
-                {"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+                    {"Publications"}{$_} = "";
+    }
+}
  
-        for (@evidence) {
-            $annotations{$ref_data{"db_object_symbol"}}
-                {"Ontological References"}
-                    {$ref_data{"term_id"}}
-                        {"Evidence"}{$_} = "";
+# populate a hash containing ontology types and their aspects; used as a 
+# reference for aspect proper names when importing GAF data for transformation
+# ---------------------------------------------------------------------------
+sub initialize_ontology
+{
+    %ontology_info = (
+        GO => {
+            name => "Gene Ontology",
+            aspects => {
+                P => "Biological Process",
+                C => "Cellular Component",
+                F => "Molecular Function"
+            }
+        },
+        PO => {
+            name => "Plant Ontology",
+            aspects => {
+                A => "Plant Anatomy",
+                G => "Plant Growth and Development Stage"
+            }
          }
-
-        for (@publications) {
-            $annotations{$ref_data{"db_object_symbol"}}
-                {"Ontological References"}
-                    {$ref_data{"term_id"}}
-                        {"Publications"}{$_} = ""; }
-    }
+    );
+    
+    if ($debug) { print "...<DEBUG: \%ontology_info>\n"
+    . Dumper(\%ontology_info) . "\n\n"; }
  }
  
  # read, parse, and store GAF annotations
@@ -353,9 +364,15 @@ sub import_gaf
         #                                                               "Evidence" => % strings ("with_or_from" )
         #                                                               "Publications" => % PMID's from "db:reference", used to create separate Pub pages
      
+       # set up a hash of ontology types and aspects to be referenced during data import
+       initialize_ontology();
+
      # loop through data rows and build hashed annotation data structure
      my $count = 0;
      
+    # regex for locating a useable accession id from a locus (species-specific)
+    my $locus_finder_expr;  
+    
      while (<INPUT_FILE>)
      {
          $count++;
@@ -396,30 +413,19 @@ sub import_gaf
              
              if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
                  . Dumper(\%annotation_universals) . "\n\n"; }
-    
-            # TODO: set all ontology aspects in a multi-hash (group by ontology)
-            # and keep a flag to monitor the currently-needed ontology
-            
-            # identify what kind of ontology term we're dealing with and set our 
-            # aspects and ontology name accordingly
-            switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
-                case 'GO' { # Gene Ontology
-                    %aspects = (
-                        P => "Biological Process",
-                        C => "Cellular Component",
-                        F => "Molecular Function"
-                    );
-                    $ontology_name = "Gene Ontology";
-                }
-                case 'PO' { # Plant Ontology
-                    %aspects = (
-                        A => "Plant Anatomy",
-                        G => "Plant Growth and Development Stage"
-                    );
-                    $ontology_name = "Plant Ontology";
+
+            # set species-specific values
+            switch ($annotation_universals{"Species ID"}) {
+                case "NCBI:3702" { $locus_finder_expr = "/[Aa][Tt].[Gg]/"; }
+                else {
+                    die($annotation_universals{"Species ID"} 
+                        . " is not a valid NCBI taxon ID.\n");
                  }
              }
          }
+        
+        # set the ontology for the current line
+        $curr_ontology_type = uc((split(':',$curr_line_hash{"term_id"}))[0]);
  
          # check to see if Gene Symbol hash key exists (for grouping)
          # if not, add the new Gene Symbol and its associated props
@@ -435,7 +441,7 @@ sub import_gaf
                  . Dumper(\@synonyms) . "\n\n"; }
              
              # find the gene locus, if it is listed (first "AT.G")
-            my @loci = grep /[Aa][Tt].[Gg]/, @synonyms;
+            my @loci = grep $locus_finder_expr, @synonyms;
  
              if ($debug) { print "...<DEBUG: \@loci>\n"
                  . Dumper(\@loci) . "\n\n"; }
@@ -448,18 +454,26 @@ sub import_gaf
              }
              else # no match; attempt to use the Gene Symbol instead
              {
-                if ($curr_line_hash{"db_object_symbol"} =~ /[Aa][Tt].[Gg]/)
+                if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr)
                  {
-                    # the split drops the variant/allele signifier
+                    # the split drops the variant/allele signifier, if present
                      $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0];
                  }
+                else # no match; attempt to use the Gene Name instead
+                {
+                    if ($curr_line_hash{"db_object_name"} =~ $locus_finder_expr)
+                    {
+                        $locus = (split('.',$curr_line_hash{"db_object_name"}))[0];
+                    }
+                }
              }
+            
              # chromosome = third char in locus, if it exists
              my $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
              
              # set some sort of pseudo-unique value as the accession id, 
              # in order of succession: locus, then symbol
-            # (note: this is dangerous; a stable identifier is preferred)
+            # (NOTE: this is dangerous; a stable identifier is preferred)
              my $accession_id = 
                  $locus ? $locus : $curr_line_hash{"db_object_symbol"};
              
@@ -481,7 +495,7 @@ sub import_gaf
              # add new gene annotation and assign props
              $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
  
-            add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)            
+            add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add the first ontology reference (every time)            
          }
          # that Gene Symbol does exist, so we just need to roll-up multi-line 
          # annotation information, like gene synonyms and ontology refs
@@ -494,7 +508,7 @@ sub import_gaf
                             $annotations{$curr_line_hash{"db_object_symbol"}}
                                 {"Gene Synonyms"}{$_} = ""; }
  
-            add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data            
+            add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add add'l ontology reference data            
          }
      }
      if ($debug) { print "...<DEBUG: \%annotations>\n"
@@ -705,7 +719,7 @@ sub transform_generic
  # ---------------------------------------------------------------------------
  sub transform_gaf
  {
-    # define temaplates and their fields for Provenance-generation
+    # define templates and their fields for Provenance-generation
      my $template_field_map = {
          Annotation => [
              'Species Name',
@@ -724,7 +738,8 @@ sub transform_gaf
              'Ontology',
              'Term ID',
              'Aspect',
-            'Evidence Code'
+            'Evidence Code',
+            'Evidence'
              ]
          };
      
@@ -778,37 +793,49 @@ sub transform_gaf
              $writer->endTag("Page");
              
              # create gene synonyms page
-            if (length($annotations{$annotation}{'Gene Synonyms'}) > 0)
+            if (scalar keys (%{$annotations{$annotation}{'Gene Synonyms'}}) > 0)
              {
                  $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Gene Synonyms");
                      $writer->startTag("Template",Name=>"Gene_Synonyms");
                          $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                      $writer->endTag("Template");
      
-                    foreach (split('\|',$annotations{$annotation}{'Gene Synonyms'}))
+                    foreach my $synonym (keys %{$annotations{$annotation}{'Gene Synonyms'}})
                      {
                          $writer->startTag("Template",Name=>"Gene_Synonym_Repeater");
-                            $writer->dataElement("Field", $_, Name=>"Gene Synonym");    
+                        $writer->dataElement("Field", $synonym, Name=>"Gene Synonym");    
                          $writer->endTag("Template");
                      }
                  $writer->endTag("Page");
              }
              
              # create ont refs page
-            if (scalar(@{$annotations{$annotation}{"Ontological Reference"}}) > 0)
+            if (scalar keys (%{$annotations{$annotation}{"Ontological References"}}) > 0)
              {
                  $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Ontologies");
                      $writer->startTag("Template",Name=>"Ontological_References");
                          $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                      $writer->endTag("Template");
      
-                    foreach (@{$annotations{$annotation}{"Ontological Reference"}})
+                    foreach my $ont_term (keys %{$annotations{$annotation}{"Ontological References"}})
                      {
                          $writer->startTag("Template",Name=>"Ontological_Reference_Repeater");
-                            $writer->dataElement("Field", $ontology_name, Name=>"Ontology");
-                            $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID");
-                            $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect");
-                            $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code");
+                            $writer->dataElement("Field", $ontology_info{$curr_ontology_type}{"name"}, Name=>"Ontology");
+                            $writer->dataElement("Field", $ont_term, Name=>"Term ID");
+                            $writer->dataElement("Field", $annotations{$annotation}{"Ontological References"}{$ont_term}{"Aspect"}, Name=>"Aspect");
+
+                            my @evidence_codes;
+                            foreach my $evidence_code (keys %{$annotations{$annotation}{"Ontological References"}{$ont_term}{"Evidence Codes"}}) {
+                               push @evidence_codes, $evidence_code; 
+                            }
+                            $writer->dataElement("Field", join(', ',@evidence_codes), Name=>"Evidence Code"); 
+
+                            my @ary_evidence;
+                            foreach my $evidence (keys %{$annotations{$annotation}{"Ontological References"}{$ont_term}{"Evidence"}}) {
+                               push @ary_evidence, $evidence; 
+                            }
+                            $writer->dataElement("Field", join(', ',@ary_evidence), Name=>"Evidence");
+
                          $writer->endTag("Template");
                      }
                  $writer->endTag("Page");
@@ -820,6 +847,8 @@ sub transform_gaf
                      $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
                  $writer->endTag("Template");
                  
+                    # TODO: make further use of template map here to create provenance for SIO's
+                    
                      foreach (@{@$template_field_map{"Annotation"}})
                      {
                          $writer->startTag("Template",Name=>"Provenance_Repeater");
@@ -829,7 +858,6 @@ sub transform_gaf
                              $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source");
                          $writer->endTag("Template");
                      }
-                
              $writer->endTag("Page");
              
              $annot_title_count++;
@@ -875,7 +903,6 @@ sub show_output
  init;
  import_data;
  if ($verbose) { show_input; }
-exit(0); # TEST
  write_xml();
  if ($verbose) { show_output; }
author	preecej <preecej@localhost>
	Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)
committer	preecej <preecej@localhost>
	Thu, 25 Aug 2011 00:23:16 +0000 (00:23 +0000)