In progress; added handling for grouped ont term ids in gaf import, inc multiple...

author preecej <preecej@localhost>

Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)

committer preecej <preecej@localhost>

Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)
author preecej <preecej@localhost>
Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)
committer preecej <preecej@localhost>
Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)
diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl

index 5dd0e8851240eabb6098e9b1bac09336c10616d8..df3be4fca727221ba1c161329bdcb9370ff9f405 100644 (file)
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -6,7 +6,7 @@ Planteome Annotation Wiki - Data Import Script
  
  =head1 VERSION
  
-0.2
+0.3
  
  =head1 DESCRIPTION
  
@@ -16,8 +16,10 @@ by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
  Also generates appropriate provenance of data based on a prefixed
  Source header in the import file.
  
-New to this revision: Added a GAF file type option for import; 
-accommodates ontologically-focused annotations.
+Handles GAF file type option for import; accommodates ontologically-
+focused annotations.
+
+New to this revision: Added handling for grouped ontology term IDs in GAF import.
  
  =head1 USAGE
  
@@ -50,10 +52,11 @@ be imported in accordance with the input type.
      Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
      Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/...
  
-=head2 Data Header Format Examples
+=head2 Data Header Generic Format Examples
  
-    NOTE: The field separator may also be a tab instead, but must be 
-    employed consistently across the entire file.
+    NOTES: The field separator may also be a tab instead, but must be 
+    employed consistently across the entire file. GAF files do not 
+    require a [Format] section.
  
      [Format]
      Template=Annotation
@@ -249,16 +252,89 @@ sub import_generic
      }
  }
  
+# Fold one GAF row's ontology reference into the global %annotations hash.
+#
+# Takes a reference to the current line hash and reads its "db_object_symbol",
+# "term_id", "aspect", "evidence_code", "with_or_from" and "db_reference"
+# entries. The data is stored under
+#     $annotations{$symbol}{"Ontological References"}{$term_id}
+# as "Aspect" (set once, by the first row seen for that term) plus the hashes
+# "Evidence Codes", "Evidence" and "Publications", whose keys are the values
+# themselves (mapped to "") so that repeated rows de-duplicate.
+#
+# Called for its side effect only; the calls in import_gaf ignore the return
+# value.
+# ---------------------------------------------------------------------------
+sub add_ontology_ref_data($)
+{
+    my %ref_data = %{$_[0]};
+    my $symbol   = $ref_data{"db_object_symbol"};
+    my $term_id  = $ref_data{"term_id"};
+
+    # "with_or_from" and "db_reference" are pipe-delimited, multi-value fields
+    my @evidence = split('\|', $ref_data{"with_or_from"});
+
+    # Keep only the PubMed references. This has to build a new list: the old
+    # in-loop splice(@publications,$_,1) passed the element *string* as the
+    # offset (which numifies, normally to 0), so it dropped the first entry
+    # instead of the non-PMID one, and it resized the array while iterating it.
+    my @publications = grep { substr($_,0,5) eq "PMID:" }
+        split('\|', $ref_data{"db_reference"});
+
+    # all term ids recorded so far for this gene symbol
+    my $term_refs = $annotations{$symbol}{"Ontological References"} ||= {};
+
+    # first annotation for this gene symbol and term id?
+    my $is_new_ref = !exists $$term_refs{$term_id};
+
+    if ($is_new_ref)
+    {
+        # print "*** NEW REF: $term_id ***\n"; # TEST
+        $$term_refs{$term_id} = {
+            "Aspect" => $aspects{uc($ref_data{"aspect"})} # assumes only one
+            };
+    }
+    # else: print "*** ADD'L REF: $term_id ***\n";  # TEST
+
+    # From here on new and existing references are handled identically: the
+    # row's values are added as hash keys, which aggregates across rows.
+    my $annotation_ontology_ref = $$term_refs{$term_id};
+
+    # add Evidence Code
+    $$annotation_ontology_ref{"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+
+    # add Evidence
+    for (@evidence) {
+        $$annotation_ontology_ref{"Evidence"}{$_} = "";
+    }
+
+    # add Publication data
+    for (@publications) {
+        $$annotation_ontology_ref{"Publications"}{$_} = "";
+    }
+
+    # as before, the debug dump is printed only when a reference is created
+    if ($debug && $is_new_ref) {
+        print "...<DEBUG: \%annotation_ontology_ref>\n"
+            . Dumper($annotation_ontology_ref) . "\n\n";
+    }
+}
+
  # read, parse, and store GAF annotations
  # ---------------------------------------------------------------------------
  sub import_gaf
  {
         #[GAF implied data structure]
-    # singletons: db/Source (not really, but OK for now), taxon/Species ID, assigned_by, gene_type/Gene Type (later: proteins, too)
-    # unneeded: db_object_id, date, qualifier, db_reference (get PMID from here later), with_or_from (later)
-    # unvaried fields (gene-level): db_object_symbol/Gene Symbol, db_object_name/Gene Name, db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below), 
-        # varied fields (gene synonyms): db_object_synonym/Gene Synonym
-    # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, aspect/Aspect,  
+    # singletons: db/Source (not really, but OK for now), taxon/Species ID, 
+    #   assigned_by, gene_type/Gene Type (later: proteins, too)
+    # lower priority: db_object_id, date
+    # not needed: qualifier
+    # unvaried fields (gene-level): db_object_symbol/Gene Symbol, 
+    #   db_object_name/Gene Name, 
+    #   db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below), 
+    # varied fields (gene synonyms): db_object_synonym/Gene Synonym
+    # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, 
+    #   aspect/Aspect, db_reference (multi-, get PMID from here), with_or_from (multi-)
  
      # [Annotation Object Structure]
      # %annotation_universals
@@ -267,14 +343,15 @@ sub import_gaf
      #     "Gene Type"
      # %annotations
      #     "$Gene Symbol" => %annotation_properties
-       #                           "Gene Name" => string
-       #                                               "Gene Locus"/"Source Accession ID" (first uc("AT.G"))  => string
-       #                                               "Chromosome" (AT#G in "Gene Locus")  => string
-       #                                               "Gene Synonyms"  => pipe-delimited string of synonyms
-       #                                               "Ontological Reference" => %annotation_ontology_refs
-       #                                                                                                       "Term ID"  => string
-       #                                                                                                               "Evidence Code" => string
-       #                                                                                                               "Aspect" => string
+       #                           "Gene Name" => string
+       #                                               "Gene Locus"/"Source Accession ID" (first uc("AT.G"))  => string
+       #                                               "Chromosome" (AT#G in "Gene Locus")  => string
+       #                                               "Gene Synonyms"  => % strings
+       #                           "Ontological References" => % "$Term ID" => %annotation_ontology_ref
+       #                                                                                               "Aspect" => string (assumes only one)
+       #                                                                                                   "Evidence Codes" => % strings
+       #                                                               "Evidence" => % strings ("with_or_from" )
+       #                                                               "Publications" => % PMIDs from "db_reference", used to create separate Pub pages
      
      # loop through data rows and build hashed annotation data structure
      my $count = 0;
@@ -290,18 +367,19 @@ sub import_gaf
              . Dumper(\@curr_line_ary) . "\n\n"; }
          
          my %curr_line_hash = (
-            "db" =>                 $curr_line_ary[0], # Source
-            "db_object_symbol" =>   $curr_line_ary[2], # Gene Symbol
-            "term_id" =>              $curr_line_ary[4], # Term ID
-            "evidence_code" =>      $curr_line_ary[6], # Evidence Code
-            "aspect" =>             $curr_line_ary[8], # Aspect
-            "db_object_name" =>     $curr_line_ary[9], # Gene Name
+            "db"                => $curr_line_ary[0], # Source
+            "db_object_symbol"  => $curr_line_ary[2], # Gene Symbol
+            "term_id"           => $curr_line_ary[4], # Term ID
+            "db_reference"      => $curr_line_ary[5], # inc. Publication info (PMID)
+            "evidence_code"     => $curr_line_ary[6], # Evidence Code
+            "with_or_from"      => $curr_line_ary[7], # Evidence (data)
+            "aspect"            => $curr_line_ary[8], # Aspect
+            "db_object_name"    => $curr_line_ary[9], # Gene Name
  
              # Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
-            "db_object_synonym" =>  $curr_line_ary[10],
-
-            "db_object_type" =>     $curr_line_ary[11], # Gene Type
-            "taxon" =>              $curr_line_ary[12] # Species ID
+            "db_object_synonym" => $curr_line_ary[10],
+            "db_object_type"    => $curr_line_ary[11], # Gene Type
+            "taxon"             => $curr_line_ary[12] # Species ID
              );
  
          if ($debug) { print "...<DEBUG: \%curr_line_hash>\n"
@@ -319,6 +397,9 @@ sub import_gaf
              if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
                  . Dumper(\%annotation_universals) . "\n\n"; }
      
+            # TODO: set all ontology aspects in a multi-hash (group by ontology)
+            # and keep a flag to monitor the currently-needed ontology
+            
              # identify what kind of ontology term we're dealing with and set our 
              # aspects and ontology name accordingly
              switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
@@ -344,6 +425,9 @@ sub import_gaf
          # if not, add the new Gene Symbol and its associated props
          if (!exists $annotations{$curr_line_hash{"db_object_symbol"}})
          {
+            
+            # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
+
              # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
              my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
              
@@ -384,30 +468,34 @@ sub import_gaf
                  "Accession ID"  => $accession_id,
                  "Gene Name"     => $curr_line_hash{"db_object_name"},
                  "Gene Locus"    => $locus,
-                               "Chromosome"    => $chromosome,
-                               "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string
+                               "Chromosome"    => $chromosome
                             };
  
+            # add synonyms
+                       for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+                           $$annotation_properties{"Gene Synonyms"}{$_} = ""; }
+                       
              if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
                  . Dumper($annotation_properties) . "\n\n"; }
  
-            
              # add new gene annotation and assign props
              $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
-        }
-        
-        # add new ontology data (this happens on every line of data)
-        my $annotation_ontology_ref = {
-            "Term ID" => $curr_line_hash{"term_id"},
-            "Aspect" => $aspects{uc($curr_line_hash{"aspect"})},
-            "Evidence Code" => $curr_line_hash{"evidence_code"}
-            };
  
-        if ($debug) { print "...<DEBUG: \@annotation_ontology_refs>\n"
-            . Dumper($annotation_ontology_ref) . "\n\n"; }
+            add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)            
+        }
+        # that Gene Symbol does exist, so we just need to roll-up multi-line 
+        # annotation information, like gene synonyms and ontology refs
+        else 
+        {
+            #print "\n*** EXISTING SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
  
-        push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref;
+            # add any add'l synonyms
+                       for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+                           $annotations{$curr_line_hash{"db_object_symbol"}}
+                               {"Gene Synonyms"}{$_} = ""; }
  
+            add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data            
+        }
      }
      if ($debug) { print "...<DEBUG: \%annotations>\n"
          . Dumper(\%annotations) . "\n\n"; }
@@ -720,7 +808,7 @@ sub transform_gaf
                              $writer->dataElement("Field", $ontology_name, Name=>"Ontology");
                              $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID");
                              $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect");
-                            $writer->dataElement("Field", $$_{"Evidence Code"}, Name=>"Evidence Code");
+                            $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code");
                          $writer->endTag("Template");
                      }
                  $writer->endTag("Page");
@@ -787,6 +875,7 @@ sub show_output
  init;
  import_data;
  if ($verbose) { show_input; }
+exit(0); # TEST: early exit -- write_xml() and show_output below never run while this line is present
  write_xml();
  if ($verbose) { show_output; }
author	preecej <preecej@localhost>
	Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)
committer	preecej <preecej@localhost>
	Wed, 24 Aug 2011 01:29:34 +0000 (01:29 +0000)