From: preecej <preecej@localhost>
Date: Wed, 24 Aug 2011 01:29:34 +0000 (+0000)
Subject: In progress; added handling for grouped ont term ids in gaf import, inc multiple... 
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=fe9fa71c93134d4274da4195347d83bc363d0451;p=old-jaiswallab-svn%2F.git

In progress; added handling for grouped ontology term ids in the GAF import, including multiple gene
synonyms, evidence codes, evidence data (with_or_from), and publication
ids (db:reference PMIDs). Still need to complete this by building the corresponding wiki XML pages
and properties.

svn path=/; revision=169
---

diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl
index 5dd0e88..df3be4f 100644
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -6,7 +6,7 @@ Planteome Annotation Wiki - Data Import Script
 
 =head1 VERSION
 
-0.2
+0.3
 
 =head1 DESCRIPTION
 
@@ -16,8 +16,10 @@ by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
 Also generates appropriate provenance of data based on a prefixed
 Source header in the import file.
 
-New to this revision: Added a GAF file type option for import; 
-accommodates ontologically-focused annotations.
+Handles GAF file type option for import; accommodates ontologically-
+focused annotations.
+
+New to this revision: GAF ontology refs are grouped by term id (in progress).
 
 =head1 USAGE
 
@@ -50,10 +52,11 @@ be imported in accordance with the input type.
     Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
     Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/...
 
-=head2 Data Header Format Examples
+=head2 Data Header Generic Format Examples
 
-    NOTE: The field separator may also be a tab instead, but must be 
-    employed consistently across the entire file.
+    NOTES: The field separator may also be a tab instead, but must be 
+    employed consistently across the entire file. GAF files do not 
+    require a [Format] section.
 
     [Format]
     Template=Annotation
@@ -249,16 +252,89 @@ sub import_generic
     }
 }
 
+# add new ontology reference data for one GAF line to %annotations;
+# takes a ref to the line hash (term_id, aspect, evidence_code, etc.)
+# ---------------------------------------------------------------------------
+sub add_ontology_ref_data($)
+{
+    my %ref_data = %{$_[0]};
+    my @evidence = split('\|', $ref_data{"with_or_from"});
+
+    # keep only the PMID entries; grep returns a filtered copy, so nothing
+    # is spliced out of the list while it is being iterated
+    my @publications = grep { substr($_,0,5) eq "PMID:" }
+        split('\|', $ref_data{"db_reference"});
+
+    # TODO: can you eliminate this if..else by compressing the hash aggregation?
+
+    # if this is the first annotation for this current gene symbol and term id, add it
+    if (!exists $annotations{$ref_data{"db_object_symbol"}}
+            {"Ontological References"}
+                {$ref_data{"term_id"}})
+    {
+        # print "*** NEW REF: $ref_data{'term_id'} ***\n"; # TEST
+
+        # add new ontology data (values are stored as hash keys, so repeats collapse)
+        my $annotation_ontology_ref = {
+            "Aspect" => $aspects{uc($ref_data{"aspect"})}, # assumes only one
+            "Evidence Codes" => { $ref_data{"evidence_code"} => "" }
+            };
+
+        # add Evidence (the pipe-separated "with_or_from" values)
+        for (@evidence) {
+            $$annotation_ontology_ref{"Evidence"}{$_} = "";
+        }
+
+        # add Publication data (PMID entries only)
+        for (@publications) {
+            $$annotation_ontology_ref{"Publications"}{$_} = ""; }
+
+        $annotations{$ref_data{"db_object_symbol"}}
+            {"Ontological References"}
+                {$ref_data{"term_id"}} = $annotation_ontology_ref;
+
+        if ($debug) { print "...<DEBUG: \%annotation_ontology_ref>\n"
+            . Dumper($annotation_ontology_ref) . "\n\n"; }
+    }
+    else # aggregate add'l evidence codes, evidence, and publications for the existing term id
+    {
+        # print "*** ADD'L REF: $ref_data{'term_id'} ***\n";  # TEST
+
+        $annotations{$ref_data{"db_object_symbol"}}
+            {"Ontological References"}
+                {$ref_data{"term_id"}}
+                {"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+
+        for (@evidence) {
+            $annotations{$ref_data{"db_object_symbol"}}
+                {"Ontological References"}
+                    {$ref_data{"term_id"}}
+                        {"Evidence"}{$_} = "";
+        }
+
+        for (@publications) {
+            $annotations{$ref_data{"db_object_symbol"}}
+                {"Ontological References"}
+                    {$ref_data{"term_id"}}
+                        {"Publications"}{$_} = ""; }
+    }
+}
+
 # read, parse, and store GAF annotations
 # ---------------------------------------------------------------------------
 sub import_gaf
 {
    	#[GAF implied data structure]
-    # singletons: db/Source (not really, but OK for now), taxon/Species ID, assigned_by, gene_type/Gene Type (later: proteins, too)
-    # unneeded: db_object_id, date, qualifier, db_reference (get PMID from here later), with_or_from (later)
-    # unvaried fields (gene-level): db_object_symbol/Gene Symbol, db_object_name/Gene Name, db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below), 
-        # varied fields (gene synonyms): db_object_synonym/Gene Synonym
-    # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, aspect/Aspect,  
+    # singletons: db/Source (not really, but OK for now), taxon/Species ID, 
+    #   assigned_by, gene_type/Gene Type (later: proteins, too)
+    # lower priority: db_object_id, date
+    # not needed: qualifier
+    # unvaried fields (gene-level): db_object_symbol/Gene Symbol, 
+    #   db_object_name/Gene Name, 
+    #   db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below), 
+    # varied fields (gene synonyms): db_object_synonym/Gene Synonym
+    # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, 
+    #   aspect/Aspect, db_reference (multi-, get PMID from here), with_or_from (multi-)
 
     # [Annotation Object Structure]
     # %annotation_universals
@@ -267,14 +343,15 @@ sub import_gaf
     #     "Gene Type"
     # %annotations
     #     "$Gene Symbol" => %annotation_properties
-	# 	                    "Gene Name" => string
-	# 						"Gene Locus"/"Source Accession ID" (first uc("AT.G"))  => string
-	# 						"Chromosome" (AT#G in "Gene Locus")  => string
-	# 						"Gene Synonyms"  => pipe-delimited string of synonyms
-	# 						"Ontological Reference" => %annotation_ontology_refs
-	# 											    		"Term ID"  => string
-	# 														"Evidence Code" => string
-	# 														"Aspect" => string
+	#     	                    "Gene Name" => string
+	# 		    				"Gene Locus"/"Source Accession ID" (first uc("AT.G"))  => string
+	# 			    			"Chromosome" (AT#G in "Gene Locus")  => string
+	# 				    		"Gene Synonyms"  => % strings
+	#                           "Ontological References" => % "$Term ID" => %annotation_ontology_ref
+	#             				    						        "Aspect" => string (assumes only one)
+	#     				                						    "Evidence Codes" => % strings
+	#                                                               "Evidence" => % strings ("with_or_from" )
+	#                                                               "Publications" => % PMID's from "db:reference", used to create separate Pub pages
     
     # loop through data rows and build hashed annotation data structure
     my $count = 0;
@@ -290,18 +367,19 @@ sub import_gaf
             . Dumper(\@curr_line_ary) . "\n\n"; }
         
         my %curr_line_hash = (
-            "db" =>                 $curr_line_ary[0], # Source
-            "db_object_symbol" =>   $curr_line_ary[2], # Gene Symbol
-            "term_id" =>              $curr_line_ary[4], # Term ID
-            "evidence_code" =>      $curr_line_ary[6], # Evidence Code
-            "aspect" =>             $curr_line_ary[8], # Aspect
-            "db_object_name" =>     $curr_line_ary[9], # Gene Name
+            "db"                => $curr_line_ary[0], # Source
+            "db_object_symbol"  => $curr_line_ary[2], # Gene Symbol
+            "term_id"           => $curr_line_ary[4], # Term ID
+            "db_reference"      => $curr_line_ary[5], # inc. Publication info (PMID)
+            "evidence_code"     => $curr_line_ary[6], # Evidence Code
+            "with_or_from"      => $curr_line_ary[7], # Evidence (data)
+            "aspect"            => $curr_line_ary[8], # Aspect
+            "db_object_name"    => $curr_line_ary[9], # Gene Name
 
             # Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
-            "db_object_synonym" =>  $curr_line_ary[10],
-
-            "db_object_type" =>     $curr_line_ary[11], # Gene Type
-            "taxon" =>              $curr_line_ary[12] # Species ID
+            "db_object_synonym" => $curr_line_ary[10],
+            "db_object_type"    => $curr_line_ary[11], # Gene Type
+            "taxon"             => $curr_line_ary[12] # Species ID
             );
 
         if ($debug) { print "...<DEBUG: \%curr_line_hash>\n"
@@ -319,6 +397,9 @@ sub import_gaf
             if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
                 . Dumper(\%annotation_universals) . "\n\n"; }
     
+            # TODO: set all ontology aspects in a multi-hash (group by ontology)
+            # and keep a flag to monitor the currently-needed ontology
+            
             # identify what kind of ontology term we're dealing with and set our 
             # aspects and ontology name accordingly
             switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
@@ -344,6 +425,9 @@ sub import_gaf
         # if not, add the new Gene Symbol and its associated props
         if (!exists $annotations{$curr_line_hash{"db_object_symbol"}})
         {
+            
+            # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
+
             # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
             my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
             
@@ -384,30 +468,34 @@ sub import_gaf
                 "Accession ID"  => $accession_id,
                 "Gene Name"     => $curr_line_hash{"db_object_name"},
                 "Gene Locus"    => $locus,
-	 			"Chromosome"    => $chromosome,
-				"Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string
+	 			"Chromosome"    => $chromosome
 			    };
 
+            # add synonyms
+			for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+			    $$annotation_properties{"Gene Synonyms"}{$_} = ""; }
+			
             if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
                 . Dumper($annotation_properties) . "\n\n"; }
 
-            
             # add new gene annotation and assign props
             $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
-        }
-        
-        # add new ontology data (this happens on every line of data)
-        my $annotation_ontology_ref = {
-            "Term ID" => $curr_line_hash{"term_id"},
-            "Aspect" => $aspects{uc($curr_line_hash{"aspect"})},
-            "Evidence Code" => $curr_line_hash{"evidence_code"}
-            };
 
-        if ($debug) { print "...<DEBUG: \@annotation_ontology_refs>\n"
-            . Dumper($annotation_ontology_ref) . "\n\n"; }
+            add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)            
+        }
+        # that Gene Symbol does exist, so we just need to roll-up multi-line 
+        # annotation information, like gene synonyms and ontology refs
+        else 
+        {
+            #print "\n*** EXISTING SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
 
-        push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref;
+            # add any add'l synonyms
+			for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+			    $annotations{$curr_line_hash{"db_object_symbol"}}
+			        {"Gene Synonyms"}{$_} = ""; }
 
+            add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data            
+        }
     }
     if ($debug) { print "...<DEBUG: \%annotations>\n"
         . Dumper(\%annotations) . "\n\n"; }
@@ -720,7 +808,7 @@ sub transform_gaf
                             $writer->dataElement("Field", $ontology_name, Name=>"Ontology");
                             $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID");
                             $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect");
-                            $writer->dataElement("Field", $$_{"Evidence Code"}, Name=>"Evidence Code");
+                            $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code");
                         $writer->endTag("Template");
                     }
                 $writer->endTag("Page");
@@ -787,6 +875,7 @@ sub show_output
 init;
 import_data;
 if ($verbose) { show_input; }
+exit(0); # TEST
 write_xml();
 if ($verbose) { show_output; }