From e35d6e9662cb6f20d730bd531ad8e7ad28a18bd7 Mon Sep 17 00:00:00 2001
From: preecej <preecej@localhost>
Date: Thu, 28 Jul 2011 18:36:18 +0000
Subject: [PATCH] Completed data read-in of GAF file
 (paw_TransformForImport.pl)

svn path=/; revision=126
---
 preecej/php_singletons/cmd_line_test.php      |  14 +
 .../semantic_wiki/paw_TransformForImport.pl   | 330 +++++++++++++++---
 2 files changed, 287 insertions(+), 57 deletions(-)
 create mode 100644 preecej/php_singletons/cmd_line_test.php

diff --git a/preecej/php_singletons/cmd_line_test.php b/preecej/php_singletons/cmd_line_test.php
new file mode 100644
index 0000000..0a58b4e
--- /dev/null
+++ b/preecej/php_singletons/cmd_line_test.php
@@ -0,0 +1,14 @@
+<?php
+// Command-line smoke test: load a GPML pathway file and print its Author attribute.
+print "test start\n";
+// DOMDocument::load() returns false on failure, so stop rather than query an empty document.
+$doc = new DOMDocument();
+$doc->load('/home/preecej/Documents/projects/pathvisio/Ath_scratch.gpml') or exit("Could not load GPML file\n");
+// A document without a <Pathway> element yields an empty node list; item(0) would then be null.
+$entry = $doc->getElementsByTagName("Pathway");
+$author = ($entry->length > 0) ? $entry->item(0)->getAttribute("Author") : "";
+print  "Author:$author\n";
+
+print "test over\n";
+
+?>
diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl
index 92a4713..7605392 100644
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -94,6 +94,7 @@ use strict;
 use Cwd;
 use Switch;
 use Getopt::Std;
+use Data::Dumper;
 
 # specific
 use XML::Smart;
@@ -111,16 +112,31 @@ my $output_file;
 my $verbose = 0; # flag for verbose output
 my $debug = 0; # debugging switch
 
-# data storage
+# global data storage ----------
+
+# universal 
 my %source;
+my $xml;
+my $output_data;
+my %aspects;
+
+# tab, csv
 my $template_name;
 my @field_names;
 my @field_data;
-my $xml;
-my $output_data;
 
-my $source_title_seed = 3;
-my $annot_title_seed = 3;
+# gaf
+my %annotation_universals; # holds values assumed not to vary across the file
+my %annotations; # keyed on Gene Symbol
+
+# other config constants
+
+# page id seeds, until I can figure out how to auto-increment w/in the import 
+# script
+my $SOURCE_TITLE_SEED = 3; 
+my $ANNOT_TITLE_SEED = 3;
+
+$Data::Dumper::Pad = "... "; 
 
 # ---------------------------------------------------------------------------
 # functions
@@ -188,19 +204,14 @@ sub init
 }
 
 
-# read, parse, and store source, template, and annotations
+# read, parse, and store generic CSV and tab templates and annotations
 # ---------------------------------------------------------------------------
-sub import_data
+sub import_generic
 {
-    print "Opening input file and reading header info...\n\n";
-
-    # open file
-    open(INPUT_FILE,$input_file) or die("Could not open input file.");
-
-    my $line; # all-purpose line counter
-    
-    # read in the source data
+    # read in "[Format] section...
+    my $line;
     my $count = 0;
+    
     while (<INPUT_FILE>)
     {
        $count++;
@@ -208,39 +219,13 @@ sub import_data
        chomp $line;
        my $data_val = (split('=',$line))[1];
        switch ($count) {
-           case 2 { $source{'SourceDateStamp'} = $data_val; }
-           case 3 { $source{'SourceDatabase'} = $data_val; }
-           case 4 { $source{'SourceVersion'} = $data_val; }
-           case 5 { $source{'SourceURI'} = $data_val; }
-           case 6 { $source{'SourceFile'} = $data_val; }
+           case 2 { $template_name = $data_val; }
+           case 3 { @field_names = split($file_del,$data_val); }
            else {;}
        }
-       if ($count == 6) { last; }
-    }
-
-    # read in "[Format] section if filetype is tab or csv"
-    $count++;
-    if ($file_type =~ /(csv)|(tab)/) { 
-
-        $count = 0;
-        while (<INPUT_FILE>)
-        {
-           $count++;
-           $line = $_;
-           chomp $line;
-           my $data_val = (split('=',$line))[1];
-           switch ($count) {
-               case 2 { $template_name = $data_val; }
-               case 3 { @field_names = split($file_del,$data_val); }
-               else {;}
-           }
-           if ($count == 3) { last; }
-        }
+       if ($count == 3) { last; }
     }
    
-    print "Reading data...\n\n";
-    $line = <INPUT_FILE>; # skip "[Data]"
-    
     # loop through data rows and add all data fields to an array of hashes
     while (<INPUT_FILE>)
     {
@@ -258,6 +243,200 @@ sub import_data
         }
         push @field_data, \%tmp_hash;
     }
+}
+
+# read, parse, and store GAF annotations
+# ---------------------------------------------------------------------------
+sub import_gaf
+{
+    # [GAF implied data structure]
+    # singletons: db/Source (not really, but OK for now), taxon/Species ID, assigned_by, gene_type/Gene Type (later: proteins, too)
+    # unneeded: db_object_id, date, qualifier, db_reference (get PMID from here later), with_or_from (later)
+    # unvaried fields (gene-level): db_object_symbol/Gene Symbol, db_object_name/Gene Name, db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below),
+        # varied fields (gene synonyms): db_object_synonym/Gene Synonym
+    # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, aspect/Aspect
+
+    # [Annotation Object Structure]
+    # %annotation_universals
+    #     "Source"
+    #     "Species" (stored as "NCBI:<taxon number>")
+    #     "Gene Type"
+    # %annotations
+    #     "$Gene Symbol" => %annotation_properties
+    #                       "Gene Name" => string
+    #                       "Gene Locus"/"Source Accession ID" (first uc("AT.G"))  => string
+    #                       "Chromosome" (AT#G in "Gene Locus")  => string
+    #                       "Gene Synonyms"  => pipe-delimited string of synonyms
+    #                       "Ontological Reference" => array of hash refs, one per data row:
+    #                                                   "Term ID"  => string
+    #                                                   "Evidence Code" => string
+    #                                                   "Aspect" => string
+
+    # loop through data rows and build hashed annotation data structure
+    my $count = 0;
+
+    while (<INPUT_FILE>)
+    {
+        my $line = $_;
+        chomp $line;
+        next if $line =~ /^\s*$/ or $line =~ /^!/; # skip blank lines and GAF "!" comment lines
+        $count++; # counts data rows only, so row 1 is always real data
+        my @curr_line_ary = split("\t", $line);
+        if ($debug) { print "...<DEBUG: \@curr_line_ary>\n"
+            . Dumper(\@curr_line_ary) . "\n\n"; }
+
+        my %curr_line_hash = (
+            "db" =>                 $curr_line_ary[0], # Source
+            "db_object_symbol" =>   $curr_line_ary[2], # Gene Symbol
+            "term_id" =>            $curr_line_ary[4], # Term ID
+            "evidence_code" =>      $curr_line_ary[6], # Evidence Code
+            "aspect" =>             $curr_line_ary[8], # Aspect
+            "db_object_name" =>     $curr_line_ary[9], # Gene Name
+
+            # Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
+            "db_object_synonym" =>  $curr_line_ary[10],
+
+            "db_object_type" =>     $curr_line_ary[11], # Gene Type
+            "taxon" =>              $curr_line_ary[12] # Species ID
+            );
+
+        if ($debug) { print "...<DEBUG: \%curr_line_hash>\n"
+            . Dumper(\%curr_line_hash) . "\n\n"; }
+
+        # grab the unvaried values from the first data row
+        if ($count == 1) {
+            %annotation_universals = (
+                "Source"    => $curr_line_hash{"db"},
+                "Gene Type" => $curr_line_hash{"db_object_type"},
+                "Species"   => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1]
+                );
+
+            if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
+                . Dumper(\%annotation_universals) . "\n\n"; }
+
+            # identify what kind of ontology term we're dealing with and set our
+            # aspect names accordingly
+            switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
+                case 'GO' { # Gene Ontology
+                    %aspects = (
+                        P => "Biological Process",
+                        C => "Cellular Component",
+                        F => "Molecular Function"
+                    );
+                }
+                case 'PO' { # Plant Ontology
+                    %aspects = (
+                        A => "Plant Anatomy",
+                        G => "Plant Growth and Development Stage"
+                    );
+                }
+            }
+        }
+
+        # check to see if Gene Symbol hash key exists (for grouping)
+        # if not, add the new Gene Symbol and its associated props
+        if (!exists $annotations{$curr_line_hash{"db_object_symbol"}})
+        {
+            # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
+            my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
+
+            if ($debug) { print "...<DEBUG: \@synonyms>\n"
+                . Dumper(\@synonyms) . "\n\n"; }
+
+            # find the gene locus, if it is listed (first "AT.G")
+            my @loci = grep /[Aa][Tt].[Gg]/, @synonyms;
+
+            if ($debug) { print "...<DEBUG: \@loci>\n"
+                . Dumper(\@loci) . "\n\n"; }
+
+            my $locus = "";
+
+            if (scalar(@loci) > 0) # we have at least one match; use the first one
+            {
+                $locus = $loci[0];
+            }
+            else # no match; attempt to use the Gene Symbol instead
+            {
+                if ($curr_line_hash{"db_object_symbol"} =~ /[Aa][Tt].[Gg]/)
+                {
+                    # split on a literal dot (a bare '.' is a regex matching any char) to drop the variant/allele signifier
+                    $locus = (split(/\./,$curr_line_hash{"db_object_symbol"}))[0];
+                }
+            }
+            # chromosome = third char in locus, if it exists
+            my $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
+
+            # set up props
+            my $annotation_properties = {
+                "Gene Name"     => $curr_line_hash{"db_object_name"},
+                "Gene Locus"    => $locus, # also used for Source Accession ID
+                "Chromosome"    => $chromosome,
+                "Gene Synonyms" => $curr_line_hash{"db_object_synonym"}, # pipe-delimited string
+                };
+
+            if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
+                . Dumper($annotation_properties) . "\n\n"; }
+
+
+            # add new gene annotation and assign props
+            $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
+        }
+
+        # add new ontology data (this happens on every line of data)
+        my $annotation_ontology_ref = {
+            "Term ID" => $curr_line_hash{"term_id"},
+            "Aspect" => $aspects{uc($curr_line_hash{"aspect"})},
+            "Evidence Code" => $curr_line_hash{"evidence_code"}
+            };
+
+        if ($debug) { print "...<DEBUG: \%annotation_ontology_ref>\n"
+            . Dumper($annotation_ontology_ref) . "\n\n"; }
+
+        push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref;
+
+    }
+    if ($debug) { print "...<DEBUG: \%annotations>\n"
+        . Dumper(\%annotations) . "\n\n"; }
+}
+
+
+# read, parse, and store source
+# ---------------------------------------------------------------------------
+sub import_data
+{
+    print "Opening input file and reading header info...\n\n";
+
+    # open file (3-arg form, so the file name can never be parsed as an open mode)
+    open(INPUT_FILE, '<', $input_file) or die("Could not open input file: $!");
+
+    # read in the source data (line 1 is the section header; lines 2-6 are key=value pairs)
+    my $count = 0;
+    my $line;
+    while (<INPUT_FILE>)
+    {
+       $count++;
+       $line = $_;
+       chomp $line;
+       my $data_val = (split('=',$line,2))[1]; # limit 2: keep any '=' inside the value (e.g. a URI query string)
+       switch ($count) {
+           case 2 { $source{'SourceDateStamp'} = $data_val; }
+           case 3 { $source{'SourceDatabase'} = $data_val; }
+           case 4 { $source{'SourceVersion'} = $data_val; }
+           case 5 { $source{'SourceURI'} = $data_val; }
+           case 6 { $source{'SourceFile'} = $data_val; }
+           else {;}
+       }
+       if ($count == 6) { last; }
+    }
+
+    print "Reading data...\n\n";
+    $line = <INPUT_FILE>; # skip "[Data]"
+
+    switch ($file_type) { # dispatch to the format-specific reader
+        case ['csv', 'tab'] { import_generic(); } # array ref = match any; ('csv' || 'tab') is just 'csv'
+        case 'gaf'          { import_gaf(); }
+    }
+
     close INPUT_FILE;
 }
 
@@ -272,17 +451,29 @@ sub show_input
     }
     print "\n";
 
-    if ($file_type =~ /(csv)|(tab)/) { 
-        print "[Template]\n$template_name\n\n";
-        print "[Fields]\n" . join(', ',@field_names) . "\n\n";
-    }
-        
-    print "[Data]\n";
-    foreach my $row (@field_data) {
-        foreach my $key (keys %$row) {
-            print "$key => " . $row->{$key} . "\n";
+    switch ($file_type) {
+        case ['csv', 'tab'] { # array ref = match any; ('csv' || 'tab') evaluates to just 'csv'
+            print "[Template]\n$template_name\n\n";
+            print "[Fields]\n" . join(', ',@field_names) . "\n\n";
+            
+            print "[Data]\n";
+            foreach my $row (@field_data) {
+                foreach my $key (keys %$row) {
+                    print "$key => " . $row->{$key} . "\n";
+                }
+                print "\n";
+            }
+        }
+        case 'gaf' {
+            print "[Data]\n";
+
+            for my $key (keys %annotation_universals) {
+                print "$key: " . $annotation_universals{$key} . "\n";
+            }
+            print "\n";
+            
+            print "[Annotations]\n" . Dumper(\%annotations) . "\n\n";
         }
-        print "\n";
     }
     print "\n";
 }
@@ -302,7 +493,7 @@ sub transform_generic
 
     # set root element, source page and elements
     # (temp set of page title until moved to import extension) 
-    $xml->{Pages}{Page} = {Title => "Source:$source_title_seed"};
+    $xml->{Pages}{Page} = {Title => "Source:$SOURCE_TITLE_SEED"};
 
     $curr_node = $xml->{Pages}{Page};
     $curr_node->{Template} = {Name => 'Source'};
@@ -323,7 +514,7 @@ sub transform_generic
     
     if ($debug) { print "Current node: " . $curr_node->path . "\n"; }
     
-    my $next_page_title_id = $annot_title_seed;
+    my $next_page_title_id = $ANNOT_TITLE_SEED;
     
     # iterate through the data
     foreach my $row (@field_data) {
@@ -388,7 +579,7 @@ sub transform_generic
 
             # assign the relevant provenance field data            
             $curr_prov_node->{Field}[0] = {Name => 'Source'};
-            $curr_prov_node->{Field}[0]->content(0,"Source:$source_title_seed");
+            $curr_prov_node->{Field}[0]->content(0,"Source:$SOURCE_TITLE_SEED");
             $curr_prov_node->{Field}[1] = {Name => 'Source Accession ID'};
             $curr_prov_node->{Field}[1]->content(0,$curr_accession_id);
             $curr_prov_node->{Field}[2] = {Name => 'Source Template'};
@@ -412,6 +603,31 @@ sub transform_generic
 # ---------------------------------------------------------------------------
 sub transform_gaf
 {
+    # TODO: not implemented yet -- this stub only stores a placeholder string in
+    # $output_data; the commented-out XML::DOM sample below is never executed.
+    # my $parser = new XML::DOM::Parser;
+    # my $doc = $parser->parsefile ("file.xml");
+    
+    # # print all HREF attributes of all CODEBASE elements
+    # my $nodes = $doc->getElementsByTagName ("CODEBASE");
+    # my $n = $nodes->getLength;
+    
+    # for (my $i = 0; $i < $n; $i++)
+    # {
+    #  my $node = $nodes->item ($i);
+    #  my $href = $node->getAttributeNode ("HREF");
+    #  print $href->getValue . "\n";
+    # }
+    
+    # # Print doc file
+    # $doc->printToFile ("out.xml");
+    
+    # # Print to string
+    # print $doc->toString;
+    
+    # # Avoid memory leaks - cleanup circular references for garbage collection
+    # $doc->dispose;
+
     $output_data = "hullo, gaf";
 }
 
-- 
2.34.1