Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Began XML doc write using XML::Writer
author: preecej <preecej@localhost>
Fri, 29 Jul 2011 01:42:52 +0000 (01:42 +0000)
committer: preecej <preecej@localhost>
Fri, 29 Jul 2011 01:42:52 +0000 (01:42 +0000)
svn path=/; revision=129

preecej/semantic_wiki/paw_TransformForImport.pl

index 76053928dcd33fd3bf4108012ff1eaec846712bc..cdbf52083c7739ef7689e0caa0caacabfe4f4cd2 100644 (file)
@@ -98,7 +98,8 @@ use Data::Dumper;
 
 # specific
 use XML::Smart;
-use XML::DOM;
+use XML::Writer;
+use IO::File;
 
 # ---------------------------------------------------------------------------
 # declarations
@@ -116,7 +117,6 @@ my $debug = 0; # debugging switch
 
 # universal 
 my %source;
-my $xml;
 my $output_data;
 my %aspects;
 
@@ -124,17 +124,20 @@ my %aspects;
 my $template_name;
 my @field_names;
 my @field_data;
+my $xml; # represents the xml doc
 
 # gaf
 my %annotation_universals; # holds values assumed not to vary across the file
 my %annotations; # keyed on Gene Symbol
 
-# other config constants
+# other config constants (including temporary arrangements)
 
 # page id seeds, until I can figure out how to auto-increment w/in the import 
 # script
-my $SOURCE_TITLE_SEED = 3; 
-my $ANNOT_TITLE_SEED = 3;
+my $SOURCE_TITLE_SEED = 4; 
+my $ANNOT_TITLE_SEED = 14;
+
+my $species_name = "Arabidopsis thaliana"; # obviously temporary; need an NCBI lookup
 
 $Data::Dumper::Pad = "... "; 
 
@@ -195,7 +198,7 @@ sub init
         . "Designated input file type: $file_type\n"
         . "Output File: $output_file\n"
         . "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n"
-        . "Running in debug mode? " . ($verbose ? "Yes" : "No") . "\n"
+        . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n"
         . "\n"
         . "------------------------------------------------------------\n"
         . "------------------------------------------------------------\n"
@@ -306,9 +309,10 @@ sub import_gaf
         # grab the unvaried values from the first line
         if ($count == 1) {
             %annotation_universals = (
-                "Source"    => $curr_line_hash{"db"},
-                "Gene Type" => $curr_line_hash{"db_object_type"},
-                "Species"   => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1]
+                "Source"       => $curr_line_hash{"db"}, # currently not in use
+                "Gene Type"    => $curr_line_hash{"db_object_type"},
+                "Species ID"   => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1],
+                "Species Name" => $species_name # TODO: get this from NCBI
                 );
             
             if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
@@ -371,7 +375,7 @@ sub import_gaf
                 "Gene Name"     => $curr_line_hash{"db_object_name"},
                 "Gene Locus"    => $locus, # also used for Source Accession ID
                                "Chromosome"    => $chromosome,
-                               "Gene Synonyms" => $curr_line_hash{"db_object_synonym"}, # pipe-delimited string
+                               "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string
                            };
 
             if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
@@ -603,32 +607,98 @@ sub transform_generic
 # ---------------------------------------------------------------------------
 sub transform_gaf
 {
-    # 
-    
-    # my $parser = new XML::DOM::Parser;
-    # my $doc = $parser->parsefile ("file.xml");
-    
-    # # print all HREF attributes of all CODEBASE elements
-    # my $nodes = $doc->getElementsByTagName ("CODEBASE");
-    # my $n = $nodes->getLength;
-    
-    # for (my $i = 0; $i < $n; $i++)
-    # {
-    #  my $node = $nodes->item ($i);
-    #  my $href = $node->getAttributeNode ("HREF");
-    #  print $href->getValue . "\n";
-    # }
-    
-    # # Print doc file
-    # $doc->printToFile ("out.xml");
-    
-    # # Print to string
-    # print $doc->toString;
+    # create new xml doc, write to string
+    my $writer = new XML::Writer(
+        OUTPUT      => \$output_data,
+        DATA_MODE   => 1,
+        DATA_INDENT => 4,
+        ENCODING    => 'utf-8'
+        );
+
+    # create root elements
+    $writer->xmlDecl;
+    $writer->startTag("Pages");
+
+        # create source page
+        $writer->startTag("Page",Title=>"Source:$SOURCE_TITLE_SEED");
+            $writer->startTag("Template",Name=>"Source");
+
+            # iterate the source hash for element name attribs and vals
+            my @pretty_elements;
+            foreach my $element (keys %source)
+            {
+                # split on CamelCase (saves a few lines and was fun to write)
+                for ($element) {
+                    @pretty_elements = /[A-Z](?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
+                }
+                $writer->startTag("Field",Name=>"@pretty_elements");
+                    $writer->characters($source{$element});
+                $writer->endTag("Field");
+            }
+
+            $writer->endTag("Template");
+        $writer->endTag("Page");
     
-    # # Avoid memory leaks - cleanup circular references for garbage collection
-    # $doc->dispose;
+        # iterate %annotations
+        my $annot_title_count = $ANNOT_TITLE_SEED;
+        
+        foreach my $annotation (keys %annotations)
+        {
+            #my %curr_annot = \$annotation; # ?
+
+            # create annotation page
+            $writer->startTag("Page",Title=>"Annotation:$annot_title_count");
+                $writer->startTag("Template",Name=>"Annotation");
+
+                $writer->startTag("Field",Name=>"Species Name");
+                $writer->characters($annotation_universals{'Species Name'});
+                $writer->endTag("Field");
+                $writer->startTag("Field",Name=>"Species ID");
+                    $writer->characters($annotation_universals{'Species ID'});
+                $writer->endTag("Field");
+                $writer->startTag("Field",Name=>"Gene Symbol");
+                    $writer->characters($annotation);
+                $writer->endTag("Field");
+                # $writer->startTag("Field",Name=>"Gene Name");
+                #     $writer->characters();
+                # $writer->endTag("Field");
+                # $writer->startTag("Field",Name=>"Gene Locus");
+                #     $writer->characters();
+                # $writer->endTag("Field");
+                $writer->startTag("Field",Name=>"Gene Type");
+                    $writer->characters($annotation_universals{'Gene Type'});
+                $writer->endTag("Field");
+                # $writer->startTag("Field",Name=>"Chromosome");
+                #     $writer->characters();
+                # $writer->endTag("Field");
+                $writer->startTag("Field",Name=>"Has Phenotype");
+                    $writer->characters("No");
+                $writer->endTag("Field");
+
+                $writer->endTag("Template");
+            $writer->endTag("Page");
+            
+            # start provenance page (separate node and attach at the end?)
+            
+            # iterate synonyms 
+                # create gene synonyms page
+                # add to provenance
+            
+            # iterate @%ont refs
+                # create ont ref
+                # add to provenance
 
-    $output_data = "hullo, gaf";
+            $annot_title_count++;
+        }
+        
+    # close doc
+    $writer->endTag("Pages");
+    $writer->end();
+
+    # write doc to file
+    open(OUTPUT_FILE,">$output_file");
+    print OUTPUT_FILE $output_data;
+    close OUTPUT_FILE;
 }