From 79cd553e96fefd5bfbac5957a170dbc50fbb5882 Mon Sep 17 00:00:00 2001 From: preecej Date: Fri, 29 Jul 2011 01:42:52 +0000 Subject: [PATCH] Began XML doc write using XML::Writer svn path=/; revision=129 --- .../semantic_wiki/paw_TransformForImport.pl | 138 +++++++++++++----- 1 file changed, 104 insertions(+), 34 deletions(-) diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index 7605392..cdbf520 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -98,7 +98,8 @@ use Data::Dumper; # specific use XML::Smart; -use XML::DOM; +use XML::Writer; +use IO::File; # --------------------------------------------------------------------------- # declarations @@ -116,7 +117,6 @@ my $debug = 0; # debugging switch # universal my %source; -my $xml; my $output_data; my %aspects; @@ -124,17 +124,20 @@ my %aspects; my $template_name; my @field_names; my @field_data; +my $xml; # represents the xml doc # gaf my %annotation_universals; # holds values assumed not to vary across the file my %annotations; # keyed on Gene Symbol -# other config constants +# other config constants (including temporary arrangements) # page id seeds, until I can figure out how to auto-increment w/in the import # script -my $SOURCE_TITLE_SEED = 3; -my $ANNOT_TITLE_SEED = 3; +my $SOURCE_TITLE_SEED = 4; +my $ANNOT_TITLE_SEED = 14; + +my $species_name = "Arabidopsis thaliana"; # obviously temporary; need an NCBI lookup $Data::Dumper::Pad = "... "; @@ -195,7 +198,7 @@ sub init . "Designated input file type: $file_type\n" . "Output File: $output_file\n" . "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n" - . "Running in debug mode? " . ($verbose ? "Yes" : "No") . "\n" + . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n" . "\n" . "------------------------------------------------------------\n" . "------------------------------------------------------------\n" @@ -306,9 +309,10 @@ sub import_gaf # grab the unvaried values from the first line if ($count == 1) { %annotation_universals = ( - "Source" => $curr_line_hash{"db"}, - "Gene Type" => $curr_line_hash{"db_object_type"}, - "Species" => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1] + "Source" => $curr_line_hash{"db"}, # currently not in use + "Gene Type" => $curr_line_hash{"db_object_type"}, + "Species ID" => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1], + "Species Name" => $species_name # TODO: get this from NCBI ); if ($debug) { print "...\n" @@ -371,7 +375,7 @@ sub import_gaf "Gene Name" => $curr_line_hash{"db_object_name"}, "Gene Locus" => $locus, # also used for Source Accession ID "Chromosome" => $chromosome, - "Gene Synonyms" => $curr_line_hash{"db_object_synonym"}, # pipe-delimited string + "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string }; if ($debug) { print "...\n" @@ -603,32 +607,98 @@ sub transform_generic # --------------------------------------------------------------------------- sub transform_gaf { - # - - # my $parser = new XML::DOM::Parser; - # my $doc = $parser->parsefile ("file.xml"); - - # # print all HREF attributes of all CODEBASE elements - # my $nodes = $doc->getElementsByTagName ("CODEBASE"); - # my $n = $nodes->getLength; - - # for (my $i = 0; $i < $n; $i++) - # { - # my $node = $nodes->item ($i); - # my $href = $node->getAttributeNode ("HREF"); - # print $href->getValue . "\n"; - # } - - # # Print doc file - # $doc->printToFile ("out.xml"); - - # # Print to string - # print $doc->toString; + # create new xml doc, write to string + my $writer = new XML::Writer( + OUTPUT => \$output_data, + DATA_MODE => 1, + DATA_INDENT => 4, + ENCODING => 'utf-8' + ); + + # create root elements + $writer->xmlDecl; + $writer->startTag("Pages"); + + # create source page + $writer->startTag("Page",Title=>"Source:$SOURCE_TITLE_SEED"); + $writer->startTag("Template",Name=>"Source"); + + # iterate the source hash for element name attribs and vals + my @pretty_elements; + foreach my $element (keys %source) + { + # split on CamelCase (saves a few lines and was fun to write) + for ($element) { + @pretty_elements = /[A-Z](?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; + } + $writer->startTag("Field",Name=>"@pretty_elements"); + $writer->characters($source{$element}); + $writer->endTag("Field"); + } + + $writer->endTag("Template"); + $writer->endTag("Page"); - # # Avoid memory leaks - cleanup circular references for garbage collection - # $doc->dispose; + # iterate %annotations + my $annot_title_count = $ANNOT_TITLE_SEED; + + foreach my $annotation (keys %annotations) + { + #my %curr_annot = \$annotation; # ? + + # create annotation page + $writer->startTag("Page",Title=>"Annotation:$annot_title_count"); + $writer->startTag("Template",Name=>"Annotation"); + + $writer->startTag("Field",Name=>"Species Name"); + $writer->characters($annotation_universals{'Species Name'}); + $writer->endTag("Field"); + $writer->startTag("Field",Name=>"Species ID"); + $writer->characters($annotation_universals{'Species ID'}); + $writer->endTag("Field"); + $writer->startTag("Field",Name=>"Gene Symbol"); + $writer->characters($annotation); + $writer->endTag("Field"); + # $writer->startTag("Field",Name=>"Gene Name"); + # $writer->characters(); + # $writer->endTag("Field"); + # $writer->startTag("Field",Name=>"Gene Locus"); + # $writer->characters(); + # $writer->endTag("Field"); + $writer->startTag("Field",Name=>"Gene Type"); + $writer->characters($annotation_universals{'Gene Type'}); + $writer->endTag("Field"); + # $writer->startTag("Field",Name=>"Chromosome"); + # $writer->characters(); + # $writer->endTag("Field"); + $writer->startTag("Field",Name=>"Has Phenotype"); + $writer->characters("No"); + $writer->endTag("Field"); + + $writer->endTag("Template"); + $writer->endTag("Page"); + + # start provenance page (separate node and attach at the end?) + + # iterate synonyms + # create gene synonyms page + # add to provenance + + # iterate @%ont refs + # create ont ref + # add to provenance - $output_data = "hullo, gaf"; + $annot_title_count++; + } + + # close doc + $writer->endTag("Pages"); + $writer->end(); + + # write doc to file + open(OUTPUT_FILE,">$output_file"); + print OUTPUT_FILE $output_data; + close OUTPUT_FILE; } -- 2.34.1