# specific
use XML::Smart;
-use XML::DOM;
+use XML::Writer;
+use IO::File;
# ---------------------------------------------------------------------------
# declarations
# universal
my %source;
-my $xml;
my $output_data;
my %aspects;
my $template_name;
my @field_names;
my @field_data;
+my $xml; # represents the xml doc
# gaf
my %annotation_universals; # holds values assumed not to vary across the file
my %annotations; # keyed on Gene Symbol
-# other config constants
+# other config constants (including temporary arrangements)
# page id seeds, until I can figure out how to auto-increment w/in the import
# script
-my $SOURCE_TITLE_SEED = 3;
-my $ANNOT_TITLE_SEED = 3;
+my $SOURCE_TITLE_SEED = 4;
+my $ANNOT_TITLE_SEED = 14;
+
+my $species_name = "Arabidopsis thaliana"; # obviously temporary; need an NCBI lookup
$Data::Dumper::Pad = "... ";
. "Designated input file type: $file_type\n"
. "Output File: $output_file\n"
. "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n"
- . "Running in debug mode? " . ($verbose ? "Yes" : "No") . "\n"
+ . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n"
. "\n"
. "------------------------------------------------------------\n"
. "------------------------------------------------------------\n"
# grab the unvaried values from the first line
if ($count == 1) {
%annotation_universals = (
- "Source" => $curr_line_hash{"db"},
- "Gene Type" => $curr_line_hash{"db_object_type"},
- "Species" => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1]
+ "Source" => $curr_line_hash{"db"}, # currently not in use
+ "Gene Type" => $curr_line_hash{"db_object_type"},
+ "Species ID" => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1],
+ "Species Name" => $species_name # TODO: get this from NCBI
);
if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
"Gene Name" => $curr_line_hash{"db_object_name"},
"Gene Locus" => $locus, # also used for Source Accession ID
"Chromosome" => $chromosome,
- "Gene Synonyms" => $curr_line_hash{"db_object_synonym"}, # pipe-delimited string
+ "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string
};
if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
# ---------------------------------------------------------------------------
sub transform_gaf
{
- #
-
- # my $parser = new XML::DOM::Parser;
- # my $doc = $parser->parsefile ("file.xml");
-
- # # print all HREF attributes of all CODEBASE elements
- # my $nodes = $doc->getElementsByTagName ("CODEBASE");
- # my $n = $nodes->getLength;
-
- # for (my $i = 0; $i < $n; $i++)
- # {
- # my $node = $nodes->item ($i);
- # my $href = $node->getAttributeNode ("HREF");
- # print $href->getValue . "\n";
- # }
-
- # # Print doc file
- # $doc->printToFile ("out.xml");
-
- # # Print to string
- # print $doc->toString;
+    # Builds the import XML document for GAF input and writes it to
+    # $output_file.
+    #
+    # Reads the file-level %source, %annotations and %annotation_universals
+    # hashes. Emits one <Page> for the source (titled from
+    # $SOURCE_TITLE_SEED) followed by one <Page> per key of %annotations
+    # (titled sequentially starting at $ANNOT_TITLE_SEED). The XML is
+    # accumulated in $output_data and then printed to $output_file.
+    #
+    # Takes no arguments. Dies if the output file cannot be opened or closed.
+
+    # create new xml doc, write to string
+    my $writer = XML::Writer->new(
+        OUTPUT      => \$output_data,
+        DATA_MODE   => 1,
+        DATA_INDENT => 4,
+        ENCODING    => 'utf-8'
+    );
+
+    # create root elements
+    $writer->xmlDecl;
+    $writer->startTag("Pages");
+
+    # create source page
+    $writer->startTag("Page", Title => "Source:$SOURCE_TITLE_SEED");
+    $writer->startTag("Template", Name => "Source");
+
+    # one Field per %source entry; keys are sorted so the field order is the
+    # same on every run (hash iteration order is not stable)
+    foreach my $element (sort keys %source)
+    {
+        # split the CamelCase key into words for a readable field name
+        my @pretty_elements = $element =~ /[A-Z](?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
+
+        # fall back to the raw key rather than emit an empty Name attribute
+        # when the key contains no capitalised word
+        @pretty_elements = ($element) unless @pretty_elements;
+
+        # dataElement is startTag + characters + endTag in one call
+        $writer->dataElement("Field", $source{$element}, Name => "@pretty_elements");
+    }
+
+    $writer->endTag("Template");
+    $writer->endTag("Page");
- # # Avoid memory leaks - cleanup circular references for garbage collection
- # $doc->dispose;
+    # iterate %annotations; sorted so that a given gene symbol receives the
+    # same Annotation:N title on every run
+    my $annot_title_count = $ANNOT_TITLE_SEED;
+
+    foreach my $annotation (sort keys %annotations)
+    {
+        # create annotation page
+        $writer->startTag("Page", Title => "Annotation:$annot_title_count");
+        $writer->startTag("Template", Name => "Annotation");
+
+        # Fields in output order, as [ name, value ] pairs.
+        # TODO: add "Gene Name" and "Gene Locus" after "Gene Symbol", and
+        # "Chromosome" after "Gene Type", once the per-annotation values are
+        # read (presumably from $annotations{$annotation} - confirm its shape).
+        my @fields = (
+            [ "Species Name"  => $annotation_universals{'Species Name'} ],
+            [ "Species ID"    => $annotation_universals{'Species ID'} ],
+            [ "Gene Symbol"   => $annotation ],
+            [ "Gene Type"     => $annotation_universals{'Gene Type'} ],
+            [ "Has Phenotype" => "No" ],
+        );
+
+        foreach my $field (@fields)
+        {
+            my ($field_name, $field_value) = @{$field};
+            $writer->dataElement("Field", $field_value, Name => $field_name);
+        }
+
+        $writer->endTag("Template");
+        $writer->endTag("Page");
+
+        # start provenance page (separate node and attach at the end?)
+
+        # iterate synonyms
+        # create gene synonyms page
+        # add to provenance
+
+        # iterate @%ont refs
+        # create ont ref
+        # add to provenance
- $output_data = "hullo, gaf";
+        $annot_title_count++;
+    }
+
+    # close doc
+    $writer->endTag("Pages");
+    $writer->end();
+
+    # write doc to file
+    # NOTE(review): the XML declaration says utf-8 but this handle has no
+    # encoding layer - confirm whether $output_data holds characters or bytes
+    # before adding ':encoding(UTF-8)' here.
+    open(my $output_fh, '>', $output_file)
+        or die "Cannot open '$output_file' for writing: $!\n";
+    print {$output_fh} $output_data;
+    # buffered write errors only surface at close, so check it too
+    close($output_fh)
+        or die "Cannot close '$output_file': $!\n";
}