From 5176d0b50b8129786492a8e5ab7a4447698ea207 Mon Sep 17 00:00:00 2001 From: preecej Date: Fri, 29 Jul 2011 22:36:33 +0000 Subject: [PATCH] Completed GAF import transformation script for scale testing. svn path=/; revision=130 --- .../semantic_wiki/paw_TransformForImport.pl | 131 ++++++++++++------ 1 file changed, 90 insertions(+), 41 deletions(-) diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index cdbf520..f1f3dd7 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -118,7 +118,6 @@ my $debug = 0; # debugging switch # universal my %source; my $output_data; -my %aspects; # tab, csv my $template_name; @@ -127,6 +126,8 @@ my @field_data; my $xml; # represents the xml doc # gaf +my %aspects; +my $ontology_name; my %annotation_universals; # holds values assumed not to vary across the file my %annotations; # keyed on Gene Symbol @@ -319,7 +320,7 @@ sub import_gaf . Dumper(\%annotation_universals) . "\n\n"; } # identify what kind of ontology term we're dealing with and set our - # aspect names accordingly + # aspects and ontology name accordingly switch ((split(':',$curr_line_hash{"term_id"}))[0]) { case 'GO' { # Gene Ontology %aspects = ( @@ -327,12 +328,14 @@ sub import_gaf C => "Cellular Component", F => "Molecular Function" ); + $ontology_name = "Gene Ontology"; } case 'PO' { # Plant Ontology %aspects = ( A => "Plant Anatomy", G => "Plant Growth and Development Stage" ); + $ontology_name = "Plant Ontology"; } } } @@ -607,6 +610,29 @@ sub transform_generic # --------------------------------------------------------------------------- sub transform_gaf { + # define temaplates and their fields for Provenance-generation + my $template_field_map = { + Annotation => [ + 'Species Name', + 'Species ID', + 'Gene Symbol', + 'Gene Name', + 'Gene Locus', + 'Gene Type', + 'Chromosome', + 'Has Phenotype' + ], + Gene_Synonym_Repeater => [ + 'Gene Synonym' + ], + Ontological_Reference_Repeater => [ + 'Ontology', + 'Term ID', + 'Aspect', + 'Evidence Code' + ] + }; + # create new xml doc, write to string my $writer = new XML::Writer( OUTPUT => \$output_data, @@ -631,9 +657,7 @@ sub transform_gaf for ($element) { @pretty_elements = /[A-Z](?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g; } - $writer->startTag("Field",Name=>"@pretty_elements"); - $writer->characters($source{$element}); - $writer->endTag("Field"); + $writer->dataElement("Field", $source{$element}, Name=>"@pretty_elements"); } $writer->endTag("Template"); @@ -644,50 +668,75 @@ sub transform_gaf foreach my $annotation (keys %annotations) { - #my %curr_annot = \$annotation; # ? - # create annotation page $writer->startTag("Page",Title=>"Annotation:$annot_title_count"); $writer->startTag("Template",Name=>"Annotation"); - - $writer->startTag("Field",Name=>"Species Name"); - $writer->characters($annotation_universals{'Species Name'}); - $writer->endTag("Field"); - $writer->startTag("Field",Name=>"Species ID"); - $writer->characters($annotation_universals{'Species ID'}); - $writer->endTag("Field"); - $writer->startTag("Field",Name=>"Gene Symbol"); - $writer->characters($annotation); - $writer->endTag("Field"); - # $writer->startTag("Field",Name=>"Gene Name"); - # $writer->characters(); - # $writer->endTag("Field"); - # $writer->startTag("Field",Name=>"Gene Locus"); - # $writer->characters(); - # $writer->endTag("Field"); - $writer->startTag("Field",Name=>"Gene Type"); - $writer->characters($annotation_universals{'Gene Type'}); - $writer->endTag("Field"); - # $writer->startTag("Field",Name=>"Chromosome"); - # $writer->characters(); - # $writer->endTag("Field"); - $writer->startTag("Field",Name=>"Has Phenotype"); - $writer->characters("No"); - $writer->endTag("Field"); - + $writer->dataElement("Field", $annotation_universals{"Species Name"}, Name=>"Species Name"); + $writer->dataElement("Field", $annotation_universals{"Species ID"}, Name=>"Species ID"); + $writer->dataElement("Field", $annotation, Name=>"Gene Symbol"); + $writer->dataElement("Field", $annotations{$annotation}{"Gene Name"}, Name=>"Gene Name"); + $writer->dataElement("Field", $annotations{$annotation}{"Gene Locus"}, Name=>"Gene Locus"); + $writer->dataElement("Field", ucfirst($annotation_universals{"Gene Type"}), Name=>"Gene Type"); + $writer->dataElement("Field", $annotations{$annotation}{"Chromosome"}, Name=>"Chromosome"); + $writer->dataElement("Field", "No", Name=>"Has Phenotype"); $writer->endTag("Template"); $writer->endTag("Page"); - # start provenance page (separate node and attach at the end?) + # create gene synonyms page + if (length($annotations{$annotation}{'Gene Synonyms'}) > 0) + { + $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Gene Synonyms"); + $writer->startTag("Template",Name=>"Gene_Synonyms"); + $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); + $writer->endTag("Template"); + + foreach (split('\|',$annotations{$annotation}{'Gene Synonyms'})) + { + $writer->startTag("Template",Name=>"Gene_Synonym_Repeater"); + $writer->dataElement("Field", $_, Name=>"Gene Synonym"); + $writer->endTag("Template"); + } + $writer->endTag("Page"); + } + + # create ont refs page + if (scalar(@{$annotations{$annotation}{"Ontological Reference"}}) > 0) + { + $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Ontologies"); + $writer->startTag("Template",Name=>"Ontological_References"); + $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); + $writer->endTag("Template"); + + foreach (@{$annotations{$annotation}{"Ontological Reference"}}) + { + $writer->startTag("Template",Name=>"Ontological_Reference_Repeater"); + $writer->dataElement("Field", $ontology_name, Name=>"Ontology"); + $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID"); + $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect"); + $writer->dataElement("Field", $$_{"Evidence Code"}, Name=>"Evidence Code"); + $writer->endTag("Template"); + } + $writer->endTag("Page"); + } - # iterate synonyms - # create gene synonyms page - # add to provenance + # create provenance page + $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Provenance"); + $writer->startTag("Template",Name=>"Provenance"); + $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); + $writer->endTag("Template"); + + foreach (@{@$template_field_map{"Annotation"}}) + { + $writer->startTag("Template",Name=>"Provenance_Repeater"); + $writer->dataElement("Field", $_, Name=>"Source Field"); + $writer->dataElement("Field", "Annotation", Name=>"Source Template"); + $writer->dataElement("Field", $annotations{$annotation}{"Gene Locus"}, Name=>"Source Accession ID"); + $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source"); + $writer->endTag("Template"); + } + + $writer->endTag("Page"); - # iterate @%ont refs - # create ont ref - # add to provenance - $annot_title_count++; } -- 2.34.1