From 6a3dac2291f9d659e9da76a9a5d1bb229cdd5cdc Mon Sep 17 00:00:00 2001 From: preecej Date: Thu, 25 Aug 2011 00:23:16 +0000 Subject: [PATCH] In progress; Revised ontology info, ontology references, evidence, evidence codes, and gene synonyms. Generated XML pages for all except Publications. That's next. svn path=/; revision=170 --- .../semantic_wiki/paw_TransformForImport.pl | 205 ++++++++++-------- 1 file changed, 116 insertions(+), 89 deletions(-) diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index df3be4f..40c2495 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -129,8 +129,8 @@ my @field_data; my $xml; # represents the xml doc # gaf -my %aspects; -my $ontology_name; +my %ontology_info; # holds ontology names and aspects, keyed by abbreviation +my $curr_ontology_type; # flag to track the proper ontology to reference, line by line my %annotation_universals; # holds values assumed not to vary across the file my %annotations; # keyed on Gene Symbol @@ -252,72 +252,83 @@ sub import_generic } } -# add new ontology reference data +# Add new ontology reference data. Aggregates add'l evidence codes, evidence, +# and publications for the existing term id. # --------------------------------------------------------------------------- -sub add_ontology_ref_data($) +sub add_ontology_ref_data($$) { my %ref_data = %{$_[0]}; + my $ont_type = $_[1]; + my @evidence = split('\|', $ref_data{"with_or_from"}); my @publications = split('\|', $ref_data{"db_reference"}); + + my $i = 0; for (@publications) { if (!(substr($_,0,5) eq "PMID:")) { - splice(@publications,$_,1); + splice(@publications,$i,1); } + $i++; } - # TODO: can you eliminate this if..else by compressing the hash aggregation? - - # if this is the first annotation for this current gene symbol and term id, add it - if (!exists $annotations{$ref_data{"db_object_symbol"}} - {"Ontological References"} - {$ref_data{"term_id"}}) + # aspect (assumes only one, allows overwrite) + $annotations{$ref_data{"db_object_symbol"}} + {"Ontological References"} + {$ref_data{"term_id"}} + {"Aspect"} = $ontology_info{$ont_type} + {"aspects"} + {uc($ref_data{"aspect"})}; + + # evidence codes + $annotations{$ref_data{"db_object_symbol"}} + {"Ontological References"} + {$ref_data{"term_id"}} + {"Evidence Codes"}{$ref_data{"evidence_code"}} = ""; + + # evidence + for (@evidence) { - # print "*** NEW REF: $ref_data{'term_id'} ***\n"; # TEST - - # add new ontology data - my $annotation_ontology_ref = { - "Aspect" => $aspects{uc($ref_data{"aspect"})}, # assumes only one - "Evidence Codes" => { $ref_data{"evidence_code"} => "" } - }; - - # add Evidence - for (@evidence) { - $$annotation_ontology_ref{"Evidence"}{$_} = ""; - } - - # add Publication data - for (@publications) { - $$annotation_ontology_ref{"Publications"}{$_} = ""; } - $annotations{$ref_data{"db_object_symbol"}} - {"Ontological References"} - {$ref_data{"term_id"}} = $annotation_ontology_ref; - - if ($debug) { print "...\n" - . Dumper($annotation_ontology_ref) . "\n\n"; } + {"Ontological References"} + {$ref_data{"term_id"}} + {"Evidence"}{$_} = ""; } - else # aggregate add'l evidence codes, evidence, and publications for the existing term id + + # publications + for (@publications) { - # print "*** ADD'L REF: $ref_data{'term_id'} ***\n"; # TEST - $annotations{$ref_data{"db_object_symbol"}} {"Ontological References"} {$ref_data{"term_id"}} - {"Evidence Codes"}{$ref_data{"evidence_code"}} = ""; + {"Publications"}{$_} = ""; + } +} - for (@evidence) { - $annotations{$ref_data{"db_object_symbol"}} - {"Ontological References"} - {$ref_data{"term_id"}} - {"Evidence"}{$_} = ""; +# populate a hash containing ontology types and their aspects; used as a +# reference for aspect proper names when importing GAF data for transformation +# --------------------------------------------------------------------------- +sub initialize_ontology +{ + %ontology_info = ( + GO => { + name => "Gene Ontology", + aspects => { + P => "Biological Process", + C => "Cellular Component", + F => "Molecular Function" + } + }, + PO => { + name => "Plant Ontology", + aspects => { + A => "Plant Anatomy", + G => "Plant Growth and Development Stage" + } } - - for (@publications) { - $annotations{$ref_data{"db_object_symbol"}} - {"Ontological References"} - {$ref_data{"term_id"}} - {"Publications"}{$_} = ""; } - } + ); + + if ($debug) { print "...\n" + . Dumper(\%ontology_info) . "\n\n"; } } # read, parse, and store GAF annotations @@ -353,9 +364,15 @@ sub import_gaf # "Evidence" => % strings ("with_or_from" ) # "Publications" => % PMID's from "db:reference", used to create separate Pub pages + # set up a hash of ontology types and aspects to be referenced during data import + initialize_ontology(); + # loop through data rows and build hashed annotation data structure my $count = 0; + # regex for locating a useable accession id from a locus (species-specific) + my $locus_finder_expr; + while () { $count++; @@ -396,30 +413,19 @@ sub import_gaf if ($debug) { print "...\n" . Dumper(\%annotation_universals) . "\n\n"; } - - # TODO: set all ontology aspects in a multi-hash (group by ontology) - # and keep a flag to monitor the currently-needed ontology - - # identify what kind of ontology term we're dealing with and set our - # aspects and ontology name accordingly - switch ((split(':',$curr_line_hash{"term_id"}))[0]) { - case 'GO' { # Gene Ontology - %aspects = ( - P => "Biological Process", - C => "Cellular Component", - F => "Molecular Function" - ); - $ontology_name = "Gene Ontology"; - } - case 'PO' { # Plant Ontology - %aspects = ( - A => "Plant Anatomy", - G => "Plant Growth and Development Stage" - ); - $ontology_name = "Plant Ontology"; + + # set species-specific values + switch ($annotation_universals{"Species ID"}) { + case "NCBI:3702" { $locus_finder_expr = "/[Aa][Tt].[Gg]/"; } + else { + die($annotation_universals{"Species ID"} + . " is not a valid NCBI taxon ID.\n"); } } } + + # set the ontology for the current line + $curr_ontology_type = uc((split(':',$curr_line_hash{"term_id"}))[0]); # check to see if Gene Symbol hash key exists (for grouping) # if not, add the new Gene Symbol and its associated props @@ -435,7 +441,7 @@ sub import_gaf . Dumper(\@synonyms) . "\n\n"; } # find the gene locus, if it is listed (first "AT.G") - my @loci = grep /[Aa][Tt].[Gg]/, @synonyms; + my @loci = grep $locus_finder_expr, @synonyms; if ($debug) { print "...\n" . Dumper(\@loci) . "\n\n"; } @@ -448,18 +454,26 @@ sub import_gaf } else # no match; attempt to use the Gene Symbol instead { - if ($curr_line_hash{"db_object_symbol"} =~ /[Aa][Tt].[Gg]/) + if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr) { - # the split drops the variant/allele signifier + # the split drops the variant/allele signifier, if present $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0]; } + else # no match; attempt to use the Gene Name instead + { + if ($curr_line_hash{"db_object_name"} =~ $locus_finder_expr) + { + $locus = (split('.',$curr_line_hash{"db_object_name"}))[0]; + } + } } + # chromosome = third char in locus, if it exists my $chromosome = ($locus ne "" ? (split('',$locus))[2] : ""); # set some sort of pseudo-unique value as the accession id, # in order of succession: locus, then symbol - # (note: this is dangerous; a stable identifier is preferred) + # (NOTE: this is dangerous; a stable identifier is preferred) my $accession_id = $locus ? $locus : $curr_line_hash{"db_object_symbol"}; @@ -481,7 +495,7 @@ sub import_gaf # add new gene annotation and assign props $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties; - add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time) + add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add the first ontology reference (every time) } # that Gene Symbol does exist, so we just need to roll-up multi-line # annotation information, like gene synonyms and ontology refs @@ -494,7 +508,7 @@ sub import_gaf $annotations{$curr_line_hash{"db_object_symbol"}} {"Gene Synonyms"}{$_} = ""; } - add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data + add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add add'l ontology reference data } } if ($debug) { print "...\n" @@ -705,7 +719,7 @@ sub transform_generic # --------------------------------------------------------------------------- sub transform_gaf { - # define temaplates and their fields for Provenance-generation + # define templates and their fields for Provenance-generation my $template_field_map = { Annotation => [ 'Species Name', @@ -724,7 +738,8 @@ sub transform_gaf 'Ontology', 'Term ID', 'Aspect', - 'Evidence Code' + 'Evidence Code', + 'Evidence' ] }; @@ -778,37 +793,49 @@ sub transform_gaf $writer->endTag("Page"); # create gene synonyms page - if (length($annotations{$annotation}{'Gene Synonyms'}) > 0) + if (scalar keys (%{$annotations{$annotation}{'Gene Synonyms'}}) > 0) { $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Gene Synonyms"); $writer->startTag("Template",Name=>"Gene_Synonyms"); $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); $writer->endTag("Template"); - foreach (split('\|',$annotations{$annotation}{'Gene Synonyms'})) + foreach my $synonym (keys %{$annotations{$annotation}{'Gene Synonyms'}}) { $writer->startTag("Template",Name=>"Gene_Synonym_Repeater"); - $writer->dataElement("Field", $_, Name=>"Gene Synonym"); + $writer->dataElement("Field", $synonym, Name=>"Gene Synonym"); $writer->endTag("Template"); } $writer->endTag("Page"); } # create ont refs page - if (scalar(@{$annotations{$annotation}{"Ontological Reference"}}) > 0) + if (scalar keys (%{$annotations{$annotation}{"Ontological References"}}) > 0) { $writer->startTag("Page",Title=>"Annotation:$annot_title_count/Ontologies"); $writer->startTag("Template",Name=>"Ontological_References"); $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); $writer->endTag("Template"); - foreach (@{$annotations{$annotation}{"Ontological Reference"}}) + foreach my $ont_term (keys %{$annotations{$annotation}{"Ontological References"}}) { $writer->startTag("Template",Name=>"Ontological_Reference_Repeater"); - $writer->dataElement("Field", $ontology_name, Name=>"Ontology"); - $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID"); - $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect"); - $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code"); + $writer->dataElement("Field", $ontology_info{$curr_ontology_type}{"name"}, Name=>"Ontology"); + $writer->dataElement("Field", $ont_term, Name=>"Term ID"); + $writer->dataElement("Field", $annotations{$annotation}{"Ontological References"}{$ont_term}{"Aspect"}, Name=>"Aspect"); + + my @evidence_codes; + foreach my $evidence_code (keys %{$annotations{$annotation}{"Ontological References"}{$ont_term}{"Evidence Codes"}}) { + push @evidence_codes, $evidence_code; + } + $writer->dataElement("Field", join(', ',@evidence_codes), Name=>"Evidence Code"); + + my @ary_evidence; + foreach my $evidence (keys %{$annotations{$annotation}{"Ontological References"}{$ont_term}{"Evidence"}}) { + push @ary_evidence, $evidence; + } + $writer->dataElement("Field", join(', ',@ary_evidence), Name=>"Evidence"); + $writer->endTag("Template"); } $writer->endTag("Page"); @@ -820,6 +847,8 @@ sub transform_gaf $writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page"); $writer->endTag("Template"); + # TODO: make further use of template map here to create provenance for SIO's + foreach (@{@$template_field_map{"Annotation"}}) { $writer->startTag("Template",Name=>"Provenance_Repeater"); @@ -829,7 +858,6 @@ sub transform_gaf $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source"); $writer->endTag("Template"); } - $writer->endTag("Page"); $annot_title_count++; @@ -875,7 +903,6 @@ sub show_output init; import_data; if ($verbose) { show_input; } -exit(0); # TEST write_xml(); if ($verbose) { show_output; } -- 2.34.1