my $xml; # represents the xml doc
# gaf
-my %aspects;
-my $ontology_name;
+my %ontology_info; # holds ontology names and aspects, keyed by abbreviation
+my $curr_ontology_type; # flag to track the proper ontology to reference, line by line
my %annotation_universals; # holds values assumed not to vary across the file
my %annotations; # keyed on Gene Symbol
}
}
# Add new ontology reference data. Aggregates add'l evidence codes, evidence,
# and publications for the existing term id.
#
# Arguments:
#   $_[0] - hash ref for one parsed GAF line; the keys read here are
#           "db_object_symbol", "term_id", "aspect", "evidence_code",
#           "with_or_from" and "db_reference"
#   $_[1] - ontology abbreviation, used as the key into %ontology_info
#
# Side effects: creates/extends the entry
#   $annotations{<gene symbol>}{"Ontological References"}{<term id>}
# with "Aspect", "Evidence Codes", "Evidence" and "Publications".
# Returns nothing.
# ---------------------------------------------------------------------------
sub add_ontology_ref_data($$)
{
    my %ref_data = %{$_[0]};
    my $ont_type = $_[1];

    my @evidence = split('\|', $ref_data{"with_or_from"});

    # Keep only the "PMID:" references. grep builds a fresh list, so every
    # element is examined exactly once; removing elements from an array
    # while a for loop walks it skips the element after each removal.
    my @publications = grep { substr($_,0,5) eq "PMID:" }
                       split('\|', $ref_data{"db_reference"});

    # all values below hang off the same gene symbol / term id node
    my $term_ref = $annotations{$ref_data{"db_object_symbol"}}
                               {"Ontological References"}
                               {$ref_data{"term_id"}} ||= {};

    # aspect (assumes only one, allows overwrite)
    $$term_ref{"Aspect"} = $ontology_info{$ont_type}
                                         {"aspects"}
                                         {uc($ref_data{"aspect"})};

    # evidence codes (hash keys act as a set, so repeats collapse)
    $$term_ref{"Evidence Codes"}{$ref_data{"evidence_code"}} = "";

    # evidence
    for my $evidence_item (@evidence)
    {
        $$term_ref{"Evidence"}{$evidence_item} = "";
    }

    # publications
    for my $publication (@publications)
    {
        $$term_ref{"Publications"}{$publication} = "";
    }

    return;
}
# Build the ontology lookup table consulted while importing GAF data: each
# ontology abbreviation maps to its full name and to a table translating
# aspect codes into aspect proper names. Replaces the contents of the
# file-level %ontology_info and dumps the result when $debug is set.
# ---------------------------------------------------------------------------
sub initialize_ontology
{
    # aspect code => aspect proper name, one table per ontology
    my %go_aspects = (
        P => "Biological Process",
        C => "Cellular Component",
        F => "Molecular Function"
    );
    my %po_aspects = (
        A => "Plant Anatomy",
        G => "Plant Growth and Development Stage"
    );

    %ontology_info = (
        GO => { name => "Gene Ontology",  aspects => \%go_aspects },
        PO => { name => "Plant Ontology", aspects => \%po_aspects }
    );

    if ($debug) {
        my $dumped_info = Dumper(\%ontology_info);
        print "...<DEBUG: \%ontology_info>\n" . $dumped_info . "\n\n";
    }
}
# read, parse, and store GAF annotations
# "Evidence" => % strings ("with_or_from" )
# "Publications" => % PMID's from "db:reference", used to create separate Pub pages
+ # set up a hash of ontology types and aspects to be referenced during data import
+ initialize_ontology();
+
# loop through data rows and build hashed annotation data structure
my $count = 0;
+ # regex for locating a useable accession id from a locus (species-specific)
+ my $locus_finder_expr;
+
while (<INPUT_FILE>)
{
$count++;
if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
. Dumper(\%annotation_universals) . "\n\n"; }
-
- # TODO: set all ontology aspects in a multi-hash (group by ontology)
- # and keep a flag to monitor the currently-needed ontology
-
- # identify what kind of ontology term we're dealing with and set our
- # aspects and ontology name accordingly
- switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
- case 'GO' { # Gene Ontology
- %aspects = (
- P => "Biological Process",
- C => "Cellular Component",
- F => "Molecular Function"
- );
- $ontology_name = "Gene Ontology";
- }
- case 'PO' { # Plant Ontology
- %aspects = (
- A => "Plant Anatomy",
- G => "Plant Growth and Development Stage"
- );
- $ontology_name = "Plant Ontology";

# set species-specific values
# Locus patterns are stored as compiled regexes (qr//), keyed by the
# "Species ID" value. A plain string such as "/.../" would make the
# slashes part of the pattern, and would always be true when used as a
# bare grep expression.
my %locus_pattern_for = (
    "NCBI:3702" => qr/[Aa][Tt].[Gg]/, # "AT.G"-style locus ids
);
my $species_id = $annotation_universals{"Species ID"};
if (exists $locus_pattern_for{$species_id}) {
    $locus_finder_expr = $locus_pattern_for{$species_id};
}
else {
    die($species_id
        . " is not a valid NCBI taxon ID.\n");
}
}

# set the ontology for the current line
$curr_ontology_type = uc((split(':',$curr_line_hash{"term_id"}))[0]);
# check to see if Gene Symbol hash key exists (for grouping)
# if not, add the new Gene Symbol and its associated props
. Dumper(\@synonyms) . "\n\n"; }
# find the gene locus, if it is listed (first "AT.G")
# (block form of grep: each synonym is matched against the species pattern)
my @loci = grep { $_ =~ $locus_finder_expr } @synonyms;
if ($debug) { print "...<DEBUG: \@loci>\n"
. Dumper(\@loci) . "\n\n"; }
}
else # no match; attempt to use the Gene Symbol, then the Gene Name
{
    # NOTE: split always treats its first argument as a pattern, so the
    # separator has to be an escaped dot. An unescaped '.' matches every
    # character, every field comes back empty, and the locus is never set.
    if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr)
    {
        # the split drops the variant/allele signifier, if present
        $locus = (split(/\./,$curr_line_hash{"db_object_symbol"}))[0];
    }
    elsif ($curr_line_hash{"db_object_name"} =~ $locus_finder_expr)
    {
        # same treatment for the Gene Name
        $locus = (split(/\./,$curr_line_hash{"db_object_name"}))[0];
    }
}
+
# chromosome = third char in locus, if it exists
my $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
# set some sort of pseudo-unique value as the accession id,
# in order of succession: locus, then symbol
- # (note: this is dangerous; a stable identifier is preferred)
+ # (NOTE: this is dangerous; a stable identifier is preferred)
my $accession_id =
$locus ? $locus : $curr_line_hash{"db_object_symbol"};
# add new gene annotation and assign props
$annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
- add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)
+ add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add the first ontology reference (every time)
}
# that Gene Symbol does exist, so we just need to roll-up multi-line
# annotation information, like gene synonyms and ontology refs
$annotations{$curr_line_hash{"db_object_symbol"}}
{"Gene Synonyms"}{$_} = ""; }
- add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data
+ add_ontology_ref_data(\%curr_line_hash, $curr_ontology_type); # add add'l ontology reference data
}
}
if ($debug) { print "...<DEBUG: \%annotations>\n"
# ---------------------------------------------------------------------------
sub transform_gaf
{
- # define temaplates and their fields for Provenance-generation
+ # define templates and their fields for Provenance-generation
my $template_field_map = {
Annotation => [
'Species Name',
'Ontology',
'Term ID',
'Aspect',
- 'Evidence Code'
+ 'Evidence Code',
+ 'Evidence'
]
};
$writer->endTag("Page");
# create gene synonyms page
- if (length($annotations{$annotation}{'Gene Synonyms'}) > 0)
+ if (scalar keys (%{$annotations{$annotation}{'Gene Synonyms'}}) > 0)
{
$writer->startTag("Page",Title=>"Annotation:$annot_title_count/Gene Synonyms");
$writer->startTag("Template",Name=>"Gene_Synonyms");
$writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
$writer->endTag("Template");
- foreach (split('\|',$annotations{$annotation}{'Gene Synonyms'}))
+ foreach my $synonym (keys %{$annotations{$annotation}{'Gene Synonyms'}})
{
$writer->startTag("Template",Name=>"Gene_Synonym_Repeater");
- $writer->dataElement("Field", $_, Name=>"Gene Synonym");
+ $writer->dataElement("Field", $synonym, Name=>"Gene Synonym");
$writer->endTag("Template");
}
$writer->endTag("Page");
}
# create ont refs page
- if (scalar(@{$annotations{$annotation}{"Ontological Reference"}}) > 0)
+ if (scalar keys (%{$annotations{$annotation}{"Ontological References"}}) > 0)
{
$writer->startTag("Page",Title=>"Annotation:$annot_title_count/Ontologies");
$writer->startTag("Template",Name=>"Ontological_References");
$writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
$writer->endTag("Template");
# one repeater template per ontology term recorded for this annotation;
# keys are sorted so the generated XML is reproducible from run to run
foreach my $ont_term (sort keys %{$annotations{$annotation}{"Ontological References"}})
{
    my $ont_ref = $annotations{$annotation}{"Ontological References"}{$ont_term};

    # Look the ontology up from the term id's own prefix (the part before
    # the ':'). $curr_ontology_type is reassigned for every line during
    # import, so it does not describe the term being written here.
    my ($ont_prefix) = split(':', $ont_term);
    my $ont_type = defined $ont_prefix ? uc($ont_prefix) : "";
    my $ont_name = exists $ontology_info{$ont_type}
                 ? $ontology_info{$ont_type}{"name"}
                 : ""; # unknown prefix: leave the field empty

    $writer->startTag("Template",Name=>"Ontological_Reference_Repeater");
    $writer->dataElement("Field", $ont_name, Name=>"Ontology");
    $writer->dataElement("Field", $ont_term, Name=>"Term ID");
    $writer->dataElement("Field", $$ont_ref{"Aspect"}, Name=>"Aspect");

    # evidence codes and evidence are stored as hash keys; flatten each
    # set into a comma-separated list
    my @evidence_codes = sort keys %{$$ont_ref{"Evidence Codes"}};
    $writer->dataElement("Field", join(', ',@evidence_codes), Name=>"Evidence Code");

    my @ary_evidence = sort keys %{$$ont_ref{"Evidence"}};
    $writer->dataElement("Field", join(', ',@ary_evidence), Name=>"Evidence");

    $writer->endTag("Template");
}
$writer->endTag("Page");
$writer->dataElement("Field", "Annotation:$annot_title_count", Name=>"Annotation Page");
$writer->endTag("Template");
+ # TODO: make further use of template map here to create provenance for SIO's
+
foreach (@{@$template_field_map{"Annotation"}})
{
$writer->startTag("Template",Name=>"Provenance_Repeater");
$writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source");
$writer->endTag("Template");
}
-
$writer->endTag("Page");
$annot_title_count++;
# main program flow: initialize, import the data, then write the XML;
# the show_* steps run only when $verbose is set
init();
import_data();
show_input() if $verbose;
write_xml();
show_output() if $verbose;