=head1 VERSION
-0.2
+0.3
=head1 DESCRIPTION
Also generates appropriate provenance of data based on a prefixed
Source header in the import file.
-New to this revision: Added a GAF file type option for import;
-accommodates ontologically-focused annotations.
+Handles GAF file type option for import; accommodates ontologically-
+focused annotations.
+
+New to this revision: ...
=head1 USAGE
Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/
Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/...
-=head2 Data Header Format Examples
+=head2 Data Header Generic Format Examples
- NOTE: The field separator may also be a tab instead, but must be
- employed consistently across the entire file.
+ NOTES: The field separator may also be a tab instead, but must be
+ employed consistently across the entire file. GAF files do not
+ require a [Format] section.
[Format]
Template=Annotation
}
}
+# add new ontology reference data
+# ---------------------------------------------------------------------------
+# Merges one parsed GAF row into
+#   $annotations{<db_object_symbol>}{"Ontological References"}{<term_id>}
+#
+# Argument: a reference to a hash holding (at least) the keys
+#   db_object_symbol, term_id, aspect, evidence_code,
+#   with_or_from (pipe-delimited), db_reference (pipe-delimited)
+#
+# The first row seen for a symbol/term pair creates the entry and sets its
+# "Aspect"; every row (first or later) then adds its evidence code,
+# with_or_from values and PMID references as hash keys, so repeated values
+# collapse into a single key.
+#
+# Reads the file-level %aspects and $debug; writes the file-level %annotations.
+sub add_ontology_ref_data($)
+{
+    my %ref_data = %{$_[0]};
+
+    my @evidence = split('\|', $ref_data{"with_or_from"});
+
+    # keep only the "PMID:" references; grep builds a filtered list instead
+    # of splicing @publications while it is being iterated
+    my @publications = grep { substr($_,0,5) eq "PMID:" }
+        split('\|', $ref_data{"db_reference"});
+
+    my $symbol  = $ref_data{"db_object_symbol"};
+    my $term_id = $ref_data{"term_id"};
+
+    # is this the first annotation for this gene symbol and term id?
+    # (must be tested before the entry is created below)
+    my $is_new_ref =
+        !exists $annotations{$symbol}{"Ontological References"}{$term_id};
+
+    # fetch the entry for this term id, creating it on first sight
+    my $annotation_ontology_ref =
+        $annotations{$symbol}{"Ontological References"}{$term_id} ||= {};
+
+    # the aspect is only recorded once, from the first row (assumes only one)
+    if ($is_new_ref) {
+        $$annotation_ontology_ref{"Aspect"} = $aspects{uc($ref_data{"aspect"})};
+    }
+
+    # aggregate evidence codes, evidence, and publications; using the values
+    # as hash keys de-duplicates them across rows
+    $$annotation_ontology_ref{"Evidence Codes"}{$ref_data{"evidence_code"}} = "";
+
+    for (@evidence) {
+        $$annotation_ontology_ref{"Evidence"}{$_} = "";
+    }
+
+    for (@publications) {
+        $$annotation_ontology_ref{"Publications"}{$_} = "";
+    }
+
+    # dump only newly created references, as before
+    if ($debug && $is_new_ref) {
+        print "...<DEBUG: \%annotation_ontology_ref>\n"
+            . Dumper($annotation_ontology_ref) . "\n\n";
+    }
+
+    return;
+}
+
# read, parse, and store GAF annotations
# ---------------------------------------------------------------------------
sub import_gaf
{
#[GAF implied data structure]
- # singletons: db/Source (not really, but OK for now), taxon/Species ID, assigned_by, gene_type/Gene Type (later: proteins, too)
- # unneeded: db_object_id, date, qualifier, db_reference (get PMID from here later), with_or_from (later)
- # unvaried fields (gene-level): db_object_symbol/Gene Symbol, db_object_name/Gene Name, db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below),
- # varied fields (gene synonyms): db_object_synonym/Gene Synonym
- # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, aspect/Aspect,
+ # singletons: db/Source (not really, but OK for now), taxon/Species ID,
+ # assigned_by, gene_type/Gene Type (later: proteins, too)
+ # lower priority: db_object_id, date
+ # not needed: qualifier
+ # unvaried fields (gene-level): db_object_symbol/Gene Symbol,
+ # db_object_name/Gene Name,
+ # db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below),
+ # varied fields (gene synonyms): db_object_synonym/Gene Synonym
+ # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code,
+ # aspect/Aspect, db_reference (multi-, get PMID from here), with_or_from (multi-)
# [Annotation Object Structure]
# %annotation_universals
# "Gene Type"
# %annotations
# "$Gene Symbol" => %annotation_properties
- # "Gene Name" => string
- # "Gene Locus"/"Source Accession ID" (first uc("AT.G")) => string
- # "Chromosome" (AT#G in "Gene Locus") => string
- # "Gene Synonyms" => pipe-delimited string of synonyms
- # "Ontological Reference" => %annotation_ontology_refs
- # "Term ID" => string
- # "Evidence Code" => string
- # "Aspect" => string
+ # "Gene Name" => string
+ # "Gene Locus"/"Source Accession ID" (first uc("AT.G")) => string
+ # "Chromosome" (AT#G in "Gene Locus") => string
+ # "Gene Synonyms" => % strings
+ # "Ontological References" => % "$Term ID" => %annotation_ontology_ref
+ # "Aspect" => string (assumes only one)
+ # "Evidence Codes" => % strings
+ # "Evidence" => % strings ("with_or_from" )
+ # "Publications" => % PMIDs from "db_reference", used to create separate Pub pages
# loop through data rows and build hashed annotation data structure
my $count = 0;
. Dumper(\@curr_line_ary) . "\n\n"; }
my %curr_line_hash = (
- "db" => $curr_line_ary[0], # Source
- "db_object_symbol" => $curr_line_ary[2], # Gene Symbol
- "term_id" => $curr_line_ary[4], # Term ID
- "evidence_code" => $curr_line_ary[6], # Evidence Code
- "aspect" => $curr_line_ary[8], # Aspect
- "db_object_name" => $curr_line_ary[9], # Gene Name
+ "db" => $curr_line_ary[0], # Source
+ "db_object_symbol" => $curr_line_ary[2], # Gene Symbol
+ "term_id" => $curr_line_ary[4], # Term ID
+ "db_reference" => $curr_line_ary[5], # inc. Publication info (PMID)
+ "evidence_code" => $curr_line_ary[6], # Evidence Code
+ "with_or_from" => $curr_line_ary[7], # Evidence (data)
+ "aspect" => $curr_line_ary[8], # Aspect
+ "db_object_name" => $curr_line_ary[9], # Gene Name
# Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
- "db_object_synonym" => $curr_line_ary[10],
-
- "db_object_type" => $curr_line_ary[11], # Gene Type
- "taxon" => $curr_line_ary[12] # Species ID
+ "db_object_synonym" => $curr_line_ary[10],
+ "db_object_type" => $curr_line_ary[11], # Gene Type
+ "taxon" => $curr_line_ary[12] # Species ID
);
if ($debug) { print "...<DEBUG: \%curr_line_hash>\n"
if ($debug) { print "...<DEBUG: \%annotation_universals>\n"
. Dumper(\%annotation_universals) . "\n\n"; }
+ # TODO: set all ontology aspects in a multi-hash (group by ontology)
+ # and keep a flag to monitor the currently-needed ontology
+
# identify what kind of ontology term we're dealing with and set our
# aspects and ontology name accordingly
switch ((split(':',$curr_line_hash{"term_id"}))[0]) {
# if not, add the new Gene Symbol and its associated props
if (!exists $annotations{$curr_line_hash{"db_object_symbol"}})
{
+
+ # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
+
# prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
"Accession ID" => $accession_id,
"Gene Name" => $curr_line_hash{"db_object_name"},
"Gene Locus" => $locus,
- "Chromosome" => $chromosome,
- "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string
+ "Chromosome" => $chromosome
};
+ # add synonyms
+ for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+ $$annotation_properties{"Gene Synonyms"}{$_} = ""; }
+
if ($debug) { print "...<DEBUG: \%annotation_properties>\n"
. Dumper($annotation_properties) . "\n\n"; }
-
# add new gene annotation and assign props
$annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties;
- }
-
- # add new ontology data (this happens on every line of data)
- my $annotation_ontology_ref = {
- "Term ID" => $curr_line_hash{"term_id"},
- "Aspect" => $aspects{uc($curr_line_hash{"aspect"})},
- "Evidence Code" => $curr_line_hash{"evidence_code"}
- };
- if ($debug) { print "...<DEBUG: \@annotation_ontology_refs>\n"
- . Dumper($annotation_ontology_ref) . "\n\n"; }
+ add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time)
+ }
+ # that Gene Symbol does exist, so we just need to roll-up multi-line
+ # annotation information, like gene synonyms and ontology refs
+ else
+ {
+ #print "\n*** EXISTING SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
- push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref;
+ # add any add'l synonyms
+ for (split('\|', $curr_line_hash{"db_object_synonym"})) {
+ $annotations{$curr_line_hash{"db_object_symbol"}}
+ {"Gene Synonyms"}{$_} = ""; }
+ add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data
+ }
}
if ($debug) { print "...<DEBUG: \%annotations>\n"
. Dumper(\%annotations) . "\n\n"; }
$writer->dataElement("Field", $ontology_name, Name=>"Ontology");
$writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID");
$writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect");
- $writer->dataElement("Field", $$_{"Evidence Code"}, Name=>"Evidence Code");
+ $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code");
$writer->endTag("Template");
}
$writer->endTag("Page");
init;
import_data;
if ($verbose) { show_input; }
+exit(0); # TEST: temporary early exit -- while this line is present, write_xml() and show_output below are never reached
write_xml();
if ($verbose) { show_output; }