From: preecej Date: Wed, 24 Aug 2011 01:29:34 +0000 (+0000) Subject: In progress; added handling for grouped ont term ids in gaf import, inc multiple... X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=fe9fa71c93134d4274da4195347d83bc363d0451;p=old-jaiswallab-svn%2F.git In progress; added handling for grouped ont term ids in gaf import, inc multiple gene synonyms, evidence codes, evidence data (with_or_from), and publication ids (db:reference PMID's). Need to complete by building corresponding wiki XML pages and properties. svn path=/; revision=169 --- diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index 5dd0e88..df3be4f 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -6,7 +6,7 @@ Planteome Annotation Wiki - Data Import Script =head1 VERSION -0.2 +0.3 =head1 DESCRIPTION @@ -16,8 +16,10 @@ by the MediaWiki extension DataTransfer (Special:ImportXML) feature. Also generates appropriate provenance of data based on a prefixed Source header in the import file. -New to this revision: Added a GAF file type option for import; -accommodates ontologically-focused annotations. +Handles GAF file type option for import; accommodates ontologically- +focused annotations. + +New to this revision: ... =head1 USAGE @@ -50,10 +52,11 @@ be imported in accordance with the input type. Source URI=http://www.shigen.nig.ac.jp/rice/oryzabase/ Source File=http://www.shigen.nig.ac.jp/rice/oryzabase/genes/... -=head2 Data Header Format Examples +=head2 Data Header Generic Format Examples - NOTE: The field separator may also be a tab instead, but must be - employed consistently across the entire file. + NOTES: The field separator may also be a tab instead, but must be + employed consistently across the entire file. GAF files do not + require a [Format] section. 
# add new ontology reference data
# ---------------------------------------------------------------------------
# Records the ontology reference carried by one GAF data row under
#   $annotations{<db_object_symbol>}{"Ontological References"}{<term_id>}
# The entry is created the first time a term id is seen for a gene symbol;
# later rows for the same symbol/term id are merged into it.
#
# Argument: reference to a hash describing the current row. Keys read:
#   db_object_symbol, term_id, aspect, evidence_code,
#   with_or_from (pipe-delimited), db_reference (pipe-delimited)
#
# Resulting entry:
#   "Aspect"         => $aspects{uc(aspect)}, set once (assumes only one)
#   "Evidence Codes" => { <evidence_code> => "" , ... }
#   "Evidence"       => { <with_or_from item> => "", ... }  (only if any)
#   "Publications"   => { <PMID:...> => "", ... }           (only if any)
#
# Returns nothing meaningful; called for its effect on %annotations.
# Relies on %annotations, %aspects and $debug, which are not declared in
# this sub and must be provided by the enclosing file.
sub add_ontology_ref_data($)
{
    my %ref_data = %{$_[0]};

    # Both multi-valued columns may be empty or missing on a given row.
    my $with_or_from = defined $ref_data{"with_or_from"} ? $ref_data{"with_or_from"} : '';
    my $db_reference = defined $ref_data{"db_reference"} ? $ref_data{"db_reference"} : '';

    # Skip empty tokens (e.g. "a||b") so "" never becomes an Evidence key.
    my @evidence = grep { length($_) > 0 } split('\|', $with_or_from);

    # Keep only PMID references. Build a filtered list instead of calling
    # splice() on the array being iterated: splice() expects a numeric
    # offset, so handing it the element text removed the wrong entries.
    my @publications = grep { substr($_, 0, 5) eq "PMID:" }
                       split('\|', $db_reference);

    my $symbol  = $ref_data{"db_object_symbol"};
    my $term_id = $ref_data{"term_id"};

    # All term ids recorded so far for this gene symbol.
    my $term_refs = $annotations{$symbol}{"Ontological References"} ||= {};

    # First annotation for this gene symbol and term id: start a new entry.
    my $is_new_term = !exists $$term_refs{$term_id};
    if ($is_new_term) {
        $$term_refs{$term_id} = {
            "Aspect" => $aspects{uc($ref_data{"aspect"})}, # assumes only one
        };
    }
    my $annotation_ontology_ref = $$term_refs{$term_id};

    # New or existing, aggregation is identical: hash keys de-duplicate.
    $$annotation_ontology_ref{"Evidence Codes"}{$ref_data{"evidence_code"}} = "";

    for (@evidence) {
        $$annotation_ontology_ref{"Evidence"}{$_} = "";
    }

    for (@publications) {
        $$annotation_ontology_ref{"Publications"}{$_} = "";
    }

    # Debug dump only when a new term entry was created.
    if ($debug && $is_new_term) {
        print "...\n" . Dumper($annotation_ontology_ref) . "\n\n";
    }
}
(multi-) # [Annotation Object Structure] # %annotation_universals @@ -267,14 +343,15 @@ sub import_gaf # "Gene Type" # %annotations # "$Gene Symbol" => %annotation_properties - # "Gene Name" => string - # "Gene Locus"/"Source Accession ID" (first uc("AT.G")) => string - # "Chromosome" (AT#G in "Gene Locus") => string - # "Gene Synonyms" => pipe-delimited string of synonyms - # "Ontological Reference" => %annotation_ontology_refs - # "Term ID" => string - # "Evidence Code" => string - # "Aspect" => string + # "Gene Name" => string + # "Gene Locus"/"Source Accession ID" (first uc("AT.G")) => string + # "Chromosome" (AT#G in "Gene Locus") => string + # "Gene Synonyms" => % strings + # "Ontological References" => % "$Term ID" => %annotation_ontology_ref + # "Aspect" => string (assumes only one) + # "Evidence Codes" => % strings + # "Evidence" => % strings ("with_or_from" ) + # "Publications" => % PMID's from "db:reference", used to create separate Pub pages # loop through data rows and build hashed annotation data structure my $count = 0; @@ -290,18 +367,19 @@ sub import_gaf . Dumper(\@curr_line_ary) . "\n\n"; } my %curr_line_hash = ( - "db" => $curr_line_ary[0], # Source - "db_object_symbol" => $curr_line_ary[2], # Gene Symbol - "term_id" => $curr_line_ary[4], # Term ID - "evidence_code" => $curr_line_ary[6], # Evidence Code - "aspect" => $curr_line_ary[8], # Aspect - "db_object_name" => $curr_line_ary[9], # Gene Name + "db" => $curr_line_ary[0], # Source + "db_object_symbol" => $curr_line_ary[2], # Gene Symbol + "term_id" => $curr_line_ary[4], # Term ID + "db_reference" => $curr_line_ary[5], # inc. 
Publication info (PMID) + "evidence_code" => $curr_line_ary[6], # Evidence Code + "with_or_from" => $curr_line_ary[7], # Evidence (data) + "aspect" => $curr_line_ary[8], # Aspect + "db_object_name" => $curr_line_ary[9], # Gene Name # Gene Locus, Source Accession ID, Chromosome, Gene Synonyms - "db_object_synonym" => $curr_line_ary[10], - - "db_object_type" => $curr_line_ary[11], # Gene Type - "taxon" => $curr_line_ary[12] # Species ID + "db_object_synonym" => $curr_line_ary[10], + "db_object_type" => $curr_line_ary[11], # Gene Type + "taxon" => $curr_line_ary[12] # Species ID ); if ($debug) { print "...\n" @@ -319,6 +397,9 @@ sub import_gaf if ($debug) { print "...\n" . Dumper(\%annotation_universals) . "\n\n"; } + # TODO: set all ontology aspects in a multi-hash (group by ontology) + # and keep a flag to monitor the currently-needed ontology + # identify what kind of ontology term we're dealing with and set our # aspects and ontology name accordingly switch ((split(':',$curr_line_hash{"term_id"}))[0]) { @@ -344,6 +425,9 @@ sub import_gaf # if not, add the new Gene Symbol and its associated props if (!exists $annotations{$curr_line_hash{"db_object_symbol"}}) { + + # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST + # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"}); @@ -384,30 +468,34 @@ sub import_gaf "Accession ID" => $accession_id, "Gene Name" => $curr_line_hash{"db_object_name"}, "Gene Locus" => $locus, - "Chromosome" => $chromosome, - "Gene Synonyms" => $curr_line_hash{"db_object_synonym"} # pipe-delimited string + "Chromosome" => $chromosome }; + # add synonyms + for (split('\|', $curr_line_hash{"db_object_synonym"})) { + $$annotation_properties{"Gene Synonyms"}{$_} = ""; } + if ($debug) { print "...\n" . Dumper($annotation_properties) . 
"\n\n"; } - # add new gene annotation and assign props $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties; - } - - # add new ontology data (this happens on every line of data) - my $annotation_ontology_ref = { - "Term ID" => $curr_line_hash{"term_id"}, - "Aspect" => $aspects{uc($curr_line_hash{"aspect"})}, - "Evidence Code" => $curr_line_hash{"evidence_code"} - }; - if ($debug) { print "...\n" - . Dumper($annotation_ontology_ref) . "\n\n"; } + add_ontology_ref_data(\%curr_line_hash); # add the first ontology reference (every time) + } + # that Gene Symbol does exist, so we just need to roll-up multi-line + # annotation information, like gene synonyms and ontology refs + else + { + #print "\n*** EXISTING SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST - push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref; + # add any add'l synonyms + for (split('\|', $curr_line_hash{"db_object_synonym"})) { + $annotations{$curr_line_hash{"db_object_symbol"}} + {"Gene Synonyms"}{$_} = ""; } + add_ontology_ref_data(\%curr_line_hash); # add add'l ontology reference data + } } if ($debug) { print "...\n" . Dumper(\%annotations) . "\n\n"; } @@ -720,7 +808,7 @@ sub transform_gaf $writer->dataElement("Field", $ontology_name, Name=>"Ontology"); $writer->dataElement("Field", $$_{"Term ID"}, Name=>"Term ID"); $writer->dataElement("Field", $$_{"Aspect"}, Name=>"Aspect"); - $writer->dataElement("Field", $$_{"Evidence Code"}, Name=>"Evidence Code"); + $writer->dataElement("Field", $$_{"Evidence Codes"}, Name=>"Evidence Code"); $writer->endTag("Template"); } $writer->endTag("Page"); @@ -787,6 +875,7 @@ sub show_output init; import_data; if ($verbose) { show_input; } +exit(0); # TEST write_xml(); if ($verbose) { show_output; }