From ccaf7c16c4751a2275ba47e467e395504ad2af7a Mon Sep 17 00:00:00 2001 From: preecej Date: Sat, 7 Jul 2012 01:33:59 +0000 Subject: [PATCH] Updated to work for Vitis vinifera (grape) imports. Also handled some "missing data" scenarios for the Gene Name and Evidence fields. svn path=/; revision=355 --- planteome/paw/paw_TransformForImport.pl | 49 ++++++++++++++++++++----- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/planteome/paw/paw_TransformForImport.pl b/planteome/paw/paw_TransformForImport.pl index 95c0a7f..28d6643 100644 --- a/planteome/paw/paw_TransformForImport.pl +++ b/planteome/paw/paw_TransformForImport.pl @@ -36,7 +36,7 @@ paw_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -v -d supplied value; consequent files will be numbered in this format: '-#.suffix' -s Seed values (comma-delimited) for Source, Annotation, and - Publication IDs (in that order, e.g. '2,38,15'); Defaults to + Publication IDs (in that order, e.g. '3,38,15'); Defaults to '1' for all seed values -v View verbose information -d View debugging information @@ -262,6 +262,12 @@ sub import_generic chomp $line; my @tmp_data_ary = split($file_del, $line); + for (@tmp_data_ary) { + if ($_ eq '-' || $_ eq '.') { + $_ = ""; + } + } + my %tmp_hash; if ($debug) { print join(',',@tmp_data_ary) . "\n"; } @@ -403,6 +409,11 @@ sub import_gaf chomp $line; my @curr_line_ary = split("\t", $line); + for (@curr_line_ary) { + if ($_ eq '-' || $_ eq '.') { + $_ = ""; + } + } if ($debug) { print "...\n" . Dumper(\@curr_line_ary) . "\n\n"; } @@ -444,6 +455,10 @@ sub import_gaf $locus_finder_expr = "^(LOC_|[Oo][Ss]|osa-)"; $annotation_universals{"Species Name"} = "Oryza sativa"; # temp; need an NCBI lookup } + case "NCBI:29760" { + $locus_finder_expr = "^(Vv)"; + $annotation_universals{"Species Name"} = "Vitis vinifera"; # temp; need an NCBI lookup + } else { die($annotation_universals{"Species ID"} . " is not a valid NCBI taxon ID.\n"); @@ -516,6 +531,10 @@ sub import_gaf # for Ath, third char in locus, if it exists $chromosome = ($locus ne "" ? (split('',$locus))[2] : ""); } + if ($annotation_universals{"Species ID"} eq "NCBI:29760") { + # for Vv, third and fourth char in locus, if it exists + $chromosome = ($locus ne "" ? (split('',$locus))[2] . (split('',$locus))[3] : ""); + } } # set some sort of pseudo-unique value as the accession id, @@ -531,7 +550,7 @@ sub import_gaf "Gene Locus" => $locus, "Chromosome" => $chromosome }; - + # add synonyms for (split('\|', $curr_line_hash{"db_object_synonym"})) { $$annotation_properties{"Gene Synonyms"}{$_} = ""; } @@ -823,7 +842,9 @@ sub transform_gaf $writer->dataElement("Field", $annotation_universals{"Species Name"}, Name=>"Species Name"); $writer->dataElement("Field", $annotation_universals{"Species ID"}, Name=>"Species ID"); $writer->dataElement("Field", $annotation, Name=>"Gene Symbol"); - $writer->dataElement("Field", $annotations{$annotation}{"Gene Name"}, Name=>"Gene Name"); + if (length($annotations{$annotation}{"Gene Name"}) > 0) { + $writer->dataElement("Field", $annotations{$annotation}{"Gene Name"}, Name=>"Gene Name"); + } $writer->dataElement("Field", $annotations{$annotation}{"Gene Locus"}, Name=>"Gene Locus"); $writer->dataElement("Field", ucfirst($annotation_universals{"Gene Type"}), Name=>"Gene Type"); $writer->dataElement("Field", $annotations{$annotation}{"Chromosome"}, Name=>"Chromosome"); @@ -887,7 +908,9 @@ sub transform_gaf foreach my $evidence (keys %{$annotations{$annotation}{"Ontological References"}{$ont_term}{"Evidence"}}) { push @ary_evidence, $evidence; } - $writer->dataElement("Field", join(', ',@ary_evidence), Name=>"Evidence"); + if (scalar(@ary_evidence) > 0) { + $writer->dataElement("Field", join(', ',@ary_evidence), Name=>"Evidence"); + } $writer->endTag("Template"); } @@ -903,12 +926,18 @@ sub transform_gaf # items on the Annotation page foreach (@{@$template_field_map{"Annotation"}}) { - $writer->startTag("Template",Name=>"Provenance_Repeater"); - $writer->dataElement("Field", $_, Name=>"Source Field or Object"); - $writer->dataElement("Field", "Annotations", Name=>"Source Category"); - $writer->dataElement("Field", $annotations{$annotation}{"Accession ID"}, Name=>"Source Accession ID"); - $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source"); - $writer->endTag("Template"); + # include checks for all locations of annotation data + if (length($annotations{$annotation}{$_}) > 0 + || length($annotation_universals{$_}) > 0 + || $_ eq "Gene Symbol" # this will always be there; it's the key for the annotations + || $_ eq "Has Phenotype") { + $writer->startTag("Template",Name=>"Provenance_Repeater"); + $writer->dataElement("Field", $_, Name=>"Source Field or Object"); + $writer->dataElement("Field", "Annotations", Name=>"Source Category"); + $writer->dataElement("Field", $annotations{$annotation}{"Accession ID"}, Name=>"Source Accession ID"); + $writer->dataElement("Field", "Source:$SOURCE_TITLE_SEED", Name=>"Source"); + $writer->endTag("Template"); + } } # items on the Gene Synonyms subpage -- 2.34.1