From: preecej Date: Sat, 3 Sep 2011 00:19:57 +0000 (+0000) Subject: Tweaked for oryza (refined multi-species handling) X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=cb918e607891c0d48520d75052fbe20e1bc9c58c;p=old-jaiswallab-svn%2F.git Tweaked for oryza (refined multi-species handling) svn path=/; revision=175 --- diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index ce7e82c..c20c010 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -138,9 +138,9 @@ my %annotations; # keyed on Gene Symbol # page id seeds, until I can figure out how to auto-increment w/in the import # script -my $SOURCE_TITLE_SEED = 1; -my $ANNOT_TITLE_SEED = 1; -my $PUB_TITLE_SEED = 1; +my $SOURCE_TITLE_SEED = 2; +my $ANNOT_TITLE_SEED = 21; +my $PUB_TITLE_SEED = 7; $Data::Dumper::Pad = "... "; @@ -413,9 +413,13 @@ sub import_gaf # set species-specific values switch ($annotation_universals{"Species ID"}) { case "NCBI:3702" { - $locus_finder_expr = "/[Aa][Tt].[Gg]/"; + $locus_finder_expr = "^([Aa][Tt].[Gg])"; $annotation_universals{"Species Name"} = "Arabidopsis thaliana"; # temp; need an NCBI lookup } + case "NCBI:4530" { + $locus_finder_expr = "^(LOC_|[Oo][Ss]|osa-)"; + $annotation_universals{"Species Name"} = "Oryza sativa"; # temp; need an NCBI lookup + } else { die($annotation_universals{"Species ID"} . " is not a valid NCBI taxon ID.\n"); @@ -438,18 +442,22 @@ sub import_gaf # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms - my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"}); - - if ($debug) { print "...\n" - . Dumper(\@synonyms) . "\n\n"; } - - # find the gene locus, if it is listed (first "AT.G") - my @loci = grep $locus_finder_expr, @synonyms; - - if ($debug) { print "...\n" - . Dumper(\@loci) . "\n\n"; } - + my @loci; my $locus = ""; + + if (exists $curr_line_hash{"db_object_synonym"}) + { + my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"}); + + if ($debug) { print "...\n" + . Dumper(\@synonyms) . "\n\n"; } + + # find the gene locus, if it is listed + @loci = grep /$locus_finder_expr/, @synonyms; + + if ($debug) { print "...\n" + . Dumper(\@loci) . "\n\n"; } + } if (scalar(@loci) > 0) # we have at least one match; use the first one { @@ -460,7 +468,14 @@ sub import_gaf if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr) { # the split drops the variant/allele signifier, if present - $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0]; + if (!(split('.',$curr_line_hash{"db_object_symbol"}))[0]) + { + $locus = $curr_line_hash{"db_object_symbol"}; + } + else + { + $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0]; + } } else # no match; attempt to use the Gene Name instead { @@ -471,9 +486,14 @@ sub import_gaf } } - # chromosome = third char in locus, if it exists - my $chromosome = ($locus ne "" ? (split('',$locus))[2] : ""); - + my $chromosome = ""; + if ($locus) { + if ($annotation_universals{"Species ID"} eq "NCBI:3702") { + # for Ath, third char in locus, if it exists + $chromosome = ($locus ne "" ? (split('',$locus))[2] : ""); + } + } + # set some sort of pseudo-unique value as the accession id, # in order of succession: locus, then symbol # (NOTE: this is dangerous; a stable identifier is preferred)