Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Tweaked for oryza (refined multi-species handling)
author preecej <preecej@localhost>
Sat, 3 Sep 2011 00:19:57 +0000 (00:19 +0000)
committer preecej <preecej@localhost>
Sat, 3 Sep 2011 00:19:57 +0000 (00:19 +0000)
svn path=/; revision=175

preecej/semantic_wiki/paw_TransformForImport.pl

index ce7e82c855c1f309497abc1c2d12c7cca3def30a..c20c010ca4a09e9ced61acaf9503cfcb69465501 100644 (file)
@@ -138,9 +138,9 @@ my %annotations; # keyed on Gene Symbol
 
 # page id seeds, until I can figure out how to auto-increment w/in the import 
 # script
-my $SOURCE_TITLE_SEED = 1;
-my $ANNOT_TITLE_SEED = 1;
-my $PUB_TITLE_SEED = 1;
+my $SOURCE_TITLE_SEED = 2;
+my $ANNOT_TITLE_SEED = 21;
+my $PUB_TITLE_SEED = 7;
 
 $Data::Dumper::Pad = "... "; 
 
@@ -413,9 +413,13 @@ sub import_gaf
             # set species-specific values
             switch ($annotation_universals{"Species ID"}) {
                 case "NCBI:3702" {
-                    $locus_finder_expr = "/[Aa][Tt].[Gg]/";
+                    $locus_finder_expr = "^([Aa][Tt].[Gg])";
                     $annotation_universals{"Species Name"} = "Arabidopsis thaliana"; # temp; need an NCBI lookup
                 }
+                case "NCBI:4530" {
+                    $locus_finder_expr = "^(LOC_|[Oo][Ss]|osa-)";
+                    $annotation_universals{"Species Name"} = "Oryza sativa"; # temp; need an NCBI lookup
+                }
                 else {
                     die($annotation_universals{"Species ID"} 
                         . " is not a valid NCBI taxon ID.\n");
@@ -438,18 +442,22 @@ sub import_gaf
             # print "\n*** NEW SYMBOL: $curr_line_hash{'db_object_symbol'} ***\n"; # TEST
 
             # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms
-            my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
-            
-            if ($debug) { print "...<DEBUG: \@synonyms>\n"
-                . Dumper(\@synonyms) . "\n\n"; }
-            
-            # find the gene locus, if it is listed (first "AT.G")
-            my @loci = grep $locus_finder_expr, @synonyms;
-
-            if ($debug) { print "...<DEBUG: \@loci>\n"
-                . Dumper(\@loci) . "\n\n"; }
-
+            my @loci;
             my $locus = "";
+
+            if (exists $curr_line_hash{"db_object_synonym"})
+            {
+                my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"});
+                
+                if ($debug) { print "...<DEBUG: \@synonyms>\n"
+                    . Dumper(\@synonyms) . "\n\n"; }
+                
+                # find the gene locus, if it is listed
+                @loci = grep /$locus_finder_expr/, @synonyms;
+    
+                if ($debug) { print "...<DEBUG: \@loci>\n"
+                    . Dumper(\@loci) . "\n\n"; }
+            }
             
             if (scalar(@loci) > 0) # we have at least one match; use the first one
             {    
@@ -460,7 +468,14 @@ sub import_gaf
                 if ($curr_line_hash{"db_object_symbol"} =~ $locus_finder_expr)
                 {
                     # the split drops the variant/allele signifier, if present
-                    $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0];
+                    if (!(split('.',$curr_line_hash{"db_object_symbol"}))[0])
+                    {
+                        $locus = $curr_line_hash{"db_object_symbol"};
+                    }
+                    else
+                    {
+                        $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0];
+                    }
                 }
                 else # no match; attempt to use the Gene Name instead
                 {
@@ -471,9 +486,14 @@ sub import_gaf
                 }
             }
             
-            # chromosome = third char in locus, if it exists
-            my $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
-            
+            my $chromosome = "";
+            if ($locus) {
+                if ($annotation_universals{"Species ID"} eq "NCBI:3702") {
+                    # for Ath, third char in locus, if it exists
+                    $chromosome = ($locus ne "" ? (split('',$locus))[2] : "");
+                }
+            }
+                
             # set some sort of pseudo-unique value as the accession id, 
             # in order of succession: locus, then symbol
             # (NOTE: this is dangerous; a stable identifier is preferred)