Started adding GAF format input option to Rev 0.2

author preecej <preecej@localhost>

Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)

committer preecej <preecej@localhost>

Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)
author preecej <preecej@localhost>
Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)
committer preecej <preecej@localhost>
Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)
diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl

index b1b0b38d9d57c46bdeaa1b7c1aecb8283757996e..a6fbb62e67cdc9ac4cd5fc3b8c1a177a9a751667 100644 (file)
--- a/preecej/semantic_wiki/paw_TransformForImport.pl
+++ b/preecej/semantic_wiki/paw_TransformForImport.pl
@@ -2,7 +2,7 @@
  
  =head1 NAME
  
-Plant Semantics Import Transformation Script
+Planteome Annotation Wiki - Data Import Script
  
  =head1 VERSION
  
@@ -11,33 +11,35 @@ Plant Semantics Import Transformation Script
  =head1 DESCRIPTION
  
  Transform external gene annotation data into an XML document readable
-by the MediaWiki extension DataTransfer (Special:ImportXML) feature,
-including subpages. Also generates appropriate provenance of data based on header of 
-import file.
+by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
+
+It also generates provenance information for the imported data, based
+on the Source header that prefixes the import file.
+
+New in this revision: a GAF input file type for import, which
+accommodates ontology-focused annotations.
  
  =head1 USAGE
  
-PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -v -d
+paw_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -v -d
  
  =head1 OPTIONS
  
-    -i  Name of input CSV or tab-del file.
-    -t  Specifies input type of file ('csv' or 'tab')
-    -o  Name of output file.
-    -v  View verbose information 
+    -i  Name of input file
+        (if no path is supplied, the current working directory is assumed)
+    -t  Type of the input file
+        (allowed values: 'csv', 'tab', 'gaf')
+    -o  Name of output XML file
+        (if no path is supplied, the current working directory is assumed)
+    -v  View verbose information
      -d  View debugging information
  
  =head1 DEPENDENCIES
  
  Requires that the input file contain at least two headers: the first 
  will hold the provenance information associated with the imported 
-data, and all following headers will specify the templates and field
-names for the annotation data.
-
-The main templates pertain to Annotations, Sources, and Publications.
-Other templates are to be used for subsidiary reference data 
-(i.e. gene synonyms, external references, ontology terms, 
-and sequence data).
+data, and all following headers will specify the annotation data to
+be imported in accordance with the input type.
  
  =head2 Source Header Format Example
  
@@ -70,8 +72,8 @@ and sequence data).
      "
      "
  
-    NOTE: One of the fields in each format MUST be named "Accession ID" for the
-    tracking of provenance.
+    NOTE: One of the fields in each format MUST be named
+    "Accession ID" for the tracking of provenance.
  
  =head1 AUTHOR
  
@@ -95,22 +97,19 @@ use Getopt::Std;
  
  # specific
  use XML::Smart;
+use XML::DOM;
  
  # ---------------------------------------------------------------------------
  # declarations
  # ---------------------------------------------------------------------------
  
  my %opts; # arg options
-my $file_type; # tab or csv
-my $file_del;
+my $file_type; # tab, csv, gaf
+my $input_file;
+my $output_file;
  my $verbose = 0; # flag for verbose output
  my $debug = 0; # debugging switch
  
-# set paths for data files
-my $path = getcwd() . "/";
-my $input_file = $path;
-my $output_file = $path;
-
  # data storage
  my %source;
  my $template_name;
@@ -119,7 +118,7 @@ my @field_data;
  my $xml;
  my $output_data;
  
-my $prov_title_seed = 2;
+my $source_title_seed = 3;
  my $annot_title_seed = 3;
  
  # ---------------------------------------------------------------------------
@@ -136,17 +135,33 @@ sub init
      foreach my $key (keys %opts) { 
          my $value = $opts{$key};
          switch ($key) {
-            case "i" { $input_file = $input_file . $value; }
+            case "i" { 
+                if ($value =~ /\//) { # assume path
+                    $input_file = $value;
+                } else {
+                    $input_file = getcwd() . "\/$value";
+                }
+            }
              case "t" { $file_type = $value; }
-            case "o" { $output_file = $output_file . $value; }
+            
+            case "o" {
+                if ($value =~ /\//) { # assume path
+                    $output_file = $value;
+                } else {
+                    $output_file = getcwd() . "\/$value";
+                }
+            }
              case "v" { $verbose = 1; }
+            
              case "d" { $debug = 1; }
          }
      }
      
-    # split data on either commas or tabs, dependent on format
-    $file_del = ($file_type eq "csv") ? ',' : '\t';
-    
+    # split data on either commas or tabs, if file type is generic
+    if ($file_type = "csv" || $file_type = "tab") {
+        my $file_del = ($file_type eq "csv") ? ',' : '\t';
+    }
+        
      system "clear";
      print "\n"
          . "------------------------------------------------------------\n"
@@ -257,7 +272,7 @@ sub write_xml
  
      # set root element, source page and elements
      # (temp set of page title until moved to import extension) 
-    $xml->{Pages}{Page} = {Title => "Source:$prov_title_seed"};
+    $xml->{Pages}{Page} = {Title => "Source:$source_title_seed"};
  
      $curr_node = $xml->{Pages}{Page};
      $curr_node->{Template} = {Name => 'Source'};
@@ -343,7 +358,7 @@ sub write_xml
  
              # assign the relevant provenance field data            
              $curr_prov_node->{Field}[0] = {Name => 'Source'};
-            $curr_prov_node->{Field}[0]->content(0,"Source:$prov_title_seed");
+            $curr_prov_node->{Field}[0]->content(0,"Source:$source_title_seed");
              $curr_prov_node->{Field}[1] = {Name => 'Source Accession ID'};
              $curr_prov_node->{Field}[1]->content(0,$curr_accession_id);
              $curr_prov_node->{Field}[2] = {Name => 'Source Template'};
author	preecej <preecej@localhost>
	Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)
committer	preecej <preecej@localhost>
	Sat, 23 Jul 2011 00:47:37 +0000 (00:47 +0000)