Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Completed first working version.
author preecej <preecej@localhost>
Thu, 2 Jun 2011 20:48:20 +0000 (20:48 +0000)
committer preecej <preecej@localhost>
Thu, 2 Jun 2011 20:48:20 +0000 (20:48 +0000)
svn path=/; revision=105

preecej/semantic_wiki/PS_TransformForImport.pl

index 496011d8c6b749bb5086bf35bc5be0b09f571098..1a219b09ba1d661e73d890ca771fe5b51de66f15 100644 (file)
@@ -24,6 +24,8 @@ PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE
     -i  Name of input CSV or tab-del file.
     -t  Specifies input type of file ('csv' or 'tab')
     -o  Name of output file.
+    -v  View verbose information 
+    -d  View debugging information
 
 =head1 DEPENDENCIES
 
@@ -43,15 +45,16 @@ data.
 
 =head2 Data Header Format Example (field separator may also be a tab)
 
+    [Format]
+    Template=Gene
+    Fields=Species Name,Gene Symbol,Gene Name,Chromosome,Gene Type,Accession ID
     [Data]
-    Template=Gene Annotation
-    Gene Name,Gene Annotation,Gene Symbol,...
     val1,val2,val3,...
     "
     "
     "
 
-    NOTE: One of the fields must be named Accession ID for the
+    NOTE: One of the fields MUST be named "Accession ID" for the
     tracking of provenance.
 
 =head1 AUTHOR
@@ -100,6 +103,9 @@ my @field_data;
 my $xml;
 my $output_data;
 
+my $prov_title_seed = 3;
+my $annot_title_seed = 1563;
+
 # ---------------------------------------------------------------------------
 # functions
 # ---------------------------------------------------------------------------
@@ -122,6 +128,7 @@ sub init
         }
     }
     
+    # split data on either commas or tabs, dependent on format
     $file_del = ($file_type eq "csv") ? ',' : '\t';
     
     system "clear";
@@ -147,7 +154,7 @@ sub init
 # ---------------------------------------------------------------------------
 sub import_data
 {
-    print "Opening file and reading header info...\n\n";
+    print "Opening input file and reading header info...\n\n";
 
     # open file
     open(INPUT_FILE,$input_file) or die("Could not open input file.");
@@ -224,34 +231,117 @@ sub show_input
 sub write_xml
 {
     print "Transforming data...\n\n";
-    my $current_node; # placeholder for node cursor
+    my $curr_node; # placeholder for general node cursor
+    my $curr_prov_node; # placeholder for node cursor in provenance pages
+    my $curr_annot_node; # placeholder for node cursor in annotation pages
+    my $curr_accession_id; # holds each row's accession id for provenance marking
+
     
     $xml = new XML::Smart($output_data, 'XML::Smart::Parser');
 
     # set root element, provenance page and elements
-    $xml->{Pages}{Page} = {Title => 'PROV:3'};
+    # (temp set of page title until moved to import extension) 
+    $xml->{Pages}{Page} = {Title => "PROV:$prov_title_seed"};
 
-    $current_node = $xml->{Pages}{Page};
-    $current_node->{Template} = {Name => 'Provenance'};
-    $current_node = $current_node->{Template};    
+    $curr_node = $xml->{Pages}{Page};
+    $curr_node->{Template} = {Name => 'Provenance'};
+    $curr_node = $curr_node->{Template};    
     
-    $current_node->{Field}[0] = {Name => 'Source Data Time Stamp'};
-    $current_node = $current_node->{Field}[0];
-    $current_node->content(0,$provenance{'SourceDateTimeStamp'});
-    $current_node = $current_node->back();
-
-    $current_node->{Field}[1] = {Name => 'Source Database'};
-    $current_node = $current_node->{Field}[1];
-    $current_node->content(0,$provenance{'SourceDatabase'});
-    $current_node = $current_node->back();
+    $curr_node->{Field}[0] = {Name => 'Source Data Time Stamp'};
+    $curr_node->{Field}[0]->content(0,$provenance{'SourceDateTimeStamp'});
+    $curr_node->{Field}[1] = {Name => 'Source Database'};
+    $curr_node->{Field}[1]->content(0,$provenance{'SourceDatabase'});
+    $curr_node->{Field}[2] = {Name => 'Source Version'};
+    $curr_node->{Field}[2]->content(0,$provenance{'SourceVersion'});
+    $curr_node->{Field}[3] = {Name => 'Source URI'};
+    $curr_node->{Field}[3]->content(0,$provenance{'SourceURI'});
+    $curr_node->{Field}[4] = {Name => 'Source File'};
+    $curr_node->{Field}[4]->content(0,$provenance{'SourceFile'});
     
-    # iterate through the data
+    $curr_node = $curr_node->back->back; # return to <Pages> node
     
-        # build the annotation page
+    if ($debug) { print "Current node: " . $curr_node->path . "\n"; }
+    
+    my $next_page_title_id = $annot_title_seed;
+    
+    # iterate through the data
+    foreach my $row (@field_data) {
+
+        # set up next annotation page
+        my $next_page = { Title => "PS:$next_page_title_id" };
+        push(@{$curr_node->{Page}}, $next_page);
+
+        $curr_annot_node = $curr_node->{Page}(
+            "Title","eq","PS:$next_page_title_id");
+        
+        if ($debug) { print "Curr annot node: ".$curr_annot_node->path."\n";}
+
+        $curr_annot_node->{Template} = {Name => "$template_name"};
+        $curr_annot_node = $curr_annot_node->{Template};    
+      
+        # set up next provenance page
+        my $next_page = { Title => "PS:$next_page_title_id/PROV" };
+        push(@{$curr_node->{Page}}, $next_page);
 
-        # add a template to the annotation provenance page
+        $curr_prov_node = $curr_node->{Page}(
+            "Title","eq","PS:$next_page_title_id/PROV");
         
+        if ($debug) {print "Curr prov node: " . $curr_prov_node->path . "\n"; }
+
+        $curr_prov_node->{Template} = {Name => 'Provenance_Reference_Data'};
+        $curr_prov_node = $curr_prov_node->{Template};    
+        $curr_prov_node->{Field} = {Name => 'Annotation Page'};
+        $curr_prov_node->{Field}->content(0,"PS:$next_page_title_id");
+        $curr_prov_node = $curr_prov_node->back;
+
+        my $field_ct = 0; # counter for field position in pages 
+
+        # grab the Accession ID for the current row of data
+        foreach my $key (keys %$row) {
+            if ($key eq "Accession ID") {
+                $curr_accession_id = $row->{$key};
+                if ($debug) {
+                    print "* Found Accession ID: $curr_accession_id *\n";
+                }
+            }
+        }
+        if (!(defined $curr_accession_id)) {
+            die "Error: No Accession ID available\n";
+        }
+        
+        # iterate through the annotation data and assign to <Field> elements
+        foreach my $key (keys %$row) {
+            if ($debug) { print "$key => " . $row->{$key} . "\n"; }
+
+            # build the annotation page
+            $curr_annot_node->{Field}[$field_ct] = {Name => $key};
+            $curr_annot_node->{Field}[$field_ct]->content(0,$row->{$key});
+            $field_ct++;
+            
+            # add a corresponding template to the annotation provenance page
+            my $next_prov_node = {Name => 'Provenance_Reference_Data_Repeater'};
+            push(@{$curr_prov_node->{Template}}, $next_prov_node);
+            
+            # grab the last template you added
+            $curr_prov_node = @{$curr_prov_node->{Template}}[-1];
+
+            # assign the relevant provenance field data            
+            $curr_prov_node->{Field}[0] = {Name => 'Provenance Page'};
+            $curr_prov_node->{Field}[0]->content(0,"PROV:$prov_title_seed");
+            $curr_prov_node->{Field}[1] = {Name => 'Accession ID'};
+            $curr_prov_node->{Field}[1]->content(0,$curr_accession_id);
+            $curr_prov_node->{Field}[2] = {Name => 'Source Field'};
+            $curr_prov_node->{Field}[2]->content(0,$key);
+            $curr_prov_node->{Field}[3] = {Name => 'Source Template'};
+            $curr_prov_node->{Field}[3]->content(0,$template_name);
+            
+            $curr_prov_node = $curr_prov_node->back;
+        }
+        $next_page_title_id++;
+    }
+
     # write out xml doc to a single ImportXML file
+    print "Writing data to output file...\n\n";
     $xml->save($output_file);
 }