From 58e743aa15dcff0f96bc66196e037347f57906e3 Mon Sep 17 00:00:00 2001 From: preecej Date: Thu, 2 Jun 2011 20:48:20 +0000 Subject: [PATCH] Completed first working version. svn path=/; revision=105 --- .../semantic_wiki/PS_TransformForImport.pl | 132 +++++++++++++++--- 1 file changed, 111 insertions(+), 21 deletions(-) diff --git a/preecej/semantic_wiki/PS_TransformForImport.pl b/preecej/semantic_wiki/PS_TransformForImport.pl index 496011d..1a219b0 100644 --- a/preecej/semantic_wiki/PS_TransformForImport.pl +++ b/preecej/semantic_wiki/PS_TransformForImport.pl @@ -24,6 +24,8 @@ PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -i Name of input CSV or tab-del file. -t Specifies input type of file ('csv' or 'tab') -o Name of output file. + -v View verbose information + -d View debugging information =head1 DEPENDENCIES @@ -43,15 +45,16 @@ data. =head2 Data Header Format Example (field separator may also be a tab) + [Format] + Template=Gene + Fields=Species Name,Gene Symbol,Gene Name,Chromosome,Gene Type,Accession ID [Data] - Template=Gene Annotation - Gene Name,Gene Annotation,Gene Symbol,... val1,val2,val3,... " " " - NOTE: One of the fields must be named Accession ID for the + NOTE: One of the fields MUST be named "Accession ID" for the tracking of provenance. =head1 AUTHOR @@ -100,6 +103,9 @@ my @field_data; my $xml; my $output_data; +my $prov_title_seed = 3; +my $annot_title_seed = 1563; + # --------------------------------------------------------------------------- # functions # --------------------------------------------------------------------------- @@ -122,6 +128,7 @@ sub init } } + # split data on either commas or tabs, dependent on format $file_del = ($file_type eq "csv") ? ',' : '\t'; system "clear"; @@ -147,7 +154,7 @@ sub init # --------------------------------------------------------------------------- sub import_data { - print "Opening file and reading header info...\n\n"; + print "Opening input file and reading header info...\n\n"; # open file open(INPUT_FILE,$input_file) or die("Could not open input file."); @@ -224,34 +231,117 @@ sub show_input sub write_xml { print "Transforming data...\n\n"; - my $current_node; # placeholder for node cursor + my $curr_node; # placeholder for general node cursor + my $curr_prov_node; # placeholder for node cursor in provenance pages + my $curr_annot_node; # placeholder for node cursor in annotation pages + my $curr_accession_id; # holds each rows accession id for provenance marking + $xml = new XML::Smart($output_data, 'XML::Smart::Parser'); # set root element, provenance page and elements - $xml->{Pages}{Page} = {Title => 'PROV:3'}; + # (temp set of page title until moved to import extension) + $xml->{Pages}{Page} = {Title => "PROV:$prov_title_seed"}; - $current_node = $xml->{Pages}{Page}; - $current_node->{Template} = {Name => 'Provenance'}; - $current_node = $current_node->{Template}; + $curr_node = $xml->{Pages}{Page}; + $curr_node->{Template} = {Name => 'Provenance'}; + $curr_node = $curr_node->{Template}; - $current_node->{Field}[0] = {Name => 'Source Data Time Stamp'}; - $current_node = $current_node->{Field}[0]; - $current_node->content(0,$provenance{'SourceDateTimeStamp'}); - $current_node = $current_node->back(); - - $current_node->{Field}[1] = {Name => 'Source Database'}; - $current_node = $current_node->{Field}[1]; - $current_node->content(0,$provenance{'SourceDatabase'}); - $current_node = $current_node->back(); + $curr_node->{Field}[0] = {Name => 'Source Data Time Stamp'}; + $curr_node->{Field}[0]->content(0,$provenance{'SourceDateTimeStamp'}); + $curr_node->{Field}[1] = {Name => 'Source Database'}; + $curr_node->{Field}[1]->content(0,$provenance{'SourceDatabase'}); + $curr_node->{Field}[2] = {Name => 'Source Version'}; + $curr_node->{Field}[2]->content(0,$provenance{'SourceVersion'}); + $curr_node->{Field}[3] = {Name => 'Source URI'}; + $curr_node->{Field}[3]->content(0,$provenance{'SourceURI'}); + $curr_node->{Field}[4] = {Name => 'Source File'}; + $curr_node->{Field}[4]->content(0,$provenance{'SourceFile'}); - # iterate through the data + $curr_node = $curr_node->back->back; # return to node - # build the annotation page + if ($debug) { print "Current node: " . $curr_node->path . "\n"; } + + my $next_page_title_id = $annot_title_seed; + + # iterate through the data + foreach my $row (@field_data) { + + # set up next annotation page + my $next_page = { Title => "PS:$next_page_title_id" }; + push(@{$curr_node->{Page}}, $next_page); + + $curr_annot_node = $curr_node->{Page}( + "Title","eq","PS:$next_page_title_id"); + + if ($debug) { print "Curr annot node: ".$curr_annot_node->path."\n";} + + $curr_annot_node->{Template} = {Name => "$template_name"}; + $curr_annot_node = $curr_annot_node->{Template}; + + # set up next provenance page + my $next_page = { Title => "PS:$next_page_title_id/PROV" }; + push(@{$curr_node->{Page}}, $next_page); - # add a template to the annotation provenance page + $curr_prov_node = $curr_node->{Page}( + "Title","eq","PS:$next_page_title_id/PROV"); + if ($debug) {print "Curr prov node: " . $curr_prov_node->path . "\n"; } + + $curr_prov_node->{Template} = {Name => 'Provenance_Reference_Data'}; + $curr_prov_node = $curr_prov_node->{Template}; + $curr_prov_node->{Field} = {Name => 'Annotation Page'}; + $curr_prov_node->{Field}->content(0,"PS:$next_page_title_id"); + $curr_prov_node = $curr_prov_node->back; + + my $field_ct = 0; # counter for field position in pages + + # grab the Accession ID for the current row of data + foreach my $key (keys %$row) { + if ($key eq "Accession ID") { + $curr_accession_id = $row->{$key}; + if ($debug) { + print "* Found Accession ID: $curr_accession_id *\n"; + } + } + } + if (!(defined $curr_accession_id)) { + die "Error: No Accession ID available\n"; + } + + # iterate through the annotation data and assign to elements + foreach my $key (keys %$row) { + if ($debug) { print "$key => " . $row->{$key} . "\n"; } + + # build the annotation page + $curr_annot_node->{Field}[$field_ct] = {Name => $key}; + $curr_annot_node->{Field}[$field_ct]->content(0,$row->{$key}); + $field_ct++; + + # add a corresponding template to the annotation provenance page + my $next_prov_node = {Name => 'Provenance_Reference_Data_Repeater'}; + push(@{$curr_prov_node->{Template}}, $next_prov_node); + + # grab the last template you added + $curr_prov_node = @{$curr_prov_node->{Template}}[-1]; + + # assign the relevant provenance field data + $curr_prov_node->{Field}[0] = {Name => 'Provenance Page'}; + $curr_prov_node->{Field}[0]->content(0,"PROV:$prov_title_seed"); + $curr_prov_node->{Field}[1] = {Name => 'Accession ID'}; + $curr_prov_node->{Field}[1]->content(0,$curr_accession_id); + $curr_prov_node->{Field}[2] = {Name => 'Source Field'}; + $curr_prov_node->{Field}[2]->content(0,$key); + $curr_prov_node->{Field}[3] = {Name => 'Source Template'}; + $curr_prov_node->{Field}[3]->content(0,$template_name); + + $curr_prov_node = $curr_prov_node->back; + } + $next_page_title_id++; + } + # write out xml doc to a single ImportXML file + print "Writing data to output file...\n\n"; $xml->save($output_file); } -- 2.34.1