-i Name of input CSV or tab-delimited file.
-t Specifies input type of file ('csv' or 'tab')
-o Name of output file.
+ -v View verbose information
+ -d View debugging information
=head1 DEPENDENCIES
=head2 Data Header Format Example (field separator may also be a tab)
+ [Format]
+ Template=Gene
+ Fields=Species Name,Gene Symbol,Gene Name,Chromosome,Gene Type,Accession ID
[Data]
- Template=Gene Annotation
- Gene Name,Gene Annotation,Gene Symbol,...
val1,val2,val3,...
"
"
"
- NOTE: One of the fields must be named Accession ID for the
+ NOTE: One of the fields MUST be named "Accession ID" for the
tracking of provenance.
=head1 AUTHOR
my $xml;                       # document object; assigned from the XML::Smart constructor in write_xml
my $output_data;               # passed as the first argument to the XML::Smart constructor in write_xml
+my $prov_title_seed = 3;      # numeric suffix of the top-level "PROV:<n>" page title
+my $annot_title_seed = 1563;  # first <n> used for "PS:<n>" annotation page titles; write_xml increments it per data row
+
# ---------------------------------------------------------------------------
# functions
# ---------------------------------------------------------------------------
}
}
+ # split data on either commas or tabs, dependent on format
$file_del = ($file_type eq "csv") ? ',' : '\t';
system "clear";
# ---------------------------------------------------------------------------
sub import_data
{
- print "Opening file and reading header info...\n\n";
+ print "Opening input file and reading header info...\n\n";
# open file
open(INPUT_FILE,$input_file) or die("Could not open input file.");
sub write_xml
{
+ # Builds the ImportXML document and saves it to $output_file.
+ # Emits one top-level provenance page ("PROV:<seed>"), then for every
+ # row in @field_data an annotation page ("PS:<id>") and a companion
+ # provenance page ("PS:<id>/PROV") carrying one repeater template per
+ # field. Dies if a row has no defined "Accession ID" value.
+ # Relies on variables declared outside this sub: %provenance,
+ # @field_data, $template_name, $output_data, $output_file, $debug,
+ # $prov_title_seed and $annot_title_seed.
print "Transforming data...\n\n";
- my $current_node; # placeholder for node cursor
+ my $curr_node; # placeholder for general node cursor
+ my $curr_prov_node; # placeholder for node cursor in provenance pages
+ my $curr_annot_node; # placeholder for node cursor in annotation pages
+ my $curr_accession_id; # holds each row's accession id for provenance marking
+
$xml = new XML::Smart($output_data, 'XML::Smart::Parser');
# set root element, provenance page and elements
- $xml->{Pages}{Page} = {Title => 'PROV:3'};
+ # (temp set of page title until moved to import extension)
+ $xml->{Pages}{Page} = {Title => "PROV:$prov_title_seed"};
- $current_node = $xml->{Pages}{Page};
- $current_node->{Template} = {Name => 'Provenance'};
- $current_node = $current_node->{Template};
+ $curr_node = $xml->{Pages}{Page};
+ $curr_node->{Template} = {Name => 'Provenance'};
+ $curr_node = $curr_node->{Template};
- $current_node->{Field}[0] = {Name => 'Source Data Time Stamp'};
- $current_node = $current_node->{Field}[0];
- $current_node->content(0,$provenance{'SourceDateTimeStamp'});
- $current_node = $current_node->back();
-
- $current_node->{Field}[1] = {Name => 'Source Database'};
- $current_node = $current_node->{Field}[1];
- $current_node->content(0,$provenance{'SourceDatabase'});
- $current_node = $current_node->back();
+ $curr_node->{Field}[0] = {Name => 'Source Data Time Stamp'};
+ $curr_node->{Field}[0]->content(0,$provenance{'SourceDateTimeStamp'});
+ $curr_node->{Field}[1] = {Name => 'Source Database'};
+ $curr_node->{Field}[1]->content(0,$provenance{'SourceDatabase'});
+ $curr_node->{Field}[2] = {Name => 'Source Version'};
+ $curr_node->{Field}[2]->content(0,$provenance{'SourceVersion'});
+ $curr_node->{Field}[3] = {Name => 'Source URI'};
+ $curr_node->{Field}[3]->content(0,$provenance{'SourceURI'});
+ $curr_node->{Field}[4] = {Name => 'Source File'};
+ $curr_node->{Field}[4]->content(0,$provenance{'SourceFile'});
- # iterate through the data
+ $curr_node = $curr_node->back->back; # return to <Pages> node
- # build the annotation page
+ if ($debug) { print "Current node: " . $curr_node->path . "\n"; }
+
+ my $next_page_title_id = $annot_title_seed;
+
+ # iterate through the data
+ foreach my $row (@field_data) {
+
+ # set up next annotation page
+ my $next_page = { Title => "PS:$next_page_title_id" };
+ push(@{$curr_node->{Page}}, $next_page);
+
+ $curr_annot_node = $curr_node->{Page}(
+ "Title","eq","PS:$next_page_title_id");
+
+ if ($debug) { print "Curr annot node: ".$curr_annot_node->path."\n";}
+
+ $curr_annot_node->{Template} = {Name => "$template_name"};
+ $curr_annot_node = $curr_annot_node->{Template};
+
+ # set up next provenance page
+ # (distinct variable name: re-declaring $next_page with "my" in the
+ # same scope would mask the annotation page hash declared above)
+ my $next_prov_page = { Title => "PS:$next_page_title_id/PROV" };
+ push(@{$curr_node->{Page}}, $next_prov_page);
- # add a template to the annotation provenance page
+ $curr_prov_node = $curr_node->{Page}(
+ "Title","eq","PS:$next_page_title_id/PROV");
+ if ($debug) {print "Curr prov node: " . $curr_prov_node->path . "\n"; }
+
+ $curr_prov_node->{Template} = {Name => 'Provenance_Reference_Data'};
+ $curr_prov_node = $curr_prov_node->{Template};
+ $curr_prov_node->{Field} = {Name => 'Annotation Page'};
+ $curr_prov_node->{Field}->content(0,"PS:$next_page_title_id");
+ $curr_prov_node = $curr_prov_node->back;
+
+ my $field_ct = 0; # counter for field position in pages
+
+ # grab the Accession ID for the current row of data; the direct
+ # lookup reassigns the variable on every row, so a row lacking the
+ # field yields undef instead of silently reusing the previous row's id
+ $curr_accession_id = $row->{'Accession ID'};
+ if (!(defined $curr_accession_id)) {
+ die "Error: No Accession ID available\n";
+ }
+ if ($debug) {
+ print "* Found Accession ID: $curr_accession_id *\n";
+ }
+
+ # iterate through the annotation data and assign to <Field> elements
+ foreach my $key (keys %$row) {
+ if ($debug) { print "$key => " . $row->{$key} . "\n"; }
+
+ # build the annotation page
+ $curr_annot_node->{Field}[$field_ct] = {Name => $key};
+ $curr_annot_node->{Field}[$field_ct]->content(0,$row->{$key});
+ $field_ct++;
+
+ # add a corresponding template to the annotation provenance page
+ my $next_prov_node = {Name => 'Provenance_Reference_Data_Repeater'};
+ push(@{$curr_prov_node->{Template}}, $next_prov_node);
+
+ # grab the last template you added
+ $curr_prov_node = @{$curr_prov_node->{Template}}[-1];
+
+ # assign the relevant provenance field data
+ $curr_prov_node->{Field}[0] = {Name => 'Provenance Page'};
+ $curr_prov_node->{Field}[0]->content(0,"PROV:$prov_title_seed");
+ $curr_prov_node->{Field}[1] = {Name => 'Accession ID'};
+ $curr_prov_node->{Field}[1]->content(0,$curr_accession_id);
+ $curr_prov_node->{Field}[2] = {Name => 'Source Field'};
+ $curr_prov_node->{Field}[2]->content(0,$key);
+ $curr_prov_node->{Field}[3] = {Name => 'Source Template'};
+ $curr_prov_node->{Field}[3]->content(0,$template_name);
+
+ # step back up to the provenance page for the next repeater template
+ $curr_prov_node = $curr_prov_node->back;
+ }
+ $next_page_title_id++;
+ }
+
# write out xml doc to a single ImportXML file
+ print "Writing data to output file...\n\n";
$xml->save($output_file);
}