=head1 NAME
-Plant Semantics Import Transformation Script
+Planteome Annotation Wiki - Data Import Script
=head1 VERSION
=head1 DESCRIPTION
Transform external gene annotation data into an XML document readable
-by the MediaWiki extension DataTransfer (Special:ImportXML) feature,
-including subpages. Also generates appropriate provenance of data based on header of
-import file.
+by the MediaWiki extension DataTransfer (Special:ImportXML) feature.
+
+It also generates the appropriate provenance for the imported data,
+based on the Source header that prefixes the import file.
+
+New in this revision: added a GAF file type option for import, which
+accommodates ontology-focused annotations.
=head1 USAGE
-PS_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -v -d
+paw_TransformForImport.pl -i INPUT_FILE -t TYPE -o OUTPUT_FILE -v -d
=head1 OPTIONS
- -i Name of input CSV or tab-del file.
- -t Specifies input type of file ('csv' or 'tab')
- -o Name of output file.
- -v View verbose information
+ -i Name of input file
+ (if no path supplied, assumes current working directory)
+ -t Specifies the type of the input file
+ (allowed values: 'csv', 'tab', 'gaf')
+ -o Name of output XML file.
+ (if no path supplied, assumes current working directory)
+ -v View verbose information
-d View debugging information
=head1 DEPENDENCIES
Requires that the input file contain at least two headers: the first
will hold the provenance information associated with the imported
-data, and all following headers will specify the templates and field
-names for the annotation data.
-
-The main templates pertain to Annotations, Sources, and Publications.
-Other templates are to be used for subsidiary reference data
-(i.e. gene synonyms, external references, ontology terms,
-and sequence data).
+data, and all following headers will specify the annotation data to
+be imported in accordance with the input type.
=head2 Source Header Format Example
"
"
- NOTE: One of the fields in each format MUST be named "Accession ID" for the
- tracking of provenance.
+ NOTE: One of the fields in each format MUST be named "Accession
+ ID" for the tracking of provenance.
=head1 AUTHOR
# specific
use XML::Smart;
+use XML::DOM;
# ---------------------------------------------------------------------------
# declarations
# ---------------------------------------------------------------------------
my %opts; # arg options
-my $file_type; # tab or csv
-my $file_del;
+my $file_type; # tab, csv, gaf
+my $input_file;
+my $output_file;
my $verbose = 0; # flag for verbose output
my $debug = 0; # debugging switch
-# set paths for data files
-my $path = getcwd() . "/";
-my $input_file = $path;
-my $output_file = $path;
-
# data storage
my %source;
my $template_name;
my $xml;
my $output_data;
-my $prov_title_seed = 2;
+my $source_title_seed = 3;
my $annot_title_seed = 3;
# ---------------------------------------------------------------------------
foreach my $key (keys %opts) {
my $value = $opts{$key};
switch ($key) {
- case "i" { $input_file = $input_file . $value; }
+ case "i" {
+ if ($value =~ /\//) { # assume path
+ $input_file = $value;
+ } else {
+ $input_file = getcwd() . "\/$value";
+ }
+ }
case "t" { $file_type = $value; }
- case "o" { $output_file = $output_file . $value; }
+
+ case "o" {
+ if ($value =~ /\//) { # assume path
+ $output_file = $value;
+ } else {
+ $output_file = getcwd() . "\/$value";
+ }
+ }
case "v" { $verbose = 1; }
+
case "d" { $debug = 1; }
}
}
- # split data on either commas or tabs, dependent on format
- $file_del = ($file_type eq "csv") ? ',' : '\t';
-
+ # Choose the field delimiter when the file type is one of the generic
+ # delimited formats ('csv' or 'tab'); for any other type, such as
+ # 'gaf', $file_del is left undefined.
+ # String equality (eq) is required in the test: '=' would assign to
+ # $file_type rather than compare it. The defined() guard avoids an
+ # uninitialized-value warning when no -t option was supplied.
+ # NOTE(review): $file_del is declared outside the if block so that it
+ # stays visible after the conditional; confirm that every later use of
+ # $file_del falls within this enclosing scope.
+ my $file_del;
+ if (defined $file_type && ($file_type eq "csv" || $file_type eq "tab")) {
+     $file_del = ($file_type eq "csv") ? ',' : '\t';
+ }
+
system "clear";
print "\n"
. "------------------------------------------------------------\n"
# set root element, source page and elements
# (the page title is set here temporarily, until title assignment is moved into the import extension)
- $xml->{Pages}{Page} = {Title => "Source:$prov_title_seed"};
+ $xml->{Pages}{Page} = {Title => "Source:$source_title_seed"};
$curr_node = $xml->{Pages}{Page};
$curr_node->{Template} = {Name => 'Source'};
# assign the relevant provenance field data
$curr_prov_node->{Field}[0] = {Name => 'Source'};
- $curr_prov_node->{Field}[0]->content(0,"Source:$prov_title_seed");
+ $curr_prov_node->{Field}[0]->content(0,"Source:$source_title_seed");
$curr_prov_node->{Field}[1] = {Name => 'Source Accession ID'};
$curr_prov_node->{Field}[1]->content(0,$curr_accession_id);
$curr_prov_node->{Field}[2] = {Name => 'Source Template'};