=head1 DESCRIPTION
Swap out one set of genes for another in an existing PathVisio GPML
-file.
+file. Optionally removes literature references.
+
+If multiple replacement genes map to a single original gene,
+multiple PathVisio boxes will be drawn in place of the
+original.
+
+If a group associating multiple gene boxes already existed,
+that group will extend to any replacement genes. If no group
+existed previously, and multiple replacement gene boxes are required,
+a new group will be created.
+
+The replacement gene symbols can be prefixed to separate them from the
+original, and an ordinal suffix ('-#') will be added to a group of
+replacement genes.
+
+Any new gene boxes may be painted with a custom color and border, and
+will be stacked and offset for ease of visualization (much like a deck
+of cards).
=head1 USAGE
-pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -v -d
+pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -L -v -G -d
=head1 OPTIONS
-o Name of output GPML file.
(NOTE: if no path supplied for input files,
current working directory is assumed)
+ -L Remove literature references.
-v View verbose information
+ -G Display GPML input/output documents
-d View debugging information
=head1 DEPENDENCIES and PREREQUISITES
- - Non-standard Perl modules: Switch, XML::DOM
+ - Non-standard Perl modules: Switch, Data::Dumper, XML::DOM
- The input file must be a valid GPML file
- The CSV file must have a single-line column header.
The first column must have one and only one gene -- the
"original" gene.
- The second column may have one or more genes or gene variants
- -- the "replacement" gene(s).
-
- If the second column contains multiple genes or gene variants,
- multiple PathVisio boxes will be drawn in place of the
- original.
+ The second column may have one and only one gene variant or
+ paralog -- the "replacement" gene.
- - The config file may have any or all of the following entries:
+ - The config file may have any or all of the following entries,
+ in addition to the required fields (in any order):
+ Source= (required)
+ Database= (required)
+ Version= (required)
Title=
- MaintainedBy=
+ Author=
+ Maintainer=
Organism=
+ CommentPrefix= (will precede back-reference to prior source,
+ default: "Previously labeled as: ")
+ LabelPrefix= (precedes current gene label)
BoxBorder= (RRGGBB hex, default: black)
BoxColor= (RRGGBB hex, default: white)
- BoxWidth= (integer, in px)
- CommentPrefix= (will precede back-reference to prior source)
- LabelPrefix= (precedes current gene label)
- X-Offset= (integer, in px)
- Y-Offset= (integer, in px)
-
+ BoxWidth= (integer, in px, default: 60)
+ X-Offset= (integer, in px, default: 4)
+ Y-Offset= (integer, in px, default: 3)
+
=head1 AUTHORS
Justin Preece and Mamatha Hanumappa
my $input_gene_file;
my $input_config_file;
my $output_file;
+my $remove_lit = 0; # flag to remove literature and lit references
my $verbose = 0; # flag for verbose output
+my $doc_mode = 0; # flag for extra GPML doc output
my $debug = 0; # debugging switch
# global data containers
my $data_field = $line_ary[0];
my $data_val = $line_ary[1];
- $local_config_hash{$data_field} = $data_val;
+ if ($data_val)
+ {
+ $local_config_hash{$data_field} = $data_val;
+ }
}
close(CONFIG_FILE);
+
+ # check for required settings
+ if (!$local_config_hash{"Source"}) {
+ die("You are required to provide a Source for your new data.\n"
+ . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+ if (!$local_config_hash{"Database"}) {
+ die("You are required to identify a Database for your new data.\n"
+ . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+ if (!$local_config_hash{"Version"}) {
+ die("You are required to provide a Version idenifier for your new data.\n"
+ . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); }
+
+ # set defaults, if none declared
+ if (!$local_config_hash{"CommentPrefix"}) {
+ $local_config_hash{"CommentPrefix"} = "Previously labeled as: ";
+ }
+ if (!$local_config_hash{"BoxWidth"}) {
+ $local_config_hash{"BoxWidth"} = 60;
+ }
+ if (!$local_config_hash{"X-Offset"}) {
+ $local_config_hash{"X-Offset"} = 4;
+ }
+ if (!$local_config_hash{"Y-Offset"}) {
+ $local_config_hash{"Y-Offset"} = 3;
+ }
return %local_config_hash;
}
sub init
{
# read and set options
- getopts('i:g:c:o:vd', \%opts);
+ getopts('i:g:c:o:LvGd', \%opts);
foreach my $key (keys %opts) {
my $value = $opts{$key};
$output_file = getcwd() . "\/$value";
}
}
+ case "L" { $remove_lit = 1; }
case "v" { $verbose = 1; }
-
+ case "G" { $doc_mode = 1; }
case "d" { $debug = 1; }
}
}
. "Output File: $output_file\n"
. "\n"
. "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n"
+ . "Running in document mode? " . ($doc_mode ? "Yes" : "No") . "\n"
. "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n"
. "\n"
. "------------------------------------------------------------\n"
# ---------------------------------------------------------------------------
sub show_input
{
- print "[Configuration Settings]\n";
- print Dumper(\%configs) . "\n\n";
- print "\n";
-
- print "[Gene Mappings]\n";
- print Dumper(\%swap_genes) . "\n\n";
- print "\n";
-
- # print "[Source GPML]\n";
- # print $gpml_doc->toString;
- # print "\n";
+ if ($verbose)
+ {
+ print "[Configuration Settings]\n";
+ print Dumper(\%configs) . "\n\n";
+ print "\n";
+
+ print "[Gene Mappings]\n";
+ print Dumper(\%swap_genes) . "\n\n";
+ print "\n";
+ }
+ if ($doc_mode)
+ {
+ print "[Source GPML]\n";
+ print $gpml_doc->toString;
+ print "\n";
+ }
}
# ---------------------------------------------------------------------------
if (exists ${$_[0]}{$candidate_id})
{
# print "not unique...\n"; # TEST
- $candidate_id = create_unique_hex_id($_[0],$_[1]);
+ $candidate_id = &create_unique_hex_id($_[0],$_[1]);
}
else
{
my $pathway_node = ($gpml_doc->getElementsByTagName("Pathway"))[0];
# change Pathway header info to config settings
- $pathway_node->setAttribute("Name",$configs{"Title"});
- $pathway_node->setAttribute("Author",$configs{"Author"});
- $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"});
- $pathway_node->setAttribute("Version",$configs{"Date"});
- $pathway_node->setAttribute("Organism",$configs{"Organism"});
+ if ($configs{"Title"}) {
+ $pathway_node->setAttribute("Name",$configs{"Title"}); }
+ if ($configs{"Author"}) {
+ $pathway_node->setAttribute("Author",$configs{"Author"}); }
+ if ($configs{"Maintainer"}) {
+ $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"}); }
+ if ($configs{"Version"}) {
+ $pathway_node->setAttribute("Version",$configs{"Version"}); }
+ if ($configs{"Organism"}) {
+ $pathway_node->setAttribute("Organism",$configs{"Organism"}); }
- # remove all <BiopaxRef> and <bp:PublicationXref> elements and children
- my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0];
- $pathway_node->removeChild($biopax_node);
-
+ # get all "gene box" data nodes
my $data_nodes = $pathway_node->getElementsByTagName("DataNode");
-
# print $data_nodes->getLength . "\n"; # TEST
- for (@$data_nodes)
+ # remove all <BiopaxRef> and <bp:PublicationXref> elements and children
+ if ($remove_lit)
{
- my $curr_datanode = $_;
- my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef");
- for (@$biopaxref_nodes)
+ print "Removing literature references...\n\n";
+ my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0];
+ $pathway_node->removeChild($biopax_node);
+
+ for (@$data_nodes)
{
- # print $_->getTagName . "\n"; # TEST
- $curr_datanode->removeChild($_);
+ my $curr_datanode = $_;
+ my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef");
+ for (@$biopaxref_nodes)
+ {
+ # print $_->getTagName . "\n"; # TEST
+ $curr_datanode->removeChild($_);
+ }
}
}
-
+
# will hold a convenient list of data node references in the gpml doc,
# indexed by the id of the gene located in the <Xref> element for each
# node
}
}
+ print "\n";
if ($debug) {
print "...<DEBUG: \%data_nodes_by_gene_id>\n";
# rename TextLabel...
# prefix (from config), suffix: new '-#' for multiple paralogs
$curr_new_node->setAttribute("TextLabel",
- $configs{"LabelPrefix"}
+ (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
. $curr_new_node->getAttributeNode("TextLabel")->getValue
. (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
# change box width and colors (<Graphics...Color="4488ff" ... />)
my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
$curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
- $curr_graphics->setAttribute("Color",$configs{"BoxBorder"});
- $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"});
+ if ($configs{"BoxBorder"}) {
+ $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
+ if ($configs{"BoxColor"}) {
+ $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
if ($is_first_paralog)
{
# [later]
# .After PathVisio sanity check, before communication:
# .remove added back-ref TAIR comments (just rerun w/o added <Comment>?)
-
- # [usage example]
- # my $nodes = $gpml_doc->getElementsByTagName("DataNode");
- # my $n = $nodes->getLength;
-
- # for (my $i = 0; $i < $n; $i++)
- # {
- # my $node = $nodes->item ($i);
- # my $gene_label = $node->getAttributeNode("TextLabel");
- # print $gene_label->getValue . "\n";
- # }
}
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
sub show_output
{
- # print "[Modified GPML Output]\n";
- # print $gpml_doc->toString;
- # print "\n";
+ if ($doc_mode)
+ {
+ print "\n[Modified GPML Output]\n";
+ print $gpml_doc->toString;
+ print "\n";
+ }
}
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
sub export_data
{
- print "Writing GPML to output file...\n\n";
+ print "\nWriting GPML to output file...\n\n";
# ensures utf-8 encoding (for accent marks, etc.)
open my $out_file_handle, ">:utf8", "$output_file" or die $!;
# ---------------------------------------------------------------------------
init;
+
import_data;
-if ($verbose) { show_input; }
+show_input;
swap_genes();
-if ($verbose) { show_output; }
+show_output;
export_data;
+
$gpml_doc->dispose; # cleanup
exit;
+# ---------------------------------------------------------------------------
+# end
+# ---------------------------------------------------------------------------
+