=head1 VERSION
-0.1
+0.2
=head1 DESCRIPTION
existed previously, and multiple replacement gene boxes are required,
a new group will be created.
+If an original gene had multiple instances displayed on the pathway
+diagram, each instance will be subjected to the replacement process.
+
The replacement gene symbols can be prefixed to separate them from the
original, and an ordinal suffix ('-#') will be added to a group of
replacement genes.
my $debug = 0; # debugging switch
# global data containers
-my %configs; # holds global configuration settings
-my %swap_genes; # holds original and swapped genes
-my $gpml_doc; #holds imported GPML data for manipulation and output
+my %configs; # configuration settings
+my %swap_genes; # original and swapped genes
+my $gpml_doc; # imported GPML data for manipulation and output
+my %unmapped_genes; # original genes not mapped to homologs
$Data::Dumper::Pad = "... ";
}
}
- # will hold a convenient list of data node references in the gpml doc,
+ # will hold a convenient nested hash of data node references in the gpml doc,
# indexed by the id of the gene located in the <Xref> element for each
- # node
+ # node, and sub-indexed by the GraphId of each corresponding node
my %data_nodes_by_gene_id;
# create a hash of all 5-digit hex ids in the gpml doc (this is the black list)
# also build a data node hash to make lookup easier in the next section
my $curr_xref_id = ($_->getElementsByTagName("Xref"))[0]
->getAttributeNode("ID")->getValue;
+ my $curr_graph_id = $_->getAttributeNode("GraphId")->getValue;
$curr_xref_id =~ s/\s+$//; # rtrim whitespace
+ $curr_xref_id =~ s/^\s+//; # ltrim whitespace
#$curr_xref_id =~ s/^("|')("|')$//; # strip quotes - may not be necessary
if (length($curr_xref_id) > 0)
{
- if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST
- $data_nodes_by_gene_id{$curr_xref_id} = $_;
+ #if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST
+ $data_nodes_by_gene_id{$curr_xref_id}{$curr_graph_id} = $_;
}
else
{
}
}
}
-
print "\n";
if ($debug) {
print "...<DEBUG: \%data_nodes_by_gene_id>\n";
- foreach my $tmp_node (keys %data_nodes_by_gene_id) {
- print "... $tmp_node => $data_nodes_by_gene_id{$tmp_node}\n";
- if ($tmp_node eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST
+ foreach my $tmp_gene (keys %data_nodes_by_gene_id) {
+ foreach my $tmp_node (keys %{$data_nodes_by_gene_id{$tmp_gene}}) {
+ print "... $tmp_gene => $tmp_node => $data_nodes_by_gene_id{$tmp_gene}{$tmp_node}\n";
+ #if ($tmp_gene eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST
+ }
}
print "\n";
}
{
# print $old_gene . "\n"; # TEST
- # find curr old gene node in doc
+ # find curr old gene nodes in doc
if (exists $data_nodes_by_gene_id{$old_gene})
{
- my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene};
- # print $curr_old_gene_node . "\n"; # TEST
-
- # holds list of newly-created nodes, used to replace old node
- my @new_nodes_map;
-
- # iterate through new gene replacements
- for (@{$swap_genes{$old_gene}})
- {
- # copy the curr old gene node
- my $new_node = $curr_old_gene_node->cloneNode("deep");
-
- # print "[$_]\n$new_node->toString\n\n"; # TEST
-
- # add to new nodes ary
- push @new_nodes_map, [$new_node, $_];
- }
- # print "@new_nodes_map\n"; # TEST
-
- # if more than one new homolog exists, and the old gene doesn't
- # already belong to a group, you'll need a new Group for multiple
- # gene boxes
- my $new_GroupId;
-
- if (scalar(@new_nodes_map) > 1)
+ # iterate through each node by its GraphId
+ foreach my $curr_old_genes_by_hex_id (keys %{$data_nodes_by_gene_id{$old_gene}})
{
- # if curr old gene does not belong to a group
-
- # print $curr_old_gene_node->toString . "\n"; # TEST
- # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST
-
- if (!$curr_old_gene_node->getAttribute("GroupRef"))
+ my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}{$curr_old_genes_by_hex_id};
+ # print $curr_old_gene_node . "\n"; # TEST
+
+ # holds list of newly-created nodes, used to replace old node
+ my @new_nodes_map;
+
+ # iterate through new gene replacements
+ for (@{$swap_genes{$old_gene}})
{
- #print "no existing group ref\n"; # TEST
-
- # generate a new GroupId and Group.GraphId hex ids not
- # already in use
- $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId");
- # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId");
- #print "new group id: $new_GroupId\n"; # TEST
- # print "$new_GroupId, $new_Group_GraphId\n"; # TEST
-
- # create a new group node
- my $new_group = $gpml_doc->createElement("Group");
- $new_group->setAttribute("GroupId",$new_GroupId);
- #$new_group->setAttribute("GraphId",$new_Group_GraphId);
- $new_group->setAttribute("Style","Group");
-
- # add to beginning of group nodes
- $pathway_node->insertBefore($new_group,${$group_nodes}[0]);
+ # copy the curr old gene node
+ my $new_node = $curr_old_gene_node->cloneNode("deep");
+
+ # print "[$_]\n$new_node->toString\n\n"; # TEST
+
+ # add to new nodes ary
+ push @new_nodes_map, [$new_node, $_];
}
- }
-
- # flag for determining if there are one or many replacement homologs
- my $is_first_homolog = 1;
-
- # makes sure each box is increasingly offset from the original
- # (in all three dimensions)
- my $offset_multiplier = 0;
- my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols
-
- # for new nodes ary
- for (@new_nodes_map)
- {
+ # print "@new_nodes_map\n"; # TEST
+
+ # if more than one new homolog exists, and the old gene doesn't
+ # already belong to a group, you'll need a new Group for multiple
+ # gene boxes
+ my $new_GroupId;
+
if (scalar(@new_nodes_map) > 1)
{
- $gene_suffix_counter++;
+ # if curr old gene does not belong to a group
+
+ # print $curr_old_gene_node->toString . "\n"; # TEST
+ # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST
+
+ if (!$curr_old_gene_node->getAttribute("GroupRef"))
+ {
+ #print "no existing group ref\n"; # TEST
+
+ # generate a new GroupId and Group.GraphId hex ids not
+ # already in use
+ $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId");
+ # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId");
+ #print "new group id: $new_GroupId\n"; # TEST
+ # print "$new_GroupId, $new_Group_GraphId\n"; # TEST
+
+ # create a new group node
+ my $new_group = $gpml_doc->createElement("Group");
+ $new_group->setAttribute("GroupId",$new_GroupId);
+ #$new_group->setAttribute("GraphId",$new_Group_GraphId);
+ $new_group->setAttribute("Style","Group");
+
+ # add to beginning of group nodes
+ $pathway_node->insertBefore($new_group,${$group_nodes}[0]);
+ }
}
- my $curr_new_node = $$_[0];
- my $curr_homolog = $$_[1];
- # print "$_: $curr_new_node, $curr_homolog\n"; # TEST
- # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
-
- # update all new nodes w/ attributes...
-
- # rename TextLabel...
- # prefix (from config), suffix: new '-#' for multiple homologs
- $curr_new_node->setAttribute("TextLabel",
- (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
- . $curr_new_node->getAttributeNode("TextLabel")->getValue
- . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
+ # flag for determining if there are one or many replacement homologs
+ my $is_first_homolog = 1;
+
+ # makes sure each box is increasingly offset from the original
+ # (in all three dimensions)
+ my $offset_multiplier = 0;
+ my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols
- # add new GroupRef if necessary
- if ($new_GroupId)
+ # for new nodes ary
+ for (@new_nodes_map)
{
- $curr_new_node->setAttribute("GroupRef",$new_GroupId);
+ if (scalar(@new_nodes_map) > 1)
+ {
+ $gene_suffix_counter++;
+ }
+
+ my $curr_new_node = $$_[0];
+ my $curr_homolog = $$_[1];
+ # print "$_: $curr_new_node, $curr_homolog\n"; # TEST
+ # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
+
+ # update all new nodes w/ attributes...
+
+ # rename TextLabel...
+ # prefix (from config), suffix: new '-#' for multiple homologs
+ $curr_new_node->setAttribute("TextLabel",
+ (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
+ . $curr_new_node->getAttributeNode("TextLabel")->getValue
+ . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
+
+ # add new GroupRef if necessary
+ if ($new_GroupId)
+ {
+ $curr_new_node->setAttribute("GroupRef",$new_GroupId);
+ }
+
+ # add Comment back-referencing TAIR locus id (use "source" attribute)
+ # NOTE: order is important in GPML; the <Comment> tags are first
+ my $new_comment = $gpml_doc->createElement("Comment");
+ $new_comment->setAttribute("Source",$configs{"Source"});
+ $new_comment->addText($configs{"CommentPrefix"} . " $old_gene.");
+ $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes
+
+ # edit <Xref Database="JGI" ID="Egrandis..." />
+ my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0];
+ $curr_xref->setAttribute("Database",$configs{"Database"});
+ $curr_xref->setAttribute("ID",$curr_homolog);
+
+ # change box width and colors (<Graphics...Color="4488ff" ... />)
+ my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
+ $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
+ if ($configs{"BoxBorder"}) {
+ $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
+ if ($configs{"BoxColor"}) {
+ $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
+
+ if ($is_first_homolog)
+ {
+ # print "that was the first homolog...\n"; # TEST
+ $is_first_homolog = 0; # first homolog complete
+ }
+ else # add'l homologs required
+ {
+ $offset_multiplier++;
+
+ # print "that was an add'l homolog, change more attrs...\n"; # TEST
+ # update add'l nodes w/ special attributes...
+
+ # generate a new DataNode GraphId not already in use
+ my $new_GraphId =
+ create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
+ # print $new_GraphId . "\n"; # TEST
+ $curr_new_node->setAttribute("GraphId",$new_GraphId);
+
+ # decrement the Z-order
+ $curr_graphics->setAttribute("ZOrder",
+ $curr_graphics->getAttributeNode("ZOrder")->getValue
+ - $offset_multiplier);
+ # stagger the extra boxes by decrementing the coords
+ $curr_graphics->setAttribute("CenterX",
+ $curr_graphics->getAttributeNode("CenterX")->getValue
+ - ($configs{"X-Offset"} * $offset_multiplier));
+ $curr_graphics->setAttribute("CenterY",
+ $curr_graphics->getAttributeNode("CenterY")->getValue
+ - ($configs{"Y-Offset"} * $offset_multiplier));
+ }
}
+ undef $new_GroupId; # clear this out so we can test against its existence next time
- # add Comment back-referencing TAIR locus id (use "source" attribute)
- # NOTE: order is important in GPML; the <Comment> tags are first
- my $new_comment = $gpml_doc->createElement("Comment");
- $new_comment->setAttribute("Source",$configs{"Source"});
- $new_comment->addText($configs{"CommentPrefix"} . " $old_gene.");
- $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes
-
- # edit <Xref Database="JGI" ID="Egrandis..." />
- my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0];
- $curr_xref->setAttribute("Database",$configs{"Database"});
- $curr_xref->setAttribute("ID",$curr_homolog);
-
- # change box width and colors (<Graphics...Color="4488ff" ... />)
- my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
- $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
- if ($configs{"BoxBorder"}) {
- $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
- if ($configs{"BoxColor"}) {
- $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
-
- if ($is_first_homolog)
- {
- # print "that was the first homolog...\n"; # TEST
- $is_first_homolog = 0; # first homolog complete
+ # replace old node w/ new node(s)
+ for (@new_nodes_map) {
+ # add all the new nodes...
+ $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
}
- else # add'l homologs required
- {
- $offset_multiplier++;
-
- # print "that was an add'l homolog, change more attrs...\n"; # TEST
- # update add'l nodes w/ special attributes...
-
- # generate a new DataNode GraphId not already in use
- my $new_GraphId =
- create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
- # print $new_GraphId . "\n"; # TEST
- $curr_new_node->setAttribute("GraphId",$new_GraphId);
-
- # decrement the Z-order
- $curr_graphics->setAttribute("ZOrder",
- $curr_graphics->getAttributeNode("ZOrder")->getValue
- - $offset_multiplier);
- # stagger the extra boxes by decrementing the coords
- $curr_graphics->setAttribute("CenterX",
- $curr_graphics->getAttributeNode("CenterX")->getValue
- - ($configs{"X-Offset"} * $offset_multiplier));
- $curr_graphics->setAttribute("CenterY",
- $curr_graphics->getAttributeNode("CenterY")->getValue
- - ($configs{"Y-Offset"} * $offset_multiplier));
- }
- }
- undef $new_GroupId; # clear this out so we can test against its existence next time
-
- # replace old node w/ new node(s)
- for (@new_nodes_map) {
- # add all the new nodes...
- $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
+ # ...and remove the original node
+ $pathway_node->removeChild($curr_old_gene_node);
}
- # ...and remove the original node
- $pathway_node->removeChild($curr_old_gene_node);
+ # once mapped, remove the old gene so we are left with a list of
+ # unmapped original genes (for show_ouput())
+ delete($data_nodes_by_gene_id{$old_gene});
}
else
{
. "PathVisio GPML document.\n";
}
}
-
- # [later]
- # .After PathVisio sanity check, before communication:
- # .remove added back-ref TAIR comments (just rerun w/o added <Comment>?)
+ %unmapped_genes = %data_nodes_by_gene_id; # whatever is left over
}
# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
sub show_output
{
+ if ($verbose) {
+ print "[Unmapped original genes]\n";
+ my $count = 0;
+ foreach my $tmp_gene (keys %unmapped_genes)
+ {
+ foreach my $tmp_node (keys %{$unmapped_genes{$tmp_gene}})
+ {
+ $count++;
+ print $unmapped_genes{$tmp_gene}{$tmp_node}
+ ->getAttributeNode("TextLabel")->getValue
+ . "\t($tmp_gene)\n";
+ }
+ }
+ print "\nTotal unmapped genes: $count\n";
+ }
if ($doc_mode)
{
print "\n[Modified GPML Output]\n";