From: preecej Date: Fri, 12 Aug 2011 23:23:28 +0000 (+0000) Subject: Added ability to map to multiple original gene boxes; added verbose X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=7857afa89841ca03cc117f7c0930a00aab86bf9d;p=old-jaiswallab-svn%2F.git Added ability to map to multiple original gene boxes; added verbose display and count of unmapped original genes svn path=/; revision=154 --- diff --git a/preecej/perl_singletons/pathway_gene_swapper.pl b/preecej/perl_singletons/pathway_gene_swapper.pl index 7e3459f..cc9c4d7 100644 --- a/preecej/perl_singletons/pathway_gene_swapper.pl +++ b/preecej/perl_singletons/pathway_gene_swapper.pl @@ -6,7 +6,7 @@ Pathway Gene Swapper =head1 VERSION -0.1 +0.2 =head1 DESCRIPTION @@ -22,6 +22,9 @@ that group will extend to any replacement genes. If no group existed previously, and multiple replacement gene boxes are required, a new group will be created. +If an original gene had multiple instances displayed on the pathway +diagram, each instance will be subjected to the replacement process. + The replacement gene symbols can be prefixed to separate them from the original, and an ordinal suffix ('-#') will be added to a group of replacement genes. @@ -119,9 +122,10 @@ my $doc_mode = 0; # flag for extra GPML doc output my $debug = 0; # debugging switch # global data containers -my %configs; # holds global configuration settings -my %swap_genes; # holds original and swapped genes -my $gpml_doc; #holds imported GPML data for manipulation and output +my %configs; # configuration settings +my %swap_genes; # original and swapped genes +my $gpml_doc; # imported GPML data for manipulation and output +my %unmapped_genes; # original genes not mapped to homologs $Data::Dumper::Pad = "... "; @@ -447,9 +451,9 @@ sub swap_genes } } - # will hold a convenient list of data node references in the gpml doc, + # will hold a convenient nested hash of data node references in the gpml doc, # indexed by the id of the gene located in the element for each - # node + # node, and sub-indexed by the GraphId of each corresponding node my %data_nodes_by_gene_id; # create a hash of all 5-digit hex ids in the gpml doc (this is the black list) @@ -467,14 +471,16 @@ sub swap_genes # also build a data node hash to make lookup easier in the next section my $curr_xref_id = ($_->getElementsByTagName("Xref"))[0] ->getAttributeNode("ID")->getValue; + my $curr_graph_id = $_->getAttributeNode("GraphId")->getValue; $curr_xref_id =~ s/\s+$//; # rtrim whitespace + $curr_xref_id =~ s/^\s+//; # ltrim whitespace #$curr_xref_id =~ s/^("|')("|')$//; # strip quotes - may not be necessary if (length($curr_xref_id) > 0) { - if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST - $data_nodes_by_gene_id{$curr_xref_id} = $_; + #if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST + $data_nodes_by_gene_id{$curr_xref_id}{$curr_graph_id} = $_; } else { @@ -485,14 +491,15 @@ sub swap_genes } } } - print "\n"; if ($debug) { print "...\n"; - foreach my $tmp_node (keys %data_nodes_by_gene_id) { - print "... $tmp_node => $data_nodes_by_gene_id{$tmp_node}\n"; - if ($tmp_node eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST + foreach my $tmp_gene (keys %data_nodes_by_gene_id) { + foreach my $tmp_node (keys %{$data_nodes_by_gene_id{$tmp_gene}}) { + print "... $tmp_gene => $tmp_node => $data_nodes_by_gene_id{$tmp_gene}{$tmp_node}\n"; + #if ($tmp_gene eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST + } } print "\n"; } @@ -520,158 +527,165 @@ sub swap_genes { # print $old_gene . "\n"; # TEST - # find curr old gene node in doc + # find curr old gene nodes in doc if (exists $data_nodes_by_gene_id{$old_gene}) { - my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}; - # print $curr_old_gene_node . "\n"; # TEST - - # holds list of newly-created nodes, used to replace old node - my @new_nodes_map; - - # iterate through new gene replacements - for (@{$swap_genes{$old_gene}}) - { - # copy the curr old gene node - my $new_node = $curr_old_gene_node->cloneNode("deep"); - - # print "[$_]\n$new_node->toString\n\n"; # TEST - - # add to new nodes ary - push @new_nodes_map, [$new_node, $_]; - } - # print "@new_nodes_map\n"; # TEST - - # if more than one new homolog exists, and the old gene doesn't - # already belong to a group, you'll need a new Group for multiple - # gene boxes - my $new_GroupId; - - if (scalar(@new_nodes_map) > 1) + # iterate through each node by its GraphId + foreach my $curr_old_genes_by_hex_id (keys %{$data_nodes_by_gene_id{$old_gene}}) { - # if curr old gene does not belong to a group - - # print $curr_old_gene_node->toString . "\n"; # TEST - # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST - - if (!$curr_old_gene_node->getAttribute("GroupRef")) + my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}{$curr_old_genes_by_hex_id}; + # print $curr_old_gene_node . "\n"; # TEST + + # holds list of newly-created nodes, used to replace old node + my @new_nodes_map; + + # iterate through new gene replacements + for (@{$swap_genes{$old_gene}}) { - #print "no existing group ref\n"; # TEST - - # generate a new GroupId and Group.GraphId hex ids not - # already in use - $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId"); - # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId"); - #print "new group id: $new_GroupId\n"; # TEST - # print "$new_GroupId, $new_Group_GraphId\n"; # TEST - - # create a new group node - my $new_group = $gpml_doc->createElement("Group"); - $new_group->setAttribute("GroupId",$new_GroupId); - #$new_group->setAttribute("GraphId",$new_Group_GraphId); - $new_group->setAttribute("Style","Group"); - - # add to beginning of group nodes - $pathway_node->insertBefore($new_group,${$group_nodes}[0]); + # copy the curr old gene node + my $new_node = $curr_old_gene_node->cloneNode("deep"); + + # print "[$_]\n$new_node->toString\n\n"; # TEST + + # add to new nodes ary + push @new_nodes_map, [$new_node, $_]; } - } - - # flag for determining if there are one or many replacement homologs - my $is_first_homolog = 1; - - # makes sure each box is increasingly offset from the original - # (in all three dimensions) - my $offset_multiplier = 0; - my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols - - # for new nodes ary - for (@new_nodes_map) - { + # print "@new_nodes_map\n"; # TEST + + # if more than one new homolog exists, and the old gene doesn't + # already belong to a group, you'll need a new Group for multiple + # gene boxes + my $new_GroupId; + if (scalar(@new_nodes_map) > 1) { - $gene_suffix_counter++; + # if curr old gene does not belong to a group + + # print $curr_old_gene_node->toString . "\n"; # TEST + # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST + + if (!$curr_old_gene_node->getAttribute("GroupRef")) + { + #print "no existing group ref\n"; # TEST + + # generate a new GroupId and Group.GraphId hex ids not + # already in use + $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId"); + # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId"); + #print "new group id: $new_GroupId\n"; # TEST + # print "$new_GroupId, $new_Group_GraphId\n"; # TEST + + # create a new group node + my $new_group = $gpml_doc->createElement("Group"); + $new_group->setAttribute("GroupId",$new_GroupId); + #$new_group->setAttribute("GraphId",$new_Group_GraphId); + $new_group->setAttribute("Style","Group"); + + # add to beginning of group nodes + $pathway_node->insertBefore($new_group,${$group_nodes}[0]); + } } - my $curr_new_node = $$_[0]; - my $curr_homolog = $$_[1]; - # print "$_: $curr_new_node, $curr_homolog\n"; # TEST - # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST - - # update all new nodes w/ attributes... - - # rename TextLabel... - # prefix (from config), suffix: new '-#' for multiple homologs - $curr_new_node->setAttribute("TextLabel", - (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "") - . $curr_new_node->getAttributeNode("TextLabel")->getValue - . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : "")); + # flag for determining if there are one or many replacement homologs + my $is_first_homolog = 1; + + # makes sure each box is increasingly offset from the original + # (in all three dimensions) + my $offset_multiplier = 0; + my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols - # add new GroupRef if necessary - if ($new_GroupId) + # for new nodes ary + for (@new_nodes_map) { - $curr_new_node->setAttribute("GroupRef",$new_GroupId); + if (scalar(@new_nodes_map) > 1) + { + $gene_suffix_counter++; + } + + my $curr_new_node = $$_[0]; + my $curr_homolog = $$_[1]; + # print "$_: $curr_new_node, $curr_homolog\n"; # TEST + # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST + + # update all new nodes w/ attributes... + + # rename TextLabel... + # prefix (from config), suffix: new '-#' for multiple homologs + $curr_new_node->setAttribute("TextLabel", + (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "") + . $curr_new_node->getAttributeNode("TextLabel")->getValue + . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : "")); + + # add new GroupRef if necessary + if ($new_GroupId) + { + $curr_new_node->setAttribute("GroupRef",$new_GroupId); + } + + # add Comment back-referencing TAIR locus id (use "source" attribute) + # NOTE: order is important in GPML; the tags are first + my $new_comment = $gpml_doc->createElement("Comment"); + $new_comment->setAttribute("Source",$configs{"Source"}); + $new_comment->addText($configs{"CommentPrefix"} . " $old_gene."); + $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes + + # edit + my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0]; + $curr_xref->setAttribute("Database",$configs{"Database"}); + $curr_xref->setAttribute("ID",$curr_homolog); + + # change box width and colors () + my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0]; + $curr_graphics->setAttribute("Width",$configs{"BoxWidth"}); + if ($configs{"BoxBorder"}) { + $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); } + if ($configs{"BoxColor"}) { + $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); } + + if ($is_first_homolog) + { + # print "that was the first homolog...\n"; # TEST + $is_first_homolog = 0; # first homolog complete + } + else # add'l homologs required + { + $offset_multiplier++; + + # print "that was an add'l homolog, change more attrs...\n"; # TEST + # update add'l nodes w/ special attributes... + + # generate a new DataNode GraphId not already in use + my $new_GraphId = + create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId"); + # print $new_GraphId . "\n"; # TEST + $curr_new_node->setAttribute("GraphId",$new_GraphId); + + # decrement the Z-order + $curr_graphics->setAttribute("ZOrder", + $curr_graphics->getAttributeNode("ZOrder")->getValue + - $offset_multiplier); + # stagger the extra boxes by decrementing the coords + $curr_graphics->setAttribute("CenterX", + $curr_graphics->getAttributeNode("CenterX")->getValue + - ($configs{"X-Offset"} * $offset_multiplier)); + $curr_graphics->setAttribute("CenterY", + $curr_graphics->getAttributeNode("CenterY")->getValue + - ($configs{"Y-Offset"} * $offset_multiplier)); + } } + undef $new_GroupId; # clear this out so we can test against its existence next time - # add Comment back-referencing TAIR locus id (use "source" attribute) - # NOTE: order is important in GPML; the tags are first - my $new_comment = $gpml_doc->createElement("Comment"); - $new_comment->setAttribute("Source",$configs{"Source"}); - $new_comment->addText($configs{"CommentPrefix"} . " $old_gene."); - $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes - - # edit - my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0]; - $curr_xref->setAttribute("Database",$configs{"Database"}); - $curr_xref->setAttribute("ID",$curr_homolog); - - # change box width and colors () - my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0]; - $curr_graphics->setAttribute("Width",$configs{"BoxWidth"}); - if ($configs{"BoxBorder"}) { - $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); } - if ($configs{"BoxColor"}) { - $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); } - - if ($is_first_homolog) - { - # print "that was the first homolog...\n"; # TEST - $is_first_homolog = 0; # first homolog complete + # replace old node w/ new node(s) + for (@new_nodes_map) { + # add all the new nodes... + $pathway_node->insertBefore($$_[0],$curr_old_gene_node); } - else # add'l homologs required - { - $offset_multiplier++; - - # print "that was an add'l homolog, change more attrs...\n"; # TEST - # update add'l nodes w/ special attributes... - - # generate a new DataNode GraphId not already in use - my $new_GraphId = - create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId"); - # print $new_GraphId . "\n"; # TEST - $curr_new_node->setAttribute("GraphId",$new_GraphId); - - # decrement the Z-order - $curr_graphics->setAttribute("ZOrder", - $curr_graphics->getAttributeNode("ZOrder")->getValue - - $offset_multiplier); - # stagger the extra boxes by decrementing the coords - $curr_graphics->setAttribute("CenterX", - $curr_graphics->getAttributeNode("CenterX")->getValue - - ($configs{"X-Offset"} * $offset_multiplier)); - $curr_graphics->setAttribute("CenterY", - $curr_graphics->getAttributeNode("CenterY")->getValue - - ($configs{"Y-Offset"} * $offset_multiplier)); - } - } - undef $new_GroupId; # clear this out so we can test against its existence next time - - # replace old node w/ new node(s) - for (@new_nodes_map) { - # add all the new nodes... - $pathway_node->insertBefore($$_[0],$curr_old_gene_node); + # ...and remove the original node + $pathway_node->removeChild($curr_old_gene_node); } - # ...and remove the original node - $pathway_node->removeChild($curr_old_gene_node); + # once mapped, remove the old gene so we are left with a list of + # unmapped original genes (for show_ouput()) + delete($data_nodes_by_gene_id{$old_gene}); } else { @@ -679,10 +693,7 @@ sub swap_genes . "PathVisio GPML document.\n"; } } - - # [later] - # .After PathVisio sanity check, before communication: - # .remove added back-ref TAIR comments (just rerun w/o added ?) + %unmapped_genes = %data_nodes_by_gene_id; # whatever is left over } # --------------------------------------------------------------------------- @@ -692,6 +703,21 @@ Displays the transformed data. Verbose only. # --------------------------------------------------------------------------- sub show_output { + if ($verbose) { + print "[Unmapped original genes]\n"; + my $count = 0; + foreach my $tmp_gene (keys %unmapped_genes) + { + foreach my $tmp_node (keys %{$unmapped_genes{$tmp_gene}}) + { + $count++; + print $unmapped_genes{$tmp_gene}{$tmp_node} + ->getAttributeNode("TextLabel")->getValue + . "\t($tmp_gene)\n"; + } + } + print "\nTotal unmapped genes: $count\n"; + } if ($doc_mode) { print "\n[Modified GPML Output]\n";