From 399cd3c1dc09889a568330f08604cf367a1bc090 Mon Sep 17 00:00:00 2001 From: preecej Date: Wed, 10 Aug 2011 22:26:37 +0000 Subject: [PATCH] Added multiple paralog gene box illustration, coloring, and unique hexid generation svn path=/; revision=148 --- .../perl_singletons/pathway_gene_swapper.pl | 133 +++++++++++++----- 1 file changed, 100 insertions(+), 33 deletions(-) diff --git a/preecej/perl_singletons/pathway_gene_swapper.pl b/preecej/perl_singletons/pathway_gene_swapper.pl index f8cbeaa..41b0162 100644 --- a/preecej/perl_singletons/pathway_gene_swapper.pl +++ b/preecej/perl_singletons/pathway_gene_swapper.pl @@ -296,6 +296,46 @@ sub show_input # print "\n"; } +# --------------------------------------------------------------------------- +=item B +Generates a "random" 5-digit hexadecimal id, checks to see if it +already exists in the hex list, and if not, adds it to the list +of hex ids already present in the GPML doc. Otherwise, generates +another "random" id and repeats the process until a new unique id +is identified. +Returns a string containing the new hex id. +=cut +# --------------------------------------------------------------------------- +sub create_unique_hex_id($$) +{ + # NOTE: This algorithm breaks down at VERY large scale (100K genes+). The + # larger the number of original genes, groups, and new paralogs you need to + # create, the more inefficient it becomes to make sure your "random" 5-digit + # hex number is not already present in your "existing ids" list via + # recursion. However, for a few hundred or thousand genes, it should be ok. + + my $first_digit; # limited to a..f + my $last_four_digits; # 0..f + + $first_digit = (('a'..'f')[rand(6)]); + $last_four_digits .= ((0..9,'a'..'f')[rand(16)]) for 1..4; + + my $candidate_id = $first_digit . $last_four_digits; + + if (exists ${$_[0]}{$candidate_id}) + { + # print "not unique...\n"; # TEST + $candidate_id = create_unique_hex_id($_[0]); + } + else + { + # print "unique!\n"; # TEST + ${$_[0]}{$candidate_id} = $_[1]; + } + + return $candidate_id; +} + # --------------------------------------------------------------------------- =item B Substitutes gene data. @@ -320,7 +360,7 @@ sub swap_genes my $data_nodes = $pathway_node->getElementsByTagName("DataNode"); - # TEST: print $data_nodes->getLength . "\n"; + # print $data_nodes->getLength . "\n"; # TEST for (@$data_nodes) { @@ -328,7 +368,7 @@ sub swap_genes my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef"); for (@$biopaxref_nodes) { - # TEST: print $_->getTagName . "\n"; + # print $_->getTagName . "\n"; # TEST $curr_datanode->removeChild($_); } } @@ -343,7 +383,7 @@ sub swap_genes my %existing_hex_ids; for (@$data_nodes) { - # TEST: print $_ . "\n"; + # print $_ . "\n"; # TEST if ($_->getAttributeNode("GraphId")) { $existing_hex_ids{$_->getAttributeNode("GraphId")->getValue} @@ -400,13 +440,13 @@ sub swap_genes # iterate through gene mappings from csv file foreach my $old_gene (keys %swap_genes) { - # TEST: print $old_gene . "\n"; + # print $old_gene . "\n"; # TEST # find curr old gene node in doc if (exists $data_nodes_by_gene_id{$old_gene}) { my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}; - # TEST: print $curr_old_gene_node . "\n"; + # print $curr_old_gene_node . "\n"; # TEST # holds list of newly-created nodes, used to replace old node my @new_nodes_map; @@ -417,13 +457,12 @@ sub swap_genes # copy the curr old gene node my $new_node = $curr_old_gene_node->cloneNode("deep"); - # TEST: print "[$_]\n$new_node->toString\n\n"; + # print "[$_]\n$new_node->toString\n\n"; # TEST # add to new nodes ary - # TODO: if this errs, perhaps catch and skip? push @new_nodes_map, [$new_node, $_]; } - # TEST: print "@new_nodes_map\n"; + # print "@new_nodes_map\n"; # TEST # if more than one new gene maps, you'll need a group for multiple gene boxes if (scalar(@new_nodes_map) > 1) @@ -440,35 +479,33 @@ sub swap_genes # flag for determining if there are one or many replacement paralogs my $is_first_paralog = 1; + # makes sure each box is increasingly offset from the original + # (in all three dimensions) + my $offset_multiplier = 0; + my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols + # for new nodes ary for (@new_nodes_map) { - # TODO: check for new group ref to use - # TODO: integrate attr settings into the if..else below - + if (scalar(@new_nodes_map) > 1) + { + $gene_suffix_counter++; + # TODO: check for new group ref to use + } + my $curr_new_node = $$_[0]; my $curr_paralog = $$_[1]; - # TEST: print "$_: $curr_new_node, $curr_paralog\n"; + # print "$_: $curr_new_node, $curr_paralog\n"; # TEST + # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST - if ($is_first_paralog) - { - # print "first paralog...\n"; - $is_first_paralog = 0; # first paralog complete - } - else # add'l paralogs required - { - # print "add'l paralog...\n"; - # SUB: generate a new DataNode hex id not already in use - # update add'l nodes w/ special attributes: - # decrement the Z-order - # decrement CenterX and CenterY by 10px each - } # update all new nodes w/ attributes... - # rename TextLabel (prefix (config), TODO: suffix: -alpha or numeric, based on regex) + # rename TextLabel... + # prefix (from config), suffix: new '-#' for multiple paralogs $curr_new_node->setAttribute("TextLabel", $configs{"LabelPrefix"} - . $curr_new_node->getAttributeNode("TextLabel")->getValue); + . $curr_new_node->getAttributeNode("TextLabel")->getValue + . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : "")); # add Comment back-referencing TAIR locus id (use "source" attribute) # NOTE: order is important in GPML; the tags are first @@ -487,16 +524,46 @@ sub swap_genes $curr_graphics->setAttribute("Width",$configs{"BoxWidth"}); $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); + + if ($is_first_paralog) + { + # print "that was the first paralog...\n"; # TEST + $is_first_paralog = 0; # first paralog complete + } + else # add'l paralogs required + { + $offset_multiplier++; + + # print "that was an add'l paralog, change more attrs...\n"; # TEST + # update add'l nodes w/ special attributes... + + # generate a new DataNode GraphId not already in use + my $new_GraphId = + create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId"); + # print $new_GraphId . "\n"; # TEST + $curr_new_node->setAttribute("GraphId",$new_GraphId); + + # decrement the Z-order + $curr_graphics->setAttribute("ZOrder", + $curr_graphics->getAttributeNode("ZOrder")->getValue + - $offset_multiplier); + # stagger the extra boxes by decrementing the coords + $curr_graphics->setAttribute("CenterX", + $curr_graphics->getAttributeNode("CenterX")->getValue + - ($configs{"X-Offset"} * $offset_multiplier)); + $curr_graphics->setAttribute("CenterY", + $curr_graphics->getAttributeNode("CenterY")->getValue + - ($configs{"Y-Offset"} * $offset_multiplier)); + } } - # TODO: roll these node statments into the loop above - # replace old node w/ new node(s); do this in place + # replace old node w/ new node(s) for (@new_nodes_map) { - # replace one... - $pathway_node->replaceChild($$_[0],$curr_old_gene_node); - # ...and add the others immediately preceding - #insertBefore(); + # add all the new nodes... + $pathway_node->insertBefore($$_[0],$curr_old_gene_node); } + # ...and remove the original node + $pathway_node->removeChild($curr_old_gene_node); } else { -- 2.34.1