Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
Added multiple paralog gene box illustration, coloring, and unique hexid
authorpreecej <preecej@localhost>
Wed, 10 Aug 2011 22:26:37 +0000 (22:26 +0000)
committerpreecej <preecej@localhost>
Wed, 10 Aug 2011 22:26:37 +0000 (22:26 +0000)
generation

svn path=/; revision=148

preecej/perl_singletons/pathway_gene_swapper.pl

index f8cbeaa8cb51716152e9c0ec8453f7de48da17fc..41b01625e18a36b556e7b5cda1d972a464f9795a 100644 (file)
@@ -296,6 +296,46 @@ sub show_input
     # print "\n";
 }
 
+# ---------------------------------------------------------------------------
+=item B<string create_unique_hex_id()>
+Generates a "random" 5-digit hexadecimal id, checks to see if it 
+already exists in the hex list, and if not, adds it to the list
+of hex ids already present in the GPML doc. Otherwise, generates
+another "random" id and repeats the process until a new unique id
+is identified.
+Returns a string containing the new hex id.
+=cut
+# ---------------------------------------------------------------------------
+sub create_unique_hex_id($$)
+{
+    # NOTE: This algorithm breaks down at VERY large scale (100K genes+). The
+    # larger the number of original genes, groups, and new paralogs you need to
+    # create, the more inefficient it becomes to make sure your "random" 5-digit
+    # hex number is not already present in your "existing ids" list via 
+    # recursion. However, for a few hundred or thousand genes, it should be ok.
+    
+    my $first_digit; # limited to a..f
+    my $last_four_digits; # 0..f 
+
+    $first_digit = (('a'..'f')[rand(6)]);
+    $last_four_digits .= ((0..9,'a'..'f')[rand(16)]) for 1..4;
+
+    my $candidate_id = $first_digit . $last_four_digits;  
+
+    if (exists ${$_[0]}{$candidate_id})
+    {
+        # print "not unique...\n"; # TEST
+        $candidate_id = create_unique_hex_id($_[0]);
+    }
+    else
+    {
+        # print "unique!\n"; # TEST
+        ${$_[0]}{$candidate_id} = $_[1];
+    }
+    
+    return $candidate_id;
+}
+
 # ---------------------------------------------------------------------------
 =item B<void swap_genes()>
 Substitutes gene data.
@@ -320,7 +360,7 @@ sub swap_genes
 
     my $data_nodes = $pathway_node->getElementsByTagName("DataNode");
 
-    # TEST: print $data_nodes->getLength . "\n";
+    # print $data_nodes->getLength . "\n"; # TEST 
 
     for (@$data_nodes)
     {
@@ -328,7 +368,7 @@ sub swap_genes
         my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef");
         for (@$biopaxref_nodes)
         {
-            # TEST: print $_->getTagName . "\n";
+            # print $_->getTagName . "\n"; # TEST
             $curr_datanode->removeChild($_);
         }
     }
@@ -343,7 +383,7 @@ sub swap_genes
     my %existing_hex_ids;
     for (@$data_nodes)
     {
-        # TEST: print $_ . "\n";
+        # print $_ . "\n"; # TEST
         if ($_->getAttributeNode("GraphId"))
         {
             $existing_hex_ids{$_->getAttributeNode("GraphId")->getValue}
@@ -400,13 +440,13 @@ sub swap_genes
     # iterate through gene mappings from csv file
     foreach my $old_gene (keys %swap_genes)
     {
-        # TEST: print $old_gene . "\n";
+        # print $old_gene . "\n"; # TEST
 
         # find curr old gene node in doc
         if (exists $data_nodes_by_gene_id{$old_gene})
         {
             my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene};
-            # TEST: print $curr_old_gene_node . "\n";
+            # print $curr_old_gene_node . "\n"; # TEST
 
             # holds list of newly-created nodes, used to replace old node
             my @new_nodes_map;
@@ -417,13 +457,12 @@ sub swap_genes
                 # copy the curr old gene node
                 my $new_node = $curr_old_gene_node->cloneNode("deep");
 
-                # TEST: print "[$_]\n$new_node->toString\n\n";
+                # print "[$_]\n$new_node->toString\n\n"; # TEST
 
                 # add to new nodes ary
-                # TODO: if this errs, perhaps catch and skip?
                 push @new_nodes_map, [$new_node, $_];
             }
-            # TEST: print "@new_nodes_map\n";
+            # print "@new_nodes_map\n"; # TEST
 
             # if more than one new gene maps, you'll need a group for multiple gene boxes
             if (scalar(@new_nodes_map) > 1)
@@ -440,35 +479,33 @@ sub swap_genes
             # flag for determining if there are one or many replacement paralogs
             my $is_first_paralog = 1; 
 
+            # makes sure each box is increasingly offset from the original
+            # (in all three dimensions)
+            my $offset_multiplier = 0;
+            my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols 
+            
             # for new nodes ary
             for (@new_nodes_map)
             {
-                # TODO: check for new group ref to use
-                # TODO: integrate attr settings into the if..else below 
-               
+                if (scalar(@new_nodes_map) > 1)
+                {
+                    $gene_suffix_counter++;
+                    # TODO: check for new group ref to use
+                }
+                
                 my $curr_new_node = $$_[0];
                 my $curr_paralog = $$_[1];
-                # TEST: print "$_: $curr_new_node, $curr_paralog\n";
+                # print "$_: $curr_new_node, $curr_paralog\n"; # TEST
+                # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
                 
-                if ($is_first_paralog) 
-                {
-                    # print "first paralog...\n";
-                    $is_first_paralog = 0; # first paralog complete 
-                }
-                else # add'l paralogs required
-                {
-                    # print "add'l paralog...\n";
-                    # SUB: generate a new DataNode hex id not already in use
-                    # update add'l nodes w/ special attributes:
-                        # decrement the Z-order
-                        # decrement CenterX and CenterY by 10px each
-                }                
                 # update all new nodes w/ attributes...
 
-                # rename TextLabel (prefix (config), TODO: suffix: -alpha or numeric, based on regex)
+                # rename TextLabel...
+                # prefix (from config), suffix: new '-#' for multiple paralogs
                 $curr_new_node->setAttribute("TextLabel",
                     $configs{"LabelPrefix"} 
-                        . $curr_new_node->getAttributeNode("TextLabel")->getValue);
+                        . $curr_new_node->getAttributeNode("TextLabel")->getValue
+                        . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
                 
                 # add Comment back-referencing TAIR locus id (use "source" attribute)
                 # NOTE: order is important in GPML; the <Comment> tags are first
@@ -487,16 +524,46 @@ sub swap_genes
                 $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
                 $curr_graphics->setAttribute("Color",$configs{"BoxBorder"});
                 $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"});
+
+                if ($is_first_paralog) 
+                {
+                    # print "that was the first paralog...\n"; # TEST
+                    $is_first_paralog = 0; # first paralog complete
+                }
+                else # add'l paralogs required
+                {
+                    $offset_multiplier++;
+                    
+                    # print "that was an add'l paralog, change more attrs...\n"; # TEST
+                    # update add'l nodes w/ special attributes...
+
+                    # generate a new DataNode GraphId not already in use
+                    my $new_GraphId = 
+                        create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
+                    # print $new_GraphId . "\n"; # TEST
+                    $curr_new_node->setAttribute("GraphId",$new_GraphId);
+                    
+                    # decrement the Z-order
+                    $curr_graphics->setAttribute("ZOrder",
+                        $curr_graphics->getAttributeNode("ZOrder")->getValue
+                            - $offset_multiplier);                    
+                    # stagger the extra boxes by decrementing the coords 
+                    $curr_graphics->setAttribute("CenterX",
+                        $curr_graphics->getAttributeNode("CenterX")->getValue
+                            - ($configs{"X-Offset"} * $offset_multiplier));
+                    $curr_graphics->setAttribute("CenterY",
+                        $curr_graphics->getAttributeNode("CenterY")->getValue
+                            - ($configs{"Y-Offset"} * $offset_multiplier));
+                }                
             }
             
-            # TODO: roll these node statments into the loop above
-            # replace old node w/ new node(s); do this in place
+            # replace old node w/ new node(s)
             for (@new_nodes_map) {
-                # replace one...
-                $pathway_node->replaceChild($$_[0],$curr_old_gene_node);
-                # ...and add the others immediately preceding
-                #insertBefore();
+                # add all the new nodes...
+                $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
             }
+            # ...and remove the original node
+            $pathway_node->removeChild($curr_old_gene_node);
         }
         else
         {