Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
Added ability to map to multiple original gene boxes; added verbose
authorpreecej <preecej@localhost>
Fri, 12 Aug 2011 23:23:28 +0000 (23:23 +0000)
committerpreecej <preecej@localhost>
Fri, 12 Aug 2011 23:23:28 +0000 (23:23 +0000)
display and count of unmapped original genes

svn path=/; revision=154

preecej/perl_singletons/pathway_gene_swapper.pl

index 7e3459f75d513066a42d3582182613c3647d49a5..cc9c4d7eb4f8d49748adbfe904cca252650acadb 100644 (file)
@@ -6,7 +6,7 @@ Pathway Gene Swapper
 
 =head1 VERSION
 
-0.1
+0.2
 
 =head1 DESCRIPTION
 
@@ -22,6 +22,9 @@ that group will extend to any replacement genes. If no group
 existed previously, and multiple replacement gene boxes are required,
 a new group will be created.
 
+If an original gene had multiple instances displayed on the pathway
+diagram, each instance will be subjected to the replacement process.
+
 The replacement gene symbols can be prefixed to separate them from the 
 original, and an ordinal suffix ('-#') will be added to a group of 
 replacement genes.
@@ -119,9 +122,10 @@ my $doc_mode = 0; # flag for extra GPML doc output
 my $debug = 0; # debugging switch
 
 # global data containers
-my %configs; # holds global configuration settings
-my %swap_genes; # holds original and swapped genes
-my $gpml_doc; #holds imported GPML data for manipulation and output
+my %configs; # configuration settings
+my %swap_genes; # original and swapped genes
+my $gpml_doc; # imported GPML data for manipulation and output
+my %unmapped_genes; # original genes not mapped to homologs 
 
 $Data::Dumper::Pad = "... "; 
 
@@ -447,9 +451,9 @@ sub swap_genes
         }
     }
 
-    # will hold a convenient list of data node references in the gpml doc,
+    # will hold a convenient nested hash of data node references in the gpml doc,
     # indexed by the id of the gene located in the <Xref> element for each
-    # node
+    # node, and sub-indexed by the GraphId of each corresponding node
     my %data_nodes_by_gene_id;
     
     # create a hash of all 5-digit hex ids in the gpml doc (this is the black list)
@@ -467,14 +471,16 @@ sub swap_genes
         # also build a data node hash to make lookup easier in the next section
         my $curr_xref_id = ($_->getElementsByTagName("Xref"))[0]
             ->getAttributeNode("ID")->getValue;
+        my $curr_graph_id = $_->getAttributeNode("GraphId")->getValue;
 
         $curr_xref_id =~ s/\s+$//; # rtrim whitespace
+        $curr_xref_id =~ s/^\s+//; # ltrim whitespace
         #$curr_xref_id =~ s/^("|')("|')$//; # strip quotes - may not be necessary
         
         if (length($curr_xref_id) > 0)
         {
-            if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST
-            $data_nodes_by_gene_id{$curr_xref_id} = $_;
+            #if ($curr_xref_id eq "AT3G12810") { print "** hit on AT3G12810\n"; } # TEST
+            $data_nodes_by_gene_id{$curr_xref_id}{$curr_graph_id} = $_;
         }
         else
         {
@@ -485,14 +491,15 @@ sub swap_genes
             }
         }
     }
-    
     print "\n";
 
     if ($debug) {
         print "...<DEBUG: \%data_nodes_by_gene_id>\n";
-        foreach my $tmp_node (keys %data_nodes_by_gene_id) {
-            print "...   $tmp_node => $data_nodes_by_gene_id{$tmp_node}\n";
-            if ($tmp_node eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST
+        foreach my $tmp_gene (keys %data_nodes_by_gene_id) {
+            foreach my $tmp_node (keys %{$data_nodes_by_gene_id{$tmp_gene}}) {
+                print "...   $tmp_gene => $tmp_node => $data_nodes_by_gene_id{$tmp_gene}{$tmp_node}\n";
+                #if ($tmp_gene eq "AT3G12810") { print "** hit on AT3G12810 node\n"; } # TEST
+            }
         }
         print "\n";
     }
@@ -520,158 +527,165 @@ sub swap_genes
     {
         # print $old_gene . "\n"; # TEST
 
-        # find curr old gene node in doc
+        # find curr old gene nodes in doc
         if (exists $data_nodes_by_gene_id{$old_gene})
         {
-            my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene};
-            # print $curr_old_gene_node . "\n"; # TEST
-
-            # holds list of newly-created nodes, used to replace old node
-            my @new_nodes_map;
-
-            # iterate through new gene replacements
-            for (@{$swap_genes{$old_gene}})
-            {
-                # copy the curr old gene node
-                my $new_node = $curr_old_gene_node->cloneNode("deep");
-
-                # print "[$_]\n$new_node->toString\n\n"; # TEST
-
-                # add to new nodes ary
-                push @new_nodes_map, [$new_node, $_];
-            }
-            # print "@new_nodes_map\n"; # TEST
-
-            # if more than one new homolog exists, and the old gene doesn't
-            # already belong to a group, you'll need a new Group for multiple 
-            # gene boxes
-            my $new_GroupId;
-            
-            if (scalar(@new_nodes_map) > 1)
+            # iterate through each node by its GraphId
+            foreach my $curr_old_genes_by_hex_id (keys %{$data_nodes_by_gene_id{$old_gene}})
             {
-                # if curr old gene does not belong to a group
-
-                # print $curr_old_gene_node->toString . "\n"; # TEST
-                # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST
-
-                if (!$curr_old_gene_node->getAttribute("GroupRef"))
+                my $curr_old_gene_node = $data_nodes_by_gene_id{$old_gene}{$curr_old_genes_by_hex_id};
+                # print $curr_old_gene_node . "\n"; # TEST
+    
+                # holds list of newly-created nodes, used to replace old node
+                my @new_nodes_map;
+    
+                # iterate through new gene replacements
+                for (@{$swap_genes{$old_gene}})
                 {
-                    #print "no existing group ref\n"; # TEST
-
-                    # generate a new GroupId and Group.GraphId hex ids not 
-                    # already in use
-                    $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId");
-                    # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId");
-                    #print "new group id: $new_GroupId\n"; # TEST
-                    # print "$new_GroupId, $new_Group_GraphId\n"; # TEST
-
-                    # create a new group node
-                    my $new_group = $gpml_doc->createElement("Group");
-                    $new_group->setAttribute("GroupId",$new_GroupId);
-                    #$new_group->setAttribute("GraphId",$new_Group_GraphId);
-                    $new_group->setAttribute("Style","Group");
-
-                    # add to beginning of group nodes
-                    $pathway_node->insertBefore($new_group,${$group_nodes}[0]);  
+                    # copy the curr old gene node
+                    my $new_node = $curr_old_gene_node->cloneNode("deep");
+    
+                    # print "[$_]\n$new_node->toString\n\n"; # TEST
+    
+                    # add to new nodes ary
+                    push @new_nodes_map, [$new_node, $_];
                 }
-            }
-            
-            # flag for determining if there are one or many replacement homologs
-            my $is_first_homolog = 1; 
-
-            # makes sure each box is increasingly offset from the original
-            # (in all three dimensions)
-            my $offset_multiplier = 0;
-            my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols 
-            
-            # for new nodes ary
-            for (@new_nodes_map)
-            {
+                # print "@new_nodes_map\n"; # TEST
+    
+                # if more than one new homolog exists, and the old gene doesn't
+                # already belong to a group, you'll need a new Group for multiple 
+                # gene boxes
+                my $new_GroupId;
+                
                 if (scalar(@new_nodes_map) > 1)
                 {
-                    $gene_suffix_counter++;
+                    # if curr old gene does not belong to a group
+    
+                    # print $curr_old_gene_node->toString . "\n"; # TEST
+                    # print $curr_old_gene_node->getAttribute("GroupRef"); # TEST
+    
+                    if (!$curr_old_gene_node->getAttribute("GroupRef"))
+                    {
+                        #print "no existing group ref\n"; # TEST
+    
+                        # generate a new GroupId and Group.GraphId hex ids not 
+                        # already in use
+                        $new_GroupId = create_unique_hex_id(\%existing_hex_ids,"Group.GroupId");
+                        # my $new_Group_GraphId = create_unique_hex_id(\%existing_hex_ids,"Group.GraphId");
+                        #print "new group id: $new_GroupId\n"; # TEST
+                        # print "$new_GroupId, $new_Group_GraphId\n"; # TEST
+    
+                        # create a new group node
+                        my $new_group = $gpml_doc->createElement("Group");
+                        $new_group->setAttribute("GroupId",$new_GroupId);
+                        #$new_group->setAttribute("GraphId",$new_Group_GraphId);
+                        $new_group->setAttribute("Style","Group");
+    
+                        # add to beginning of group nodes
+                        $pathway_node->insertBefore($new_group,${$group_nodes}[0]);  
+                    }
                 }
                 
-                my $curr_new_node = $$_[0];
-                my $curr_homolog = $$_[1];
-                # print "$_: $curr_new_node, $curr_homolog\n"; # TEST
-                # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
-                
-                # update all new nodes w/ attributes...
-
-                # rename TextLabel...
-                # prefix (from config), suffix: new '-#' for multiple homologs
-                $curr_new_node->setAttribute("TextLabel",
-                    (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
-                        . $curr_new_node->getAttributeNode("TextLabel")->getValue
-                        . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
+                # flag for determining if there are one or many replacement homologs
+                my $is_first_homolog = 1; 
+    
+                # makes sure each box is increasingly offset from the original
+                # (in all three dimensions)
+                my $offset_multiplier = 0;
+                my $gene_suffix_counter = 0; # used to affix numbers to multiple new gene symbols 
                 
-                # add new GroupRef if necessary
-                if ($new_GroupId)
+                # for new nodes ary
+                for (@new_nodes_map)
                 {
-                    $curr_new_node->setAttribute("GroupRef",$new_GroupId);
+                    if (scalar(@new_nodes_map) > 1)
+                    {
+                        $gene_suffix_counter++;
+                    }
+                    
+                    my $curr_new_node = $$_[0];
+                    my $curr_homolog = $$_[1];
+                    # print "$_: $curr_new_node, $curr_homolog\n"; # TEST
+                    # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST
+                    
+                    # update all new nodes w/ attributes...
+    
+                    # rename TextLabel...
+                    # prefix (from config), suffix: new '-#' for multiple homologs
+                    $curr_new_node->setAttribute("TextLabel",
+                        (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "")
+                            . $curr_new_node->getAttributeNode("TextLabel")->getValue
+                            . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : ""));
+                    
+                    # add new GroupRef if necessary
+                    if ($new_GroupId)
+                    {
+                        $curr_new_node->setAttribute("GroupRef",$new_GroupId);
+                    }
+                    
+                    # add Comment back-referencing TAIR locus id (use "source" attribute)
+                    # NOTE: order is important in GPML; the <Comment> tags are first
+                    my $new_comment = $gpml_doc->createElement("Comment");
+                    $new_comment->setAttribute("Source",$configs{"Source"});
+                    $new_comment->addText($configs{"CommentPrefix"} . " $old_gene.");
+                    $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes  
+                    
+                    # edit <Xref Database="JGI" ID="Egrandis..." />
+                    my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0];
+                    $curr_xref->setAttribute("Database",$configs{"Database"});
+                    $curr_xref->setAttribute("ID",$curr_homolog);
+    
+                    # change box width and colors (<Graphics...Color="4488ff" ... />)
+                    my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
+                    $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
+                    if ($configs{"BoxBorder"}) {
+                        $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
+                    if ($configs{"BoxColor"}) {
+                        $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
+    
+                    if ($is_first_homolog) 
+                    {
+                        # print "that was the first homolog...\n"; # TEST
+                        $is_first_homolog = 0; # first homolog complete
+                    }
+                    else # add'l homologs required
+                    {
+                        $offset_multiplier++;
+                        
+                        # print "that was an add'l homolog, change more attrs...\n"; # TEST
+                        # update add'l nodes w/ special attributes...
+    
+                        # generate a new DataNode GraphId not already in use
+                        my $new_GraphId = 
+                            create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
+                        # print $new_GraphId . "\n"; # TEST
+                        $curr_new_node->setAttribute("GraphId",$new_GraphId);
+                        
+                        # decrement the Z-order
+                        $curr_graphics->setAttribute("ZOrder",
+                            $curr_graphics->getAttributeNode("ZOrder")->getValue
+                                - $offset_multiplier);                    
+                        # stagger the extra boxes by decrementing the coords 
+                        $curr_graphics->setAttribute("CenterX",
+                            $curr_graphics->getAttributeNode("CenterX")->getValue
+                                - ($configs{"X-Offset"} * $offset_multiplier));
+                        $curr_graphics->setAttribute("CenterY",
+                            $curr_graphics->getAttributeNode("CenterY")->getValue
+                                - ($configs{"Y-Offset"} * $offset_multiplier));
+                    }                
                 }
+                undef $new_GroupId; # clear this out so we can test against its existence next time
                 
-                # add Comment back-referencing TAIR locus id (use "source" attribute)
-                # NOTE: order is important in GPML; the <Comment> tags are first
-                my $new_comment = $gpml_doc->createElement("Comment");
-                $new_comment->setAttribute("Source",$configs{"Source"});
-                $new_comment->addText($configs{"CommentPrefix"} . " $old_gene.");
-                $curr_new_node->insertBefore($new_comment,$curr_new_node->getFirstChild); # assumes other child nodes  
-                
-                # edit <Xref Database="JGI" ID="Egrandis..." />
-                my $curr_xref = ($curr_new_node->getElementsByTagName("Xref"))[0];
-                $curr_xref->setAttribute("Database",$configs{"Database"});
-                $curr_xref->setAttribute("ID",$curr_homolog);
-
-                # change box width and colors (<Graphics...Color="4488ff" ... />)
-                my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0];
-                $curr_graphics->setAttribute("Width",$configs{"BoxWidth"});
-                if ($configs{"BoxBorder"}) {
-                    $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); }
-                if ($configs{"BoxColor"}) {
-                    $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); }
-
-                if ($is_first_homolog) 
-                {
-                    # print "that was the first homolog...\n"; # TEST
-                    $is_first_homolog = 0; # first homolog complete
+                # replace old node w/ new node(s)
+                for (@new_nodes_map) {
+                    # add all the new nodes...
+                    $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
                 }
-                else # add'l homologs required
-                {
-                    $offset_multiplier++;
-                    
-                    # print "that was an add'l homolog, change more attrs...\n"; # TEST
-                    # update add'l nodes w/ special attributes...
-
-                    # generate a new DataNode GraphId not already in use
-                    my $new_GraphId = 
-                        create_unique_hex_id(\%existing_hex_ids,"DataNode.GraphId");
-                    # print $new_GraphId . "\n"; # TEST
-                    $curr_new_node->setAttribute("GraphId",$new_GraphId);
-                    
-                    # decrement the Z-order
-                    $curr_graphics->setAttribute("ZOrder",
-                        $curr_graphics->getAttributeNode("ZOrder")->getValue
-                            - $offset_multiplier);                    
-                    # stagger the extra boxes by decrementing the coords 
-                    $curr_graphics->setAttribute("CenterX",
-                        $curr_graphics->getAttributeNode("CenterX")->getValue
-                            - ($configs{"X-Offset"} * $offset_multiplier));
-                    $curr_graphics->setAttribute("CenterY",
-                        $curr_graphics->getAttributeNode("CenterY")->getValue
-                            - ($configs{"Y-Offset"} * $offset_multiplier));
-                }                
-            }
-            undef $new_GroupId; # clear this out so we can test against its existence next time
-            
-            # replace old node w/ new node(s)
-            for (@new_nodes_map) {
-                # add all the new nodes...
-                $pathway_node->insertBefore($$_[0],$curr_old_gene_node);
+                # ...and remove the original node
+                $pathway_node->removeChild($curr_old_gene_node);
             }
-            # ...and remove the original node
-            $pathway_node->removeChild($curr_old_gene_node);
+            # once mapped, remove the old gene so we are left with a list of 
+            # unmapped original genes (for show_ouput())
+            delete($data_nodes_by_gene_id{$old_gene});
         }
         else
         {
@@ -679,10 +693,7 @@ sub swap_genes
                 . "PathVisio GPML document.\n";
         }
     }
-    
-    # [later]
-    # .After PathVisio sanity check, before communication:
-    #   .remove added back-ref TAIR comments (just rerun w/o added <Comment>?)
+    %unmapped_genes = %data_nodes_by_gene_id; # whatever is left over
 }
 
 # ---------------------------------------------------------------------------
@@ -692,6 +703,21 @@ Displays the transformed data. Verbose only.
 # ---------------------------------------------------------------------------
 sub show_output
 {
+    if ($verbose) {
+        print "[Unmapped original genes]\n";
+        my $count = 0;
+        foreach my $tmp_gene (keys %unmapped_genes)
+        {
+            foreach my $tmp_node (keys %{$unmapped_genes{$tmp_gene}})
+            {
+                $count++;
+                print $unmapped_genes{$tmp_gene}{$tmp_node}
+                    ->getAttributeNode("TextLabel")->getValue
+                    . "\t($tmp_gene)\n";
+            }
+        }
+        print "\nTotal unmapped genes: $count\n";
+    }
     if ($doc_mode)
     { 
         print "\n[Modified GPML Output]\n";