Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, commits may not be seen immediately.
Began modifying DataNodes for 1-to-1: added label prefix, added Comment
authorpreecej <preecej@localhost>
Wed, 10 Aug 2011 00:17:11 +0000 (00:17 +0000)
committerpreecej <preecej@localhost>
Wed, 10 Aug 2011 00:17:11 +0000 (00:17 +0000)
svn path=/; revision=145

preecej/perl_singletons/pathway_gene_swapper.pl

index 5066f39cc1469035708586a0a72b02c79a07fac8..c205efa9c2598a8b9fe0daa07f51de7851dff481 100644 (file)
@@ -258,6 +258,7 @@ sub import_data
             . "Original gene count: " . $original_gene_count . "\n"
             . "Replacement paralog count: $replacement_paralog_count\n\n"; 
     
+        print "[Number of paralogs per original gene]\n";
         foreach my $orig_gene_key (keys %swap_genes)
         {
             print "$orig_gene_key: " . scalar(@{$swap_genes{$orig_gene_key}}) . "\n";
@@ -290,9 +291,9 @@ sub show_input
     print Dumper(\%swap_genes) . "\n\n";
     print "\n";
 
-    print "[Source GPML]\n";
-    print $gpml_doc->toString;
-    print "\n";
+    print "[Source GPML]\n";
+    print $gpml_doc->toString;
+    print "\n";
 }
 
 # ---------------------------------------------------------------------------
@@ -302,11 +303,18 @@ Substitutes gene data.
 # ---------------------------------------------------------------------------
 sub swap_genes
 {
-    print "Swapping gene data and making other modifications...\n\n";
-        
-    # remove all <BiopaxRef> and <bp:PublicationXref> elements and children
+    print "Swapping gene data and making other document modifications...\n\n";
+
     my $pathway_node = ($gpml_doc->getElementsByTagName("Pathway"))[0];
 
+    # change Pathway header info to config settings
+    $pathway_node->setAttribute("Name",$configs{"Title"});
+    $pathway_node->setAttribute("Author",$configs{"Author"});
+    $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"});
+    $pathway_node->setAttribute("Version",$configs{"Date"});
+    $pathway_node->setAttribute("Organism",$configs{"Organism"});
+    
+    # remove all <BiopaxRef> and <bp:PublicationXref> elements and children
     my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0];
     $pathway_node->removeChild($biopax_node);
 
@@ -346,12 +354,32 @@ sub swap_genes
         # also build a data node hash to make lookup easier in the next section
         my $curr_xref_id = ($_->getElementsByTagName("Xref"))[0]
             ->getAttributeNode("ID")->getValue;
-        $curr_xref_id =~ s/\s+$//; # rtrim whitespace 
-        $data_nodes_by_gene_id{$curr_xref_id} = $_;
+
+        $curr_xref_id =~ s/\s+$//; # rtrim whitespace
+        #$curr_xref_id =~ s/^("|')("|')$//; # strip quotes - may not be necessary
+        
+        if (length($curr_xref_id) > 0)
+        {
+            $data_nodes_by_gene_id{$curr_xref_id} = $_;
+        }
+        else
+        {
+            if ($verbose) {
+                print "WARNING: Found DataNode (TextLabel: " 
+                    . $_->getAttributeNode('TextLabel')->getValue . ") "
+                    . "with missing Xref ID.\n"
+            }
+        }
     }
+    
 
-    # if ($debug) { print "...<DEBUG: \%data_nodes_by_gene_id>\n"
-    #     . Dumper(\%data_nodes_by_gene_id) . "\n\n"; }
+    if ($debug) {
+        print "...<DEBUG: \%data_nodes_by_gene_id>\n";
+        foreach my $tmp_node (keys %data_nodes_by_gene_id) {
+            print "...   $tmp_node => $data_nodes_by_gene_id{$tmp_node}\n";
+        }
+        print "\n";
+    }
 
     my $group_nodes = $pathway_node->getElementsByTagName("Group");
     for (@$group_nodes)
@@ -385,24 +413,31 @@ sub swap_genes
             # holds list of newly-created nodes, used to replace old node
             my @new_nodes;
 
-            # copy the curr old gene node
-            my $new_node = $curr_old_gene_node->cloneNode("deep");
-
             # testing...
-            $new_node->setAttribute("GraphId","zzzzz");
-            #print $new_node->getAttributeNode("GraphId")->getValue . "\n";
-            print $new_node->toString;
+            $new_node->setAttribute("GraphId","zzzzz");
+            # #print $new_node->getAttributeNode("GraphId")->getValue . "\n";
+            print $new_node->toString;
 
             # iterate through new gene replacements
-            # for (@{$swap_genes{$old_gene}})
-            # {
-            #     # add to new nodes ary
-                 push @new_nodes, $new_node;
-            # }
+            for (@{$swap_genes{$old_gene}})
+            {
+                # copy the curr old gene node
+                my $new_node = $curr_old_gene_node->cloneNode("deep");
+
+                # testing...
+                # print "[$_]\n";
+                # print $new_node->toString;
+                # print "\n\n";
+
+                # add to new nodes ary
+                # TODO: if this errs, perhaps catch and skip?
+                push @new_nodes, $new_node;
+            }
             # print "@new_nodes\n";
 
             # if more than one new gene maps, you'll need a group for multiple gene boxes
-            
+            if (scalar(@new_nodes) > 1)
+            {
                 # if curr old gene does not belong to a group
                     # create a new group node
                     # SUB: generate a new groupRef hex id not already in use
@@ -410,18 +445,44 @@ sub swap_genes
                 # for each new gene box
                     # use new or existing group id for new boxes
                     # add to new nodes ary
-    
+            }
+            
+            # flag for determining if there are one or many replacement paralogs
+            my $is_first_paralog = 1; 
+
             # for new nodes ary
-                # SUB: generate a new DataNode hex id not already in use
+            for (@new_nodes)
+            {
+                if ($is_first_paralog) 
+                {
+                    # print "first paralog...\n";
+                    $is_first_paralog = 0; # first paralog complete 
+                }
+                else # add'l paralogs required
+                {
+                    # print "add'l paralog...\n";
+                    # SUB: generate a new DataNode hex id not already in use
+                }                
                 # update new nodes w/ attributes:
-                    # rename TextLabel (prefix: Eu-, suffix: -alpha or numeric, based on regex)
-                    # add Comment back-referencing TAIR locus id (use "source" attribute)
-                    # edit <Xref Database="JGI" ID="Egrandis..." />
-                    # decrement the Z-order
-                    # decrement CenterX and CenterY by 10px each
-                    # change box Width if needed
-                    # color the box (<Graphics...Color="4488ff" />)
-
+                # rename TextLabel (prefix (config), suffix: -alpha or numeric, based on regex)
+                $_->setAttribute("TextLabel",
+                    $configs{"LabelPrefix"} 
+                        . $_->getAttributeNode("TextLabel")->getValue);
+                
+                # add Comment back-referencing TAIR locus id (use "source" attribute)
+                # NOTE: order is important in GPML; the <Comment> tags are first
+                my $new_comment = $gpml_doc->createElement("Comment");
+                $new_comment->setAttribute("Source",$configs{"Source"});
+                $new_comment->addText($configs{"CommentPrefix"} . $old_gene . ".");
+                $_->insertBefore($new_comment,$_->getFirstChild); # assumes other child nodes  
+                
+                # edit <Xref Database="JGI" ID="Egrandis..." />
+                # decrement the Z-order
+                # decrement CenterX and CenterY by 10px each
+                # change box Width if needed
+                # color the box (<Graphics...Color="4488ff" />)
+            }
+            
             # replace old node w/ new node(s); do this in place
             for (@new_nodes) {
                 # replace one
@@ -460,9 +521,9 @@ Displays the transformed data. Verbose only.
 # ---------------------------------------------------------------------------
 sub show_output
 {
-    print "[Modified GPML Output]\n";
-    print $gpml_doc->toString;
-    print "\n";
+    print "[Modified GPML Output]\n";
+    print $gpml_doc->toString;
+    print "\n";
 }
 
 # ---------------------------------------------------------------------------