From 48d3bb05cd3f90ea12e170d7bf0ee35d9e31504e Mon Sep 17 00:00:00 2001 From: preecej Date: Thu, 11 Aug 2011 19:20:23 +0000 Subject: [PATCH] Added options, set config defaults, hardened the code svn path=/; revision=150 --- .../perl_singletons/pathway_gene_swapper.pl | 206 ++++++++++++------ 1 file changed, 137 insertions(+), 69 deletions(-) diff --git a/preecej/perl_singletons/pathway_gene_swapper.pl b/preecej/perl_singletons/pathway_gene_swapper.pl index 25772e7..1e05a49 100644 --- a/preecej/perl_singletons/pathway_gene_swapper.pl +++ b/preecej/perl_singletons/pathway_gene_swapper.pl @@ -11,11 +11,28 @@ Pathway Gene Swapper =head1 DESCRIPTION Swap out one set of genes for another in an existing PathVisio GPML -file. +file. Optionally removes literature references. + +If multiple replacement genes map to a single original gene, +multiple PathVisio boxes will be drawn in place of the +original. + +If a group associating multiple gene boxes already existed, +that group will extend to any replacement genes. If no group +existed previously, and multiple replacement gene boxes are required, +a new group will be created. + +The replacement gene symbols can be prefixed to separate them from the +original, and an ordinal suffix ('-#') will be added to a group of +replacement genes. + +Any new gene boxes may be painted with a custom color and border, and +will be stacked and offset for ease of visualization (much like a deck +of cards). =head1 USAGE -pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -v -d +pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -L -v -G -d =head1 OPTIONS @@ -25,38 +42,42 @@ pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -o Name of output GPML file. (NOTE: if no path supplied for input files, current working directory is assumed) + -L Remove literature references.
-v View verbose information + -G Display GPML input/output documents -d View debugging information =head1 DEPENDENCIES and PREREQUISITES - - Non-standard Perl modules: Switch, XML::DOM + - Non-standard Perl modules: Switch, Data::Dumper, XML::DOM - The input file must be a valid GPML file - The CSV file must have a single-line column header. The first column must have one and only one gene -- the "original" gene. - The second column may have one or more genes or gene variants - -- the "replacement" gene(s). - - If the second column contains multiple genes or gene variants, - multiple PathVisio boxes will be drawn in place of the - original. + The second column may have one and only one gene variant or + paralog -- the "replacement" gene(s). - - The config file may have any or all of the following entries: + - The config file may have any or all of the following entries, + in addition to the required fields (in any order): + Source= (required) + Database= (required) + Version= (required) Title= - MaintainedBy= + Author= + Maintainer= Organism= + CommentPrefix= (will precede back-reference to prior source, + default: "Previously labeled as: ") + LabelPrefix= (precedes current gene label) BoxBorder= (RRGGBB hex, default: black) BoxColor= (RRGGBB hex, default: white) - BoxWidth= (integer, in px) - CommentPrefix= (will precede back-reference to prior source) - LabelPrefix= (precedes current gene label) - X-Offset= (integer, in px) - Y-Offset= (integer, in px) - + BoxWidth= (integer, in px, default: 60) + X-Offset= (integer, in px, default: 4) + Y-Offset= (integer, in px, default: 3) + =head1 AUTHORS Justin Preece and Mamatha Hanumappa @@ -92,7 +113,9 @@ my $input_gpml_file; my $input_gene_file; my $input_config_file; my $output_file; +my $remove_lit = 0; # flag to remove literature and lit references my $verbose = 0; # flag for verbose output +my $doc_mode = 0; # flag for extra GPML doc output my $debug = 0; # debugging switch # global data containers @@ -133,10
+156,38 @@ sub config($) my $data_field = $line_ary[0]; my $data_val = $line_ary[1]; - $local_config_hash{$data_field} = $data_val; + if ($data_val) + { + $local_config_hash{$data_field} = $data_val; + } } close(CONFIG_FILE); + + # check for required settings + if (!$local_config_hash{"Source"}) { + die("You are required to provide a Source for your new data.\n" + . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); } + if (!$local_config_hash{"Database"}) { + die("You are required to identify a Database for your new data.\n" + . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); } + if (!$local_config_hash{"Version"}) { + die("You are required to provide a Version idenifier for your new data.\n" + . "Run \"perldoc pathway_gene_swapper.pl\" for more information.\n\n"); } + + # set defaults, if none declared + if (!$local_config_hash{"CommentPrefix"}) { + $local_config_hash{"CommentPrefix"} = "Previously labeled as: "; + } + if (!$local_config_hash{"BoxWidth"}) { + $local_config_hash{"BoxWidth"} = 60; + } + if (!$local_config_hash{"X-Offset"}) { + $local_config_hash{"X-Offset"} = 4; + } + if (!$local_config_hash{"Y-Offset"}) { + $local_config_hash{"Y-Offset"} = 3; + } return %local_config_hash; } @@ -151,7 +202,7 @@ screen output. sub init { # read and set options - getopts('i:g:c:o:vd', \%opts); + getopts('i:g:c:o:LvGd', \%opts); foreach my $key (keys %opts) { my $value = $opts{$key}; @@ -184,8 +235,9 @@ sub init $output_file = getcwd() . "\/$value"; } } + case "L" { $remove_lit = 1; } case "v" { $verbose = 1; } - + case "G" { $doc_mode = 1; } case "d" { $debug = 1; } } } @@ -204,6 +256,7 @@ sub init . "Output File: $output_file\n" . "\n" . "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n" + . "Running in document mode? " . ($doc_mode ? "Yes" : "No") . "\n" . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n" . "\n" . 
"------------------------------------------------------------\n" @@ -283,17 +336,22 @@ Verbose only. # --------------------------------------------------------------------------- sub show_input { - print "[Configuration Settings]\n"; - print Dumper(\%configs) . "\n\n"; - print "\n"; - - print "[Gene Mappings]\n"; - print Dumper(\%swap_genes) . "\n\n"; - print "\n"; - - # print "[Source GPML]\n"; - # print $gpml_doc->toString; - # print "\n"; + if ($verbose) + { + print "[Configuration Settings]\n"; + print Dumper(\%configs) . "\n\n"; + print "\n"; + + print "[Gene Mappings]\n"; + print Dumper(\%swap_genes) . "\n\n"; + print "\n"; + } + if ($doc_mode) + { + print "[Source GPML]\n"; + print $gpml_doc->toString; + print "\n"; + } } # --------------------------------------------------------------------------- @@ -325,7 +383,7 @@ sub create_unique_hex_id($$) if (exists ${$_[0]}{$candidate_id}) { # print "not unique...\n"; # TEST - $candidate_id = create_unique_hex_id($_[0],$_[1]); + $candidate_id = &create_unique_hex_id($_[0],$_[1]); } else { @@ -348,31 +406,40 @@ sub swap_genes my $pathway_node = ($gpml_doc->getElementsByTagName("Pathway"))[0]; # change Pathway header info to config settings - $pathway_node->setAttribute("Name",$configs{"Title"}); - $pathway_node->setAttribute("Author",$configs{"Author"}); - $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"}); - $pathway_node->setAttribute("Version",$configs{"Date"}); - $pathway_node->setAttribute("Organism",$configs{"Organism"}); + if ($configs{"Title"}) { + $pathway_node->setAttribute("Name",$configs{"Title"}); } + if ($configs{"Author"}) { + $pathway_node->setAttribute("Author",$configs{"Author"}); } + if ($configs{"Maintainer"}) { + $pathway_node->setAttribute("Maintainer",$configs{"Maintainer"}); } + if ($configs{"Version"}) { + $pathway_node->setAttribute("Version",$configs{"Version"}); } + if ($configs{"Organism"}) { + $pathway_node->setAttribute("Organism",$configs{"Organism"}); } - # remove all 
and elements and children - my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0]; - $pathway_node->removeChild($biopax_node); - + # get all "gene box" data nodes my $data_nodes = $pathway_node->getElementsByTagName("DataNode"); - # print $data_nodes->getLength . "\n"; # TEST - for (@$data_nodes) + # remove all and elements and children + if ($remove_lit) { - my $curr_datanode = $_; - my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef"); - for (@$biopaxref_nodes) + print "Removing literature references...\n\n"; + my $biopax_node = ($pathway_node->getElementsByTagName("Biopax"))[0]; + $pathway_node->removeChild($biopax_node); + + for (@$data_nodes) { - # print $_->getTagName . "\n"; # TEST - $curr_datanode->removeChild($_); + my $curr_datanode = $_; + my $biopaxref_nodes = $curr_datanode->getElementsByTagName("BiopaxRef"); + for (@$biopaxref_nodes) + { + # print $_->getTagName . "\n"; # TEST + $curr_datanode->removeChild($_); + } } } - + # will hold a convenient list of data node references in the gpml doc, # indexed by the id of the gene located in the element for each # node @@ -410,6 +477,7 @@ sub swap_genes } } + print "\n"; if ($debug) { print "...\n"; @@ -524,7 +592,7 @@ sub swap_genes # rename TextLabel... # prefix (from config), suffix: new '-#' for multiple paralogs $curr_new_node->setAttribute("TextLabel", - $configs{"LabelPrefix"} + (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "") . $curr_new_node->getAttributeNode("TextLabel")->getValue . (($gene_suffix_counter > 0) ? 
"-$gene_suffix_counter" : "")); @@ -549,8 +617,10 @@ sub swap_genes # change box width and colors () my $curr_graphics = ($curr_new_node->getElementsByTagName("Graphics"))[0]; $curr_graphics->setAttribute("Width",$configs{"BoxWidth"}); - $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); - $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); + if ($configs{"BoxBorder"}) { + $curr_graphics->setAttribute("Color",$configs{"BoxBorder"}); } + if ($configs{"BoxColor"}) { + $curr_graphics->setAttribute("FillColor",$configs{"BoxColor"}); } if ($is_first_paralog) { @@ -603,17 +673,6 @@ sub swap_genes # [later] # .After PathVisio sanity check, before communication: # .remove added back-ref TAIR comments (just rerun w/o added ?) - - # [usage example] - # my $nodes = $gpml_doc->getElementsByTagName("DataNode"); - # my $n = $nodes->getLength; - - # for (my $i = 0; $i < $n; $i++) - # { - # my $node = $nodes->item ($i); - # my $gene_label = $node->getAttributeNode("TextLabel"); - # print $gene_label->getValue . "\n"; - # } } # --------------------------------------------------------------------------- @@ -623,9 +682,12 @@ Displays the transformed data. Verbose only. # --------------------------------------------------------------------------- sub show_output { - # print "[Modified GPML Output]\n"; - # print $gpml_doc->toString; - # print "\n"; + if ($doc_mode) + { + print "\n[Modified GPML Output]\n"; + print $gpml_doc->toString; + print "\n"; + } } # --------------------------------------------------------------------------- @@ -635,7 +697,7 @@ Writes the transformed GPML doc out to the specified output file. # --------------------------------------------------------------------------- sub export_data { - print "Writing GPML to output file...\n\n"; + print "\nWriting GPML to output file...\n\n"; # ensures utf-8 encoding (for accent marks, etc.) 
open my $out_file_handle, ">:utf8", "$output_file" or die $!; @@ -651,11 +713,17 @@ sub export_data # --------------------------------------------------------------------------- init; + import_data; -if ($verbose) { show_input; } +show_input; swap_genes(); -if ($verbose) { show_output; } +show_output; export_data; + $gpml_doc->dispose; # cleanup exit; +# --------------------------------------------------------------------------- +# end +# --------------------------------------------------------------------------- + -- 2.34.1