From d5c77c56398e03ef4bc858bd263b8685d4f29e4c Mon Sep 17 00:00:00 2001 From: preecej Date: Fri, 14 Oct 2011 20:22:14 +0000 Subject: [PATCH] minor adjustments to maize script; in progress: adding gene symbol feature to gene swapper svn path=/; revision=184 --- .../perl_singletons/pathway_gene_swapper.pl | 92 ++++++++++++++----- .../zea_Maize_PO_CoGe_name_swap.pl | 32 +++---- 2 files changed, 83 insertions(+), 41 deletions(-) diff --git a/preecej/perl_singletons/pathway_gene_swapper.pl b/preecej/perl_singletons/pathway_gene_swapper.pl index 7c98f29..82d9812 100644 --- a/preecej/perl_singletons/pathway_gene_swapper.pl +++ b/preecej/perl_singletons/pathway_gene_swapper.pl @@ -55,6 +55,7 @@ pathway_gene_swapper.pl -i INPUT_FILE -g GENE_FILE -c CONFIG_FILE -o OUTPUT_FILE -o Name of output GPML file. (NOTE: if no path supplied for input files, current working directory is assumed) + -s use provided gene symbols instead of config file's LabelPrefix -L Remove literature references. -v View verbose information -G Display GPML input/output documents @@ -127,6 +128,7 @@ my $input_gene_file; my $input_config_file; my $output_file; my $remove_lit = 0; # flag to remove literature and lit references +my $use_symbols = 0; # flag to indicate use of provided gene symbols my $verbose = 0; # flag for verbose output my $doc_mode = 0; # flag for extra GPML doc output my $debug = 0; # debugging switch @@ -217,7 +219,7 @@ screen output. sub init { # read and set options - getopts('i:g:c:o:LvGd', \%opts); + getopts('i:g:c:o:LsvGd', \%opts); foreach my $key (keys %opts) { my $value = $opts{$key}; @@ -250,6 +252,7 @@ sub init $output_file = getcwd() . "\/$value"; } } + case "s" { $use_symbols = 1; } case "L" { $remove_lit = 1; } case "v" { $verbose = 1; } case "G" { $doc_mode = 1; } @@ -270,6 +273,8 @@ sub init . "\n" . "Output File: $output_file\n" . "\n" + . "Use provided gene symbols? " . ($use_symbols ? "Yes" : "No") . "\n" + . "Remove literature references? " . ($remove_lit ? "Yes" : "No") . "\n" . "Running in verbose mode? " . ($verbose ? "Yes" : "No") . "\n" . "Running in document mode? " . ($doc_mode ? "Yes" : "No") . "\n" . "Running in debug mode? " . ($debug ? "Yes" : "No") . "\n" @@ -299,26 +304,48 @@ sub import_data my $original_gene_count = 0; my $replacement_homolog_count = 0; + my $orig_data_item; + my $new_symbol = ""; + my $new_data_item; + + # ignore header my $line = ; while () { - $line = $_; - chomp $line; - my @line_ary = split(',',$line); - my $data_field = $line_ary[0]; - my $data_val = $line_ary[1]; - - #Does ath_gene exist? - if (!exists $swap_genes{$data_field}) - { + $line = $_; + chomp $line; + my @line_ary = split(',',$line); + my $orig_data_item = $line_ary[0]; + + if ($use_symbols) { + if (scalar(@line_ary) != 3) { + die("If you specify that your gene-mapping file includes " + . "symbols, then your CSV input file must have three " + . "columns of data (in this order): " + . "old gene, new symbol, new gene\n"); + } + $new_symbol = $line_ary[1]; + $new_data_item = $line_ary[2]; + } + else + { + $new_data_item = $line_ary[1]; + } + + #Does ath_gene exist? + if (!exists $swap_genes{$orig_data_item}) + { $original_gene_count++; - } - $replacement_homolog_count++; # count this every time - - # add new gene to hash value (array) for old gene hash key - push @{$swap_genes{$data_field}},$data_val; - } + } + $replacement_homolog_count++; # count this every time + + # add new gene to hash value (array) for old gene hash key + push @{$swap_genes{$orig_data_item}}, + { "symbol"=>$new_symbol, "new_item"=>$new_data_item }; + + $new_symbol = ""; # reset for next iter. + } if ($verbose) # give add'l stats on gene and homolog counts { @@ -623,18 +650,32 @@ sub swap_genes } my $curr_new_node = $$_[0]; - my $curr_homolog = $$_[1]; - # print "$_: $curr_new_node, $curr_homolog\n"; # TEST - # print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST - + my $curr_symbol = ""; + + if ($use_symbols) + { + $curr_symbol = ${$$_[1]}[0]; + } + + my $curr_homolog = ${$$_[1]}[1]; + + #print "$_: $curr_new_node, $curr_symbol, $curr_homolog\n"; # TEST + #print "[Curr New Node before editing...]\n" . $curr_new_node->toString . "\n\n"; # TEST + # update all new nodes w/ attributes... # rename TextLabel... - # prefix (from config), suffix: new '-#' for multiple homologs - $curr_new_node->setAttribute("TextLabel", - (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "") - . $curr_new_node->getAttributeNode("TextLabel")->getValue - . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : "")); + if ($use_symbols && length($curr_symbol) > 0) # apply the provided gene symbol + { + $curr_new_node->setAttribute("TextLabel", $curr_symbol); + } + else # prefix (from config), suffix: new '-#' for multiple homologs + { + $curr_new_node->setAttribute("TextLabel", + (($configs{"LabelPrefix"}) ? $configs{"LabelPrefix"} : "") + . $curr_new_node->getAttributeNode("TextLabel")->getValue + . (($gene_suffix_counter > 0) ? "-$gene_suffix_counter" : "")); + } # add new GroupRef if necessary if ($new_GroupId) @@ -772,6 +813,7 @@ init; import_data; show_input; +exit swap_genes(); show_output; export_data; diff --git a/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl b/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl index e76fc58..5dda3e7 100644 --- a/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl +++ b/preecej/perl_singletons/zea_Maize_PO_CoGe_name_swap.pl @@ -18,21 +18,21 @@ while () $line = $_; chomp $line; - my @curr_line = split(',',$line); - - my $gene_symbol = $curr_line[0]; - $gene_symbol =~ tr/"//d; - my $gene_name = $curr_line[2]; - $gene_name =~ tr/"//d; - my $gene_model_id = $curr_line[8]; - $gene_model_id =~ tr/"//d; - - #print $gene_symbol . "\|" . $gene_name . "\|" . $gene_model_id . "\n"; - - my $gene_model_expr = "^(GRMZM)"; - if ($gene_model_id =~ $gene_model_expr) { - $classical_genes_by_gene_model{$gene_model_id} = [ $gene_symbol, $gene_name ]; - } + my @curr_line = split(',',$line); + + my $gene_symbol = $curr_line[0]; + $gene_symbol =~ tr/"//d; + my $gene_name = $curr_line[2]; + $gene_name =~ tr/"//d; + my $gene_model_id = $curr_line[8]; + $gene_model_id =~ tr/"//d; + + #print $gene_symbol . "\|" . $gene_name . "\|" . $gene_model_id . "\n"; + + my $gene_model_expr = "^(GRMZM)"; + if ($gene_model_id =~ $gene_model_expr) { + $classical_genes_by_gene_model{$gene_model_id} = [ $gene_symbol, $gene_name ]; + } } close (IN_FILE); @@ -45,7 +45,7 @@ open(ASSOC_IN_FILE, $ARGV[1]); open(OUT_FILE,">" . (split('\.',$assoc_file_name))[0] . "_named.assoc"); -while () +while () { $line = $_; chomp $line; -- 2.34.1