From fec356b9c55f898d7196712cccb8ee92ffc4a4a2 Mon Sep 17 00:00:00 2001 From: elserj Date: Fri, 14 Aug 2009 21:18:55 +0000 Subject: [PATCH] Justin's initial import of his scripts svn path=/; revision=1 --- interactome_scripts/check_interact.pl | 59 +++ interactome_scripts/flowering_interaction.pl | 3 + interactome_scripts/interact.pl | 278 ++++++++++++ interactome_scripts/interact_perl_sort.pl | 306 +++++++++++++ interactome_scripts/stats_interact.pl | 103 +++++ interactome_scripts/stats_interact_Ath.pl | 103 +++++ interactome_scripts/tree_script.sh | 57 +++ server_admin_scripts/linkchecker.pl | 435 +++++++++++++++++++ 8 files changed, 1344 insertions(+) create mode 100755 interactome_scripts/check_interact.pl create mode 100755 interactome_scripts/flowering_interaction.pl create mode 100755 interactome_scripts/interact.pl create mode 100755 interactome_scripts/interact_perl_sort.pl create mode 100755 interactome_scripts/stats_interact.pl create mode 100755 interactome_scripts/stats_interact_Ath.pl create mode 100755 interactome_scripts/tree_script.sh create mode 100755 server_admin_scripts/linkchecker.pl diff --git a/interactome_scripts/check_interact.pl b/interactome_scripts/check_interact.pl new file mode 100755 index 0000000..9949a05 --- /dev/null +++ b/interactome_scripts/check_interact.pl @@ -0,0 +1,59 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +# print usage if other than two arguments to the script are found +if ($#ARGV != 1) { + print "usage: check_interact.pl Experimental_file Predicted_file\n"; + exit; +} + +# open the experimental data file for reading +open(exper_file,"$ARGV[0]") || die "Error: file '$ARGV[0]' can not be opened\n"; + +# add the data to an array, calculate the length and get rid of newlines +my @exper_array=; +my $exper_len=@exper_array; + + +close(exper_file); + +# do the same for the predicted file +open(predict_file, "$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n"; + +my @predict_array=; +my $predict_len=@predict_array; + +close(predict_file); + +# loop through each experimentally verified gene interaction +for (my $i=0;$i<$exper_len;$i++) { + my ($exper_gene, $exper_inter_type, $exper_inter) = split ("\t", $exper_array[$i]); + # get rid of line feed and carriage return characters + $exper_inter =~ s/\r//g; + $exper_inter =~ s/\n//g; + # Add prefixes to gene identifiers + $exper_gene = "LOC_" . $exper_gene; + $exper_inter = "LOC_" . $exper_inter; + # put the genes in alphabetical order so that order doesn't matter (predicted ones are done in interact.pl + # See comments in interact.pl for more info + if ($exper_gene gt $exper_inter) { + my $temp = $exper_gene; + $exper_gene=$exper_inter; + $exper_inter = $temp; + } + + # loop through each predicted gene interaction + for (my $j=0;$j<$predict_len;$j++) { + my ($predict_gene, $predict_inter_type, $predict_inter) = split ("\t", $predict_array[$j]); + $predict_inter =~ s/\r//g; + $predict_inter =~ s/\n//g; + + # if both the intereaction genes have matches in both files (experiment and predicted) + if ($exper_gene eq $predict_gene and $exper_inter eq $predict_inter) { + print "match found with $predict_gene -> $predict_inter\n"; + } + } +} + diff --git a/interactome_scripts/flowering_interaction.pl b/interactome_scripts/flowering_interaction.pl new file mode 100755 index 0000000..6b1d643 --- /dev/null +++ b/interactome_scripts/flowering_interaction.pl @@ -0,0 +1,3 @@ +#!/usr/bin/perl + + diff --git a/interactome_scripts/interact.pl b/interactome_scripts/interact.pl new file mode 100755 index 0000000..8172c96 --- /dev/null +++ b/interactome_scripts/interact.pl @@ -0,0 +1,278 @@ +#!/usr/bin/perl +# Originally written by Justin Elser 2/2/09 +# Version 0.95 - completed 2/19/09 +# Version 0.96 - still working on +# This file takes the two files that define the network and the orthologies and +# outputs a predicted interactome +# +# updated 2/19/09 to filter out duplicates A -> B = B -> A in the interactions +# +# 0.96 will add the ability to use a separate (optional) paralog list +use strict; +use warnings; + +# print usage if other than three arguments to the script are found +if ($#ARGV < 2 || $#ARGV > 3) { + print "usage: interact.pl Network_file Orthology_file Paralog_file(optional) Output_file\n"; + exit; +} + +my $network_file_name = $ARGV[0]; +my $ortho_file_name = $ARGV[1]; + +my $para_file_name = ""; +my $out_file_name = ""; +my $para = ""; + +if ($#ARGV ==3) { + $para_file_name = $ARGV[2]; + $out_file_name = $ARGV[3]; + $para=1; # true +}else{ + $out_file_name = $ARGV[2]; + $para=0; #false +} + +my %para_hash; + +# read in the paralogs first so it is not in the loop, but the hash is available to the loop +if ($para) { + + open(paralog_file, "$para_file_name") || die "Error: file '$para_file_name' can not be opened\n"; + + my $min_ident_para = 0; # set the threshold for paralog identities + while() { + my $entry = $_; + # strip off newline characters + $entry =~ s/\r//g; + $entry =~ s/\n//g; + # split the columns into 3 separate variables + my ($gene_1, $gene_2, $ident) = split("\t", $entry); + # skip if $ident is empty or Nan + next if(!defined($ident)); + next if($ident =~ /D/); + next if($ident eq ""); + next if($ident =~ /\%/); + # change gene ids to all caps + $gene_1 =~ tr/a-z/A-Z/; + $gene_2 =~ tr/a-z/A-Z/; + + # put the genes in ASCII order to help remove dupes + if($gene_1 gt $gene_2) { + my $temp = $gene_1; + $gene_1 = $gene_2; + $gene_2 = $temp; + } + + # only use paralogs with high confidence values + if($ident>=$min_ident_para) { + # most genes will show up many times, so push values for hash key + if(defined($para_hash{$gene_1})) { + my $oldhash = $para_hash{$gene_1}; + $oldhash->{'gene2'} = "$oldhash->{'gene2'}\t$gene_2";; + $para_hash{$gene_1} = $oldhash; + #print "$oldhash\n"; + }else{ + my %hash; + $hash{'gene1'} = $gene_1; + $hash{'gene2'} = $gene_2; + $para_hash{$gene_1} = \%hash; + } + } + + } + + close(paralog_file); + +} + +#while (my $key = each %para_hash) { +# my $value = $para_hash{$key}->{'gene2'}; +# print "$key\t$value\n"; +#} +#exit; + + + +# Set the minimum % identity to be used. Higher values mean higher confidence in +# orthology +#my $min_ident=33; +# loop through minimum identity, decreasing by 5 +for(my $min_ident=100; $min_ident>=0; $min_ident-=50) { + +# Two files are to be read as input. This one describes the interactions (network map) +open(network_file,"$network_file_name") || die "Error: file '$network_file_name' can not be opened\n"; + +# import the network map entries into a hash table +my %map_hash; +my @map_array; + +while(){ + my $entry = $_; + # strip off newline characters + $entry =~ s/\r//g; + $entry =~ s/\n//g; + # split the 3 columns into separate variables + my ($map_gene_id, $interaction_type, $interaction_gene) = split("\t", $entry); + # change gene ids to all caps + $map_gene_id =~ tr/a-z/A-Z/; + $interaction_gene =~ tr/a-z/A-Z/; + + + # some genes have multiple interactions + if(defined($map_hash{$map_gene_id})){ + my $oldhash = $map_hash{$map_gene_id}; + $oldhash->{'type'} = "$oldhash->{'type'}\t$interaction_type";; + $oldhash->{'inter'} = "$oldhash->{'inter'}\t$interaction_gene";; + $map_hash{$map_gene_id} = $oldhash; + }else{ + my %hash; + $hash{'map'} = $map_gene_id; + $hash{'type'} = $interaction_type; + $hash{'inter'} = $interaction_gene; + $map_hash{$map_gene_id} = \%hash; + push @map_array, $map_gene_id; + } +} + +# close file +close(network_file); + + +# This file gives the orthologs +open(ortho_file,"<$ortho_file_name") || die "Error: file '$ortho_file_name' can not be opened\n"; + +my %ortho_hash; +my @ortho_array; + +# import the ortholog entries into a hash table +while(){ + my $id1 = $_; + # strip off newline characters + $id1 =~ s/\r//g; + $id1 =~ s/\n//g; + # split the 3 columns into separate variables + my ($gene_id, $ortho_id, $ident_id) = split("\t", $id1); + # strip off gene id suffix and convert to all caps + $gene_id =~ s/\-TAIR\-G//g; + $gene_id =~ tr/a-z/A-Z/; + $ortho_id =~ tr/a-z/A-Z/; + # check to make sure the %identity is above the minimum defined above + if($ident_id =~ /\d/ && $ident_id>=$min_ident){ + # if gene_id already found, add the new ortholog to hash + if(defined($ortho_hash{$gene_id})){ + my $oldhash = $ortho_hash{$gene_id}; + $oldhash->{'ortho'} = "$oldhash->{'ortho'}\t$ortho_id";; + $ortho_hash{$gene_id} = $oldhash; + # if new gene_id, create hash element + }else{ + my %hash; + $hash{'ortho'} = $ortho_id; + $hash{'gene'} = $gene_id; + $ortho_hash{$gene_id} = \%hash; + push @ortho_array, $gene_id; + } + } +} +# close file +close(ortho_file); + +# open file for output +open(output_file,">$out_file_name" . "_$min_ident" . ".sif") || die "Error: file '$out_file_name' can not be opened for writing\n"; +my $out_file = "$out_file_name" . "_$min_ident" . ".sif"; + + +foreach my $id_entry (@map_array) { + my $inter_type=$map_hash{$id_entry}->{'type'}; + my $gene_map=$map_hash{$id_entry}->{'inter'}; + #print "$id_entry\t$gene_map\n"; + + # create array of interaction types + my @inter_array = split("\t", $inter_type); + + # create array of interaction genes + my @gene_map_array = split("\t", $gene_map); + + # check to see if ortholog exists for reference gene + if(exists($ortho_hash{$id_entry})) { + # create array of orthologs for reference gene + my @gene_ortho_array = split("\t", $ortho_hash{$id_entry}->{'ortho'}); + # find the length of the array from map_hash to be used in loop + my $max_index = @inter_array; + + # run for each ortholog for reference gene + foreach my $gene_ortho (@gene_ortho_array) { + # for each element of the arrays inter_array and gene_map_array + for (my $i=0; $i<$max_index; $i++) { + my $gene_map_ortho=$gene_map_array[$i]; + # check to make sure that an ortholog for the interaction gene is defined + if(exists($ortho_hash{$gene_map_ortho})) { + # create array of orthologs for interaction genes + my @gene_map_ortho_array = split("\t", $ortho_hash{$gene_map_ortho}->{'ortho'}); + # loop through each interaction gene + foreach my $gene_map_inter (@gene_map_ortho_array) { + # if using paralogs, loop through paralogs for both genes + if ($para) { + my @gene_1_array; + my @gene_2_array; + #print "$gene_ortho\n"; + if (exists($para_hash{$gene_ortho})) { + @gene_1_array = split("\t", $para_hash{$gene_ortho}->{'gene2'}); + #print "filling array 1\n"; + } + # this will add back the original gene to the beginning of the array + unshift(@gene_1_array, $gene_ortho); + #print "$gene_ortho\n array @gene_1_array\n"; + #print "$gene_ortho\t$gene_map_inter\t\t\t$gene_1_array[0]\t$gene_1_array[1]\n"; + + if (exists($para_hash{$gene_map_inter})) { + @gene_2_array = split("\t", $para_hash{$gene_map_inter}->{'gene2'}); + #print "filling array 2\n"; + } + unshift(@gene_2_array, $gene_map_inter); + #print "@gene_2_array\n"; + + foreach my $gene_1 (@gene_1_array) { + foreach my $gene_2 (@gene_2_array) { + if ($gene_1 gt $gene_2) { + my $temp = $gene_1; + $gene_1 = $gene_2; + $gene_2 = $temp; + } + printf output_file "$gene_1\t$inter_array[$i]\t$gene_2\n"; + #printf output_file "$gene_1\n"; + } + } + + } else { + # check to see if the two entries are in alphatical order + # if they are not, swap them so they are + # Note that if the interaction type depends on direction, this breaks that + if ($gene_ortho gt $gene_map_inter) { # if inter_type depends on direction, can extend the if statement to account for specific interactions to be skipped + my $temp = $gene_ortho; + $gene_ortho = $gene_map_inter; + $gene_map_inter = $temp; + } + + # print results to output file in sif format + printf output_file "$gene_ortho\t$inter_array[$i]\t$gene_map_inter\n"; + } + } + + } + } + } + } + +} + +print "min_ident = $min_ident\n"; + + +close(output_file); +# sort the file and get rid of duplicates +# Note: this requires replacing the file, hence the mv command +system "sort $out_file | uniq > $out_file.tmp; mv $out_file.tmp $out_file"; +# end loop of minimum identities ($min_ident) +} + diff --git a/interactome_scripts/interact_perl_sort.pl b/interactome_scripts/interact_perl_sort.pl new file mode 100755 index 0000000..c30cbf5 --- /dev/null +++ b/interactome_scripts/interact_perl_sort.pl @@ -0,0 +1,306 @@ +#!/usr/bin/perl +# +# +# ONLY USE THIS PROGRAM FOR SMALL INTERACTION CASES!!!!!!!!!!!!!! +# +# +# +# Originally written by Justin Elser 2/2/09 +# Version 0.95 - completed 2/19/09 +# Version 0.96 - still working on +# This file takes the two files that define the network and the orthologies and +# outputs a predicted interactome +# +# updated 2/19/09 to filter out duplicates A -> B = B -> A in the interactions +# +# this version of interact.pl will try to do the sort in perl using hashes, hopefully faster than the linux sort +# After some testing, it may be faster (don't know), but is horribly memory inefficient. Took over 4 GB of RAM for min_ident=50. +# Would totally kill a system to try to do min_ident=0. Maybe on a machine with >=64GB RAM? +# +# +# +# +# ONLY USE THIS PROGRAM FOR SMALL INTERACTION CASES!!!!!!!!!!!!!! +use strict; +use warnings; + +# print usage if other than three arguments to the script are found +if ($#ARGV < 2 || $#ARGV > 3) { + print "usage: interact_perl_sort.pl Network_file Orthology_file Paralog_file(optional) Output_file\n"; + exit; +} + +my $network_file_name = $ARGV[0]; +my $ortho_file_name = $ARGV[1]; + +my $para_file_name = ""; +my $out_file_name = ""; +my $para = ""; + +if ($#ARGV ==3) { + $para_file_name = $ARGV[2]; + $out_file_name = $ARGV[3]; + $para=1; # true +}else{ + $out_file_name = $ARGV[2]; + $para=0; #false +} + +my %para_hash; + +# read in the paralogs first so it is not in the loop, but the hash is available to the loop +if ($para) { + + open(paralog_file, "$para_file_name") || die "Error: file '$para_file_name' can not be opened\n"; + + my $min_ident_para = 0; # set the threshold for paralog identities + while() { + my $entry = $_; + # strip off newline characters + $entry =~ s/\r//g; + $entry =~ s/\n//g; + # split the columns into 3 separate variables + my ($gene_1, $gene_2, $ident) = split("\t", $entry); + # skip if $ident is empty or Nan + next if(!defined($ident)); + next if($ident =~ /D/); + next if($ident eq ""); + next if($ident =~ /\%/); + # change gene ids to all caps + $gene_1 =~ tr/a-z/A-Z/; + $gene_2 =~ tr/a-z/A-Z/; + + # put the genes in ASCII order to help remove dupes + if($gene_1 gt $gene_2) { + my $temp = $gene_1; + $gene_1 = $gene_2; + $gene_2 = $temp; + } + + # only use paralogs with high confidence values + if($ident>=$min_ident_para) { + # most genes will show up many times, so push values for hash key + if(defined($para_hash{$gene_1})) { + my $oldhash = $para_hash{$gene_1}; + $oldhash->{'gene2'} = "$oldhash->{'gene2'}\t$gene_2";; + $para_hash{$gene_1} = $oldhash; + #print "$oldhash\n"; + }else{ + my %hash; + $hash{'gene1'} = $gene_1; + $hash{'gene2'} = $gene_2; + $para_hash{$gene_1} = \%hash; + } + } + + } + + close(paralog_file); + +} + +#while (my $key = each %para_hash) { +# my $value = $para_hash{$key}->{'gene2'}; +# print "$key\t$value\n"; +#} +#exit; + +my %unsorted_hash; + + +# Set the minimum % identity to be used. Higher values mean higher confidence in +# orthology +#my $min_ident=33; +# loop through minimum identity, decreasing by 5 +for(my $min_ident=100; $min_ident>=0; $min_ident-=50) { + +# Two files are to be read as input. This one describes the interactions (network map) +open(network_file,"$network_file_name") || die "Error: file '$network_file_name' can not be opened\n"; + +# import the network map entries into a hash table +my %map_hash; +my @map_array; + +while(){ + my $entry = $_; + # strip off newline characters + $entry =~ s/\r//g; + $entry =~ s/\n//g; + # split the 3 columns into separate variables + my ($map_gene_id, $interaction_type, $interaction_gene) = split("\t", $entry); + # change gene ids to all caps + $map_gene_id =~ tr/a-z/A-Z/; + $interaction_gene =~ tr/a-z/A-Z/; + + + # some genes have multiple interactions + if(defined($map_hash{$map_gene_id})){ + my $oldhash = $map_hash{$map_gene_id}; + $oldhash->{'type'} = "$oldhash->{'type'}\t$interaction_type";; + $oldhash->{'inter'} = "$oldhash->{'inter'}\t$interaction_gene";; + $map_hash{$map_gene_id} = $oldhash; + }else{ + my %hash; + $hash{'map'} = $map_gene_id; + $hash{'type'} = $interaction_type; + $hash{'inter'} = $interaction_gene; + $map_hash{$map_gene_id} = \%hash; + push @map_array, $map_gene_id; + } +} + +# close file +close(network_file); + + +# This file gives the orthologs +open(ortho_file,"<$ortho_file_name") || die "Error: file '$ortho_file_name' can not be opened\n"; + +my %ortho_hash; +my @ortho_array; + +# import the ortholog entries into a hash table +while(){ + my $id1 = $_; + # strip off newline characters + $id1 =~ s/\r//g; + $id1 =~ s/\n//g; + # split the 3 columns into separate variables + my ($gene_id, $ortho_id, $ident_id) = split("\t", $id1); + # strip off gene id suffix and convert to all caps + $gene_id =~ s/\-TAIR\-G//g; + $gene_id =~ tr/a-z/A-Z/; + $ortho_id =~ tr/a-z/A-Z/; + # check to make sure the %identity is above the minimum defined above + if($ident_id =~ /\d/ && $ident_id>=$min_ident){ + # if gene_id already found, add the new ortholog to hash + if(defined($ortho_hash{$gene_id})){ + my $oldhash = $ortho_hash{$gene_id}; + $oldhash->{'ortho'} = "$oldhash->{'ortho'}\t$ortho_id";; + $ortho_hash{$gene_id} = $oldhash; + # if new gene_id, create hash element + }else{ + my %hash; + $hash{'ortho'} = $ortho_id; + $hash{'gene'} = $gene_id; + $ortho_hash{$gene_id} = \%hash; + push @ortho_array, $gene_id; + } + } +} +# close file +close(ortho_file); + +# open file for output +open(output_file,">$out_file_name" . "_$min_ident" . ".sif") || die "Error: file '$out_file_name' can not be opened for writing\n"; +my $out_file = "$out_file_name" . "_$min_ident" . ".sif"; + +my $line_num = 0; + +#my %sorted_hash; +foreach my $id_entry (@map_array) { + my $inter_type=$map_hash{$id_entry}->{'type'}; + my $gene_map=$map_hash{$id_entry}->{'inter'}; + #print "$id_entry\t$gene_map\n"; + + # create array of interaction types + my @inter_array = split("\t", $inter_type); + + # create array of interaction genes + my @gene_map_array = split("\t", $gene_map); + + # check to see if ortholog exists for reference gene + if(exists($ortho_hash{$id_entry})) { + # create array of orthologs for reference gene + my @gene_ortho_array = split("\t", $ortho_hash{$id_entry}->{'ortho'}); + # find the length of the array from map_hash to be used in loop + my $max_index = @inter_array; + + # run for each ortholog for reference gene + foreach my $gene_ortho (@gene_ortho_array) { + # for each element of the arrays inter_array and gene_map_array + for (my $i=0; $i<$max_index; $i++) { + my $gene_map_ortho=$gene_map_array[$i]; + # check to make sure that an ortholog for the interaction gene is defined + if(exists($ortho_hash{$gene_map_ortho})) { + # create array of orthologs for interaction genes + my @gene_map_ortho_array = split("\t", $ortho_hash{$gene_map_ortho}->{'ortho'}); + # loop through each interaction gene + foreach my $gene_map_inter (@gene_map_ortho_array) { + # if using paralogs, loop through paralogs for both genes + if ($para) { + my @gene_1_array; + my @gene_2_array; + #print "$gene_ortho\n"; + if (exists($para_hash{$gene_ortho})) { + @gene_1_array = split("\t", $para_hash{$gene_ortho}->{'gene2'}); + #print "filling array 1\n"; + } + # this will add back the original gene to the beginning of the array + unshift(@gene_1_array, $gene_ortho); + #print "$gene_ortho\n array @gene_1_array\n"; + #print "$gene_ortho\t$gene_map_inter\t\t\t$gene_1_array[0]\t$gene_1_array[1]\n"; + + if (exists($para_hash{$gene_map_inter})) { + @gene_2_array = split("\t", $para_hash{$gene_map_inter}->{'gene2'}); + #print "filling array 2\n"; + } + unshift(@gene_2_array, $gene_map_inter); + #print "@gene_2_array\n"; + + foreach my $gene_1 (@gene_1_array) { + foreach my $gene_2 (@gene_2_array) { + if ($gene_1 gt $gene_2) { + my $temp = $gene_1; + $gene_1 = $gene_2; + $gene_2 = $temp; + } + $unsorted_hash{$line_num} = "$gene_1\t$inter_array[$i]\t$gene_2"; + $line_num++; + #printf output_file "$gene_1\t$inter_array[$i]\t$gene_2\n"; + #printf output_file "$gene_1\n"; + } + } + + } else { + # check to see if the two entries are in alphatical order + # if they are not, swap them so they are + # Note that if the interaction type depends on direction, this breaks that + if ($gene_ortho gt $gene_map_inter) { # if inter_type depends on direction, can extend the if statement to account for specific interactions to be skipped + my $temp = $gene_ortho; + $gene_ortho = $gene_map_inter; + $gene_map_inter = $temp; + } + + $unsorted_hash{$line_num} = "$gene_ortho\t$inter_array[$i]\t$gene_map_inter"; + $line_num++; + # print results to output file in sif format + #printf output_file "$gene_ortho\t$inter_array[$i]\t$gene_map_inter\n"; + } + } + + } + } + } + } + +} + +print "min_ident = $min_ident\n"; + +foreach my $key (sort keys %unsorted_hash) { + my $value = $unsorted_hash{$key}; + print output_file "$value\n"; + } +print "done sorting\n"; + +%unsorted_hash = (); +close(output_file); +# sort the file and get rid of duplicates +# Note: this requires replacing the file, hence the mv command +#system "sort $out_file | uniq > $out_file.tmp; mv $out_file.tmp $out_file"; +# end loop of minimum identities ($min_ident) +} + +#sub by_ascii { $unsorted_hash{$a} <=> $unsorted_hash{$b} } + diff --git a/interactome_scripts/stats_interact.pl b/interactome_scripts/stats_interact.pl new file mode 100755 index 0000000..d8e9a86 --- /dev/null +++ b/interactome_scripts/stats_interact.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +############################################################## +# Written by Justin Elser 2/23/09 # +# Dept of BPP, Oregon State University # +# elserj@science.oregonstate.edu # +# Version 0.9 # +# this program is meant to be part of a pipeline including # +# interact.pl, therefore, it is written assuming the # +# input files are from that program (interact.pl) # +# # +# still to do----- # +# not all stats are computed yet # +############################################################## + +use strict; +use warnings; + +if ($#ARGV != 1) { + print "usage : stats_interact.pl prefix_for_in_files output_file\n"; + exit; +} +open(main_output_file, ">$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n"; +my $out_file = "$ARGV[1]"; + +printf main_output_file "Minimum identity\tNum of lines\tUnique gene count\tMax Gene IDs\t# max Gene\t# min genes\n"; + +for (my $min_ident=100; $min_ident>=0; $min_ident-=5) { + open (in_file_prefix, "$ARGV[0]" . "_$min_ident" . ".sif") || die "Error: file '$ARGV[0]' can not be opened\n"; + + open (output_file, ">$ARGV[1]" . "_$min_ident") || die "Error: file '$ARGV[1]' can not be opened\n"; + + + # start doing the stats + # earlier versions of the script did this using system calls to cat, grep, uniq, etc... + # implementing directly in perl has increased the performance of the program greatly + my $count = 0; # total number of unique interactions + my %seen = (); # hash that stores how many times each gene has been seen + while (my $line = ) { + $count++; + my ($gene_1,$inter,$gene_2) = split ("\t",$line); + chomp $gene_2; + if ($gene_1 eq $gene_2) { + $seen{"$gene_1"}++; + } + else { + $seen{"$gene_1"}++; + $seen{"$gene_2"}++; + } + } + my $uniq_gene_count = keys %seen; # gives the count of unique genes + + # this block populates the max and min lists. max_list holds the gene id(s) + # that comes up the most. min_list holds the gene ids that only show up once + my $max = undef; + my @max_list = undef; + my @min_list = (); + while( my ($key, $value) = each %seen ) { + if(!defined $max) { + $max = $value; + @max_list = ($key); + } elsif ($seen{$key} == $max) { + push @max_list, $key; + } elsif ($seen{$key} > $max) { + $max = $value; + @max_list = ($key); + } + if($value == 1) { + push @min_list, $key; + } + } + + my $min_count = @min_list; + + printf main_output_file "$min_ident\t$count\t$uniq_gene_count\t@max_list\t$max\t$min_count\n"; + + + + my %hash = (); + + # this block will calculate the number of genes that have occurred a given number of times + # for example, if there are 10 genes that occur 5 times, the key of this hash is 5, value is 10 + while( my ($key, $value) = each %seen ) { + $hash{"$value"}++; + } + + foreach my $key (sort by_number keys %hash) { # see the sub description below + my $value = $hash{$key}; + print output_file "$key\t$value\n"; + } + + + close(in_file_prefix); + close(output_file); + +} + +close(main_output_file); + +# the perl sort() sorts by ASCII value, not numerically, this subroutine fixes the sort +sub by_number { + if ($a < $b) {-1} elsif ($a > $b) {1} else {0} +} diff --git a/interactome_scripts/stats_interact_Ath.pl b/interactome_scripts/stats_interact_Ath.pl new file mode 100755 index 0000000..16b3806 --- /dev/null +++ b/interactome_scripts/stats_interact_Ath.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +############################################################## +# Written by Justin Elser 2/23/09 # +# Dept of BPP, Oregon State University # +# elserj@science.oregonstate.edu # +# Version 0.9 # +# this program is meant to be part of a pipeline including # +# interact.pl, therefore, it is written assuming the # +# input files are from that program (interact.pl) # +# # +# still to do----- # +# not all stats are computed yet # +############################################################## + +use strict; +use warnings; + +if ($#ARGV != 1) { + print "usage : stats_interact_Ath.pl prefix_for_in_files output_file\n"; + exit; +} +open(main_output_file, ">$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n"; +my $out_file = "$ARGV[1]"; + +printf main_output_file "Num of lines\tUnique gene count\tMax Gene IDs\t# max Gene\t# min genes\n"; + + + open (in_file_prefix, "$ARGV[0]") || die "Error: file '$ARGV[0]' can not be opened\n"; + + open (output_file, ">$ARGV[1]" . "_num") || die "Error: file '$ARGV[1]' can not be opened\n"; + + + # start doing the stats + # earlier versions of the script did this using system calls to cat, grep, uniq, etc... + # implementing directly in perl has increased the performance of the program greatly + my $count = 0; # total number of unique interactions + my %seen = (); # hash that stores how many times each gene has been seen + while (my $line = ) { + $count++; + my ($gene_1,$inter,$gene_2) = split ("\t",$line); + chomp $gene_2; + if ($gene_1 eq $gene_2) { + $seen{"$gene_1"}++; + } + else { + $seen{"$gene_1"}++; + $seen{"$gene_2"}++; + } + } + my $uniq_gene_count = keys %seen; # gives the count of unique genes + + # this block populates the max and min lists. max_list holds the gene id(s) + # that comes up the most. min_list holds the gene ids that only show up once + my $max = undef; + my @max_list = undef; + my @min_list = (); + while( my ($key, $value) = each %seen ) { + if(!defined $max) { + $max = $value; + @max_list = ($key); + } elsif ($seen{$key} == $max) { + push @max_list, $key; + } elsif ($seen{$key} > $max) { + $max = $value; + @max_list = ($key); + } + if($value == 1) { + push @min_list, $key; + } + } + + my $min_count = @min_list; + + printf main_output_file "$count\t$uniq_gene_count\t@max_list\t$max\t$min_count\n"; + + + + my %hash = (); + + # this block will calculate the number of genes that have occurred a given number of times + # for example, if there are 10 genes that occur 5 times, the key of this hash is 5, value is 10 + while( my ($key, $value) = each %seen ) { + $hash{"$value"}++; + } + + foreach my $key (sort by_number keys %hash) { # see the sub description below + my $value = $hash{$key}; + print output_file "$key\t$value\n"; + } + + + close(in_file_prefix); + close(output_file); + + + +close(main_output_file); + +# the perl sort() sorts by ASCII value, not numerically, this subroutine fixes the sort +sub by_number { + if ($a < $b) {-1} elsif ($a > $b) {1} else {0} +} diff --git a/interactome_scripts/tree_script.sh b/interactome_scripts/tree_script.sh new file mode 100755 index 0000000..0269100 --- /dev/null +++ b/interactome_scripts/tree_script.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +#[ -r $1 ] || [ $2="rice" ] || (echo "usage: $0 input_file rice/Ath" exit) + +if [ ! -r $1 ] +then + echo "usage: $0 input_file rice/Ath" + exit +fi + +case $2 in +Ath) +alphas=( At2g23070 At3g50000 At5g67380 At2g23080 ) +betas=( At4g17640 At5g47080 At3g60250 At2g44680 ) +;; +rice) +alphas=( LOC_Os03g55490 LOC_Os03g10940 LOC_Os03g55389 LOC_Os07g02350 ) +betas=( LOC_Os10g41520 LOC_Os07g31280 ) +;; +*) +echo "usage: $0 input_file rice/Ath" +exit +;; +esac + +first=1 +for id in ${alphas[@]} +do + if [ $first -eq 1 ] + then + first=0 + alphagrep="$id" + else + alphagrep="$echo $alphagrep|$id" + fi +done + +first=1 +for id in ${betas[@]} +do + if [ $first -eq 1 ] + then + first=0 + betagrep="$id" + else + betagrep="$echo $betagrep|$id" + fi +done + +case $2 in +Ath) +egrep -i ""$alphagrep"" $1 | egrep -i ""$betagrep"" | sed "s/${alphas[0]}/${alphas[0]}_alpha/gi" | sed "s/${alphas[1]}/${alphas[1]}_alpha/gi" | sed "s/${alphas[2]}/${alphas[2]}_alpha/gi" | sed "s/${alphas[3]}/${alphas[3]}_alpha/gi" | sed "s/${betas[0]}/${betas[0]}_beta/gi" | sed "s/${betas[1]}/${betas[1]}_beta/gi" | sed "s/${betas[2]}/${betas[2]}_beta/gi" | sed "s/${betas[3]}/${betas[3]}_beta/gi" +;; +rice) +egrep -i ""$alphagrep"" $1 | egrep -i ""$betagrep"" | sed "s/${alphas[0]}/${alphas[0]}_alpha/gi" | sed "s/${alphas[1]}/${alphas[1]}_alpha/gi" | sed "s/${alphas[2]}/${alphas[2]}_alpha/gi" | sed "s/${alphas[3]}/${alphas[3]}_alpha/gi" | sed "s/${betas[0]}/${betas[0]}_beta/gi" | sed "s/${betas[1]}/${betas[1]}_beta/gi" +;; +esac diff --git a/server_admin_scripts/linkchecker.pl b/server_admin_scripts/linkchecker.pl new file mode 100755 index 0000000..1b42ca0 --- /dev/null +++ b/server_admin_scripts/linkchecker.pl @@ -0,0 +1,435 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use English qw( -no_match_vars ); +use File::Basename; +use Getopt::Long; +use Gramene::Config; +use HTML::Lint; +use Mail::Sendmail; +use Pod::Usage; +use Readonly; +use WWW::Mechanize::Timed; +use YAML qw(Load); + +$| = 1; + +Readonly my $VERSION + => sprintf '%d.%02d', qq$Revision: 1.14 $ =~ /(\d+)\.(\d+)/; + +my %parents = (); +my %failed = (); +my %previously_seen = (); + +my $cfile = Gramene::Config->new; +my %options = ( + 'noisy' => 0, + 'validate' => 0, + 'send_email' => 0, + 'site' => $cfile->get('link_checker')->{'site'}, + 'man' => 0, + 'version' => 0, + 'email_from' => $cfile->get('link_checker')->{'email_from'}, + 'email_to' => $cfile->get('link_checker')->{'email_to'}, + 'depth' => 10_000_000, #stop checking after a depth of 10,000,000 (essentially, "don't stop") + 'content_test_file' => $cfile->get('link_checker')->{'content_test_file'}, + 'white_list_file' => $cfile->get('link_checker')->{'white_list_file'}, + 'black_list_file' => $cfile->get('link_checker')->{'black_list_file'}, +); + +GetOptions( + \%options, + 'noisy', + 'validate', + 'send_email', + 'to_console', + 'site=s', + 'email_from=s', + 'email_to=s', + 'content_test_file=s', + 'white_list_file=s', + 'black_list_file=s', + 'depth=i', + 'help', + 'man', + 'version', +); + +if ( $options{'help'} || $options{'man'} ) { + pod2usage({ + -exitval => 0, + -verbose => $options{'man'} ? 2 : 1 + }); +}; + +if ( $options{'version'} ) { + my $prog = basename( $PROGRAM_NAME ); + print "$prog v$VERSION\n"; + exit 0; +} +my ($global_tests, $url_tests) = ({}, {}); + +if ( $options{'content_test_file'} ) { + open (my $test_fh, "<", $options{'content_test_file'}) or die "Could not open $options{'content_test_file'}"; + local $/ = undef; + my $file_contents = <$test_fh>; + close $test_fh; + ($global_tests, $url_tests) = Load($file_contents); +} + +my ($noisy, $validate, $send_email, $site, $from, $to, $depth) = @options{qw(noisy validate send_email site email_from email_to depth)}; + +if ( $site !~ m{^http://} ) { + $site = 'http://' . $site; +} + +$site = fix_link($site); + +my $white_list = {}; +my $black_list = {}; +foreach my $set ((['white_list_file', $white_list], ['black_list_file', $black_list])) { + my ($option, $list) = @$set; + if ( $options{$option} ) { + open (my $list_fh, "<", $options{$option}) or die "Could not open $options{$option}"; + local $/ = undef; + my $file_contents = <$list_fh>; + close $list_fh; + my $url_array = Load($file_contents); + %$list = map {($_, 1)} map {/^http/ ? $_ : $site . $_} @$url_array; + + } +} + +my $message = undef; +my (@slow, @invalid, @failed, @content_errors); + + +my @links = ([$site, $depth]); + +while (@links) { + + my $set = shift @links; + + my ($link, $depth) = @$set; + + #make any formatting mods we want to the link + $link = fix_link($link); + + next if $depth <= 0; + next if ! traceable_url($link); + will_grab_link($link); + + push @links, grab_links($link, $depth - 1); + +} + +# failures are a special case. We need to format at the end, so we can ensure that we include all parents that reference this link as a failure. +if (@failed) { + foreach my $link (@failed) { + $link = format_failed_link($link); + } +} + +if (@slow || @invalid || @failed || @content_errors) { + @slow = ("No slow links found\n") if !@slow; + @failed = ("No failed found\n") if !@failed; + @content_errors = ("No content errors found\n") if ! @content_errors; + $message .= "Slow links:\n===========\n" . join('', @slow) . "\n"; + $message .= "Failed links:\n=============\n" . join('', @failed) . "\n"; + $message .= "Validation errors:\n================\n" . join('', @invalid) . "\n" if @invalid; + $message .= "Content errors:\n================\n" . join('', @content_errors) . "\n" if @content_errors; +} +else { + $message = "No problems found"; +} + +#if we want to send the message AND we actually have one to send AND we have somewhere to send it, then send it... +if ($send_email && $message && $to && $from) { + my %mail = ( + 'To' => $to, + 'From' => $from, + 'Subject' => "Gramene broken links on $site at " . localtime, + 'Message' => $message, + ); + + sendmail(%mail) or die $Mail::Sendmail::error; +} +else { + print "$message\n"; +} + +# this function has been modified. Instead of a list of links, it now returns a list of arrayrefs. +# the first element is the link, and the second is the depth at which it was found. +# if the depth hits 0, we stop +sub grab_links { + my $link = shift or die "Cannot grab links w/o url"; + my $depth = shift; + + my $mech = WWW::Mechanize::Timed->new(); + + return () if previously_seen_url($link); + + $mech->get($link); + + my $time = $mech->client_response_receive_time; + + link_is_slow($link, $time) if is_slow($time); + + if ($mech->status =~ /^2/) { + + if ( $mech->is_html ) { + my $lint = HTML::Lint->new(); + $lint->parse( $mech->content ); + + validation_errors($link, $lint->errors) if $lint->errors; + + if ($options{'content_test_file'}) { + check_contents($link, $mech->content); + } + + } + + my @links = map {$_->URI->abs} $mech->links; + foreach my $child (@links) { + $child = fix_link($child); + push @{$parents{$child}}, $link; + } + return map {[ $_, $depth - 1]} @links; + } + else { + $failed{$link} = $mech->status; + failed_link($link); + return (); + } +} + +sub fix_link { + my $link = shift; + + return $link if $white_list->{$link}; + + #ice any anchors. For our purposes, they're superfluous. + $link =~ s/#.+//; + #nuke query strings, too. To try and cut down on links traversed. + $link =~ s/\?.+//; + #and drop any trailing slashes + $link =~ s!/$!!; + + return $link; +} + +# Returns a boolean value whether we think a link is slow or not. + +sub is_slow { + my $time = shift || 0; + + return $time > 2; +} + +# These are the URLs that we assume we want to follow. + +sub traceable_url { + my $link = shift; + + return 0 if + $link !~ m!$site! + || $link =~ /^javascript:/ + || $link !~ /http/ + || $black_list->{$link} + #|| $previously_seen{$link}++ + ; + + return 1; +} + +sub previously_seen_url { + my $link = shift; + return $previously_seen{$link}++; +} + +#the various IO functions +sub will_grab_link { + my $link = shift; + + my $output = "Grabbing ($link)\n"; + + print STDERR $output if $noisy; + + #$message .= $output if $validate; +} + +sub link_is_slow { + + my $link = shift; + my $time = shift; + + my $output = "Slow link: $link - took $time seconds\n\n"; + + push @slow, $output; + + print STDERR $output if $noisy; +} + +sub validation_errors { + + my $page = shift; + my @errors = @_; + + my $output = "Validation errors on page $page\n"; + + foreach my $error (@errors) { + $output .= "\t" . $error->as_string . "\n"; + } + + $output .= "\n"; + + if ($validate) { + push @invalid, $output; + } + + print STDERR $output if $noisy && $validate; +} + +sub failed_link { + my $link = shift; + + push @failed, $link; + + print STDERR format_failed_link($link) if $noisy; +} + +sub format_failed_link { + my $link = shift; + my $output = "FAILED $link: " . $failed{$link} . "\n"; + $output .= "linked from:\n"; + my $seen = {}; + foreach my $parent (grep {! $seen->{$_}++} @{$parents{$link}}) { + $output .= "\t$parent\n"; + } + + $output .= "\n"; + + return $output; +} + +sub check_contents { + my $link = shift; + my $content = shift; + + my @should_find = @{ $global_tests->{'valid'} }; + my @should_not_find = @{ $global_tests->{'invalid'} }; + + if ($url_tests->{$link}) { + push @should_find, @{ $url_tests->{$link}->{'valid'} }; + push @should_not_find, @{ $url_tests->{$link}->{'invalid'} }; + } + + my @return = (); + + foreach my $should (@should_find) { + if ($content !~ /$should/) { + push @return, " Could not find text: $should"; + } + } + + foreach my $shouldnt (@should_not_find) { + if ($content =~ /$shouldnt/) { + push @return, " Incorrectly found text: $shouldnt"; + } + } + + if (@return) { + my $output = join("\n", "CONTENT ERROR: $link", @return) . "\n"; + push @content_errors, $output; + print STDERR $output if $noisy; + } + +} +__END__ + +# ---------------------------------------------------- + +=head1 NAME + +linkchecker.pl - check links on a website + +=head1 VERSION + +This documentation refers to linkchecker.pl version $Revision: 1.14 $ + +=head1 SYNOPSIS + + linkchecker.pl [options] + +Options: + + --noisy Prints out progress to STDERR as the program + is running. Defaults to 0 + --validate Includes HTML validation errors. Defaults to 0. + WARNING - this can produce a lot of output + --send_email Will send out the results report to the address + specified in email_to in the linkchecker section of + the config file If this flag isn't set, the final + report will be printed to the console instead. This + flag should be set in the crontab + --email_from The email address the report will be sent from. + Defaults to the address in the config file + --email_to The email address the report will be sent to. + This is usually a group mailing list. + Defaults to the address in the config file + --site The site you want to run the checker on. Defaults + to the site in the link_checker section of the config + --depth depth to follow links to, based off of the front page. + --depth 2 follows links two deep, --depth 1 one, and so on. + file + --content_test_file This is a YAML file that contains information on text + that you should or should NOT find at the given URL. Reads + the file from the config file by default. See full docs for YAML + file format. + --white_list_file URLs are normally cleaned up a little bit. Anchors are dropped, + query strings are dropped, and so on. The white list file is a YAML + file that lets us provide links that should be viewed as is. + --black_list_file a YAML file containing URLs that we don't want to visit on the + site, for whatever reason. + --help Show brief help and exit + --man Show full documentation + --version Show version and exit + +=head1 DESCRIPTION + +Checks your links. By default, runs against the site listed in the +link_checker section of the gramene config file. + +=head1 AUTHOR + +Jim Thomason Ethomason@cshl.eduE. + +=head1 EXAMPLE YAML +--- +valid: + - Gramene +invalid: + - Error +--- +http://dev.gramene.org : + valid: + - Release #29 + - Gramene + invalid: + - Release #28 + - Gramene Sucks + +The first YAML doc section contains global values. Things we should find on every +or no page. The second doc section refers to specific content at individual URLs. + +See http://www.yaml.org/ for information on YAML docs. + +=head1 COPYRIGHT + +Copyright (c) 2007-2009 Cold Spring Harbor Laboratory + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=cut -- 2.34.1