--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# print usage if other than two arguments to the script are found
+if ($#ARGV != 1) {
+ print "usage: check_interact.pl Experimental_file Predicted_file\n";
+ exit;
+}
+
+# open the experimental data file for reading
+open(exper_file,"$ARGV[0]") || die "Error: file '$ARGV[0]' can not be opened\n";
+
+# add the data to an array, calculate the length and get rid of newlines
+my @exper_array=<exper_file>;
+my $exper_len=@exper_array;
+
+
+close(exper_file);
+
+# do the same for the predicted file
+open(predict_file, "$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n";
+
+my @predict_array=<predict_file>;
+my $predict_len=@predict_array;
+
+close(predict_file);
+
+# loop through each experimentally verified gene interaction
+for (my $i=0;$i<$exper_len;$i++) {
+ my ($exper_gene, $exper_inter_type, $exper_inter) = split ("\t", $exper_array[$i]);
+ # get rid of line feed and carriage return characters
+ $exper_inter =~ s/\r//g;
+ $exper_inter =~ s/\n//g;
+ # Add prefixes to gene identifiers
+ $exper_gene = "LOC_" . $exper_gene;
+ $exper_inter = "LOC_" . $exper_inter;
+ # put the genes in alphabetical order so that order doesn't matter (predicted ones are done in interact.pl
+ # See comments in interact.pl for more info
+ if ($exper_gene gt $exper_inter) {
+ my $temp = $exper_gene;
+ $exper_gene=$exper_inter;
+ $exper_inter = $temp;
+ }
+
+ # loop through each predicted gene interaction
+ for (my $j=0;$j<$predict_len;$j++) {
+ my ($predict_gene, $predict_inter_type, $predict_inter) = split ("\t", $predict_array[$j]);
+ $predict_inter =~ s/\r//g;
+ $predict_inter =~ s/\n//g;
+
+ # if both the intereaction genes have matches in both files (experiment and predicted)
+ if ($exper_gene eq $predict_gene and $exper_inter eq $predict_inter) {
+ print "match found with $predict_gene -> $predict_inter\n";
+ }
+ }
+}
+
--- /dev/null
+#!/usr/bin/perl
+
+
--- /dev/null
+#!/usr/bin/perl
+# Originally written by Justin Elser 2/2/09
+# Version 0.95 - completed 2/19/09
+# Version 0.96 - still working on
+# This file takes the two files that define the network and the orthologies and
+# outputs a predicted interactome
+#
+# updated 2/19/09 to filter out duplicates A -> B = B -> A in the interactions
+#
+# 0.96 will add the ability to use a separate (optional) paralog list
+use strict;
+use warnings;
+
+# print usage if other than three arguments to the script are found
+if ($#ARGV < 2 || $#ARGV > 3) {
+ print "usage: interact.pl Network_file Orthology_file Paralog_file(optional) Output_file\n";
+ exit;
+}
+
+my $network_file_name = $ARGV[0];
+my $ortho_file_name = $ARGV[1];
+
+my $para_file_name = "";
+my $out_file_name = "";
+my $para = "";
+
+if ($#ARGV ==3) {
+ $para_file_name = $ARGV[2];
+ $out_file_name = $ARGV[3];
+ $para=1; # true
+}else{
+ $out_file_name = $ARGV[2];
+ $para=0; #false
+}
+
+my %para_hash;
+
+# read in the paralogs first so it is not in the loop, but the hash is available to the loop
+if ($para) {
+
+ open(paralog_file, "$para_file_name") || die "Error: file '$para_file_name' can not be opened\n";
+
+ my $min_ident_para = 0; # set the threshold for paralog identities
+ while(<paralog_file>) {
+ my $entry = $_;
+ # strip off newline characters
+ $entry =~ s/\r//g;
+ $entry =~ s/\n//g;
+ # split the columns into 3 separate variables
+ my ($gene_1, $gene_2, $ident) = split("\t", $entry);
+ # skip if $ident is empty or Nan
+ next if(!defined($ident));
+ next if($ident =~ /D/);
+ next if($ident eq "");
+ next if($ident =~ /\%/);
+ # change gene ids to all caps
+ $gene_1 =~ tr/a-z/A-Z/;
+ $gene_2 =~ tr/a-z/A-Z/;
+
+ # put the genes in ASCII order to help remove dupes
+ if($gene_1 gt $gene_2) {
+ my $temp = $gene_1;
+ $gene_1 = $gene_2;
+ $gene_2 = $temp;
+ }
+
+ # only use paralogs with high confidence values
+ if($ident>=$min_ident_para) {
+ # most genes will show up many times, so push values for hash key
+ if(defined($para_hash{$gene_1})) {
+ my $oldhash = $para_hash{$gene_1};
+ $oldhash->{'gene2'} = "$oldhash->{'gene2'}\t$gene_2";;
+ $para_hash{$gene_1} = $oldhash;
+ #print "$oldhash\n";
+ }else{
+ my %hash;
+ $hash{'gene1'} = $gene_1;
+ $hash{'gene2'} = $gene_2;
+ $para_hash{$gene_1} = \%hash;
+ }
+ }
+
+ }
+
+ close(paralog_file);
+
+}
+
+#while (my $key = each %para_hash) {
+# my $value = $para_hash{$key}->{'gene2'};
+# print "$key\t$value\n";
+#}
+#exit;
+
+
+
+# Set the minimum % identity to be used. Higher values mean higher confidence in
+# orthology
+#my $min_ident=33;
+# loop through minimum identity, decreasing by 5
+for(my $min_ident=100; $min_ident>=0; $min_ident-=50) {
+
+# Two files are to be read as input. This one describes the interactions (network map)
+open(network_file,"$network_file_name") || die "Error: file '$network_file_name' can not be opened\n";
+
+# import the network map entries into a hash table
+my %map_hash;
+my @map_array;
+
+while(<network_file>){
+ my $entry = $_;
+ # strip off newline characters
+ $entry =~ s/\r//g;
+ $entry =~ s/\n//g;
+ # split the 3 columns into separate variables
+ my ($map_gene_id, $interaction_type, $interaction_gene) = split("\t", $entry);
+ # change gene ids to all caps
+ $map_gene_id =~ tr/a-z/A-Z/;
+ $interaction_gene =~ tr/a-z/A-Z/;
+
+
+ # some genes have multiple interactions
+ if(defined($map_hash{$map_gene_id})){
+ my $oldhash = $map_hash{$map_gene_id};
+ $oldhash->{'type'} = "$oldhash->{'type'}\t$interaction_type";;
+ $oldhash->{'inter'} = "$oldhash->{'inter'}\t$interaction_gene";;
+ $map_hash{$map_gene_id} = $oldhash;
+ }else{
+ my %hash;
+ $hash{'map'} = $map_gene_id;
+ $hash{'type'} = $interaction_type;
+ $hash{'inter'} = $interaction_gene;
+ $map_hash{$map_gene_id} = \%hash;
+ push @map_array, $map_gene_id;
+ }
+}
+
+# close file
+close(network_file);
+
+
+# This file gives the orthologs
+open(ortho_file,"<$ortho_file_name") || die "Error: file '$ortho_file_name' can not be opened\n";
+
+my %ortho_hash;
+my @ortho_array;
+
+# import the ortholog entries into a hash table
+while(<ortho_file>){
+ my $id1 = $_;
+ # strip off newline characters
+ $id1 =~ s/\r//g;
+ $id1 =~ s/\n//g;
+ # split the 3 columns into separate variables
+ my ($gene_id, $ortho_id, $ident_id) = split("\t", $id1);
+ # strip off gene id suffix and convert to all caps
+ $gene_id =~ s/\-TAIR\-G//g;
+ $gene_id =~ tr/a-z/A-Z/;
+ $ortho_id =~ tr/a-z/A-Z/;
+ # check to make sure the %identity is above the minimum defined above
+ if($ident_id =~ /\d/ && $ident_id>=$min_ident){
+ # if gene_id already found, add the new ortholog to hash
+ if(defined($ortho_hash{$gene_id})){
+ my $oldhash = $ortho_hash{$gene_id};
+ $oldhash->{'ortho'} = "$oldhash->{'ortho'}\t$ortho_id";;
+ $ortho_hash{$gene_id} = $oldhash;
+ # if new gene_id, create hash element
+ }else{
+ my %hash;
+ $hash{'ortho'} = $ortho_id;
+ $hash{'gene'} = $gene_id;
+ $ortho_hash{$gene_id} = \%hash;
+ push @ortho_array, $gene_id;
+ }
+ }
+}
+# close file
+close(ortho_file);
+
+# open file for output
+open(output_file,">$out_file_name" . "_$min_ident" . ".sif") || die "Error: file '$out_file_name' can not be opened for writing\n";
+my $out_file = "$out_file_name" . "_$min_ident" . ".sif";
+
+
+foreach my $id_entry (@map_array) {
+ my $inter_type=$map_hash{$id_entry}->{'type'};
+ my $gene_map=$map_hash{$id_entry}->{'inter'};
+ #print "$id_entry\t$gene_map\n";
+
+ # create array of interaction types
+ my @inter_array = split("\t", $inter_type);
+
+ # create array of interaction genes
+ my @gene_map_array = split("\t", $gene_map);
+
+ # check to see if ortholog exists for reference gene
+ if(exists($ortho_hash{$id_entry})) {
+ # create array of orthologs for reference gene
+ my @gene_ortho_array = split("\t", $ortho_hash{$id_entry}->{'ortho'});
+ # find the length of the array from map_hash to be used in loop
+ my $max_index = @inter_array;
+
+ # run for each ortholog for reference gene
+ foreach my $gene_ortho (@gene_ortho_array) {
+ # for each element of the arrays inter_array and gene_map_array
+ for (my $i=0; $i<$max_index; $i++) {
+ my $gene_map_ortho=$gene_map_array[$i];
+ # check to make sure that an ortholog for the interaction gene is defined
+ if(exists($ortho_hash{$gene_map_ortho})) {
+ # create array of orthologs for interaction genes
+ my @gene_map_ortho_array = split("\t", $ortho_hash{$gene_map_ortho}->{'ortho'});
+ # loop through each interaction gene
+ foreach my $gene_map_inter (@gene_map_ortho_array) {
+ # if using paralogs, loop through paralogs for both genes
+ if ($para) {
+ my @gene_1_array;
+ my @gene_2_array;
+ #print "$gene_ortho\n";
+ if (exists($para_hash{$gene_ortho})) {
+ @gene_1_array = split("\t", $para_hash{$gene_ortho}->{'gene2'});
+ #print "filling array 1\n";
+ }
+ # this will add back the original gene to the beginning of the array
+ unshift(@gene_1_array, $gene_ortho);
+ #print "$gene_ortho\n array @gene_1_array\n";
+ #print "$gene_ortho\t$gene_map_inter\t\t\t$gene_1_array[0]\t$gene_1_array[1]\n";
+
+ if (exists($para_hash{$gene_map_inter})) {
+ @gene_2_array = split("\t", $para_hash{$gene_map_inter}->{'gene2'});
+ #print "filling array 2\n";
+ }
+ unshift(@gene_2_array, $gene_map_inter);
+ #print "@gene_2_array\n";
+
+ foreach my $gene_1 (@gene_1_array) {
+ foreach my $gene_2 (@gene_2_array) {
+ if ($gene_1 gt $gene_2) {
+ my $temp = $gene_1;
+ $gene_1 = $gene_2;
+ $gene_2 = $temp;
+ }
+ printf output_file "$gene_1\t$inter_array[$i]\t$gene_2\n";
+ #printf output_file "$gene_1\n";
+ }
+ }
+
+ } else {
+ # check to see if the two entries are in alphatical order
+ # if they are not, swap them so they are
+ # Note that if the interaction type depends on direction, this breaks that
+ if ($gene_ortho gt $gene_map_inter) { # if inter_type depends on direction, can extend the if statement to account for specific interactions to be skipped
+ my $temp = $gene_ortho;
+ $gene_ortho = $gene_map_inter;
+ $gene_map_inter = $temp;
+ }
+
+ # print results to output file in sif format
+ printf output_file "$gene_ortho\t$inter_array[$i]\t$gene_map_inter\n";
+ }
+ }
+
+ }
+ }
+ }
+ }
+
+}
+
+print "min_ident = $min_ident\n";
+
+
+close(output_file);
+# sort the file and get rid of duplicates
+# Note: this requires replacing the file, hence the mv command
+system "sort $out_file | uniq > $out_file.tmp; mv $out_file.tmp $out_file";
+# end loop of minimum identities ($min_ident)
+}
+
--- /dev/null
+#!/usr/bin/perl
+#
+#
+# ONLY USE THIS PROGRAM FOR SMALL INTERACTION CASES!!!!!!!!!!!!!!
+#
+#
+#
+# Originally written by Justin Elser 2/2/09
+# Version 0.95 - completed 2/19/09
+# Version 0.96 - still working on
+# This file takes the two files that define the network and the orthologies and
+# outputs a predicted interactome
+#
+# updated 2/19/09 to filter out duplicates A -> B = B -> A in the interactions
+#
+# this version of interact.pl will try to do the sort in perl using hashes, hopefully faster than the linux sort
+# After some testing, it may be faster (don't know), but is horribly memory inefficient. Took over 4 GB of RAM for min_ident=50.
+# Would totally kill a system to try to do min_ident=0. Maybe on a machine with >=64GB RAM?
+#
+#
+#
+#
+# ONLY USE THIS PROGRAM FOR SMALL INTERACTION CASES!!!!!!!!!!!!!!
+use strict;
+use warnings;
+
+# print usage if other than three arguments to the script are found
+if ($#ARGV < 2 || $#ARGV > 3) {
+ print "usage: interact_perl_sort.pl Network_file Orthology_file Paralog_file(optional) Output_file\n";
+ exit;
+}
+
+my $network_file_name = $ARGV[0];
+my $ortho_file_name = $ARGV[1];
+
+my $para_file_name = "";
+my $out_file_name = "";
+my $para = "";
+
+if ($#ARGV ==3) {
+ $para_file_name = $ARGV[2];
+ $out_file_name = $ARGV[3];
+ $para=1; # true
+}else{
+ $out_file_name = $ARGV[2];
+ $para=0; #false
+}
+
+my %para_hash;
+
+# read in the paralogs first so it is not in the loop, but the hash is available to the loop
+if ($para) {
+
+ open(paralog_file, "$para_file_name") || die "Error: file '$para_file_name' can not be opened\n";
+
+ my $min_ident_para = 0; # set the threshold for paralog identities
+ while(<paralog_file>) {
+ my $entry = $_;
+ # strip off newline characters
+ $entry =~ s/\r//g;
+ $entry =~ s/\n//g;
+ # split the columns into 3 separate variables
+ my ($gene_1, $gene_2, $ident) = split("\t", $entry);
+ # skip if $ident is empty or Nan
+ next if(!defined($ident));
+ next if($ident =~ /D/);
+ next if($ident eq "");
+ next if($ident =~ /\%/);
+ # change gene ids to all caps
+ $gene_1 =~ tr/a-z/A-Z/;
+ $gene_2 =~ tr/a-z/A-Z/;
+
+ # put the genes in ASCII order to help remove dupes
+ if($gene_1 gt $gene_2) {
+ my $temp = $gene_1;
+ $gene_1 = $gene_2;
+ $gene_2 = $temp;
+ }
+
+ # only use paralogs with high confidence values
+ if($ident>=$min_ident_para) {
+ # most genes will show up many times, so push values for hash key
+ if(defined($para_hash{$gene_1})) {
+ my $oldhash = $para_hash{$gene_1};
+ $oldhash->{'gene2'} = "$oldhash->{'gene2'}\t$gene_2";;
+ $para_hash{$gene_1} = $oldhash;
+ #print "$oldhash\n";
+ }else{
+ my %hash;
+ $hash{'gene1'} = $gene_1;
+ $hash{'gene2'} = $gene_2;
+ $para_hash{$gene_1} = \%hash;
+ }
+ }
+
+ }
+
+ close(paralog_file);
+
+}
+
+#while (my $key = each %para_hash) {
+# my $value = $para_hash{$key}->{'gene2'};
+# print "$key\t$value\n";
+#}
+#exit;
+
+my %unsorted_hash;
+
+
+# Set the minimum % identity to be used. Higher values mean higher confidence in
+# orthology
+#my $min_ident=33;
+# loop through minimum identity, decreasing by 5
+for(my $min_ident=100; $min_ident>=0; $min_ident-=50) {
+
+# Two files are to be read as input. This one describes the interactions (network map)
+open(network_file,"$network_file_name") || die "Error: file '$network_file_name' can not be opened\n";
+
+# import the network map entries into a hash table
+my %map_hash;
+my @map_array;
+
+while(<network_file>){
+ my $entry = $_;
+ # strip off newline characters
+ $entry =~ s/\r//g;
+ $entry =~ s/\n//g;
+ # split the 3 columns into separate variables
+ my ($map_gene_id, $interaction_type, $interaction_gene) = split("\t", $entry);
+ # change gene ids to all caps
+ $map_gene_id =~ tr/a-z/A-Z/;
+ $interaction_gene =~ tr/a-z/A-Z/;
+
+
+ # some genes have multiple interactions
+ if(defined($map_hash{$map_gene_id})){
+ my $oldhash = $map_hash{$map_gene_id};
+ $oldhash->{'type'} = "$oldhash->{'type'}\t$interaction_type";;
+ $oldhash->{'inter'} = "$oldhash->{'inter'}\t$interaction_gene";;
+ $map_hash{$map_gene_id} = $oldhash;
+ }else{
+ my %hash;
+ $hash{'map'} = $map_gene_id;
+ $hash{'type'} = $interaction_type;
+ $hash{'inter'} = $interaction_gene;
+ $map_hash{$map_gene_id} = \%hash;
+ push @map_array, $map_gene_id;
+ }
+}
+
+# close file
+close(network_file);
+
+
+# This file gives the orthologs
+open(ortho_file,"<$ortho_file_name") || die "Error: file '$ortho_file_name' can not be opened\n";
+
+my %ortho_hash;
+my @ortho_array;
+
+# import the ortholog entries into a hash table
+while(<ortho_file>){
+ my $id1 = $_;
+ # strip off newline characters
+ $id1 =~ s/\r//g;
+ $id1 =~ s/\n//g;
+ # split the 3 columns into separate variables
+ my ($gene_id, $ortho_id, $ident_id) = split("\t", $id1);
+ # strip off gene id suffix and convert to all caps
+ $gene_id =~ s/\-TAIR\-G//g;
+ $gene_id =~ tr/a-z/A-Z/;
+ $ortho_id =~ tr/a-z/A-Z/;
+ # check to make sure the %identity is above the minimum defined above
+ if($ident_id =~ /\d/ && $ident_id>=$min_ident){
+ # if gene_id already found, add the new ortholog to hash
+ if(defined($ortho_hash{$gene_id})){
+ my $oldhash = $ortho_hash{$gene_id};
+ $oldhash->{'ortho'} = "$oldhash->{'ortho'}\t$ortho_id";;
+ $ortho_hash{$gene_id} = $oldhash;
+ # if new gene_id, create hash element
+ }else{
+ my %hash;
+ $hash{'ortho'} = $ortho_id;
+ $hash{'gene'} = $gene_id;
+ $ortho_hash{$gene_id} = \%hash;
+ push @ortho_array, $gene_id;
+ }
+ }
+}
+# close file
+close(ortho_file);
+
+# open file for output
+open(output_file,">$out_file_name" . "_$min_ident" . ".sif") || die "Error: file '$out_file_name' can not be opened for writing\n";
+my $out_file = "$out_file_name" . "_$min_ident" . ".sif";
+
+my $line_num = 0;
+
+#my %sorted_hash;
+foreach my $id_entry (@map_array) {
+ my $inter_type=$map_hash{$id_entry}->{'type'};
+ my $gene_map=$map_hash{$id_entry}->{'inter'};
+ #print "$id_entry\t$gene_map\n";
+
+ # create array of interaction types
+ my @inter_array = split("\t", $inter_type);
+
+ # create array of interaction genes
+ my @gene_map_array = split("\t", $gene_map);
+
+ # check to see if ortholog exists for reference gene
+ if(exists($ortho_hash{$id_entry})) {
+ # create array of orthologs for reference gene
+ my @gene_ortho_array = split("\t", $ortho_hash{$id_entry}->{'ortho'});
+ # find the length of the array from map_hash to be used in loop
+ my $max_index = @inter_array;
+
+ # run for each ortholog for reference gene
+ foreach my $gene_ortho (@gene_ortho_array) {
+ # for each element of the arrays inter_array and gene_map_array
+ for (my $i=0; $i<$max_index; $i++) {
+ my $gene_map_ortho=$gene_map_array[$i];
+ # check to make sure that an ortholog for the interaction gene is defined
+ if(exists($ortho_hash{$gene_map_ortho})) {
+ # create array of orthologs for interaction genes
+ my @gene_map_ortho_array = split("\t", $ortho_hash{$gene_map_ortho}->{'ortho'});
+ # loop through each interaction gene
+ foreach my $gene_map_inter (@gene_map_ortho_array) {
+ # if using paralogs, loop through paralogs for both genes
+ if ($para) {
+ my @gene_1_array;
+ my @gene_2_array;
+ #print "$gene_ortho\n";
+ if (exists($para_hash{$gene_ortho})) {
+ @gene_1_array = split("\t", $para_hash{$gene_ortho}->{'gene2'});
+ #print "filling array 1\n";
+ }
+ # this will add back the original gene to the beginning of the array
+ unshift(@gene_1_array, $gene_ortho);
+ #print "$gene_ortho\n array @gene_1_array\n";
+ #print "$gene_ortho\t$gene_map_inter\t\t\t$gene_1_array[0]\t$gene_1_array[1]\n";
+
+ if (exists($para_hash{$gene_map_inter})) {
+ @gene_2_array = split("\t", $para_hash{$gene_map_inter}->{'gene2'});
+ #print "filling array 2\n";
+ }
+ unshift(@gene_2_array, $gene_map_inter);
+ #print "@gene_2_array\n";
+
+ foreach my $gene_1 (@gene_1_array) {
+ foreach my $gene_2 (@gene_2_array) {
+ if ($gene_1 gt $gene_2) {
+ my $temp = $gene_1;
+ $gene_1 = $gene_2;
+ $gene_2 = $temp;
+ }
+ $unsorted_hash{$line_num} = "$gene_1\t$inter_array[$i]\t$gene_2";
+ $line_num++;
+ #printf output_file "$gene_1\t$inter_array[$i]\t$gene_2\n";
+ #printf output_file "$gene_1\n";
+ }
+ }
+
+ } else {
+ # check to see if the two entries are in alphatical order
+ # if they are not, swap them so they are
+ # Note that if the interaction type depends on direction, this breaks that
+ if ($gene_ortho gt $gene_map_inter) { # if inter_type depends on direction, can extend the if statement to account for specific interactions to be skipped
+ my $temp = $gene_ortho;
+ $gene_ortho = $gene_map_inter;
+ $gene_map_inter = $temp;
+ }
+
+ $unsorted_hash{$line_num} = "$gene_ortho\t$inter_array[$i]\t$gene_map_inter";
+ $line_num++;
+ # print results to output file in sif format
+ #printf output_file "$gene_ortho\t$inter_array[$i]\t$gene_map_inter\n";
+ }
+ }
+
+ }
+ }
+ }
+ }
+
+}
+
+print "min_ident = $min_ident\n";
+
+foreach my $key (sort keys %unsorted_hash) {
+ my $value = $unsorted_hash{$key};
+ print output_file "$value\n";
+ }
+print "done sorting\n";
+
+%unsorted_hash = ();
+close(output_file);
+# sort the file and get rid of duplicates
+# Note: this requires replacing the file, hence the mv command
+#system "sort $out_file | uniq > $out_file.tmp; mv $out_file.tmp $out_file";
+# end loop of minimum identities ($min_ident)
+}
+
+#sub by_ascii { $unsorted_hash{$a} <=> $unsorted_hash{$b} }
+
--- /dev/null
+#!/usr/bin/perl
+
+##############################################################
+# Written by Justin Elser 2/23/09 #
+# Dept of BPP, Oregon State University #
+# elserj@science.oregonstate.edu #
+# Version 0.9 #
+# this program is meant to be part of a pipeline including #
+# interact.pl, therefore, it is written assuming the #
+# input files are from that program (interact.pl) #
+# #
+# still to do----- #
+# not all stats are computed yet #
+##############################################################
+
+use strict;
+use warnings;
+
+if ($#ARGV != 1) {
+ print "usage : stats_interact.pl prefix_for_in_files output_file\n";
+ exit;
+}
+open(main_output_file, ">$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n";
+my $out_file = "$ARGV[1]";
+
+printf main_output_file "Minimum identity\tNum of lines\tUnique gene count\tMax Gene IDs\t# max Gene\t# min genes\n";
+
+for (my $min_ident=100; $min_ident>=0; $min_ident-=5) {
+ open (in_file_prefix, "$ARGV[0]" . "_$min_ident" . ".sif") || die "Error: file '$ARGV[0]' can not be opened\n";
+
+ open (output_file, ">$ARGV[1]" . "_$min_ident") || die "Error: file '$ARGV[1]' can not be opened\n";
+
+
+ # start doing the stats
+ # earlier versions of the script did this using system calls to cat, grep, uniq, etc...
+ # implementing directly in perl has increased the performance of the program greatly
+ my $count = 0; # total number of unique interactions
+ my %seen = (); # hash that stores how many times each gene has been seen
+ while (my $line = <in_file_prefix>) {
+ $count++;
+ my ($gene_1,$inter,$gene_2) = split ("\t",$line);
+ chomp $gene_2;
+ if ($gene_1 eq $gene_2) {
+ $seen{"$gene_1"}++;
+ }
+ else {
+ $seen{"$gene_1"}++;
+ $seen{"$gene_2"}++;
+ }
+ }
+ my $uniq_gene_count = keys %seen; # gives the count of unique genes
+
+ # this block populates the max and min lists. max_list holds the gene id(s)
+ # that comes up the most. min_list holds the gene ids that only show up once
+ my $max = undef;
+ my @max_list = undef;
+ my @min_list = ();
+ while( my ($key, $value) = each %seen ) {
+ if(!defined $max) {
+ $max = $value;
+ @max_list = ($key);
+ } elsif ($seen{$key} == $max) {
+ push @max_list, $key;
+ } elsif ($seen{$key} > $max) {
+ $max = $value;
+ @max_list = ($key);
+ }
+ if($value == 1) {
+ push @min_list, $key;
+ }
+ }
+
+ my $min_count = @min_list;
+
+ printf main_output_file "$min_ident\t$count\t$uniq_gene_count\t@max_list\t$max\t$min_count\n";
+
+
+
+ my %hash = ();
+
+ # this block will calculate the number of genes that have occurred a given number of times
+ # for example, if there are 10 genes that occur 5 times, the key of this hash is 5, value is 10
+ while( my ($key, $value) = each %seen ) {
+ $hash{"$value"}++;
+ }
+
+ foreach my $key (sort by_number keys %hash) { # see the sub description below
+ my $value = $hash{$key};
+ print output_file "$key\t$value\n";
+ }
+
+
+ close(in_file_prefix);
+ close(output_file);
+
+}
+
+close(main_output_file);
+
+# the perl sort() sorts by ASCII value, not numerically, this subroutine fixes the sort
+sub by_number {
+ if ($a < $b) {-1} elsif ($a > $b) {1} else {0}
+}
--- /dev/null
+#!/usr/bin/perl
+
+##############################################################
+# Written by Justin Elser 2/23/09 #
+# Dept of BPP, Oregon State University #
+# elserj@science.oregonstate.edu #
+# Version 0.9 #
+# this program is meant to be part of a pipeline including #
+# interact.pl, therefore, it is written assuming the #
+# input files are from that program (interact.pl) #
+# #
+# still to do----- #
+# not all stats are computed yet #
+##############################################################
+
+use strict;
+use warnings;
+
+if ($#ARGV != 1) {
+ print "usage : stats_interact_Ath.pl prefix_for_in_files output_file\n";
+ exit;
+}
+open(main_output_file, ">$ARGV[1]") || die "Error: file '$ARGV[1]' can not be opened\n";
+my $out_file = "$ARGV[1]";
+
+printf main_output_file "Num of lines\tUnique gene count\tMax Gene IDs\t# max Gene\t# min genes\n";
+
+
+ open (in_file_prefix, "$ARGV[0]") || die "Error: file '$ARGV[0]' can not be opened\n";
+
+ open (output_file, ">$ARGV[1]" . "_num") || die "Error: file '$ARGV[1]' can not be opened\n";
+
+
+ # start doing the stats
+ # earlier versions of the script did this using system calls to cat, grep, uniq, etc...
+ # implementing directly in perl has increased the performance of the program greatly
+ my $count = 0; # total number of unique interactions
+ my %seen = (); # hash that stores how many times each gene has been seen
+ while (my $line = <in_file_prefix>) {
+ $count++;
+ my ($gene_1,$inter,$gene_2) = split ("\t",$line);
+ chomp $gene_2;
+ if ($gene_1 eq $gene_2) {
+ $seen{"$gene_1"}++;
+ }
+ else {
+ $seen{"$gene_1"}++;
+ $seen{"$gene_2"}++;
+ }
+ }
+ my $uniq_gene_count = keys %seen; # gives the count of unique genes
+
+ # this block populates the max and min lists. max_list holds the gene id(s)
+ # that comes up the most. min_list holds the gene ids that only show up once
+ my $max = undef;
+ my @max_list = undef;
+ my @min_list = ();
+ while( my ($key, $value) = each %seen ) {
+ if(!defined $max) {
+ $max = $value;
+ @max_list = ($key);
+ } elsif ($seen{$key} == $max) {
+ push @max_list, $key;
+ } elsif ($seen{$key} > $max) {
+ $max = $value;
+ @max_list = ($key);
+ }
+ if($value == 1) {
+ push @min_list, $key;
+ }
+ }
+
+ my $min_count = @min_list;
+
+ printf main_output_file "$count\t$uniq_gene_count\t@max_list\t$max\t$min_count\n";
+
+
+
+ my %hash = ();
+
+ # this block will calculate the number of genes that have occurred a given number of times
+ # for example, if there are 10 genes that occur 5 times, the key of this hash is 5, value is 10
+ while( my ($key, $value) = each %seen ) {
+ $hash{"$value"}++;
+ }
+
+ foreach my $key (sort by_number keys %hash) { # see the sub description below
+ my $value = $hash{$key};
+ print output_file "$key\t$value\n";
+ }
+
+
+ close(in_file_prefix);
+ close(output_file);
+
+
+
+close(main_output_file);
+
+# the perl sort() sorts by ASCII value, not numerically, this subroutine fixes the sort
+sub by_number {
+ if ($a < $b) {-1} elsif ($a > $b) {1} else {0}
+}
--- /dev/null
+#!/bin/bash
+
+#[ -r $1 ] || [ $2="rice" ] || (echo "usage: $0 input_file rice/Ath" exit)
+
+if [ ! -r $1 ]
+then
+ echo "usage: $0 input_file rice/Ath"
+ exit
+fi
+
+case $2 in
+Ath)
+alphas=( At2g23070 At3g50000 At5g67380 At2g23080 )
+betas=( At4g17640 At5g47080 At3g60250 At2g44680 )
+;;
+rice)
+alphas=( LOC_Os03g55490 LOC_Os03g10940 LOC_Os03g55389 LOC_Os07g02350 )
+betas=( LOC_Os10g41520 LOC_Os07g31280 )
+;;
+*)
+echo "usage: $0 input_file rice/Ath"
+exit
+;;
+esac
+
+first=1
+for id in ${alphas[@]}
+do
+ if [ $first -eq 1 ]
+ then
+ first=0
+ alphagrep="$id"
+ else
+ alphagrep="$echo $alphagrep|$id"
+ fi
+done
+
+first=1
+for id in ${betas[@]}
+do
+ if [ $first -eq 1 ]
+ then
+ first=0
+ betagrep="$id"
+ else
+ betagrep="$echo $betagrep|$id"
+ fi
+done
+
+case $2 in
+Ath)
+egrep -i ""$alphagrep"" $1 | egrep -i ""$betagrep"" | sed "s/${alphas[0]}/${alphas[0]}_alpha/gi" | sed "s/${alphas[1]}/${alphas[1]}_alpha/gi" | sed "s/${alphas[2]}/${alphas[2]}_alpha/gi" | sed "s/${alphas[3]}/${alphas[3]}_alpha/gi" | sed "s/${betas[0]}/${betas[0]}_beta/gi" | sed "s/${betas[1]}/${betas[1]}_beta/gi" | sed "s/${betas[2]}/${betas[2]}_beta/gi" | sed "s/${betas[3]}/${betas[3]}_beta/gi"
+;;
+rice)
+egrep -i ""$alphagrep"" $1 | egrep -i ""$betagrep"" | sed "s/${alphas[0]}/${alphas[0]}_alpha/gi" | sed "s/${alphas[1]}/${alphas[1]}_alpha/gi" | sed "s/${alphas[2]}/${alphas[2]}_alpha/gi" | sed "s/${alphas[3]}/${alphas[3]}_alpha/gi" | sed "s/${betas[0]}/${betas[0]}_beta/gi" | sed "s/${betas[1]}/${betas[1]}_beta/gi"
+;;
+esac
--- /dev/null
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use English qw( -no_match_vars );
+use File::Basename;
+use Getopt::Long;
+use Gramene::Config;
+use HTML::Lint;
+use Mail::Sendmail;
+use Pod::Usage;
+use Readonly;
+use WWW::Mechanize::Timed;
+use YAML qw(Load);
+
+$| = 1;
+
+Readonly my $VERSION
+ => sprintf '%d.%02d', qq$Revision: 1.14 $ =~ /(\d+)\.(\d+)/;
+
+my %parents = ();
+my %failed = ();
+my %previously_seen = ();
+
+my $cfile = Gramene::Config->new;
+my %options = (
+ 'noisy' => 0,
+ 'validate' => 0,
+ 'send_email' => 0,
+ 'site' => $cfile->get('link_checker')->{'site'},
+ 'man' => 0,
+ 'version' => 0,
+ 'email_from' => $cfile->get('link_checker')->{'email_from'},
+ 'email_to' => $cfile->get('link_checker')->{'email_to'},
+ 'depth' => 10_000_000, #stop checking after a depth of 10,000,000 (essentially, "don't stop")
+ 'content_test_file' => $cfile->get('link_checker')->{'content_test_file'},
+ 'white_list_file' => $cfile->get('link_checker')->{'white_list_file'},
+ 'black_list_file' => $cfile->get('link_checker')->{'black_list_file'},
+);
+
+GetOptions(
+ \%options,
+ 'noisy',
+ 'validate',
+ 'send_email',
+ 'to_console',
+ 'site=s',
+ 'email_from=s',
+ 'email_to=s',
+ 'content_test_file=s',
+ 'white_list_file=s',
+ 'black_list_file=s',
+ 'depth=i',
+ 'help',
+ 'man',
+ 'version',
+);
+
+if ( $options{'help'} || $options{'man'} ) {
+ pod2usage({
+ -exitval => 0,
+ -verbose => $options{'man'} ? 2 : 1
+ });
+};
+
+if ( $options{'version'} ) {
+ my $prog = basename( $PROGRAM_NAME );
+ print "$prog v$VERSION\n";
+ exit 0;
+}
+my ($global_tests, $url_tests) = ({}, {});
+
+if ( $options{'content_test_file'} ) {
+ open (my $test_fh, "<", $options{'content_test_file'}) or die "Could not open $options{'content_test_file'}";
+ local $/ = undef;
+ my $file_contents = <$test_fh>;
+ close $test_fh;
+ ($global_tests, $url_tests) = Load($file_contents);
+}
+
+my ($noisy, $validate, $send_email, $site, $from, $to, $depth) = @options{qw(noisy validate send_email site email_from email_to depth)};
+
+if ( $site !~ m{^http://} ) {
+ $site = 'http://' . $site;
+}
+
+$site = fix_link($site);
+
+my $white_list = {};
+my $black_list = {};
+foreach my $set ((['white_list_file', $white_list], ['black_list_file', $black_list])) {
+ my ($option, $list) = @$set;
+ if ( $options{$option} ) {
+ open (my $list_fh, "<", $options{$option}) or die "Could not open $options{$option}";
+ local $/ = undef;
+ my $file_contents = <$list_fh>;
+ close $list_fh;
+ my $url_array = Load($file_contents);
+ %$list = map {($_, 1)} map {/^http/ ? $_ : $site . $_} @$url_array;
+
+ }
+}
+
+my $message = undef;
+my (@slow, @invalid, @failed, @content_errors);
+
+
+my @links = ([$site, $depth]);
+
+while (@links) {
+
+ my $set = shift @links;
+
+ my ($link, $depth) = @$set;
+
+ #make any formatting mods we want to the link
+ $link = fix_link($link);
+
+ next if $depth <= 0;
+ next if ! traceable_url($link);
+ will_grab_link($link);
+
+ push @links, grab_links($link, $depth - 1);
+
+}
+
+# failures are a special case. We need to format at the end, so we can ensure that we include all parents that reference this link as a failure.
+if (@failed) {
+ foreach my $link (@failed) {
+ $link = format_failed_link($link);
+ }
+}
+
+if (@slow || @invalid || @failed || @content_errors) {
+ @slow = ("No slow links found\n") if !@slow;
+ @failed = ("No failed found\n") if !@failed;
+ @content_errors = ("No content errors found\n") if ! @content_errors;
+ $message .= "Slow links:\n===========\n" . join('', @slow) . "\n";
+ $message .= "Failed links:\n=============\n" . join('', @failed) . "\n";
+ $message .= "Validation errors:\n================\n" . join('', @invalid) . "\n" if @invalid;
+ $message .= "Content errors:\n================\n" . join('', @content_errors) . "\n" if @content_errors;
+}
+else {
+ $message = "No problems found";
+}
+
+#if we want to send the message AND we actually have one to send AND we have somewhere to send it, then send it...
+if ($send_email && $message && $to && $from) {
+ my %mail = (
+ 'To' => $to,
+ 'From' => $from,
+ 'Subject' => "Gramene broken links on $site at " . localtime,
+ 'Message' => $message,
+ );
+
+ sendmail(%mail) or die $Mail::Sendmail::error;
+}
+else {
+ print "$message\n";
+}
+
+# this function has been modified. Instead of a list of links, it now returns a list of arrayrefs.
+# the first element is the link, and the second is the depth at which it was found.
+# if the depth hits 0, we stop
+sub grab_links {
+ my $link = shift or die "Cannot grab links w/o url";
+ my $depth = shift;
+
+ my $mech = WWW::Mechanize::Timed->new();
+
+ return () if previously_seen_url($link);
+
+ $mech->get($link);
+
+ my $time = $mech->client_response_receive_time;
+
+ link_is_slow($link, $time) if is_slow($time);
+
+ if ($mech->status =~ /^2/) {
+
+ if ( $mech->is_html ) {
+ my $lint = HTML::Lint->new();
+ $lint->parse( $mech->content );
+
+ validation_errors($link, $lint->errors) if $lint->errors;
+
+ if ($options{'content_test_file'}) {
+ check_contents($link, $mech->content);
+ }
+
+ }
+
+ my @links = map {$_->URI->abs} $mech->links;
+ foreach my $child (@links) {
+ $child = fix_link($child);
+ push @{$parents{$child}}, $link;
+ }
+ return map {[ $_, $depth - 1]} @links;
+ }
+ else {
+ $failed{$link} = $mech->status;
+ failed_link($link);
+ return ();
+ }
+}
+
+sub fix_link {
+ my $link = shift;
+
+ return $link if $white_list->{$link};
+
+ #ice any anchors. For our purposes, they're superfluous.
+ $link =~ s/#.+//;
+ #nuke query strings, too. To try and cut down on links traversed.
+ $link =~ s/\?.+//;
+ #and drop any trailing slashes
+ $link =~ s!/$!!;
+
+ return $link;
+}
+
+# Returns a boolean value whether we think a link is slow or not.
+
+sub is_slow {
+ my $time = shift || 0;
+
+ return $time > 2;
+}
+
+# These are the URLs that we assume we want to follow.
+
+sub traceable_url {
+ my $link = shift;
+
+ return 0 if
+ $link !~ m!$site!
+ || $link =~ /^javascript:/
+ || $link !~ /http/
+ || $black_list->{$link}
+ #|| $previously_seen{$link}++
+ ;
+
+ return 1;
+}
+
+sub previously_seen_url {
+ my $link = shift;
+ return $previously_seen{$link}++;
+}
+
+#the various IO functions
+sub will_grab_link {
+ my $link = shift;
+
+ my $output = "Grabbing ($link)\n";
+
+ print STDERR $output if $noisy;
+
+ #$message .= $output if $validate;
+}
+
+sub link_is_slow {
+
+ my $link = shift;
+ my $time = shift;
+
+ my $output = "Slow link: $link - took $time seconds\n\n";
+
+ push @slow, $output;
+
+ print STDERR $output if $noisy;
+}
+
+sub validation_errors {
+
+ my $page = shift;
+ my @errors = @_;
+
+ my $output = "Validation errors on page $page\n";
+
+ foreach my $error (@errors) {
+ $output .= "\t" . $error->as_string . "\n";
+ }
+
+ $output .= "\n";
+
+ if ($validate) {
+ push @invalid, $output;
+ }
+
+ print STDERR $output if $noisy && $validate;
+}
+
+sub failed_link {
+ my $link = shift;
+
+ push @failed, $link;
+
+ print STDERR format_failed_link($link) if $noisy;
+}
+
+sub format_failed_link {
+ my $link = shift;
+ my $output = "FAILED $link: " . $failed{$link} . "\n";
+ $output .= "linked from:\n";
+ my $seen = {};
+ foreach my $parent (grep {! $seen->{$_}++} @{$parents{$link}}) {
+ $output .= "\t$parent\n";
+ }
+
+ $output .= "\n";
+
+ return $output;
+}
+
+sub check_contents {
+ my $link = shift;
+ my $content = shift;
+
+ my @should_find = @{ $global_tests->{'valid'} };
+ my @should_not_find = @{ $global_tests->{'invalid'} };
+
+ if ($url_tests->{$link}) {
+ push @should_find, @{ $url_tests->{$link}->{'valid'} };
+ push @should_not_find, @{ $url_tests->{$link}->{'invalid'} };
+ }
+
+ my @return = ();
+
+ foreach my $should (@should_find) {
+ if ($content !~ /$should/) {
+ push @return, " Could not find text: $should";
+ }
+ }
+
+ foreach my $shouldnt (@should_not_find) {
+ if ($content =~ /$shouldnt/) {
+ push @return, " Incorrectly found text: $shouldnt";
+ }
+ }
+
+ if (@return) {
+ my $output = join("\n", "CONTENT ERROR: $link", @return) . "\n";
+ push @content_errors, $output;
+ print STDERR $output if $noisy;
+ }
+
+}
+__END__
+
+# ----------------------------------------------------
+
+=head1 NAME
+
+linkchecker.pl - check links on a website
+
+=head1 VERSION
+
+This documentation refers to linkchecker.pl version $Revision: 1.14 $
+
+=head1 SYNOPSIS
+
+ linkchecker.pl [options]
+
+Options:
+
+ --noisy Prints out progress to STDERR as the program
+ is running. Defaults to 0
+ --validate Includes HTML validation errors. Defaults to 0.
+ WARNING - this can produce a lot of output
+ --send_email Will send out the results report to the address
+ specified in email_to in the linkchecker section of
+ the config file If this flag isn't set, the final
+ report will be printed to the console instead. This
+ flag should be set in the crontab
+ --email_from The email address the report will be sent from.
+ Defaults to the address in the config file
+ --email_to The email address the report will be sent to.
+ This is usually a group mailing list.
+ Defaults to the address in the config file
+ --site The site you want to run the checker on. Defaults
+ to the site in the link_checker section of the config
+ --depth depth to follow links to, based off of the front page.
+ --depth 2 follows links two deep, --depth 1 one, and so on.
+ file
+ --content_test_file This is a YAML file that contains information on text
+ that you should or should NOT find at the given URL. Reads
+ the file from the config file by default. See full docs for YAML
+ file format.
+ --white_list_file URLs are normally cleaned up a little bit. Anchors are dropped,
+ query strings are dropped, and so on. The white list file is a YAML
+ file that lets us provide links that should be viewed as is.
+ --black_list_file a YAML file containing URLs that we don't want to visit on the
+ site, for whatever reason.
+ --help Show brief help and exit
+ --man Show full documentation
+ --version Show version and exit
+
+=head1 DESCRIPTION
+
+Checks your links. By default, runs against the site listed in the
+link_checker section of the gramene config file.
+
+=head1 AUTHOR
+
+Jim Thomason E<lt>thomason@cshl.eduE<gt>.
+
+=head1 EXAMPLE YAML
+---
+valid:
+ - Gramene
+invalid:
+ - Error
+---
+http://dev.gramene.org :
+ valid:
+ - Release #29
+ - Gramene
+ invalid:
+ - Release #28
+ - Gramene Sucks
+
+The first YAML doc section contains global values. Things we should find on every
+or no page. The second doc section refers to specific content at individual URLs.
+
+See http://www.yaml.org/ for information on YAML docs.
+
+=head1 COPYRIGHT
+
+Copyright (c) 2007-2009 Cold Spring Harbor Laboratory
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.
+
+=cut