From af63bcfb7e8d25e57b23e4e12b5a896cab8cd7e8 Mon Sep 17 00:00:00 2001
From: elserj
Date: Wed, 19 Feb 2020 20:32:46 +0000
Subject: [PATCH] Add script to generate ortholog association files from GO Ath file

svn path=/; revision=668
---
 .../association_file_from_orthologs.pl        | 112 ++++++++++++++++++
 .../association_file_from_orthologs_simple.pl |  89 ++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100755 interactome_scripts/association_file_from_orthologs.pl
 create mode 100755 interactome_scripts/association_file_from_orthologs_simple.pl

diff --git a/interactome_scripts/association_file_from_orthologs.pl b/interactome_scripts/association_file_from_orthologs.pl
new file mode 100755
index 0000000..66e327c
--- /dev/null
+++ b/interactome_scripts/association_file_from_orthologs.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl
+
+# Transfer GO annotations from an Arabidopsis thaliana GAF file onto the
+# orthologous genes of another species and write them out as a GAF 2.0 file.
+#
+# usage: association_file_from_orthologs.pl in_assoc_file gene_name_file ortho_file out_file
+#
+# All inputs are tab-delimited:
+#   in_assoc_file   source GAF annotations ("!" comment lines are skipped)
+#   gene_name_file  gene, description, trembl, swissprot, uniprot
+#   ortho_file      id, gene, ortholog, score (lines starting with "id" are skipped)
+
+use strict;
+use warnings;
+
+use Scalar::Util qw(looks_like_number);
+use Time::localtime;
+
+# Time::localtime reports years since 1900 and a zero-based month, so adjust
+# and zero-pad both to get the YYYYMMDD date written to the GAF date column.
+my $tm = localtime;
+my $curr_date = sprintf '%04d%02d%02d', $tm->year + 1900, $tm->mon + 1, $tm->mday;
+
+# check for arguments and explain usage
+if (@ARGV != 4) {
+    print STDERR "usage: association_file_from_orthologs.pl in_assoc_file gene_name_file ortho_file out_file\n";
+    exit 1;
+}
+
+# Grab the find_species subroutines to get the species and taxon id
+my @find_species_paths = (
+    "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl",
+    "$ENV{HOME}/bin/find_species.pl",
+);
+my ($find_species_lib) = grep { -e $_ } @find_species_paths;
+die "find_species.pl not found; looked in:\n  " . join("\n  ", @find_species_paths) . "\n"
+    unless defined $find_species_lib;
+require $find_species_lib;
+
+my ($ingaffile, $ingenefile, $inorthofile, $outfile) = @ARGV;
+
+# Get species and taxon
+my $species     = find_species($ingaffile);
+my $found_taxon = find_taxon($species);
+
+# hash to store accession -> { gene, description } lookups, keyed by both the
+# swissprot and the uniprot accession of each gene
+my %gene_hash;
+open(my $gene_fh, '<', $ingenefile) or die "cannot open $ingenefile: $!\n";
+while (my $line = <$gene_fh>) {
+    chomp $line;
+    # limit -1 keeps trailing empty columns; missing columns default to ''
+    my @fields = split /\t/, $line, -1;
+    my ($gene, $description, $swissprot, $uniprot) =
+        map { defined $fields[$_] ? $fields[$_] : '' } (0, 1, 3, 4);
+
+    for my $accession ($swissprot, $uniprot) {
+        next if $accession eq '';
+        $gene_hash{$accession} = { gene => $gene, description => $description };
+    }
+}
+close($gene_fh);
+
+# hash to store gene -> ortholog lookups for pairs scoring above 0.05
+my %ortho_hash;
+open(my $ortho_fh, '<', $inorthofile) or die "cannot open $inorthofile: $!\n";
+while (my $line = <$ortho_fh>) {
+    chomp $line;
+    next if ($line =~ /^id/);    # header line
+    my (undef, $gene, $ortho, $score) = split /\t/, $line, -1;
+
+    # skip short or malformed rows instead of comparing undef/non-numbers
+    next unless defined $score && looks_like_number($score);
+    $ortho_hash{$gene} = $ortho if $score > 0.05;
+}
+close($ortho_fh);
+
+open(my $gaf_fh, '<', $ingaffile) or die "cannot open $ingaffile: $!\n";
+open(my $out_fh, '>', $outfile)   or die "cannot open $outfile for writing: $!\n";
+
+print {$out_fh} "!gaf-version: 2.0\n";
+print {$out_fh} "!Created via ortholog similiarity from Arabidopsis thaliana\n";
+while (my $line = <$gaf_fh>) {
+    chomp $line;
+    #skip comments
+    next if ($line =~ /^!/);
+
+    # GAF 2.0 has 17 columns; pad short rows so optional trailing columns
+    # (annotation extension, gene product form id) are '' rather than undef
+    my @fields = split /\t/, $line, -1;
+    push @fields, ('') x (17 - @fields) if @fields < 17;
+    my ($db, $db_id, $qual, $ont_id, $db_ref, $ev, $aspect,
+        $db_obj_syn, $db_obj_type, $annot_ext, $gp_form_id)
+        = @fields[0, 1, 3 .. 6, 8, 10, 11, 15, 16];
+
+    #skip with evidence codes IEA and ISS
+    next if ($ev eq "IEA" || $ev eq "ISS");
+
+    #check if the annotated id is in %gene_hash and its gene has an ortholog
+    my $source = $gene_hash{$db_id};
+    next unless defined $source;
+    my $ortho_gene = $ortho_hash{ $source->{gene} };
+    next unless defined $ortho_gene;
+
+    print {$out_fh} join("\t",
+        $db, $ortho_gene, $ortho_gene, $qual, $ont_id, $db_ref, $ev,
+        "UniProtKB:$source->{gene}", $aspect, $source->{description},
+        $db_obj_syn, $db_obj_type, "taxon:$found_taxon", $curr_date,
+        "Justin_Elser", $annot_ext, $gp_form_id), "\n";
+}
+close($gaf_fh);
+close($out_fh) or die "error writing $outfile: $!\n";
diff --git a/interactome_scripts/association_file_from_orthologs_simple.pl b/interactome_scripts/association_file_from_orthologs_simple.pl
new file mode 100755
index 0000000..16e7dca
--- /dev/null
+++ b/interactome_scripts/association_file_from_orthologs_simple.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl
+
+# Transfer GO annotations from an Arabidopsis thaliana GAF file onto the
+# orthologous genes listed in a two-column ortholog file and write them out
+# as a GAF 2.0 file of ISO annotations.
+#
+# usage: association_file_from_orthologs_simple.pl in_assoc_file ortho_file out_file
+#
+# All inputs are tab-delimited:
+#   in_assoc_file  source GAF annotations ("!" comment lines are skipped)
+#   ortho_file     gene, ortholog (lines starting with "id" are skipped); the
+#                  ortholog column is matched against the GAF DB object name
+
+use strict;
+use warnings;
+
+use Time::localtime;
+
+# Time::localtime reports years since 1900 and a zero-based month, so adjust
+# and zero-pad both to get the YYYYMMDD date written to the GAF date column.
+my $tm = localtime;
+my $curr_date = sprintf '%04d%02d%02d', $tm->year + 1900, $tm->mon + 1, $tm->mday;
+print "$curr_date\n";
+
+# check for arguments and explain usage
+if (@ARGV != 3) {
+    print STDERR "usage: association_file_from_orthologs_simple.pl in_assoc_file ortho_file out_file\n";
+    exit 1;
+}
+
+# Grab the find_species subroutines to get the species and taxon id
+my @find_species_paths = (
+    "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl",
+    "$ENV{HOME}/bin/find_species.pl",
+);
+my ($find_species_lib) = grep { -e $_ } @find_species_paths;
+die "find_species.pl not found; looked in:\n  " . join("\n  ", @find_species_paths) . "\n"
+    unless defined $find_species_lib;
+require $find_species_lib;
+
+my ($ingaffile, $inorthofile, $outfile) = @ARGV;
+
+# Get species and taxon
+my $species   = find_species($ingaffile);
+my $new_taxon = find_taxon($species);
+
+# hash to store ortholog -> gene lookups (second column -> first column)
+my %ortho_hash;
+open(my $ortho_fh, '<', $inorthofile) or die "cannot open $inorthofile: $!\n";
+while (my $line = <$ortho_fh>) {
+    chomp $line;
+    next if ($line =~ /^id/);    # header line
+    my ($gene, $ortho) = split /\t/, $line, -1;
+
+    # skip rows without an ortholog column; they cannot be looked up
+    next unless defined $ortho && $ortho ne '';
+    $ortho_hash{$ortho} = $gene;
+}
+close($ortho_fh);
+
+open(my $gaf_fh, '<', $ingaffile) or die "cannot open $ingaffile: $!\n";
+open(my $out_fh, '>', $outfile)   or die "cannot open $outfile for writing: $!\n";
+
+print {$out_fh} "!gaf-version: 2.0\n";
+print {$out_fh} "!Created via ortholog similiarity from Arabidopsis thaliana\n";
+while (my $line = <$gaf_fh>) {
+    chomp $line;
+    #skip comments
+    next if ($line =~ /^!/);
+
+    # GAF 2.0 has 17 columns; pad short rows so optional trailing columns
+    # (annotation extension, gene product form id) are '' rather than undef
+    my @fields = split /\t/, $line, -1;
+    push @fields, ('') x (17 - @fields) if @fields < 17;
+    my ($qual, $ont_id, $aspect, $db_obj_name, $db_obj_type, $annot_ext, $gp_form_id)
+        = @fields[3, 4, 8, 9, 11, 15, 16];
+
+    #check if the DB object name is found in %ortho_hash
+    my $ortho_gene = $ortho_hash{$db_obj_name};
+    next unless defined $ortho_gene;
+
+    print {$out_fh} join("\t",
+        "MaizeGDB", $ortho_gene, $ortho_gene, $qual, $ont_id,
+        "PMID:24919147|PMID:21186353", "ISO", "AGI_LocusCode:$db_obj_name",
+        $aspect, $ortho_gene, "", $db_obj_type, "taxon:$new_taxon",
+        $curr_date, "Planteome:Justin_Elser", $annot_ext, $gp_form_id), "\n";
+}
+close($gaf_fh);
+close($out_fh) or die "error writing $outfile: $!\n";
-- 
2.34.1