From: elserj Date: Fri, 14 Feb 2014 17:40:36 +0000 (+0000) Subject: Updated for new species and fixed a couple of regex issues X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=30bc9c4dc783c17e2b59e92d6264df6b560b379f;p=old-jaiswallab-svn%2F.git Updated for new species and fixed a couple of regex issues svn path=/; revision=527 --- diff --git a/interactome_scripts/find_species.pl b/interactome_scripts/find_species.pl index 48317a1..c93b9fa 100755 --- a/interactome_scripts/find_species.pl +++ b/interactome_scripts/find_species.pl @@ -24,12 +24,12 @@ sub find_species { $species = "Batrachochytrium_dendrobatidis"; }elsif ($temp =~ /Brachypodium\_distachyon/) { $species = "Brachypodium_distachyon"; - }elsif ($temp =~ /Brachypodium\_sylvaticum\_Corvallis/) { - $species = "Brachypodium_sylvaticum_Corvallis"; - }elsif ($temp =~ /Brachypodium\_sylvaticum\_Greece/) { - $species = "Brachypodium_sylvaticum_Greece"; - }elsif ($temp =~ /Brachypodium\_sylvaticum\_Spain/) { - $species = "Brachypodium_sylvaticum_Spain"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\.Corvallis/) { + $species = "Brachypodium_sylvaticum.Corvallis"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\.Greece/) { + $species = "Brachypodium_sylvaticum.Greece"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\.Spain/) { + $species = "Brachypodium_sylvaticum.Spain"; }elsif ($temp =~ /Brassica\_rapa/) { $species = "Brassica_rapa"; }elsif ($temp =~ /elegans/) { @@ -54,6 +54,12 @@ sub find_species { $species = "Drosophila_melanogaster"; }elsif ($temp =~ /Ectocarpus/) { $species = "Ectocarpus_siliculosus"; + }elsif ($temp =~ /Elaphocordyceps\_capitata/) { + $species = "Elaphocordyceps_capitata"; + }elsif ($temp =~ /Elaphocordyceps\_ophioglossoides/) { + $species = "Elaphocordyceps_ophioglossoides"; + }elsif ($temp =~ /Elaphocordyceps\_paradoxa/) { + $species = "Elaphocordyceps_paradoxa"; }elsif ($temp =~ /Epichloe_festuca/) { $species = "Epichloe\_festuca"; }elsif ($temp =~ /Escherichia\_coli/) { @@ -65,9 +71,9 @@ sub find_species { }elsif ($temp =~ /Fusarium_graminearum/) { $species = "Fusarium\_graminearum"; }elsif ($temp =~ /Fusarium_oxysporum/) { - $species = "Fusarium\_oxysporum"; - }elsif ($temp =~ /Fusarium_verticilliodes/) { - $species = "Fusarium\_verticilliodes"; + $species = "Fusarium\_oxysporum.4287"; + }elsif ($temp =~ /Fusarium_verticillioides/) { + $species = "Fusarium\_verticillioides"; }elsif ($temp =~ /Gadus_morhua/) { $species = "Gadus\_morhua"; }elsif ($temp =~ /Gasterosteus/) { @@ -128,9 +134,9 @@ sub find_species { $species = "Oryza_punctata"; }elsif ($temp =~ /Oryza\_rufipogon/) { $species = "Oryza_rufipogon"; - }elsif ($temp =~ /Oryza\_sativa.indica\_gramene/) { + }elsif ($temp =~ /Oryza\_sativa.indica\.gramene/) { $species = "Oryza_sativa.indica.gramene"; - }elsif ($temp =~ /Oryza\_sativa.indica\_iplant/) { + }elsif ($temp =~ /Oryza\_sativa.indica\.iplant/) { $species = "Oryza_sativa.indica.iplant"; }elsif ($temp =~ /Oryza\_sativa.japonica.IRGSP/) { $species = "Oryza_sativa.japonica.IRGSP"; @@ -148,6 +154,12 @@ sub find_species { $species = "Phyllostachys_heterocycla"; }elsif ($temp =~ /Physcomit/) { $species = "Physcomitrella_patens"; + }elsif ($temp =~ /Phytophthora/) { + $species = "Phytophthora_infestans"; + }elsif ($temp =~ /Picea/) { + $species = "Picea_abies"; + }elsif ($temp =~ /Pinus/) { + $species = "Pinus_taeda"; }elsif ($temp =~ /Populus/) { $species = "Populus_trichocarpa"; }elsif ($temp =~ /Prunus/) { @@ -164,6 +176,8 @@ sub find_species { $species = "Schizosaccharomyces_pombe"; }elsif ($temp =~ /Selaginella/) { $species = "Selaginella_moellendorffii"; + }elsif ($temp =~ /Setaria/) { + $species = "Setaria_italica"; }elsif ($temp =~ /Solanum_tuberosum/) { $species = "Solanum_tuberosum"; }elsif ($temp =~ /Solanum_lycopersicum/) { @@ -178,26 +192,16 @@ sub find_species { $species = "Tetraodon\_nigroviridis"; }elsif ($temp =~ /Theobroma\_cacao/) { $species = "Theobroma_cacao"; - }elsif ($temp =~ /TmDV92/) { - $species = "TmDV92"; - }elsif ($temp =~ /TmG3116/) { - $species = "TmG3116"; }elsif ($temp =~ /Tolypocladium\_inflatum/) { $species = "Tolypocladium_inflatum"; - }elsif ($temp =~ /Trichoderma\_atroviride/) { - $species = "Trichoderma_atroviride"; - }elsif ($temp =~ /Trichoderma\_reesii/) { - $species = "Trichoderma_reesii"; - }elsif ($temp =~ /Trichoderma\_virens/) { - $species = "Trichoderma_virens"; }elsif ($temp =~ /Trichodesmium/) { $species = "Trichodesmium_erythraeum"; - }elsif ($temp =~ /triticum\_aestivum/) { + }elsif ($temp =~ /Triticum\_aestivum/) { $species = "Triticum_aestivum"; - }elsif ($temp =~ /Triticum\_monococcumDV92/) { - $species = "Triticum_monococcumDV92"; - }elsif ($temp =~ /Triticum\_monococcumG3116/) { - $species = "Triticum_monococcumG3116"; + }elsif ($temp =~ /Triticum\_monococcum\.DV92/) { + $species = "Triticum_monococcum.DV92"; + }elsif ($temp =~ /Triticum\_monococcum\.G3116/) { + $species = "Triticum_monococcum.G3116"; }elsif ($temp =~ /Triticum\_urartu/) { $species = "Triticum_urartu"; }elsif ($temp =~ /Vitis\_vinifera/) { @@ -227,17 +231,17 @@ sub find_gene { my ($gene_id,$isomer) = split(/\|/, $gene_header); $gene = $isomer; }elsif ($species eq "Batrachochytrium_dendrobatidis") { - my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header); - $gene = $gene_id; + my ($transcript,$gene_id,$info) = split(/\s\|\s/, $gene_header); + $gene = $transcript; }elsif ($species eq "Brachypodium_distachyon") { my ($gene_id,$isomer) = split(/\|/, $gene_header); $gene = $isomer; - }elsif ($species eq "Brachypodium_sylvaticum_Corvallis") { + }elsif ($species eq "Brachypodium_sylvaticum.Corvallis") { my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header); $gene = $gene_id; - }elsif ($species eq "Brachypodium_sylvaticum_Greece") { + }elsif ($species eq "Brachypodium_sylvaticum.Greece") { $gene = $gene_header; - }elsif ($species eq "Brachypodium_sylvaticum_Spain") { + }elsif ($species eq "Brachypodium_sylvaticum.Spain") { my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header); $gene = $gene_id; }elsif ($species eq "Brassica_rapa") { @@ -278,8 +282,15 @@ sub find_gene { }elsif ($species eq "Ectocarpus_siliculosus") { my ($gene_id,$temp) = split(/\|/, $gene_header); $gene = $gene_id; - }elsif ($species eq "Epichloe_festuca") { + }elsif ($species eq "Elaphocordyceps_capitata") { + $gene = $gene_header; + }elsif ($species eq "Elaphocordyceps_ophioglossoides") { + $gene = $gene_header; + }elsif ($species eq "Elaphocordyceps_paradoxa") { $gene = $gene_header; + }elsif ($species eq "Epichloe_festuca") { + my ($gene_id, $temp) = split(/\s/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Escherichia_coli") { $gene = $gene_header; #??? }elsif ($species eq "Eucalyptus_grandis") { @@ -290,13 +301,14 @@ sub find_gene { $gene_id =~ s/\-v1\.0\-hybrid//; $gene = $gene_id; }elsif ($species eq "Fusarium_graminearum") { - my ($gene_id, $info) = split(/\s\|\s/, $gene_header); - $gene = $gene_id; - }elsif ($species eq "Fusarium_oxysporum") { - $gene = $gene_header; - }elsif ($species eq "Fusarium_verticilliodes") { - my ($gene_id, $info) = split(/\s\|\s/, $gene_header); - $gene = $gene_id; + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $gene = $transcript; + }elsif ($species eq "Fusarium_oxysporum.4287") { + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $gene = $transcript; + }elsif ($species eq "Fusarium_verticillioides") { + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $gene = $transcript; }elsif ($species eq "Gadus_morhua") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; @@ -345,7 +357,7 @@ sub find_gene { $gene = $transcript; }elsif ($species eq "Mimulus_guttatus") { my ($gene_id,$transcript) = split(/\|/, $gene_header); - $gene = $gene_id; + $gene = $transcript; }elsif ($species eq "Mus_musculus") { my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); $gene = $protein; @@ -353,8 +365,9 @@ sub find_gene { my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); $gene = $protein; }elsif ($species eq "Nectria_haematococca") { - my ($source, $spec, $int_id, $gene_id) = split(/\|/, $gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Neurospora_crassa") { my ($transcript,$gene_id,$temp) = split(/\s\|\s/,$gene_header); $gene = $transcript; @@ -363,7 +376,7 @@ sub find_gene { $gene = $transcript; }elsif ($species eq "Oncorhynchus_mykiss") { my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header); - $gene = $gene_id; + $gene = $number; }elsif ($species eq "Leersia_perrieri") { my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); $gene = $gene_id; @@ -399,64 +412,81 @@ sub find_gene { $gene = $gene_id; }elsif ($species eq "Oryza_sativa.indica.gramene") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene = $transcript; + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Oryza_sativa.japonica.iplant") { my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); $gene = $gene_id; }elsif ($species eq "Oryza_sativa.japonica.IRGSP") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene = $transcript; + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Oryza_sativa.japonica.MSU") { my ($gene_id,$transcript) = split(/\|/,$gene_header); $gene = $transcript; }elsif ($species eq "Oryzias_latipes") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Pediculus_humanus") { - my ($source,$gene_pa,$func,$unknown,$gene_id) = split(/\|/, $gene_header); - $gene_id =~ s/gene\://; + my ($gene_pa,$func,$func2,$unknown,$gene_id) = split(/[\|\s+]/, $gene_header); $gene = $gene_pa; }elsif ($species eq "Phoenix_dactylifera") { - my ($gene_id, $temp) = split(/\s/, $gene_header); + my ($gene_id, $locus_tag, $product, $strand) = split(/\s\[/, $gene_header); $gene = $gene_id; }elsif ($species eq "Phyllostachys_heterocycla") { my ($gene_id, $gene_model, $location, $dot, $strand, $dot2, $info) = split (/\s+/, $gene_header); $gene = $gene_id; }elsif ($species eq "Physcomitrella_patens") { - my ($name,$pac_id) = split(/\|/,$gene_header); - $gene = $name; #??? + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; + }elsif ($species eq "Phytophthora_infestans") { + my ($transcript, $gene_id, $function) = split(/\s\|\s/,$gene_header); + $gene = $transcript; + }elsif ($species eq "Picea_abies") { + my ($gene_id, $confidence) = split(/\s/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Pinus_taeda") { + my ($gene_id,$temp) = split(/\:/,$gene_header); + $gene_id =~ /\D+(\d+)/; + print "$1\n"; + $gene = $1; }elsif ($species eq "Populus_trichocarpa") { my ($gene_id,$transcript_id) = split(/\|/,$gene_header); $gene = $transcript_id; }elsif ($species eq "Prunus_persica") { - my ($gene_id,$temp) = split(/\s/,$gene_header); + my ($transcript,$gene_id) = split(/\|/,$gene_header); $gene = $gene_id; }elsif ($species eq "Rattus_norvegicus") { - my ($gene_id,$transcript,$chrom) = split(/\|/, $gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Rhizopus_oryzae") { - my ($gene_id,$func) = split(/\s\|\s/, $gene_header); - $gene = $gene_id; + my ($jgi,$temp,$num,$gene_id) = split(/\|/, $gene_header); + $gene = $num; }elsif ($species eq "Ricinus_communis") { - my ($name,$transcript,$gene_id,$temp) = split(/\|/, $gene_header); + my ($transcript,$model) = split(/\|/, $gene_header); $gene = $transcript; }elsif ($species eq "Saccharomyces_cerevisiae") { - my ($gene_id,$gene_name,$transcript,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript }elsif ($species eq "Schizosaccharomyces_pombe") { - my ($gene_id,$gene_name,$unknown,$func,$name,$chrom,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript }elsif ($species eq "Selaginella_moellendorffii") { - my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header); - $gene = $prot_id; #??? - }elsif ($species eq "Solanum_tuberosum") { - my ($protein,$transcript,$type) = split(/\s/,$gene_header); - $gene = $protein; + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $gene = $locus_id; + }elsif ($species eq "Setaria_italica") { + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $gene = $prot_id; }elsif ($species eq "Solanum_lycopersicum") { - my ($protein,$type,$location,$gene_1,$transcript) = split(/\s/,$gene_header); - $gene = $protein; + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $gene = $locus_id; + }elsif ($species eq "Solanum_tuberosum") { + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $gene = $prot_id; }elsif ($species eq "Sorghum_bicolor") { my ($gene_id,$transcript_id) = split(/\|/,$gene_header); $gene = $transcript_id; @@ -464,54 +494,42 @@ sub find_gene { my ($gene_id,$temp) = split(" ",$gene_header); $gene = $gene_id; #??? }elsif ($species eq "Takifugu_rubripes") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Tetraodon_nigroviridis") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Theobroma_cacao") { - my ($gene_id,$temp) = split(/\s/,$gene_header); + my ($gene_id,$transcript) = split(/\s/,$gene_header); $gene = $gene_id; - }elsif ($species eq "TmDV92") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "TmG3116") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; }elsif ($species eq "Tolypocladium_inflatum") { - my ($gene_id, $type, $info) = split(/\s/, $gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_atroviride") { - my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_reesii") { - my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_virens") { - my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); - $gene = $gene_id; + $gene = $gene_header; }elsif ($species eq "Trichodesmium_erythraeum") { - my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Triticum_aestivum") { - my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); - $gene = $transcript; - }elsif ($species eq "Triticum_monococcumDV92") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; + }elsif ($species eq "Triticum_monococcum.DV92") { my ($gene_id,$temp) = split(/\s+/,$gene_header); $gene = $gene_id; - }elsif ($species eq "Triticum_monococcumG3116") { + }elsif ($species eq "Triticum_monococcum.G3116") { my ($gene_id,$temp) = split(/\s+/,$gene_header); $gene = $gene_id; }elsif ($species eq "Triticum_urartu") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene = $transcript; + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Vitis_vinifera") { - #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header); - $gene = $gene_header; #??? + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $gene = $gene_id; }elsif ($species eq "Zea_mays") { my ($gene_id,$transcript) = split(/\|/,$gene_header); - $gene = $transcript; + $gene = $gene_id; }else { die "Error: Gene id can not be found for species $species!"; @@ -526,8 +544,8 @@ sub find_gene_synonym { my $species = $_[1]; my $synonym; if ($species eq "Batrachochytrium_dendrobatidis") { - my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header); - $synonym = $transcript; + my ($transcript,$gene_id,$info) = split(/\s\|\s/, $gene_header); + $synonym = $gene_id; }elsif ($species eq "Caenorhabditis_elegans") { my ($locus_id,$unknown,$gene_id,$temp) = split(/\|/, $gene_header); $synonym = $gene_id; #??? @@ -556,11 +574,13 @@ sub find_gene_synonym { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Drosophila_melanogaster") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Eucalyptus_grandis") { my ($gene_id,$transcript_id) = split(/\|/,$gene_header); @@ -570,15 +590,26 @@ sub find_gene_synonym { $gene_id =~ s/\-v1\.0\-hybrid//; $mrna_id =~ s/\-v1\.0\-hybrid//; $synonym = $mrna_id; + }elsif ($species eq "Fusarium_graminearum") { + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Fusarium_oxysporum.4287") { + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Fusarium_verticillioides") { + my ($transcript, $gene_id, $info) = split(/\s\|\s/, $gene_header); + $synonym = $gene_id; }elsif ($species eq "Gadus_morhua") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Gasterosteus_aculeatus") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Glycine_max") { my ($locus_id,$isomer) = split(/\|/,$gene_header); @@ -590,11 +621,13 @@ sub find_gene_synonym { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Hordeum_vulgare") { my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Linum_usitatissimum") { my ($transcript,$gene_id) = split(/\|/, $gene_header); @@ -618,11 +651,19 @@ sub find_gene_synonym { my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Musa_acuminata") { my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); $transcript =~ s/transcript\://; $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Nectria_haematococca") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; $synonym = "$peptide,$gene_id"; }elsif ($species eq "Neurospora_crassa") { my ($transcript,$gene_id,$temp) = split(/\s\|\s/,$gene_header); @@ -632,124 +673,125 @@ sub find_gene_synonym { $synonym = $gene_id; }elsif ($species eq "Oncorhynchus_mykiss") { my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header); - $synonym = $number; - }elsif ($species eq "Oryza_sativa.indica.gramene") { - my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene_id =~ s/gene\://; - $synonym = "$peptide,$gene_id"; - }elsif ($species eq "Oryza_sativa.japonica.IRGSP") { - my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene_id =~ s/gene\://; - $synonym = "$peptide,$gene_id"; + $synonym = $gene_id; + }elsif ($species eq "Oryza_sativa.indica.gramene") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Oryza_sativa.japonica.IRGSP") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Oryzias_latipes") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Pediculus_humanus") { - my ($source,$gene_pa,$func,$unknown,$gene_id) = split(/\|/, $gene_header); + my ($gene_pa,$func,$func2,$unknown,$gene_id) = split(/[\|\s+]/, $gene_header); $gene_id =~ s/gene\://; - $gene = $gene_pa; + $synonym = $gene_id; }elsif ($species eq "Phoenix_dactylifera") { - my ($gene_id, $temp) = split(/\s/, $gene_header); - $gene = $gene_id; + my ($gene_id, $locus_tag, $product, $strand) = split(/\s\[/, $gene_header); + $locus_tag =~ s/locus_tag=//; + $locus_tag =~ s/"//g; + $synonym = $locus_tag; }elsif ($species eq "Phyllostachys_heterocycla") { my ($gene_id, $gene_model, $location, $dot, $strand, $dot2, $info) = split (/\s+/, $gene_header); $gene = $gene_id; }elsif ($species eq "Physcomitrella_patens") { - my ($name,$pac_id) = split(/\|/,$gene_header); - $gene = $name; #??? + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $synonym = $gene_id; + }elsif ($species eq "Phytophthora_infestans") { + my ($transcript, $gene_id, $function) = split(/\s\|\s/,$gene_header); + $synonym = $gene_id; }elsif ($species eq "Populus_trichocarpa") { my ($gene_id,$transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + $synonym = $gene_id; }elsif ($species eq "Prunus_persica") { - my ($gene_id,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($transcript,$gene_id) = split(/\|/,$gene_header); + $synonym = $transcript; }elsif ($species eq "Rattus_norvegicus") { - my ($gene_id,$transcript,$chrom) = split(/\|/, $gene_header); - $gene = $gene_id; - }elsif ($species eq "Rhizopus_oryzae") { - my ($gene_id,$func) = split(/\s\|\s/, $gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Ricinus_communis") { - my ($name,$transcript,$gene_id,$temp) = split(/\|/, $gene_header); - $gene = $transcript; + my ($transcript,$model) = split(/\|/, $gene_header); + $synonym = $model; }elsif ($species eq "Saccharomyces_cerevisiae") { - my ($gene_id,$gene_name,$transcript,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Schizosaccharomyces_pombe") { - my ($gene_id,$gene_name,$unknown,$func,$name,$chrom,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Selaginella_moellendorffii") { - my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header); - $gene = $prot_id; #??? - }elsif ($species eq "Solanum_tuberosum") { - my ($protein,$transcript,$type) = split(/\s/,$gene_header); - $gene = $protein; + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $synonym = $prot_id; + }elsif ($species eq "Setaria_italica") { + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $synonym = $locus_id; }elsif ($species eq "Solanum_lycopersicum") { - my ($protein,$type,$location,$gene_1,$transcript) = split(/\s/,$gene_header); - $gene = $protein; + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $synonym = $locus_id; + }elsif ($species eq "Solanum_tuberosum") { + my ($locus_id,$prot_id) = split(/\|/,$gene_header); + $synonym = $locus_id; }elsif ($species eq "Sorghum_bicolor") { my ($gene_id,$transcript_id) = split(/\|/,$gene_header); $synonym = $gene_id; - }elsif ($species eq "Synechocystis_pcc6803") { - my ($gene_id,$temp) = split(" ",$gene_header); - $gene = $gene_id; #??? }elsif ($species eq "Takifugu_rubripes") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Tetraodon_nigroviridis") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Theobroma_cacao") { - my ($gene_id,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "TmDV92") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "TmG3116") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Tolypocladium_inflatum") { - my ($gene_id, $type, $info) = split(/\s/, $gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_atroviride") { - my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_reesii") { - my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Trichoderma_virens") { - my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); - $gene = $gene_id; + my ($gene_id,$transcript) = split(/\s/,$gene_header); + $synonym = $transcript; }elsif ($species eq "Trichodesmium_erythraeum") { my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); $gene = $gene_id; }elsif ($species eq "Triticum_aestivum") { - my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); - $gene = $transcript; - }elsif ($species eq "Triticum_monococcumDV92") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; - }elsif ($species eq "Triticum_monococcumG3116") { - my ($gene_id,$temp) = split(/\s+/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $peptide =~ s/\:pep//; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Triticum_urartu") { - my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); - $transcript =~ s/transcript\://; - $gene_id =~ s/gene\://; - $synonym = "$peptide,$gene_id"; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; }elsif ($species eq "Vitis_vinifera") { - #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header); - $gene = $gene_header; #??? + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $synonym = $transcript_id; }elsif ($species eq "Zea_mays") { my ($gene_id,$transcript) = split(/\|/,$gene_header); - $synonym = $gene_id; + $synonym = $transcript; }else { - die "Error: Gene id can not be found for species $species!"; + $synonym = "null"; } - return $gene; + return $synonym; } @@ -760,7 +802,7 @@ sub all_species_array { push (@spec_array, "Amborella_trichopoda"); push (@spec_array, "Arabidopsis_lyrata"); push (@spec_array, "Arabidopsis_thaliana"); - #push (@spec_array, "Batrachochytrium_dendrobatidis"); + push (@spec_array, "Batrachochytrium_dendrobatidis"); push (@spec_array, "Brachypodium_distachyon"); push (@spec_array, "Brachypodium_sylvaticum_Corvallis"); push (@spec_array, "Brachypodium_sylvaticum_Greece"); @@ -777,13 +819,16 @@ sub all_species_array { push (@spec_array, "Danio_rerio"); push (@spec_array, "Drosophila_melanogaster"); push (@spec_array, "Ectocarpus_siliculosus"); + push (@spec_array, "Elaphocordyceps_capitata"); + push (@spec_array, "Elaphocordyceps_ophioglossoides"); + push (@spec_array, "Elaphocordyceps_paradoxa"); push (@spec_array, "Epichloe_festuca"); push (@spec_array, "Escherichia_coli"); push (@spec_array, "Eucalyptus_grandis"); push (@spec_array, "Fragaria_vesca"); push (@spec_array, "Fusarium_graminearum"); - push (@spec_array, "Fusarium_oxysporum"); - push (@spec_array, "Fusarium_verticilliodes"); + push (@spec_array, "Fusarium_oxysporum.4287"); + push (@spec_array, "Fusarium_verticillioides"); push (@spec_array, "Gadus_morhua"); push (@spec_array, "Gasterosteus_aculeatus"); push (@spec_array, "Glycine_max"); @@ -804,12 +849,26 @@ sub all_species_array { push (@spec_array, "Neurospora_crassa"); push (@spec_array, "Nostoc_punctiforme"); push (@spec_array, "Oncorhynchus_mykiss"); - push (@spec_array, "Oryza_sativa"); + push (@spec_array, "Oryza_barthii"); + push (@spec_array, "Oryza_brachyantha"); + push (@spec_array, "Oryza_glaberrima"); + push (@spec_array, "Oryza_glumaepatula"); + push (@spec_array, "Oryza_longistaminata"); + push (@spec_array, "Oryza_meridionalis"); + push (@spec_array, "Oryza_nivara"); + push (@spec_array, "Oryza_punctata"); + push (@spec_array, "Oryza_rufipogon"); + push (@spec_array, "Oryza_sativa.indica.gramene"); + push (@spec_array, "Oryza_sativa.indica.iplant"); + push (@spec_array, "Oryza_sativa.japonica.iplant"); + push (@spec_array, "Oryza_sativa.japonica.IRGSP"); + push (@spec_array, "Oryza_sativa.japonica.MSU"); push (@spec_array, "Oryzias_latipes"); push (@spec_array, "Pediculus_humanus"); - push (@spec_array, "Phoenix_dactylifera"); push (@spec_array, "Phyllostachys_heterocycla"); push (@spec_array, "Physcomitrella_patens"); + push (@spec_array, "Phytophthora_infestans"); + push (@spec_array, "Picea_abies"); push (@spec_array, "Populus_trichocarpa"); push (@spec_array, "Prunus_persica"); push (@spec_array, "Rattus_norvegicus"); @@ -818,6 +877,7 @@ sub all_species_array { push (@spec_array, "Saccharomyces_cerevisiae"); push (@spec_array, "Schizosaccharomyces_pombe"); push (@spec_array, "Selaginella_moellendorffii"); + push (@spec_array, "Setaria_italica"); push (@spec_array, "Solanum_tuberosum"); push (@spec_array, "Solanum_lycopersicum"); push (@spec_array, "Sorghum_bicolor"); @@ -825,16 +885,11 @@ sub all_species_array { push (@spec_array, "Takifugu_rubripes"); push (@spec_array, "Tetraodon_nigroviridis"); push (@spec_array, "Theobroma_cacao"); - push (@spec_array, "TmDV92"); - push (@spec_array, "TmG3116"); push (@spec_array, "Tolypocladium_inflatum"); - push (@spec_array, "Trichoderma_atroviride"); - push (@spec_array, "Trichoderma_reesii"); - push (@spec_array, "Trichoderma_virens"); push (@spec_array, "Trichodesmium_erythraeum"); push (@spec_array, "Triticum_aestivum"); - push (@spec_array, "Triticum_monococcumDV92"); - push (@spec_array, "Triticum_monococcumG3116"); + push (@spec_array, "Triticum_monococcum.DV92"); + push (@spec_array, "Triticum_monococcum.G3116"); push (@spec_array, "Triticum_urartu"); push (@spec_array, "Vitis_vinifera"); push (@spec_array, "Zea_mays"); @@ -844,7 +899,7 @@ sub all_species_array { sub retrieval_info { my $file = $_[0]; - $file =~ /([a-zA-Z]*\_+[a-zA-Z0-9]+)\_((?:[a-zA-Z0-9+-.]+\_?)+)\_(\d+\_\d+\_\d+).fa/; + $file =~ /([a-zA-Z]*\_+[a-zA-Z0-9.]+)\_((?:[a-zA-Z0-9+-.]+\_?)+)\_(\d+\_\d+\_\d+).fa/; my $species = $1; my $method = $2; my $date = $3;