From: elserj Date: Sat, 18 Jan 2014 00:25:32 +0000 (+0000) Subject: Added new species and fixed for new downloads. Also added some reporting to the... X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=bd5b6541cc20880126185f4998fe5263a5870dd1;p=old-jaiswallab-svn%2F.git Added new species and fixed for new downloads. Also added some reporting to the verify script. svn path=/; revision=523 --- diff --git a/interactome_scripts/fasta_verify.pl b/interactome_scripts/fasta_verify.pl index 92b7c94..58beb93 100755 --- a/interactome_scripts/fasta_verify.pl +++ b/interactome_scripts/fasta_verify.pl @@ -27,6 +27,9 @@ my $counter = 0; my %gene_id_hash; +my $error_counter = 0; +my $gene_counter = 0; + while() { my $line = $_; chomp $line; @@ -36,33 +39,47 @@ while() { my $curr_line_is_header = 0; if($line =~ /^>/) { $curr_line_is_header = 1; + $gene_counter++; } if($curr_line_is_header == 1 && $prev_line_is_header == 1) { print "Error: Header found with no sequence data on line $counter in file $in_file\n"; + $error_counter++; + } + + if($line =~ /Sequence\sunavailable/ && $prev_line_is_header == 1) { + print "Error: Sequence unavailable text found on line $counter in file $in_file\n"; + $error_counter++; } # detect if line after header is blank if($line !~ /^[a-zA-Z*]/ && $prev_line_is_header == 1) { print "Error: Blank line found on line $counter in file $in_file\n"; + $error_counter++; } # detect if header line has tabs in it if($curr_line_is_header ==1 && $line =~ /\t/) { print "Error: Tab character in header on line $counter in file $in_file\n"; print "Reccommend using sed 's/\\t/\|/g' to replace tabs with pipe symbols\n"; + $error_counter++; } # set prev_line_header info for next time through loop if($curr_line_is_header) { $prev_line_is_header = 1; my $gene = find_gene($line,$species); + #my $gene = $line; # use if already gone through fasta_header_fix.pl if(!defined($gene_id_hash{$gene})) { $gene_id_hash{$gene} = $line; }else{ - print "Error: gene $gene has multiple sequences associated with it in file $in_file\n" + print "Error: gene $gene has multiple sequences associated with it in file $in_file\n"; + $error_counter++; } }else{ $prev_line_is_header = 0; } } + +print "Number of errors = $error_counter\n"; +print "Number of genes = $gene_counter\n"; diff --git a/interactome_scripts/fasta_verify_loop.sh b/interactome_scripts/fasta_verify_loop.sh index e2b6910..3e3ed8f 100755 --- a/interactome_scripts/fasta_verify_loop.sh +++ b/interactome_scripts/fasta_verify_loop.sh @@ -2,5 +2,6 @@ for i in *.fa do + echo $i fasta_verify.pl $i done diff --git a/interactome_scripts/find_species.pl b/interactome_scripts/find_species.pl index 614aafc..48317a1 100755 --- a/interactome_scripts/find_species.pl +++ b/interactome_scripts/find_species.pl @@ -11,152 +11,203 @@ sub find_species { my $temp = $_[0]; + my $species; if ($temp =~ /Aegilops\_tauschii/) { - $temp = "Aegilops_tauschii"; + $species = "Aegilops_tauschii"; + }elsif ($temp =~ /Amborella\_trichopoda/) { + $species = "Amborella_trichopoda"; }elsif ($temp =~ /Arabidopsis\_lyrata/) { - $temp = "Arabidopsis_lyrata"; + $species = "Arabidopsis_lyrata"; }elsif ($temp =~ /Arabidopsis\_thaliana/) { - $temp = "Arabidopsis_thaliana"; + $species = "Arabidopsis_thaliana"; }elsif ($temp =~ /Batrachochytrium/) { - $temp = "Batrachochytrium_distachyon"; - }elsif ($temp =~ /Brachypodium/) { - $temp = "Brachypodium_distachyon"; + $species = "Batrachochytrium_dendrobatidis"; + }elsif ($temp =~ /Brachypodium\_distachyon/) { + $species = "Brachypodium_distachyon"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\_Corvallis/) { + $species = "Brachypodium_sylvaticum_Corvallis"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\_Greece/) { + $species = "Brachypodium_sylvaticum_Greece"; + }elsif ($temp =~ /Brachypodium\_sylvaticum\_Spain/) { + $species = "Brachypodium_sylvaticum_Spain"; + }elsif ($temp =~ /Brassica\_rapa/) { + $species = "Brassica_rapa"; }elsif ($temp =~ /elegans/) { - $temp = "Caenorhabditis_elegans"; + $species = "Caenorhabditis_elegans"; + }elsif ($temp =~ /Cajanus\_cajan/) { + $species = "Cajanus_cajan"; }elsif ($temp =~ /Carica\_papaya/) { - $temp = "Carica_papaya"; + $species = "Carica_papaya"; }elsif ($temp =~ /Chlamy/) { - $temp = "Chlamydomonas_reinhardtii"; + $species = "Chlamydomonas_reinhardtii"; + }elsif ($temp =~ /Cicer\_arietinum/) { + $species = "Cicer_arietinum"; }elsif ($temp =~ /Citrus\_clementina/) { - $temp = "Citrus_clementina"; + $species = "Citrus_clementina"; }elsif ($temp =~ /Citrus\_sinensis/) { - $temp = "Citrus_sinensis"; + $species = "Citrus_sinensis"; }elsif ($temp =~ /Cucumis\_sativus/) { - $temp = "Cucumis_sativus"; + $species = "Cucumis_sativus"; }elsif ($temp =~ /Danio/) { - $temp = "Danio_rerio"; + $species = "Danio_rerio"; }elsif ($temp =~ /Drosophila/) { - $temp = "Drosophila_melanogaster"; + $species = "Drosophila_melanogaster"; }elsif ($temp =~ /Ectocarpus/) { - $temp = "Ectocarpus_siliculosus"; + $species = "Ectocarpus_siliculosus"; }elsif ($temp =~ /Epichloe_festuca/) { - $temp = "Epichloe\_festuca"; + $species = "Epichloe\_festuca"; }elsif ($temp =~ /Escherichia\_coli/) { - $temp = "Escherichia_coli"; + $species = "Escherichia_coli"; }elsif ($temp =~ /Eucalyptus\_grandis/) { - $temp = "Eucalyptus_grandis"; + $species = "Eucalyptus_grandis"; }elsif ($temp =~ /Fragaria/) { - $temp = "Fragaria_vesca"; + $species = "Fragaria_vesca"; }elsif ($temp =~ /Fusarium_graminearum/) { - $temp = "Fusarium\_graminearum"; + $species = "Fusarium\_graminearum"; }elsif ($temp =~ /Fusarium_oxysporum/) { - $temp = "Fusarium\_oxysporum"; + $species = "Fusarium\_oxysporum"; }elsif ($temp =~ /Fusarium_verticilliodes/) { - $temp = "Fusarium\_verticilliodes"; + $species = "Fusarium\_verticilliodes"; }elsif ($temp =~ /Gadus_morhua/) { - $temp = "Gadus\_morhua"; - }elsif ($temp =~ /Gastroerosteus/) { - $temp = "Gastroerosteus\_aculeatus"; + $species = "Gadus\_morhua"; + }elsif ($temp =~ /Gasterosteus/) { + $species = "Gasterosteus\_aculeatus"; }elsif ($temp =~ /Glycine/) { - $temp = "Glycine_max"; + $species = "Glycine_max"; + }elsif ($temp =~ /Gossypium\_raimondii/) { + $species = "Gossypium_raimondii"; }elsif ($temp =~ /Homo\_sapiens/) { - $temp = "Homo_sapiens"; + $species = "Homo_sapiens"; }elsif ($temp =~ /Hordeum\_vulgare/) { - $temp = "Hordeum_vulgare"; + $species = "Hordeum_vulgare"; }elsif ($temp =~ /Jatropha/) { - $temp = "Jatropha_curcas"; + $species = "Jatropha_curcas"; }elsif ($temp =~ /Laccaria/) { - $temp = "Laccaria_bicolor"; + $species = "Laccaria_bicolor"; + }elsif ($temp =~ /Leersia\_perrieri/) { + $species = "Leersia_perrieri"; + }elsif ($temp =~ /Linum\_usitatissimum/) { + $species = "Linum_usitatissimum"; }elsif ($temp =~ /Magnaporthe/) { - $temp = "Magnaporthe_grissa"; + $species = "Magnaporthe_grissa"; }elsif ($temp =~ /Malus/) { - $temp = "Malus_domestica"; + $species = "Malus_domestica"; }elsif ($temp =~ /Manihot/) { - $temp = "Manihot_esculenta"; + $species = "Manihot_esculenta"; }elsif ($temp =~ /Medicago/) { - $temp = "Medicago_truncatula"; + $species = "Medicago_truncatula"; }elsif ($temp =~ /Mimulus/) { - $temp = "Mimulus_guttatus"; + $species = "Mimulus_guttatus"; }elsif ($temp =~ /Mus\_musculus/) { - $temp = "Mus_musculus"; + $species = "Mus_musculus"; }elsif ($temp =~ /Musa/) { - $temp = "Musa_acuminata"; + $species = "Musa_acuminata"; }elsif ($temp =~ /Nectria\_haematococca/) { - $temp = "Nectria_haematococca"; + $species = "Nectria_haematococca"; }elsif ($temp =~ /Neurospora/) { - $temp = "Neurospora_crassa"; + $species = "Neurospora_crassa"; }elsif ($temp =~ /Nostoc/) { - $temp = "Nostoc_punctiforme"; + $species = "Nostoc_punctiforme"; }elsif ($temp =~ /Oncorhynchus\_mykiss/) { - $temp = "Oncorhynchus\_mykiss"; - }elsif ($temp =~ /Oryza\_sativa/) { - $temp = "Oryza_sativa"; + $species = "Oncorhynchus\_mykiss"; + }elsif ($temp =~ /Oryza\_barthii/) { + $species = "Oryza_barthii"; + }elsif ($temp =~ /Oryza\_brachyantha/) { + $species = "Oryza_brachyantha"; + }elsif ($temp =~ /Oryza\_glaberrima/) { + $species = "Oryza_glaberrima"; + }elsif ($temp =~ /Oryza\_glumaepatula/) { + $species = "Oryza_glumaepatula"; + }elsif ($temp =~ /Oryza\_longistaminata/) { + $species = "Oryza_longistaminata"; + }elsif ($temp =~ /Oryza\_meridionalis/) { + $species = "Oryza_meridionalis"; + }elsif ($temp =~ /Oryza\_nivara/) { + $species = "Oryza_nivara"; + }elsif ($temp =~ /Oryza\_punctata/) { + $species = "Oryza_punctata"; + }elsif ($temp =~ /Oryza\_rufipogon/) { + $species = "Oryza_rufipogon"; + }elsif ($temp =~ /Oryza\_sativa.indica\_gramene/) { + $species = "Oryza_sativa.indica.gramene"; + }elsif ($temp =~ /Oryza\_sativa.indica\_iplant/) { + $species = "Oryza_sativa.indica.iplant"; + }elsif ($temp =~ /Oryza\_sativa.japonica.IRGSP/) { + $species = "Oryza_sativa.japonica.IRGSP"; + }elsif ($temp =~ /Oryza\_sativa.japonica.MSU/) { + $species = "Oryza_sativa.japonica.MSU"; + }elsif ($temp =~ /Oryza\_sativa.japonica\_iplant/) { + $species = "Oryza_sativa.japonica.iplant"; }elsif ($temp =~ /Oryzias\_latipes/) { - $temp = "Oryzias\_latipes"; + $species = "Oryzias\_latipes"; }elsif ($temp =~ /Pediculus/) { - $temp = "Pediculus_humanus"; + $species = "Pediculus_humanus"; }elsif ($temp =~ /Phoenix/) { - $temp = "Phoenix_dactylifera"; + $species = "Phoenix_dactylifera"; }elsif ($temp =~ /Phyllostachys/) { - $temp = "Phyllostachys_heterocycla"; + $species = "Phyllostachys_heterocycla"; }elsif ($temp =~ /Physcomit/) { - $temp = "Physcomitrella_patens"; + $species = "Physcomitrella_patens"; }elsif ($temp =~ /Populus/) { - $temp = "Populus_trichocarpa"; + $species = "Populus_trichocarpa"; }elsif ($temp =~ /Prunus/) { - $temp = "Prunus_persica"; + $species = "Prunus_persica"; }elsif ($temp =~ /Rattus/) { - $temp = "Rattus_norvegicus"; + $species = "Rattus_norvegicus"; }elsif ($temp =~ /Rhizopus/) { - $temp = "Rhizopus_oryzae"; + $species = "Rhizopus_oryzae"; }elsif ($temp =~ /Ricinus/) { - $temp = "Ricinus_communis"; + $species = "Ricinus_communis"; }elsif ($temp =~ /cerevisiae/) { - $temp = "Saccharomyces_cerevisiae"; + $species = "Saccharomyces_cerevisiae"; }elsif ($temp =~ /pombe/) { - $temp = "Schizosaccharomyces_pombe"; + $species = "Schizosaccharomyces_pombe"; }elsif ($temp =~ /Selaginella/) { - $temp = "Selaginella_moellendorffii"; + $species = "Selaginella_moellendorffii"; }elsif ($temp =~ /Solanum_tuberosum/) { - $temp = "Solanum_tuberosum"; + $species = "Solanum_tuberosum"; }elsif ($temp =~ /Solanum_lycopersicum/) { - $temp = "Solanum_lycopersicum"; + $species = "Solanum_lycopersicum"; }elsif ($temp =~ /Sorghum/) { - $temp = "Sorghum_bicolor"; + $species = "Sorghum_bicolor"; }elsif ($temp =~ /Synechocystis/) { - $temp = "Synechocystis_pcc6803"; + $species = "Synechocystis_pcc6803"; }elsif ($temp =~ /Takifugu/) { - $temp = "Takifugu\_rubripes"; + $species = "Takifugu\_rubripes"; }elsif ($temp =~ /Tetraodon/) { - $temp = "Tetraodon\_nigroviridis"; + $species = "Tetraodon\_nigroviridis"; }elsif ($temp =~ /Theobroma\_cacao/) { - $temp = "Theobroma_cacao"; + $species = "Theobroma_cacao"; }elsif ($temp =~ /TmDV92/) { - $temp = "TmDV92"; + $species = "TmDV92"; }elsif ($temp =~ /TmG3116/) { - $temp = "TmG3116"; + $species = "TmG3116"; }elsif ($temp =~ /Tolypocladium\_inflatum/) { - $temp = "Tolypocladium_inflatum"; + $species = "Tolypocladium_inflatum"; }elsif ($temp =~ /Trichoderma\_atroviride/) { - $temp = "Trichoderma_atroviride"; + $species = "Trichoderma_atroviride"; }elsif ($temp =~ /Trichoderma\_reesii/) { - $temp = "Trichoderma_reesii"; + $species = "Trichoderma_reesii"; }elsif ($temp =~ /Trichoderma\_virens/) { - $temp = "Trichoderma_virens"; + $species = "Trichoderma_virens"; }elsif ($temp =~ /Trichodesmium/) { - $temp = "Trichodesmium_erythraeum"; + $species = "Trichodesmium_erythraeum"; }elsif ($temp =~ /triticum\_aestivum/) { - $temp = "Triticum_aestivum"; + $species = "Triticum_aestivum"; + }elsif ($temp =~ /Triticum\_monococcumDV92/) { + $species = "Triticum_monococcumDV92"; + }elsif ($temp =~ /Triticum\_monococcumG3116/) { + $species = "Triticum_monococcumG3116"; }elsif ($temp =~ /Triticum\_urartu/) { - $temp = "Triticum_urartu"; + $species = "Triticum_urartu"; }elsif ($temp =~ /Vitis\_vinifera/) { - $temp = "Vitis_vinifera"; + $species = "Vitis_vinifera"; }elsif ($temp =~ /Zea\_mays/) { - $temp = "Zea_mays"; + $species = "Zea_mays"; }else { die "Error: Species can not be found from file name $temp!"; } - return $temp; + return $species; } sub find_gene { @@ -167,54 +218,76 @@ sub find_gene { if ($species eq "Aegilops_tauschii") { my ($gene_id, $type, $location, $info) = split(/\s/, $gene_header); $gene = $gene_id; + }elsif ($species eq "Amborella_trichopoda") { + $gene = $gene_header; }elsif ($species eq "Arabidopsis_lyrata") { - my ($name,$gene_id,$scaffold,$isomer) = split(/\|/, $gene_header); - $gene = $isomer; + my ($name,$gene_id) = split(/\|/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Arabidopsis_thaliana") { - my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header); + my ($gene_id,$isomer) = split(/\|/, $gene_header); $gene = $isomer; - }elsif ($species eq "Batrachochytrium_distachyon") { + }elsif ($species eq "Batrachochytrium_dendrobatidis") { my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header); $gene = $gene_id; }elsif ($species eq "Brachypodium_distachyon") { - my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header); + my ($gene_id,$isomer) = split(/\|/, $gene_header); $gene = $isomer; + }elsif ($species eq "Brachypodium_sylvaticum_Corvallis") { + my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Brachypodium_sylvaticum_Greece") { + $gene = $gene_header; + }elsif ($species eq "Brachypodium_sylvaticum_Spain") { + my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Brassica_rapa") { + my ($gene_id,$transcript) = split(/\|/, $gene_header); + $gene = $transcript; #???? }elsif ($species eq "Caenorhabditis_elegans") { my ($locus_id,$unknown,$gene_id,$temp) = split(/\|/, $gene_header); $gene = $locus_id; #??? + }elsif ($species eq "Cajanus_cajan") { + my ($gene_id,$type,$locus,$temp) = split(/\s/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Carica_papaya") { - my $supercontig = $gene_header; - $gene = $supercontig; + my ($contig,$gene_id) = split(/\|/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Chlamydomonas_reinhardtii") { - my ($name,$locus_id,$chrom,$transcript) = split(/\|/,$gene_header); + my ($locus,$transcript) = split(/\|/,$gene_header); $gene = $transcript; #??? + }elsif ($species eq "Cicer_arietinum") { + my ($gi,$number,$ref,$gene_id,$function) = split(/\|/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Citrus_clementina") { - my ($gene_id,$pacid) = split(/\|/,$gene_header); + my ($transcript_id,$gene_id) = split(/\|/,$gene_header); $gene = $gene_id; }elsif ($species eq "Citrus_sinensis") { - my ($gene_id,$pacid) = split(/\|/,$gene_header); + my ($transcript_id,$gene_id) = split(/\|/,$gene_header); $gene = $gene_id; }elsif ($species eq "Cucumis_sativus") { - my ($name,$main_isomer,$scaffold,$isomer) = split(/\|/,$gene_header); + my ($gene_id,$isomer) = split(/\|/,$gene_header); $gene = $isomer; }elsif ($species eq "Danio_rerio") { - my ($gene_id,$transcript) = split(/\|/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Drosophila_melanogaster") { - my ($gene_id,$temp) = split(/\s/,$gene_header); - $gene = $gene_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Ectocarpus_siliculosus") { - my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); + my ($gene_id,$temp) = split(/\|/, $gene_header); $gene = $gene_id; }elsif ($species eq "Epichloe_festuca") { $gene = $gene_header; }elsif ($species eq "Escherichia_coli") { $gene = $gene_header; #??? }elsif ($species eq "Eucalyptus_grandis") { - my ($gene_id,$pacid) = split(/\|/,$gene_header); - $gene = $gene_id; + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; }elsif ($species eq "Fragaria_vesca") { - my ($gene_id, $mrna_id, $method, $length) = split(/\s\|\s/, $gene_header); + my ($gene_id, $mrna_id) = split(/\|/, $gene_header); + $gene_id =~ s/\-v1\.0\-hybrid//; $gene = $gene_id; }elsif ($species eq "Fusarium_graminearum") { my ($gene_id, $info) = split(/\s\|\s/, $gene_header); @@ -225,19 +298,26 @@ sub find_gene { my ($gene_id, $info) = split(/\s\|\s/, $gene_header); $gene = $gene_id; }elsif ($species eq "Gadus_morhua") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; - }elsif ($species eq "Gastroerosteus_aculeatus") { - my ($gene_id, $transcript_id) = split(/\|/,$gene_header); - $gene = $transcript_id; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; + }elsif ($species eq "Gasterosteus_aculeatus") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Glycine_max") { - my ($name,$locus_id,$isomer,$chrom) = split(/\|/,$gene_header); + my ($locus_id,$isomer) = split(/\|/,$gene_header); + $gene = $isomer; + }elsif ($species eq "Gossypium_raimondii") { + my ($locus_id,$isomer) = split(/\|/,$gene_header); $gene = $isomer; }elsif ($species eq "Homo_sapiens") { - my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); - $gene = $protein; + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Hordeum_vulgare") { - my ($gene_id,$transcript) = split(/\|/,$gene_header); + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; $gene = $transcript; }elsif ($species eq "Jatropha_curcas") { my ($gene_id, $temp) = split(/\s/,$gene_header); @@ -245,19 +325,26 @@ sub find_gene { }elsif ($species eq "Laccaria_bicolor") { my ($source,$spec,$gene_id,$scaffold) = split(/\|/, $gene_header); $gene = $gene_id; + }elsif ($species eq "Leersia_perrieri") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Linum_usitatissimum") { + my ($transcript,$gene_id) = split(/\|/, $gene_header); + $gene = $gene_id; }elsif ($species eq "Magnaporthe_grissa") { my ($transcript,$gene_id,$temp) = split(/\s\|\s/, $gene_header); $gene = $transcript; }elsif ($species eq "Malus_domestica" ) { - $gene = $gene_header; + my ($gene_id1,$gene_id2) = split(/\|/, $gene_header); + $gene = $gene_id2; }elsif ($species eq "Manihot_esculenta") { - my ($name,$scaffold,$gene_id,$temp) = split(/\|/, $gene_header); + my ($transcript,$gene_id) = split(/\|/, $gene_header); $gene = $gene_id; }elsif ($species eq "Medicago_truncatula") { - my ($name,$gene_id,$chrom,$temp) = split(/\|/, $gene_header); - $gene = $gene_id; + my ($gene_id,$transcript) = split(/\|/, $gene_header); + $gene = $transcript; }elsif ($species eq "Mimulus_guttatus") { - my ($name,$gene_id,$scaffold,$temp) = split(/\|/, $gene_header); + my ($gene_id,$transcript) = split(/\|/, $gene_header); $gene = $gene_id; }elsif ($species eq "Mus_musculus") { my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); @@ -275,11 +362,55 @@ sub find_gene { my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); $gene = $transcript; }elsif ($species eq "Oncorhynchus_mykiss") { - my ($gene_id, $offset, $coord_1, $coord_2) = split(/\s/,$gene_header); + my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header); $gene = $gene_id; - }elsif ($species eq "Oryza_sativa") { - my ($isomer,$temp,$type) = split(/\|/,$gene_header); - $gene = $isomer; + }elsif ($species eq "Leersia_perrieri") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_barthii") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_brachyantha") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_glaberrima") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_glumaepatula") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_longistaminata") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_meridionalis") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_nivara") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_punctata") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_rufipogon") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_sativa.indica.iplant") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_sativa.indica.gramene") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; + }elsif ($species eq "Oryza_sativa.japonica.iplant") { + my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Oryza_sativa.japonica.IRGSP") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; + }elsif ($species eq "Oryza_sativa.japonica.MSU") { + my ($gene_id,$transcript) = split(/\|/,$gene_header); + $gene = $transcript; }elsif ($species eq "Oryzias_latipes") { my ($gene_id, $transcript_id) = split(/\|/,$gene_header); $gene = $transcript_id; @@ -365,17 +496,22 @@ sub find_gene { }elsif ($species eq "Triticum_aestivum") { my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); $gene = $transcript; - }elsif ($species eq "Triticum_urartu") { - my ($gene_id, $type, $location, $info) = split(/\s/, $gene_header); + }elsif ($species eq "Triticum_monococcumDV92") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Triticum_monococcumG3116") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); $gene = $gene_id; + }elsif ($species eq "Triticum_urartu") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene = $transcript; }elsif ($species eq "Vitis_vinifera") { #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header); $gene = $gene_header; #??? }elsif ($species eq "Zea_mays") { - #my ($transcript,$seq_type,$coord,$parent_transcript,$parent_gene) = split(/;\s/, $gene_header); - my ($spec, $gene_id, $unknown, $transcript) = split(/\|/, $gene_header); + my ($gene_id,$transcript) = split(/\|/,$gene_header); $gene = $transcript; - $gene =~ s/\sseq=translation//g; }else { die "Error: Gene id can not be found for species $species!"; @@ -383,17 +519,258 @@ sub find_gene { return $gene; } + +sub find_gene_synonym { + my $gene_header = $_[0]; + $gene_header =~ s/^>//; # strip off the header line identifier, if it isn't already + my $species = $_[1]; + my $synonym; + if ($species eq "Batrachochytrium_dendrobatidis") { + my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header); + $synonym = $transcript; + }elsif ($species eq "Caenorhabditis_elegans") { + my ($locus_id,$unknown,$gene_id,$temp) = split(/\|/, $gene_header); + $synonym = $gene_id; #??? + }elsif ($species eq "Cajanus_cajan") { + my ($gene_id,$type,$locus,$temp) = split(/\s/, $gene_header); + $synonym = $locus; + }elsif ($species eq "Carica_papaya") { + my ($contig,$gene_id) = split(/\|/, $gene_header); + $synonym = $contig; + }elsif ($species eq "Chlamydomonas_reinhardtii") { + my ($locus,$transcript) = split(/\|/,$gene_header); + $synonym = $locus; #??? + }elsif ($species eq "Cicer_arietinum") { + my ($gi,$number,$ref,$gene_id,$function) = split(/\|/, $gene_header); + $synonym = $number; + }elsif ($species eq "Citrus_clementina") { + my ($transcript_id,$gene_id) = split(/\|/,$gene_header); + $synonym = $transcript_id; + }elsif ($species eq "Citrus_sinensis") { + my ($transcript_id,$gene_id) = split(/\|/,$gene_header); + $synonym = $transcript_id; + }elsif ($species eq "Cucumis_sativus") { + my ($gene_id,$isomer) = split(/\|/,$gene_header); + $synonym = $gene_id; + }elsif ($species eq "Danio_rerio") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Drosophila_melanogaster") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Eucalyptus_grandis") { + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $synonym = $gene_id; + }elsif ($species eq "Fragaria_vesca") { + my ($gene_id, $mrna_id) = split(/\|/, $gene_header); + $gene_id =~ s/\-v1\.0\-hybrid//; + $mrna_id =~ s/\-v1\.0\-hybrid//; + $synonym = $mrna_id; + }elsif ($species eq "Gadus_morhua") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Gasterosteus_aculeatus") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Glycine_max") { + my ($locus_id,$isomer) = split(/\|/,$gene_header); + $gene = $isomer; + }elsif ($species eq "Gossypium_raimondii") { + my ($locus_id,$isomer) = split(/\|/,$gene_header); + $synonym = $locus_id; + }elsif ($species eq "Homo_sapiens") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Hordeum_vulgare") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Linum_usitatissimum") { + my ($transcript,$gene_id) = split(/\|/, $gene_header); + $synonym = $transcript; + }elsif ($species eq "Magnaporthe_grissa") { + my ($transcript,$gene_id,$temp) = split(/\s\|\s/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Malus_domestica" ) { + my ($gene_id1,$gene_id2) = split(/\|/, $gene_header); + $synonym = $gene_id1; + }elsif ($species eq "Manihot_esculenta") { + my ($transcript,$gene_id) = split(/\|/, $gene_header); + $synonym = $transcript; + }elsif ($species eq "Medicago_truncatula") { + my ($gene_id,$transcript) = split(/\|/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Mimulus_guttatus") { + my ($gene_id,$transcript) = split(/\|/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Mus_musculus") { + my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Musa_acuminata") { + my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Neurospora_crassa") { + my ($transcript,$gene_id,$temp) = split(/\s\|\s/,$gene_header); + $synonym = $gene_id; + }elsif ($species eq "Nostoc_punctiforme") { + my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); + $synonym = $gene_id; + }elsif ($species eq "Oncorhynchus_mykiss") { + my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header); + $synonym = $number; + }elsif ($species eq "Oryza_sativa.indica.gramene") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Oryza_sativa.japonica.IRGSP") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Oryzias_latipes") { + my ($gene_id, $transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; + }elsif ($species eq "Pediculus_humanus") { + my ($source,$gene_pa,$func,$unknown,$gene_id) = split(/\|/, $gene_header); + $gene_id =~ s/gene\://; + $gene = $gene_pa; + }elsif ($species eq "Phoenix_dactylifera") { + my ($gene_id, $temp) = split(/\s/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Phyllostachys_heterocycla") { + my ($gene_id, $gene_model, $location, $dot, $strand, $dot2, $info) = split (/\s+/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Physcomitrella_patens") { + my ($name,$pac_id) = split(/\|/,$gene_header); + $gene = $name; #??? + }elsif ($species eq "Populus_trichocarpa") { + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; + }elsif ($species eq "Prunus_persica") { + my ($gene_id,$temp) = split(/\s/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Rattus_norvegicus") { + my ($gene_id,$transcript,$chrom) = split(/\|/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Rhizopus_oryzae") { + my ($gene_id,$func) = split(/\s\|\s/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Ricinus_communis") { + my ($name,$transcript,$gene_id,$temp) = split(/\|/, $gene_header); + $gene = $transcript; + }elsif ($species eq "Saccharomyces_cerevisiae") { + my ($gene_id,$gene_name,$transcript,$temp) = split(/\s/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Schizosaccharomyces_pombe") { + my ($gene_id,$gene_name,$unknown,$func,$name,$chrom,$temp) = split(/\s/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Selaginella_moellendorffii") { + my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header); + $gene = $prot_id; #??? + }elsif ($species eq "Solanum_tuberosum") { + my ($protein,$transcript,$type) = split(/\s/,$gene_header); + $gene = $protein; + }elsif ($species eq "Solanum_lycopersicum") { + my ($protein,$type,$location,$gene_1,$transcript) = split(/\s/,$gene_header); + $gene = $protein; + }elsif ($species eq "Sorghum_bicolor") { + my ($gene_id,$transcript_id) = split(/\|/,$gene_header); + $synonym = $gene_id; + }elsif ($species eq "Synechocystis_pcc6803") { + my ($gene_id,$temp) = split(" ",$gene_header); + $gene = $gene_id; #??? + }elsif ($species eq "Takifugu_rubripes") { + my ($gene_id, $transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; + }elsif ($species eq "Tetraodon_nigroviridis") { + my ($gene_id, $transcript_id) = split(/\|/,$gene_header); + $gene = $transcript_id; + }elsif ($species eq "Theobroma_cacao") { + my ($gene_id,$temp) = split(/\s/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "TmDV92") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "TmG3116") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Tolypocladium_inflatum") { + my ($gene_id, $type, $info) = split(/\s/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Trichoderma_atroviride") { + my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Trichoderma_reesii") { + my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Trichoderma_virens") { + my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Trichodesmium_erythraeum") { + my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header); + $gene = $gene_id; + }elsif ($species eq "Triticum_aestivum") { + my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header); + $gene = $transcript; + }elsif ($species eq "Triticum_monococcumDV92") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Triticum_monococcumG3116") { + my ($gene_id,$temp) = split(/\s+/,$gene_header); + $gene = $gene_id; + }elsif ($species eq "Triticum_urartu") { + my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header); + $transcript =~ s/transcript\://; + $gene_id =~ s/gene\://; + $synonym = "$peptide,$gene_id"; + }elsif ($species eq "Vitis_vinifera") { + #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header); + $gene = $gene_header; #??? + }elsif ($species eq "Zea_mays") { + my ($gene_id,$transcript) = split(/\|/,$gene_header); + $synonym = $gene_id; + + }else { + die "Error: Gene id can not be found for species $species!"; + } + return $gene; + +} + sub all_species_array { # return all species in an array my @spec_array; push (@spec_array, "Aegilops_tauschii"); + push (@spec_array, "Amborella_trichopoda"); push (@spec_array, "Arabidopsis_lyrata"); push (@spec_array, "Arabidopsis_thaliana"); - push (@spec_array, "Batrachochytrium_distachyon"); + #push (@spec_array, "Batrachochytrium_dendrobatidis"); push (@spec_array, "Brachypodium_distachyon"); + push (@spec_array, "Brachypodium_sylvaticum_Corvallis"); + push (@spec_array, "Brachypodium_sylvaticum_Greece"); + push (@spec_array, "Brachypodium_sylvaticum_Spain"); + push (@spec_array, "Brassica_rapa"); push (@spec_array, "Caenorhabditis_elegans"); + push (@spec_array, "Cajanus_cajan"); push (@spec_array, "Carica_papaya"); push (@spec_array, "Chlamydomonas_reinhardtii"); + push (@spec_array, "Cicer_arietinum"); push (@spec_array, "Citrus_clementina"); push (@spec_array, "Citrus_sinensis"); push (@spec_array, "Cucumis_sativus"); @@ -408,12 +785,14 @@ sub all_species_array { push (@spec_array, "Fusarium_oxysporum"); push (@spec_array, "Fusarium_verticilliodes"); push (@spec_array, "Gadus_morhua"); - push (@spec_array, "Gastroerosteus_aculeatus"); + push (@spec_array, "Gasterosteus_aculeatus"); push (@spec_array, "Glycine_max"); + push (@spec_array, "Gossypium_raimondii"); push (@spec_array, "Homo_sapiens"); push (@spec_array, "Hordeum_vulgare"); push (@spec_array, "Jatropha_curcas"); push (@spec_array, "Laccaria_bicolor"); + push (@spec_array, "Linum_usitatissimum"); push (@spec_array, "Magnaporthe_grissa"); push (@spec_array, "Malus_domestica"); push (@spec_array, "Manihot_esculenta"); @@ -454,6 +833,8 @@ sub all_species_array { push (@spec_array, "Trichoderma_virens"); push (@spec_array, "Trichodesmium_erythraeum"); push (@spec_array, "Triticum_aestivum"); + push (@spec_array, "Triticum_monococcumDV92"); + push (@spec_array, "Triticum_monococcumG3116"); push (@spec_array, "Triticum_urartu"); push (@spec_array, "Vitis_vinifera"); push (@spec_array, "Zea_mays"); @@ -463,7 +844,7 @@ sub all_species_array { sub retrieval_info { my $file = $_[0]; - $file =~ /([a-zA-Z]*\_[a-zA-Z0-9]*)\_((?:[a-zA-Z0-9]+\_?)+)\_(\d+\_\d+\_\d+).fa/; + $file =~ /([a-zA-Z]*\_+[a-zA-Z0-9]+)\_((?:[a-zA-Z0-9+-.]+\_?)+)\_(\d+\_\d+\_\d+).fa/; my $species = $1; my $method = $2; my $date = $3;