Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be seen immediately.
Added new species and fixed for new downloads. Also added some reporting to the...
author: elserj <elserj@localhost>
Sat, 18 Jan 2014 00:25:32 +0000 (00:25 +0000)
committer: elserj <elserj@localhost>
Sat, 18 Jan 2014 00:25:32 +0000 (00:25 +0000)
svn path=/; revision=523

interactome_scripts/fasta_verify.pl
interactome_scripts/fasta_verify_loop.sh
interactome_scripts/find_species.pl

index 92b7c946a68125c3b2cf129b6eaad4a08665971a..58beb937f7e9b957645c91b2ef06072046d8f158 100755 (executable)
@@ -27,6 +27,9 @@ my $counter = 0;
 
 my %gene_id_hash;
 
+my $error_counter = 0;
+my $gene_counter = 0;
+
 while(<in_file>) {
        my $line = $_;
        chomp $line;
@@ -36,33 +39,47 @@ while(<in_file>) {
        my $curr_line_is_header = 0;
        if($line =~ /^>/) {
                $curr_line_is_header = 1;
+               $gene_counter++;
        }
        
        if($curr_line_is_header == 1 && $prev_line_is_header == 1) {
                print "Error: Header found with no sequence data on line $counter in file $in_file\n";
+               $error_counter++;
+       }
+       
+       if($line =~ /Sequence\sunavailable/ && $prev_line_is_header == 1) {
+                       print "Error: Sequence unavailable text found on line $counter in file $in_file\n";
+                       $error_counter++;
        }
        
        # detect if line after header is blank
        if($line !~ /^[a-zA-Z*]/ && $prev_line_is_header == 1) {
                print "Error: Blank line found on line $counter in file $in_file\n";
+               $error_counter++;
        }
        
        # detect if header line has tabs in it
        if($curr_line_is_header ==1 && $line =~ /\t/) {
                print "Error: Tab character in header on line $counter in file $in_file\n";
                print "Reccommend using sed 's/\\t/\|/g' to replace tabs with pipe symbols\n";
+               $error_counter++;
        }
 
        # set prev_line_header info for next time through loop
        if($curr_line_is_header) {
                $prev_line_is_header = 1;
                my $gene = find_gene($line,$species);
+               #my $gene = $line; # use if already gone through fasta_header_fix.pl
                if(!defined($gene_id_hash{$gene})) {
                        $gene_id_hash{$gene} = $line;
                }else{
-                       print "Error: gene $gene has multiple sequences associated with it in file $in_file\n"
+                       print "Error: gene $gene has multiple sequences associated with it in file $in_file\n";
+                       $error_counter++;
                }
        }else{
                $prev_line_is_header = 0;
        }
 }
+
+print "Number of errors = $error_counter\n";
+print "Number of genes  = $gene_counter\n";
index e2b6910e0b920cb7a42c04046897d24617af5b6d..3e3ed8f6e5695b4dd1e8e490459239f130a821e4 100755 (executable)
@@ -2,5 +2,6 @@
 
 for i in *.fa
 do
+       echo $i
        fasta_verify.pl $i
 done
index 614aafc1bceea01e3de6605b655356e84857e5c3..48317a13f9e96d5c63a33391da44799dcb0576d7 100755 (executable)
 
 sub find_species {
        my $temp = $_[0];
+       my $species;
        if ($temp =~ /Aegilops\_tauschii/) {
-               $temp = "Aegilops_tauschii";
+               $species = "Aegilops_tauschii";
+       }elsif ($temp =~ /Amborella\_trichopoda/) {
+               $species = "Amborella_trichopoda";
        }elsif ($temp =~ /Arabidopsis\_lyrata/) {
-               $temp = "Arabidopsis_lyrata";
+               $species = "Arabidopsis_lyrata";
        }elsif ($temp =~ /Arabidopsis\_thaliana/) {
-               $temp = "Arabidopsis_thaliana";
+               $species = "Arabidopsis_thaliana";
        }elsif ($temp =~ /Batrachochytrium/) {
-               $temp = "Batrachochytrium_distachyon";
-       }elsif ($temp =~ /Brachypodium/) {
-               $temp = "Brachypodium_distachyon";
+               $species = "Batrachochytrium_dendrobatidis";
+       }elsif ($temp =~ /Brachypodium\_distachyon/) {
+               $species = "Brachypodium_distachyon";
+       }elsif ($temp =~ /Brachypodium\_sylvaticum\_Corvallis/) {
+               $species = "Brachypodium_sylvaticum_Corvallis";
+       }elsif ($temp =~ /Brachypodium\_sylvaticum\_Greece/) {
+               $species = "Brachypodium_sylvaticum_Greece";
+       }elsif ($temp =~ /Brachypodium\_sylvaticum\_Spain/) {
+               $species = "Brachypodium_sylvaticum_Spain";
+       }elsif ($temp =~ /Brassica\_rapa/) {
+               $species = "Brassica_rapa";
        }elsif ($temp =~ /elegans/) {
-               $temp = "Caenorhabditis_elegans";
+               $species = "Caenorhabditis_elegans";
+       }elsif ($temp =~ /Cajanus\_cajan/) {
+               $species = "Cajanus_cajan";
        }elsif ($temp =~ /Carica\_papaya/) {
-               $temp = "Carica_papaya";
+               $species = "Carica_papaya";
        }elsif ($temp =~ /Chlamy/) {
-               $temp = "Chlamydomonas_reinhardtii";
+               $species = "Chlamydomonas_reinhardtii";
+       }elsif ($temp =~ /Cicer\_arietinum/) {
+               $species = "Cicer_arietinum";
        }elsif ($temp =~ /Citrus\_clementina/) {
-               $temp = "Citrus_clementina";
+               $species = "Citrus_clementina";
        }elsif ($temp =~ /Citrus\_sinensis/) {
-               $temp = "Citrus_sinensis";
+               $species = "Citrus_sinensis";
        }elsif ($temp =~ /Cucumis\_sativus/) {
-               $temp = "Cucumis_sativus";
+               $species = "Cucumis_sativus";
        }elsif ($temp =~ /Danio/) {
-               $temp = "Danio_rerio";
+               $species = "Danio_rerio";
        }elsif ($temp =~ /Drosophila/) {
-               $temp = "Drosophila_melanogaster";
+               $species = "Drosophila_melanogaster";
        }elsif ($temp =~ /Ectocarpus/) {
-               $temp = "Ectocarpus_siliculosus";
+               $species = "Ectocarpus_siliculosus";
        }elsif ($temp =~ /Epichloe_festuca/) {
-               $temp = "Epichloe\_festuca";
+               $species = "Epichloe\_festuca";
        }elsif ($temp =~ /Escherichia\_coli/) {
-               $temp = "Escherichia_coli";
+               $species = "Escherichia_coli";
        }elsif ($temp =~ /Eucalyptus\_grandis/) {
-               $temp = "Eucalyptus_grandis";
+               $species = "Eucalyptus_grandis";
        }elsif ($temp =~ /Fragaria/) {
-               $temp = "Fragaria_vesca";
+               $species = "Fragaria_vesca";
        }elsif ($temp =~ /Fusarium_graminearum/) {
-               $temp = "Fusarium\_graminearum";
+               $species = "Fusarium\_graminearum";
        }elsif ($temp =~ /Fusarium_oxysporum/) {
-               $temp = "Fusarium\_oxysporum";
+               $species = "Fusarium\_oxysporum";
        }elsif ($temp =~ /Fusarium_verticilliodes/) {
-               $temp = "Fusarium\_verticilliodes";
+               $species = "Fusarium\_verticilliodes";
        }elsif ($temp =~ /Gadus_morhua/) {
-               $temp = "Gadus\_morhua";
-       }elsif ($temp =~ /Gastroerosteus/) {
-               $temp = "Gastroerosteus\_aculeatus";
+               $species = "Gadus\_morhua";
+       }elsif ($temp =~ /Gasterosteus/) {
+               $species = "Gasterosteus\_aculeatus";
        }elsif ($temp =~ /Glycine/) {
-               $temp = "Glycine_max";
+               $species = "Glycine_max";
+       }elsif ($temp =~ /Gossypium\_raimondii/) {
+               $species = "Gossypium_raimondii";
        }elsif ($temp =~ /Homo\_sapiens/) {
-               $temp = "Homo_sapiens";
+               $species = "Homo_sapiens";
        }elsif ($temp =~ /Hordeum\_vulgare/) {
-               $temp = "Hordeum_vulgare";
+               $species = "Hordeum_vulgare";
        }elsif ($temp =~ /Jatropha/) {
-               $temp = "Jatropha_curcas";
+               $species = "Jatropha_curcas";
        }elsif ($temp =~ /Laccaria/) {
-               $temp = "Laccaria_bicolor";
+               $species = "Laccaria_bicolor";
+       }elsif ($temp =~ /Leersia\_perrieri/) {
+               $species = "Leersia_perrieri";
+       }elsif ($temp =~ /Linum\_usitatissimum/) {
+               $species = "Linum_usitatissimum";
        }elsif ($temp =~ /Magnaporthe/) {
-               $temp = "Magnaporthe_grissa";
+               $species = "Magnaporthe_grissa";
        }elsif ($temp =~ /Malus/) {
-               $temp = "Malus_domestica";
+               $species = "Malus_domestica";
        }elsif ($temp =~ /Manihot/) {
-               $temp = "Manihot_esculenta";
+               $species = "Manihot_esculenta";
        }elsif ($temp =~ /Medicago/) {
-               $temp = "Medicago_truncatula";
+               $species = "Medicago_truncatula";
        }elsif ($temp =~ /Mimulus/) {
-               $temp = "Mimulus_guttatus";
+               $species = "Mimulus_guttatus";
        }elsif ($temp =~ /Mus\_musculus/) {
-               $temp = "Mus_musculus";
+               $species = "Mus_musculus";
        }elsif ($temp =~ /Musa/) {
-               $temp = "Musa_acuminata";
+               $species = "Musa_acuminata";
        }elsif ($temp =~ /Nectria\_haematococca/) {
-               $temp = "Nectria_haematococca";
+               $species = "Nectria_haematococca";
        }elsif ($temp =~ /Neurospora/) {
-               $temp = "Neurospora_crassa";
+               $species = "Neurospora_crassa";
        }elsif ($temp =~ /Nostoc/) {
-               $temp = "Nostoc_punctiforme";
+               $species = "Nostoc_punctiforme";
        }elsif ($temp =~ /Oncorhynchus\_mykiss/) {
-               $temp = "Oncorhynchus\_mykiss";
-       }elsif ($temp =~ /Oryza\_sativa/) {
-               $temp = "Oryza_sativa";
+               $species = "Oncorhynchus\_mykiss";
+       }elsif ($temp =~ /Oryza\_barthii/) {
+               $species = "Oryza_barthii";
+       }elsif ($temp =~ /Oryza\_brachyantha/) {
+               $species = "Oryza_brachyantha";
+       }elsif ($temp =~ /Oryza\_glaberrima/) {
+               $species = "Oryza_glaberrima";
+       }elsif ($temp =~ /Oryza\_glumaepatula/) {
+               $species = "Oryza_glumaepatula";
+       }elsif ($temp =~ /Oryza\_longistaminata/) {
+               $species = "Oryza_longistaminata";
+       }elsif ($temp =~ /Oryza\_meridionalis/) {
+               $species = "Oryza_meridionalis";
+       }elsif ($temp =~ /Oryza\_nivara/) {
+               $species = "Oryza_nivara";
+       }elsif ($temp =~ /Oryza\_punctata/) {
+               $species = "Oryza_punctata";
+       }elsif ($temp =~ /Oryza\_rufipogon/) {
+               $species = "Oryza_rufipogon";
+       }elsif ($temp =~ /Oryza\_sativa.indica\_gramene/) {
+               $species = "Oryza_sativa.indica.gramene";
+       }elsif ($temp =~ /Oryza\_sativa.indica\_iplant/) {
+               $species = "Oryza_sativa.indica.iplant";
+       }elsif ($temp =~ /Oryza\_sativa.japonica.IRGSP/) {
+               $species = "Oryza_sativa.japonica.IRGSP";
+       }elsif ($temp =~ /Oryza\_sativa.japonica.MSU/) {
+               $species = "Oryza_sativa.japonica.MSU";
+       }elsif ($temp =~ /Oryza\_sativa.japonica\_iplant/) {
+               $species = "Oryza_sativa.japonica.iplant";
        }elsif ($temp =~ /Oryzias\_latipes/) {
-               $temp = "Oryzias\_latipes";
+               $species = "Oryzias\_latipes";
        }elsif ($temp =~ /Pediculus/) {
-               $temp = "Pediculus_humanus";
+               $species = "Pediculus_humanus";
        }elsif ($temp =~ /Phoenix/) {
-               $temp = "Phoenix_dactylifera";
+               $species = "Phoenix_dactylifera";
        }elsif ($temp =~ /Phyllostachys/) {
-               $temp = "Phyllostachys_heterocycla";
+               $species = "Phyllostachys_heterocycla";
        }elsif ($temp =~ /Physcomit/) {
-               $temp = "Physcomitrella_patens";
+               $species = "Physcomitrella_patens";
        }elsif ($temp =~ /Populus/) {
-               $temp = "Populus_trichocarpa";
+               $species = "Populus_trichocarpa";
        }elsif ($temp =~ /Prunus/) {
-               $temp = "Prunus_persica";
+               $species = "Prunus_persica";
        }elsif ($temp =~ /Rattus/) {
-               $temp = "Rattus_norvegicus";
+               $species = "Rattus_norvegicus";
        }elsif ($temp =~ /Rhizopus/) {
-               $temp = "Rhizopus_oryzae";
+               $species = "Rhizopus_oryzae";
        }elsif ($temp =~ /Ricinus/) {
-               $temp = "Ricinus_communis";
+               $species = "Ricinus_communis";
        }elsif ($temp =~ /cerevisiae/) {
-               $temp = "Saccharomyces_cerevisiae";
+               $species = "Saccharomyces_cerevisiae";
        }elsif ($temp =~ /pombe/) {
-               $temp = "Schizosaccharomyces_pombe";
+               $species = "Schizosaccharomyces_pombe";
        }elsif ($temp =~ /Selaginella/) {
-               $temp = "Selaginella_moellendorffii";
+               $species = "Selaginella_moellendorffii";
        }elsif ($temp =~ /Solanum_tuberosum/) {
-                       $temp = "Solanum_tuberosum";
+               $species = "Solanum_tuberosum";
        }elsif ($temp =~ /Solanum_lycopersicum/) {
-                       $temp = "Solanum_lycopersicum";
+               $species = "Solanum_lycopersicum";
        }elsif ($temp =~ /Sorghum/) {
-               $temp = "Sorghum_bicolor";
+               $species = "Sorghum_bicolor";
        }elsif ($temp =~ /Synechocystis/) {
-               $temp = "Synechocystis_pcc6803";
+               $species = "Synechocystis_pcc6803";
        }elsif ($temp =~ /Takifugu/) {
-               $temp = "Takifugu\_rubripes";
+               $species = "Takifugu\_rubripes";
        }elsif ($temp =~ /Tetraodon/) {
-               $temp = "Tetraodon\_nigroviridis";
+               $species = "Tetraodon\_nigroviridis";
        }elsif ($temp =~ /Theobroma\_cacao/) {
-               $temp = "Theobroma_cacao";
+               $species = "Theobroma_cacao";
        }elsif ($temp =~ /TmDV92/) {
-               $temp = "TmDV92";
+               $species = "TmDV92";
        }elsif ($temp =~ /TmG3116/) {
-               $temp = "TmG3116";
+               $species = "TmG3116";
        }elsif ($temp =~ /Tolypocladium\_inflatum/) {
-               $temp = "Tolypocladium_inflatum";
+               $species = "Tolypocladium_inflatum";
        }elsif ($temp =~ /Trichoderma\_atroviride/) {
-               $temp = "Trichoderma_atroviride";
+               $species = "Trichoderma_atroviride";
        }elsif ($temp =~ /Trichoderma\_reesii/) {
-               $temp = "Trichoderma_reesii";
+               $species = "Trichoderma_reesii";
        }elsif ($temp =~ /Trichoderma\_virens/) {
-               $temp = "Trichoderma_virens";
+               $species = "Trichoderma_virens";
        }elsif ($temp =~ /Trichodesmium/) {
-               $temp = "Trichodesmium_erythraeum";
+               $species = "Trichodesmium_erythraeum";
        }elsif ($temp =~ /triticum\_aestivum/) {
-               $temp = "Triticum_aestivum";
+               $species = "Triticum_aestivum";
+       }elsif ($temp =~ /Triticum\_monococcumDV92/) {
+               $species = "Triticum_monococcumDV92";
+       }elsif ($temp =~ /Triticum\_monococcumG3116/) {
+               $species = "Triticum_monococcumG3116";
        }elsif ($temp =~ /Triticum\_urartu/) {
-               $temp = "Triticum_urartu";
+               $species = "Triticum_urartu";
        }elsif ($temp =~ /Vitis\_vinifera/) {
-               $temp = "Vitis_vinifera";
+               $species = "Vitis_vinifera";
        }elsif ($temp =~ /Zea\_mays/) {
-               $temp = "Zea_mays";
+               $species = "Zea_mays";
        }else {
                die "Error: Species can not be found from file name $temp!";
        }
-       return $temp;
+       return $species;
 }
 
 sub find_gene {
@@ -167,54 +218,76 @@ sub find_gene {
        if ($species eq "Aegilops_tauschii") {
                my ($gene_id, $type, $location, $info) = split(/\s/, $gene_header);
                $gene = $gene_id;
+       }elsif ($species eq "Amborella_trichopoda") {
+               $gene = $gene_header;
        }elsif ($species eq "Arabidopsis_lyrata") {
-               my ($name,$gene_id,$scaffold,$isomer) = split(/\|/, $gene_header);
-               $gene = $isomer;
+               my ($name,$gene_id) = split(/\|/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Arabidopsis_thaliana") {
-               my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header);
+               my ($gene_id,$isomer) = split(/\|/, $gene_header);
                $gene = $isomer;
-       }elsif ($species eq "Batrachochytrium_distachyon") {
+       }elsif ($species eq "Batrachochytrium_dendrobatidis") {
                my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Brachypodium_distachyon") {
-               my ($name,$gene_id,$chrom,$isomer) = split(/\|/, $gene_header);
+               my ($gene_id,$isomer) = split(/\|/, $gene_header);
                $gene = $isomer;
+       }elsif ($species eq "Brachypodium_sylvaticum_Corvallis") {
+               my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Brachypodium_sylvaticum_Greece") {
+               $gene = $gene_header;
+       }elsif ($species eq "Brachypodium_sylvaticum_Spain") {
+               my ($gene_id,$temp,$start,$end) = split(/\|/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Brassica_rapa") {
+               my ($gene_id,$transcript) = split(/\|/, $gene_header);
+               $gene = $transcript; #????
        }elsif ($species eq "Caenorhabditis_elegans") {
                my ($locus_id,$unknown,$gene_id,$temp) = split(/\|/, $gene_header);
                $gene = $locus_id; #???
+       }elsif ($species eq "Cajanus_cajan") {
+               my ($gene_id,$type,$locus,$temp) = split(/\s/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Carica_papaya") {
-               my $supercontig = $gene_header;
-               $gene = $supercontig;
+               my ($contig,$gene_id) = split(/\|/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Chlamydomonas_reinhardtii") {
-               my ($name,$locus_id,$chrom,$transcript) = split(/\|/,$gene_header);
+               my ($locus,$transcript) = split(/\|/,$gene_header);
                $gene = $transcript; #???
+       }elsif ($species eq "Cicer_arietinum") {
+               my ($gi,$number,$ref,$gene_id,$function) = split(/\|/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Citrus_clementina") {
-               my ($gene_id,$pacid) = split(/\|/,$gene_header);
+               my ($transcript_id,$gene_id) = split(/\|/,$gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Citrus_sinensis") {
-               my ($gene_id,$pacid) = split(/\|/,$gene_header);
+               my ($transcript_id,$gene_id) = split(/\|/,$gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Cucumis_sativus") {
-               my ($name,$main_isomer,$scaffold,$isomer) = split(/\|/,$gene_header);
+               my ($gene_id,$isomer) = split(/\|/,$gene_header);
                $gene = $isomer;
        }elsif ($species eq "Danio_rerio") {
-               my ($gene_id,$transcript) = split(/\|/,$gene_header);
-               $gene = $gene_id;
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene = $transcript;
        }elsif ($species eq "Drosophila_melanogaster") {
-               my ($gene_id,$temp) = split(/\s/,$gene_header);
-               $gene = $gene_id;
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene = $transcript;
        }elsif ($species eq "Ectocarpus_siliculosus") {
-               my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header);
+               my ($gene_id,$temp) = split(/\|/, $gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Epichloe_festuca") {
                $gene = $gene_header;
        }elsif ($species eq "Escherichia_coli") {
                $gene = $gene_header; #???
        }elsif ($species eq "Eucalyptus_grandis") {
-               my ($gene_id,$pacid) = split(/\|/,$gene_header);
-               $gene = $gene_id;
+               my ($gene_id,$transcript_id) = split(/\|/,$gene_header);
+               $gene = $transcript_id;
        }elsif ($species eq "Fragaria_vesca") {
-               my ($gene_id, $mrna_id, $method, $length) = split(/\s\|\s/, $gene_header);
+               my ($gene_id, $mrna_id) = split(/\|/, $gene_header);
+               $gene_id =~ s/\-v1\.0\-hybrid//;
                $gene = $gene_id;
        }elsif ($species eq "Fusarium_graminearum") {
                my ($gene_id, $info) = split(/\s\|\s/, $gene_header);
@@ -225,19 +298,26 @@ sub find_gene {
                my ($gene_id, $info) = split(/\s\|\s/, $gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Gadus_morhua") {
-               my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
-               $gene = $transcript_id;
-       }elsif ($species eq "Gastroerosteus_aculeatus") {
-               my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
-               $gene = $transcript_id;
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene = $transcript;
+       }elsif ($species eq "Gasterosteus_aculeatus") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene = $transcript;
        }elsif ($species eq "Glycine_max") {
-               my ($name,$locus_id,$isomer,$chrom) = split(/\|/,$gene_header);
+               my ($locus_id,$isomer) = split(/\|/,$gene_header);
+               $gene = $isomer;
+       }elsif ($species eq "Gossypium_raimondii") {
+               my ($locus_id,$isomer) = split(/\|/,$gene_header);
                $gene = $isomer;
        }elsif ($species eq "Homo_sapiens") {
-               my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header);
-               $gene = $protein;
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene = $transcript;
        }elsif ($species eq "Hordeum_vulgare") {
-               my ($gene_id,$transcript) = split(/\|/,$gene_header);
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
                $gene = $transcript;
        }elsif ($species eq "Jatropha_curcas") {
                my ($gene_id, $temp) = split(/\s/,$gene_header);
@@ -245,19 +325,26 @@ sub find_gene {
        }elsif ($species eq "Laccaria_bicolor") {
                my ($source,$spec,$gene_id,$scaffold) = split(/\|/, $gene_header);
                $gene = $gene_id;
+       }elsif ($species eq "Leersia_perrieri") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Linum_usitatissimum") {
+               my ($transcript,$gene_id) = split(/\|/, $gene_header);
+               $gene = $gene_id;
        }elsif ($species eq "Magnaporthe_grissa") {
                my ($transcript,$gene_id,$temp) = split(/\s\|\s/, $gene_header);
                $gene = $transcript;
        }elsif ($species eq "Malus_domestica" ) {
-               $gene = $gene_header;
+               my ($gene_id1,$gene_id2) = split(/\|/, $gene_header);
+               $gene = $gene_id2;
        }elsif ($species eq "Manihot_esculenta") {
-               my ($name,$scaffold,$gene_id,$temp) = split(/\|/, $gene_header);
+               my ($transcript,$gene_id) = split(/\|/, $gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Medicago_truncatula") {
-               my ($name,$gene_id,$chrom,$temp) = split(/\|/, $gene_header);
-               $gene = $gene_id;
+               my ($gene_id,$transcript) = split(/\|/, $gene_header);
+               $gene = $transcript;
        }elsif ($species eq "Mimulus_guttatus") {
-               my ($name,$gene_id,$scaffold,$temp) = split(/\|/, $gene_header);
+               my ($gene_id,$transcript) = split(/\|/, $gene_header);
                $gene = $gene_id;
        }elsif ($species eq "Mus_musculus") {
                my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header);
@@ -275,11 +362,55 @@ sub find_gene {
                my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header);
                $gene = $transcript;
        }elsif ($species eq "Oncorhynchus_mykiss") {
-               my ($gene_id, $offset, $coord_1, $coord_2) = split(/\s/,$gene_header);
+               my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header);
                $gene = $gene_id;
-       }elsif ($species eq "Oryza_sativa") {
-               my ($isomer,$temp,$type) = split(/\|/,$gene_header);
-               $gene = $isomer;
+       }elsif ($species eq "Leersia_perrieri") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_barthii") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_brachyantha") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_glaberrima") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_glumaepatula") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_longistaminata") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_meridionalis") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_nivara") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_punctata") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_rufipogon") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_sativa.indica.iplant") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_sativa.indica.gramene") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene = $transcript;
+       }elsif ($species eq "Oryza_sativa.japonica.iplant") {
+               my ($gene_id,$chrom,$program,$function,$type) = split(/[\|\s]+/, $gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Oryza_sativa.japonica.IRGSP") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene = $transcript;
+       }elsif ($species eq "Oryza_sativa.japonica.MSU") {
+               my ($gene_id,$transcript) = split(/\|/,$gene_header);
+                $gene = $transcript;
        }elsif ($species eq "Oryzias_latipes") {
                my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
                $gene = $transcript_id;
@@ -365,17 +496,22 @@ sub find_gene {
        }elsif ($species eq "Triticum_aestivum") {
                my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header);
                $gene = $transcript;
-       }elsif ($species eq "Triticum_urartu") {
-               my ($gene_id, $type, $location, $info) = split(/\s/, $gene_header);
+       }elsif ($species eq "Triticum_monococcumDV92") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
+               $gene = $gene_id;
+       }elsif ($species eq "Triticum_monococcumG3116") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
                $gene = $gene_id;
+       }elsif ($species eq "Triticum_urartu") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene = $transcript;
        }elsif ($species eq "Vitis_vinifera") {
                #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header);
                $gene = $gene_header; #???
        }elsif ($species eq "Zea_mays") {
-               #my ($transcript,$seq_type,$coord,$parent_transcript,$parent_gene) = split(/;\s/, $gene_header);
-               my ($spec, $gene_id, $unknown, $transcript) = split(/\|/, $gene_header);
+               my ($gene_id,$transcript) = split(/\|/,$gene_header);
                $gene = $transcript;
-               $gene =~ s/\sseq=translation//g;
                
        }else {
                die "Error: Gene id can not be found for species $species!";
@@ -383,17 +519,258 @@ sub find_gene {
        return $gene;
 }
 
+
+sub find_gene_synonym { # Parse a FASTA header and return the synonym id(s) for the given species; dies on an unknown species.
+       my $gene_header = $_[0]; # FASTA header line, with or without the leading ">"
+       $gene_header =~ s/^>//; # strip off the header line identifier, if it isn't already
+       my $species = $_[1]; # species name; compared exactly (eq) against the branches below
+       my $synonym; # the value returned to the caller; every branch assigns it
+       if ($species eq "Batrachochytrium_dendrobatidis") {
+               my ($gene_id,$transcript,$info) = split(/\s\|\s/, $gene_header);
+               $synonym = $transcript;
+       }elsif ($species eq "Caenorhabditis_elegans") {
+               my ($locus_id,$unknown,$gene_id,$temp) = split(/\|/, $gene_header);
+               $synonym = $gene_id; #???
+       }elsif ($species eq "Cajanus_cajan") {
+               my ($gene_id,$type,$locus,$temp) = split(/\s/, $gene_header);
+               $synonym = $locus;
+       }elsif ($species eq "Carica_papaya") {
+               my ($contig,$gene_id) = split(/\|/, $gene_header);
+               $synonym = $contig;
+       }elsif ($species eq "Chlamydomonas_reinhardtii") {
+               my ($locus,$transcript) = split(/\|/,$gene_header);
+               $synonym = $locus; #???
+       }elsif ($species eq "Cicer_arietinum") {
+               my ($gi,$number,$ref,$gene_id,$function) = split(/\|/, $gene_header);
+               $synonym = $number;
+       }elsif ($species eq "Citrus_clementina") {
+               my ($transcript_id,$gene_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id;
+       }elsif ($species eq "Citrus_sinensis") {
+               my ($transcript_id,$gene_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id;
+       }elsif ($species eq "Cucumis_sativus") {
+               my ($gene_id,$isomer) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Danio_rerio") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Drosophila_melanogaster") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Eucalyptus_grandis") {
+               my ($gene_id,$transcript_id) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Fragaria_vesca") {
+               my ($gene_id, $mrna_id) = split(/\|/, $gene_header);
+               $gene_id =~ s/\-v1\.0\-hybrid//;
+               $mrna_id =~ s/\-v1\.0\-hybrid//;
+               $synonym = $mrna_id;
+       }elsif ($species eq "Gadus_morhua") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Gasterosteus_aculeatus") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Glycine_max") {
+               my ($locus_id,$isomer) = split(/\|/,$gene_header);
+               $synonym = $isomer; # NOTE(review): value carried over from the former $gene assignment - confirm it is the intended synonym
+       }elsif ($species eq "Gossypium_raimondii") {
+               my ($locus_id,$isomer) = split(/\|/,$gene_header);
+               $synonym = $locus_id;
+       }elsif ($species eq "Homo_sapiens") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Hordeum_vulgare") {
+               my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Linum_usitatissimum") {
+               my ($transcript,$gene_id) = split(/\|/, $gene_header);
+               $synonym = $transcript;
+       }elsif ($species eq "Magnaporthe_grissa") {
+               my ($transcript,$gene_id,$temp) = split(/\s\|\s/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Malus_domestica" ) {
+               my ($gene_id1,$gene_id2) = split(/\|/, $gene_header);
+               $synonym = $gene_id1;
+       }elsif ($species eq "Manihot_esculenta") {
+               my ($transcript,$gene_id) = split(/\|/, $gene_header);
+               $synonym = $transcript;
+       }elsif ($species eq "Medicago_truncatula") {
+               my ($gene_id,$transcript) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Mimulus_guttatus") {
+               my ($gene_id,$transcript) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Mus_musculus") {
+               my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$protein,$gene_id"; # first field is named $protein in this branch (was an undeclared $peptide)
+       }elsif ($species eq "Musa_acuminata") {
+               my ($protein,$type,$chrom,$gene_id,$transcript) = split(/\s/,$gene_header);
+               $transcript =~ s/transcript\://;
+               $gene_id =~ s/gene\://;
+               $synonym = "$protein,$gene_id"; # first field is named $protein in this branch (was an undeclared $peptide)
+       }elsif ($species eq "Neurospora_crassa") {
+               my ($transcript,$gene_id,$temp) = split(/\s\|\s/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Nostoc_punctiforme") {
+               my ($temp,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Oncorhynchus_mykiss") {
+               my ($gi, $number, $type, $gene_id, $function) = split(/\|/,$gene_header);
+               $synonym = $number;
+        }elsif ($species eq "Oryza_sativa.indica.gramene") {
+                my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene_id =~ s/gene\://;
+                $synonym = "$peptide,$gene_id";
+        }elsif ($species eq "Oryza_sativa.japonica.IRGSP") {
+                my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene_id =~ s/gene\://;
+                $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Oryzias_latipes") {
+               my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id; # NOTE(review): from here down to Vitis_vinifera most values were formerly assigned to $gene - confirm each is the intended synonym
+       }elsif ($species eq "Pediculus_humanus") {
+               my ($source,$gene_pa,$func,$unknown,$gene_id) = split(/\|/, $gene_header);
+               $gene_id =~ s/gene\://;
+               $synonym = $gene_pa;
+       }elsif ($species eq "Phoenix_dactylifera") {
+               my ($gene_id, $temp) = split(/\s/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Phyllostachys_heterocycla") {
+               my ($gene_id, $gene_model, $location, $dot, $strand, $dot2, $info) = split (/\s+/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Physcomitrella_patens") {
+               my ($name,$pac_id) = split(/\|/,$gene_header);
+               $synonym = $name; #???
+       }elsif ($species eq "Populus_trichocarpa") {
+               my ($gene_id,$transcript_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id;
+       }elsif ($species eq "Prunus_persica") {
+               my ($gene_id,$temp) = split(/\s/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Rattus_norvegicus") {
+               my ($gene_id,$transcript,$chrom) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Rhizopus_oryzae") {
+               my ($gene_id,$func) = split(/\s\|\s/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Ricinus_communis") {
+               my ($name,$transcript,$gene_id,$temp) = split(/\|/, $gene_header);
+               $synonym = $transcript;
+       }elsif ($species eq "Saccharomyces_cerevisiae") {
+               my ($gene_id,$gene_name,$transcript,$temp) = split(/\s/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Schizosaccharomyces_pombe") {
+               my ($gene_id,$gene_name,$unknown,$func,$name,$chrom,$temp) = split(/\s/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Selaginella_moellendorffii") {
+               my ($name,$locus_id,$chrom_id,$prot_id) = split(/\|/,$gene_header);
+               $synonym = $prot_id; #???
+       }elsif ($species eq "Solanum_tuberosum") {
+               my ($protein,$transcript,$type) = split(/\s/,$gene_header);
+               $synonym = $protein;
+       }elsif ($species eq "Solanum_lycopersicum") {
+               my ($protein,$type,$location,$gene_1,$transcript) = split(/\s/,$gene_header);
+               $synonym = $protein;
+       }elsif ($species eq "Sorghum_bicolor") {
+               my ($gene_id,$transcript_id) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Synechocystis_pcc6803") {
+               my ($gene_id,$temp) = split(" ",$gene_header);
+               $synonym = $gene_id; #???
+       }elsif ($species eq "Takifugu_rubripes") {
+               my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id;
+       }elsif ($species eq "Tetraodon_nigroviridis") {
+               my ($gene_id, $transcript_id) = split(/\|/,$gene_header);
+               $synonym = $transcript_id;
+       }elsif ($species eq "Theobroma_cacao") {
+               my ($gene_id,$temp) = split(/\s/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "TmDV92") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "TmG3116") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Tolypocladium_inflatum") {
+               my ($gene_id, $type, $info) = split(/\s/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Trichoderma_atroviride") {
+               my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Trichoderma_reesii") {
+               my ($source, $spec, $int_id, $gene_id) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Trichoderma_virens") {
+               my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Trichodesmium_erythraeum") {
+               my ($temp,$unknown,$source,$gene_id,$func) = split(/\|/, $gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Triticum_aestivum") {
+               my ($type,$transcript,$source,$gene_id,$func) = split(/\|/, $gene_header);
+               $synonym = $transcript;
+       }elsif ($species eq "Triticum_monococcumDV92") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Triticum_monococcumG3116") {
+               my ($gene_id,$temp) = split(/\s+/,$gene_header);
+               $synonym = $gene_id;
+       }elsif ($species eq "Triticum_urartu") {
+                my ($peptide,$temp,$chrom,$gene_id,$transcript,$gene_type,$transcript_type) = split(/\s/,$gene_header);
+                $transcript =~ s/transcript\://;
+                $gene_id =~ s/gene\://;
+                $synonym = "$peptide,$gene_id";
+       }elsif ($species eq "Vitis_vinifera") {
+               #my ($name,$gene_id,$chrom_id,$id) = split(/\|/,$gene_header);
+               $synonym = $gene_header; #???
+       }elsif ($species eq "Zea_mays") {
+               my ($gene_id,$transcript) = split(/\|/,$gene_header);
+               $synonym = $gene_id;
+
+       }else {
+               die "Error: Gene id can not be found for species $species!";
+       }
+       # Return the lexical $synonym set by the matching branch (previously an undeclared $gene was returned).
+       return $synonym;
+}
+
 sub all_species_array {
        # return all species in an array
        my @spec_array;
        push (@spec_array, "Aegilops_tauschii");
+       push (@spec_array, "Amborella_trichopoda");
        push (@spec_array, "Arabidopsis_lyrata");
        push (@spec_array, "Arabidopsis_thaliana");
-       push (@spec_array, "Batrachochytrium_distachyon");
+       #push (@spec_array, "Batrachochytrium_dendrobatidis");
        push (@spec_array, "Brachypodium_distachyon");
+       push (@spec_array, "Brachypodium_sylvaticum_Corvallis");
+       push (@spec_array, "Brachypodium_sylvaticum_Greece");
+       push (@spec_array, "Brachypodium_sylvaticum_Spain");
+       push (@spec_array, "Brassica_rapa");
        push (@spec_array, "Caenorhabditis_elegans");
+       push (@spec_array, "Cajanus_cajan");
        push (@spec_array, "Carica_papaya");
        push (@spec_array, "Chlamydomonas_reinhardtii");
+       push (@spec_array, "Cicer_arietinum");
        push (@spec_array, "Citrus_clementina");
        push (@spec_array, "Citrus_sinensis");
        push (@spec_array, "Cucumis_sativus");
@@ -408,12 +785,14 @@ sub all_species_array {
        push (@spec_array, "Fusarium_oxysporum");
        push (@spec_array, "Fusarium_verticilliodes");
        push (@spec_array, "Gadus_morhua");
-       push (@spec_array, "Gastroerosteus_aculeatus");
+       push (@spec_array, "Gasterosteus_aculeatus");
        push (@spec_array, "Glycine_max");
+       push (@spec_array, "Gossypium_raimondii");
        push (@spec_array, "Homo_sapiens");
        push (@spec_array, "Hordeum_vulgare");
        push (@spec_array, "Jatropha_curcas");
        push (@spec_array, "Laccaria_bicolor");
+       push (@spec_array, "Linum_usitatissimum");
        push (@spec_array, "Magnaporthe_grissa");
        push (@spec_array, "Malus_domestica");
        push (@spec_array, "Manihot_esculenta");
@@ -454,6 +833,8 @@ sub all_species_array {
        push (@spec_array, "Trichoderma_virens");
        push (@spec_array, "Trichodesmium_erythraeum");
        push (@spec_array, "Triticum_aestivum");
+       push (@spec_array, "Triticum_monococcumDV92");
+       push (@spec_array, "Triticum_monococcumG3116");
        push (@spec_array, "Triticum_urartu");
        push (@spec_array, "Vitis_vinifera");
        push (@spec_array, "Zea_mays");
@@ -463,7 +844,7 @@ sub all_species_array {
 
 sub retrieval_info {
        my $file = $_[0];
-       $file =~ /([a-zA-Z]*\_[a-zA-Z0-9]*)\_((?:[a-zA-Z0-9]+\_?)+)\_(\d+\_\d+\_\d+).fa/;
+       $file =~ /([a-zA-Z]*\_+[a-zA-Z0-9]+)\_((?:[a-zA-Z0-9+-.]+\_?)+)\_(\d+\_\d+\_\d+).fa/;
        my $species = $1;
        my $method = $2;
        my $date = $3;