return $synonym;
}
}
+
+
+# Subroutine to filter out non-canonical transcripts.
+#
+# Arguments:
+#   $species - species name, e.g. "Zea_mays" or "Theobroma_cacao"
+#   $gene    - gene/transcript identifier to classify
+#
+# Returns 1 when the identifier is treated as the canonical (first)
+# transcript of its gene, 0 when its suffix marks it as an alternative
+# transcript.
+sub trans_filter {
+    my ($species, $gene) = @_;
+
+    # A trailing ".N" with N in 2..99 marks an alternative transcript.
+    # Jatropha is exempt: its gene ids are .10, .20, .30, etc...
+    if ($species ne "Jatropha_curcas") {
+        return 0 if $gene =~ /\.(?:[2-9]|[1-9][0-9])$/;
+    }
+
+    # Maize genes that are not the first transcript (anything but _T001)
+    if ($species eq "Zea_mays") {
+        return 0 if $gene !~ /_T001$/;
+    }
+
+    # Theobroma genes that are not the first transcript. Covers t2..t99,
+    # the same range as the dot-suffix rule above, so that t20 and later
+    # isoforms are no longer counted as canonical.
+    if ($species eq "Theobroma_cacao") {
+        return 0 if $gene =~ /t(?:[2-9]|[1-9][0-9])$/;
+    }
+
+    return 1;
+}
+
1;
use DbiFloret;
-my $dbh = DbiFloret::dbconnect;
+my $dbh = DbiFloret::dbconnect_local;
my @files = glob("*.fa");
$retrieval_date =~ /(\d+)\_(\d+)\_(\d\d)/;
my $formatted_date = "20".$3."-".$1."-".$2;
+ my $total_genes = 0;
+ my $canonical_genes = 0;
+
# insert version info into database. If species previously entered, location and date will be updated.
my $safe_info_table = $dbh->quote_identifier("version_info");
# Create the table for version info if it doesn't already exist
$dbh->do("CREATE TABLE IF NOT EXISTS $safe_info_table (
- `species` varchar(255) NOT NULL,
+ `species` varchar(100) NOT NULL,
`retrieval_location` varchar(255) NOT NULL,
`date` date NOT NULL,
PRIMARY KEY (`species`)
) ENGINE=MyISAM");
+ # Create the table for isoform info if it doesn't already exist
+ my $safe_isoform_table = $dbh->quote_identifier("isoform_info");
+ $dbh->do("CREATE TABLE IF NOT EXISTS $safe_isoform_table (
+ `species` varchar(100) NOT NULL,
+ `total_genes` int(11) NOT NULL,
+ `canonical_genes` int(11) NOT NULL,
+ PRIMARY KEY (`species`)
+ ) ENGINE=MyISAM");
$dbh->do("insert into $safe_info_table (species,retrieval_location,date) values ('$species','$retrieval_location','$formatted_date') on duplicate key update retrieval_location='$retrieval_location', date='$formatted_date'");
$dbh->do("drop table if exists $safe_table");
$dbh->do("CREATE TABLE $safe_table (
- `gene_id` VARCHAR( 255 ) NOT NULL ,
+ `gene_id` VARCHAR( 100 ) NOT NULL ,
`gene_header` TEXT NOT NULL ,
`sequence` TEXT NOT NULL ,
UNIQUE ( `gene_id` )
- ) TYPE = MYISAM");
+ ) ENGINE = MYISAM");
$dbh->do("CREATE TABLE if NOT EXISTS $safe_syn_table(
- `gene_id` VARCHAR( 255 ) NOT NULL ,
- `species` VARCHAR( 255 ) NOT NULL ,
+ `gene_id` VARCHAR( 100 ) NOT NULL ,
+ `species` VARCHAR( 100 ) NOT NULL ,
`synonyms` TEXT ,
UNIQUE KEY `gene_species_idx` (`gene_id`, `species`)
- ) TYPE = MYISAM");
+ ) ENGINE = MYISAM");
# and the statement handler to do the inserts
my $insert_sth = $dbh->prepare("insert into $safe_table (gene_id, gene_header, sequence) values (?,?,?)");
- my $insert_syn_sth = $dbh->prepare("insert into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+ my $insert_syn_sth = $dbh->prepare("replace into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+ my $insert_counts_sth = $dbh->prepare("replace into $safe_isoform_table (species, total_genes, canonical_genes) values (?,?,?)");
# create the variables
$gene =~ s/^\>//;
$synonyms = find_gene_synonym($gene_header,$species);
$seq = "";
+ if(trans_filter($species, $gene)) {
+ $canonical_genes++;
+ }
+ $total_genes++;
}else{
$seq = "$seq"."$line";
}
$insert_syn_sth->execute($key,$species,$seq_hash{$key}->{'synonyms'});
}
}
+ $insert_counts_sth->execute($species,$total_genes,$canonical_genes);
}