From f4d6e37f92ba47a463ca40e46ca5a748a4ca0e08 Mon Sep 17 00:00:00 2001
From: elserj
Date: Sun, 25 Aug 2019 18:14:46 +0000
Subject: [PATCH] Update to include canonical transcript counts when loading
 protein sequences

svn path=/; revision=665
---
 interactome_scripts/find_species.pl    | 33 ++++++++++++++++++++++++++
 interactome_scripts/load_species_db.pl | 33 +++++++++++++++++++-------
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/interactome_scripts/find_species.pl b/interactome_scripts/find_species.pl
index 8b008af..ff087fd 100755
--- a/interactome_scripts/find_species.pl
+++ b/interactome_scripts/find_species.pl
@@ -1483,4 +1483,37 @@ sub mart_synonym {
         return $synonym;
     }
 }
+
+
+# Subroutine to filter out non-canonical transcripts.
+#
+# Arguments:
+#   $species - species name (e.g. "Zea_mays", "Theobroma_cacao")
+#   $gene    - gene/transcript identifier to test
+#
+# Returns 1 if the identifier is treated as the canonical (first) transcript,
+# 0 if any of the species rules below marks it as an alternative transcript.
+sub trans_filter {
+    my ($species, $gene) = @_;
+
+    # Generic rule: a numeric suffix of .2-.9 or .10-.99 marks an alternative
+    # transcript.  Jatropha is exempt because its gene ids end in .10, .20, ...
+    if ($species ne "Jatropha_curcas" && $gene =~ /\.(?:[2-9]|[1-9][0-9])$/) {
+        return 0;
+    }
+
+    # Maize genes that are not the first transcript (only _T001 is kept).
+    if ($species eq "Zea_mays" && $gene !~ /_T001$/) {
+        return 0;
+    }
+
+    # Theobroma genes that are not the first transcript: t2-t99 are rejected,
+    # mirroring the .2-.99 range of the generic rule above.
+    if ($species eq "Theobroma_cacao" && $gene =~ /t(?:[2-9]|[1-9][0-9])$/) {
+        return 0;
+    }
+    return 1;
+}
+
 1;
diff --git a/interactome_scripts/load_species_db.pl b/interactome_scripts/load_species_db.pl
index c38dc12..bf7227d 100755
--- a/interactome_scripts/load_species_db.pl
+++ b/interactome_scripts/load_species_db.pl
@@ -37,7 +37,7 @@ use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
 
 use DbiFloret;
 
-my $dbh = DbiFloret::dbconnect;
+my $dbh = DbiFloret::dbconnect_local;
 
 my @files = glob("*.fa");
 
@@ -49,15 +49,26 @@ foreach my $file (@files) {
     $retrieval_date =~ /(\d+)\_(\d+)\_(\d\d)/;
     my $formatted_date = "20".$3."-".$1."-".$2;
 
+    my $total_genes = 0;
+    my $canonical_genes = 0;
+
     # insert version info into database. If species previously entered, location and date will be updated.
     my $safe_info_table = $dbh->quote_identifier("version_info");
     # Create the database for version info if it doesn't already exist
     $dbh->do("CREATE TABLE IF NOT EXISTS $safe_info_table (
-        `species` varchar(255) NOT NULL,
+        `species` varchar(100) NOT NULL,
         `retrieval_location` varchar(255) NOT NULL,
         `date` date NOT NULL,
         PRIMARY KEY (`species`)
         ) ENGINE=MyISAM");
+    # Create the table for isoform info if it doesn't already exist
+    my $safe_isoform_table = $dbh->quote_identifier("isoform_info");
+    $dbh->do("CREATE TABLE IF NOT EXISTS $safe_isoform_table (
+        `species` varchar(100) NOT NULL,
+        `total_genes` int(11) NOT NULL,
+        `canonical_genes` int(11) NOT NULL,
+        PRIMARY KEY (`species`)
+        ) ENGINE=MyISAM");
 
     $dbh->do("insert into $safe_info_table (species,retrieval_location,date) values ('$species','$retrieval_location','$formatted_date') on duplicate key update retrieval_location='$retrieval_location', date='$formatted_date'");
 
@@ -70,22 +81,23 @@ foreach my $file (@files) {
     $dbh->do("drop table if exists $safe_table");
 
     $dbh->do("CREATE TABLE $safe_table (
-        `gene_id` VARCHAR( 255 ) NOT NULL ,
+        `gene_id` VARCHAR( 100 ) NOT NULL ,
         `gene_header` TEXT NOT NULL ,
         `sequence` TEXT NOT NULL ,
         UNIQUE ( `gene_id` )
-        ) TYPE = MYISAM");
+        ) ENGINE = MYISAM");
 
     $dbh->do("CREATE TABLE if NOT EXISTS $safe_syn_table(
-        `gene_id` VARCHAR( 255 ) NOT NULL ,
-        `species` VARCHAR( 255 ) NOT NULL ,
+        `gene_id` VARCHAR( 100 ) NOT NULL ,
+        `species` VARCHAR( 100 ) NOT NULL ,
         `synonyms` TEXT ,
         UNIQUE KEY `gene_species_idx` (`gene_id`, `species`)
-        ) TYPE = MYISAM");
+        ) ENGINE = MYISAM");
 
     # and the statement handler to do the inserts
     my $insert_sth = $dbh->prepare("insert into $safe_table (gene_id, gene_header, sequence) values (?,?,?)");
-    my $insert_syn_sth = $dbh->prepare("insert into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+    my $insert_syn_sth = $dbh->prepare("replace into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+    my $insert_counts_sth = $dbh->prepare("replace into $safe_isoform_table (species, total_genes, canonical_genes) values (?,?,?)");
 
     # create the variables
 
@@ -115,6 +127,10 @@ foreach my $file (@files) {
             $gene =~ s/^\>//;
             $synonyms = find_gene_synonym($gene_header,$species);
             $seq = "";
+            if(trans_filter($species, $gene)) {
+                $canonical_genes++;
+            }
+            $total_genes++;
         }else{
             $seq = "$seq"."$line";
         }
@@ -134,6 +150,7 @@ foreach my $file (@files) {
             $insert_syn_sth->execute($key,$species,$seq_hash{$key}->{'synonyms'});
         }
     }
+    $insert_counts_sth->execute($species,$total_genes,$canonical_genes);
 }
 
 
-- 
2.34.1