Update to include canonical transcript counts when loading protein

author elserj <elserj@localhost>

Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)

committer elserj <elserj@localhost>

Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
author elserj <elserj@localhost>
Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
committer elserj <elserj@localhost>
Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
diff --git a/interactome_scripts/find_species.pl b/interactome_scripts/find_species.pl

index 8b008af5126342a281dc11516e6f2547f54424d2..ff087fd5eb283729b5e3e678d0334dd2170bda84 100755 (executable)
--- a/interactome_scripts/find_species.pl
+++ b/interactome_scripts/find_species.pl
@@ -1483,4 +1483,37 @@ sub mart_synonym {
                                 return $synonym;
                 }
  }
+
+
+# trans_filter($species, $gene)
+#
+# Filter out non-canonical transcripts.  Given a species name and a
+# gene/transcript id, returns 1 when the id is kept and 0 when it matches
+# one of the "later isoform" naming patterns below.
+#
+#   $species - species name, compared against "Jatropha_curcas",
+#              "Zea_mays" and "Theobroma_cacao"
+#   $gene    - gene/transcript id whose suffix is inspected
+sub trans_filter {
+    my ($species, $gene) = @_;
+
+    # Numeric suffixes .2-.9 and .10-.99 at the end of the id.
+    my @numbered_isoforms = (
+        qr/\.[2-9]$/,
+        qr/\.1[0-9]$/,
+        qr/\.2[0-9]$/,
+        qr/\.3[0-9]$/,
+        qr/\.4[0-9]$/,
+        qr/\.5[0-9]$/,
+        qr/\.6[0-9]$/,
+        qr/\.7[0-9]$/,
+        qr/\.8[0-9]$/,
+        qr/\.9[0-9]$/,
+    );
+
+    # Theobroma suffixes t2-t9 and t10-t19 at the end of the id.
+    my @theobroma_isoforms = (
+        qr/t[2-9]$/,
+        qr/t1[0-9]$/,
+    );
+
+    # Stays 1 unless one of the checks below rejects the id; no check ever
+    # switches it back on.
+    my $keep = 1;
+
+    # Jatropha gene ids are .10, .20, .30, etc..., so the numeric suffix
+    # test is skipped for that species only.
+    unless ($species eq "Jatropha_curcas") {
+        $keep = 0 if grep { $gene =~ $_ } @numbered_isoforms;
+    }
+
+    # Maize genes that are not the first transcript
+    $keep = 0 if $species eq "Zea_mays" && $gene !~ /\_T001$/;
+
+    # Theobroma genes that are not the first transcript
+    if ($species eq "Theobroma_cacao") {
+        $keep = 0 if grep { $gene =~ $_ } @theobroma_isoforms;
+    }
+
+    return $keep;
+}
+
  1;
diff --git a/interactome_scripts/load_species_db.pl b/interactome_scripts/load_species_db.pl

index c38dc1252cbca00a2f08e5071104fbc88fa1e905..bf7227d4520afd72e6798acabd510990259d7ff3 100755 (executable)
--- a/interactome_scripts/load_species_db.pl
+++ b/interactome_scripts/load_species_db.pl
@@ -37,7 +37,7 @@ use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
  
  use DbiFloret;
  
-my $dbh = DbiFloret::dbconnect;
+my $dbh = DbiFloret::dbconnect_local;
  
  my @files = glob("*.fa");
  
@@ -49,15 +49,26 @@ foreach my $file (@files) {
         $retrieval_date =~ /(\d+)\_(\d+)\_(\d\d)/;
         my $formatted_date = "20".$3."-".$1."-".$2;
         
+       my $total_genes = 0;
+       my $canonical_genes = 0;
+       
         # insert version info into database.  If species previously entered, location and date will be updated.
         my $safe_info_table = $dbh->quote_identifier("version_info");
         # Create the database for version info if it doesn't already exist
         $dbh->do("CREATE TABLE IF NOT EXISTS $safe_info_table (
-                       `species` varchar(255) NOT NULL,
+                       `species` varchar(100) NOT NULL,
                         `retrieval_location` varchar(255) NOT NULL,
                         `date` date NOT NULL,
                         PRIMARY KEY (`species`)
                         ) ENGINE=MyISAM");
+       # Create the table for isoform info if it doesn't already exist
+       my $safe_isoform_table = $dbh->quote_identifier("isoform_info");
+       $dbh->do("CREATE TABLE IF NOT EXISTS $safe_isoform_table (
+                       `species` varchar(100) NOT NULL,
+                       `total_genes` int(11) NOT NULL,
+                       `canonical_genes` int(11) NOT NULL,
+                       PRIMARY KEY (`species`)
+                       ) ENGINE=MyISAM");
  
  
         $dbh->do("insert into $safe_info_table (species,retrieval_location,date) values ('$species','$retrieval_location','$formatted_date') on duplicate key update retrieval_location='$retrieval_location', date='$formatted_date'");
@@ -70,22 +81,23 @@ foreach my $file (@files) {
         
         $dbh->do("drop table if exists $safe_table");
         $dbh->do("CREATE TABLE $safe_table (
-               `gene_id` VARCHAR( 255 ) NOT NULL ,
+               `gene_id` VARCHAR( 100 ) NOT NULL ,
                 `gene_header` TEXT NOT NULL ,
                 `sequence` TEXT NOT NULL ,
                 UNIQUE ( `gene_id` )
-               ) TYPE = MYISAM");
+               ) ENGINE = MYISAM");
  
         $dbh->do("CREATE TABLE if NOT EXISTS  $safe_syn_table(
-                       `gene_id` VARCHAR( 255 ) NOT NULL ,
-                       `species` VARCHAR( 255 ) NOT NULL ,
+                       `gene_id` VARCHAR( 100 ) NOT NULL ,
+                       `species` VARCHAR( 100 ) NOT NULL ,
                         `synonyms` TEXT ,
                         UNIQUE KEY `gene_species_idx` (`gene_id`, `species`)
-                       ) TYPE = MYISAM");
+                       ) ENGINE = MYISAM");
  
         # and the statement handler to do the inserts
         my $insert_sth = $dbh->prepare("insert into $safe_table (gene_id, gene_header, sequence) values (?,?,?)");
-       my $insert_syn_sth = $dbh->prepare("insert into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+       my $insert_syn_sth = $dbh->prepare("replace into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+       my $insert_counts_sth = $dbh->prepare("replace into $safe_isoform_table (species, total_genes, canonical_genes) values (?,?,?)");
         
         # create the variables
         
@@ -115,6 +127,10 @@ foreach my $file (@files) {
                         $gene =~ s/^\>//;
                         $synonyms = find_gene_synonym($gene_header,$species);
                         $seq = "";
+                       if(trans_filter($species, $gene)) {
+                                       $canonical_genes++;
+                       }
+                       $total_genes++;
                 }else{
                         $seq = "$seq"."$line";
                 }
@@ -134,6 +150,7 @@ foreach my $file (@files) {
                                 $insert_syn_sth->execute($key,$species,$seq_hash{$key}->{'synonyms'});
                 }
         }
+       $insert_counts_sth->execute($species,$total_genes,$canonical_genes);
  }
author	elserj <elserj@localhost>
	Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
committer	elserj <elserj@localhost>
	Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
interactome_scripts/find_species.pl		patch \| blob \| history
interactome_scripts/load_species_db.pl		patch \| blob \| history