Hello!

To see the file structure, click on "tree".

Note that updates take place every 10 minutes, so commits may not be visible immediately.
Update to include canonical transcript counts when loading protein
author: elserj <elserj@localhost>
Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
committer: elserj <elserj@localhost>
Sun, 25 Aug 2019 18:14:46 +0000 (18:14 +0000)
sequences

svn path=/; revision=665

interactome_scripts/find_species.pl
interactome_scripts/load_species_db.pl

index 8b008af5126342a281dc11516e6f2547f54424d2..ff087fd5eb283729b5e3e678d0334dd2170bda84 100755 (executable)
@@ -1483,4 +1483,37 @@ sub mart_synonym {
                                return $synonym;
                }
 }
+
+
+# trans_filter($species, $gene): returns 1 when $gene is kept as a canonical transcript id for $species, 0 when one of the suffix rules below rejects it
+sub trans_filter {
+               my $species = $_[0]; # species name; compared literally against the names used below
+               my $gene = $_[1]; # gene/transcript id; only its trailing suffix is inspected
+               my $is_transcript = 1; # start as canonical; each matching rule below resets this to 0
+               if($species ne "Jatropha_curcas") { # Jatropha gene ids are .10, .20, .30, etc., so this numeric-suffix rule is skipped for that species
+                       if ($gene =~ /\.[2-9]$/) {$is_transcript = 0;} # single-digit suffix .2 through .9
+                       elsif ($gene =~ /\.1[0-9]$/) {$is_transcript = 0;} # two-digit suffixes .10 through .99, one decade per branch from here down
+                       elsif ($gene =~ /\.2[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.3[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.4[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.5[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.6[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.7[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.8[0-9]$/) {$is_transcript = 0;}
+                       elsif ($gene =~ /\.9[0-9]$/) {$is_transcript = 0;}
+               }
+                               
+               # Maize: any id that does not end in _T001 is rejected (checked in addition to the numeric-suffix rule above)
+               if($species eq "Zea_mays") {
+                               if ($gene !~ /\_T001$/) {$is_transcript = 0;}
+               }
+                               
+               # Theobroma: ids ending in t2-t9 or t10-t19 are rejected. NOTE(review): t20 and above are not matched by these patterns; confirm none occur
+               if($species eq "Theobroma_cacao") {
+                               if ($gene =~ /t[2-9]$/) {$is_transcript = 0;}
+                               elsif ($gene =~ /t1[0-9]$/) {$is_transcript = 0;}
+               }
+return $is_transcript; # 1 = canonical transcript, 0 = filtered out
+}
+
 1;
index c38dc1252cbca00a2f08e5071104fbc88fa1e905..bf7227d4520afd72e6798acabd510990259d7ff3 100755 (executable)
@@ -37,7 +37,7 @@ use lib "$ENV{HOME}/scripts/jaiswallab/interactome_scripts";
 
 use DbiFloret;
 
-my $dbh = DbiFloret::dbconnect;
+my $dbh = DbiFloret::dbconnect_local;
 
 my @files = glob("*.fa");
 
@@ -49,15 +49,26 @@ foreach my $file (@files) {
        $retrieval_date =~ /(\d+)\_(\d+)\_(\d\d)/;
        my $formatted_date = "20".$3."-".$1."-".$2;
        
+       my $total_genes = 0;
+       my $canonical_genes = 0;
+       
        # insert version info into database.  If species previously entered, location and date will be updated.
        my $safe_info_table = $dbh->quote_identifier("version_info");
        # Create the database for version info if it doesn't already exist
        $dbh->do("CREATE TABLE IF NOT EXISTS $safe_info_table (
-                       `species` varchar(255) NOT NULL,
+                       `species` varchar(100) NOT NULL,
                        `retrieval_location` varchar(255) NOT NULL,
                        `date` date NOT NULL,
                        PRIMARY KEY (`species`)
                        ) ENGINE=MyISAM");
+       # Create the table for isoform info if it doesn't already exist
+       my $safe_isoform_table = $dbh->quote_identifier("isoform_info");
+       $dbh->do("CREATE TABLE IF NOT EXISTS $safe_isoform_table (
+                       `species` varchar(100) NOT NULL,
+                       `total_genes` int(11) NOT NULL,
+                       `canonical_genes` int(11) NOT NULL,
+                       PRIMARY KEY (`species`)
+                       ) ENGINE=MyISAM");
 
 
        $dbh->do("insert into $safe_info_table (species,retrieval_location,date) values ('$species','$retrieval_location','$formatted_date') on duplicate key update retrieval_location='$retrieval_location', date='$formatted_date'");
@@ -70,22 +81,23 @@ foreach my $file (@files) {
        
        $dbh->do("drop table if exists $safe_table");
        $dbh->do("CREATE TABLE $safe_table (
-               `gene_id` VARCHAR( 255 ) NOT NULL ,
+               `gene_id` VARCHAR( 100 ) NOT NULL ,
                `gene_header` TEXT NOT NULL ,
                `sequence` TEXT NOT NULL ,
                UNIQUE ( `gene_id` )
-               ) TYPE = MYISAM");
+               ) ENGINE = MYISAM");
 
        $dbh->do("CREATE TABLE if NOT EXISTS  $safe_syn_table(
-                       `gene_id` VARCHAR( 255 ) NOT NULL ,
-                       `species` VARCHAR( 255 ) NOT NULL ,
+                       `gene_id` VARCHAR( 100 ) NOT NULL ,
+                       `species` VARCHAR( 100 ) NOT NULL ,
                        `synonyms` TEXT ,
                        UNIQUE KEY `gene_species_idx` (`gene_id`, `species`)
-                       ) TYPE = MYISAM");
+                       ) ENGINE = MYISAM");
 
        # and the statement handler to do the inserts
        my $insert_sth = $dbh->prepare("insert into $safe_table (gene_id, gene_header, sequence) values (?,?,?)");
-       my $insert_syn_sth = $dbh->prepare("insert into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+       my $insert_syn_sth = $dbh->prepare("replace into $safe_syn_table (gene_id, species, synonyms) values (?,?,?)");
+       my $insert_counts_sth = $dbh->prepare("replace into $safe_isoform_table (species, total_genes, canonical_genes) values (?,?,?)");
        
        # create the variables
        
@@ -115,6 +127,10 @@ foreach my $file (@files) {
                        $gene =~ s/^\>//;
                        $synonyms = find_gene_synonym($gene_header,$species);
                        $seq = "";
+                       if(trans_filter($species, $gene)) {
+                                       $canonical_genes++;
+                       }
+                       $total_genes++;
                }else{
                        $seq = "$seq"."$line";
                }
@@ -134,6 +150,7 @@ foreach my $file (@files) {
                                $insert_syn_sth->execute($key,$species,$seq_hash{$key}->{'synonyms'});
                }
        }
+       $insert_counts_sth->execute($species,$total_genes,$canonical_genes);
 }