From 9f661ebc21fd09708d824c79914b4d7d25f292b8 Mon Sep 17 00:00:00 2001
From: elserj
Date: Thu, 23 Sep 2010 21:54:22 +0000
Subject: [PATCH] Added species db loading script to svn. This script will
 parse the fasta files and put the gene_id, gene_header, and sequences into a
 database

svn path=/; revision=50
---
 interactome_scripts/load_species_db.pl | 112 +++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100755 interactome_scripts/load_species_db.pl

diff --git a/interactome_scripts/load_species_db.pl b/interactome_scripts/load_species_db.pl
new file mode 100755
index 0000000..e5aad70
--- /dev/null
+++ b/interactome_scripts/load_species_db.pl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl
+
+###############################################################
+# Written by Justin Elser  2/22/10
+#
+# This program takes every fasta file (*.fa) in the current
+# working directory and loads it into the protein_sequences
+# database, one table per species.  Each table holds the
+# gene_id, gene_header and sequence of every record.
+#
+# Initial version 0.1
+# Shouldn't have to change much once this is done
+# except to maybe add more species detection
+# which is actually done in the pulled in subs
+#
+###############################################################
+
+use strict;
+use warnings;
+
+use DBI;
+use Term::Screen::ReadLine;
+
+# pull in the find_species and find_gene common subroutines
+require "$ENV{HOME}/scripts/jaiswallab/interactome_scripts/find_species.pl";
+
+# ask for the database credentials on a cleared screen
+my $screen = Term::Screen::ReadLine->new();
+$screen->clrscr;
+
+# ask for username
+$screen->at(0,0)->puts("Username: ");
+my $username = $screen->readline(ROW => 0, COL => 11);
+
+# ask for password, replace character presses with stars
+$screen->at(1,0)->puts("Password: ");
+my $password = $screen->readline(ROW => 1, COL => 11, PASSWORD => 1);
+
+# move the cursor below the prompts; the screen object is not needed again
+$screen->at(2,0);
+undef $screen;
+
+# define the database handle to be used; with RaiseError set, DBI calls die
+# on failure, so the statements below are not checked one by one
+my $dbh = DBI->connect(
+    'DBI:mysql:protein_sequences;host=floret.cgrb.oregonstate.edu',
+    $username, $password,
+    { RaiseError => 1, AutoCommit => 1 }
+) or die "Failed to connect to database: $DBI::errstr";
+
+# every *.fa file in the current directory is loaded into its own table
+my @files = glob("*.fa");
+
+foreach my $file (@files) {
+    my $species = find_species($file);
+    print "on species $species\n";
+
+    # create the database table, dropping any table of the same name first
+    my $safe_table = $dbh->quote_identifier($species);
+
+    $dbh->do("drop table if exists $safe_table");
+
+    # ENGINE is used because MySQL 5.5 and later reject the old TYPE keyword
+    $dbh->do("CREATE TABLE $safe_table (
+        `gene_id` VARCHAR( 255 ) NOT NULL ,
+        `gene_header` TEXT NOT NULL ,
+        `sequence` TEXT NOT NULL ,
+        UNIQUE ( `gene_id` )
+        ) ENGINE = MYISAM");
+
+    # and the statement handler to do the inserts
+    my $insert_sth = $dbh->prepare("insert into $safe_table (gene_id, gene_header, sequence) values (?,?,?)");
+
+    # create the variables
+    my %seq_hash;     # keys is $gene, values are $seq and $gene_header
+    my $gene;
+    my $gene_header;  # declared outside the read loop so it survives between lines
+    my $seq;
+
+    open(my $in_file, '<', $file) or die "Cannot open $file: $!";
+
+    while (my $line = <$in_file>) {
+        chomp $line;
+        if ($line =~ /^\>/) {
+            # a new record starts here, so store the one just finished
+            if (defined($gene)) {
+                $seq_hash{$gene}->{'sequence'} = $seq;
+                $seq_hash{$gene}->{'gene_header'} = $gene_header;
+            }
+            $gene_header = $line;
+            $gene = find_gene($gene_header,$species);
+            $gene =~ s/^\>//;
+            $seq = "";
+        } elsif (defined($gene)) {
+            # lines seen before the first header belong to no gene and are skipped
+            $seq .= $line;
+        }
+    }
+
+    close($in_file);
+
+    # add the last gene to the hash; a file without any header has none
+    if (defined($gene)) {
+        $seq_hash{$gene}->{'sequence'} = $seq;
+        $seq_hash{$gene}->{'gene_header'} = $gene_header;
+    }
+
+    # a gene_id seen more than once in the file is inserted once, with the data stored last
+    foreach my $key (keys %seq_hash) {
+        $insert_sth->execute($key,$seq_hash{$key}->{'gene_header'},$seq_hash{$key}->{'sequence'});
+    }
+}
-- 
2.34.1