documentation

author athreyab <athreyab@localhost>

Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)

committer athreyab <athreyab@localhost>

Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)
author athreyab <athreyab@localhost>
Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)
committer athreyab <athreyab@localhost>
Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)
diff --git a/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl

new file mode 100755 (executable)

index 0000000..1eb37ed
--- /dev/null
+++ b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl
@@ -0,0 +1,50 @@
+#!/usr/bin/perl
+use warnings;
+use strict;
+
+my $infile=$ARGV[0];
+die "usage : perl ./script <single fasta to be broken into pairs and singletons>\n" unless $infile;
+
+my $pairs_out=$infile.".pairs";
+my $single_out=$infile.".singletons";
+
+my %reads;
+open(IN,"<",$infile) || die "cannot open $infile!\n$!\nexiting...\n";
+until(eof(IN)){
+
+#>6_1_1046_17414\1   -- or --  >HWI-ST609:130:D0KU9ACXX:3:1101:1343:1910\2
+
+       my $header=<IN>;
+       my $seq   =<IN>;
+       chomp $header;
+       chomp $seq;
+       $header=~s/\>//;
+       my $pairnum;
+       ($header,$pairnum)=split(/\\/,$header);
+       if($reads{$header}){
+               if($reads{$header}{$pairnum}){
+                       die "$header pair $pairnum already defined. weird.\n";
+               }else{
+                       $reads{$header}{$pairnum}=$seq;
+               }
+       }else{
+               my %hash;
+               $hash{$pairnum}=$seq;
+               $reads{$header}=\%hash;
+       }
+}
+close IN;
+
+open(PAIRS,">",$pairs_out) || die "cannot open $pairs_out!\n$!\nexiting...\n";
+open(SINGS,">",$single_out) || die "cannot open $single_out!\n$!\nexiting...\n";
+foreach my $key (keys %reads){
+       if ($reads{$key}{1} && $reads{$key}{2}){
+               print PAIRS ">$key\\1\n$reads{$key}{1}\n$key\\2\n$reads{$key}{2}\n";
+       }elsif($reads{$key}{1} && !$reads{$key}{2}){
+               print SINGS ">$key\\1\n$reads{$key}{1}\n";
+       }elsif($reads{$key}{2} && !$reads{$key}{1}){
+               print SINGS ">$key\\2\n$reads{$key}{2}\n";
+       }
+}
+close PAIRS;
+close SINGS;
diff --git a/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl

new file mode 100755 (executable)

index 0000000..11036fc

Binary files /dev/null and b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl differ
diff --git a/Personnel/athreyab/illumina_parse/new_illumina/create_pairs b/Personnel/athreyab/illumina_parse/new_illumina/create_pairs

new file mode 100755 (executable)

index 0000000..10046ef
--- /dev/null
+++ b/Personnel/athreyab/illumina_parse/new_illumina/create_pairs
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# creates .pair and singleton files . Acts on a folder containing all the
+# files and generates an output folder containing pair and singleton
+# files. Surprisingly the bash script i wrote to do the whole thing was
+# incredibly slower than the perl script that Sam was already using. 
+# This bash script basically calls the perl script to do all the work.
+
+
+function usage(){
+       echo -e "Usage: /create_pairs <input folder> <output foldder> <max_rate> <slash_string>\n";
+}
+
+input_folder=$1
+output_folder=$2
+max_rate=$3
+slash_string=$4
+#show help
+if [ "$input_folder"  == "--help" ]
+then
+       echo -e "Help\n"
+       usage;
+       exit;
+fi
+
+#check if the input and output folder are defined in command line
+if [ -z "$input_folder" ]
+then
+       echo -e "Input folder is not defined \n"
+       usage;
+       exit;   
+elif [ -z "$output_folder" ]
+then
+       echo -e "Output folder is not defined \n"
+       usage;
+       exit;
+#check if max_rate is defined
+elif [ -z "$max_rate" ]
+then
+       echo -e "max rate is not defined \n"
+       usage;
+else
+       if [ -d "$output_folder" ]; then
+               echo "output directory already exists. Exiting now!"
+               exit;
+       fi
+        mkdir $output_folder
+        # call the illumina parse script. This script converts the fastq
+        # files to fasta format and removes low quality entries. By doing
+        # this, we can speed up.
+       ./illumina_parse.pl -fi "$input_folder" "$output_folder" "$max_rate" "$slash_string"
+       # files array1.txt and array2.txt are temporary files containing
+       # the file names . 
+       ls "$output_folder/" | grep "R1" | sort > "array1.txt"
+       ls "$output_folder/" | grep "R2" | sort > "array2.txt"
+       # read files array1 and array2.txt to create an array in bash.
+       index1=0;
+       while read line ; do
+               array1[$index1]="$line"
+               index1=$(($index1+1))
+       done < array1.txt
+       index2=0;
+       while read line ; do    
+               array2[$index2]="$line"
+               index2=$(($index2+1))
+       done < array2.txt
+       # make sure array1 and array2 are of equal size. If not, it means
+       # some readings are missing. Print a warning message about it and 
+       # move along
+       if [ $index1 -eq $index2 ]
+       then
+               size=$index1;
+       elif [ $index1 > $index2 ]
+       then
+               size=$index2;
+               echo -e "Reading R2 misses one or few files\n";
+       else
+               size=$index1;
+               echo -e "Reading R1 misses one or few files\n";
+       fi
+       echo -e "size is $size\n";  
+       i=0;
+       # loop throught the array.
+       # 1. combine files from read 1 and read2.
+       # 2. and then run sam's script to create pair and singleton files
+       while [ $i -lt  $size ]; do
+               echo -e "Copying $output_folder/${array2[$i]} into $output_folder/${array1[$i]}\n";
+               infile="$output_folder/${array2[$i]}";
+               infile1="$output_folder/${array1[$i]}";
+               # append file from reading 2 into file from reading 1
+               cat "$infile" >> "$infile1";
+               # remove file from reading2 now that it is appended to
+               # file form read1.
+               rm -f "$infile";
+               echo -e "Creating singleton and pair files from $infile1\n"
+               ./break_pile_to_pairs.pl "$infile1"
+               # we don't need the file anymore. remove it
+               rm -f $infile1
+               i=$(($i+1))
+       done
+fi
diff --git a/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl b/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl

new file mode 100755 (executable)

index 0000000..7521fc9
--- /dev/null
+++ b/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl
@@ -0,0 +1,139 @@
+#!/usr/bin/perl
+
+use Switch;
+
+
+$optionId = $ARGV[0];
+
+
+# if command line option is -co, then it is for 
+if($optionId eq "-co" && ($#ARGV == 2 || $#ARGV == 3)){
+       $in_folder = $ARGV[1];
+       $out_folder = $ARGV[2];
+       $slash_string = $ARGV[3];
+       if($slash_string eq ""){
+               $slash_string = '\\';   
+       }
+       process_folders($in_folder,$out_folder,"",$slash_string);
+}
+
+# if command line option is -fi, then it is for filtering
+# entries with low quality. we also convert the file to fa format.
+elsif($optionId eq "-fi" && ($#ARGV == 3 || $#ARGV == 4)){
+       $in_folder = $ARGV[1];
+       $out_folder = $ARGV[2];
+       $max_rate = $ARGV[3];
+       $slash_string = $ARGV[4];
+       if($max_rate =~ /[^0-9\.]/){
+               print "\nmax_rate supplied: '$max_rate', is not a number. check if you supplied all 4 params - option(-fi), an input directory, output directory and a max_rate \n";
+               showUsage();
+       }
+       if($slash_string eq ""){
+               $slash_string = '\\';   
+       }
+       process_folders($in_folder,$out_folder,$max_rate,$slash_string);
+}
+elsif($optionId eq "-h" |$optionId eq '--help'){
+       showUsage();    
+}
+else{
+       print "Incorrect parameters were supplied\n";
+       showUsage();
+}
+
+sub showUsage
+{
+       print "\nUsage: \n";
+       print "convert fastq to fasta format:\t./illumina_parse -co <input_path> <output_path> [<slash_string>]\n";
+       print "filter low quality entries:\t./illumina_parse -fi <input_path> <output_path> <max_rate> [<slash_string>]\n";
+       print "[] = optional parameter\n";
+       die ""; 
+}
+
+sub isnan { ! defined( $_[0] <=> 9**9**9 ) }
+# checks if the % of N is less than max rate specified in the command line
+sub filterRead {
+        my $seq         =shift;
+        my $maxRate     =shift;
+        my $length      =length($seq);
+        my $count       =0;
+        $count++ while $seq=~m/N/g;
+       $count++ while $seq=~m/\./g;
+        return 1 if(($count/$length)<=$maxRate);
+        return 0;
+}
+
+sub process_folders{
+       $in_folder = $_[0];
+       $out_folder = $_[1];
+       $max_rate = $_[2];
+       $slash_string = $_[3];
+       opendir(DIR, $in_folder) or die "can't open directory $in_folder: $!\n";
+       unless(-d $out_folder){
+          mkdir $out_folder or die "can't create directory $out_folder: $! \n";
+       }
+       # read file names from the input folder
+       @files = readdir DIR;
+       $y_entries = 0;
+       $total_entries = 0;
+       $high_qual_entries = 0;
+       # loop through each file in the input folder
+       foreach $fileName (@files) {
+               # make sure the file is fastq file
+               if(index($fileName,'.fastq') != -1){
+                       open(in_file, "<","$in_folder/$fileName") || die "Error: file '$in_folder/$fileName' can not be opened\n";
+                       print "processing file $in_folder/$fileName\n";
+                       $fileName =~ s/.fastq/.fa/;
+                       open(out_file,">>","$out_folder/$fileName") || die "Error: file $out_folder/$fileName can not be opened\n";
+                       until(eof(in_file)){
+                                                       # read 4 lines - each entry contains 4 lines
+                               $h1=<in_file>;
+                                                       $seq=<in_file>;
+                               $h2=<in_file>;
+                               $qual=<in_file>;
+                               # counter for total no of entries
+                                                 $total_entries = $total_entries + 1;
+                                                 # trim the firsst two lines.
+                               chomp $h1;
+                               chomp $seq;
+                               # repalce @ with >
+                               $h1=~s/\@/>/;
+                               # discard the entry if it has :y in the header
+                               if(index($h1, ' 1:Y') != -1 || index($h1,' 2:Y') != -1){
+                                        $y_entries = $y_entries + 1;
+                                        next;
+                               }
+                               # some formatting sam asked for
+                               if(index($h1, ' 1:N') != -1){
+                                       #$temp = $slash_string."1:N";
+                                       $temp = ":N";
+                                       $h1 =~ s/ 1:N/$temp/;
+                                       $h1 = $h1.$slash_string."1";
+                               }
+                               elsif(index($h1, ' 2:N') != -1){
+                                       #$temp = $slash_string."2:N";
+                                       $temp = ":N";
+                                       $h1 =~ s/ 2:N/$temp/;
+                                       $h1 = $h1.$slash_string."2";
+                               }
+                                                       
+                               # write to output file
+                               if($max_rate eq ""){
+                                       print out_file "$h1\n$seq\n";
+                               }
+                               elsif(filterRead($seq,$max_rate)){
+                                       print out_file "$h1\n$seq\n";
+                                       $high_qual_entries = $high_qual_entries + 1;
+                               }
+                       }
+               close in_file;
+               close out_file;
+               print "successfully written $fileName\n";
+               }
+       }       
+       print "total entries: $total_entries\n";
+       print "low quality entries(entries containing 1:Y or 2:Y in the header): $y_entries\n";
+       $qual_entries = $total_entries - $high_qual_entries;
+       print "entries removed due to exceeding max_rate: $qual_entries \n";
+       closedir DIR;
+}
author	athreyab <athreyab@localhost>
	Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)
committer	athreyab <athreyab@localhost>
	Tue, 5 Jun 2012 23:35:32 +0000 (23:35 +0000)
Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl	[new file with mode: 0755]	patch \| blob
Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl	[new file with mode: 0755]	patch \| blob
Personnel/athreyab/illumina_parse/new_illumina/create_pairs	[new file with mode: 0755]	patch \| blob
Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl	[new file with mode: 0755]	patch \| blob