--- /dev/null
+#!/usr/bin/perl
+use warnings;
+use strict;
+
+my $infile=$ARGV[0];
+die "usage : perl ./script <single fasta to be broken into pairs and singletons>\n" unless $infile;
+
+my $pairs_out=$infile.".pairs";
+my $single_out=$infile.".singletons";
+
+my %reads;
+open(IN,"<",$infile) || die "cannot open $infile!\n$!\nexiting...\n";
+until(eof(IN)){
+
+#>6_1_1046_17414\1 -- or -- >HWI-ST609:130:D0KU9ACXX:3:1101:1343:1910\2
+
+ my $header=<IN>;
+ my $seq =<IN>;
+ chomp $header;
+ chomp $seq;
+ $header=~s/\>//;
+ my $pairnum;
+ ($header,$pairnum)=split(/\\/,$header);
+ if($reads{$header}){
+ if($reads{$header}{$pairnum}){
+ die "$header pair $pairnum already defined. weird.\n";
+ }else{
+ $reads{$header}{$pairnum}=$seq;
+ }
+ }else{
+ my %hash;
+ $hash{$pairnum}=$seq;
+ $reads{$header}=\%hash;
+ }
+}
+close IN;
+
+open(PAIRS,">",$pairs_out) || die "cannot open $pairs_out!\n$!\nexiting...\n";
+open(SINGS,">",$single_out) || die "cannot open $single_out!\n$!\nexiting...\n";
+foreach my $key (keys %reads){
+ if ($reads{$key}{1} && $reads{$key}{2}){
+ print PAIRS ">$key\\1\n$reads{$key}{1}\n$key\\2\n$reads{$key}{2}\n";
+ }elsif($reads{$key}{1} && !$reads{$key}{2}){
+ print SINGS ">$key\\1\n$reads{$key}{1}\n";
+ }elsif($reads{$key}{2} && !$reads{$key}{1}){
+ print SINGS ">$key\\2\n$reads{$key}{2}\n";
+ }
+}
+close PAIRS;
+close SINGS;
--- /dev/null
+#!/bin/bash
+
+# creates .pair and singleton files . Acts on a folder containing all the
+# files and generates an output folder containing pair and singleton
+# files. Surprisingly the bash script i wrote to do the whole thing was
+# incredibly slower than the perl script that Sam was already using.
+# This bash script basically calls the perl script to do all the work.
+
+
+function usage(){
+ echo -e "Usage: /create_pairs <input folder> <output foldder> <max_rate> <slash_string>\n";
+}
+
+input_folder=$1
+output_folder=$2
+max_rate=$3
+slash_string=$4
+#show help
+if [ "$input_folder" == "--help" ]
+then
+ echo -e "Help\n"
+ usage;
+ exit;
+fi
+
+#check if the input and output folder are defined in command line
+if [ -z "$input_folder" ]
+then
+ echo -e "Input folder is not defined \n"
+ usage;
+ exit;
+elif [ -z "$output_folder" ]
+then
+ echo -e "Output folder is not defined \n"
+ usage;
+ exit;
+#check if max_rate is defined
+elif [ -z "$max_rate" ]
+then
+ echo -e "max rate is not defined \n"
+ usage;
+else
+ if [ -d "$output_folder" ]; then
+ echo "output directory already exists. Exiting now!"
+ exit;
+ fi
+ mkdir $output_folder
+ # call the illumina parse script. This script converts the fastq
+ # files to fasta format and removes low quality entries. By doing
+ # this, we can speed up.
+ ./illumina_parse.pl -fi "$input_folder" "$output_folder" "$max_rate" "$slash_string"
+ # files array1.txt and array2.txt are temporary files containing
+ # the file names .
+ ls "$output_folder/" | grep "R1" | sort > "array1.txt"
+ ls "$output_folder/" | grep "R2" | sort > "array2.txt"
+ # read files array1 and array2.txt to create an array in bash.
+ index1=0;
+ while read line ; do
+ array1[$index1]="$line"
+ index1=$(($index1+1))
+ done < array1.txt
+ index2=0;
+ while read line ; do
+ array2[$index2]="$line"
+ index2=$(($index2+1))
+ done < array2.txt
+ # make sure array1 and array2 are of equal size. If not, it means
+ # some readings are missing. Print a warning message about it and
+ # move along
+ if [ $index1 -eq $index2 ]
+ then
+ size=$index1;
+ elif [ $index1 > $index2 ]
+ then
+ size=$index2;
+ echo -e "Reading R2 misses one or few files\n";
+ else
+ size=$index1;
+ echo -e "Reading R1 misses one or few files\n";
+ fi
+ echo -e "size is $size\n";
+ i=0;
+ # loop throught the array.
+ # 1. combine files from read 1 and read2.
+ # 2. and then run sam's script to create pair and singleton files
+ while [ $i -lt $size ]; do
+ echo -e "Copying $output_folder/${array2[$i]} into $output_folder/${array1[$i]}\n";
+ infile="$output_folder/${array2[$i]}";
+ infile1="$output_folder/${array1[$i]}";
+ # append file from reading 2 into file from reading 1
+ cat "$infile" >> "$infile1";
+ # remove file from reading2 now that it is appended to
+ # file form read1.
+ rm -f "$infile";
+ echo -e "Creating singleton and pair files from $infile1\n"
+ ./break_pile_to_pairs.pl "$infile1"
+ # we don't need the file anymore. remove it
+ rm -f $infile1
+ i=$(($i+1))
+ done
+fi
--- /dev/null
+#!/usr/bin/perl
+
+use Switch;
+
+
+$optionId = $ARGV[0];
+
+
+# if command line option is -co, then it is for
+if($optionId eq "-co" && ($#ARGV == 2 || $#ARGV == 3)){
+ $in_folder = $ARGV[1];
+ $out_folder = $ARGV[2];
+ $slash_string = $ARGV[3];
+ if($slash_string eq ""){
+ $slash_string = '\\';
+ }
+ process_folders($in_folder,$out_folder,"",$slash_string);
+}
+
+# if command line option is -fi, then it is for filtering
+# entries with low quality. we also convert the file to fa format.
+elsif($optionId eq "-fi" && ($#ARGV == 3 || $#ARGV == 4)){
+ $in_folder = $ARGV[1];
+ $out_folder = $ARGV[2];
+ $max_rate = $ARGV[3];
+ $slash_string = $ARGV[4];
+ if($max_rate =~ /[^0-9\.]/){
+ print "\nmax_rate supplied: '$max_rate', is not a number. check if you supplied all 4 params - option(-fi), an input directory, output directory and a max_rate \n";
+ showUsage();
+ }
+ if($slash_string eq ""){
+ $slash_string = '\\';
+ }
+ process_folders($in_folder,$out_folder,$max_rate,$slash_string);
+}
+elsif($optionId eq "-h" |$optionId eq '--help'){
+ showUsage();
+}
+else{
+ print "Incorrect parameters were supplied\n";
+ showUsage();
+}
+
+sub showUsage
+{
+ print "\nUsage: \n";
+ print "convert fastq to fasta format:\t./illumina_parse -co <input_path> <output_path> [<slash_string>]\n";
+ print "filter low quality entries:\t./illumina_parse -fi <input_path> <output_path> <max_rate> [<slash_string>]\n";
+ print "[] = optional parameter\n";
+ die "";
+}
+
+sub isnan { ! defined( $_[0] <=> 9**9**9 ) }
+# checks if the % of N is less than max rate specified in the command line
+sub filterRead {
+ my $seq =shift;
+ my $maxRate =shift;
+ my $length =length($seq);
+ my $count =0;
+ $count++ while $seq=~m/N/g;
+ $count++ while $seq=~m/\./g;
+ return 1 if(($count/$length)<=$maxRate);
+ return 0;
+}
+
+sub process_folders{
+ $in_folder = $_[0];
+ $out_folder = $_[1];
+ $max_rate = $_[2];
+ $slash_string = $_[3];
+ opendir(DIR, $in_folder) or die "can't open directory $in_folder: $!\n";
+ unless(-d $out_folder){
+ mkdir $out_folder or die "can't create directory $out_folder: $! \n";
+ }
+ # read file names from the input folder
+ @files = readdir DIR;
+ $y_entries = 0;
+ $total_entries = 0;
+ $high_qual_entries = 0;
+ # loop through each file in the input folder
+ foreach $fileName (@files) {
+ # make sure the file is fastq file
+ if(index($fileName,'.fastq') != -1){
+ open(in_file, "<","$in_folder/$fileName") || die "Error: file '$in_folder/$fileName' can not be opened\n";
+ print "processing file $in_folder/$fileName\n";
+ $fileName =~ s/.fastq/.fa/;
+ open(out_file,">>","$out_folder/$fileName") || die "Error: file $out_folder/$fileName can not be opened\n";
+ until(eof(in_file)){
+ # read 4 lines - each entry contains 4 lines
+ $h1=<in_file>;
+ $seq=<in_file>;
+ $h2=<in_file>;
+ $qual=<in_file>;
+ # counter for total no of entries
+ $total_entries = $total_entries + 1;
+ # trim the firsst two lines.
+ chomp $h1;
+ chomp $seq;
+ # repalce @ with >
+ $h1=~s/\@/>/;
+ # discard the entry if it has :y in the header
+ if(index($h1, ' 1:Y') != -1 || index($h1,' 2:Y') != -1){
+ $y_entries = $y_entries + 1;
+ next;
+ }
+ # some formatting sam asked for
+ if(index($h1, ' 1:N') != -1){
+ #$temp = $slash_string."1:N";
+ $temp = ":N";
+ $h1 =~ s/ 1:N/$temp/;
+ $h1 = $h1.$slash_string."1";
+ }
+ elsif(index($h1, ' 2:N') != -1){
+ #$temp = $slash_string."2:N";
+ $temp = ":N";
+ $h1 =~ s/ 2:N/$temp/;
+ $h1 = $h1.$slash_string."2";
+ }
+
+ # write to output file
+ if($max_rate eq ""){
+ print out_file "$h1\n$seq\n";
+ }
+ elsif(filterRead($seq,$max_rate)){
+ print out_file "$h1\n$seq\n";
+ $high_qual_entries = $high_qual_entries + 1;
+ }
+ }
+ close in_file;
+ close out_file;
+ print "successfully written $fileName\n";
+ }
+ }
+ print "total entries: $total_entries\n";
+ print "low quality entries(entries containing 1:Y or 2:Y in the header): $y_entries\n";
+ $qual_entries = $total_entries - $high_qual_entries;
+ print "entries removed due to exceeding max_rate: $qual_entries \n";
+ closedir DIR;
+}