From 423f343868d060502556e1f7747eacecb7929b3d Mon Sep 17 00:00:00 2001 From: athreyab Date: Tue, 5 Jun 2012 23:35:32 +0000 Subject: [PATCH] documentation svn path=/; revision=343 --- .../new_illumina/break_pile_to_pairs.pl | 50 +++++++ .../new_illumina/break_pile_to_pairs_bck.pl | Bin 0 -> 2043 bytes .../illumina_parse/new_illumina/create_pairs | 101 +++++++++++++ .../new_illumina/illumina_parse.pl | 139 ++++++++++++++++++ 4 files changed, 290 insertions(+) create mode 100755 Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl create mode 100755 Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl create mode 100755 Personnel/athreyab/illumina_parse/new_illumina/create_pairs create mode 100755 Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl diff --git a/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl new file mode 100755 index 0000000..1eb37ed --- /dev/null +++ b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs.pl @@ -0,0 +1,50 @@ +#!/usr/bin/perl +use warnings; +use strict; + +my $infile=$ARGV[0]; +die "usage : perl ./script \n" unless $infile; + +my $pairs_out=$infile.".pairs"; +my $single_out=$infile.".singletons"; + +my %reads; +open(IN,"<",$infile) || die "cannot open $infile!\n$!\nexiting...\n"; +until(eof(IN)){ + +#>6_1_1046_17414\1 -- or -- >HWI-ST609:130:D0KU9ACXX:3:1101:1343:1910\2 + + my $header=; + my $seq =; + chomp $header; + chomp $seq; + $header=~s/\>//; + my $pairnum; + ($header,$pairnum)=split(/\\/,$header); + if($reads{$header}){ + if($reads{$header}{$pairnum}){ + die "$header pair $pairnum already defined. weird.\n"; + }else{ + $reads{$header}{$pairnum}=$seq; + } + }else{ + my %hash; + $hash{$pairnum}=$seq; + $reads{$header}=\%hash; + } +} +close IN; + +open(PAIRS,">",$pairs_out) || die "cannot open $pairs_out!\n$!\nexiting...\n"; +open(SINGS,">",$single_out) || die "cannot open $single_out!\n$!\nexiting...\n"; +foreach my $key (keys %reads){ + if ($reads{$key}{1} && $reads{$key}{2}){ + print PAIRS ">$key\\1\n$reads{$key}{1}\n$key\\2\n$reads{$key}{2}\n"; + }elsif($reads{$key}{1} && !$reads{$key}{2}){ + print SINGS ">$key\\1\n$reads{$key}{1}\n"; + }elsif($reads{$key}{2} && !$reads{$key}{1}){ + print SINGS ">$key\\2\n$reads{$key}{2}\n"; + } +} +close PAIRS; +close SINGS; diff --git a/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl b/Personnel/athreyab/illumina_parse/new_illumina/break_pile_to_pairs_bck.pl new file mode 100755 index 0000000000000000000000000000000000000000..11036fc35a6fa85ee21485a2a433914ca1f2b4da GIT binary patch literal 2043 zcmV=y!_wQJ@_h=d{ z2vZm^uzsPX3usgaTt>+z7qCj>CCg@TndYMWt`^q*!y-d@{K#YUvEU+$lDpYFu_oq| zX-fTt(x2L{sSM}^&nyj0?Xg+6Fttxndj;2E*>xipz;p`MU=Xr+W5JCnLYCCR_#-Z( zbJLI^PLT5uiHW(2O!I9JvfvY-QPcqynV&4778J|C&loJ(DoWTAWgtk`YZ8iaLE#zv zwg+hs#fMtjAmWguLL1#jXP7LmOgB>0V*5`GxGx%S*eYh@^bS6vo;7&&v^nDn`3@AUc-e3aDRArM$dA+uk!p+pONvJ|lTC%&{ zLgt;@>zT$fMN^khfi8S{(kHYzC3BTsg|~3*RgN{X!kl*O?K_s?nnm#x@UZSa7oR;W zNJAEn_R6E*uq!A@D(1MMI(wC(H>p7CO25KTsp76Yz^2a`2f~?VuwwVH!ensSy$o6H zhu|Ex>OOCk3R^3YQ95uoeIP!j-H2+nKmo%b7NA7vh}BC%8~R923bY^f-rYRm zHi`6rgexT!LaYwLJKZhkhOwHUdq~nPC3O-5U&v!hY8K39`ipDTa+? zCyJEge=!qynNfSD>q=!AR$01+(yZ)W*fDNV?bBj|OAC>z%Z=Jb^T4ZCkW@ioM&@G*oS3F*ryCjwKP+Ji_D@%T_{;U^WJk|50}c$G9J5EtCDtl9{bf<^|g?09|)dd;lt&6DC)rqqhn zAWeQ{I7zO2EGZ+J2SK*>#l>9g&c}d4Lq$HU^i|8T|15_w)~XP)MtIXjp?sJ(sMvI9}mSmo(*Jv=WmBAE94S5g+2mEV&b5 zp|8Gl$!M=B7Nj{wScr`(o?^UvSB&GNIsLkJK00^Q655&bA#wH8#6+GY;NmgHsYPly z1*+cVw8Y=rQyA$I+&NKuJ#8T`huwy@DB4`Tn=KAB_l0V4>4qiPCAz==aMg#(M~}v3 zxxIm(bOexCt(#PW7hs>LV#pI5`FIy!qcv1`P=SWY>iJm0ZQ55qb<{mnqU54DcGzov zRHNloK9k5OBB%Jai{daKi-vYvq{5GP7sKc-T+r#+Vc>~neh`99)<0s6LsvQK0rfuI zNBP|7YJR^nl!w04gOe!?T(c65RjwSZZT0BdlJWnXWhsV89K0D? zWIKQV9lh4Ax~UmO+RVu(5rLwIx5kY?^)Wdke!8h z;5LllyJx)tgj9PU=~YD4^oYp6a%cXdGgFzZ)llg6p#$#4U(4d5R93tWconNxwZ@mI zaRh2#8gaZt340I<@6!R)AL}Jg^wt&(Q-D6}J!WX3sM=*pwUV%EOQEYukW4yKz^St+ z(ZgMfRvnre85-!k*Q4`ZiOzd9I})pWgDY;R86tH!>Acc{&<#R=1_R z%c^0j_j-Hn{jX(FmW)O~{{?Z%>6cn*d!iRgZ907bFLOq7AbSWHlRrY0+k}1_ew1&SMSY7s{lERBaik+1=}1R9 Z(vgmIq$3^aNJna+zXO6?DXjn~006;w0P_F< literal 0 HcmV?d00001 diff --git a/Personnel/athreyab/illumina_parse/new_illumina/create_pairs b/Personnel/athreyab/illumina_parse/new_illumina/create_pairs new file mode 100755 index 0000000..10046ef --- /dev/null +++ b/Personnel/athreyab/illumina_parse/new_illumina/create_pairs @@ -0,0 +1,101 @@ +#!/bin/bash + +# creates .pair and singleton files . Acts on a folder containing all the +# files and generates an output folder containing pair and singleton +# files. Surprisingly the bash script i wrote to do the whole thing was +# incredibly slower than the perl script that Sam was already using. +# This bash script basically calls the perl script to do all the work. + + +function usage(){ + echo -e "Usage: /create_pairs \n"; +} + +input_folder=$1 +output_folder=$2 +max_rate=$3 +slash_string=$4 +#show help +if [ "$input_folder" == "--help" ] +then + echo -e "Help\n" + usage; + exit; +fi + +#check if the input and output folder are defined in command line +if [ -z "$input_folder" ] +then + echo -e "Input folder is not defined \n" + usage; + exit; +elif [ -z "$output_folder" ] +then + echo -e "Output folder is not defined \n" + usage; + exit; +#check if max_rate is defined +elif [ -z "$max_rate" ] +then + echo -e "max rate is not defined \n" + usage; +else + if [ -d "$output_folder" ]; then + echo "output directory already exists. Exiting now!" + exit; + fi + mkdir $output_folder + # call the illumina parse script. This script converts the fastq + # files to fasta format and removes low quality entries. By doing + # this, we can speed up. + ./illumina_parse.pl -fi "$input_folder" "$output_folder" "$max_rate" "$slash_string" + # files array1.txt and array2.txt are temporary files containing + # the file names . + ls "$output_folder/" | grep "R1" | sort > "array1.txt" + ls "$output_folder/" | grep "R2" | sort > "array2.txt" + # read files array1 and array2.txt to create an array in bash. + index1=0; + while read line ; do + array1[$index1]="$line" + index1=$(($index1+1)) + done < array1.txt + index2=0; + while read line ; do + array2[$index2]="$line" + index2=$(($index2+1)) + done < array2.txt + # make sure array1 and array2 are of equal size. If not, it means + # some readings are missing. Print a warning message about it and + # move along + if [ $index1 -eq $index2 ] + then + size=$index1; + elif [ $index1 > $index2 ] + then + size=$index2; + echo -e "Reading R2 misses one or few files\n"; + else + size=$index1; + echo -e "Reading R1 misses one or few files\n"; + fi + echo -e "size is $size\n"; + i=0; + # loop throught the array. + # 1. combine files from read 1 and read2. + # 2. and then run sam's script to create pair and singleton files + while [ $i -lt $size ]; do + echo -e "Copying $output_folder/${array2[$i]} into $output_folder/${array1[$i]}\n"; + infile="$output_folder/${array2[$i]}"; + infile1="$output_folder/${array1[$i]}"; + # append file from reading 2 into file from reading 1 + cat "$infile" >> "$infile1"; + # remove file from reading2 now that it is appended to + # file form read1. + rm -f "$infile"; + echo -e "Creating singleton and pair files from $infile1\n" + ./break_pile_to_pairs.pl "$infile1" + # we don't need the file anymore. remove it + rm -f $infile1 + i=$(($i+1)) + done +fi diff --git a/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl b/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl new file mode 100755 index 0000000..7521fc9 --- /dev/null +++ b/Personnel/athreyab/illumina_parse/new_illumina/illumina_parse.pl @@ -0,0 +1,139 @@ +#!/usr/bin/perl + +use Switch; + + +$optionId = $ARGV[0]; + + +# if command line option is -co, then it is for +if($optionId eq "-co" && ($#ARGV == 2 || $#ARGV == 3)){ + $in_folder = $ARGV[1]; + $out_folder = $ARGV[2]; + $slash_string = $ARGV[3]; + if($slash_string eq ""){ + $slash_string = '\\'; + } + process_folders($in_folder,$out_folder,"",$slash_string); +} + +# if command line option is -fi, then it is for filtering +# entries with low quality. we also convert the file to fa format. +elsif($optionId eq "-fi" && ($#ARGV == 3 || $#ARGV == 4)){ + $in_folder = $ARGV[1]; + $out_folder = $ARGV[2]; + $max_rate = $ARGV[3]; + $slash_string = $ARGV[4]; + if($max_rate =~ /[^0-9\.]/){ + print "\nmax_rate supplied: '$max_rate', is not a number. check if you supplied all 4 params - option(-fi), an input directory, output directory and a max_rate \n"; + showUsage(); + } + if($slash_string eq ""){ + $slash_string = '\\'; + } + process_folders($in_folder,$out_folder,$max_rate,$slash_string); +} +elsif($optionId eq "-h" |$optionId eq '--help'){ + showUsage(); +} +else{ + print "Incorrect parameters were supplied\n"; + showUsage(); +} + +sub showUsage +{ + print "\nUsage: \n"; + print "convert fastq to fasta format:\t./illumina_parse -co []\n"; + print "filter low quality entries:\t./illumina_parse -fi []\n"; + print "[] = optional parameter\n"; + die ""; +} + +sub isnan { ! defined( $_[0] <=> 9**9**9 ) } +# checks if the % of N is less than max rate specified in the command line +sub filterRead { + my $seq =shift; + my $maxRate =shift; + my $length =length($seq); + my $count =0; + $count++ while $seq=~m/N/g; + $count++ while $seq=~m/\./g; + return 1 if(($count/$length)<=$maxRate); + return 0; +} + +sub process_folders{ + $in_folder = $_[0]; + $out_folder = $_[1]; + $max_rate = $_[2]; + $slash_string = $_[3]; + opendir(DIR, $in_folder) or die "can't open directory $in_folder: $!\n"; + unless(-d $out_folder){ + mkdir $out_folder or die "can't create directory $out_folder: $! \n"; + } + # read file names from the input folder + @files = readdir DIR; + $y_entries = 0; + $total_entries = 0; + $high_qual_entries = 0; + # loop through each file in the input folder + foreach $fileName (@files) { + # make sure the file is fastq file + if(index($fileName,'.fastq') != -1){ + open(in_file, "<","$in_folder/$fileName") || die "Error: file '$in_folder/$fileName' can not be opened\n"; + print "processing file $in_folder/$fileName\n"; + $fileName =~ s/.fastq/.fa/; + open(out_file,">>","$out_folder/$fileName") || die "Error: file $out_folder/$fileName can not be opened\n"; + until(eof(in_file)){ + # read 4 lines - each entry contains 4 lines + $h1=; + $seq=; + $h2=; + $qual=; + # counter for total no of entries + $total_entries = $total_entries + 1; + # trim the firsst two lines. + chomp $h1; + chomp $seq; + # repalce @ with > + $h1=~s/\@/>/; + # discard the entry if it has :y in the header + if(index($h1, ' 1:Y') != -1 || index($h1,' 2:Y') != -1){ + $y_entries = $y_entries + 1; + next; + } + # some formatting sam asked for + if(index($h1, ' 1:N') != -1){ + #$temp = $slash_string."1:N"; + $temp = ":N"; + $h1 =~ s/ 1:N/$temp/; + $h1 = $h1.$slash_string."1"; + } + elsif(index($h1, ' 2:N') != -1){ + #$temp = $slash_string."2:N"; + $temp = ":N"; + $h1 =~ s/ 2:N/$temp/; + $h1 = $h1.$slash_string."2"; + } + + # write to output file + if($max_rate eq ""){ + print out_file "$h1\n$seq\n"; + } + elsif(filterRead($seq,$max_rate)){ + print out_file "$h1\n$seq\n"; + $high_qual_entries = $high_qual_entries + 1; + } + } + close in_file; + close out_file; + print "successfully written $fileName\n"; + } + } + print "total entries: $total_entries\n"; + print "low quality entries(entries containing 1:Y or 2:Y in the header): $y_entries\n"; + $qual_entries = $total_entries - $high_qual_entries; + print "entries removed due to exceeding max_rate: $qual_entries \n"; + closedir DIR; +} -- 2.34.1