#!/usr/bin/env perl # Convert a fastq to a fasta/qual combo using BioPerl, with some Linux commands use Bio::Perl; use Data::Dumper; use strict; use warnings; use threads; use Thread::Queue; use Getopt::Long; my $settings={}; $|=1; my %numSequences; # static for a subroutine exit(main()); sub main{ die("Usage: $0 -i inputFastqFile [-n numCpus -q outputQualfile -f outputFastaFile]") if(@ARGV<1); GetOptions($settings,('numCpus=s','input=s','qualOut=s','fastaOut=s')); my $file=$$settings{input}||die("input parameter missing"); my $outfasta=$$settings{fastaOut}||"$file.fasta"; my $outqual=$$settings{qualOut}||"$file.qual"; my $numCpus=$$settings{numCpus}||1; my @subfile=splitFastq($file,$numCpus); for my $f(@subfile){ threads->create(\&convert,$f,"$f.fasta","$f.qual"); } $_->join for (threads->list); # join the sub files together joinFastqFiles(\@subfile,$file); return 0; } sub convert{ my($file,$outfasta,$outqual)=@_; my $numSequences=numSequences($file); my $reportEvery=int($numSequences/100) || 1; print "$numSequences sequences to convert in $file\n"; my $in=Bio::SeqIO->new(-file=>$file,-format=>"fastq-illumina"); my $seqOut=Bio::SeqIO->new(-file=>">$outfasta",-format=>"fasta"); my $qualOut=Bio::SeqIO->new(-file=>">$outqual",-format=>"qual"); my $seqCount=0; my $percentDone=0; while(my $seq=$in->next_seq){ $seqOut->write_seq($seq); $qualOut->write_seq($seq); $seqCount++; if($seqCount%$reportEvery == 0){ $percentDone++; print "$percentDone%.."; } } print "Done with subfile $file.\n"; return 1; } sub joinFastqFiles{ my($subfile,$outfileBasename)=@_; my($command,$subfasta,$subqual); # fasta $subfasta.="$_.fasta " for(@$subfile); $command="cat $subfasta > $outfileBasename.fasta"; system($command); # qual $subqual.="$_.qual " for (@$subfile); $command="cat $subqual > $outfileBasename.qual"; system($command); return 1; } sub splitFastq{ my($file,$numCpus)=@_; my $prefix="FQ"; # for fastq my $numSequences=numSequences($file); my $numSequencesPerFile=int($numSequences/$numCpus); my $numSequencesPerFileRemainder=$numSequences % $numCpus; my $numLinesPerFile=$numSequencesPerFile*4; # four lines per read; this could become incorrect if there is a really long read (not currently likely) system("rm -r tmp;mkdir tmp;"); system("split -l $numLinesPerFile $file 'tmp/FQ'"); return glob("tmp/FQ*"); } # use Linux to find the number of sequences quickly, but cache the value because it is still a slow process # This should probably changed to `wc -l`/4 but I don't have time to test the change # TODO for anyone reading this: please change this method to wc -l divided by 4. sub numSequences{ my $file=shift; return $numSequences{$file} if($numSequences{$file}); my $num=`grep -c '^\@' $file`; chomp($num); $numSequences{$file}=$num; return $num; }
mens rolex replica watches buy replica u-boat watches rate replica watches http://bigreplicastore.com/ – replica vestal watches replica car watches
Heck yeah bay-bee keep them cmonig!