小生这厢有礼了(BioFaceBook Personal Blog) » perl

Merging separate sequence and quality files to FASTQ

szypanther — Thu, 07 Jun 2012 06:51:29 +0000

#!/usr/bin/perl -w

use strict;

use Bio::SeqIO;
use Bio::Seq::Quality;

use Getopt::Long;

## At least one argument is required.  The first argument is the fasta
## file; the second is the quality file.  When only one argument is
## given, the quality file name is the fasta name with ".qual" appended.
@ARGV
  or die "pass a fasta and a fasta-quality file\n";

my ($seq_infile, $qual_infile) = @ARGV;

$qual_infile = "$ARGV[0].qual"
  if @ARGV == 1;

## Input streams: one parses the sequence file as fasta, the other
## parses the quality file in 'qual' format.
my $in_seq_obj  = Bio::SeqIO->new( -format => 'fasta', -file => $seq_infile  );
my $in_qual_obj = Bio::SeqIO->new( -format => 'qual',  -file => $qual_infile );

## Output stream in fastq format.  Neither -file nor -fh is given here;
## NOTE(review): Bio::SeqIO presumably falls back to STDOUT in that
## case -- confirm against the Bio::SeqIO documentation.
my $out_fastq_obj = Bio::SeqIO->new( -format => 'fastq' );

## Walk both input streams in lock step: the Nth record of the sequence
## file is paired with the Nth record of the quality file.  The loop
## ends when the sequence file is exhausted.
my $record_count = 0;

while (my $seq_obj = $in_seq_obj->next_seq) {
  $record_count++;

  ## The quality file may be shorter than the sequence file; fail with
  ## a clear message instead of calling a method on undef.
  my $qual_obj = $in_qual_obj->next_seq;
  die "record $record_count: sequence '", $seq_obj->id,
      "' from '$seq_infile' has no matching entry in '$qual_infile'\n"
    unless defined $qual_obj;

  ## Both files must list their records in the same order.
  die "record $record_count: id mismatch, sequence '", $seq_obj->id,
      "' vs quality '", $qual_obj->id, "'\n"
    unless $seq_obj->id eq $qual_obj->id;

  ## Combine the id and residues of the sequence record with the
  ## quality values of the qual record into one Bio::Seq::Quality
  ## object.
  my $bsq_obj =
    Bio::Seq::Quality->new( -id   => $seq_obj->id,
                            -seq  => $seq_obj->seq,
                            -qual => $qual_obj->qual,
                          );

  ## and print it out.
  $out_fastq_obj->write_fastq($bsq_obj);
}

## Leftover quality records do not affect the output already written,
## but they usually mean the two files do not belong together.
warn "'$qual_infile' contains more records than '$seq_infile'; ",
     "the extra quality records were ignored\n"
  if defined $in_qual_obj->next_seq;

Converting FASTQ to FASTA QUAL files

szypanther — Thu, 07 Jun 2012 06:48:57 +0000

#!/usr/bin/env perl

# Convert a fastq to a fasta/qual combo using BioPerl, with some Linux commands

use Bio::Perl;
use Data::Dumper;
use strict;
use warnings;
use threads;
use Thread::Queue;
use Getopt::Long;

# Hash ref that receives the parsed command-line options; main() hands
# it to GetOptions().
my $settings = {};

# Turn on autoflush for the currently selected output handle so that
# progress messages show up as soon as they are printed.
$| = 1;

# File-level cache used by numSequences(): file name => sequence count.
my %numSequences = ();

# Run the program and use main()'s return value as the exit status.
exit( main() );

# Entry point: parse the options, split the input FASTQ into chunks,
# convert every chunk in its own thread, then concatenate the per-chunk
# results into the final fasta and qual files.
#
# Options (stored in the file-level $settings hash ref):
#   -input     FASTQ file to convert (required)
#   -numCpus   number of chunks/threads to use (default 1)
#   -fastaOut  name of the final fasta file (default "<input>.fasta")
#   -qualOut   name of the final qual file  (default "<input>.qual")
#
# Returns 0, which the caller passes to exit().  Dies on bad options,
# on an unreadable or empty input, and when a conversion thread fails.
sub main{
  die("Usage: $0 -i inputFastqFile [-n numCpus -q outputQualfile -f outputFastaFile]") if(@ARGV<1);

  # numCpus is declared as an integer so that garbage is rejected here
  # and not later as a division error.
  GetOptions($settings,('numCpus=i','input=s','qualOut=s','fastaOut=s'))
    or die("Could not parse the command line options");

  my $file=$$settings{input}||die("input parameter missing");
  die("Input file $file is not readable") unless(-r $file);
  my $outfasta=$$settings{fastaOut}||"$file.fasta";
  my $outqual=$$settings{qualOut}||"$file.qual";
  my $numCpus=$$settings{numCpus}||1;
  die("numCpus must be a positive integer") if($numCpus<1);

  my @subfile=splitFastq($file,$numCpus);
  # Without chunks there is nothing to convert or concatenate.
  die("No sequences found in $file") unless(@subfile);

  # The return context of a thread is fixed at creation time; creating
  # the threads in scalar context makes join() return convert()'s value.
  my @thread;
  for my $f(@subfile){
    my $thr=threads->create(\&convert,$f,"$f.fasta","$f.qual");
    push(@thread,$thr);
  }
  my $numFailed=0;
  for my $thr(@thread){
    my $ok=$thr->join;
    $numFailed++ if(!$ok);
  }
  die("$numFailed conversion thread(s) failed") if($numFailed);

  # join the sub files together
  joinFastqFiles(\@subfile,$file);

  # joinFastqFiles() always writes "$file.fasta" and "$file.qual"; move
  # them when the user asked for other names with -fastaOut/-qualOut.
  require File::Copy;
  my %finalName=("$file.fasta"=>$outfasta,"$file.qual"=>$outqual);
  for my $joined(sort keys %finalName){
    next if($finalName{$joined} eq $joined);
    File::Copy::move($joined,$finalName{$joined})
      or die("Could not move $joined to $finalName{$joined}: $!");
  }

  return 0;
}

# Convert one FASTQ file into a fasta file and a qual file.
#
# Arguments:
#   $file     - input FASTQ file, parsed with the "fastq-illumina" format
#   $outfasta - fasta file to create (overwritten if it exists)
#   $outqual  - qual file to create (overwritten if it exists)
#
# Prints progress to the selected output handle and returns 1.
sub convert{
  my($file,$outfasta,$outqual)=@_;

  my $numSequences=numSequences($file);
  # Report roughly once per percent; at least once per sequence for
  # small files.
  my $reportEvery=int($numSequences/100) || 1;
  print "$numSequences sequences to convert in $file\n";

  my $in=Bio::SeqIO->new(-file=>$file,-format=>"fastq-illumina");
  my $seqOut=Bio::SeqIO->new(-file=>">$outfasta",-format=>"fasta");
  my $qualOut=Bio::SeqIO->new(-file=>">$outqual",-format=>"qual");
  my $seqCount=0;
  while(my $seq=$in->next_seq){
    # The same object is written twice: the fasta writer emits the
    # residues, the qual writer emits the quality values.
    $seqOut->write_seq($seq);
    $qualOut->write_seq($seq);
    $seqCount++;
    if($seqCount%$reportEvery == 0){
      # Derive the percentage from the real progress; a plain counter is
      # wrong whenever the number of sequences is not a multiple of 100.
      my $percentDone=$numSequences ? int(100*$seqCount/$numSequences) : 100;
      $percentDone=100 if($percentDone>100);
      print "$percentDone%..";
    }
  }
  print "Done with subfile $file.\n";
  return 1;
}

# Concatenate the converted chunks into the final output files.
#
# Arguments:
#   $subfile         - array ref of chunk base names; for every entry the
#                      files "<entry>.fasta" and "<entry>.qual" are read
#   $outfileBasename - base name of the result; "<base>.fasta" and
#                      "<base>.qual" are created (overwritten if present)
#
# The chunks are appended in the order given.  The copy is done in Perl,
# so file names containing spaces or shell metacharacters are safe, and
# an empty chunk list yields empty output files.
# Returns 1; dies when a file cannot be opened, read, or written.
sub joinFastqFiles{
  my($subfile,$outfileBasename)=@_;

  for my $ext(qw(fasta qual)){
    my $outfile="$outfileBasename.$ext";
    open(my $outFh,'>',$outfile) or die("Could not open $outfile for writing: $!");
    binmode($outFh);

    for my $part(@$subfile){
      my $infile="$part.$ext";
      open(my $inFh,'<',$infile) or die("Could not open $infile for reading: $!");
      binmode($inFh);

      # Copy in fixed-size blocks so large chunks are never held in
      # memory as a whole.
      my($buffer,$numRead);
      while($numRead=read($inFh,$buffer,65536)){
        print $outFh $buffer or die("Could not write to $outfile: $!");
      }
      die("Could not read from $infile: $!") if(!defined($numRead));
      close($inFh);
    }

    # Buffered write errors only surface on close.
    close($outFh) or die("Could not close $outfile: $!");
  }

  return 1;
}

# Split a FASTQ file into at most $numCpus chunks using split(1).
#
# Arguments:
#   $file    - FASTQ file to split
#   $numCpus - desired number of chunks; values below 1 are treated as 1
#
# Side effects: the directory "tmp" in the current working directory is
# removed and recreated, so anything already stored there is lost.
# Returns the list of chunk files (tmp/FQ*).  The list is empty when the
# input contains no sequences.  Dies when a system command fails.
sub splitFastq{
  my($file,$numCpus)=@_;
  $numCpus=1 if(!$numCpus || $numCpus<1);

  my $numSequences=numSequences($file);
  # Round up: rounding down leaves a remainder that split(1) puts into an
  # extra chunk (numCpus+1 files), and yields "split -l 0" when there
  # are fewer sequences than CPUs.
  my $numSequencesPerFile=int(($numSequences+$numCpus-1)/$numCpus);
  $numSequencesPerFile=1 if($numSequencesPerFile<1);
  my $numLinesPerFile=$numSequencesPerFile*4; # four lines per read; this could become incorrect if there is a really long read (not currently likely)

  system("rm -rf tmp && mkdir tmp")==0
    or die("Could not recreate the tmp directory");
  # List form: $file never passes through a shell.
  system("split","-l",$numLinesPerFile,"--",$file,"tmp/FQ")==0
    or die("Could not split $file into chunks of $numLinesPerFile lines");

  return glob("tmp/FQ*");
}

# Return the number of sequences in a FASTQ file, assuming four lines
# per read (the same assumption splitFastq() makes).
#
# The count is the number of lines divided by four.  Counting lines that
# start with '@' is not reliable, because '@' is also a valid quality
# character and can therefore begin a quality line.
#
# The result is cached in the file-level %numSequences hash because
# scanning a large file is slow.  Dies if the file cannot be opened.
sub numSequences{
  my $file=shift;
  # exists() so that a cached count of 0 is honoured as well
  return $numSequences{$file} if(exists($numSequences{$file}));

  open(my $fh,'<',$file) or die("Could not open $file for reading: $!");
  binmode($fh);
  my $numLines=0;
  my $lastChar='';
  my $buffer;
  while(read($fh,$buffer,1048576)){
    $numLines+=($buffer=~tr/\n//);
    $lastChar=substr($buffer,-1);
  }
  close($fh);
  # A final line without a trailing newline still counts as a line.
  $numLines++ if($lastChar ne '' && $lastChar ne "\n");

  my $num=int($numLines/4);
  $numSequences{$file}=$num;
  return $num;
}

使用CPAN模块自动安装perl模块

szypanther — Mon, 14 May 2012 02:13:25 +0000

命令：
perl -MCPAN -e shell

初次运行CPAN时需要做一些设置：一路回车即可，最后选择一个离自己最近的CPAN镜像站点，例如国内的中国自由软件库： ftp://freesoft.cgi.gov.cn/pub/languages/perl/CPAN

获得帮助
cpan>h

列出CPAN上所有模块的列表
cpan>m

安装模块

perl -MCPAN -e shell
cpan> install IO::Wrap
cpan> install Net::Server
cpan> install MIME::Words

也可以合并成一条命令，如：
perl -MCPAN -e 'install Net::Server'

退出
cpan>q
如果自动安装失败，可以手动安装跳过测试：
cd /root/.cpan/build/Net-Server-0.97/
perl Makefile.PL
make install