#! /usr/bin/env python2.7 desc = """ Split an ACE format assembly file into multiple single ACE files each with a single contig. The resulting ACE contig files are formatted so that they can be read by CodonCode (http://www.codoncode.com). $ ACEAssemblySplitter.py test.ace -s 2 -e 3 - prints the second and third contigs in the test.ace file Version 1.0 Cymon J. Cox Nov 2011 """ import sys try: import argparse except ImportError: print "Requires Python >= 2.7." sys.exit() import os import argparse import textwrap import subprocess def count(filename, verbose=True): cmd = "grep -c '^CO ' %s" % filename child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = child.communicate() if not child.returncode == 0: print "grep returned error %s" % child.returncode print "stderr: %s\n" %stderr sys.exit(1) else: if verbose: print "Number of contigs in %s: %s" % (filename, stdout) return int(stdout) def main(filename, count_only, start, end, verbose): if not os.path.exists(filename): print "Cannot find file %s" % filename sys.exit(1) if count_only: count(filename) sys.exit(0) else: if start or end: n_contigs = count(filename, verbose=False) if start and end: if start > end: msg = "Cannot start at %i and end at %i" print msg (start, end) sys.exit(1) if start > n_contigs: msg = "Only %i contigs in %s - cannot start at %i" print msg % (n_contigs, filename, start) sys.exit(1) if end > n_contigs: msg = "Only %i contigs in %s - cannot end at %i" print msg % (n_contigs, filename, end) sys.exit(1) if not end: end = n_contigs isPreviousAF = False inRT = False inCT = False skipNextLine = False inHeader = True foundStart = False lines = [] counter = 1 fileLines = open(filename).readlines() for line in fileLines: if line.startswith("CO "): if lines != []: if inHeader: #Skip over header: lines = [] inHeader = False lines.append(line) continue #Are we there yet? if not foundStart: if counter == start: foundStart = True else: lines = [] counter +=1 lines.append(line) continue if foundStart: if counter < end: #CO FucusALL_rep_c1 1819 1428 94 U name = lines[0].split()[1] outfilename = "%s.ace" % name head = "AS 1 1\n\n" lines.insert(0, head) fh = open(outfilename, "w") fh.writelines(lines) fh.close() counter +=1 lines = [] else: break lines.append(line) continue if skipNextLine: skipNextLine = False continue #Delete CT and RT stanzas elif line.startswith("RT{"): inRT = True continue elif line.startswith("CT{"): inCT = True elif line.startswith("}"): #Both CT and RT finish with a C} inCT = inRT = False #Need to miss the next "\n" line skipNextLine = True continue elif inCT or inRT: continue #Delete the blank line after AF line elif line.startswith("AF"): isPreviousAF = True lines.append(line) elif line.startswith("\n") and isPreviousAF: isPreviousAF = False continue elif line.startswith("RD "): #delete previous "\n" line lines = lines[:-1] lines.append(line) elif line.startswith("DS "): #delete previous "\n" line lines = lines[:-1] lines.append(line) else: lines.append(line) #print the last one: name = lines[0].split()[1] outfilename = "%s.ace" % name head = "AS 1 1\n\n" lines.insert(0, head) fh = open(outfilename, "w") fh.writelines(lines) fh.close() counter +=1 if verbose: print "Done. Split %i Contigs." % counter if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent(desc), ) parser.add_argument("filename", help="ACE assembly filename", ) parser.add_argument("-c", "--count", default=False, action='store_true', help="Count number of contigs in assembly only. Default=False") parser.add_argument("-s", "--start", default=1, type=int, dest="start", help="Start at record number N (integer). Default=1") parser.add_argument("-e", "--end", default=None, type=int, dest="end", help="End at record number N (integer). Default=last in file") parser.add_argument("-v", "--verbose", default=False, action='store_true', help="Verbose. Default=False") args = parser.parse_args() main(args.filename, args.count, args.start, args.end, args.verbose)
Recent Comments