# Source code for yabul.fasta

"""FASTA reading and writing."""

from __future__ import print_function, division, absolute_import

import gzip
import textwrap

import pandas

def write_fasta(filename, sequences):
    """
    Write sequences to a FASTA file.

    Each record is written as a ">name" header line, followed by the
    sequence wrapped with ``textwrap.wrap`` (default width of 70
    characters), followed by one blank line.

    Parameters
    ----------
    filename : string
        File to write. If it ends with '.gz' the file will be gzip
        compressed.

    sequences : iterable of (name, sequence) pairs
        Sequences to write. Both name and sequence should be strings.
    """
    # gzip.open defaults to binary mode, so "wt" (write, text mode) is passed
    # explicitly to both openers to get the same str-based interface.
    opener = gzip.open if filename.endswith(".gz") else open

    # The context manager closes the handle even if a write fails part-way.
    with opener(filename, "wt") as handle:
        for name, sequence in sequences:
            handle.write(">")
            handle.write(name)
            handle.write("\n")
            for line in textwrap.wrap(sequence):
                handle.write(line)
                handle.write("\n")
            # Blank line separating consecutive records.
            handle.write("\n")
def read_fasta(filename):
    """
    Parse a fasta file to a pandas DataFrame.

    Compression is supported (via pandas read_csv) and is inferred by
    extension: '.gz', '.bz2', '.zip', or '.xz'.

    Parameters
    ----------
    filename : string
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame with columns "description" and "sequence". The index of
    the DataFrame (named "id") is the "sequence ID", i.e. the first
    space-separated token of the description.
    """
    # We (mis-) use pandas to read the file: every line becomes one row of a
    # single string column, and pandas handles decompression for us.
    lines = pandas.read_csv(
        filename,
        header=None,
        skip_blank_lines=True,
        dtype=str,
        na_filter=False,  # keep text such as "NA" as-is rather than NaN
        quoting=3,  # QUOTE_NONE
        comment=';',  # Fasta comment lines start with ';'
        sep="\0",  # null separator: never split, always read one column
    )
    assert lines.shape[1] == 1  # one column
    lines.columns = ["raw"]

    result = []
    current_id = None  # description of the record being accumulated
    pieces = []  # sequence lines seen so far for the current record
    for line in lines.raw:
        if line.startswith(">"):
            # A new header terminates the previous record, if there was one.
            if current_id is not None:
                result.append((current_id, "".join(pieces)))
            pieces.clear()
            current_id = line[1:]
        else:
            pieces.append(line)

    # Handle last sequence
    if current_id is not None:
        result.append((current_id, "".join(pieces)))

    result = pandas.DataFrame(result, columns=["description", "sequence"])
    # The sequence ID is the first whitespace-separated token of the header.
    result.index = result.description.str.split().str.get(0)
    result.index.name = "id"
    return result