Source code for treetime.seq_utils

import numpy as np
from Bio import Seq, SeqRecord


alphabet_synonyms = {'nuc':'nuc', 'nucleotide':'nuc', 'aa':'aa', 'aminoacid':'aa',
                     'nuc_nogap':'nuc_nogap', 'nucleotide_nogap':'nuc_nogap',
                     'aa_nogap':'aa_nogap', 'aminoacid_nogap':'aa_nogap',
                     'DNA':'nuc', 'DNA_nogap':'nuc_nogap'}

alphabets = {
            "nuc":           np.array(['A', 'C', 'G', 'T', '-']),

            "nuc_nogap":np.array(['A', 'C', 'G', 'T']),

            "aa":            np.array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
                                       'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
                                       'W', 'Y', '*', '-']),

            "aa_nogap": np.array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K',
                                       'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',
                                       'W', 'Y'])
            }

profile_maps = {
'nuc':{
    'A': np.array([1, 0, 0, 0, 0], dtype='float'),
    'C': np.array([0, 1, 0, 0, 0], dtype='float'),
    'G': np.array([0, 0, 1, 0, 0], dtype='float'),
    'T': np.array([0, 0, 0, 1, 0], dtype='float'),
    '-': np.array([0, 0, 0, 0, 1], dtype='float'),
    'N': np.array([1, 1, 1, 1, 1], dtype='float'),
    'X': np.array([1, 1, 1, 1, 1], dtype='float'),
    'R': np.array([1, 0, 1, 0, 0], dtype='float'),
    'Y': np.array([0, 1, 0, 1, 0], dtype='float'),
    'S': np.array([0, 1, 1, 0, 0], dtype='float'),
    'W': np.array([1, 0, 0, 1, 0], dtype='float'),
    'K': np.array([0, 0, 1, 1, 0], dtype='float'),
    'M': np.array([1, 1, 0, 0, 0], dtype='float'),
    'D': np.array([1, 0, 1, 1, 0], dtype='float'),
    'H': np.array([1, 1, 0, 1, 0], dtype='float'),
    'B': np.array([0, 1, 1, 1, 0], dtype='float'),
    'V': np.array([1, 1, 1, 0, 0], dtype='float')
    },

'nuc_nogap':{
    'A': np.array([1, 0, 0, 0], dtype='float'),
    'C': np.array([0, 1, 0, 0], dtype='float'),
    'G': np.array([0, 0, 1, 0], dtype='float'),
    'T': np.array([0, 0, 0, 1], dtype='float'),
    '-': np.array([1, 1, 1, 1], dtype='float'), # gaps are completely ignored in distance computations
    'N': np.array([1, 1, 1, 1], dtype='float'),
    'X': np.array([1, 1, 1, 1], dtype='float'),
    'R': np.array([1, 0, 1, 0], dtype='float'),
    'Y': np.array([0, 1, 0, 1], dtype='float'),
    'S': np.array([0, 1, 1, 0], dtype='float'),
    'W': np.array([1, 0, 0, 1], dtype='float'),
    'K': np.array([0, 0, 1, 1], dtype='float'),
    'M': np.array([1, 1, 0, 0], dtype='float'),
    'D': np.array([1, 0, 1, 1], dtype='float'),
    'H': np.array([1, 1, 0, 1], dtype='float'),
    'B': np.array([0, 1, 1, 1], dtype='float'),
    'V': np.array([1, 1, 1, 0], dtype='float')
    },

'aa':{
    'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine         Ala
    'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine        Cys
    'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD   Asp
    'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid   Glu
    'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine   Phe
    'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine         Gly
    'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine       His
    'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine      Ile
    'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine          Lys
    'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine         Leu
    'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine      Met
    'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe      Asn
    'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline         Pro
    'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine       Gln
    'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #ARginine        Arg
    'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Serine          Ser
    'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #Threonine       Thr
    'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Valine          Val
    'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Tryptophan      Trp
    'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Tyrosine        Tyr
    '*': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype='float'), #stop
    '-': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype='float'), #gap
    'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
    'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid    Asx
    'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid     Glx
    },

'aa_nogap':{
    'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Alanine         Ala
    'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Cysteine        Cys
    'D': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Aspartic AciD   Asp
    'E': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamic Acid   Glu
    'F': np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Phenylalanine   Phe
    'G': np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Glycine         Gly
    'H': np.array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Histidine       His
    'I': np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Isoleucine      Ile
    'K': np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Lysine          Lys
    'L': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Leucine         Leu
    'M': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Methionine      Met
    'N': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #AsparagiNe      Asn
    'P': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Proline         Pro
    'Q': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine       Gln
    'R': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], dtype='float'), #ARginine        Arg
    'S': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype='float'), #Serine          Ser
    'T': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype='float'), #Threonine       Thr
    'V': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], dtype='float'), #Valine          Val
    'W': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype='float'), #Tryptophan      Trp
    'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype='float'), #Tyrosine        Tyr
    'X': np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='float'), #not specified/any
    'B': np.array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype='float'), #Asparagine/Aspartic Acid    Asx
    'Z': np.array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype='float'), #Glutamine/Glutamic Acid     Glx
    }
}


def extend_profile(gtr, aln, logger=None):
    tmp_unique_chars = []
    for seq in aln:
        tmp_unique_chars.extend(np.unique(seq))

    unique_chars = np.unique(tmp_unique_chars)
    for c in unique_chars:
        if c not in gtr.profile_map:
            gtr.profile_map[c] = np.ones(gtr.n_states)
            if logger:
                logger("WARNING: character %s is unknown. Treating it as missing information"%c,1,warn=True)


def guess_alphabet(aln):
    total=0
    nuc_count = 0
    for seq in aln:
        total += len(seq)
        for n in np.array(list('acgtACGT-N')):
            nuc_count += np.sum(seq==n)
    if nuc_count>0.9*total:
        return 'nuc'
    else:
        return 'aa'


[docs]def seq2array(seq, word_length=1, convert_upper=False, fill_overhangs=False, ambiguous='N'): """ Take the raw sequence, substitute the "overhanging" gaps with 'N' (missequenced), and convert the sequence to the numpy array of chars. Parameters ---------- seq : Biopython.SeqRecord, str, iterable Sequence as an object of SeqRecord, string or iterable word_length : int, optional 1 for nucleotide or amino acids, 3 for codons etc. convert_upper : bool, optional convert the sequence to upper case fill_overhangs : bool If True, substitute the "overhanging" gaps with ambiguous character symbol ambiguous : char Specify the character for ambiguous state ('N' default for nucleotide) Returns ------- sequence : np.array Sequence as 1D numpy array of chars """ if isinstance(seq, str): seq_str = seq elif isinstance(seq, Seq.Seq): seq_str = str(seq) elif isinstance(seq, SeqRecord.SeqRecord): seq_str = str(seq.seq) else: raise TypeError("seq2array: sequence must be Bio.Seq, Bio.SeqRecord, or string. Got "+str(seq)) if convert_upper: seq_str = seq_str.upper() if word_length==1: seq_array = np.array(list(seq_str)) else: if len(seq_str)%word_length: raise ValueError("sequence length has to be multiple of word length") seq_array = np.array([seq_str[i*word_length:(i+1)*word_length] for i in range(len(seq_str)/word_length)]) # substitute overhanging unsequenced tails if fill_overhangs: gaps = np.where(seq_array != '-')[0] if len(gaps): seq_array[:gaps[0]] = ambiguous seq_array[gaps[-1]+1:] = ambiguous else: seq_array[:] = ambiguous return seq_array
[docs]def seq2prof(seq, profile_map): """ Convert the given character sequence into the profile according to the alphabet specified. Parameters ---------- seq : numpy.array Sequence to be converted to the profile profile_map : dic Mapping valid characters to profiles Returns ------- idx : numpy.array Profile for the character. Zero array if the character not found """ return np.array([profile_map[k] for k in seq])
[docs]def prof2seq(profile, gtr, sample_from_prof=False, normalize=True, rng=None): """ Convert profile to sequence and normalize profile across sites. Parameters ---------- profile : numpy 2D array Profile. Shape of the profile should be (L x a), where L - sequence length, a - alphabet size. gtr : gtr.GTR Instance of the GTR class to supply the sequence alphabet collapse_prof : bool Whether to convert the profile to the delta-function Returns ------- seq : numpy.array Sequence as numpy array of length L prof_values : numpy.array Values of the profile for the chosen sequence characters (length L) idx : numpy.array Indices chosen from profile as array of length L """ if rng is None: rng = np.random.default_rng() # normalize profile such that probabilities at each site sum to one if normalize: tmp_profile, pre=normalize_profile(profile, return_offset=False) else: tmp_profile = profile # sample sequence according to the probabilities in the profile # (sampling from cumulative distribution over the different states) if sample_from_prof: cumdis = tmp_profile.cumsum(axis=1).T randnum = rng.random(size=cumdis.shape[1]) idx = np.argmax(cumdis>=randnum, axis=0) else: idx = tmp_profile.argmax(axis=1) seq = gtr.alphabet[idx] # max LH over the alphabet prof_values = tmp_profile[np.arange(tmp_profile.shape[0]), idx] return seq, prof_values, idx
def normalize_profile(in_profile, log=False, return_offset = True): """return a normalized version of a profile matrix Parameters ---------- in_profile : np.array shape Lxq, will be normalized to one across each row log : bool, optional treat the input as log probabilities return_offset : bool, optional return the log of the scale factor for each row Returns ------- tuple normalized profile (fresh np object) and offset (if return_offset==True) """ if log: tmp_prefactor = in_profile.max(axis=1) tmp_prof = np.exp(in_profile.T - tmp_prefactor).T else: tmp_prefactor = 0.0 tmp_prof = in_profile norm_vector = tmp_prof.sum(axis=1) return (np.einsum('ai,a->ai',tmp_prof,1.0/norm_vector), (np.log(norm_vector) + tmp_prefactor) if return_offset else None)