Source code for sgrna_modeler.features

import numpy as np
import pandas as pd
from Bio.SeqUtils import MeltingTemp
import re
import os.path

nt_codes = {'A':[1,0,0,0],
            'C':[0,1,0,0],
            'G':[0,0,1,0],
            'T':[0,0,0,1]}

[docs]def encode_seqs(seqs):
    # 3d array with samples x position x nt
    encoded_seqs = np.array([[nt_codes.get(x) for x in seq] for seq in seqs])
    return encoded_seqs

## Feature Engineering
[docs]def get_frac_g_or_c(dict, guide_sequence):
    """Get gc content
    :param context: sequence
    :param len: length of guide
    :param guide_start: position that guide starts
    :return: curr_dict with gc content
    """
    g_count = guide_sequence.count('G')
    c_count = guide_sequence.count('C')
    gc_frac = (g_count + c_count)/len(guide_sequence)
    dict['GC content'] = gc_frac
    return dict

[docs]def get_one_nt_counts(dict, guide, nts):
    for nt in nts:
        nt_frac = guide.count(nt)/len(guide)
        dict[nt] = nt_frac
    return dict

[docs]def get_two_nt_counts(dict, guide, nts):
    for nt1 in nts:
        for nt2 in nts:
            two_mer = nt1 + nt2
            nts_counts = guide.count(two_mer)
            nts_frac = nts_counts/(len(guide) - 1)
            dict[nt1 + nt2] = nts_frac
    return dict

[docs]def get_three_nt_counts(dict, guide, nts):
    for nt1 in nts:
        for nt2 in nts:
            for nt3 in nts:
                k_mer = nt1 + nt2 + nt3
                nts_counts = guide.count(k_mer)
                nts_frac = nts_counts/(len(guide) - 2)
                dict[nt1 + nt2 + nt3] = nts_frac
    return dict

[docs]def get_one_nt_pos(dict, context_sequence, nts, context_order):
    for i in range(len(context_order)):
        curr_nt = context_sequence[i]
        for nt in nts:
            key = context_order[i] + nt
            if curr_nt == nt:
                dict[key] = 1
            else:
                dict[key] = 0
    return dict

[docs]def get_two_nt_pos(dict, context_sequence, nts, context_order):
    for i in range(len(context_order) - 1):
        curr_nts = context_sequence[i:i+2]
        for nt1 in nts:
            for nt2 in nts:
                match_nts = nt1+nt2
                key = context_order[i] + match_nts
                if curr_nts == match_nts:
                    dict[key] = 1
                else:
                    dict[key] = 0
    return dict

[docs]def get_three_nt_pos(dict, context_sequence, nts, context_order):
    for i in range(len(context_order) - 2):
        curr_nts = context_sequence[i:i+3]
        for nt1 in nts:
            for nt2 in nts:
                for nt3 in nts:
                    match_nts = nt1+nt2+nt3
                    key = context_order[i] + match_nts
                    if curr_nts == match_nts:
                        dict[key] = 1
                    else:
                        dict[key] = 0
    return dict

[docs]def get_thermo(dict, guide_sequence, context_sequence):
    # Use Biopython to get thermo info. from context and guides
    dict['Tm, context'] = MeltingTemp.Tm_NN(context_sequence)
    third = len(guide_sequence)//3
    dict['Tm, start'] = MeltingTemp.Tm_NN(guide_sequence[0:third])
    dict['Tm, mid'] = MeltingTemp.Tm_NN(guide_sequence[third:2*third])
    dict['Tm, end'] = MeltingTemp.Tm_NN(guide_sequence[2*third:])
    return dict

[docs]def get_context_order(k):
    """
    :param k: length of kmer
    :return: list of characters of each nt position
    """
    context_order = [str(x + 1) for x in range(k)]
    return context_order

[docs]def get_guide_sequence(context, guide_start, guide_length):
    return context[(guide_start-1):(guide_start + guide_length -1)]

[docs]def get_physiochemical(curr_dict, guide, nts, physiochemical_data):
    nt_counts = np.array([guide.count(nt1+nt2) for nt1 in nts for nt2 in nts])
    no_dinucs = physiochemical_data.drop(columns=['Dinucleotides'])
    numeric_physio = np.transpose(np.array(no_dinucs))
    physio_sum = np.sum(nt_counts*numeric_physio, axis = 1)
    curr_dict = {**curr_dict, **dict(zip(no_dinucs.keys(), physio_sum))}
    return curr_dict

[docs]def get_zipper_pos(dict, context_sequence, nts, context_order):
    for i in range(len(context_order) - 2):
        curr_nts = context_sequence[i] + context_sequence[i+2]
        for nt1 in nts:
            for nt2 in nts:
                match_nts = nt1+nt2
                key = context_order[i] + nt1 + 'N' + nt2
                if curr_nts == match_nts:
                    dict[key] = 1
                else:
                    dict[key] = 0
    return dict

[docs]def get_zipper_counts(dict, guide, nts):
    for nt1 in nts:
        for nt2 in nts:
            regex = '(' + nt1 + '(A|C|T|G)' + nt2 + ')'
            nts_counts = len(re.findall(regex, guide))
            nts_frac = nts_counts/(len(guide) - 2)
            dict[nt1 + 'N' + nt2] = nts_frac
    return dict

[docs]def get_rep_counts(dict, context, nts, length):
    for nt in nts:
        k_mer = nt*length
        nts_counts = context.count(k_mer)
        nts_frac = nts_counts/(len(context) - 2)
        dict[nt + '*' + str(length)] = nts_frac
    return dict

[docs]def get_double_zipper(dict, context_sequence, nts, context_order):
    for i in range(len(context_order) - 3):
        curr_nts = context_sequence[i] + context_sequence[i+3]
        for nt1 in nts:
            for nt2 in nts:
                match_nts = nt1+nt2
                key = context_order[i] + nt1 + 'NN' + nt2
                if curr_nts == match_nts:
                    dict[key] = 1
                else:
                    dict[key] = 0
    return dict


[docs]def featurize_guides(kmers, features = None,
                     guide_start = 9, guide_length = 20):
    """Take guides and encodes for modeling
    :param kmers: vector of context sequences
    :param features: boolean dictionary of which feature types to inlcude:
        ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer',
        'Pos. Ind. Zipper','Pos. Dep. 1mer', 'Pos. Dep. 2mer',
        'Pos. Dep. 3mer', 'Pos. Dep. Zipper', 'Pos. Ind. Rep.', 'GC content',
        'Tm', 'Physio', 'Double Zipper']
    :param pam_start: int
    :param pam_end: int
    :param guide_start: int
    :param guide_end: int
    :return: featurized matrix
    """
    if features == None:
        # RS2
        features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer',
                    'Pos. Dep. 1mer', 'Pos. Dep. 2mer',
                    'GC content', 'Tm']
    possible_feats = {'Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer',
                      'Pos. Ind. Zipper','Pos. Dep. 1mer', 'Pos. Dep. 2mer',
                      'Pos. Dep. 3mer', 'Pos. Dep. Zipper', 'Pos. Ind. Rep.',
                      'GC content', 'Tm', 'Physio', 'Double Zipper'}
    if not set(features).issubset(possible_feats):
        diff = features - possible_feats
        assert ValueError(str(diff) + 'Are not currently supported as features')
    current_path = os.path.abspath(os.path.dirname(__file__))
    physio_path = os.path.join(current_path, 'data/features/physiochem.csv.zip')
    physiochemical_data = pd.read_csv(physio_path)
    k = len(kmers[0])
    context_order = get_context_order(k)
    nts = ['A', 'C', 'T', 'G']
    feature_dict_list = []
    for i in range(len(kmers)):
        curr_dict = {}
        context = kmers[i]
        guide_sequence = get_guide_sequence(context, guide_start, guide_length)
        if 'GC content' in features:
            curr_dict = get_frac_g_or_c(curr_dict, guide_sequence)
        if 'Pos. Ind. 1mer' in features:
            curr_dict = get_one_nt_counts(curr_dict, guide_sequence, nts)
        if 'Pos. Ind. 2mer' in features:
            curr_dict = get_two_nt_counts(curr_dict, guide_sequence, nts)
        if 'Pos. Ind. 3mer' in features:
            curr_dict = get_three_nt_counts(curr_dict, guide_sequence, nts)
        if 'Pos. Dep. 1mer' in features:
            curr_dict = get_one_nt_pos(curr_dict, context, nts, context_order)
        if 'Pos. Dep. 2mer' in features:
            curr_dict = get_two_nt_pos(curr_dict, context, nts, context_order)
        if 'Pos. Dep. 3mer' in features:
            curr_dict = get_three_nt_pos(curr_dict, context, nts, context_order)
        if 'Tm' in features:
            curr_dict = get_thermo(curr_dict, guide_sequence, context)
        if 'Physio' in features:
            curr_dict = get_physiochemical(curr_dict, guide_sequence, nts, physiochemical_data)
        if 'Pos. Dep. Zipper' in features:
            curr_dict = get_zipper_pos(curr_dict, context, nts, context_order)
        if 'Pos. Ind. Zipper' in features:
            curr_dict = get_zipper_counts(curr_dict, guide_sequence, nts)
        if 'Pos. Ind. Rep.' in features:
            curr_dict = get_rep_counts(curr_dict, context, nts, 4)
        if 'Double Zipper' in features:
            curr_dict = get_double_zipper(curr_dict, context, nts, context_order)
        feature_dict_list.append(curr_dict)
    feature_matrix = pd.DataFrame(feature_dict_list)
    return feature_matrix