Source code for sgrna_modeler.features

import numpy as np
import pandas as pd
from Bio.SeqUtils import MeltingTemp
import re
import os.path

nt_codes = {'A':[1,0,0,0],
            'C':[0,1,0,0],
            'G':[0,0,1,0],
            'T':[0,0,0,1]}

[docs]def encode_seqs(seqs): # 3d array with samples x position x nt encoded_seqs = np.array([[nt_codes.get(x) for x in seq] for seq in seqs]) return encoded_seqs
## Feature Engineering
[docs]def get_frac_g_or_c(dict, guide_sequence): """Get gc content :param context: sequence :param len: length of guide :param guide_start: position that guide starts :return: curr_dict with gc content """ g_count = guide_sequence.count('G') c_count = guide_sequence.count('C') gc_frac = (g_count + c_count)/len(guide_sequence) dict['GC content'] = gc_frac return dict
[docs]def get_one_nt_counts(dict, guide, nts): for nt in nts: nt_frac = guide.count(nt)/len(guide) dict[nt] = nt_frac return dict
[docs]def get_two_nt_counts(dict, guide, nts): for nt1 in nts: for nt2 in nts: two_mer = nt1 + nt2 nts_counts = guide.count(two_mer) nts_frac = nts_counts/(len(guide) - 1) dict[nt1 + nt2] = nts_frac return dict
[docs]def get_three_nt_counts(dict, guide, nts): for nt1 in nts: for nt2 in nts: for nt3 in nts: k_mer = nt1 + nt2 + nt3 nts_counts = guide.count(k_mer) nts_frac = nts_counts/(len(guide) - 2) dict[nt1 + nt2 + nt3] = nts_frac return dict
[docs]def get_one_nt_pos(dict, context_sequence, nts, context_order): for i in range(len(context_order)): curr_nt = context_sequence[i] for nt in nts: key = context_order[i] + nt if curr_nt == nt: dict[key] = 1 else: dict[key] = 0 return dict
[docs]def get_two_nt_pos(dict, context_sequence, nts, context_order): for i in range(len(context_order) - 1): curr_nts = context_sequence[i:i+2] for nt1 in nts: for nt2 in nts: match_nts = nt1+nt2 key = context_order[i] + match_nts if curr_nts == match_nts: dict[key] = 1 else: dict[key] = 0 return dict
[docs]def get_three_nt_pos(dict, context_sequence, nts, context_order): for i in range(len(context_order) - 2): curr_nts = context_sequence[i:i+3] for nt1 in nts: for nt2 in nts: for nt3 in nts: match_nts = nt1+nt2+nt3 key = context_order[i] + match_nts if curr_nts == match_nts: dict[key] = 1 else: dict[key] = 0 return dict
[docs]def get_thermo(dict, guide_sequence, context_sequence): # Use Biopython to get thermo info. from context and guides dict['Tm, context'] = MeltingTemp.Tm_NN(context_sequence) third = len(guide_sequence)//3 dict['Tm, start'] = MeltingTemp.Tm_NN(guide_sequence[0:third]) dict['Tm, mid'] = MeltingTemp.Tm_NN(guide_sequence[third:2*third]) dict['Tm, end'] = MeltingTemp.Tm_NN(guide_sequence[2*third:]) return dict
[docs]def get_context_order(k): """ :param k: length of kmer :return: list of characters of each nt position """ context_order = [str(x + 1) for x in range(k)] return context_order
[docs]def get_guide_sequence(context, guide_start, guide_length): return context[(guide_start-1):(guide_start + guide_length -1)]
[docs]def get_physiochemical(curr_dict, guide, nts, physiochemical_data): nt_counts = np.array([guide.count(nt1+nt2) for nt1 in nts for nt2 in nts]) no_dinucs = physiochemical_data.drop(columns=['Dinucleotides']) numeric_physio = np.transpose(np.array(no_dinucs)) physio_sum = np.sum(nt_counts*numeric_physio, axis = 1) curr_dict = {**curr_dict, **dict(zip(no_dinucs.keys(), physio_sum))} return curr_dict
[docs]def get_zipper_pos(dict, context_sequence, nts, context_order): for i in range(len(context_order) - 2): curr_nts = context_sequence[i] + context_sequence[i+2] for nt1 in nts: for nt2 in nts: match_nts = nt1+nt2 key = context_order[i] + nt1 + 'N' + nt2 if curr_nts == match_nts: dict[key] = 1 else: dict[key] = 0 return dict
[docs]def get_zipper_counts(dict, guide, nts): for nt1 in nts: for nt2 in nts: regex = '(' + nt1 + '(A|C|T|G)' + nt2 + ')' nts_counts = len(re.findall(regex, guide)) nts_frac = nts_counts/(len(guide) - 2) dict[nt1 + 'N' + nt2] = nts_frac return dict
[docs]def get_rep_counts(dict, context, nts, length): for nt in nts: k_mer = nt*length nts_counts = context.count(k_mer) nts_frac = nts_counts/(len(context) - 2) dict[nt + '*' + str(length)] = nts_frac return dict
[docs]def get_double_zipper(dict, context_sequence, nts, context_order): for i in range(len(context_order) - 3): curr_nts = context_sequence[i] + context_sequence[i+3] for nt1 in nts: for nt2 in nts: match_nts = nt1+nt2 key = context_order[i] + nt1 + 'NN' + nt2 if curr_nts == match_nts: dict[key] = 1 else: dict[key] = 0 return dict
[docs]def featurize_guides(kmers, features = None, guide_start = 9, guide_length = 20): """Take guides and encodes for modeling :param kmers: vector of context sequences :param features: boolean dictionary of which feature types to inlcude: ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer', 'Pos. Ind. Zipper','Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'Pos. Dep. 3mer', 'Pos. Dep. Zipper', 'Pos. Ind. Rep.', 'GC content', 'Tm', 'Physio', 'Double Zipper'] :param pam_start: int :param pam_end: int :param guide_start: int :param guide_end: int :return: featurized matrix """ if features == None: # RS2 features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'GC content', 'Tm'] possible_feats = {'Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer', 'Pos. Ind. Zipper','Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'Pos. Dep. 3mer', 'Pos. Dep. Zipper', 'Pos. Ind. Rep.', 'GC content', 'Tm', 'Physio', 'Double Zipper'} if not set(features).issubset(possible_feats): diff = features - possible_feats assert ValueError(str(diff) + 'Are not currently supported as features') current_path = os.path.abspath(os.path.dirname(__file__)) physio_path = os.path.join(current_path, 'data/features/physiochem.csv.zip') physiochemical_data = pd.read_csv(physio_path) k = len(kmers[0]) context_order = get_context_order(k) nts = ['A', 'C', 'T', 'G'] feature_dict_list = [] for i in range(len(kmers)): curr_dict = {} context = kmers[i] guide_sequence = get_guide_sequence(context, guide_start, guide_length) if 'GC content' in features: curr_dict = get_frac_g_or_c(curr_dict, guide_sequence) if 'Pos. Ind. 1mer' in features: curr_dict = get_one_nt_counts(curr_dict, guide_sequence, nts) if 'Pos. Ind. 2mer' in features: curr_dict = get_two_nt_counts(curr_dict, guide_sequence, nts) if 'Pos. Ind. 3mer' in features: curr_dict = get_three_nt_counts(curr_dict, guide_sequence, nts) if 'Pos. Dep. 1mer' in features: curr_dict = get_one_nt_pos(curr_dict, context, nts, context_order) if 'Pos. Dep. 2mer' in features: curr_dict = get_two_nt_pos(curr_dict, context, nts, context_order) if 'Pos. Dep. 3mer' in features: curr_dict = get_three_nt_pos(curr_dict, context, nts, context_order) if 'Tm' in features: curr_dict = get_thermo(curr_dict, guide_sequence, context) if 'Physio' in features: curr_dict = get_physiochemical(curr_dict, guide_sequence, nts, physiochemical_data) if 'Pos. Dep. Zipper' in features: curr_dict = get_zipper_pos(curr_dict, context, nts, context_order) if 'Pos. Ind. Zipper' in features: curr_dict = get_zipper_counts(curr_dict, guide_sequence, nts) if 'Pos. Ind. Rep.' in features: curr_dict = get_rep_counts(curr_dict, context, nts, 4) if 'Double Zipper' in features: curr_dict = get_double_zipper(curr_dict, context, nts, context_order) feature_dict_list.append(curr_dict) feature_matrix = pd.DataFrame(feature_dict_list) return feature_matrix