Source code for sgrna_modeler.models

from sgrna_modeler import features as fe
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from tensorflow import keras as k
import pandas as pd
import os
from joblib import load
import sgrna_modeler.enzymes as en

[docs]def curr_path(): return os.path.dirname(__file__)
[docs]def get_deepcpf1_weights(): path = os.path.join(curr_path(), 'data/saved_models/Seq_deepCpf1_weights_tf.h5') return path
[docs]def get_enpam_gb(): path = os.path.join(curr_path(), 'data/saved_models/enPAM_GB.joblib') return path
[docs]def build_kim2018(input_shape=(34, 4)): """ Build a convolutional neural network From: Kim, Hui Kwon, et al. "Deep learning improves prediction of CRISPR–Cpf1 guide RNA activity." \ Nature biotechnology 36.3 (2018): 239. :param input_shape: guide length by nts (4) :type input_shape: tuple :return: CNN architecture :rtype: keras Model object """ """Build a Convolutional neural network model from Kim 2018 Parmeters --------- input_shape: tuple, optional (default (34, 4) shape of the first layer of the model Returns ------- model: keras model object """ Input_SEQ = k.layers.Input(shape=input_shape) C1 = k.layers.Convolution1D(80, 5, activation='relu')(Input_SEQ) P1 = k.layers.AveragePooling1D(2)(C1) F = k.layers.Flatten()(P1) DO1 = k.layers.Dropout(0.3)(F) D1 = k.layers.Dense(80, activation='relu')(DO1) DO2 = k.layers.Dropout(0.3)(D1) D2 = k.layers.Dense(40, activation='relu')(DO2) DO3 = k.layers.Dropout(0.3)(D2) D3 = k.layers.Dense(40, activation='relu')(DO3) DO4 = k.layers.Dropout(0.3)(D3) Output = k.layers.Dense(1, activation='linear')(DO4) model = k.models.Model(inputs = Input_SEQ, outputs = Output) return model
[docs]class KerasSgrnaModel(object): """This class is for creating, training, and predicting guide activity with a Keras model :param random_state: set random state in train/test split for reproducibility :type random_stat: int :param val_frac: amount of data to use for early stopping :type val_frac: float :param base_arc: base architecture to build neural network, defaults to build_kim2018 :type base_arc: function, which takes an input shape and returns a keras model :Example: >>> from sgrna_modeler import datasets as da >>> from sgrna_modeler import models as sg >>> train_data = da.load_kim_2018_train() >>> train_model = sg.KerasSgrnaModel() >>> train_model.fit(train_data) >>> test_data = da.load_kim_2018_test() >>> test_predictions = train_model.predict(test_data) """ def __init__(self, random_state = 7, val_frac = 0.1, base_arc = None): """Constructor """ self.base_name = 'Keras_CNN' self.val_frac = val_frac self.random_state = random_state if base_arc is None: self.base_arc = build_kim2018 else: self.base_arc = base_arc self.train_dataset = None self.enzyme = None self.model = None self.model_history = None self.train_name = None
[docs] def load_weights(self, weights, enzyme, name): """Load previously trained weights :param enzyme: cas9 or cas12a :type enyme: dict :param weights: filepath to weights :type weights: str :param name: name of the model :type name:str """ if weights is None: weights = get_deepcpf1_weights() self.train_name = 'Seq-DeepCpf1' self.enzyme = en.cas12a else: self.train_name = name self.enzyme = enzyme model = self.base_arc(input_shape = (self.enzyme['context_length'],4)) model.load_weights(weights) self.model = model return self
[docs] def fit(self, train_dataset): """ Fit a model to the training data :param train_dataset: training data :type train_dataset: :class:`sgrna_modeler.datasets.ActivityData` :return: self """ self.train_dataset = train_dataset self.train_name = train_dataset.name self.enzyme = train_dataset.enzyme train_val_x, y = train_dataset.get_xy() encoded_train_val_x = fe.encode_seqs(train_val_x) train_x, val_x, train_y, val_y = train_test_split(encoded_train_val_x, y, test_size=self.val_frac, random_state=self.random_state) model = self.base_arc(input_shape = (self.enzyme['context_length'],4)) model.compile(optimizer='RMSprop',loss='mse',metrics=['mae']) self.model_history = model.fit(train_x, train_y, epochs = 200, validation_data = (val_x, val_y), callbacks = [k.callbacks.EarlyStopping(patience=20,restore_best_weights=True), k.callbacks.History()], verbose = 0) self.model = model return self
[docs] def predict(self, test_dataset): """Predict activity of test data :param test_dataset: testing data :type test_dataset: :class:`sgrna_modeler.datasets.ActivityData` :return: dataframe of predictions and other meta information :rtype: pandas dataframe """ x, y = test_dataset.get_xy() encoded_x = fe.encode_seqs(x) predictions = self.model.predict(encoded_x) out_data = pd.DataFrame({'kmer': x, 'y': y}) if test_dataset.group_column: out_data['group'] = test_dataset.data[test_dataset.group_column] else: out_data['group'] = '' out_data['prediction'] = predictions out_data['model'] = self.base_name out_data['training_data'] = self.train_name out_data['test_data'] = test_dataset.name return out_data
[docs] def predict_seqs(self, seqs): """ Predict from sequences :param seqs: sequences to predict :return: numeric vector of predcitions """ featurized_x = fe.encode_seqs(seqs) predictions = self.model.predict(featurized_x).flatten() return predictions
[docs]class SklearnSgrnaModel(object): """scikit-learn gradient boosting for modeling sgRNA activity :param random_state: set random state in train/test split for reproducibility :type random_state: int :param val_frac: amount of data to use for early stopping :type val_frac: float :param model: base model :type model: sklearn GradientBoostingRegressor :param features: features to model :type features: list :Example: >>> from sgrna_modeler import datasets as da >>> from sgrna_modeler import models as sg >>> train_model = sg.SklearnSgrnaModel() >>> rs2_data = da.load_doench_2016() >>> train_model.fit(rs2_data) """ def __init__(self, random_state = 7, val_frac = 0.1, model = None, features = None): """Constructor """ self.base_name = 'Sklearn_GB' self.val_frac = val_frac self.random_state = random_state if model is None: # Gradient boosted model self.model = ensemble.GradientBoostingRegressor(n_iter_no_change=20, validation_fraction = self.val_frac, random_state=self.random_state) else: self.model = model if features is None: # Default features for RuleSet2 self.features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'GC content', 'Tm'] else: self.features = features self.enzyme = None self.train_dataset = None self.train_name = None
[docs] def load_model(self, model, enzyme, name): """Load previously trained model :param enzyme: cas9 or cas12a :type enyme: dict :param model: filepath to trained model :type model: str (*.joblib) :param name: name of the model :type name:str """ self.enzyme = enzyme self.model = load(model) self.train_name = name return self
[docs] def fit(self, train_dataset): """ Fit a model to the training data :param train_dataset: training data :type train_dataset: :class:`sgrna_modeler.datasets.ActivityData` :return: self """ self.train_name = train_dataset.name self.enzyme = train_dataset.enzyme train_val_x, y = train_dataset.get_xy() featurized_train_val_x = fe.featurize_guides(train_val_x, features=self.features, guide_start = self.enzyme['guide_start'], guide_length = self.enzyme['guide_length']) self.model.fit(featurized_train_val_x, y) return self
[docs] def predict(self, test_dataset): """Predict activity of test data :param test_dataset: testing data :type test_dataset: :class:`sgrna_modeler.datasets.ActivityData` :return: dataframe of predictions and other meta information :rtype: pandas dataframe """ x, y = test_dataset.get_xy() featurized_x = fe.featurize_guides(x, features=self.features, guide_start=test_dataset.enzyme['guide_start'], guide_length=test_dataset.enzyme['guide_length']) predictions = self.model.predict(featurized_x) out_data = pd.DataFrame({'kmer': x, 'y': y}) if test_dataset.group_column: out_data['group'] = test_dataset.data[test_dataset.group_column] else: out_data['group'] = '' out_data['prediction'] = predictions out_data['model'] = self.base_name out_data['training_data'] = self.train_name out_data['test_data'] = test_dataset.name return out_data
[docs] def predict_seqs(self, seqs): """ Predict from sequences :param seqs: sequences to predict :return: numeric vector of predcitions """ featurized_x = fe.featurize_guides(seqs, features=self.features, guide_start=self.enzyme['guide_start'], guide_length=self.enzyme['guide_length']) predictions = self.model.predict(featurized_x) return predictions