Source code for sgrna_modeler.datasets

"""Training and testing datasets for modeling guide activity.

Includes class for generating new dataset objects, so new data can be easily modeled and tested

    :Example:

    >>> import pandas as pd
    >>> import sgrna_modeler.enzyme as en
    >>> new_data = pd.read_csv('new dataset')
    >>> new_dataset = Activity_Data(new_data, en.cas9, '30mer', 'activity', 'new data')
"""
import pandas as pd
from sgrna_modeler import enzymes as en
import os


[docs]def curr_path(): return os.path.dirname(__file__)
[docs]class ActivityData(object): """Store information about activity data :param data: data to model :type data: pandas dataframe :param enzyme: cas9 or cas12a :type enzyme: dict :param kmer_column: sequences to model :type kmer_column: str :param name: name of the dataset :type name: str :param group_column: column to include in prediction output :type group_column:str """ def __init__(self, data, enzyme, kmer_column, activity_column, name, group_column = ''): """Inits Activity data""" self.data = data self.enzyme = enzyme self.kmer_column = kmer_column self.activity_column = activity_column self.name = 'D_' + name self.group_column = group_column
[docs] def get_xy(self): """Gets modeling matrix (x) and output matrix (y) :return two series, x and y :rtype pandas series """ x = self.data[self.kmer_column] y = self.data[self.activity_column] return x, y
# SpCas9 Datasets
[docs]def load_doench_2016(): """Data from: Doench, John G., et al. "Optimized sgRNA design to maximize activity and minimize off-target effects of CRISPR-Cas9." \ Nature biotechnology 34.2 (2016): 184. :Example: >>> import sgrna_modeler.datasets as da >>> doench = da.load_doench_2016() >>> doench.data Unnamed: 0 30mer ... drug predictions 0 0 CAGAAAAAAAAACACTGCAACAAGAGGGTA ... nodrug 0.544412 1 1 TTTTAAAAAACCTACCGTAAACTCGGGTCA ... PLX_2uM 0.617512 2 2 TCAGAAAAAGCAGCGTCAGTGGATTGGCCC ... nodrug 0.476232 3 3 AATAAAAAATAGGATTCCCAGCTTTGGAAG ... PLX_2uM 0.459882 4 4 GATGAAAAATATGTAAACAGCATTTGGGAC ... PLX_2uM 0.290841 ... ... ... ... ... 5305 5305 GCACTTTGGTGTGGCTGACTGAGTGGGCCA ... PLX_2uM 0.586758 5306 5306 TTCTTTTGTAAGAACCCGCTGTGTTGGTTT ... PLX_2uM 0.492066 5307 5307 GCCCTTTGTCATCGTAGGAAGATATGGCTG ... AZD_200nM 0.479728 5308 5308 CAAATTTGTTCTTTAAATGGCTACAGGAGG ... AZD_200nM 0.478952 5309 5309 CAAATTTGTTCTTTAAATGGCTACAGGAGG ... PLX_2uM 0.478952 [5310 rows x 9 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Doench_2016.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas9, kmer_column='30mer', activity_column='score_drug_gene_rank', name='Doench_2016', group_column='Target gene') return data_class
[docs]def load_meyers_2017_train(): """Essential genes from GeckoV2 achilles screens: Meyers, Robin M., et al. "Computational correction of copy number effect improves specificity of CRISPR–Cas9 \ essentiality screens in cancer cells." Nature genetics 49.12 (2017): 1779-1784. Mean activity is averaged accross screens after Z-scoring by non-essentials :Example: >>> import sgrna_modeler.datasets as da >>> meyers_2017_train = da.load_meyers_2017_train() >>> meyers_2017_train.data Species Build Chromosome Number ... Percent Protein Notes mean_activity 0 human GRCh38 1.0 ... 24.87 NaN -0.230160 1 human GRCh38 1.0 ... 23.26 NaN 3.045755 2 human GRCh38 1.0 ... 18.60 NaN 1.307097 3 human GRCh38 1.0 ... 18.07 NaN -1.307698 4 human GRCh38 1.0 ... 13.95 NaN 1.278670 ... ... ... ... ... ... ... 7897 human GRCh38 NaN ... 11.78 NaN 1.959897 7898 human GRCh38 NaN ... 13.14 NaN -0.429659 7899 human GRCh38 NaN ... 16.01 NaN 1.187820 7900 human GRCh38 NaN ... 19.03 NaN 1.573194 7901 human GRCh38 NaN ... 39.62 NaN 2.044455 [7902 rows x 20 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Meyers_2017_Train.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas9, kmer_column='sgRNA context sequence', activity_column='mean_activity', name='Meyers_2017_Train', group_column='Gene Symbol') return data_class
[docs]def load_meyers_2017_test(): """Essential genes from GeckoV2 achilles screens: Meyers, Robin M., et al. "Computational correction of copy number effect improves specificity of CRISPR–Cas9 \ essentiality screens in cancer cells." Nature genetics 49.12 (2017): 1779-1784. Mean activity is averaged accross screens after Z-scoring by non-essentials :Example: >>> import sgrna_modeler.datasets as da >>> meyers_2017_test = da.load_meyers_2017_test() >>> meyers_2017_test.data Species Build Chromosome Number ... Percent Protein Notes mean_activity 0 human GRCh38 1.0 ... 22.12 NaN 3.325952 1 human GRCh38 1.0 ... 2.30 NaN 2.645421 2 human GRCh38 1.0 ... 56.87 NaN 2.040191 3 human GRCh38 1.0 ... 40.38 NaN 3.356250 4 human GRCh38 1.0 ... 40.11 NaN 1.602670 .. ... ... ... ... ... ... ... 667 human GRCh38 NaN ... 8.56 NaN 1.240547 668 human GRCh38 NaN ... 8.95 NaN 1.078080 669 human GRCh38 NaN ... 30.93 NaN -0.364154 670 human GRCh38 NaN ... 34.24 NaN 2.605412 671 human GRCh38 NaN ... 41.25 NaN 2.620977 [672 rows x 20 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Meyers_2017_Test.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas9, kmer_column='sgRNA context sequence', activity_column='mean_activity', name='Meyers_2017_Test', group_column='Gene Symbol') return data_class
[docs]def load_kim_2019_train(): """ Indel frequencies from: Kim, Hui Kwon, et al. "SpCas9 activity prediction by DeepSpCas9, a deep learning–based model with high \ generalization performance." Science advances 5.11 (2019): eaax9249. :Example: >>> import sgrna_modeler.datasets as da >>> kim_2019_train = da.load_kim_2019_train() >>> kim_2019_train.data Barcode ... Background subtracted indel (%) 0 TTTGACACACACGCACTAG ... 24.287805 1 TTTGACACACACTCGTATG ... 69.500438 2 TTTGACACACACTCTCGTC ... 25.994760 3 TTTGACACACACTCTGCTG ... 57.964590 4 TTTGACACACACTGCATAT ... 39.355020 ... ... ... 12827 TTTGTGTGTCTCGTATCAC ... 40.853256 12828 TTTGTGTGTCTCTACACGC ... 11.480880 12829 TTTGTGTGTCTCTCACGTA ... 63.861469 12830 TTTGTGTGTCTCTCTAGTC ... 51.650932 12831 TTTGTGTGTCTCTCTCAGA ... 40.019124 [12832 rows x 9 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Kim_2019_Train.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas9, kmer_column='Target context sequence (4+20+3+3)', activity_column='Background subtracted indel (%)', name='Kim_2019_Train') return data_class
[docs]def load_kim_2019_test(): """ Indel frequencies from: Kim, Hui Kwon, et al. "SpCas9 activity prediction by DeepSpCas9, a deep learning–based model with high \ generalization performance." Science advances 5.11 (2019): eaax9249. :Example: >>> import sgrna_modeler.datasets as da >>> kim_2019_test = da.load_kim_2019_test() >>> kim_2019_test.data Target context sequence (4+20+3+3) ... Background subtracted indel frequencies (average, %) 0 AAAACTGTGAGTGTGGGACCTGCTGGGGGC ... 44.125755 1 AAACACAACCAATCCGAGGCCTTCTGGGTC ... 12.163189 2 AAACTGTGAGTGTGGGACCTGCTGGGGGCT ... 68.901263 3 AAACTTGAGAGCTTTCATAAAGCTTGGCAA ... 13.135690 4 AAAGAAGCGGACTTTAAAGTTCGAGGGAGA ... 48.355156 .. ... ... ... 537 TTTGCAGCGCGTTGACTTATTCATGGGTCA ... 36.249050 538 TTTGCTAGGAATATTGAAGGGGGCAGGGGA ... 38.622947 539 TTTGTGGTGGTTGCTATGGTAATCCGGCAC ... 12.246218 540 TTTTTACAATTCTGTGAGTTAGAGTGGGCA ... 0.385915 541 TTTTTGAGGTGCACTAATAGAGGGTGGAGT ... 41.100730 [542 rows x 5 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Kim_2019_Test.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas9, kmer_column='Target context sequence (4+20+3+3)', activity_column='Background subtracted indel frequencies\r(average, %)', name='Kim_2019_Test') return data_class
# AsCas12a datasets
[docs]def load_kim_2018_train(): """ Indel frequencies from: Kim, Hui Kwon, et al. "Deep learning improves prediction of CRISPR–Cpf1 guide RNA activity." \ Nature biotechnology 36.3 (2018): 239. :Example: >>> import sgrna_modeler.datasets as da >>> kim_2018_train = da.load_kim_2018_train() >>> kim_2018_train.data 50 bp synthetic target and target context sequence 10 bp + PAM + 23 bp protospacer + 17 bp) ... Indel frequency 0 TGCGCGAGCGTTTAAAAAACATCGAACGCATCTGCTGCCTAGCTTG... ... 14.711302 1 CTAAAGAAACTTTAAAAATCTTTTCTGCCAGATCTCCAGAAGCTTG... ... 0.238095 2 TTGCCATTGTTTTAAAACAGGTTCTGTACTTGATCTCTCCAGCTTG... ... 88.079746 3 TTGCACATATTTTAAAACTGAGTTCAAAGACCACTCTTCCAGCTTG... ... 75.392670 4 TAGACTAATGTTTAAAAGCAAGTGCAAGTCTTTGGAATCTAGCTTG... ... 63.320080 ... ... ... 14995 TCCATCTTCATTTTTTTTGTAGAGTAGGGCTTTATTTCCAAGCTTG... ... -0.467290 14996 CCTTCTCTCCTTTTTTTTTCAAGATCTGATTCTTCTTGCAAGCTTG... ... 0.000000 14997 CCAGGACTTGTTTTTTTTTCAATCTGTTCATCTTGGACCAAGCTTG... ... 0.239006 14998 ACCATCATAATTTTTTTTTGCAACATAGCCATTTCTTTTTAGCTTG... ... -0.272826 14999 GAGCGCTTCTTTTTTTTTTTCGGGGTCTCGTTGCTGGGCGAGCTTG... ... -2.766164 [15000 rows x 10 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Kim_2018_Train.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas12a, kmer_column='Context Sequence', activity_column='Indel frequency', name='Kim_2018_Train') return data_class
[docs]def load_kim_2018_test(): """ Indel frequencies from: Kim, Hui Kwon, et al. "Deep learning improves prediction of CRISPR–Cpf1 guide RNA activity." \ Nature biotechnology 36.3 (2018): 239. :Example: >>> import sgrna_modeler.datasets as da >>> kim_2018_test = da.load_kim_2018_test() >>> kim_2018_test.data 50 bp synthetic target and target context sequence ... Indel frequency 0 GCAATTTGGTTTTAAAACAGAATATACAGTCTAAAAAACCAGCTTG... ... 71.580711 1 CTGATGGCCATTTAAACAACTCTTTGAGCTCTCCAGTTCAAGCTTG... ... 19.672949 2 TTTAGATGATTTTAAACCAGCATCTATAGACACTTCCTGTAGCTTG... ... 75.641026 3 ACATTTGGACTTTAAACCCAAACTACTTGTCCAACGGTACAGCTTG... ... 46.920217 4 CTCTACCAGGTTTAAACGCTTCCACACTTGTGTCAGTAATAGCTTG... ... 54.981550 ... ... ... 2958 AGTTTGGAATTTTTTTTACACTGATCCTCAGCACATCTCAAGCTTG... ... -0.378500 2959 CAGGCTTTCTTTTTTTTCCTTTCCTAGTTGGTTCATTCCCAGCTTG... ... 0.189438 2960 AACAGTGGCTTTTTTTTGCTGCTAGCACATATGTATGGGTAGCTTG... ... -2.857143 2961 CAGCCTCATGTTTTTTTGGGAACCAATCGATAATCACATTAGCTTG... ... 11.275673 2962 TTGGATTGTGTTTTTTTTTAGCACCTTATTTTCCTTGAAGAGCTTG... ... -1.675978 [2963 rows x 10 columns] """ data = pd.read_csv(os.path.join(curr_path(), 'data/datasets/Kim_2018_Test.csv.zip')) data_class = ActivityData(data=data, enzyme=en.cas12a, kmer_column='Context Sequence', activity_column='Indel frequency', name='Kim_2018_Test') return data_class
# enAsCas12a datasets