├── .devcontainer.json ├── .github └── workflows │ └── main.yml ├── .gitignore ├── 00_seq.ipynb ├── 01_targetdata.ipynb ├── 02_targetfeat.ipynb ├── 03_predicttarg.ipynb ├── 04_predict.ipynb ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── RuleSet3.pkl ├── docker-compose.yml ├── docs ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── _config.yml ├── _data │ ├── sidebars │ │ └── home_sidebar.yml │ └── topnav.yml ├── feed.xml ├── images │ ├── output_18_0.png │ └── output_42_0.png ├── index.html ├── predict.html ├── predicttarg.html ├── seq.html ├── sidebar.json ├── sitemap.xml ├── targetdata.html └── targetfeat.html ├── index.ipynb ├── rs3 ├── RuleSet3.pkl ├── __init__.py ├── _nbdev.py ├── predict.py ├── predicttarg.py ├── seq.py ├── target_lite_model.pkl ├── target_model.pkl ├── targetdata.py └── targetfeat.py ├── settings.ini ├── setup.py ├── target_lite_model.pkl ├── target_model.pkl └── test_data ├── Aguirre2016_activity.csv ├── Behan2019_activity.csv ├── codon_map.csv ├── sgrna-designs.txt ├── sgrna-designs_BCL2L1_MCL1_EEF2.txt ├── sgrna-designs_BCL2L1_MCL1_EEF2_na.txt └── target_data ├── aa_seqs.pq ├── conservation.pq └── protein_domains.pq /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nbdev_template-codespaces", 3 | "dockerComposeFile": "docker-compose.yml", 4 | "service": "watcher", 5 | "settings": {"terminal.integrated.shell.linux": "/bin/bash"}, 6 | "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ], 7 | "forwardPorts": [4000, 8080], 8 | "appPort": [4000, 8080], 9 | "extensions": ["ms-python.python", 10 | "ms-azuretools.vscode-docker"], 11 | "runServices": ["notebook", "jekyll", "watcher"], 12 | "postStartCommand": "pip install -e ." 13 | } 14 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - uses: actions/setup-python@v1 9 | with: 10 | python-version: '3.7' 11 | architecture: 'x64' 12 | - name: Install the library 13 | run: | 14 | pip install nbdev jupyter 15 | pip install -e .[dev] 16 | - name: Read all notebooks 17 | run: | 18 | nbdev_read_nbs 19 | - name: Check if all notebooks are cleaned 20 | run: | 21 | echo "Check we are starting with clean git checkout" 22 | if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi 23 | echo "Trying to strip out notebooks" 24 | nbdev_clean_nbs 25 | echo "Check that strip out was unnecessary" 26 | git status -s # display the status to see which nbs need cleaning up 27 | if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi 28 | - name: Check if there is no diff library/notebooks 29 | run: | 30 | if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! 
Detected difference between the notebooks and the library"; false; fi 31 | - name: Run tests 32 | run: | 33 | nbdev_test_nbs --fname=index.ipynb --n_workers=1 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .gitattributes 3 | .last_checked 4 | .gitconfig 5 | *.bak 6 | *.log 7 | *~ 8 | ~* 9 | _tmp* 10 | tmp* 11 | tags 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | .vscode 116 | *.swp 117 | 118 | # osx generated files 119 | .DS_Store 120 | .DS_Store? 
121 | .Trashes 122 | ehthumbs.db 123 | Thumbs.db 124 | .idea 125 | 126 | # pytest 127 | .pytest_cache 128 | 129 | # tools/trust-doc-nbs 130 | docs_src/.last_checked 131 | 132 | # symlinks to fastai 133 | docs_src/fastai 134 | tools/fastai 135 | 136 | # link checker 137 | checklink/cookies.txt 138 | 139 | # .gitconfig is now autogenerated 140 | .gitconfig 141 | 142 | -------------------------------------------------------------------------------- /02_targetfeat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# default_exp targetfeat" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# targetfeat\n", 17 | "> Module to generate target site features" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# export\n", 27 | "import pandas as pd\n", 28 | "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n", 29 | "import warnings" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from rs3 import targetdata" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import multiprocessing\n", 48 | "max_n_jobs = multiprocessing.cpu_count()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# export\n", 58 | "def add_target_columns(design_df, transcript_id_col='Target Transcript',\n", 59 | " cut_pos_col='Target Cut Length',\n", 60 | " transcript_base_col='Transcript Base'):\n", 61 | " \"\"\"Add ['AA Index' and 'Transcript Base'] to design df\n", 62 | "\n", 63 | " :param design_df: DataFrame\n", 64 | " :return: DataFrame\n", 65 | " \"\"\"\n", 66 | " out_df = design_df.copy()\n", 67 | " out_df['AA Index'] = (out_df[cut_pos_col] - 1) // 3 + 1\n", 68 | " out_df[transcript_base_col] = out_df[transcript_id_col].str.split('.', expand=True)[0]\n", 69 | " return out_df" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n", 79 | "design_targ_df = add_target_columns(design_df)\n", 80 | "assert 'AA Index' in design_targ_df.columns" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Position Features\n", 88 | "\n", 89 | "The first feature class we consider is where the guide targets within the annotated transcript" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# export\n", 99 | "def get_position_features(sg_df, id_cols):\n", 100 | " \"\"\"Get features ['Target Cut %', 'sense']\n", 101 | "\n", 102 | " :param sg_df: DataFrame\n", 103 | " :param id_cols: list\n", 104 | " :return: DataFrame\n", 105 | " \"\"\"\n", 106 | " position_df = sg_df[id_cols + ['Target Cut %']].copy()\n", 107 | " position_df['sense'] = sg_df['Orientation'] == 'sense'\n", 108 | " return position_df" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Amino Acid Features\n", 116 | "\n", 117 | "We calculate a set of features from the 
amino acid sequence around the cutsite itself" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "aas = ['A', 'C', 'D', 'E', 'F',\n", 127 | " 'G', 'H', 'I', 'K', 'L',\n", 128 | " 'M', 'N', 'P', 'Q', 'R',\n", 129 | " 'S', 'T', 'V', 'W', 'Y', '*']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# export\n", 139 | "def get_one_aa_frac(feature_dict, aa_sequence, aas):\n", 140 | " \"\"\"Get fraction of single aa\n", 141 | "\n", 142 | " :param feature_dict: dict, feature dictionary\n", 143 | " :param aa_sequence: str, amino acid sequence\n", 144 | " :param aas: list, list of amino acids\n", 145 | " \"\"\"\n", 146 | " for aa in aas:\n", 147 | " aa_frac = aa_sequence.count(aa) / len(aa_sequence)\n", 148 | " feature_dict[aa] = aa_frac" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "one_aa_ft = {}\n", 158 | "get_one_aa_frac(one_aa_ft, 'ACDG*-', aas)\n", 159 | "assert one_aa_ft['A'] == 1/6\n", 160 | "assert one_aa_ft['Q'] == 0" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# export\n", 170 | "def get_aa_aromaticity(feature_dict, analyzed_seq):\n", 171 | " \"\"\"Get fraction of aromatic amino acids in a sequence.\n", 172 | "\n", 173 | " Phe (F) + Trp (W) + Tyr (Y)\n", 174 | "\n", 175 | " :param feature_dict:\n", 176 | " :param analyzed_seq: ProteinAnalysis object\n", 177 | " \"\"\"\n", 178 | " feature_dict['Aromaticity'] = analyzed_seq.aromaticity()\n", 179 | "\n", 180 | "\n", 181 | "def get_aa_hydrophobicity(feature_dict, analyzed_seq):\n", 182 | " \"\"\"Grand Average of Hydropathy\n", 183 | "\n", 184 | " The GRAVY value is calculated by adding the hydropathy value for each residue and dividing\n", 185 | " by the length of the sequence (Kyte and Doolittle; 1982). 
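For example, poly-leucine ('LLLL') has a GRAVY of 3.8, the Kyte-Doolittle hydropathy of leucine, while poly-arginine ('RRRR') scores -4.5.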
The larger the number, the more hydrophobic the\n", 186 | " sequence\n", 187 | "\n", 188 | " :param feature_dict: dict\n", 189 | " :param analyzed_seq: ProteinAnalysis object\n", 190 | " \"\"\"\n", 191 | " feature_dict['Hydrophobicity'] = analyzed_seq.gravy()\n", 192 | "\n", 193 | "\n", 194 | "def get_aa_ip(feature_dict, analyzed_seq):\n", 195 | " \"\"\"Get the Isoelectric Point of an amino acid sequence\n", 196 | "\n", 197 | " The pH at which the sequence carries no net charge\n", 198 | "\n", 199 | " :param feature_dict: dict\n", 200 | " :param analyzed_seq: ProteinAnalysis object\n", 201 | " \"\"\"\n", 202 | " feature_dict['Isoelectric Point'] = analyzed_seq.isoelectric_point()\n", 203 | "\n", 204 | "\n", 205 | "def get_aa_secondary_structure(feature_dict, analyzed_seq):\n", 206 | " \"\"\"Get the fraction of amino acids that tend to be in a helix, turn, or sheet\n", 207 | "\n", 208 | " :param feature_dict: dict\n", 209 | " :param analyzed_seq: ProteinAnalysis object\n", 210 | " \"\"\"\n", 211 | " feature_dict['Helix'], feature_dict['Turn'], feature_dict['Sheet'] = analyzed_seq.secondary_structure_fraction()\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "aa_biochemical_fts1 = {}\n", 221 | "get_aa_aromaticity(aa_biochemical_fts1, ProteinAnalysis('FWYA'))\n", 222 | "aa_biochemical_fts2 = {}\n", 223 | "get_aa_aromaticity(aa_biochemical_fts2, ProteinAnalysis('AAAA'))\n", 224 | "assert aa_biochemical_fts1['Aromaticity'] > aa_biochemical_fts2['Aromaticity']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# export\n", 234 | "def featurize_aa_seqs(aa_sequences, features=None):\n", 235 | " \"\"\"Get feature DataFrame for a list of amino acid sequences\n", 236 | "\n", 237 | " :param aa_sequences: list of str\n", 238 | " :param features: list or None\n", 239 | " :return: DataFrame\n", 240 | " \"\"\"\n", 241 | " if features is None:\n", 242 | " features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',\n", 243 | " 'Isoelectric Point', 'Secondary Structure']\n", 244 | " aas = ['A', 'C', 'D', 'E', 'F',\n", 245 | " 'G', 'H', 'I', 'K', 'L',\n", 246 | " 'M', 'N', 'P', 'Q', 'R',\n", 247 | " 'S', 'T', 'V', 'W', 'Y', '*']\n", 248 | " clean_aa_seqs = aa_sequences.str.replace('\*|-', '', regex=True)\n", 249 | " feature_dict_list = []\n", 250 | " for i, (aa_sequence, clean_sequence) in enumerate(zip(aa_sequences, clean_aa_seqs)):\n", 251 | " analyzed_seq = ProteinAnalysis(clean_sequence)\n", 252 | " feature_dict = {}\n", 253 | " if 'Pos. Ind. 
1mer' in features:\n", 254 | " get_one_aa_frac(feature_dict, aa_sequence, aas)\n", 255 | " if 'Hydrophobicity' in features:\n", 256 | " get_aa_hydrophobicity(feature_dict, analyzed_seq)\n", 257 | " if 'Aromaticity' in features:\n", 258 | " get_aa_aromaticity(feature_dict, analyzed_seq)\n", 259 | " if 'Isoelectric Point' in features:\n", 260 | " get_aa_ip(feature_dict, analyzed_seq)\n", 261 | " if 'Secondary Structure' in features:\n", 262 | " get_aa_secondary_structure(feature_dict, analyzed_seq)\n", 263 | " feature_dict_list.append(feature_dict)\n", 264 | " feature_matrix = pd.DataFrame(feature_dict_list)\n", 265 | " feature_matrix.index = aa_sequences\n", 266 | " return feature_matrix" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "ft_dict_df = featurize_aa_seqs(pd.Series(['ACDG*-', 'CDG*--', 'LLLLLL']))\n", 276 | "assert ft_dict_df.loc['LLLLLL', 'Hydrophobicity'] == ft_dict_df['Hydrophobicity'].max()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# export\n", 286 | "def extract_amino_acid_subsequence(sg_aas, width):\n", 287 | " \"\"\" Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index\n", 288 | "\n", 289 | " :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence\n", 290 | " :param width: int\n", 291 | " :return: DataFrame\n", 292 | " \"\"\"\n", 293 | " # Pad the sequences at the beginning and end, so our index doesn't go over\n", 294 | " l_padding = '-' * (width + 1) # can cut just before the CDS\n", 295 | " r_padding = '-' * width # can cut the stop codon\n", 296 | " # add stop codon at the end of the sequence\n", 297 | " sg_aas_subseq = sg_aas.copy()\n", 298 | " sg_aas_subseq['extended_seq'] = l_padding + sg_aas_subseq['seq'] + '*' + r_padding\n", 299 | " sg_aas_subseq['AA 0-Indexed'] = sg_aas_subseq['AA Index'] - 1\n", 300 | " sg_aas_subseq['AA 0-Indexed padded'] = sg_aas_subseq['AA 0-Indexed'] + len(l_padding)\n", 301 | " sg_aas_subseq['seq_start'] = (sg_aas_subseq['AA 0-Indexed padded'] - width).astype(int)\n", 302 | " sg_aas_subseq['seq_end'] = (sg_aas_subseq['AA 0-Indexed padded'] + width).astype(int)\n", 303 | " sg_aas_subseq['AA Subsequence'] = sg_aas_subseq.apply(lambda row: row['extended_seq'][row['seq_start']:(row['seq_end'] + 1)],\n", 304 | " axis=1)\n", 305 | " return sg_aas_subseq\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "small_aa_seq_df = pd.DataFrame({'AA Index': [1, 5, 9],\n", 315 | " 'seq': ['MAVLKYSLW']*3})\n", 316 | "small_aa_subseq_df = extract_amino_acid_subsequence(small_aa_seq_df, 2)\n", 317 | "actual_subseqs = small_aa_subseq_df['AA Subsequence']\n", 318 | "expected_subseqs = ['--MAV', 'VLKYS', 'SLW*-']\n", 319 | "assert len(actual_subseqs) == len(expected_subseqs)\n", 320 | "assert all([a == b for a, b in zip(actual_subseqs, expected_subseqs)])" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# export\n", 330 | "def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols,\n", 331 | " transcript_base_col='Transcript Base',\n", 332 | " target_transcript_col='Target Transcript',\n", 333 | " aa_index_col='AA Index'):\n", 334 | " \"\"\"Get the amino acid subsequences for a design 
dataframe\n", 335 | "\n", 336 | " :param sg_designs: DataFrame\n", 337 | " :param aa_seq_df: DataFrame, Transcript Base and (AA) seq\n", 338 | " :param width: int, length on each side of the cut site\n", 339 | " :param transcript_base_col: str\n", 340 | " :param target_transcript_col: str\n", 341 | " :param aa_index_col: str\n", 342 | " :return: DataFrame\n", 343 | " \"\"\"\n", 344 | " sg_aas = (aa_seq_df.merge(sg_designs[list(set(id_cols +\n", 345 | " [target_transcript_col, transcript_base_col, aa_index_col]))],\n", 346 | " how='inner',\n", 347 | " on=[target_transcript_col, transcript_base_col]))\n", 348 | " sg_aas_subseq = extract_amino_acid_subsequence(sg_aas, width)\n", 349 | " return sg_aas_subseq" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "Getting amino acid sequences\n" 362 | ] 363 | }, 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.19s/it]\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_targ_df, n_jobs=2)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "aa_subseq_df = get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n", 383 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'])\n", 384 | "assert (aa_subseq_df['AA Subsequence'].str.len() == 33).all()\n", 385 | "assert aa_subseq_df.shape[0] == design_targ_df.shape[0]" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "codon_map_df = pd.read_csv('test_data/codon_map.csv')\n", 395 | "\n", 396 | "def get_rev_comp(sgrna):\n", 397 | " \"\"\"Get reverse compliment of a guide\"\"\"\n", 398 | " nt_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}\n", 399 | " rev_comp = ''\n", 400 | " for nt in sgrna:\n", 401 | " rev_comp += nt_map[nt]\n", 402 | " rev_comp = rev_comp[::-1]\n", 403 | " return rev_comp\n", 404 | "\n", 405 | "codon_map = pd.Series(codon_map_df['Amino Acid'].values, index=codon_map_df['Codon']).to_dict()\n", 406 | "row = aa_subseq_df.sample(1, random_state=1).iloc[0, :]\n", 407 | "subseq = row['AA Subsequence']\n", 408 | "context = row['sgRNA Context Sequence']\n", 409 | "rc_context = get_rev_comp(context)\n", 410 | "translations = dict()\n", 411 | "rc_translations = dict()\n", 412 | "for i in [0, 1, 2]:\n", 413 | " translations[i] = ''.join([codon_map[context[j:j+3]] for j in range(i, len(context), 3)\n", 414 | " if (j + 3) <= len(context)])\n", 415 | " rc_translations[i] = ''.join([codon_map[rc_context[j:j+3]] for j in range(i, len(rc_context), 3)\n", 416 | " if (j + 3) <= len(rc_context)])\n", 417 | "assert ((translations[0] in subseq) or (translations[1] in subseq) or (translations[2] in subseq) or\n", 418 | " (rc_translations[0] in subseq) or (rc_translations[1] in subseq) or (rc_translations[2] in subseq))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# export\n", 428 | "def get_amino_acid_features(aa_subseq_df, features, id_cols):\n", 429 | " \"\"\"Featurize amino acid sequences\n", 430 | "\n", 431 | " :param 
aa_subseq_df: DataFrame\n", 432 | " :param features: list\n", 433 | " :param id_cols: list\n", 434 | " :return: DataFrame\n", 435 | " \"\"\"\n", 436 | "\n", 437 | " # Zero-indexed for python\n", 438 | " # filter out sequences without the canonical amino acids\n", 439 | " aa_set = set('ARNDCQEGHILKMFPSTWYV*-')\n", 440 | " filtered_sg_aas = (aa_subseq_df[aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= aa_set)]\n", 441 | " .reset_index(drop=True))\n", 442 | " filtered_diff = (aa_subseq_df.shape[0] - filtered_sg_aas.shape[0])\n", 443 | " if filtered_diff > 0:\n", 444 | " warnings.warn('Ignored ' + str(filtered_diff) + ' amino acid sequences with non-canonical amino acids')\n", 445 | " aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features)\n", 446 | " aa_features_annot = pd.concat([filtered_sg_aas[id_cols + ['AA Subsequence']]\n", 447 | " .reset_index(drop=True),\n", 448 | " aa_features.reset_index(drop=True)], axis=1)\n", 449 | " return aa_features_annot\n" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "aa_features = get_amino_acid_features(aa_subseq_df=aa_subseq_df,\n", 459 | " features=['Pos. Ind. 1mer',\n", 460 | " 'Hydrophobicity', 'Aromaticity',\n", 461 | " 'Isoelectric Point', 'Secondary Structure'],\n", 462 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 463 | " 'Target Transcript', 'Orientation'])\n", 464 | "assert aa_features['L'].idxmax() == aa_features['Hydrophobicity'].idxmax()" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Protein Domain Features" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "#export\n", 481 | "def get_protein_domain_features(sg_design_df, protein_domains, id_cols,\n", 482 | " sources=None,\n", 483 | " transcript_base_col='Transcript Base',\n", 484 | " aa_index_col='AA Index',\n", 485 | " domain_type_col='type',\n", 486 | " domain_start_col='start',\n", 487 | " domain_end_col='end'):\n", 488 | " \"\"\"Get binary dataframe of protein domains\n", 489 | "\n", 490 | " :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col]\n", 491 | " :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col]\n", 492 | " :param id_cols: list\n", 493 | " :param sources: list. 
list of database types to include\n", 494 | " :param transcript_base_col: str\n", 495 | " :param aa_index_col: str\n", 496 | " :param domain_type_col: str\n", 497 | " :param domain_start_col: str\n", 498 | " :param domain_end_col: str\n", 499 | " :return: DataFrame, with binary features for protein domains\n", 500 | " \"\"\"\n", 501 | " if sources is None:\n", 502 | " sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n", 503 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n", 504 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'] # exclude sifts\n", 505 | " protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]\n", 506 | " clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()\n", 507 | " designs_domains = clean_designs.merge(protein_domains,\n", 508 | " how='inner', on=transcript_base_col)\n", 509 | " # Note - not every sgRNA will be present in the feature df\n", 510 | " filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],\n", 511 | " designs_domains[domain_end_col])]\n", 512 | " .copy())\n", 513 | " filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()\n", 514 | " filtered_domains['present'] = 1\n", 515 | " domain_feature_df = (filtered_domains.pivot_table(values='present',\n", 516 | " index=id_cols,\n", 517 | " columns='type', fill_value=0)\n", 518 | " .reset_index())\n", 519 | " # Ensure all domain columns are present for testing\n", 520 | " full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int) # empty\n", 521 | " domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)\n", 522 | " domain_feature_df[sources] = domain_feature_df[sources].astype(int)\n", 523 | " return domain_feature_df" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "name": "stdout", 533 | "output_type": "stream", 534 | "text": [ 535 | "Getting protein domains\n" 536 | ] 537 | }, 538 | { 539 | "name": "stderr", 540 | "output_type": "stream", 541 | "text": [ 542 | "100%|█████████████████████████████████████████| 200/200 [00:49<00:00, 4.02it/s]\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n", 548 | "protein_domain_feature_df = get_protein_domain_features(design_targ_df, domain_df, sources=None,\n", 549 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 550 | " 'AA Index', 'Target Transcript', 'Orientation'])" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "assert protein_domain_feature_df.loc[protein_domain_feature_df['sgRNA Context Sequence'] == 'AAAAGAGCCATGAATCTAAACATCAGGAAT',\n", 560 | " ['PANTHER', 'ncoils', 'Seg', 'MobiDBLite']].sum(axis=1).values[0] == 4" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "## Conservation Features" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "# export\n", 577 | "def get_conservation_ranges(cut_pos, small_width, large_width):\n", 578 | " small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)\n", 579 | " large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)\n", 
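"    # e.g. cut_pos=100: small_width=2 -> positions 99..102 (4 nt, averaged into 'cons_4'),\n",
"    # large_width=16 -> positions 85..116 (32 nt, averaged into 'cons_32')\n",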
580 | " return small_range, large_range\n", 581 | "\n", 582 | "\n", 583 | "def get_conservation_features(sg_designs, conservation_df, conservation_column,\n", 584 | " small_width, large_width, id_cols):\n", 585 | " \"\"\"Get conservation features\n", 586 | "\n", 587 | " :param sg_designs: DataFrame\n", 588 | " :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position\n", 589 | " :param conservation_column: str, name of column to calculate scores with\n", 590 | " :param small_width: int, small window length to average scores in one direction\n", 591 | " :param large_width: int, large window length to average scores in the one direction\n", 592 | " :return: DataFrame of conservation features\n", 593 | " \"\"\"\n", 594 | " sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()\n", 595 | " sg_designs_width['target position small'], sg_designs_width['target position large'] = \\\n", 596 | " zip(*sg_designs_width['Target Cut Length']\n", 597 | " .apply(get_conservation_ranges, small_width=small_width,\n", 598 | " large_width=large_width))\n", 599 | " small_width_conservation = (sg_designs_width.drop('target position large', axis=1)\n", 600 | " .rename({'target position small': 'target position'}, axis=1)\n", 601 | " .explode('target position')\n", 602 | " .merge(conservation_df, how='inner',\n", 603 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n", 604 | " .groupby(id_cols)\n", 605 | " .agg(cons=(conservation_column, 'mean'))\n", 606 | " .rename({'cons': 'cons_' + str(small_width * 2)}, axis=1)\n", 607 | " .reset_index())\n", 608 | " large_width_conservation = (sg_designs_width.drop('target position small', axis=1)\n", 609 | " .rename({'target position large': 'target position'}, axis=1)\n", 610 | " .explode('target position')\n", 611 | " .merge(conservation_df, how='inner',\n", 612 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n", 613 | " .groupby(id_cols)\n", 614 | " .agg(cons=(conservation_column, 'mean'))\n", 615 | " .rename({'cons': 'cons_' + str(large_width * 2)}, axis=1)\n", 616 | " .reset_index())\n", 617 | " cons_feature_df = small_width_conservation.merge(large_width_conservation, how='outer',\n", 618 | " on=id_cols)\n", 619 | " return cons_feature_df" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "Getting conservation\n" 632 | ] 633 | }, 634 | { 635 | "name": "stderr", 636 | "output_type": "stream", 637 | "text": [ 638 | "100%|█████████████████████████████████████████| 200/200 [06:28<00:00, 1.94s/it]\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "conservation_df = targetdata.build_conservation_df(design_targ_df, n_jobs=max_n_jobs)\n", 644 | "conservation_features = get_conservation_features(design_targ_df, conservation_df,\n", 645 | " small_width=2, large_width=16,\n", 646 | " conservation_column='ranked_conservation',\n", 647 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 648 | " 'Target Transcript', 'Orientation'])\n", 649 | "merged_features = protein_domain_feature_df.merge(conservation_features, how='inner', on=['sgRNA Context Sequence',\n", 650 | " 'Target Cut Length',\n", 651 | " 'Target Transcript',\n", 652 | " 'Orientation'])\n", 653 | "smart_avg_cons = merged_features.loc[merged_features['Smart'].astype(bool), 'cons_32'].mean()\n", 654 | "non_smart_avg_cons = 
merged_features.loc[~merged_features['Smart'].astype(bool), 'cons_32'].mean()\n", 655 | "assert smart_avg_cons > non_smart_avg_cons" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "## Combining target features\n", 663 | "\n", 664 | "We'll combine, the position, amino acid and domain feature matrices into a single target feature matrix" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [ 673 | "# export\n", 674 | "def merge_feature_dfs(design_df,\n", 675 | " aa_subseq_df, aa_features=None,\n", 676 | " domain_df=None,\n", 677 | " conservation_df=None,\n", 678 | " id_cols=None):\n", 679 | " if id_cols is None:\n", 680 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length',\n", 681 | " 'Target Transcript', 'Orientation']\n", 682 | " if aa_features is None:\n", 683 | " aa_features = ['Pos. Ind. 1mer',\n", 684 | " 'Hydrophobicity', 'Aromaticity',\n", 685 | " 'Isoelectric Point', 'Secondary Structure']\n", 686 | " if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]:\n", 687 | " raise ValueError('id_cols must uniquely identify rows of the design dataframe')\n", 688 | " feature_df_dict = dict()\n", 689 | " feature_list = list()\n", 690 | " position_feature_df = get_position_features(design_df, id_cols=id_cols)\n", 691 | " feature_df_dict['position'] = position_feature_df\n", 692 | " feature_list.extend(['Target Cut %', 'sense'])\n", 693 | " if domain_df is not None:\n", 694 | " feature_df_dict['domain'] = domain_df\n", 695 | " feature_list.extend(['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n", 696 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n", 697 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'])\n", 698 | " if conservation_df is not None:\n", 699 | " feature_df_dict['conservation'] = conservation_df\n", 700 | " # hardcoded\n", 701 | " feature_list.extend(['cons_4', 'cons_32'])\n", 702 | " aa_feature_df = get_amino_acid_features(aa_subseq_df, aa_features, id_cols)\n", 703 | " feature_list.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',\n", 704 | " 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*',\n", 705 | " 'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn',\n", 706 | " 'Sheet'])\n", 707 | " feature_df_dict['aa'] = aa_feature_df\n", 708 | " feature_df = design_df[id_cols]\n", 709 | " for key, df in feature_df_dict.items():\n", 710 | " feature_df = pd.merge(feature_df, df, how='left', on=id_cols)\n", 711 | " return feature_df, feature_list\n" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [ 720 | "feature_df, feature_list = merge_feature_dfs(design_df=design_df,\n", 721 | " aa_subseq_df=aa_subseq_df,\n", 722 | " domain_df=protein_domain_feature_df,\n", 723 | " conservation_df=conservation_features)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "assert feature_df[feature_list].shape[1] == len(feature_list)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [] 741 | } 742 | ], 743 | "metadata": { 744 | "kernelspec": { 745 | "display_name": "rs3_v2", 746 | "language": "python", 747 | "name": "rs3_v2" 748 | } 749 | }, 750 | "nbformat": 4, 751 | 
"nbformat_minor": 4 752 | } 753 | -------------------------------------------------------------------------------- /03_predicttarg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# default_exp predicttarg" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# predicttarg\n", 17 | "\n", 18 | "> Rule set 3 target-site predictions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# export\n", 28 | "from rs3 import targetfeat\n", 29 | "import joblib\n", 30 | "import os" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import lightgbm\n", 40 | "import pandas as pd\n", 41 | "from rs3 import targetdata\n", 42 | "from scipy import stats\n", 43 | "import numpy as np" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'\n", 53 | "import multiprocessing\n", 54 | "max_n_jobs = multiprocessing.cpu_count()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# export\n", 64 | "def load_target_model(lite=False):\n", 65 | " \"\"\"Load rule set 3 target model\"\"\"\n", 66 | " if lite:\n", 67 | " model_name = 'target_lite_model.pkl'\n", 68 | " else:\n", 69 | " model_name = 'target_model.pkl'\n", 70 | " model = joblib.load(os.path.join(os.path.dirname(__file__), model_name))\n", 71 | " return model" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": [ 83 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 84 | " warnings.warn(\n", 85 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 86 | " warnings.warn(\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# export\n", 101 | "def predict_target(design_df, aa_subseq_df, domain_feature_df=None,\n", 102 | " conservation_feature_df=None, id_cols=None):\n", 103 | " \"\"\"Make predictions using the Rule Set 3 target model. 
Note that if the protein_domain_df\n", 104 | " or conservation_df are not supplied, then the lite model will be used, otherwise the full model is used.\n", 105 | "\n", 106 | " :param design_df: DataFrame\n", 107 | " :param aa_subseq_df: DataFrame\n", 108 | " :param domain_feature_df: DataFrame\n", 109 | " :param id_cols: list or str\n", 110 | " :return: list\n", 111 | " \"\"\"\n", 112 | " if (domain_feature_df is None) or (conservation_feature_df is None):\n", 113 | " lite = True\n", 114 | " domain_feature_df = None\n", 115 | " conservation_feature_df = None\n", 116 | " else:\n", 117 | " lite = False\n", 118 | " model = load_target_model(lite=lite)\n", 119 | " if id_cols is None:\n", 120 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']\n", 121 | " target_feature_df, target_feature_cols = targetfeat.merge_feature_dfs(design_df,\n", 122 | " aa_subseq_df=aa_subseq_df,\n", 123 | " domain_df=domain_feature_df,\n", 124 | " conservation_df=conservation_feature_df,\n", 125 | " id_cols=id_cols)\n", 126 | " X_target = target_feature_df[target_feature_cols]\n", 127 | " predictions = model.predict(X_target)\n", 128 | " return predictions" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n", 138 | "design_targ_df = targetfeat.add_target_columns(design_df)\n", 139 | "id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Getting amino acid sequences\n" 152 | ] 153 | }, 154 | { 155 | "name": "stderr", 156 | "output_type": "stream", 157 | "text": [ 158 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.12s/it]\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/html": [ 164 | "
\n", 165 | "\n", 178 | "\n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | "
Target TranscriptTarget Total LengthTranscript BaseversionmoleculedescidseqAA lenAA IndexOrientationsgRNA Context SequenceTarget Cut Lengthextended_seqAA 0-IndexedAA 0-Indexed paddedseq_startseq_endAA Subsequence
0ENST00000259457.8834ENST000002594573proteinNoneENSP00000259457MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...27764senseTGGAGCAGATACAAGAGCAACTGAAGGGAT191-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...63806496GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1ENST00000259457.8834ENST000002594573proteinNoneENSP00000259457MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...27746senseCCGGAAAACTGGCACGACCATCGCTGGGGT137-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...45624678AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2ENST00000394249.81863ENST000003942493proteinNoneENSP00000377793MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...620106senseTAGAAAAAGATTTGCGCACCCAAGTGGAAT316-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...105122106138EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3ENST00000394249.81863ENST000003942493proteinNoneENSP00000377793MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...620263antisenseTGGCCTTTGACCCAGACATAATGGTGGCCA787-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...262279263295WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4ENST00000361337.32298ENST000003613372proteinNoneENSP00000354522MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...765140antisenseAAATACTCACTCATCCTCATCTCGAGGTCT420-----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK...139156140172GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
............................................................
395ENST00000454402.71023ENST000004544022proteinNoneENSP00000408295METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...34074antisenseTGTCTTTATATAGCTGTTTCGCACAGGCTA220-----------------METSALKQQEQPAATKIRNLPWVEKYRPQ...739074106LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
396ENST00000254998.3423ENST000002549982proteinNoneENSP00000254998MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...14027senseTTGTCAATGTCTACTACACCACCATGGATA79-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...26432759DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
397ENST00000254998.3423ENST000002549982proteinNoneENSP00000254998MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...14039senseGGCGTTTGCTGTCCCGCCTGTACATGGGCA115-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...38553971VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
398ENST00000381685.102067ENST000003816855proteinNoneENSP00000371101MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...688259antisenseACTAGCAATGGCTTATCAGATCGAAGGTCA776-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...258275259291TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
399ENST00000381685.102067ENST000003816855proteinNoneENSP00000371101MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...688108senseAAATTTTGTCTGATGACTACTCAAAGGTAT322-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...107124108140CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ
\n", 448 | "

400 rows × 19 columns

\n", 449 | "
" 450 | ], 451 | "text/plain": [ 452 | " Target Transcript Target Total Length Transcript Base version \\\n", 453 | "0 ENST00000259457.8 834 ENST00000259457 3 \n", 454 | "1 ENST00000259457.8 834 ENST00000259457 3 \n", 455 | "2 ENST00000394249.8 1863 ENST00000394249 3 \n", 456 | "3 ENST00000394249.8 1863 ENST00000394249 3 \n", 457 | "4 ENST00000361337.3 2298 ENST00000361337 2 \n", 458 | ".. ... ... ... ... \n", 459 | "395 ENST00000454402.7 1023 ENST00000454402 2 \n", 460 | "396 ENST00000254998.3 423 ENST00000254998 2 \n", 461 | "397 ENST00000254998.3 423 ENST00000254998 2 \n", 462 | "398 ENST00000381685.10 2067 ENST00000381685 5 \n", 463 | "399 ENST00000381685.10 2067 ENST00000381685 5 \n", 464 | "\n", 465 | " molecule desc id \\\n", 466 | "0 protein None ENSP00000259457 \n", 467 | "1 protein None ENSP00000259457 \n", 468 | "2 protein None ENSP00000377793 \n", 469 | "3 protein None ENSP00000377793 \n", 470 | "4 protein None ENSP00000354522 \n", 471 | ".. ... ... ... \n", 472 | "395 protein None ENSP00000408295 \n", 473 | "396 protein None ENSP00000254998 \n", 474 | "397 protein None ENSP00000254998 \n", 475 | "398 protein None ENSP00000371101 \n", 476 | "399 protein None ENSP00000371101 \n", 477 | "\n", 478 | " seq AA len AA Index \\\n", 479 | "0 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 64 \n", 480 | "1 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 46 \n", 481 | "2 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 106 \n", 482 | "3 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 263 \n", 483 | "4 MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... 765 140 \n", 484 | ".. ... ... ... \n", 485 | "395 METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... 340 74 \n", 486 | "396 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 27 \n", 487 | "397 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 39 \n", 488 | "398 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 259 \n", 489 | "399 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 108 \n", 490 | "\n", 491 | " Orientation sgRNA Context Sequence Target Cut Length \\\n", 492 | "0 sense TGGAGCAGATACAAGAGCAACTGAAGGGAT 191 \n", 493 | "1 sense CCGGAAAACTGGCACGACCATCGCTGGGGT 137 \n", 494 | "2 sense TAGAAAAAGATTTGCGCACCCAAGTGGAAT 316 \n", 495 | "3 antisense TGGCCTTTGACCCAGACATAATGGTGGCCA 787 \n", 496 | "4 antisense AAATACTCACTCATCCTCATCTCGAGGTCT 420 \n", 497 | ".. ... ... ... \n", 498 | "395 antisense TGTCTTTATATAGCTGTTTCGCACAGGCTA 220 \n", 499 | "396 sense TTGTCAATGTCTACTACACCACCATGGATA 79 \n", 500 | "397 sense GGCGTTTGCTGTCCCGCCTGTACATGGGCA 115 \n", 501 | "398 antisense ACTAGCAATGGCTTATCAGATCGAAGGTCA 776 \n", 502 | "399 sense AAATTTTGTCTGATGACTACTCAAAGGTAT 322 \n", 503 | "\n", 504 | " extended_seq AA 0-Indexed \\\n", 505 | "0 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 63 \n", 506 | "1 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 45 \n", 507 | "2 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 105 \n", 508 | "3 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 262 \n", 509 | "4 -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... 139 \n", 510 | ".. ... ... \n", 511 | "395 -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... 73 \n", 512 | "396 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 26 \n", 513 | "397 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 38 \n", 514 | "398 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 258 \n", 515 | "399 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 
107 \n", 516 | "\n", 517 | " AA 0-Indexed padded seq_start seq_end \\\n", 518 | "0 80 64 96 \n", 519 | "1 62 46 78 \n", 520 | "2 122 106 138 \n", 521 | "3 279 263 295 \n", 522 | "4 156 140 172 \n", 523 | ".. ... ... ... \n", 524 | "395 90 74 106 \n", 525 | "396 43 27 59 \n", 526 | "397 55 39 71 \n", 527 | "398 275 259 291 \n", 528 | "399 124 108 140 \n", 529 | "\n", 530 | " AA Subsequence \n", 531 | "0 GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI \n", 532 | "1 AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR \n", 533 | "2 EEGETTILQLEKDLRTQVELMRKQKKERKQELK \n", 534 | "3 WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV \n", 535 | "4 GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED \n", 536 | ".. ... \n", 537 | "395 LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL \n", 538 | "396 DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA \n", 539 | "397 VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ \n", 540 | "398 TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI \n", 541 | "399 CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ \n", 542 | "\n", 543 | "[400 rows x 19 columns]" 544 | ] 545 | }, 546 | "execution_count": null, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "## aa sequences\n", 553 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)\n", 554 | "aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n", 555 | " id_cols=id_cols)\n", 556 | "aa_subseq_df" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "name": "stdout", 566 | "output_type": "stream", 567 | "text": [ 568 | "Getting protein domains\n" 569 | ] 570 | }, 571 | { 572 | "name": "stderr", 573 | "output_type": "stream", 574 | "text": [ 575 | "100%|█████████████████████████████████████████| 200/200 [00:53<00:00, 3.75it/s]\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "## domains\n", 581 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n", 582 | "domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,\n", 583 | " id_cols=id_cols)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "Getting conservation\n" 596 | ] 597 | }, 598 | { 599 | "name": "stderr", 600 | "output_type": "stream", 601 | "text": [ 602 | "100%|█████████████████████████████████████████| 200/200 [06:24<00:00, 1.92s/it]\n" 603 | ] 604 | }, 605 | { 606 | "data": { 607 | "text/html": [ 608 | "
\n", 609 | "\n", 622 | "\n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | "
sgRNA Context SequenceTarget Cut LengthTarget TranscriptOrientationcons_4cons_32
0AAAAGAATGATGAAAAGACACCACAGGGAG244ENST00000610426.5sense0.2182310.408844
1AAAAGAGCCATGAATCTAAACATCAGGAAT640ENST00000223073.6sense0.1298250.278180
2AAAAGCGCCAAATGGCCCGAGAATTGGGAG709ENST00000331923.9sense0.4709060.532305
3AAACAGAAAAAGTTAAAATCACCAAGGTGT496ENST00000283882.4sense0.5805560.602708
4AAACAGATGGAAGATGCTTACCGGGGGACC132ENST00000393047.8sense0.2834470.414293
.....................
395TTTGATTGCATTAAGGTTGGACTCTGGATT246ENST00000249269.9sense0.5806120.618707
396TTTGCCCACAGCTCCAAAGCATCGCGGAGA130ENST00000227618.8sense0.3237700.416368
397TTTTACAGTGCGATGTATGATGTATGGCTT119ENST00000338366.6sense0.7880000.537417
398TTTTGGATCTCGTAGTGATTCAAGAGGGAA233ENST00000629496.3sense0.2396300.347615
399TTTTTGTTACTACAGGTTCGCTGCTGGGAA201ENST00000395840.6sense0.6937670.639044
\n", 736 | "

400 rows × 6 columns

\n", 737 | "
" 738 | ], 739 | "text/plain": [ 740 | " sgRNA Context Sequence Target Cut Length Target Transcript \\\n", 741 | "0 AAAAGAATGATGAAAAGACACCACAGGGAG 244 ENST00000610426.5 \n", 742 | "1 AAAAGAGCCATGAATCTAAACATCAGGAAT 640 ENST00000223073.6 \n", 743 | "2 AAAAGCGCCAAATGGCCCGAGAATTGGGAG 709 ENST00000331923.9 \n", 744 | "3 AAACAGAAAAAGTTAAAATCACCAAGGTGT 496 ENST00000283882.4 \n", 745 | "4 AAACAGATGGAAGATGCTTACCGGGGGACC 132 ENST00000393047.8 \n", 746 | ".. ... ... ... \n", 747 | "395 TTTGATTGCATTAAGGTTGGACTCTGGATT 246 ENST00000249269.9 \n", 748 | "396 TTTGCCCACAGCTCCAAAGCATCGCGGAGA 130 ENST00000227618.8 \n", 749 | "397 TTTTACAGTGCGATGTATGATGTATGGCTT 119 ENST00000338366.6 \n", 750 | "398 TTTTGGATCTCGTAGTGATTCAAGAGGGAA 233 ENST00000629496.3 \n", 751 | "399 TTTTTGTTACTACAGGTTCGCTGCTGGGAA 201 ENST00000395840.6 \n", 752 | "\n", 753 | " Orientation cons_4 cons_32 \n", 754 | "0 sense 0.218231 0.408844 \n", 755 | "1 sense 0.129825 0.278180 \n", 756 | "2 sense 0.470906 0.532305 \n", 757 | "3 sense 0.580556 0.602708 \n", 758 | "4 sense 0.283447 0.414293 \n", 759 | ".. ... ... ... \n", 760 | "395 sense 0.580612 0.618707 \n", 761 | "396 sense 0.323770 0.416368 \n", 762 | "397 sense 0.788000 0.537417 \n", 763 | "398 sense 0.239630 0.347615 \n", 764 | "399 sense 0.693767 0.639044 \n", 765 | "\n", 766 | "[400 rows x 6 columns]" 767 | ] 768 | }, 769 | "execution_count": null, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "## conservation\n", 776 | "conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)\n", 777 | "conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,\n", 778 | " small_width=2, large_width=16,\n", 779 | " conservation_column='ranked_conservation',\n", 780 | " id_cols=id_cols)\n", 781 | "conservation_feature_df" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [ 789 | { 790 | "name": "stderr", 791 | "output_type": "stream", 792 | "text": [ 793 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 794 | " warnings.warn(\n", 795 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 796 | " warnings.warn(\n" 797 | ] 798 | } 799 | ], 800 | "source": [ 801 | "predictions = predict_target(design_df=design_df,\n", 802 | " aa_subseq_df=aa_subseq_df,\n", 803 | " domain_feature_df=domain_feature_df,\n", 804 | " conservation_feature_df=conservation_feature_df)\n", 805 | "design_df['Target Score'] = predictions" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 814 | "name": "stderr", 815 | "output_type": "stream", 816 | "text": [ 817 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. 
Use at your own risk.\n", 818 | " warnings.warn(\n", 819 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 820 | " warnings.warn(\n" 821 | ] 822 | } 823 | ], 824 | "source": [ 825 | "lite_predictions = predict_target(design_df=design_df,\n", 826 | " aa_subseq_df=aa_subseq_df)\n", 827 | "design_df['Target Score Lite'] = lite_predictions" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "data": { 837 | "text/plain": [ 838 | "0 TGGAGCAGATACAAGAGCAACTGAAGGGAT\n", 839 | "1 CCGGAAAACTGGCACGACCATCGCTGGGGT\n", 840 | "2 TAGAAAAAGATTTGCGCACCCAAGTGGAAT\n", 841 | "3 TGGCCTTTGACCCAGACATAATGGTGGCCA\n", 842 | "4 AAATACTCACTCATCCTCATCTCGAGGTCT\n", 843 | " ... \n", 844 | "395 TGTCTTTATATAGCTGTTTCGCACAGGCTA\n", 845 | "396 TTGTCAATGTCTACTACACCACCATGGATA\n", 846 | "397 GGCGTTTGCTGTCCCGCCTGTACATGGGCA\n", 847 | "398 ACTAGCAATGGCTTATCAGATCGAAGGTCA\n", 848 | "399 AAATTTTGTCTGATGACTACTCAAAGGTAT\n", 849 | "Name: sgRNA Context Sequence, Length: 400, dtype: object" 850 | ] 851 | }, 852 | "execution_count": null, 853 | "metadata": {}, 854 | "output_type": "execute_result" 855 | } 856 | ], 857 | "source": [ 858 | "design_df['sgRNA Context Sequence']" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')\n", 877 | "gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')\n", 878 | "\n", 879 | "sanger_designs = sanger_df.merge(design_df, how='inner',\n", 880 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n", 881 | " 'Target Cut %'])\n", 882 | "gecko_designs = gecko_df.merge(design_df, how='inner',\n", 883 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n", 884 | " 'Target Cut %'])\n", 885 | "assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],\n", 886 | " sanger_designs['Target Score'])[0] > 0.2\n", 887 | "assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],\n", 888 | " gecko_designs['Target Score'])[0] > 0.05" 889 | ] 890 | } 891 | ], 892 | "metadata": { 893 | "kernelspec": { 894 | "display_name": "rs3_v2", 895 | "language": "python", 896 | "name": "rs3_v2" 897 | } 898 | }, 899 | "nbformat": 4, 900 | "nbformat_minor": 4 901 | } 902 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | ## How to get started 4 | 5 | Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it: 6 | ``` 7 | nbdev_install_git_hooks 8 | ``` 9 | 10 | ## Did you find a bug? 11 | 12 | * Ensure the bug was not already reported by searching on GitHub under Issues. 
13 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. 14 | * Be sure to add the complete error messages. 15 | 16 | #### Did you write a patch that fixes a bug? 17 | 18 | * Open a new GitHub pull request with the patch. 19 | * Ensure that your PR includes a test that fails without your patch, and passes with it. 20 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 21 | 22 | ## PR submission guidelines 23 | 24 | * Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused. 25 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs, and they will most likely get rejected. 26 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can. 27 | * Do not turn an already submitted PR into your development playground. If, after you have submitted a PR, you discover that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project. 28 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exceptional case where you realize it will take many commits to complete the requests, it's probably best to close the PR, do the work, and then submit it again. Use common sense to choose one way over the other. 29 | 30 | ## Do you want to contribute to the documentation? 31 | 32 | * Docs are automatically created from the notebooks in the nbs folder. 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | include rs3/RuleSet3.pkl 6 | include rs3/target_model.pkl 7 | include rs3/target_lite_model.pkl 8 | recursive-exclude * __pycache__ 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | SHELL := /bin/bash 3 | SRC = $(wildcard ./*.ipynb) 4 | 5 | all: rs3 docs 6 | 7 | rs3: $(SRC) 8 | nbdev_build_lib 9 | touch rs3 10 | 11 | sync: 12 | nbdev_update_lib 13 | 14 | docs_serve: docs 15 | cd docs && bundle exec jekyll serve 16 | 17 | docs: $(SRC) 18 | nbdev_build_docs 19 | touch docs 20 | 21 | test: 22 | nbdev_test_nbs 23 | 24 | release: pypi conda_release 25 | nbdev_bump_version 26 | 27 | conda_release: 28 | fastrelease_conda_package 29 | 30 | pypi: dist 31 | twine upload --repository pypi dist/* 32 | 33 | dist: clean 34 | python setup.py sdist bdist_wheel 35 | 36 | clean: 37 | rm -rf dist -------------------------------------------------------------------------------- /RuleSet3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/RuleSet3.pkl -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | fastai: &fastai 4 | restart: unless-stopped 5 | working_dir: /data 6 | image: fastai/codespaces 7 | logging: 8 | driver: json-file 9 | options: 10 | max-size: 50m 11 | stdin_open: true 12 | tty: true 13 | volumes: 14 | - .:/data/ 15 | 16 | notebook: 17 | <<: *fastai 18 | command: bash -c "pip install -e . && jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''" 19 | ports: 20 | - "8080:8080" 21 | 22 | watcher: 23 | <<: *fastai 24 | command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop 25 | network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/ 26 | 27 | jekyll: 28 | <<: *fastai 29 | ports: 30 | - "4000:4000" 31 | command: > 32 | bash -c "pip install . 33 | && nbdev_build_docs && cd docs 34 | && bundle i 35 | && chmod -R u+rwx . 
&& bundle exec jekyll serve --host 0.0.0.0" 36 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site/ 2 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem 'github-pages', group: :jekyll_plugins 4 | 5 | # Added at 2019-11-25 10:11:40 -0800 by jhoward: 6 | gem "nokogiri", "< 1.11.1" 7 | gem "jekyll", ">= 3.7" 8 | gem "kramdown", ">= 2.3.1" 9 | gem "jekyll-remote-theme" 10 | -------------------------------------------------------------------------------- /docs/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (6.0.3.6) 5 | concurrent-ruby (~> 1.0, >= 1.0.2) 6 | i18n (>= 0.7, < 2) 7 | minitest (~> 5.1) 8 | tzinfo (~> 1.1) 9 | zeitwerk (~> 2.2, >= 2.2.2) 10 | addressable (2.7.0) 11 | public_suffix (>= 2.0.2, < 5.0) 12 | coffee-script (2.4.1) 13 | coffee-script-source 14 | execjs 15 | coffee-script-source (1.11.1) 16 | colorator (1.1.0) 17 | commonmarker (0.17.13) 18 | ruby-enum (~> 0.5) 19 | concurrent-ruby (1.1.8) 20 | dnsruby (1.61.5) 21 | simpleidn (~> 0.1) 22 | em-websocket (0.5.2) 23 | eventmachine (>= 0.12.9) 24 | http_parser.rb (~> 0.6.0) 25 | ethon (0.12.0) 26 | ffi (>= 1.3.0) 27 | eventmachine (1.2.7) 28 | execjs (2.7.0) 29 | faraday (1.3.0) 30 | faraday-net_http (~> 1.0) 31 | multipart-post (>= 1.2, < 3) 32 | ruby2_keywords 33 | faraday-net_http (1.0.1) 34 | ffi (1.15.0) 35 | forwardable-extended (2.6.0) 36 | gemoji (3.0.1) 37 | github-pages (214) 38 | github-pages-health-check (= 1.17.0) 39 | jekyll (= 3.9.0) 40 | jekyll-avatar (= 0.7.0) 41 | jekyll-coffeescript (= 1.1.1) 42 | jekyll-commonmark-ghpages (= 0.1.6) 43 | jekyll-default-layout (= 0.1.4) 44 | jekyll-feed (= 0.15.1) 45 | jekyll-gist (= 1.5.0) 46 | jekyll-github-metadata (= 2.13.0) 47 | jekyll-mentions (= 1.6.0) 48 | jekyll-optional-front-matter (= 0.3.2) 49 | jekyll-paginate (= 1.1.0) 50 | jekyll-readme-index (= 0.3.0) 51 | jekyll-redirect-from (= 0.16.0) 52 | jekyll-relative-links (= 0.6.1) 53 | jekyll-remote-theme (= 0.4.3) 54 | jekyll-sass-converter (= 1.5.2) 55 | jekyll-seo-tag (= 2.7.1) 56 | jekyll-sitemap (= 1.4.0) 57 | jekyll-swiss (= 1.0.0) 58 | jekyll-theme-architect (= 0.1.1) 59 | jekyll-theme-cayman (= 0.1.1) 60 | jekyll-theme-dinky (= 0.1.1) 61 | jekyll-theme-hacker (= 0.1.2) 62 | jekyll-theme-leap-day (= 0.1.1) 63 | jekyll-theme-merlot (= 0.1.1) 64 | jekyll-theme-midnight (= 0.1.1) 65 | jekyll-theme-minimal (= 0.1.1) 66 | jekyll-theme-modernist (= 0.1.1) 67 | jekyll-theme-primer (= 0.5.4) 68 | jekyll-theme-slate (= 0.1.1) 69 | jekyll-theme-tactile (= 0.1.1) 70 | jekyll-theme-time-machine (= 0.1.1) 71 | jekyll-titles-from-headings (= 0.5.3) 72 | jemoji (= 0.12.0) 73 | kramdown (= 2.3.1) 74 | kramdown-parser-gfm (= 1.1.0) 75 | liquid (= 4.0.3) 76 | mercenary (~> 0.3) 77 | minima (= 2.5.1) 78 | nokogiri (>= 1.10.4, < 2.0) 79 | rouge (= 3.26.0) 80 | terminal-table (~> 1.4) 81 | github-pages-health-check (1.17.0) 82 | addressable (~> 2.3) 83 | dnsruby (~> 1.60) 84 | octokit (~> 4.0) 85 | public_suffix (>= 2.0.2, < 5.0) 86 | typhoeus (~> 1.3) 87 | html-pipeline (2.14.0) 88 | activesupport (>= 2) 89 | nokogiri (>= 1.4) 90 | http_parser.rb (0.6.0) 91 | i18n (0.9.5) 92 | concurrent-ruby (~> 1.0) 93 | 
jekyll (3.9.0) 94 | addressable (~> 2.4) 95 | colorator (~> 1.0) 96 | em-websocket (~> 0.5) 97 | i18n (~> 0.7) 98 | jekyll-sass-converter (~> 1.0) 99 | jekyll-watch (~> 2.0) 100 | kramdown (>= 1.17, < 3) 101 | liquid (~> 4.0) 102 | mercenary (~> 0.3.3) 103 | pathutil (~> 0.9) 104 | rouge (>= 1.7, < 4) 105 | safe_yaml (~> 1.0) 106 | jekyll-avatar (0.7.0) 107 | jekyll (>= 3.0, < 5.0) 108 | jekyll-coffeescript (1.1.1) 109 | coffee-script (~> 2.2) 110 | coffee-script-source (~> 1.11.1) 111 | jekyll-commonmark (1.3.1) 112 | commonmarker (~> 0.14) 113 | jekyll (>= 3.7, < 5.0) 114 | jekyll-commonmark-ghpages (0.1.6) 115 | commonmarker (~> 0.17.6) 116 | jekyll-commonmark (~> 1.2) 117 | rouge (>= 2.0, < 4.0) 118 | jekyll-default-layout (0.1.4) 119 | jekyll (~> 3.0) 120 | jekyll-feed (0.15.1) 121 | jekyll (>= 3.7, < 5.0) 122 | jekyll-gist (1.5.0) 123 | octokit (~> 4.2) 124 | jekyll-github-metadata (2.13.0) 125 | jekyll (>= 3.4, < 5.0) 126 | octokit (~> 4.0, != 4.4.0) 127 | jekyll-mentions (1.6.0) 128 | html-pipeline (~> 2.3) 129 | jekyll (>= 3.7, < 5.0) 130 | jekyll-optional-front-matter (0.3.2) 131 | jekyll (>= 3.0, < 5.0) 132 | jekyll-paginate (1.1.0) 133 | jekyll-readme-index (0.3.0) 134 | jekyll (>= 3.0, < 5.0) 135 | jekyll-redirect-from (0.16.0) 136 | jekyll (>= 3.3, < 5.0) 137 | jekyll-relative-links (0.6.1) 138 | jekyll (>= 3.3, < 5.0) 139 | jekyll-remote-theme (0.4.3) 140 | addressable (~> 2.0) 141 | jekyll (>= 3.5, < 5.0) 142 | jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) 143 | rubyzip (>= 1.3.0, < 3.0) 144 | jekyll-sass-converter (1.5.2) 145 | sass (~> 3.4) 146 | jekyll-seo-tag (2.7.1) 147 | jekyll (>= 3.8, < 5.0) 148 | jekyll-sitemap (1.4.0) 149 | jekyll (>= 3.7, < 5.0) 150 | jekyll-swiss (1.0.0) 151 | jekyll-theme-architect (0.1.1) 152 | jekyll (~> 3.5) 153 | jekyll-seo-tag (~> 2.0) 154 | jekyll-theme-cayman (0.1.1) 155 | jekyll (~> 3.5) 156 | jekyll-seo-tag (~> 2.0) 157 | jekyll-theme-dinky (0.1.1) 158 | jekyll (~> 3.5) 159 | jekyll-seo-tag (~> 2.0) 160 | jekyll-theme-hacker (0.1.2) 161 | jekyll (> 3.5, < 5.0) 162 | jekyll-seo-tag (~> 2.0) 163 | jekyll-theme-leap-day (0.1.1) 164 | jekyll (~> 3.5) 165 | jekyll-seo-tag (~> 2.0) 166 | jekyll-theme-merlot (0.1.1) 167 | jekyll (~> 3.5) 168 | jekyll-seo-tag (~> 2.0) 169 | jekyll-theme-midnight (0.1.1) 170 | jekyll (~> 3.5) 171 | jekyll-seo-tag (~> 2.0) 172 | jekyll-theme-minimal (0.1.1) 173 | jekyll (~> 3.5) 174 | jekyll-seo-tag (~> 2.0) 175 | jekyll-theme-modernist (0.1.1) 176 | jekyll (~> 3.5) 177 | jekyll-seo-tag (~> 2.0) 178 | jekyll-theme-primer (0.5.4) 179 | jekyll (> 3.5, < 5.0) 180 | jekyll-github-metadata (~> 2.9) 181 | jekyll-seo-tag (~> 2.0) 182 | jekyll-theme-slate (0.1.1) 183 | jekyll (~> 3.5) 184 | jekyll-seo-tag (~> 2.0) 185 | jekyll-theme-tactile (0.1.1) 186 | jekyll (~> 3.5) 187 | jekyll-seo-tag (~> 2.0) 188 | jekyll-theme-time-machine (0.1.1) 189 | jekyll (~> 3.5) 190 | jekyll-seo-tag (~> 2.0) 191 | jekyll-titles-from-headings (0.5.3) 192 | jekyll (>= 3.3, < 5.0) 193 | jekyll-watch (2.2.1) 194 | listen (~> 3.0) 195 | jemoji (0.12.0) 196 | gemoji (~> 3.0) 197 | html-pipeline (~> 2.2) 198 | jekyll (>= 3.0, < 5.0) 199 | kramdown (2.3.1) 200 | rexml 201 | kramdown-parser-gfm (1.1.0) 202 | kramdown (~> 2.0) 203 | liquid (4.0.3) 204 | listen (3.5.1) 205 | rb-fsevent (~> 0.10, >= 0.10.3) 206 | rb-inotify (~> 0.9, >= 0.9.10) 207 | mercenary (0.3.6) 208 | mini_portile2 (2.5.0) 209 | minima (2.5.1) 210 | jekyll (>= 3.5, < 5.0) 211 | jekyll-feed (~> 0.9) 212 | jekyll-seo-tag (~> 2.1) 213 | minitest (5.14.4) 214 | multipart-post 
(2.1.1) 215 | nokogiri (1.11.0) 216 | mini_portile2 (~> 2.5.0) 217 | racc (~> 1.4) 218 | octokit (4.20.0) 219 | faraday (>= 0.9) 220 | sawyer (~> 0.8.0, >= 0.5.3) 221 | pathutil (0.16.2) 222 | forwardable-extended (~> 2.6) 223 | public_suffix (4.0.6) 224 | racc (1.5.2) 225 | rb-fsevent (0.10.4) 226 | rb-inotify (0.10.1) 227 | ffi (~> 1.0) 228 | rexml (3.2.5) 229 | rouge (3.26.0) 230 | ruby-enum (0.9.0) 231 | i18n 232 | ruby2_keywords (0.0.4) 233 | rubyzip (2.3.0) 234 | safe_yaml (1.0.5) 235 | sass (3.7.4) 236 | sass-listen (~> 4.0.0) 237 | sass-listen (4.0.0) 238 | rb-fsevent (~> 0.9, >= 0.9.4) 239 | rb-inotify (~> 0.9, >= 0.9.7) 240 | sawyer (0.8.2) 241 | addressable (>= 2.3.5) 242 | faraday (> 0.8, < 2.0) 243 | simpleidn (0.2.1) 244 | unf (~> 0.1.4) 245 | terminal-table (1.8.0) 246 | unicode-display_width (~> 1.1, >= 1.1.1) 247 | thread_safe (0.3.6) 248 | typhoeus (1.4.0) 249 | ethon (>= 0.9.0) 250 | tzinfo (1.2.9) 251 | thread_safe (~> 0.1) 252 | unf (0.1.4) 253 | unf_ext 254 | unf_ext (0.0.7.7) 255 | unicode-display_width (1.7.0) 256 | zeitwerk (2.4.2) 257 | 258 | PLATFORMS 259 | ruby 260 | 261 | DEPENDENCIES 262 | github-pages 263 | jekyll (>= 3.7) 264 | jekyll-remote-theme 265 | kramdown (>= 2.3.1) 266 | nokogiri (< 1.11.1) 267 | 268 | BUNDLED WITH 269 | 2.1.4 270 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | repository: gpp-rnd/rs3 2 | output: web 3 | topnav_title: rs3 4 | site_title: rs3 5 | company_name: Genetic Perturbation Platform, Broad Institute 6 | description: Predict the activity of CRISPR sgRNAs 7 | # Set to false to disable KaTeX math 8 | use_math: true 9 | # Add Google analytics id if you have one and want to use it here 10 | google_analytics: 11 | # See http://nbdev.fast.ai/search for help with adding Search 12 | google_search: 13 | 14 | host: 127.0.0.1 15 | # the preview server used. Leave as is. 16 | port: 4000 17 | # the port where the preview is rendered. 18 | 19 | exclude: 20 | - .idea/ 21 | - .gitignore 22 | - vendor 23 | 24 | exclude: [vendor] 25 | 26 | highlighter: rouge 27 | markdown: kramdown 28 | kramdown: 29 | input: GFM 30 | auto_ids: true 31 | hard_wrap: false 32 | syntax_highlighter: rouge 33 | 34 | collections: 35 | tooltips: 36 | output: false 37 | 38 | defaults: 39 | - 40 | scope: 41 | path: "" 42 | type: "pages" 43 | values: 44 | layout: "page" 45 | comments: true 46 | search: true 47 | sidebar: home_sidebar 48 | topnav: topnav 49 | - 50 | scope: 51 | path: "" 52 | type: "tooltips" 53 | values: 54 | layout: "page" 55 | comments: true 56 | search: true 57 | tooltip: true 58 | 59 | sidebars: 60 | - home_sidebar 61 | 62 | plugins: 63 | - jekyll-remote-theme 64 | 65 | remote_theme: fastai/nbdev-jekyll-theme 66 | baseurl: /rs3/ -------------------------------------------------------------------------------- /docs/_data/sidebars/home_sidebar.yml: -------------------------------------------------------------------------------- 1 | 2 | ################################################# 3 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! 
### 4 | ################################################# 5 | # Instead edit ../../sidebar.json 6 | entries: 7 | - folders: 8 | - folderitems: 9 | - output: web,pdf 10 | title: Overview 11 | url: / 12 | - output: web,pdf 13 | title: seq 14 | url: seq.html 15 | - output: web,pdf 16 | title: targetdata 17 | url: targetdata.html 18 | - output: web,pdf 19 | title: targetfeat 20 | url: targetfeat.html 21 | - output: web,pdf 22 | title: predicttarg 23 | url: predicttarg.html 24 | - output: web,pdf 25 | title: predict 26 | url: predict.html 27 | output: web 28 | title: rs3 29 | output: web 30 | title: Sidebar 31 | -------------------------------------------------------------------------------- /docs/_data/topnav.yml: -------------------------------------------------------------------------------- 1 | topnav: 2 | - title: Topnav 3 | items: 4 | - title: github 5 | external_url: https://github.com/gpp-rnd/rs3/tree/master/ 6 | 7 | #Topnav dropdowns 8 | topnav_dropdowns: 9 | - title: Topnav dropdowns 10 | folders: -------------------------------------------------------------------------------- /docs/feed.xml: -------------------------------------------------------------------------------- 1 | --- 2 | search: exclude 3 | layout: none 4 | --- 5 | 6 | 7 | 8 | 9 | {{ site.title | xml_escape }} 10 | {{ site.description | xml_escape }} 11 | {{ site.url }}/ 12 | 13 | {{ site.time | date_to_rfc822 }} 14 | {{ site.time | date_to_rfc822 }} 15 | Jekyll v{{ jekyll.version }} 16 | {% for post in site.posts limit:10 %} 17 | 18 | {{ post.title | xml_escape }} 19 | {{ post.content | xml_escape }} 20 | {{ post.date | date_to_rfc822 }} 21 | {{ post.url | prepend: site.url }} 22 | {{ post.url | prepend: site.url }} 23 | {% for tag in post.tags %} 24 | {{ tag | xml_escape }} 25 | {% endfor %} 26 | {% for tag in page.tags %} 27 | {{ cat | xml_escape }} 28 | {% endfor %} 29 | 30 | {% endfor %} 31 | 32 | 33 | -------------------------------------------------------------------------------- /docs/images/output_18_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_18_0.png -------------------------------------------------------------------------------- /docs/images/output_42_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_42_0.png -------------------------------------------------------------------------------- /docs/predicttarg.html: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | title: predicttarg 4 | 5 | 6 | keywords: fastai 7 | sidebar: home_sidebar 8 | 9 | summary: "Rule set 3 target-site predictions" 10 | description: "Rule set 3 target-site predictions" 11 | nb_path: "03_predicttarg.ipynb" 12 | --- 13 | 22 | 23 |
24 | 25 | {% raw %} 26 | 27 |
28 | 29 |
30 | {% endraw %} 31 | 32 | {% raw %} 33 | 34 |
35 | 36 |
37 | {% endraw %} 38 | 39 | {% raw %} 40 | 41 |
42 |
43 | 44 |
45 |
46 |
import lightgbm
 47 | import pandas as pd
 48 | from rs3 import targetdata
 49 | from scipy import stats
 50 | import numpy as np
 51 | 
52 | 53 |
54 |
55 |
56 | 57 |
58 | {% endraw %} 59 | 60 | {% raw %} 61 | 62 |
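The import cell above leaves out a few names that later cells on this page rely on. A minimal sketch of the extra setup needed to run the page standalone; the grouping into one hidden setup cell is an assumption, but each import below points at where the name is actually defined in this repository:

import os  # used in the next cell to set __file__
from rs3 import targetfeat  # add_target_columns, get_aa_subseq_df, and the feature builders used below
from rs3.predicttarg import load_target_model, predict_target  # the functions documented on this page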
63 |
64 | 65 |
66 |
67 |
__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'
 68 | import multiprocessing
 69 | max_n_jobs = multiprocessing.cpu_count()
 70 | 
71 | 72 |
73 |
74 |
75 | 76 |
77 | {% endraw %} 78 | 79 | {% raw %} 80 | 81 |
82 | 83 |
84 |
85 | 86 |
87 | 88 | 89 |
90 |

load_target_model[source]

load_target_model(lite=False)

91 |
92 |

Load rule set 3 target model

93 | 94 |
95 | 96 |
97 | 98 |
99 |
100 | 101 |
102 | {% endraw %} 103 | 104 | {% raw %} 105 | 106 |
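As a quick, hedged illustration of the API above (a sketch, not an output cell from the notebook): the returned object is a pickled scikit-learn pipeline whose final step is a LightGBM regressor, and lite=True selects the smaller model shipped with the package.

full_model = load_target_model()           # loads rs3/target_model.pkl
lite_model = load_target_model(lite=True)  # loads rs3/target_lite_model.pkl
full_model['regressor']                    # LGBMRegressor, as the assertion below checks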
107 | 108 |
109 | {% endraw %} 110 | 111 | {% raw %} 112 | 113 |
114 |
115 | 116 |
117 |
118 |
assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor
119 | 
120 | 121 |
122 |
123 |
124 | 125 |
126 |
127 | 128 |
129 | 130 |
131 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
132 |   warnings.warn(
133 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
134 |   warnings.warn(
135 | 
136 |
137 |
138 | 139 |
140 |
141 | 142 |
143 | {% endraw %} 144 | 145 | {% raw %} 146 | 147 |
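The UserWarning output above is expected: the pickled pipeline was trained under scikit-learn 1.0.dev0 but is unpickled here with 0.24.2. If the warnings clutter your logs, they can be filtered with the standard library (a use-at-your-own-risk convenience, mirroring the warning's own caveat):

import warnings
warnings.filterwarnings('ignore', message='Trying to unpickle estimator')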
148 | 149 |
150 |
151 | 152 |
153 | 154 | 155 |
156 |

predict_target[source]

predict_target(design_df, aa_subseq_df, domain_feature_df=None, conservation_feature_df=None, id_cols=None)

157 |
158 |

Make predictions using the Rule Set 3 target model. Note that if domain_feature_df 159 | or conservation_feature_df is not supplied, then the lite model will be used; otherwise the full model is used.

160 |

:param design_df: DataFrame 161 | :param aa_subseq_df: DataFrame 162 | :param domain_feature_df: DataFrame 163 | :param id_cols: list or str 164 | :return: list

165 | 166 |
167 | 168 |
169 | 170 |
171 |
172 | 173 |
174 | {% endraw %} 175 | 176 | {% raw %} 177 | 178 |
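The rest of this page builds each of these inputs from a design file. Schematically, the two calling patterns look like this (a sketch reusing the variables constructed in the cells that follow):

# Full model: both optional feature frames supplied
target_scores = predict_target(design_df=design_df,
                               aa_subseq_df=aa_subseq_df,
                               domain_feature_df=domain_feature_df,
                               conservation_feature_df=conservation_feature_df)

# Lite model: omit either optional frame and the lite model is used instead
lite_scores = predict_target(design_df=design_df, aa_subseq_df=aa_subseq_df)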
179 | 180 |
181 | {% endraw %} 182 | 183 | {% raw %} 184 | 185 |
186 |
187 | 188 |
189 |
190 |
design_df = pd.read_table('test_data/sgrna-designs.txt')
191 | design_targ_df = targetfeat.add_target_columns(design_df)
192 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
193 | 
194 | 195 |
196 |
197 |
198 | 199 |
200 | {% endraw %} 201 | 202 | {% raw %} 203 | 204 |
205 |
206 | 207 |
208 |
209 |
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
210 | aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
211 |                                            id_cols=id_cols)
212 | aa_subseq_df
213 | 
214 | 215 |
216 |
217 |
218 | 219 |
220 |
221 | 222 |
223 | 224 |
225 |
Getting amino acid sequences
226 | 
227 |
228 |
229 | 230 |
231 | 232 |
233 |
100%|██████████| 4/4 [00:04<00:00,  1.04s/it]
234 | 
235 |
236 |
237 | 238 |
239 | 240 | 241 |
242 |
243 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 |
Target TranscriptTarget Total LengthTranscript BasedescmoleculeseqidversionAA lensgRNA Context SequenceAA IndexTarget Cut LengthOrientationextended_seqAA 0-IndexedAA 0-Indexed paddedseq_startseq_endAA Subsequence
0ENST00000259457.8834ENST00000259457NoneproteinMAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...ENSP000002594573277TGGAGCAGATACAAGAGCAACTGAAGGGAT64191sense-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...63806496GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1ENST00000259457.8834ENST00000259457NoneproteinMAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...ENSP000002594573277CCGGAAAACTGGCACGACCATCGCTGGGGT46137sense-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...45624678AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2ENST00000394249.81863ENST00000394249NoneproteinMRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...ENSP000003777933620TAGAAAAAGATTTGCGCACCCAAGTGGAAT106316sense-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...105122106138EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3ENST00000394249.81863ENST00000394249NoneproteinMRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...ENSP000003777933620TGGCCTTTGACCCAGACATAATGGTGGCCA263787antisense-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...262279263295WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4ENST00000361337.32298ENST00000361337NoneproteinMSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...ENSP000003545222765AAATACTCACTCATCCTCATCTCGAGGTCT140420antisense-----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK...139156140172GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
............................................................
395ENST00000454402.71023ENST00000454402NoneproteinMETSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...ENSP000004082952340TGTCTTTATATAGCTGTTTCGCACAGGCTA74220antisense-----------------METSALKQQEQPAATKIRNLPWVEKYRPQ...739074106LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
396ENST00000254998.3423ENST00000254998NoneproteinMASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...ENSP000002549982140TTGTCAATGTCTACTACACCACCATGGATA2779sense-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...26432759DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
397ENST00000254998.3423ENST00000254998NoneproteinMASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...ENSP000002549982140GGCGTTTGCTGTCCCGCCTGTACATGGGCA39115sense-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...38553971VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
398ENST00000381685.102067ENST00000381685NoneproteinMQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...ENSP000003711015688ACTAGCAATGGCTTATCAGATCGAAGGTCA259776antisense-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...258275259291TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
399ENST00000381685.102067ENST00000381685NoneproteinMQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...ENSP000003711015688AAATTTTGTCTGATGACTACTCAAAGGTAT108322sense-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...107124108140CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ
526 |

400 rows × 19 columns

527 |
528 |
529 | 530 |
531 | 532 |
533 |
534 | 535 |
536 | {% endraw %} 537 | 538 | {% raw %} 539 | 540 |
541 |
542 | 543 |
544 |
545 |
domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
546 | domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,
547 |                                                            id_cols=id_cols)
548 | 
549 | 550 |
551 |
552 |
553 | 554 |
555 |
556 | 557 |
558 | 559 |
560 |
Getting protein domains
561 | 
562 |
563 |
564 | 565 |
566 | 567 |
568 |
100%|██████████| 200/200 [00:48<00:00,  4.12it/s]
569 | 
570 |
571 |
572 | 573 |
574 |
575 | 576 |
577 | {% endraw %} 578 | 579 | {% raw %} 580 | 581 |
582 |
583 | 584 |
585 |
586 |
conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)
587 | conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,
588 |                                                              small_width=2, large_width=16,
589 |                                                              conservation_column='ranked_conservation',
590 |                                                              id_cols=id_cols)
591 | conservation_feature_df
592 | 
593 | 594 |
595 |
596 |
597 | 598 |
599 |
600 | 601 |
602 | 603 |
604 |
Getting conservation
605 | 
606 |
607 |
608 | 609 |
610 | 611 |
612 |
100%|██████████| 200/200 [03:53<00:00,  1.17s/it]
613 | 
614 |
615 |
616 | 617 |
618 | 619 | 620 |
621 |
622 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 |
sgRNA Context SequenceTarget Cut LengthTarget TranscriptOrientationcons_4cons_32
0AAAAGAATGATGAAAAGACACCACAGGGAG244ENST00000610426.5sense0.2182310.408844
1AAAAGAGCCATGAATCTAAACATCAGGAAT640ENST00000223073.6sense0.1298250.278180
2AAAAGCGCCAAATGGCCCGAGAATTGGGAG709ENST00000331923.9sense0.4709060.532305
3AAACAGAAAAAGTTAAAATCACCAAGGTGT496ENST00000283882.4sense0.5805560.602708
4AAACAGATGGAAGATGCTTACCGGGGGACC132ENST00000393047.8sense0.2834470.414293
.....................
395TTTGATTGCATTAAGGTTGGACTCTGGATT246ENST00000249269.9sense0.5806120.618707
396TTTGCCCACAGCTCCAAAGCATCGCGGAGA130ENST00000227618.8sense0.3237700.416368
397TTTTACAGTGCGATGTATGATGTATGGCTT119ENST00000338366.6sense0.7880000.537417
398TTTTGGATCTCGTAGTGATTCAAGAGGGAA233ENST00000629496.3sense0.2396300.347615
399TTTTTGTTACTACAGGTTCGCTGCTGGGAA201ENST00000395840.6sense0.6937670.639044
749 |

400 rows × 6 columns

750 |
751 |
752 | 753 |
754 | 755 |
756 |
757 | 758 |
759 | {% endraw %} 760 | 761 | {% raw %} 762 | 763 |
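Fetching amino acid sequences, protein domains, and conservation scores dominates the runtime of this walkthrough. A hedged sketch of caching the three tables to parquet so that rs3.predict.predict can read them back through its aa_seq_file, domain_file, and conservatin_file arguments; the paths mirror the files under test_data/target_data in this repository, and the frames are assumed to carry the Transcript Base column that predict filters on:

aa_seq_df.to_parquet('test_data/target_data/aa_seqs.pq', engine='pyarrow')
domain_df.to_parquet('test_data/target_data/protein_domains.pq', engine='pyarrow')
conservation_df.to_parquet('test_data/target_data/conservation.pq', engine='pyarrow')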
764 |
765 | 766 |
767 |
768 |
predictions = predict_target(design_df=design_df,
769 |                              aa_subseq_df=aa_subseq_df,
770 |                              domain_feature_df=domain_feature_df,
771 |                              conservation_feature_df=conservation_feature_df)
772 | design_df['Target Score'] = predictions
773 | 
774 | 775 |
776 |
777 |
778 | 779 |
780 |
781 | 782 |
783 | 784 |
785 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
786 |   warnings.warn(
787 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
788 |   warnings.warn(
789 | 
790 |
791 |
792 | 793 |
794 |
795 | 796 |
797 | {% endraw %} 798 | 799 | {% raw %} 800 | 801 |
802 |
803 | 804 |
805 |
806 |
lite_predictions = predict_target(design_df=design_df,
807 |                                   aa_subseq_df=aa_subseq_df)
808 | design_df['Target Score Lite'] = lite_predictions
809 | 
810 | 811 |
812 |
813 |
814 | 815 |
816 |
817 | 818 |
819 | 820 |
821 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
822 |   warnings.warn(
823 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
824 |   warnings.warn(
825 | 
826 |
827 |
828 | 829 |
830 |
831 | 832 |
833 | {% endraw %} 834 | 835 | {% raw %} 836 | 837 |
838 |
839 | 840 |
841 |
842 |
design_df['sgRNA Context Sequence']
843 | 
844 | 845 |
846 |
847 |
848 | 849 |
850 |
851 | 852 |
853 | 854 | 855 | 856 |
857 |
0      TGGAGCAGATACAAGAGCAACTGAAGGGAT
858 | 1      CCGGAAAACTGGCACGACCATCGCTGGGGT
859 | 2      TAGAAAAAGATTTGCGCACCCAAGTGGAAT
860 | 3      TGGCCTTTGACCCAGACATAATGGTGGCCA
861 | 4      AAATACTCACTCATCCTCATCTCGAGGTCT
862 |                     ...              
863 | 395    TGTCTTTATATAGCTGTTTCGCACAGGCTA
864 | 396    TTGTCAATGTCTACTACACCACCATGGATA
865 | 397    GGCGTTTGCTGTCCCGCCTGTACATGGGCA
866 | 398    ACTAGCAATGGCTTATCAGATCGAAGGTCA
867 | 399    AAATTTTGTCTGATGACTACTCAAAGGTAT
868 | Name: sgRNA Context Sequence, Length: 400, dtype: object
869 |
870 | 871 |
872 | 873 |
874 |
875 | 876 |
877 | {% endraw %} 878 | 879 | {% raw %} 880 | 881 |
882 |
883 | 884 |
885 |
886 |
assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7
887 | 
888 | 889 |
890 |
891 |
892 | 893 |
894 | {% endraw %} 895 | 896 | {% raw %} 897 | 898 |
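The assertion above only enforces a lower bound. To see the actual agreement between the full and lite target scores (using the scipy.stats import from the top of the page):

r, p_value = stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])
print(f'Pearson r between full and lite target scores: {r:.3f}')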
899 |
900 | 901 |
902 |
903 |
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
904 | gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')
905 | 
906 | sanger_designs = sanger_df.merge(design_df, how='inner',
907 |                                  on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
908 |                                      'Target Cut %'])
909 | gecko_designs = gecko_df.merge(design_df, how='inner',
910 |                                 on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
911 |                                     'Target Cut %'])
912 | assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],
913 |                       sanger_designs['Target Score'])[0] > 0.2
914 | assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],
915 |                       gecko_designs['Target Score'])[0] > 0.05
916 | 
917 | 918 |
919 |
920 |
921 | 922 |
923 | {% endraw %} 924 | 925 | {% raw %} 926 | 927 |
928 |
929 | 930 |
931 |
932 |
rs_dev_target_lite_predictions = (pd.read_csv('test_data/target_lite_score_export.csv')
933 |                                   .rename({'Target Lite Score': 'Target Score Lite'}, axis=1))
934 | rs_dev_target_predictions = pd.read_csv('test_data/target_score_export.csv')
935 | merged_rs_dev_predictions = rs_dev_target_lite_predictions.merge(rs_dev_target_predictions,
936 |                                                                  how='inner')
937 | merged_rs_dev_rs3_predictions = (design_df
938 |                                  .merge(merged_rs_dev_predictions,
939 |                                         how='inner',
940 |                                         on=['sgRNA Context Sequence', 'Target Cut Length',
941 |                                             'Target Transcript', 'Orientation'],
942 |                                         suffixes=[' rs3', ' rs_dev']))
943 | assert np.allclose(merged_rs_dev_rs3_predictions['Target Score rs3'], merged_rs_dev_rs3_predictions['Target Score rs_dev'])
944 | assert np.allclose(merged_rs_dev_rs3_predictions['Target Score Lite rs3'], merged_rs_dev_rs3_predictions['Target Score Lite rs_dev'])
945 | 
946 | 947 |
948 |
949 |
950 | 951 |
952 | {% endraw %} 953 | 954 |
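Finally, the step-by-step pipeline on this page can be collapsed into one call with the predict function from the rs3.predict module (documented on the predict page). A sketch under the same inputs; the argument names follow that function's signature in this repository:

from rs3.predict import predict

scored_designs = predict(design_df,
                         tracr=['Hsu2013', 'Chen2013'],  # one sequence score per tracrRNA
                         target=True,                    # also compute target scores
                         lite=False,                     # use the full target model
                         n_jobs_min=1, n_jobs_max=max_n_jobs)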
955 | 956 | 957 | -------------------------------------------------------------------------------- /docs/sidebar.json: -------------------------------------------------------------------------------- 1 | { 2 | "rs3": { 3 | "Overview": "/", 4 | "seq": "seq.html", 5 | "targetdata": "targetdata.html", 6 | "targetfeat": "targetfeat.html", 7 | "predicttarg": "predicttarg.html", 8 | "predict": "predict.html" 9 | } 10 | } -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | --- 2 | layout: none 3 | search: exclude 4 | --- 5 | 6 | 7 | 8 | {% for post in site.posts %} 9 | {% unless post.search == "exclude" %} 10 | 11 | {{site.url}}{{post.url}} 12 | 13 | {% endunless %} 14 | {% endfor %} 15 | 16 | 17 | {% for page in site.pages %} 18 | {% unless page.search == "exclude" %} 19 | 20 | {{site.url}}{{ page.url}} 21 | 22 | {% endunless %} 23 | {% endfor %} 24 | -------------------------------------------------------------------------------- /rs3/RuleSet3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/RuleSet3.pkl -------------------------------------------------------------------------------- /rs3/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.16" 2 | -------------------------------------------------------------------------------- /rs3/_nbdev.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT! 2 | 3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"] 4 | 5 | index = {"load_seq_model": "00_seq.ipynb", 6 | "featurize_context": "00_seq.ipynb", 7 | "predict_seq": "00_seq.ipynb", 8 | "ensembl_post": "01_targetdata.ipynb", 9 | "chunks": "01_targetdata.ipynb", 10 | "post_transcript_sequence_chunk": "01_targetdata.ipynb", 11 | "post_transcript_sequence": "01_targetdata.ipynb", 12 | "build_transcript_aa_seq_df": "01_targetdata.ipynb", 13 | "ensembl_get": "01_targetdata.ipynb", 14 | "get_translation_overlap": "01_targetdata.ipynb", 15 | "build_translation_overlap_df": "01_targetdata.ipynb", 16 | "write_transcript_data": "01_targetdata.ipynb", 17 | "get_transcript_info": "01_targetdata.ipynb", 18 | "get_conservation": "01_targetdata.ipynb", 19 | "get_exon_conservation": "01_targetdata.ipynb", 20 | "get_transcript_conservation": "01_targetdata.ipynb", 21 | "get_transcript_conservation_safe": "01_targetdata.ipynb", 22 | "build_conservation_df": "01_targetdata.ipynb", 23 | "write_conservation_data": "01_targetdata.ipynb", 24 | "add_target_columns": "02_targetfeat.ipynb", 25 | "get_position_features": "02_targetfeat.ipynb", 26 | "get_one_aa_frac": "02_targetfeat.ipynb", 27 | "get_aa_aromaticity": "02_targetfeat.ipynb", 28 | "get_aa_hydrophobicity": "02_targetfeat.ipynb", 29 | "get_aa_ip": "02_targetfeat.ipynb", 30 | "get_aa_secondary_structure": "02_targetfeat.ipynb", 31 | "featurize_aa_seqs": "02_targetfeat.ipynb", 32 | "extract_amino_acid_subsequence": "02_targetfeat.ipynb", 33 | "get_aa_subseq_df": "02_targetfeat.ipynb", 34 | "get_amino_acid_features": "02_targetfeat.ipynb", 35 | "get_protein_domain_features": "02_targetfeat.ipynb", 36 | "get_conservation_ranges": "02_targetfeat.ipynb", 37 | "get_conservation_features": "02_targetfeat.ipynb", 38 | "merge_feature_dfs": "02_targetfeat.ipynb", 39 | 
"load_target_model": "03_predicttarg.ipynb", 40 | "predict_target": "03_predicttarg.ipynb", 41 | "predict_seq_tracr": "04_predict.ipynb", 42 | "combine_target_seq_scores": "04_predict.ipynb", 43 | "predict": "04_predict.ipynb"} 44 | 45 | modules = ["seq.py", 46 | "targetdata.py", 47 | "targetfeat.py", 48 | "predicttarg.py", 49 | "predict.py"] 50 | 51 | doc_url = "https://gpp-rnd.github.io/rs3/" 52 | 53 | git_url = "https://github.com/gpp-rnd/rs3/tree/master/" 54 | 55 | def custom_doc_links(name): return None 56 | -------------------------------------------------------------------------------- /rs3/predict.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 04_predict.ipynb (unless otherwise specified). 2 | 3 | __all__ = ['predict_seq_tracr', 'combine_target_seq_scores', 'predict'] 4 | 5 | # Cell 6 | import pandas as pd 7 | import warnings 8 | 9 | from .seq import predict_seq 10 | from .targetdata import (build_translation_overlap_df, 11 | build_transcript_aa_seq_df, 12 | build_conservation_df) 13 | from .targetfeat import (add_target_columns, 14 | get_aa_subseq_df, 15 | get_protein_domain_features, 16 | get_conservation_features) 17 | from .predicttarg import predict_target 18 | 19 | # Cell 20 | from pandas.api.types import is_list_like 21 | 22 | def predict_seq_tracr(design_df, tracr, context_col, ref_tracrs, n_jobs): 23 | if not tracr in ref_tracrs: 24 | raise ValueError('tracrRNA must be one of ' + ','.join(ref_tracrs)) 25 | design_df['RS3 Sequence Score (' + tracr + ' tracr)'] = predict_seq(design_df[context_col], sequence_tracr=tracr, 26 | n_jobs=n_jobs) 27 | 28 | def combine_target_seq_scores(design_df, tracr, target_score_col, lite): 29 | full_rs_name = 'RS3 Sequence (' + tracr + ' tracr) + Target Score' 30 | if lite: 31 | full_rs_name += 'Lite' 32 | design_df[full_rs_name] = \ 33 | design_df['RS3 Sequence Score (' + tracr + ' tracr)'] + \ 34 | design_df[target_score_col] 35 | 36 | def predict(design_df, tracr=None, target=False, 37 | aa_seq_file=None, domain_file=None, 38 | conservatin_file=None, 39 | id_cols=None, 40 | context_col='sgRNA Context Sequence', 41 | transcript_id_col='Target Transcript', 42 | transcript_base_col='Transcript Base', 43 | transcript_len_col='Target Total Length', 44 | n_jobs_min=1, n_jobs_max=1, lite=True): 45 | """Make predictions using RS3 46 | 47 | :param design_df: DataFrame 48 | :param tracr: str or list 49 | :param target: bool, whether to include target scores 50 | :param aa_seq_file: str, path to precomputed amino acid sequences 51 | :param domain_file: str, path to precomputed domain file 52 | :param id_cols: list or None 53 | :param context_col: str 54 | :param transcript_id_col: str 55 | :param transcript_base_col: str 56 | :param transcript_len_col: str 57 | :param n_jobs_min: int 58 | :return: DataFram 59 | """ 60 | if id_cols is None: 61 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'] 62 | out_df = design_df.copy() 63 | ref_tracrs = ['Hsu2013', 'Chen2013'] 64 | if type(tracr) is str: 65 | predict_seq_tracr(out_df, tracr, context_col, ref_tracrs, n_jobs=n_jobs_max) 66 | elif is_list_like(tracr): 67 | for t in tracr: 68 | predict_seq_tracr(out_df, t, context_col, ref_tracrs, n_jobs=n_jobs_max) 69 | else: 70 | raise ValueError('Could not recognize tracr input: ' + str(tracr)) 71 | if target: 72 | out_df = add_target_columns(out_df, 73 | transcript_base_col=transcript_base_col) 74 | transcript_bases = 
pd.Series(out_df[transcript_base_col].unique()) 75 | if aa_seq_file is None: 76 | aa_seq_df = build_transcript_aa_seq_df(out_df, 77 | transcript_id_col=transcript_id_col, 78 | transcript_len_col=transcript_len_col, 79 | n_jobs=n_jobs_min) 80 | else: 81 | aa_seq_df = pd.read_parquet(aa_seq_file, engine='pyarrow', 82 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 83 | missing_transcripts_aa = transcript_bases[~transcript_bases.isin(aa_seq_df[transcript_base_col])] 84 | if len(missing_transcripts_aa) > 0: 85 | warnings.warn('Missing amino acid sequences for transcripts: ' + 86 | ','.join(missing_transcripts_aa)) 87 | out_df['Missing translation information'] = out_df[transcript_base_col].isin(missing_transcripts_aa) 88 | aa_subseq_df = get_aa_subseq_df(sg_designs=out_df, aa_seq_df=aa_seq_df, width=16, 89 | id_cols=id_cols) 90 | if lite: 91 | target_score_col = 'Target Score Lite' 92 | out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df, 93 | id_cols=id_cols) 94 | else: 95 | if domain_file is None: 96 | domain_df = build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=n_jobs_min) 97 | else: 98 | domain_df = pd.read_parquet(domain_file, engine='pyarrow', 99 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 100 | # No warning for domain, since some transcripts aren't annotated with any domains 101 | domain_feature_df = get_protein_domain_features(out_df, domain_df, 102 | id_cols=id_cols, transcript_base_col=transcript_base_col) 103 | if conservatin_file is None: 104 | conservation_df = build_conservation_df(out_df, n_jobs=n_jobs_max) 105 | else: 106 | conservation_df = pd.read_parquet(conservatin_file, engine='pyarrow', 107 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 108 | missing_transcripts_cons = transcript_bases[~transcript_bases.isin(conservation_df[transcript_base_col])] 109 | if len(missing_transcripts_cons) > 0: 110 | warnings.warn('Missing conservation scores for transcripts: ' + 111 | ','.join(missing_transcripts_cons)) 112 | out_df['Missing conservation information'] = out_df[transcript_base_col].isin(missing_transcripts_cons) 113 | conservation_feature_df = get_conservation_features(out_df, conservation_df, 114 | small_width=2, large_width=16, 115 | conservation_column='ranked_conservation', 116 | id_cols=id_cols) 117 | target_score_col = 'Target Score' 118 | out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df, 119 | domain_feature_df=domain_feature_df, 120 | conservation_feature_df=conservation_feature_df, 121 | id_cols=id_cols) 122 | if type(tracr) is str: 123 | combine_target_seq_scores(out_df, tracr, target_score_col, lite) 124 | else: # list 125 | for t in tracr: 126 | combine_target_seq_scores(out_df, t, target_score_col, lite) 127 | return out_df -------------------------------------------------------------------------------- /rs3/predicttarg.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 03_predicttarg.ipynb (unless otherwise specified). 
2 | 
3 | __all__ = ['load_target_model', 'predict_target']
4 | 
5 | # Cell
6 | from rs3 import targetfeat
7 | import joblib
8 | import os
9 | 
10 | # Cell
11 | def load_target_model(lite=False):
12 |     """Load rule set 3 target model"""
13 |     if lite:
14 |         model_name = 'target_lite_model.pkl'
15 |     else:
16 |         model_name = 'target_model.pkl'
17 |     model = joblib.load(os.path.join(os.path.dirname(__file__), model_name))
18 |     return model
19 | 
20 | # Cell
21 | def predict_target(design_df, aa_subseq_df, domain_feature_df=None,
22 |                    conservation_feature_df=None, id_cols=None):
23 |     """Make predictions using the Rule Set 3 target model. Note that if domain_feature_df
24 |     or conservation_feature_df is not supplied, then the lite model is used; otherwise the full model is used.
25 | 
26 |     :param design_df: DataFrame
27 |     :param aa_subseq_df: DataFrame
28 |     :param domain_feature_df: DataFrame or None
29 |     :param conservation_feature_df: DataFrame or None
30 |     :param id_cols: list or str
31 |     :return: array-like, predictions
32 |     """
33 |     if (domain_feature_df is None) or (conservation_feature_df is None):
34 |         lite = True
35 |         domain_feature_df = None
36 |         conservation_feature_df = None
37 |     else:
38 |         lite = False
39 |     model = load_target_model(lite=lite)
40 |     if id_cols is None:
41 |         id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
42 |     target_feature_df, target_feature_cols = targetfeat.merge_feature_dfs(design_df,
43 |                                                                           aa_subseq_df=aa_subseq_df,
44 |                                                                           domain_df=domain_feature_df,
45 |                                                                           conservation_df=conservation_feature_df,
46 |                                                                           id_cols=id_cols)
47 |     X_target = target_feature_df[target_feature_cols]
48 |     predictions = model.predict(X_target)
49 |     return predictions
--------------------------------------------------------------------------------
/rs3/seq.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 00_seq.ipynb (unless otherwise specified).
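# Example (illustrative sketch, not part of the package file; the two 30-nt context
# sequences are made-up examples laid out as 4 nt 5' context, 20 nt protospacer,
# 3 nt PAM, 3 nt 3' context):
#
# from rs3.seq import predict_seq
#
# context_seqs = ['GACCTGCGGTTGCGCAGGCACAGAAGGCAT',
#                 'AGAAAACACTAGCATCCCCACCCGCGGACT']
# scores = predict_seq(context_seqs, sequence_tracr='Hsu2013', n_jobs=2)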
2 | 3 | __all__ = ['load_seq_model', 'featurize_context', 'predict_seq'] 4 | 5 | # Cell 6 | import joblib 7 | import sglearn 8 | import pandas as pd 9 | import os 10 | 11 | # Cell 12 | def load_seq_model(): 13 | """Load rule set 3 sequence model""" 14 | model = joblib.load(os.path.join(os.path.dirname(__file__), 'RuleSet3.pkl')) 15 | return model 16 | 17 | # Cell 18 | def featurize_context(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, 19 | n_jobs=1): 20 | """Featurize context sequences 21 | 22 | :param context_sequences: list-like 23 | :param sequence_tracr: list-like or str 24 | :return: DataFrame, feature matrix 25 | """ 26 | if ref_tracrs is None: 27 | ref_tracrs = ['Hsu2013', 'Chen2013'] 28 | context_series = pd.Series(context_sequences) 29 | if not (context_series.str.len() == 30).all(): 30 | raise ValueError('All context sequences must be 30 nucleotides') 31 | featurized_sgrnas = sglearn.featurize_guides(context_sequences, 32 | n_jobs=n_jobs) 33 | for tracr in ref_tracrs: 34 | if type(sequence_tracr) is str: 35 | featurized_sgrnas[tracr + ' tracr'] = int(sequence_tracr == tracr) 36 | else: # list-like 37 | featurized_sgrnas[tracr + ' tracr'] = ((pd.Series(sequence_tracr) == tracr) 38 | .astype(int) 39 | .to_list()) 40 | return featurized_sgrnas 41 | 42 | # Cell 43 | def predict_seq(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, n_jobs=1): 44 | """Predict the activity of context sequence for SpCas9 Knockout using sequence information only 45 | 46 | :param context_sequences: list of str 47 | :return: list of float, predictions 48 | """ 49 | model = load_seq_model() 50 | print('Calculating sequence-based features') 51 | featurized_sgrnas = featurize_context(context_sequences, sequence_tracr=sequence_tracr, ref_tracrs=ref_tracrs, 52 | n_jobs=n_jobs) 53 | seq_predictions = model.predict(featurized_sgrnas) 54 | return seq_predictions -------------------------------------------------------------------------------- /rs3/target_lite_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_lite_model.pkl -------------------------------------------------------------------------------- /rs3/target_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_model.pkl -------------------------------------------------------------------------------- /rs3/targetdata.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 01_targetdata.ipynb (unless otherwise specified). 
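# Example (illustrative sketch, not part of the package file: precompute target data
# once so that later predict() calls can read parquet files instead of re-querying
# Ensembl and UCSC; the design file is this repo's test data and is assumed to be
# tab-separated):
#
# import pandas as pd
# from rs3.targetdata import write_transcript_data, write_conservation_data
#
# design_df = pd.read_table('test_data/sgrna-designs.txt')
# write_transcript_data(design_df, n_jobs=2, filepath='./data/target_data/')
# write_conservation_data(design_df, n_jobs=2, filepath='./data/target_data/')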
2 | 3 | __all__ = ['ensembl_post', 'chunks', 'post_transcript_sequence_chunk', 'post_transcript_sequence', 4 | 'build_transcript_aa_seq_df', 'ensembl_get', 'get_translation_overlap', 'build_translation_overlap_df', 5 | 'write_transcript_data', 'get_transcript_info', 'get_conservation', 'get_exon_conservation', 6 | 'get_transcript_conservation', 'get_transcript_conservation_safe', 'build_conservation_df', 7 | 'write_conservation_data'] 8 | 9 | # Cell 10 | import requests 11 | import json 12 | import pandas as pd 13 | from joblib import Parallel, delayed 14 | from tqdm import tqdm 15 | import warnings 16 | import os 17 | from scipy import stats 18 | import multiprocessing 19 | 20 | # Cell 21 | def ensembl_post(ext, data, headers=None, params=None): 22 | """Generic wrapper for using POST requests to the ensembl rest API 23 | 24 | :param ext: str, url extension 25 | :param data: dict, query data 26 | :param headers: dict or None, meta-information for query 27 | :param params: dict or None, parameters for query 28 | :return: Response object 29 | """ 30 | if params is None: 31 | params = {} 32 | if headers is None: 33 | headers = {} 34 | data = json.dumps(data) 35 | r = requests.post("https://rest.ensembl.org"+ext, headers=headers, data=data, params=params) 36 | if not r.ok: 37 | r.raise_for_status() 38 | else: 39 | return r 40 | 41 | # Cell 42 | def chunks(lst, n): 43 | """Yield successive n-sized chunks from lst. 44 | 45 | lst: list 46 | n: int 47 | 48 | returns: generator of list chunks 49 | """ 50 | for i in range(0, len(lst), n): 51 | yield lst[i:i + n] 52 | 53 | def post_transcript_sequence_chunk(ids, params, headers): 54 | """Helper function for post_transcript_sequence 55 | 56 | :param ids: list 57 | :param params: dict 58 | :param headers: dict 59 | :return: dict 60 | """ 61 | data = {'ids': ids} 62 | r = ensembl_post("/sequence/id/", data=data, params=params, 63 | headers=headers) 64 | seq = r.json() 65 | return seq 66 | 67 | def post_transcript_sequence(ensembl_ids, seq_type='protein', max_queries=50, 68 | n_jobs=1, **kwargs): 69 | """Request multiple types of sequence by stable identifier. Supports feature masking and expand options. 
70 |     Uses https://rest.ensembl.org/documentation/info/sequence_id_post
71 | 
72 |     :param ensembl_ids: list of str
73 |     :param seq_type: str, one of [genomic, cds, cdna, protein]
74 |     :param max_queries: int, maximum number of queries for post
75 |     :param n_jobs: int, number of jobs to run in parallel
76 |     :param kwargs: additional parameter arguments
77 |     :return: list, dict of sequences 5' to 3' in the same orientation as the input transcript
78 |     """
79 |     headers = {"content-type": "application/json", "accept": "application/json"}
80 |     params = {'type': seq_type, **kwargs}
81 |     id_chunks = list(chunks(ensembl_ids, max_queries))
82 |     seqs = Parallel(n_jobs=n_jobs)(delayed(post_transcript_sequence_chunk)
83 |                                    (ids, params, headers) for ids in tqdm(id_chunks))
84 |     # flatten list
85 |     seqs = [item for sublist in seqs for item in sublist]
86 |     return seqs
87 | 
88 | # Cell
89 | def build_transcript_aa_seq_df(design_df, transcript_id_col='Target Transcript',
90 |                                transcript_len_col='Target Total Length', n_jobs=1):
91 |     """Get amino acid sequence for transcripts of interest
92 | 
93 |     :param design_df: DataFrame
94 |     :param transcript_id_col: str, column with ensembl transcript id
95 |     :param transcript_len_col: str, column with length of transcript
96 |     :param n_jobs: int, number of jobs to use to query transcripts
97 |     :return: DataFrame
98 |     """
99 |     unique_transcripts = design_df[[transcript_id_col, transcript_len_col]].drop_duplicates()
100 |     unique_transcripts['Transcript Base'] = unique_transcripts[transcript_id_col].str.split('.', expand=True)[0]
101 |     print("Getting amino acid sequences")
102 |     aa_seqs = post_transcript_sequence(unique_transcripts['Transcript Base'].to_list(),
103 |                                        n_jobs=n_jobs)
104 |     aa_seq_df = (pd.DataFrame(aa_seqs)
105 |                  .rename({'query': 'Transcript Base'}, axis=1))
106 |     missing_seqs = (unique_transcripts['Transcript Base'][~unique_transcripts['Transcript Base'].isin(
107 |         aa_seq_df['Transcript Base']
108 |     )])
109 |     if len(missing_seqs) > 0:
110 |         warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
111 |     aa_seq_len_df = (unique_transcripts.merge(aa_seq_df, on='Transcript Base'))
112 |     aa_seq_len_df['AA len'] = aa_seq_len_df['seq'].str.len()
113 |     filtered_aa_seq_len_df = (aa_seq_len_df[aa_seq_len_df[transcript_len_col] ==
114 |                                             (aa_seq_len_df['AA len'] + 1)*3]
115 |                               .reset_index(drop=True))
116 |     filtered_seqs = (aa_seq_len_df['Transcript Base'][~aa_seq_len_df['Transcript Base'].isin(
117 |         filtered_aa_seq_len_df['Transcript Base']
118 |     )])
119 |     if len(filtered_seqs) > 0:
120 |         warnings.warn('Filtered transcripts where the transcript length and amino acid ' +
121 |                       'sequence length did not agree: ' + ', '.join(filtered_seqs))
122 |     return filtered_aa_seq_len_df
123 | 
124 | # Cell
125 | def ensembl_get(ext, query=None, headers=None, params=None):
126 |     """Generic wrapper for using GET requests to the ensembl rest API
127 | 
128 |     ext: str, url extension
129 |     query: str or None, end of url extension specifying species, taxon, ensembl_id etc
130 |     headers: dict or None, meta-information for query
131 |     params: dict or None, parameters for query
132 | 
133 |     returns: Response object
134 |     """
135 |     if query is None:
136 |         query = ''
137 |     if params is None:
138 |         params = {}
139 |     if headers is None:
140 |         headers = {}
141 |     r = requests.get("https://rest.ensembl.org"+ext+query, params=params, headers=headers)
142 |     if not r.ok:
143 |         r.raise_for_status()
144 |     else:
145 |         return r
146 | 
147 | def get_translation_overlap(ensembl_id):
148 |     """Get features that overlap with translation, such as protein domains
149 | 
150 |     :param ensembl_id: str
151 |     :return: list of dict, decoded overlap features
152 |     """
153 |     headers = {'content-type': 'application/json'}
154 |     ext = '/overlap/translation/' + ensembl_id
155 |     r = ensembl_get(ext, headers=headers)
156 |     decoded = r.json()
157 |     return decoded
158 | 
159 | # Cell
160 | def build_translation_overlap_df(protein_ids, n_jobs=1):
161 |     """Get protein domain information
162 | 
163 |     :param protein_ids: list of str, Ensembl protein IDs
164 |     :param n_jobs: int
165 |     :return: DataFrame
166 |     """
167 |     print('Getting protein domains')
168 |     translation_overlap_list = Parallel(n_jobs=n_jobs)(delayed(get_translation_overlap)
169 |                                                        (id) for id in tqdm(protein_ids))
170 |     # flatten list
171 |     translation_overlap_list = [item for sublist in translation_overlap_list for item in sublist]
172 |     translation_overlap_df = pd.DataFrame(translation_overlap_list).rename({'Parent': 'Transcript Base'}, axis=1)
173 |     return translation_overlap_df
174 | 
175 | # Cell
176 | def write_transcript_data(design_df, transcript_id_col='Target Transcript',
177 |                           transcript_len_col='Target Total Length', n_jobs=1,
178 |                           overwrite=True, filepath='./data/target_data/',
179 |                           aa_seq_name='aa_seqs.pq',
180 |                           protein_domain_name='protein_domains.pq'):
181 |     """Write amino acid sequences and protein domain information to parquet files
182 | 
183 |     :param design_df: DataFrame
184 |     :param transcript_id_col: str
185 |     :param transcript_len_col: str
186 |     :param n_jobs: int
187 |     :param overwrite: bool, whether to overwrite existing file
188 |     :param filepath: str, directory for output sequences
189 |     :param aa_seq_name: str, name of amino acid sequence file
190 |     :param protein_domain_name: str, name of protein domain file
191 |     """
192 |     if (os.path.isfile(filepath + aa_seq_name) or os.path.isfile(filepath + protein_domain_name)) and (not overwrite):
193 |         raise ValueError('Transcript data already exists and cannot be overwritten')
194 |     else:
195 |         transcript_aa_seq_df = build_transcript_aa_seq_df(design_df, transcript_id_col=transcript_id_col,
196 |                                                           transcript_len_col=transcript_len_col,
197 |                                                           n_jobs=n_jobs)
198 |         translation_overlap_df = build_translation_overlap_df(transcript_aa_seq_df['id'],
199 |                                                               n_jobs=n_jobs)
200 |         if not os.path.isdir(filepath):
201 |             print('Creating new directory ' + filepath)
202 |             os.makedirs(filepath)
203 |         transcript_aa_seq_df.to_parquet(path=filepath + aa_seq_name, engine='pyarrow',
204 |                                         index=False)
205 |         translation_overlap_df.to_parquet(path=filepath + protein_domain_name, engine='pyarrow',
206 |                                           index=False)
207 | 
208 | # Cell
209 | def get_transcript_info(base_transcript):
210 |     """Using an Ensembl transcript ID, get exon positions and translation information
211 | 
212 |     :param base_transcript: str
213 |     :return: (exon_df, trans_sr, chr)
214 |         exon_df: DataFrame, with global exon start and end position
215 |         trans_sr: Series, with global translation start and stop positions for CDS and translation length
216 |         chr: str
217 | 
218 |     """
219 |     r = ensembl_get("/lookup/id/" + base_transcript,
220 |                     headers={"Content-Type": "application/json"}, params={'expand': '1'})
221 |     decoded = r.json()
222 |     exon_df = pd.DataFrame(decoded['Exon'])
223 |     trans_sr = pd.Series(decoded['Translation'])
224 |     chr = decoded['seq_region_name']
225 |     return exon_df, trans_sr, chr
226 | 
227 | # Cell
228 | def get_conservation(chr, start, end, genome):
229 |     """Get conservation scores for a given region of a genome
230 | 
231 |     :param chr: str, chromosome name without the 'chr' prefix
232 |     :param start: int
233 |     :param end: int
234 |     :param genome: str
235 |     :return: DataFrame
236 |     """
237 |     api_url = 'http://api.genome.ucsc.edu/getData/track'
238 |     if genome == 'hg38':
239 |         track = 'phyloP100way'
240 |     elif genome == 'mm39':
241 |         track = 'phyloP35way'
242 |     else:
243 |         raise ValueError('Genome not recognized')
244 |     chrom = 'chr' + chr
245 |     params = {
246 |         'genome': genome,
247 |         'track': track,
248 |         'start': start,
249 |         'end': end,
250 |         'chrom': chrom
251 |     }
252 |     results = requests.get(api_url, params=params)  # query parameters belong in the URL for a GET request
253 |     if results.ok:
254 |         value_df = (pd.DataFrame([pd.Series(x) for x in pd.read_json(results.content.decode('utf8'))[chrom].values])
255 |                     .rename(columns={'value': 'conservation'}))
256 |     else:
257 |         raise ValueError(results.reason)
258 |     return value_df
259 | 
260 | # Cell
261 | def get_exon_conservation(exon_df, chr, genome):
262 |     """Get conservation scores for each exon
263 | 
264 |     :param exon_df: DataFrame
265 |     :param chr: str
266 |     :param genome: str
267 |     :return: DataFrame
268 |     """
269 |     conservation_dict = {}
270 |     for i, row in exon_df.set_index('id').iterrows():
271 |         # subtract one since the nucleotide conservation corresponds to the "end" index
272 |         conservation_dict[i] = get_conservation(chr, row['start'] - 1, row['end'], genome)
273 |     # combine the per-exon scores into a single dataframe, keyed by exon id
274 |     conservation_df = (pd.concat(conservation_dict)
275 |                        .reset_index(level=0)
276 |                        .reset_index(drop=True)
277 |                        .rename({'level_0': 'exon_id',
278 |                                 'end': 'genomic position'}, axis=1)
279 |                        .drop('start', axis=1))
280 |     return conservation_df
281 | 
282 | 
283 | def get_transcript_conservation(transcript_id, target_strand, genome):
284 |     """Get conservation scores for a transcript
285 | 
286 |     :param transcript_id: str
287 |     :param target_strand: str, '+' or '-'
288 |     :param genome: str
289 |     :return: DataFrame
290 |     """
291 |     exon_df, trans_sr, chr = get_transcript_info(transcript_id)
292 |     # only include translated positions
293 |     exon_df['start'] = exon_df['start'].apply(lambda x: max(x, trans_sr['start']))
294 |     exon_df['end'] = exon_df['end'].apply(lambda x: min(x, trans_sr['end']))
295 |     exon_df = exon_df[exon_df['end'] > exon_df['start']].reset_index(drop=True)
296 |     conservation_df = get_exon_conservation(exon_df, chr, genome)
297 |     conservation_df['Transcript Base'] = transcript_id
298 |     if target_strand == '-':
299 |         ascending = False
300 |     else:
301 |         ascending = True
302 |     conservation_df = (conservation_df
303 |                        .sort_values('genomic position', ascending=ascending)
304 |                        .reset_index(drop=True))
305 |     conservation_df['target position'] = conservation_df.index + 1
306 |     conservation_df['chromosome'] = chr
307 |     conservation_df['genome'] = genome
308 |     conservation_df['translation length'] = trans_sr['length']
309 |     return conservation_df
310 | 
311 | # Cell
312 | def get_transcript_conservation_safe(transcript_id, target_strand, genome):
313 |     """Helper function for parallel queries. Return None when the conservation dataframe cannot be assembled"""
314 |     try:
315 |         return get_transcript_conservation(transcript_id, target_strand, genome)
316 |     except Exception:
317 |         return None
318 | 
319 | 
320 | def build_conservation_df(design_df, n_jobs=1):
321 |     transcript_refseq_df = (design_df[['Target Transcript', 'Strand of Target', 'Target Total Length']]
322 |                             .drop_duplicates())
323 |     if not (transcript_refseq_df['Target Transcript'].str.startswith('ENST') |
324 |             transcript_refseq_df['Target Transcript'].str.startswith('ENSMUST')).all():
325 |         raise ValueError('Must supply human or mouse Ensembl transcript IDs as input')
326 |     print('Getting conservation')
327 |     transcript_refseq_df['Transcript Base'] = (transcript_refseq_df['Target Transcript'].str.split('.', expand=True)[0])
328 |     transcript_refseq_df['genome'] = transcript_refseq_df['Transcript Base'].apply(lambda trans:
329 |                                                                                    'mm39' if 'MUS' in trans else 'hg38')
330 |     all_transcript_conservation_list = Parallel(n_jobs)(delayed(get_transcript_conservation_safe)
331 |                                                         (row['Transcript Base'],
332 |                                                          row['Strand of Target'],
333 |                                                          row['genome'])
334 |                                                         for _, row in tqdm(transcript_refseq_df.iterrows(),
335 |                                                                            total=transcript_refseq_df.shape[0]))
336 |     transcript_conservation_list = []
337 |     failed_list = []
338 |     transcript_list = transcript_refseq_df['Transcript Base'].to_list()
339 |     for i, conservation_df in enumerate(all_transcript_conservation_list):
340 |         if conservation_df is None:
341 |             failed_list.append(transcript_list[i])
342 |         else:
343 |             transcript_conservation_list.append(conservation_df)
344 |     if len(failed_list) > 0:
345 |         warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
346 |                       ' transcripts: ' + ', '.join(failed_list))
347 |     transcript_conservation_df = (pd.concat(transcript_conservation_list))
348 |     transcript_cons_designs = (transcript_conservation_df
349 |                                .merge(transcript_refseq_df, how='inner',
350 |                                       on=['Transcript Base', 'genome']))
351 |     filtered_transcript_conservation = transcript_cons_designs[
352 |         (transcript_cons_designs['translation length'] + 1)*3 == transcript_cons_designs['Target Total Length']].copy()
353 |     mismatched_transcripts = transcript_conservation_df['Transcript Base'][
354 |         ~transcript_conservation_df['Transcript Base'].isin(filtered_transcript_conservation['Transcript Base'])].drop_duplicates()
355 |     if len(mismatched_transcripts) > 0:
356 |         warnings.warn('Filtered: ' + str(len(mismatched_transcripts)) +
357 |                       ' transcripts with mismatched length: ' + ', '.join(mismatched_transcripts))
358 |     filtered_transcript_conservation['ranked_conservation'] = (filtered_transcript_conservation.groupby('Transcript Base')
359 |                                                                ['conservation']
360 |                                                                .rank(pct=True))
361 |     return filtered_transcript_conservation
362 | 
363 | # Cell
364 | def write_conservation_data(design_df, n_jobs=1,
365 |                             overwrite=True, filepath='./data/target_data/',
366 |                             cons_file_name='conservation.pq'):
367 |     """Write conservation scores to parquet files
368 | 
369 |     :param design_df: DataFrame
370 |     :param n_jobs: int
371 |     :param overwrite: bool, whether to overwrite existing file
372 |     :param filepath: str, directory for output sequences
373 |     :param cons_file_name: str, name of conservation file
374 |     """
375 |     if os.path.isfile(filepath + cons_file_name) and (not overwrite):
376 |         raise ValueError('Conservation data already exists and cannot be overwritten')
377 |     else:
378 |         conservation_df = build_conservation_df(design_df, n_jobs=n_jobs)
379 |         if not os.path.isdir(filepath):
380 |             print('Creating new directory ' +
filepath) 381 | os.makedirs(filepath) 382 | conservation_df.to_parquet(path=filepath + cons_file_name, engine='pyarrow', 383 | index=False) -------------------------------------------------------------------------------- /rs3/targetfeat.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 02_targetfeat.ipynb (unless otherwise specified). 2 | 3 | __all__ = ['add_target_columns', 'get_position_features', 'get_one_aa_frac', 'get_aa_aromaticity', 4 | 'get_aa_hydrophobicity', 'get_aa_ip', 'get_aa_secondary_structure', 'featurize_aa_seqs', 5 | 'extract_amino_acid_subsequence', 'get_aa_subseq_df', 'get_amino_acid_features', 6 | 'get_protein_domain_features', 'get_conservation_ranges', 'get_conservation_features', 'merge_feature_dfs'] 7 | 8 | # Cell 9 | import pandas as pd 10 | from Bio.SeqUtils.ProtParam import ProteinAnalysis 11 | import warnings 12 | 13 | # Cell 14 | def add_target_columns(design_df, transcript_id_col='Target Transcript', 15 | cut_pos_col='Target Cut Length', 16 | transcript_base_col='Transcript Base'): 17 | """Add ['AA Index' and 'Transcript Base'] to design df 18 | 19 | :param design_df: DataFrame 20 | :return: DataFrame 21 | """ 22 | out_df = design_df.copy() 23 | out_df['AA Index'] = (out_df[cut_pos_col] - 1) // 3 + 1 24 | out_df[transcript_base_col] = out_df[transcript_id_col].str.split('.', expand=True)[0] 25 | return out_df 26 | 27 | # Cell 28 | def get_position_features(sg_df, id_cols): 29 | """Get features ['Target Cut %', 'sense'] 30 | 31 | :param sg_df: DataFrame 32 | :param id_cols: list 33 | :return: DataFrame 34 | """ 35 | position_df = sg_df[id_cols + ['Target Cut %']].copy() 36 | position_df['sense'] = sg_df['Orientation'] == 'sense' 37 | return position_df 38 | 39 | # Cell 40 | def get_one_aa_frac(feature_dict, aa_sequence, aas): 41 | """Get fraction of single aa 42 | 43 | :param feature_dict: dict, feature dictionary 44 | :param aa_sequence: str, amino acid sequence 45 | :param aas: list, list of amino acids 46 | """ 47 | for aa in aas: 48 | aa_frac = aa_sequence.count(aa) / len(aa_sequence) 49 | feature_dict[aa] = aa_frac 50 | 51 | # Cell 52 | def get_aa_aromaticity(feature_dict, analyzed_seq): 53 | """Get fraction of aromatic amino acids in a sequence. 54 | 55 | Phe (F) + Trp (W) + Tyr (Y) 56 | 57 | :param feature_dict: 58 | :param analyzed_seq: ProteinAnalysis object 59 | """ 60 | feature_dict['Aromaticity'] = analyzed_seq.aromaticity() 61 | 62 | 63 | def get_aa_hydrophobicity(feature_dict, analyzed_seq): 64 | """Grand Average of Hydropathy 65 | 66 | The GRAVY value is calculated by adding the hydropathy value for each residue and dividing 67 | by the length of the sequence (Kyte and Doolittle; 1982). 
The larger the number, the more hydrophobic the
68 |     amino acid.
69 | 
70 |     :param feature_dict: dict
71 |     :param analyzed_seq: ProteinAnalysis object
72 |     """
73 |     feature_dict['Hydrophobicity'] = analyzed_seq.gravy()
74 | 
75 | 
76 | def get_aa_ip(feature_dict, analyzed_seq):
77 |     """Get the Isoelectric Point of an amino acid sequence
78 | 
79 |     The pH at which the sequence carries no net charge
80 | 
81 |     :param feature_dict: dict
82 |     :param analyzed_seq: ProteinAnalysis object
83 |     """
84 |     feature_dict['Isoelectric Point'] = analyzed_seq.isoelectric_point()
85 | 
86 | 
87 | def get_aa_secondary_structure(feature_dict, analyzed_seq):
88 |     """Get the fraction of amino acids that tend to be in a helix, turn or sheet
89 | 
90 |     :param feature_dict: dict
91 |     :param analyzed_seq: ProteinAnalysis object
92 |     """
93 |     feature_dict['Helix'], feature_dict['Turn'], feature_dict['Sheet'] = analyzed_seq.secondary_structure_fraction()
94 | 
95 | 
96 | # Cell
97 | def featurize_aa_seqs(aa_sequences, features=None):
98 |     """Get feature DataFrame for a list of amino acid sequences
99 | 
100 |     :param aa_sequences: list of str
101 |     :param features: list or None
102 |     :return: DataFrame
103 |     """
104 |     if features is None:
105 |         features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',
106 |                     'Isoelectric Point', 'Secondary Structure']
107 |     aas = ['A', 'C', 'D', 'E', 'F',
108 |            'G', 'H', 'I', 'K', 'L',
109 |            'M', 'N', 'P', 'Q', 'R',
110 |            'S', 'T', 'V', 'W', 'Y', '*']
111 |     clean_aa_seqs = aa_sequences.str.replace(r'\*|-', '', regex=True)
112 |     feature_dict_list = []
113 |     for i, (aa_sequence, clean_sequence) in enumerate(zip(aa_sequences, clean_aa_seqs)):
114 |         analyzed_seq = ProteinAnalysis(clean_sequence)
115 |         feature_dict = {}
116 |         if 'Pos. Ind. 1mer' in features:
117 |             get_one_aa_frac(feature_dict, aa_sequence, aas)
118 |         if 'Hydrophobicity' in features:
119 |             get_aa_hydrophobicity(feature_dict, analyzed_seq)
120 |         if 'Aromaticity' in features:
121 |             get_aa_aromaticity(feature_dict, analyzed_seq)
122 |         if 'Isoelectric Point' in features:
123 |             get_aa_ip(feature_dict, analyzed_seq)
124 |         if 'Secondary Structure' in features:
125 |             get_aa_secondary_structure(feature_dict, analyzed_seq)
126 |         feature_dict_list.append(feature_dict)
127 |     feature_matrix = pd.DataFrame(feature_dict_list)
128 |     feature_matrix.index = aa_sequences
129 |     return feature_matrix
130 | 
131 | # Cell
132 | def extract_amino_acid_subsequence(sg_aas, width):
133 |     """Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index
134 | 
135 |     :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence
136 |     :param width: int
137 |     :return: DataFrame
138 |     """
139 |     # Pad the sequences at the beginning and end, so our index doesn't go over
140 |     l_padding = '-' * (width + 1)  # can cut just before the CDS
141 |     r_padding = '-' * width  # can cut the stop codon
142 |     # add stop codon at the end of the sequence
143 |     sg_aas_subseq = sg_aas.copy()
144 |     sg_aas_subseq['extended_seq'] = l_padding + sg_aas_subseq['seq'] + '*' + r_padding
145 |     sg_aas_subseq['AA 0-Indexed'] = sg_aas_subseq['AA Index'] - 1
146 |     sg_aas_subseq['AA 0-Indexed padded'] = sg_aas_subseq['AA 0-Indexed'] + len(l_padding)
147 |     sg_aas_subseq['seq_start'] = (sg_aas_subseq['AA 0-Indexed padded'] - width).astype(int)
148 |     sg_aas_subseq['seq_end'] = (sg_aas_subseq['AA 0-Indexed padded'] + width).astype(int)
149 |     sg_aas_subseq['AA Subsequence'] = sg_aas_subseq.apply(lambda row: row['extended_seq'][row['seq_start']:(row['seq_end'] + 1)],
150 |                                                           axis=1)
151 |     return
sg_aas_subseq 152 | 153 | 154 | # Cell 155 | def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols, 156 | transcript_base_col='Transcript Base', 157 | target_transcript_col='Target Transcript', 158 | aa_index_col='AA Index'): 159 | """Get the amino acid subsequences for a design dataframe 160 | 161 | :param sg_designs: DataFrame 162 | :param aa_seq_df: DataFrame, Transcript Base and (AA) seq 163 | :param width: int, length on each side of the cut site 164 | :param transcript_base_col: str 165 | :param target_transcript_col: str 166 | :param aa_index_col: str 167 | :return: DataFrame 168 | """ 169 | sg_aas = (aa_seq_df.merge(sg_designs[list(set(id_cols + 170 | [target_transcript_col, transcript_base_col, aa_index_col]))], 171 | how='inner', 172 | on=[target_transcript_col, transcript_base_col])) 173 | sg_aas_subseq = extract_amino_acid_subsequence(sg_aas, width) 174 | return sg_aas_subseq 175 | 176 | # Cell 177 | def get_amino_acid_features(aa_subseq_df, features, id_cols): 178 | """Featurize amino acid sequences 179 | 180 | :param aa_subseq_df: DataFrame 181 | :param features: list 182 | :param id_cols: list 183 | :return: DataFrame 184 | """ 185 | 186 | # Zero-indexed for python 187 | # filter out sequences without the canonical amino acids 188 | aa_set = set('ARNDCQEGHILKMFPSTWYV*-') 189 | filtered_sg_aas = (aa_subseq_df[aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= aa_set)] 190 | .reset_index(drop=True)) 191 | filtered_diff = (aa_subseq_df.shape[0] - filtered_sg_aas.shape[0]) 192 | if filtered_diff > 0: 193 | warnings.warn('Ignored ' + str(filtered_diff) + ' amino acid sequences with non-canonical amino acids') 194 | aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features) 195 | aa_features_annot = pd.concat([filtered_sg_aas[id_cols + ['AA Subsequence']] 196 | .reset_index(drop=True), 197 | aa_features.reset_index(drop=True)], axis=1) 198 | return aa_features_annot 199 | 200 | 201 | # Cell 202 | def get_protein_domain_features(sg_design_df, protein_domains, id_cols, 203 | sources=None, 204 | transcript_base_col='Transcript Base', 205 | aa_index_col='AA Index', 206 | domain_type_col='type', 207 | domain_start_col='start', 208 | domain_end_col='end'): 209 | """Get binary dataframe of protein domains 210 | 211 | :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col] 212 | :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col] 213 | :param id_cols: list 214 | :param sources: list. 
databases of domain annotations to include
215 |     :param transcript_base_col: str
216 |     :param aa_index_col: str
217 |     :param domain_type_col: str
218 |     :param domain_start_col: str
219 |     :param domain_end_col: str
220 |     :return: DataFrame, with binary features for protein domains
221 |     """
222 |     if sources is None:
223 |         sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',
224 |                    'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',
225 |                    'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']  # exclude sifts
226 |     protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]
227 |     clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()
228 |     designs_domains = clean_designs.merge(protein_domains,
229 |                                           how='inner', on=transcript_base_col)
230 |     # Note - not every sgRNA will be present in the feature df
231 |     filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],
232 |                                                                               designs_domains[domain_end_col])]
233 |                         .copy())
234 |     filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()
235 |     filtered_domains['present'] = 1
236 |     domain_feature_df = (filtered_domains.pivot_table(values='present',
237 |                                                       index=id_cols,
238 |                                                       columns=domain_type_col, fill_value=0)
239 |                          .reset_index())
240 |     # Ensure all domain columns are present for testing
241 |     full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int)  # empty
242 |     domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)
243 |     domain_feature_df[sources] = domain_feature_df[sources].astype(int)
244 |     return domain_feature_df
245 | 
246 | # Cell
247 | def get_conservation_ranges(cut_pos, small_width, large_width):
248 |     small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)
249 |     large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)
250 |     return small_range, large_range
251 | 
252 | 
253 | def get_conservation_features(sg_designs, conservation_df, conservation_column,
254 |                               small_width, large_width, id_cols):
255 |     """Get conservation features
256 | 
257 |     :param sg_designs: DataFrame
258 |     :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position
259 |     :param conservation_column: str, name of column to calculate scores with
260 |     :param small_width: int, small window length to average scores in one direction
261 |     :param large_width: int, large window length to average scores in one direction
262 |     :return: DataFrame of conservation features
263 |     """
264 |     sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()
265 |     sg_designs_width['target position small'], sg_designs_width['target position large'] = \
266 |         zip(*sg_designs_width['Target Cut Length']
267 |             .apply(get_conservation_ranges, small_width=small_width,
268 |                    large_width=large_width))
269 |     small_width_conservation = (sg_designs_width.drop('target position large', axis=1)
270 |                                 .rename({'target position small': 'target position'}, axis=1)
271 |                                 .explode('target position')
272 |                                 .merge(conservation_df, how='inner',
273 |                                        on=['Target Transcript', 'Transcript Base', 'target position'])
274 |                                 .groupby(id_cols)
275 |                                 .agg(cons=(conservation_column, 'mean'))
276 |                                 .rename({'cons': 'cons_' + str(small_width * 2)}, axis=1)
277 |                                 .reset_index())
278 |     large_width_conservation = (sg_designs_width.drop('target position small', axis=1)
279 |                                 .rename({'target position large': 'target position'}, axis=1)
280 |
.explode('target position') 281 | .merge(conservation_df, how='inner', 282 | on=['Target Transcript', 'Transcript Base', 'target position']) 283 | .groupby(id_cols) 284 | .agg(cons=(conservation_column, 'mean')) 285 | .rename({'cons': 'cons_' + str(large_width * 2)}, axis=1) 286 | .reset_index()) 287 | cons_feature_df = small_width_conservation.merge(large_width_conservation, how='outer', 288 | on=id_cols) 289 | return cons_feature_df 290 | 291 | # Cell 292 | def merge_feature_dfs(design_df, 293 | aa_subseq_df, aa_features=None, 294 | domain_df=None, 295 | conservation_df=None, 296 | id_cols=None): 297 | if id_cols is None: 298 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 299 | 'Target Transcript', 'Orientation'] 300 | if aa_features is None: 301 | aa_features = ['Pos. Ind. 1mer', 302 | 'Hydrophobicity', 'Aromaticity', 303 | 'Isoelectric Point', 'Secondary Structure'] 304 | if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]: 305 | raise ValueError('id_cols must uniquely identify rows of the design dataframe') 306 | feature_df_dict = dict() 307 | feature_list = list() 308 | position_feature_df = get_position_features(design_df, id_cols=id_cols) 309 | feature_df_dict['position'] = position_feature_df 310 | feature_list.extend(['Target Cut %', 'sense']) 311 | if domain_df is not None: 312 | feature_df_dict['domain'] = domain_df 313 | feature_list.extend(['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D', 314 | 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite', 315 | 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']) 316 | if conservation_df is not None: 317 | feature_df_dict['conservation'] = conservation_df 318 | # hardcoded 319 | feature_list.extend(['cons_4', 'cons_32']) 320 | aa_feature_df = get_amino_acid_features(aa_subseq_df, aa_features, id_cols) 321 | feature_list.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 322 | 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*', 323 | 'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn', 324 | 'Sheet']) 325 | feature_df_dict['aa'] = aa_feature_df 326 | feature_df = design_df[id_cols] 327 | for key, df in feature_df_dict.items(): 328 | feature_df = pd.merge(feature_df, df, how='left', on=id_cols) 329 | return feature_df, feature_list 330 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | host = github 3 | lib_name = rs3 4 | user = gpp-rnd 5 | description = Predict the activity of CRISPR sgRNAs 6 | keywords = rs3, CRISPR, sgrna 7 | author = Peter Deweirdt 8 | author_email = petedeweirdt@gmail.com 9 | copyright = Genetic Perturbation Platform, Broad Institute 10 | branch = master 11 | version = 0.0.16 12 | min_python = 3.7 13 | audience = Developers 14 | language = English 15 | custom_sidebar = False 16 | license = apache2 17 | status = 2 18 | requirements = joblib>=1.0.1 pandas>=1.0.0 lightgbm>=3.0.0,<=3.3.5 sglearn>=1.2.5 tqdm>=4.61.2 pyarrow>=4.0.1 biopython>=1.78 scikit-learn>=0.24.2 requests>=2.25.1 19 | dev_requirements = gpplot>=0.5.0 seaborn>=0.11.0 scipy>=1.0.1 jupyterlab>=3.0.0 nbdev>=1.1.14,<2.0.0 matplotlib>=3.3.4 tabulate>=0.8.9 jupyter-client<=6.1.12 20 | nbs_path = . 
21 | doc_path = docs 22 | recursive = False 23 | doc_host = https://gpp-rnd.github.io 24 | doc_baseurl = /rs3/ 25 | git_url = https://github.com/gpp-rnd/rs3/tree/master/ 26 | lib_path = rs3 27 | title = rs3 28 | 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools 4 | import re 5 | import sys 6 | 7 | assert parse_version(setuptools.__version__) >= parse_version('36.2') 8 | 9 | # note: all settings are in settings.ini; edit there, not here 10 | config = ConfigParser(delimiters=['=']) 11 | config.read('settings.ini') 12 | cfg = config['DEFAULT'] 13 | 14 | cfg_keys = 'version description keywords author author_email'.split() 15 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 16 | for o in expected: 17 | assert o in cfg, "missing expected setting: {}".format(o) 18 | setup_cfg = {o: cfg[o] for o in cfg_keys} 19 | 20 | if len(sys.argv) > 1 and sys.argv[1] == 'version': 21 | print(setup_cfg['version']) 22 | exit() 23 | 24 | licenses = { 25 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 26 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 27 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 28 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 29 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 30 | } 31 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 32 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 33 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split() 34 | 35 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 36 | min_python = cfg['min_python'] 37 | 38 | requirements = ['pip', 'packaging'] 39 | if cfg.get('requirements'): 40 | requirements += cfg.get('requirements', '').split() 41 | if cfg.get('pip_requirements'): 42 | requirements += cfg.get('pip_requirements', '').split() 43 | dev_requirements = (cfg.get('dev_requirements') or '').split() 44 | 45 | long_description = open('README.md').read() 46 | # ![png](docs/images/output_13_0.png) 47 | for ext in ['png', 'svg']: 48 | long_description = re.sub(r'!\['+ext+'\]\((.*)\)', '!['+ext+']('+'https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1)', long_description) 49 | long_description = re.sub(r'src=\"(.*)\.'+ext+'\"', 'src=\"https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1.'+ext+'\"', long_description) 50 | 51 | setuptools.setup( 52 | name = cfg['lib_name'], 53 | license = lic[0], 54 | classifiers = [ 55 | 'Development Status :: ' + statuses[int(cfg['status'])], 56 | 'Intended Audience :: ' + cfg['audience'].title(), 57 | 'Natural Language :: ' + cfg['language'].title(), 58 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 59 | url = cfg['git_url'], 60 | packages = setuptools.find_packages(), 61 | include_package_data = True, 62 | install_requires = requirements, 63 | extras_require={ 'dev': dev_requirements }, 64 | python_requires = '>=' + cfg['min_python'], 65 | long_description = long_description, 66 | 
long_description_content_type = 'text/markdown', 67 | zip_safe = False, 68 | entry_points = { 'console_scripts': cfg.get('console_scripts','').split() }, 69 | **setup_cfg) 70 | 71 | -------------------------------------------------------------------------------- /target_lite_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_lite_model.pkl -------------------------------------------------------------------------------- /target_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_model.pkl -------------------------------------------------------------------------------- /test_data/codon_map.csv: -------------------------------------------------------------------------------- 1 | Codon,Amino Acid,Property 2 | TTT,F,Nonpolar 3 | TTC,F,Nonpolar 4 | TTA,L,Nonpolar 5 | TTG,L,Nonpolar 6 | CTT,L,Nonpolar 7 | CTC,L,Nonpolar 8 | CTA,L,Nonpolar 9 | CTG,L,Nonpolar 10 | ATT,I,Nonpolar 11 | ATC,I,Nonpolar 12 | ATA,I,Nonpolar 13 | ATG,M,Nonpolar 14 | GTT,V,Nonpolar 15 | GTC,V,Nonpolar 16 | GTA,V,Nonpolar 17 | GTG,V,Nonpolar 18 | TCT,S,Polar 19 | TCC,S,Polar 20 | TCA,S,Polar 21 | TCG,S,Polar 22 | CCT,P,Nonpolar 23 | CCC,P,Nonpolar 24 | CCA,P,Nonpolar 25 | CCG,P,Nonpolar 26 | ACT,T,Polar 27 | ACC,T,Polar 28 | ACA,T,Polar 29 | ACG,T,Polar 30 | GCT,A,Nonpolar 31 | GCC,A,Nonpolar 32 | GCA,A,Nonpolar 33 | GCG,A,Nonpolar 34 | TAT,Y,Polar 35 | TAC,Y,Polar 36 | TAA,*,Stop 37 | TAG,*,Stop 38 | CAT,H,Basic 39 | CAC,H,Basic 40 | CAA,Q,Polar 41 | CAG,Q,Polar 42 | AAT,N,Polar 43 | AAC,N,Polar 44 | AAA,K,Basic 45 | AAG,K,Basic 46 | GAT,D,Acidic 47 | GAC,D,Acidic 48 | GAA,E,Acidic 49 | GAG,E,Acidic 50 | TGT,C,Polar 51 | TGC,C,Polar 52 | TGA,*,Stop 53 | TGG,W,Nonpolar 54 | CGT,R,Basic 55 | CGC,R,Basic 56 | CGA,R,Basic 57 | CGG,R,Basic 58 | AGT,S,Polar 59 | AGC,S,Polar 60 | AGA,R,Basic 61 | AGG,R,Basic 62 | GGT,G,Nonpolar 63 | GGC,G,Nonpolar 64 | GGA,G,Nonpolar 65 | GGG,G,Nonpolar -------------------------------------------------------------------------------- /test_data/target_data/aa_seqs.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/aa_seqs.pq -------------------------------------------------------------------------------- /test_data/target_data/conservation.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/conservation.pq -------------------------------------------------------------------------------- /test_data/target_data/protein_domains.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/protein_domains.pq --------------------------------------------------------------------------------
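# Example (illustrative end-to-end sketch tying together the modules above via
# rs3.predict.predict; the path points at this repo's test data, which is assumed to
# be tab-separated, and the output column names follow rs3/predict.py):
#
# import pandas as pd
# from rs3.predict import predict
#
# design_df = pd.read_table('test_data/sgrna-designs.txt')
# scored = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True, lite=True,
#                  n_jobs_min=1, n_jobs_max=4)
# scored[['sgRNA Context Sequence',
#         'RS3 Sequence Score (Hsu2013 tracr)',
#         'RS3 Sequence (Hsu2013 tracr) + Target Score Lite']].head()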