├── .devcontainer.json ├── .github └── workflows │ └── main.yml ├── .gitignore ├── 00_seq.ipynb ├── 01_targetdata.ipynb ├── 02_targetfeat.ipynb ├── 03_predicttarg.ipynb ├── 04_predict.ipynb ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── RuleSet3.pkl ├── docker-compose.yml ├── docs ├── .gitignore ├── Gemfile ├── Gemfile.lock ├── _config.yml ├── _data │ ├── sidebars │ │ └── home_sidebar.yml │ └── topnav.yml ├── feed.xml ├── images │ ├── output_18_0.png │ └── output_42_0.png ├── index.html ├── predict.html ├── predicttarg.html ├── seq.html ├── sidebar.json ├── sitemap.xml ├── targetdata.html └── targetfeat.html ├── index.ipynb ├── rs3 ├── RuleSet3.pkl ├── __init__.py ├── _nbdev.py ├── predict.py ├── predicttarg.py ├── seq.py ├── target_lite_model.pkl ├── target_model.pkl ├── targetdata.py └── targetfeat.py ├── settings.ini ├── setup.py ├── target_lite_model.pkl ├── target_model.pkl └── test_data ├── Aguirre2016_activity.csv ├── Behan2019_activity.csv ├── codon_map.csv ├── sgrna-designs.txt ├── sgrna-designs_BCL2L1_MCL1_EEF2.txt ├── sgrna-designs_BCL2L1_MCL1_EEF2_na.txt └── target_data ├── aa_seqs.pq ├── conservation.pq └── protein_domains.pq /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nbdev_template-codespaces", 3 | "dockerComposeFile": "docker-compose.yml", 4 | "service": "watcher", 5 | "settings": {"terminal.integrated.shell.linux": "/bin/bash"}, 6 | "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ], 7 | "forwardPorts": [4000, 8080], 8 | "appPort": [4000, 8080], 9 | "extensions": ["ms-python.python", 10 | "ms-azuretools.vscode-docker"], 11 | "runServices": ["notebook", "jekyll", "watcher"], 12 | "postStartCommand": "pip install -e ." 13 | } 14 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v1 8 | - uses: actions/setup-python@v1 9 | with: 10 | python-version: '3.7' 11 | architecture: 'x64' 12 | - name: Install the library 13 | run: | 14 | pip install nbdev jupyter 15 | pip install -e .[dev] 16 | - name: Read all notebooks 17 | run: | 18 | nbdev_read_nbs 19 | - name: Check if all notebooks are cleaned 20 | run: | 21 | echo "Check we are starting with clean git checkout" 22 | if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi 23 | echo "Trying to strip out notebooks" 24 | nbdev_clean_nbs 25 | echo "Check that strip out was unnecessary" 26 | git status -s # display the status to see which nbs need cleaning up 27 | if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi 28 | - name: Check if there is no diff library/notebooks 29 | run: | 30 | if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! 
Detected difference between the notebooks and the library"; false; fi 31 | - name: Run tests 32 | run: | 33 | nbdev_test_nbs --fname=index.ipynb --n_workers=1 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | .gitattributes 3 | .last_checked 4 | .gitconfig 5 | *.bak 6 | *.log 7 | *~ 8 | ~* 9 | _tmp* 10 | tmp* 11 | tags 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # dotenv 95 | .env 96 | 97 | # virtualenv 98 | .venv 99 | venv/ 100 | ENV/ 101 | 102 | # Spyder project settings 103 | .spyderproject 104 | .spyproject 105 | 106 | # Rope project settings 107 | .ropeproject 108 | 109 | # mkdocs documentation 110 | /site 111 | 112 | # mypy 113 | .mypy_cache/ 114 | 115 | .vscode 116 | *.swp 117 | 118 | # osx generated files 119 | .DS_Store 120 | .DS_Store? 
121 | .Trashes 122 | ehthumbs.db 123 | Thumbs.db 124 | .idea 125 | 126 | # pytest 127 | .pytest_cache 128 | 129 | # tools/trust-doc-nbs 130 | docs_src/.last_checked 131 | 132 | # symlinks to fastai 133 | docs_src/fastai 134 | tools/fastai 135 | 136 | # link checker 137 | checklink/cookies.txt 138 | 139 | # .gitconfig is now autogenerated 140 | .gitconfig 141 | 142 | -------------------------------------------------------------------------------- /02_targetfeat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# default_exp targetfeat" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# targetfeat\n", 17 | "> Module to generate target site features" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# export\n", 27 | "import pandas as pd\n", 28 | "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n", 29 | "import warnings" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "from rs3 import targetdata" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import multiprocessing\n", 48 | "max_n_jobs = multiprocessing.cpu_count()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# export\n", 58 | "def add_target_columns(design_df, transcript_id_col='Target Transcript',\n", 59 | " cut_pos_col='Target Cut Length',\n", 60 | " transcript_base_col='Transcript Base'):\n", 61 | " \"\"\"Add ['AA Index' and 'Transcript Base'] to design df\n", 62 | "\n", 63 | " :param design_df: DataFrame\n", 64 | " :return: DataFrame\n", 65 | " \"\"\"\n", 66 | " out_df = design_df.copy()\n", 67 | " out_df['AA Index'] = (out_df[cut_pos_col] - 1) // 3 + 1\n", 68 | " out_df[transcript_base_col] = out_df[transcript_id_col].str.split('.', expand=True)[0]\n", 69 | " return out_df" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n", 79 | "design_targ_df = add_target_columns(design_df)\n", 80 | "assert 'AA Index' in design_targ_df.columns" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Position Features\n", 88 | "\n", 89 | "The first feature class we consider is where the guide targets within the annotated transcript" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# export\n", 99 | "def get_position_features(sg_df, id_cols):\n", 100 | " \"\"\"Get features ['Target Cut %', 'sense']\n", 101 | "\n", 102 | " :param sg_df: DataFrame\n", 103 | " :param id_cols: list\n", 104 | " :return: DataFrame\n", 105 | " \"\"\"\n", 106 | " position_df = sg_df[id_cols + ['Target Cut %']].copy()\n", 107 | " position_df['sense'] = sg_df['Orientation'] == 'sense'\n", 108 | " return position_df" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Amino Acid Features\n", 116 | "\n", 117 | "We calculate a set of features from the 
amino acid sequence around the cutsite itself" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "aas = ['A', 'C', 'D', 'E', 'F',\n", 127 | " 'G', 'H', 'I', 'K', 'L',\n", 128 | " 'M', 'N', 'P', 'Q', 'R',\n", 129 | " 'S', 'T', 'V', 'W', 'Y', '*']" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# export\n", 139 | "def get_one_aa_frac(feature_dict, aa_sequence, aas):\n", 140 | " \"\"\"Get fraction of single aa\n", 141 | "\n", 142 | " :param feature_dict: dict, feature dictionary\n", 143 | " :param aa_sequence: str, amino acid sequence\n", 144 | " :param aas: list, list of amino acids\n", 145 | " \"\"\"\n", 146 | " for aa in aas:\n", 147 | " aa_frac = aa_sequence.count(aa) / len(aa_sequence)\n", 148 | " feature_dict[aa] = aa_frac" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "one_aa_ft = {}\n", 158 | "get_one_aa_frac(one_aa_ft, 'ACDG*-', aas)\n", 159 | "assert one_aa_ft['A'] == 1/6\n", 160 | "assert one_aa_ft['Q'] == 0" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "# export\n", 170 | "def get_aa_aromaticity(feature_dict, analyzed_seq):\n", 171 | " \"\"\"Get fraction of aromatic amino acids in a sequence.\n", 172 | "\n", 173 | " Phe (F) + Trp (W) + Tyr (Y)\n", 174 | "\n", 175 | " :param feature_dict:\n", 176 | " :param analyzed_seq: ProteinAnalysis object\n", 177 | " \"\"\"\n", 178 | " feature_dict['Aromaticity'] = analyzed_seq.aromaticity()\n", 179 | "\n", 180 | "\n", 181 | "def get_aa_hydrophobicity(feature_dict, analyzed_seq):\n", 182 | " \"\"\"Grand Average of Hydropathy\n", 183 | "\n", 184 | " The GRAVY value is calculated by adding the hydropathy value for each residue and dividing\n", 185 | " by the length of the sequence (Kyte and Doolittle; 1982). 
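For example, poly-leucine ('LLLL') has a GRAVY of 3.8, the Kyte-Doolittle hydropathy of leucine, while poly-arginine ('RRRR') scores -4.5.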
The larger the number, the more hydrophobic the\n", 186 | " sequence\n", 187 | "\n", 188 | " :param feature_dict: dict\n", 189 | " :param analyzed_seq: ProteinAnalysis object\n", 190 | " \"\"\"\n", 191 | " feature_dict['Hydrophobicity'] = analyzed_seq.gravy()\n", 192 | "\n", 193 | "\n", 194 | "def get_aa_ip(feature_dict, analyzed_seq):\n", 195 | " \"\"\"Get the Isoelectric Point of an amino acid sequence\n", 196 | "\n", 197 | " The pH at which the sequence carries no net charge\n", 198 | "\n", 199 | " :param feature_dict: dict\n", 200 | " :param analyzed_seq: ProteinAnalysis object\n", 201 | " \"\"\"\n", 202 | " feature_dict['Isoelectric Point'] = analyzed_seq.isoelectric_point()\n", 203 | "\n", 204 | "\n", 205 | "def get_aa_secondary_structure(feature_dict, analyzed_seq):\n", 206 | " \"\"\"Get the fraction of amino acids that tend to be in a helix, turn, or sheet\n", 207 | "\n", 208 | " :param feature_dict: dict\n", 209 | " :param analyzed_seq: ProteinAnalysis object\n", 210 | " \"\"\"\n", 211 | " feature_dict['Helix'], feature_dict['Turn'], feature_dict['Sheet'] = analyzed_seq.secondary_structure_fraction()\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "aa_biochemical_fts1 = {}\n", 221 | "get_aa_aromaticity(aa_biochemical_fts1, ProteinAnalysis('FWYA'))\n", 222 | "aa_biochemical_fts2 = {}\n", 223 | "get_aa_aromaticity(aa_biochemical_fts2, ProteinAnalysis('AAAA'))\n", 224 | "assert aa_biochemical_fts1['Aromaticity'] > aa_biochemical_fts2['Aromaticity']" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# export\n", 234 | "def featurize_aa_seqs(aa_sequences, features=None):\n", 235 | " \"\"\"Get feature DataFrame for a list of amino acid sequences\n", 236 | "\n", 237 | " :param aa_sequences: list of str\n", 238 | " :param features: list or None\n", 239 | " :return: DataFrame\n", 240 | " \"\"\"\n", 241 | " if features is None:\n", 242 | " features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',\n", 243 | " 'Isoelectric Point', 'Secondary Structure']\n", 244 | " aas = ['A', 'C', 'D', 'E', 'F',\n", 245 | " 'G', 'H', 'I', 'K', 'L',\n", 246 | " 'M', 'N', 'P', 'Q', 'R',\n", 247 | " 'S', 'T', 'V', 'W', 'Y', '*']\n", 248 | " clean_aa_seqs = aa_sequences.str.replace('\*|-', '', regex=True)\n", 249 | " feature_dict_list = []\n", 250 | " for i, (aa_sequence, clean_sequence) in enumerate(zip(aa_sequences, clean_aa_seqs)):\n", 251 | " analyzed_seq = ProteinAnalysis(clean_sequence)\n", 252 | " feature_dict = {}\n", 253 | " if 'Pos. Ind. 
1mer' in features:\n", 254 | " get_one_aa_frac(feature_dict, aa_sequence, aas)\n", 255 | " if 'Hydrophobicity' in features:\n", 256 | " get_aa_hydrophobicity(feature_dict, analyzed_seq)\n", 257 | " if 'Aromaticity' in features:\n", 258 | " get_aa_aromaticity(feature_dict, analyzed_seq)\n", 259 | " if 'Isoelectric Point' in features:\n", 260 | " get_aa_ip(feature_dict, analyzed_seq)\n", 261 | " if 'Secondary Structure' in features:\n", 262 | " get_aa_secondary_structure(feature_dict, analyzed_seq)\n", 263 | " feature_dict_list.append(feature_dict)\n", 264 | " feature_matrix = pd.DataFrame(feature_dict_list)\n", 265 | " feature_matrix.index = aa_sequences\n", 266 | " return feature_matrix" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "ft_dict_df = featurize_aa_seqs(pd.Series(['ACDG*-', 'CDG*--', 'LLLLLL']))\n", 276 | "assert ft_dict_df.loc['LLLLLL', 'Hydrophobicity'] == ft_dict_df['Hydrophobicity'].max()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# export\n", 286 | "def extract_amino_acid_subsequence(sg_aas, width):\n", 287 | " \"\"\" Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index\n", 288 | "\n", 289 | " :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence\n", 290 | " :param width: int\n", 291 | " :return: DataFrame\n", 292 | " \"\"\"\n", 293 | " # Pad the sequences at the beginning and end, so our index doesn't go over\n", 294 | " l_padding = '-' * (width + 1) # can cut just before the CDS\n", 295 | " r_padding = '-' * width # can cut the stop codon\n", 296 | " # add stop codon at the end of the sequence\n", 297 | " sg_aas_subseq = sg_aas.copy()\n", 298 | " sg_aas_subseq['extended_seq'] = l_padding + sg_aas_subseq['seq'] + '*' + r_padding\n", 299 | " sg_aas_subseq['AA 0-Indexed'] = sg_aas_subseq['AA Index'] - 1\n", 300 | " sg_aas_subseq['AA 0-Indexed padded'] = sg_aas_subseq['AA 0-Indexed'] + len(l_padding)\n", 301 | " sg_aas_subseq['seq_start'] = (sg_aas_subseq['AA 0-Indexed padded'] - width).astype(int)\n", 302 | " sg_aas_subseq['seq_end'] = (sg_aas_subseq['AA 0-Indexed padded'] + width).astype(int)\n", 303 | " sg_aas_subseq['AA Subsequence'] = sg_aas_subseq.apply(lambda row: row['extended_seq'][row['seq_start']:(row['seq_end'] + 1)],\n", 304 | " axis=1)\n", 305 | " return sg_aas_subseq\n" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "small_aa_seq_df = pd.DataFrame({'AA Index': [1, 5, 9],\n", 315 | " 'seq': ['MAVLKYSLW']*3})\n", 316 | "small_aa_subseq_df = extract_amino_acid_subsequence(small_aa_seq_df, 2)\n", 317 | "actual_subseqs = small_aa_subseq_df['AA Subsequence']\n", 318 | "expected_subseqs = ['--MAV', 'VLKYS', 'SLW*-']\n", 319 | "assert len(actual_subseqs) == len(expected_subseqs)\n", 320 | "assert all([a == b for a, b in zip(actual_subseqs, expected_subseqs)])" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# export\n", 330 | "def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols,\n", 331 | " transcript_base_col='Transcript Base',\n", 332 | " target_transcript_col='Target Transcript',\n", 333 | " aa_index_col='AA Index'):\n", 334 | " \"\"\"Get the amino acid subsequences for a design 
dataframe\n", 335 | "\n", 336 | " :param sg_designs: DataFrame\n", 337 | " :param aa_seq_df: DataFrame, Transcript Base and (AA) seq\n", 338 | " :param width: int, length on each side of the cut site\n", 339 | " :param transcript_base_col: str\n", 340 | " :param target_transcript_col: str\n", 341 | " :param aa_index_col: str\n", 342 | " :return: DataFrame\n", 343 | " \"\"\"\n", 344 | " sg_aas = (aa_seq_df.merge(sg_designs[list(set(id_cols +\n", 345 | " [target_transcript_col, transcript_base_col, aa_index_col]))],\n", 346 | " how='inner',\n", 347 | " on=[target_transcript_col, transcript_base_col]))\n", 348 | " sg_aas_subseq = extract_amino_acid_subsequence(sg_aas, width)\n", 349 | " return sg_aas_subseq" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "Getting amino acid sequences\n" 362 | ] 363 | }, 364 | { 365 | "name": "stderr", 366 | "output_type": "stream", 367 | "text": [ 368 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.19s/it]\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_targ_df, n_jobs=2)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "aa_subseq_df = get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n", 383 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'])\n", 384 | "assert (aa_subseq_df['AA Subsequence'].str.len() == 33).all()\n", 385 | "assert aa_subseq_df.shape[0] == design_targ_df.shape[0]" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "codon_map_df = pd.read_csv('test_data/codon_map.csv')\n", 395 | "\n", 396 | "def get_rev_comp(sgrna):\n", 397 | " \"\"\"Get reverse compliment of a guide\"\"\"\n", 398 | " nt_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}\n", 399 | " rev_comp = ''\n", 400 | " for nt in sgrna:\n", 401 | " rev_comp += nt_map[nt]\n", 402 | " rev_comp = rev_comp[::-1]\n", 403 | " return rev_comp\n", 404 | "\n", 405 | "codon_map = pd.Series(codon_map_df['Amino Acid'].values, index=codon_map_df['Codon']).to_dict()\n", 406 | "row = aa_subseq_df.sample(1, random_state=1).iloc[0, :]\n", 407 | "subseq = row['AA Subsequence']\n", 408 | "context = row['sgRNA Context Sequence']\n", 409 | "rc_context = get_rev_comp(context)\n", 410 | "translations = dict()\n", 411 | "rc_translations = dict()\n", 412 | "for i in [0, 1, 2]:\n", 413 | " translations[i] = ''.join([codon_map[context[j:j+3]] for j in range(i, len(context), 3)\n", 414 | " if (j + 3) <= len(context)])\n", 415 | " rc_translations[i] = ''.join([codon_map[rc_context[j:j+3]] for j in range(i, len(rc_context), 3)\n", 416 | " if (j + 3) <= len(rc_context)])\n", 417 | "assert ((translations[0] in subseq) or (translations[1] in subseq) or (translations[2] in subseq) or\n", 418 | " (rc_translations[0] in subseq) or (rc_translations[1] in subseq) or (rc_translations[2] in subseq))" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# export\n", 428 | "def get_amino_acid_features(aa_subseq_df, features, id_cols):\n", 429 | " \"\"\"Featurize amino acid sequences\n", 430 | "\n", 431 | " :param 
aa_subseq_df: DataFrame\n", 432 | " :param features: list\n", 433 | " :param id_cols: list\n", 434 | " :return: DataFrame\n", 435 | " \"\"\"\n", 436 | "\n", 437 | " # Zero-indexed for python\n", 438 | " # filter out sequences without the canonical amino acids\n", 439 | " aa_set = set('ARNDCQEGHILKMFPSTWYV*-')\n", 440 | " filtered_sg_aas = (aa_subseq_df[aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= aa_set)]\n", 441 | " .reset_index(drop=True))\n", 442 | " filtered_diff = (aa_subseq_df.shape[0] - filtered_sg_aas.shape[0])\n", 443 | " if filtered_diff > 0:\n", 444 | " warnings.warn('Ignored ' + str(filtered_diff) + ' amino acid sequences with non-canonical amino acids')\n", 445 | " aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features)\n", 446 | " aa_features_annot = pd.concat([filtered_sg_aas[id_cols + ['AA Subsequence']]\n", 447 | " .reset_index(drop=True),\n", 448 | " aa_features.reset_index(drop=True)], axis=1)\n", 449 | " return aa_features_annot\n" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "aa_features = get_amino_acid_features(aa_subseq_df=aa_subseq_df,\n", 459 | " features=['Pos. Ind. 1mer',\n", 460 | " 'Hydrophobicity', 'Aromaticity',\n", 461 | " 'Isoelectric Point', 'Secondary Structure'],\n", 462 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 463 | " 'Target Transcript', 'Orientation'])\n", 464 | "assert aa_features['L'].idxmax() == aa_features['Hydrophobicity'].idxmax()" 465 | ] 466 | }, 467 | { 468 | "cell_type": "markdown", 469 | "metadata": {}, 470 | "source": [ 471 | "## Protein Domain Features" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "#export\n", 481 | "def get_protein_domain_features(sg_design_df, protein_domains, id_cols,\n", 482 | " sources=None,\n", 483 | " transcript_base_col='Transcript Base',\n", 484 | " aa_index_col='AA Index',\n", 485 | " domain_type_col='type',\n", 486 | " domain_start_col='start',\n", 487 | " domain_end_col='end'):\n", 488 | " \"\"\"Get binary dataframe of protein domains\n", 489 | "\n", 490 | " :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col]\n", 491 | " :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col]\n", 492 | " :param id_cols: list\n", 493 | " :param sources: list. 
list of database types to include\n", 494 | " :param transcript_base_col: str\n", 495 | " :param aa_index_col: str\n", 496 | " :param domain_type_col: str\n", 497 | " :param domain_start_col: str\n", 498 | " :param domain_end_col: str\n", 499 | " :return: DataFrame, with binary features for protein domains\n", 500 | " \"\"\"\n", 501 | " if sources is None:\n", 502 | " sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n", 503 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n", 504 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'] # exclude sifts\n", 505 | " protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]\n", 506 | " clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()\n", 507 | " designs_domains = clean_designs.merge(protein_domains,\n", 508 | " how='inner', on=transcript_base_col)\n", 509 | " # Note - not every sgRNA will be present in the feature df\n", 510 | " filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],\n", 511 | " designs_domains[domain_end_col])]\n", 512 | " .copy())\n", 513 | " filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()\n", 514 | " filtered_domains['present'] = 1\n", 515 | " domain_feature_df = (filtered_domains.pivot_table(values='present',\n", 516 | " index=id_cols,\n", 517 | " columns='type', fill_value=0)\n", 518 | " .reset_index())\n", 519 | " # Ensure all domain columns are present for testing\n", 520 | " full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int) # empty\n", 521 | " domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)\n", 522 | " domain_feature_df[sources] = domain_feature_df[sources].astype(int)\n", 523 | " return domain_feature_df" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "name": "stdout", 533 | "output_type": "stream", 534 | "text": [ 535 | "Getting protein domains\n" 536 | ] 537 | }, 538 | { 539 | "name": "stderr", 540 | "output_type": "stream", 541 | "text": [ 542 | "100%|█████████████████████████████████████████| 200/200 [00:49<00:00, 4.02it/s]\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n", 548 | "protein_domain_feature_df = get_protein_domain_features(design_targ_df, domain_df, sources=None,\n", 549 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 550 | " 'AA Index', 'Target Transcript', 'Orientation'])" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "assert protein_domain_feature_df.loc[protein_domain_feature_df['sgRNA Context Sequence'] == 'AAAAGAGCCATGAATCTAAACATCAGGAAT',\n", 560 | " ['PANTHER', 'ncoils', 'Seg', 'MobiDBLite']].sum(axis=1).values[0] == 4" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "## Conservation Features" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "# export\n", 577 | "def get_conservation_ranges(cut_pos, small_width, large_width):\n", 578 | " small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)\n", 579 | " large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)\n", 
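"    # e.g. cut_pos=100: small_width=2 -> positions 99..102 (4 nt, averaged into 'cons_4'),\n",
"    # large_width=16 -> positions 85..116 (32 nt, averaged into 'cons_32')\n",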
580 | " return small_range, large_range\n", 581 | "\n", 582 | "\n", 583 | "def get_conservation_features(sg_designs, conservation_df, conservation_column,\n", 584 | " small_width, large_width, id_cols):\n", 585 | " \"\"\"Get conservation features\n", 586 | "\n", 587 | " :param sg_designs: DataFrame\n", 588 | " :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position\n", 589 | " :param conservation_column: str, name of column to calculate scores with\n", 590 | " :param small_width: int, small window length to average scores in one direction\n", 591 | " :param large_width: int, large window length to average scores in the one direction\n", 592 | " :return: DataFrame of conservation features\n", 593 | " \"\"\"\n", 594 | " sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()\n", 595 | " sg_designs_width['target position small'], sg_designs_width['target position large'] = \\\n", 596 | " zip(*sg_designs_width['Target Cut Length']\n", 597 | " .apply(get_conservation_ranges, small_width=small_width,\n", 598 | " large_width=large_width))\n", 599 | " small_width_conservation = (sg_designs_width.drop('target position large', axis=1)\n", 600 | " .rename({'target position small': 'target position'}, axis=1)\n", 601 | " .explode('target position')\n", 602 | " .merge(conservation_df, how='inner',\n", 603 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n", 604 | " .groupby(id_cols)\n", 605 | " .agg(cons=(conservation_column, 'mean'))\n", 606 | " .rename({'cons': 'cons_' + str(small_width * 2)}, axis=1)\n", 607 | " .reset_index())\n", 608 | " large_width_conservation = (sg_designs_width.drop('target position small', axis=1)\n", 609 | " .rename({'target position large': 'target position'}, axis=1)\n", 610 | " .explode('target position')\n", 611 | " .merge(conservation_df, how='inner',\n", 612 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n", 613 | " .groupby(id_cols)\n", 614 | " .agg(cons=(conservation_column, 'mean'))\n", 615 | " .rename({'cons': 'cons_' + str(large_width * 2)}, axis=1)\n", 616 | " .reset_index())\n", 617 | " cons_feature_df = small_width_conservation.merge(large_width_conservation, how='outer',\n", 618 | " on=id_cols)\n", 619 | " return cons_feature_df" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "Getting conservation\n" 632 | ] 633 | }, 634 | { 635 | "name": "stderr", 636 | "output_type": "stream", 637 | "text": [ 638 | "100%|█████████████████████████████████████████| 200/200 [06:28<00:00, 1.94s/it]\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "conservation_df = targetdata.build_conservation_df(design_targ_df, n_jobs=max_n_jobs)\n", 644 | "conservation_features = get_conservation_features(design_targ_df, conservation_df,\n", 645 | " small_width=2, large_width=16,\n", 646 | " conservation_column='ranked_conservation',\n", 647 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n", 648 | " 'Target Transcript', 'Orientation'])\n", 649 | "merged_features = protein_domain_feature_df.merge(conservation_features, how='inner', on=['sgRNA Context Sequence',\n", 650 | " 'Target Cut Length',\n", 651 | " 'Target Transcript',\n", 652 | " 'Orientation'])\n", 653 | "smart_avg_cons = merged_features.loc[merged_features['Smart'].astype(bool), 'cons_32'].mean()\n", 654 | "non_smart_avg_cons = 
merged_features.loc[~merged_features['Smart'].astype(bool), 'cons_32'].mean()\n", 655 | "assert smart_avg_cons > non_smart_avg_cons" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "## Combining target features\n", 663 | "\n", 664 | "We'll combine, the position, amino acid and domain feature matrices into a single target feature matrix" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [ 673 | "# export\n", 674 | "def merge_feature_dfs(design_df,\n", 675 | " aa_subseq_df, aa_features=None,\n", 676 | " domain_df=None,\n", 677 | " conservation_df=None,\n", 678 | " id_cols=None):\n", 679 | " if id_cols is None:\n", 680 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length',\n", 681 | " 'Target Transcript', 'Orientation']\n", 682 | " if aa_features is None:\n", 683 | " aa_features = ['Pos. Ind. 1mer',\n", 684 | " 'Hydrophobicity', 'Aromaticity',\n", 685 | " 'Isoelectric Point', 'Secondary Structure']\n", 686 | " if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]:\n", 687 | " raise ValueError('id_cols must uniquely identify rows of the design dataframe')\n", 688 | " feature_df_dict = dict()\n", 689 | " feature_list = list()\n", 690 | " position_feature_df = get_position_features(design_df, id_cols=id_cols)\n", 691 | " feature_df_dict['position'] = position_feature_df\n", 692 | " feature_list.extend(['Target Cut %', 'sense'])\n", 693 | " if domain_df is not None:\n", 694 | " feature_df_dict['domain'] = domain_df\n", 695 | " feature_list.extend(['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n", 696 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n", 697 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'])\n", 698 | " if conservation_df is not None:\n", 699 | " feature_df_dict['conservation'] = conservation_df\n", 700 | " # hardcoded\n", 701 | " feature_list.extend(['cons_4', 'cons_32'])\n", 702 | " aa_feature_df = get_amino_acid_features(aa_subseq_df, aa_features, id_cols)\n", 703 | " feature_list.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',\n", 704 | " 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*',\n", 705 | " 'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn',\n", 706 | " 'Sheet'])\n", 707 | " feature_df_dict['aa'] = aa_feature_df\n", 708 | " feature_df = design_df[id_cols]\n", 709 | " for key, df in feature_df_dict.items():\n", 710 | " feature_df = pd.merge(feature_df, df, how='left', on=id_cols)\n", 711 | " return feature_df, feature_list\n" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "metadata": {}, 718 | "outputs": [], 719 | "source": [ 720 | "feature_df, feature_list = merge_feature_dfs(design_df=design_df,\n", 721 | " aa_subseq_df=aa_subseq_df,\n", 722 | " domain_df=protein_domain_feature_df,\n", 723 | " conservation_df=conservation_features)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "assert feature_df[feature_list].shape[1] == len(feature_list)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": null, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [] 741 | } 742 | ], 743 | "metadata": { 744 | "kernelspec": { 745 | "display_name": "rs3_v2", 746 | "language": "python", 747 | "name": "rs3_v2" 748 | } 749 | }, 750 | "nbformat": 4, 751 | 
"nbformat_minor": 4 752 | } 753 | -------------------------------------------------------------------------------- /03_predicttarg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# default_exp predicttarg" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# predicttarg\n", 17 | "\n", 18 | "> Rule set 3 target-site predictions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# export\n", 28 | "from rs3 import targetfeat\n", 29 | "import joblib\n", 30 | "import os" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import lightgbm\n", 40 | "import pandas as pd\n", 41 | "from rs3 import targetdata\n", 42 | "from scipy import stats\n", 43 | "import numpy as np" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'\n", 53 | "import multiprocessing\n", 54 | "max_n_jobs = multiprocessing.cpu_count()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# export\n", 64 | "def load_target_model(lite=False):\n", 65 | " \"\"\"Load rule set 3 target model\"\"\"\n", 66 | " if lite:\n", 67 | " model_name = 'target_lite_model.pkl'\n", 68 | " else:\n", 69 | " model_name = 'target_model.pkl'\n", 70 | " model = joblib.load(os.path.join(os.path.dirname(__file__), model_name))\n", 71 | " return model" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stderr", 81 | "output_type": "stream", 82 | "text": [ 83 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 84 | " warnings.warn(\n", 85 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 86 | " warnings.warn(\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# export\n", 101 | "def predict_target(design_df, aa_subseq_df, domain_feature_df=None,\n", 102 | " conservation_feature_df=None, id_cols=None):\n", 103 | " \"\"\"Make predictions using the Rule Set 3 target model. 
Note that if the protein_domain_df\n", 104 | " or conservation_df are not supplied, then the lite model will be used, otherwise the full model is used.\n", 105 | "\n", 106 | " :param design_df: DataFrame\n", 107 | " :param aa_subseq_df: DataFrame\n", 108 | " :param domain_feature_df: DataFrame\n", 109 | " :param id_cols: list or str\n", 110 | " :return: list\n", 111 | " \"\"\"\n", 112 | " if (domain_feature_df is None) or (conservation_feature_df is None):\n", 113 | " lite = True\n", 114 | " domain_feature_df = None\n", 115 | " conservation_feature_df = None\n", 116 | " else:\n", 117 | " lite = False\n", 118 | " model = load_target_model(lite=lite)\n", 119 | " if id_cols is None:\n", 120 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']\n", 121 | " target_feature_df, target_feature_cols = targetfeat.merge_feature_dfs(design_df,\n", 122 | " aa_subseq_df=aa_subseq_df,\n", 123 | " domain_df=domain_feature_df,\n", 124 | " conservation_df=conservation_feature_df,\n", 125 | " id_cols=id_cols)\n", 126 | " X_target = target_feature_df[target_feature_cols]\n", 127 | " predictions = model.predict(X_target)\n", 128 | " return predictions" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n", 138 | "design_targ_df = targetfeat.add_target_columns(design_df)\n", 139 | "id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Getting amino acid sequences\n" 152 | ] 153 | }, 154 | { 155 | "name": "stderr", 156 | "output_type": "stream", 157 | "text": [ 158 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.12s/it]\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/html": [ 164 | "
\n", 165 | "\n", 178 | "\n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | "
Target TranscriptTarget Total LengthTranscript BaseversionmoleculedescidseqAA lenAA IndexOrientationsgRNA Context SequenceTarget Cut Lengthextended_seqAA 0-IndexedAA 0-Indexed paddedseq_startseq_endAA Subsequence
0ENST00000259457.8834ENST000002594573proteinNoneENSP00000259457MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...27764senseTGGAGCAGATACAAGAGCAACTGAAGGGAT191-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...63806496GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1ENST00000259457.8834ENST000002594573proteinNoneENSP00000259457MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...27746senseCCGGAAAACTGGCACGACCATCGCTGGGGT137-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...45624678AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2ENST00000394249.81863ENST000003942493proteinNoneENSP00000377793MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...620106senseTAGAAAAAGATTTGCGCACCCAAGTGGAAT316-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...105122106138EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3ENST00000394249.81863ENST000003942493proteinNoneENSP00000377793MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...620263antisenseTGGCCTTTGACCCAGACATAATGGTGGCCA787-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...262279263295WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4ENST00000361337.32298ENST000003613372proteinNoneENSP00000354522MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...765140antisenseAAATACTCACTCATCCTCATCTCGAGGTCT420-----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK...139156140172GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
............................................................
395ENST00000454402.71023ENST000004544022proteinNoneENSP00000408295METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...34074antisenseTGTCTTTATATAGCTGTTTCGCACAGGCTA220-----------------METSALKQQEQPAATKIRNLPWVEKYRPQ...739074106LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
396ENST00000254998.3423ENST000002549982proteinNoneENSP00000254998MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...14027senseTTGTCAATGTCTACTACACCACCATGGATA79-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...26432759DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
397ENST00000254998.3423ENST000002549982proteinNoneENSP00000254998MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...14039senseGGCGTTTGCTGTCCCGCCTGTACATGGGCA115-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...38553971VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
398ENST00000381685.102067ENST000003816855proteinNoneENSP00000371101MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...688259antisenseACTAGCAATGGCTTATCAGATCGAAGGTCA776-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...258275259291TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
399ENST00000381685.102067ENST000003816855proteinNoneENSP00000371101MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...688108senseAAATTTTGTCTGATGACTACTCAAAGGTAT322-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...107124108140CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ
\n", 448 | "

400 rows × 19 columns

\n", 449 | "
" 450 | ], 451 | "text/plain": [ 452 | " Target Transcript Target Total Length Transcript Base version \\\n", 453 | "0 ENST00000259457.8 834 ENST00000259457 3 \n", 454 | "1 ENST00000259457.8 834 ENST00000259457 3 \n", 455 | "2 ENST00000394249.8 1863 ENST00000394249 3 \n", 456 | "3 ENST00000394249.8 1863 ENST00000394249 3 \n", 457 | "4 ENST00000361337.3 2298 ENST00000361337 2 \n", 458 | ".. ... ... ... ... \n", 459 | "395 ENST00000454402.7 1023 ENST00000454402 2 \n", 460 | "396 ENST00000254998.3 423 ENST00000254998 2 \n", 461 | "397 ENST00000254998.3 423 ENST00000254998 2 \n", 462 | "398 ENST00000381685.10 2067 ENST00000381685 5 \n", 463 | "399 ENST00000381685.10 2067 ENST00000381685 5 \n", 464 | "\n", 465 | " molecule desc id \\\n", 466 | "0 protein None ENSP00000259457 \n", 467 | "1 protein None ENSP00000259457 \n", 468 | "2 protein None ENSP00000377793 \n", 469 | "3 protein None ENSP00000377793 \n", 470 | "4 protein None ENSP00000354522 \n", 471 | ".. ... ... ... \n", 472 | "395 protein None ENSP00000408295 \n", 473 | "396 protein None ENSP00000254998 \n", 474 | "397 protein None ENSP00000254998 \n", 475 | "398 protein None ENSP00000371101 \n", 476 | "399 protein None ENSP00000371101 \n", 477 | "\n", 478 | " seq AA len AA Index \\\n", 479 | "0 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 64 \n", 480 | "1 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 46 \n", 481 | "2 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 106 \n", 482 | "3 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 263 \n", 483 | "4 MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... 765 140 \n", 484 | ".. ... ... ... \n", 485 | "395 METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... 340 74 \n", 486 | "396 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 27 \n", 487 | "397 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 39 \n", 488 | "398 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 259 \n", 489 | "399 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 108 \n", 490 | "\n", 491 | " Orientation sgRNA Context Sequence Target Cut Length \\\n", 492 | "0 sense TGGAGCAGATACAAGAGCAACTGAAGGGAT 191 \n", 493 | "1 sense CCGGAAAACTGGCACGACCATCGCTGGGGT 137 \n", 494 | "2 sense TAGAAAAAGATTTGCGCACCCAAGTGGAAT 316 \n", 495 | "3 antisense TGGCCTTTGACCCAGACATAATGGTGGCCA 787 \n", 496 | "4 antisense AAATACTCACTCATCCTCATCTCGAGGTCT 420 \n", 497 | ".. ... ... ... \n", 498 | "395 antisense TGTCTTTATATAGCTGTTTCGCACAGGCTA 220 \n", 499 | "396 sense TTGTCAATGTCTACTACACCACCATGGATA 79 \n", 500 | "397 sense GGCGTTTGCTGTCCCGCCTGTACATGGGCA 115 \n", 501 | "398 antisense ACTAGCAATGGCTTATCAGATCGAAGGTCA 776 \n", 502 | "399 sense AAATTTTGTCTGATGACTACTCAAAGGTAT 322 \n", 503 | "\n", 504 | " extended_seq AA 0-Indexed \\\n", 505 | "0 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 63 \n", 506 | "1 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 45 \n", 507 | "2 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 105 \n", 508 | "3 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 262 \n", 509 | "4 -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... 139 \n", 510 | ".. ... ... \n", 511 | "395 -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... 73 \n", 512 | "396 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 26 \n", 513 | "397 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 38 \n", 514 | "398 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 258 \n", 515 | "399 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 
107 \n", 516 | "\n", 517 | " AA 0-Indexed padded seq_start seq_end \\\n", 518 | "0 80 64 96 \n", 519 | "1 62 46 78 \n", 520 | "2 122 106 138 \n", 521 | "3 279 263 295 \n", 522 | "4 156 140 172 \n", 523 | ".. ... ... ... \n", 524 | "395 90 74 106 \n", 525 | "396 43 27 59 \n", 526 | "397 55 39 71 \n", 527 | "398 275 259 291 \n", 528 | "399 124 108 140 \n", 529 | "\n", 530 | " AA Subsequence \n", 531 | "0 GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI \n", 532 | "1 AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR \n", 533 | "2 EEGETTILQLEKDLRTQVELMRKQKKERKQELK \n", 534 | "3 WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV \n", 535 | "4 GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED \n", 536 | ".. ... \n", 537 | "395 LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL \n", 538 | "396 DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA \n", 539 | "397 VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ \n", 540 | "398 TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI \n", 541 | "399 CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ \n", 542 | "\n", 543 | "[400 rows x 19 columns]" 544 | ] 545 | }, 546 | "execution_count": null, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "## aa sequences\n", 553 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)\n", 554 | "aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n", 555 | " id_cols=id_cols)\n", 556 | "aa_subseq_df" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "name": "stdout", 566 | "output_type": "stream", 567 | "text": [ 568 | "Getting protein domains\n" 569 | ] 570 | }, 571 | { 572 | "name": "stderr", 573 | "output_type": "stream", 574 | "text": [ 575 | "100%|█████████████████████████████████████████| 200/200 [00:53<00:00, 3.75it/s]\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "## domains\n", 581 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n", 582 | "domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,\n", 583 | " id_cols=id_cols)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "Getting conservation\n" 596 | ] 597 | }, 598 | { 599 | "name": "stderr", 600 | "output_type": "stream", 601 | "text": [ 602 | "100%|█████████████████████████████████████████| 200/200 [06:24<00:00, 1.92s/it]\n" 603 | ] 604 | }, 605 | { 606 | "data": { 607 | "text/html": [ 608 | "
\n", 609 | "\n", 622 | "\n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | "
sgRNA Context SequenceTarget Cut LengthTarget TranscriptOrientationcons_4cons_32
0AAAAGAATGATGAAAAGACACCACAGGGAG244ENST00000610426.5sense0.2182310.408844
1AAAAGAGCCATGAATCTAAACATCAGGAAT640ENST00000223073.6sense0.1298250.278180
2AAAAGCGCCAAATGGCCCGAGAATTGGGAG709ENST00000331923.9sense0.4709060.532305
3AAACAGAAAAAGTTAAAATCACCAAGGTGT496ENST00000283882.4sense0.5805560.602708
4AAACAGATGGAAGATGCTTACCGGGGGACC132ENST00000393047.8sense0.2834470.414293
.....................
395TTTGATTGCATTAAGGTTGGACTCTGGATT246ENST00000249269.9sense0.5806120.618707
396TTTGCCCACAGCTCCAAAGCATCGCGGAGA130ENST00000227618.8sense0.3237700.416368
397TTTTACAGTGCGATGTATGATGTATGGCTT119ENST00000338366.6sense0.7880000.537417
398TTTTGGATCTCGTAGTGATTCAAGAGGGAA233ENST00000629496.3sense0.2396300.347615
399TTTTTGTTACTACAGGTTCGCTGCTGGGAA201ENST00000395840.6sense0.6937670.639044
\n", 736 | "

400 rows × 6 columns

\n", 737 | "
" 738 | ], 739 | "text/plain": [ 740 | " sgRNA Context Sequence Target Cut Length Target Transcript \\\n", 741 | "0 AAAAGAATGATGAAAAGACACCACAGGGAG 244 ENST00000610426.5 \n", 742 | "1 AAAAGAGCCATGAATCTAAACATCAGGAAT 640 ENST00000223073.6 \n", 743 | "2 AAAAGCGCCAAATGGCCCGAGAATTGGGAG 709 ENST00000331923.9 \n", 744 | "3 AAACAGAAAAAGTTAAAATCACCAAGGTGT 496 ENST00000283882.4 \n", 745 | "4 AAACAGATGGAAGATGCTTACCGGGGGACC 132 ENST00000393047.8 \n", 746 | ".. ... ... ... \n", 747 | "395 TTTGATTGCATTAAGGTTGGACTCTGGATT 246 ENST00000249269.9 \n", 748 | "396 TTTGCCCACAGCTCCAAAGCATCGCGGAGA 130 ENST00000227618.8 \n", 749 | "397 TTTTACAGTGCGATGTATGATGTATGGCTT 119 ENST00000338366.6 \n", 750 | "398 TTTTGGATCTCGTAGTGATTCAAGAGGGAA 233 ENST00000629496.3 \n", 751 | "399 TTTTTGTTACTACAGGTTCGCTGCTGGGAA 201 ENST00000395840.6 \n", 752 | "\n", 753 | " Orientation cons_4 cons_32 \n", 754 | "0 sense 0.218231 0.408844 \n", 755 | "1 sense 0.129825 0.278180 \n", 756 | "2 sense 0.470906 0.532305 \n", 757 | "3 sense 0.580556 0.602708 \n", 758 | "4 sense 0.283447 0.414293 \n", 759 | ".. ... ... ... \n", 760 | "395 sense 0.580612 0.618707 \n", 761 | "396 sense 0.323770 0.416368 \n", 762 | "397 sense 0.788000 0.537417 \n", 763 | "398 sense 0.239630 0.347615 \n", 764 | "399 sense 0.693767 0.639044 \n", 765 | "\n", 766 | "[400 rows x 6 columns]" 767 | ] 768 | }, 769 | "execution_count": null, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "## conservation\n", 776 | "conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)\n", 777 | "conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,\n", 778 | " small_width=2, large_width=16,\n", 779 | " conservation_column='ranked_conservation',\n", 780 | " id_cols=id_cols)\n", 781 | "conservation_feature_df" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "metadata": {}, 788 | "outputs": [ 789 | { 790 | "name": "stderr", 791 | "output_type": "stream", 792 | "text": [ 793 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 794 | " warnings.warn(\n", 795 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 796 | " warnings.warn(\n" 797 | ] 798 | } 799 | ], 800 | "source": [ 801 | "predictions = predict_target(design_df=design_df,\n", 802 | " aa_subseq_df=aa_subseq_df,\n", 803 | " domain_feature_df=domain_feature_df,\n", 804 | " conservation_feature_df=conservation_feature_df)\n", 805 | "design_df['Target Score'] = predictions" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 814 | "name": "stderr", 815 | "output_type": "stream", 816 | "text": [ 817 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. 
Use at your own risk.\n", 818 | " warnings.warn(\n", 819 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n", 820 | " warnings.warn(\n" 821 | ] 822 | } 823 | ], 824 | "source": [ 825 | "lite_predictions = predict_target(design_df=design_df,\n", 826 | " aa_subseq_df=aa_subseq_df)\n", 827 | "design_df['Target Score Lite'] = lite_predictions" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "data": { 837 | "text/plain": [ 838 | "0 TGGAGCAGATACAAGAGCAACTGAAGGGAT\n", 839 | "1 CCGGAAAACTGGCACGACCATCGCTGGGGT\n", 840 | "2 TAGAAAAAGATTTGCGCACCCAAGTGGAAT\n", 841 | "3 TGGCCTTTGACCCAGACATAATGGTGGCCA\n", 842 | "4 AAATACTCACTCATCCTCATCTCGAGGTCT\n", 843 | " ... \n", 844 | "395 TGTCTTTATATAGCTGTTTCGCACAGGCTA\n", 845 | "396 TTGTCAATGTCTACTACACCACCATGGATA\n", 846 | "397 GGCGTTTGCTGTCCCGCCTGTACATGGGCA\n", 847 | "398 ACTAGCAATGGCTTATCAGATCGAAGGTCA\n", 848 | "399 AAATTTTGTCTGATGACTACTCAAAGGTAT\n", 849 | "Name: sgRNA Context Sequence, Length: 400, dtype: object" 850 | ] 851 | }, 852 | "execution_count": null, 853 | "metadata": {}, 854 | "output_type": "execute_result" 855 | } 856 | ], 857 | "source": [ 858 | "design_df['sgRNA Context Sequence']" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')\n", 877 | "gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')\n", 878 | "\n", 879 | "sanger_designs = sanger_df.merge(design_df, how='inner',\n", 880 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n", 881 | " 'Target Cut %'])\n", 882 | "gecko_designs = gecko_df.merge(design_df, how='inner',\n", 883 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n", 884 | " 'Target Cut %'])\n", 885 | "assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],\n", 886 | " sanger_designs['Target Score'])[0] > 0.2\n", 887 | "assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],\n", 888 | " gecko_designs['Target Score'])[0] > 0.05" 889 | ] 890 | } 891 | ], 892 | "metadata": { 893 | "kernelspec": { 894 | "display_name": "rs3_v2", 895 | "language": "python", 896 | "name": "rs3_v2" 897 | } 898 | }, 899 | "nbformat": 4, 900 | "nbformat_minor": 4 901 | } 902 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | ## How to get started 4 | 5 | Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it: 6 | ``` 7 | nbdev_install_git_hooks 8 | ``` 9 | 10 | ## Did you find a bug? 11 | 12 | * Ensure the bug was not already reported by searching on GitHub under Issues. 
13 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring. 14 | * Be sure to add the complete error messages. 15 | 16 | #### Did you write a patch that fixes a bug? 17 | 18 | * Open a new GitHub pull request with the patch. 19 | * Ensure that your PR includes a test that fails without your patch, and passes with it. 20 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 21 | 22 | ## PR submission guidelines 23 | 24 | * Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused. 25 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs, and they will most likely get rejected. 26 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can. 27 | * Do not turn an already submitted PR into your development playground. If, after you have submitted a PR, you discover that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project. 28 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exceptional case where you realize it will take many commits to complete the requests, it's probably best to close the PR, do the work, and then submit it again. Use common sense to choose one way over the other. 29 | 30 | ## Do you want to contribute to the documentation? 31 | 32 | * Docs are automatically created from the notebooks in the nbs folder. 33 | 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | include rs3/RuleSet3.pkl 6 | include rs3/target_model.pkl 7 | include rs3/target_lite_model.pkl 8 | recursive-exclude * __pycache__ 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .ONESHELL: 2 | SHELL := /bin/bash 3 | SRC = $(wildcard ./*.ipynb) 4 | 5 | all: rs3 docs 6 | 7 | rs3: $(SRC) 8 | nbdev_build_lib 9 | touch rs3 10 | 11 | sync: 12 | nbdev_update_lib 13 | 14 | docs_serve: docs 15 | cd docs && bundle exec jekyll serve 16 | 17 | docs: $(SRC) 18 | nbdev_build_docs 19 | touch docs 20 | 21 | test: 22 | nbdev_test_nbs 23 | 24 | release: pypi conda_release 25 | nbdev_bump_version 26 | 27 | conda_release: 28 | fastrelease_conda_package 29 | 30 | pypi: dist 31 | twine upload --repository pypi dist/* 32 | 33 | dist: clean 34 | python setup.py sdist bdist_wheel 35 | 36 | clean: 37 | rm -rf dist -------------------------------------------------------------------------------- /RuleSet3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/RuleSet3.pkl -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | fastai: &fastai 4 | restart: unless-stopped 5 | working_dir: /data 6 | image: fastai/codespaces 7 | logging: 8 | driver: json-file 9 | options: 10 | max-size: 50m 11 | stdin_open: true 12 | tty: true 13 | volumes: 14 | - .:/data/ 15 | 16 | notebook: 17 | <<: *fastai 18 | command: bash -c "pip install -e . && jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''" 19 | ports: 20 | - "8080:8080" 21 | 22 | watcher: 23 | <<: *fastai 24 | command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop 25 | network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/ 26 | 27 | jekyll: 28 | <<: *fastai 29 | ports: 30 | - "4000:4000" 31 | command: > 32 | bash -c "pip install . 33 | && nbdev_build_docs && cd docs 34 | && bundle i 35 | && chmod -R u+rwx . 
&& bundle exec jekyll serve --host 0.0.0.0" 36 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site/ 2 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem 'github-pages', group: :jekyll_plugins 4 | 5 | # Added at 2019-11-25 10:11:40 -0800 by jhoward: 6 | gem "nokogiri", "< 1.11.1" 7 | gem "jekyll", ">= 3.7" 8 | gem "kramdown", ">= 2.3.1" 9 | gem "jekyll-remote-theme" 10 | -------------------------------------------------------------------------------- /docs/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (6.0.3.6) 5 | concurrent-ruby (~> 1.0, >= 1.0.2) 6 | i18n (>= 0.7, < 2) 7 | minitest (~> 5.1) 8 | tzinfo (~> 1.1) 9 | zeitwerk (~> 2.2, >= 2.2.2) 10 | addressable (2.7.0) 11 | public_suffix (>= 2.0.2, < 5.0) 12 | coffee-script (2.4.1) 13 | coffee-script-source 14 | execjs 15 | coffee-script-source (1.11.1) 16 | colorator (1.1.0) 17 | commonmarker (0.17.13) 18 | ruby-enum (~> 0.5) 19 | concurrent-ruby (1.1.8) 20 | dnsruby (1.61.5) 21 | simpleidn (~> 0.1) 22 | em-websocket (0.5.2) 23 | eventmachine (>= 0.12.9) 24 | http_parser.rb (~> 0.6.0) 25 | ethon (0.12.0) 26 | ffi (>= 1.3.0) 27 | eventmachine (1.2.7) 28 | execjs (2.7.0) 29 | faraday (1.3.0) 30 | faraday-net_http (~> 1.0) 31 | multipart-post (>= 1.2, < 3) 32 | ruby2_keywords 33 | faraday-net_http (1.0.1) 34 | ffi (1.15.0) 35 | forwardable-extended (2.6.0) 36 | gemoji (3.0.1) 37 | github-pages (214) 38 | github-pages-health-check (= 1.17.0) 39 | jekyll (= 3.9.0) 40 | jekyll-avatar (= 0.7.0) 41 | jekyll-coffeescript (= 1.1.1) 42 | jekyll-commonmark-ghpages (= 0.1.6) 43 | jekyll-default-layout (= 0.1.4) 44 | jekyll-feed (= 0.15.1) 45 | jekyll-gist (= 1.5.0) 46 | jekyll-github-metadata (= 2.13.0) 47 | jekyll-mentions (= 1.6.0) 48 | jekyll-optional-front-matter (= 0.3.2) 49 | jekyll-paginate (= 1.1.0) 50 | jekyll-readme-index (= 0.3.0) 51 | jekyll-redirect-from (= 0.16.0) 52 | jekyll-relative-links (= 0.6.1) 53 | jekyll-remote-theme (= 0.4.3) 54 | jekyll-sass-converter (= 1.5.2) 55 | jekyll-seo-tag (= 2.7.1) 56 | jekyll-sitemap (= 1.4.0) 57 | jekyll-swiss (= 1.0.0) 58 | jekyll-theme-architect (= 0.1.1) 59 | jekyll-theme-cayman (= 0.1.1) 60 | jekyll-theme-dinky (= 0.1.1) 61 | jekyll-theme-hacker (= 0.1.2) 62 | jekyll-theme-leap-day (= 0.1.1) 63 | jekyll-theme-merlot (= 0.1.1) 64 | jekyll-theme-midnight (= 0.1.1) 65 | jekyll-theme-minimal (= 0.1.1) 66 | jekyll-theme-modernist (= 0.1.1) 67 | jekyll-theme-primer (= 0.5.4) 68 | jekyll-theme-slate (= 0.1.1) 69 | jekyll-theme-tactile (= 0.1.1) 70 | jekyll-theme-time-machine (= 0.1.1) 71 | jekyll-titles-from-headings (= 0.5.3) 72 | jemoji (= 0.12.0) 73 | kramdown (= 2.3.1) 74 | kramdown-parser-gfm (= 1.1.0) 75 | liquid (= 4.0.3) 76 | mercenary (~> 0.3) 77 | minima (= 2.5.1) 78 | nokogiri (>= 1.10.4, < 2.0) 79 | rouge (= 3.26.0) 80 | terminal-table (~> 1.4) 81 | github-pages-health-check (1.17.0) 82 | addressable (~> 2.3) 83 | dnsruby (~> 1.60) 84 | octokit (~> 4.0) 85 | public_suffix (>= 2.0.2, < 5.0) 86 | typhoeus (~> 1.3) 87 | html-pipeline (2.14.0) 88 | activesupport (>= 2) 89 | nokogiri (>= 1.4) 90 | http_parser.rb (0.6.0) 91 | i18n (0.9.5) 92 | concurrent-ruby (~> 1.0) 93 | 
jekyll (3.9.0) 94 | addressable (~> 2.4) 95 | colorator (~> 1.0) 96 | em-websocket (~> 0.5) 97 | i18n (~> 0.7) 98 | jekyll-sass-converter (~> 1.0) 99 | jekyll-watch (~> 2.0) 100 | kramdown (>= 1.17, < 3) 101 | liquid (~> 4.0) 102 | mercenary (~> 0.3.3) 103 | pathutil (~> 0.9) 104 | rouge (>= 1.7, < 4) 105 | safe_yaml (~> 1.0) 106 | jekyll-avatar (0.7.0) 107 | jekyll (>= 3.0, < 5.0) 108 | jekyll-coffeescript (1.1.1) 109 | coffee-script (~> 2.2) 110 | coffee-script-source (~> 1.11.1) 111 | jekyll-commonmark (1.3.1) 112 | commonmarker (~> 0.14) 113 | jekyll (>= 3.7, < 5.0) 114 | jekyll-commonmark-ghpages (0.1.6) 115 | commonmarker (~> 0.17.6) 116 | jekyll-commonmark (~> 1.2) 117 | rouge (>= 2.0, < 4.0) 118 | jekyll-default-layout (0.1.4) 119 | jekyll (~> 3.0) 120 | jekyll-feed (0.15.1) 121 | jekyll (>= 3.7, < 5.0) 122 | jekyll-gist (1.5.0) 123 | octokit (~> 4.2) 124 | jekyll-github-metadata (2.13.0) 125 | jekyll (>= 3.4, < 5.0) 126 | octokit (~> 4.0, != 4.4.0) 127 | jekyll-mentions (1.6.0) 128 | html-pipeline (~> 2.3) 129 | jekyll (>= 3.7, < 5.0) 130 | jekyll-optional-front-matter (0.3.2) 131 | jekyll (>= 3.0, < 5.0) 132 | jekyll-paginate (1.1.0) 133 | jekyll-readme-index (0.3.0) 134 | jekyll (>= 3.0, < 5.0) 135 | jekyll-redirect-from (0.16.0) 136 | jekyll (>= 3.3, < 5.0) 137 | jekyll-relative-links (0.6.1) 138 | jekyll (>= 3.3, < 5.0) 139 | jekyll-remote-theme (0.4.3) 140 | addressable (~> 2.0) 141 | jekyll (>= 3.5, < 5.0) 142 | jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) 143 | rubyzip (>= 1.3.0, < 3.0) 144 | jekyll-sass-converter (1.5.2) 145 | sass (~> 3.4) 146 | jekyll-seo-tag (2.7.1) 147 | jekyll (>= 3.8, < 5.0) 148 | jekyll-sitemap (1.4.0) 149 | jekyll (>= 3.7, < 5.0) 150 | jekyll-swiss (1.0.0) 151 | jekyll-theme-architect (0.1.1) 152 | jekyll (~> 3.5) 153 | jekyll-seo-tag (~> 2.0) 154 | jekyll-theme-cayman (0.1.1) 155 | jekyll (~> 3.5) 156 | jekyll-seo-tag (~> 2.0) 157 | jekyll-theme-dinky (0.1.1) 158 | jekyll (~> 3.5) 159 | jekyll-seo-tag (~> 2.0) 160 | jekyll-theme-hacker (0.1.2) 161 | jekyll (> 3.5, < 5.0) 162 | jekyll-seo-tag (~> 2.0) 163 | jekyll-theme-leap-day (0.1.1) 164 | jekyll (~> 3.5) 165 | jekyll-seo-tag (~> 2.0) 166 | jekyll-theme-merlot (0.1.1) 167 | jekyll (~> 3.5) 168 | jekyll-seo-tag (~> 2.0) 169 | jekyll-theme-midnight (0.1.1) 170 | jekyll (~> 3.5) 171 | jekyll-seo-tag (~> 2.0) 172 | jekyll-theme-minimal (0.1.1) 173 | jekyll (~> 3.5) 174 | jekyll-seo-tag (~> 2.0) 175 | jekyll-theme-modernist (0.1.1) 176 | jekyll (~> 3.5) 177 | jekyll-seo-tag (~> 2.0) 178 | jekyll-theme-primer (0.5.4) 179 | jekyll (> 3.5, < 5.0) 180 | jekyll-github-metadata (~> 2.9) 181 | jekyll-seo-tag (~> 2.0) 182 | jekyll-theme-slate (0.1.1) 183 | jekyll (~> 3.5) 184 | jekyll-seo-tag (~> 2.0) 185 | jekyll-theme-tactile (0.1.1) 186 | jekyll (~> 3.5) 187 | jekyll-seo-tag (~> 2.0) 188 | jekyll-theme-time-machine (0.1.1) 189 | jekyll (~> 3.5) 190 | jekyll-seo-tag (~> 2.0) 191 | jekyll-titles-from-headings (0.5.3) 192 | jekyll (>= 3.3, < 5.0) 193 | jekyll-watch (2.2.1) 194 | listen (~> 3.0) 195 | jemoji (0.12.0) 196 | gemoji (~> 3.0) 197 | html-pipeline (~> 2.2) 198 | jekyll (>= 3.0, < 5.0) 199 | kramdown (2.3.1) 200 | rexml 201 | kramdown-parser-gfm (1.1.0) 202 | kramdown (~> 2.0) 203 | liquid (4.0.3) 204 | listen (3.5.1) 205 | rb-fsevent (~> 0.10, >= 0.10.3) 206 | rb-inotify (~> 0.9, >= 0.9.10) 207 | mercenary (0.3.6) 208 | mini_portile2 (2.5.0) 209 | minima (2.5.1) 210 | jekyll (>= 3.5, < 5.0) 211 | jekyll-feed (~> 0.9) 212 | jekyll-seo-tag (~> 2.1) 213 | minitest (5.14.4) 214 | multipart-post 
(2.1.1) 215 | nokogiri (1.11.0) 216 | mini_portile2 (~> 2.5.0) 217 | racc (~> 1.4) 218 | octokit (4.20.0) 219 | faraday (>= 0.9) 220 | sawyer (~> 0.8.0, >= 0.5.3) 221 | pathutil (0.16.2) 222 | forwardable-extended (~> 2.6) 223 | public_suffix (4.0.6) 224 | racc (1.5.2) 225 | rb-fsevent (0.10.4) 226 | rb-inotify (0.10.1) 227 | ffi (~> 1.0) 228 | rexml (3.2.5) 229 | rouge (3.26.0) 230 | ruby-enum (0.9.0) 231 | i18n 232 | ruby2_keywords (0.0.4) 233 | rubyzip (2.3.0) 234 | safe_yaml (1.0.5) 235 | sass (3.7.4) 236 | sass-listen (~> 4.0.0) 237 | sass-listen (4.0.0) 238 | rb-fsevent (~> 0.9, >= 0.9.4) 239 | rb-inotify (~> 0.9, >= 0.9.7) 240 | sawyer (0.8.2) 241 | addressable (>= 2.3.5) 242 | faraday (> 0.8, < 2.0) 243 | simpleidn (0.2.1) 244 | unf (~> 0.1.4) 245 | terminal-table (1.8.0) 246 | unicode-display_width (~> 1.1, >= 1.1.1) 247 | thread_safe (0.3.6) 248 | typhoeus (1.4.0) 249 | ethon (>= 0.9.0) 250 | tzinfo (1.2.9) 251 | thread_safe (~> 0.1) 252 | unf (0.1.4) 253 | unf_ext 254 | unf_ext (0.0.7.7) 255 | unicode-display_width (1.7.0) 256 | zeitwerk (2.4.2) 257 | 258 | PLATFORMS 259 | ruby 260 | 261 | DEPENDENCIES 262 | github-pages 263 | jekyll (>= 3.7) 264 | jekyll-remote-theme 265 | kramdown (>= 2.3.1) 266 | nokogiri (< 1.11.1) 267 | 268 | BUNDLED WITH 269 | 2.1.4 270 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | repository: gpp-rnd/rs3 2 | output: web 3 | topnav_title: rs3 4 | site_title: rs3 5 | company_name: Genetic Perturbation Platform, Broad Institute 6 | description: Predict the activity of CRISPR sgRNAs 7 | # Set to false to disable KaTeX math 8 | use_math: true 9 | # Add Google analytics id if you have one and want to use it here 10 | google_analytics: 11 | # See http://nbdev.fast.ai/search for help with adding Search 12 | google_search: 13 | 14 | host: 127.0.0.1 15 | # the preview server used. Leave as is. 16 | port: 4000 17 | # the port where the preview is rendered. 18 | 19 | exclude: 20 | - .idea/ 21 | - .gitignore 22 | - vendor 23 | 24 | exclude: [vendor] 25 | 26 | highlighter: rouge 27 | markdown: kramdown 28 | kramdown: 29 | input: GFM 30 | auto_ids: true 31 | hard_wrap: false 32 | syntax_highlighter: rouge 33 | 34 | collections: 35 | tooltips: 36 | output: false 37 | 38 | defaults: 39 | - 40 | scope: 41 | path: "" 42 | type: "pages" 43 | values: 44 | layout: "page" 45 | comments: true 46 | search: true 47 | sidebar: home_sidebar 48 | topnav: topnav 49 | - 50 | scope: 51 | path: "" 52 | type: "tooltips" 53 | values: 54 | layout: "page" 55 | comments: true 56 | search: true 57 | tooltip: true 58 | 59 | sidebars: 60 | - home_sidebar 61 | 62 | plugins: 63 | - jekyll-remote-theme 64 | 65 | remote_theme: fastai/nbdev-jekyll-theme 66 | baseurl: /rs3/ -------------------------------------------------------------------------------- /docs/_data/sidebars/home_sidebar.yml: -------------------------------------------------------------------------------- 1 | 2 | ################################################# 3 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! 
### 4 | ################################################# 5 | # Instead edit ../../sidebar.json 6 | entries: 7 | - folders: 8 | - folderitems: 9 | - output: web,pdf 10 | title: Overview 11 | url: / 12 | - output: web,pdf 13 | title: seq 14 | url: seq.html 15 | - output: web,pdf 16 | title: targetdata 17 | url: targetdata.html 18 | - output: web,pdf 19 | title: targetfeat 20 | url: targetfeat.html 21 | - output: web,pdf 22 | title: predicttarg 23 | url: predicttarg.html 24 | - output: web,pdf 25 | title: predict 26 | url: predict.html 27 | output: web 28 | title: rs3 29 | output: web 30 | title: Sidebar 31 | -------------------------------------------------------------------------------- /docs/_data/topnav.yml: -------------------------------------------------------------------------------- 1 | topnav: 2 | - title: Topnav 3 | items: 4 | - title: github 5 | external_url: https://github.com/gpp-rnd/rs3/tree/master/ 6 | 7 | #Topnav dropdowns 8 | topnav_dropdowns: 9 | - title: Topnav dropdowns 10 | folders: -------------------------------------------------------------------------------- /docs/feed.xml: -------------------------------------------------------------------------------- 1 | --- 2 | search: exclude 3 | layout: none 4 | --- 5 | 6 | 7 | 8 | 9 | {{ site.title | xml_escape }} 10 | {{ site.description | xml_escape }} 11 | {{ site.url }}/ 12 | 13 | {{ site.time | date_to_rfc822 }} 14 | {{ site.time | date_to_rfc822 }} 15 | Jekyll v{{ jekyll.version }} 16 | {% for post in site.posts limit:10 %} 17 | 18 | {{ post.title | xml_escape }} 19 | {{ post.content | xml_escape }} 20 | {{ post.date | date_to_rfc822 }} 21 | {{ post.url | prepend: site.url }} 22 | {{ post.url | prepend: site.url }} 23 | {% for tag in post.tags %} 24 | {{ tag | xml_escape }} 25 | {% endfor %} 26 | {% for tag in page.tags %} 27 | {{ cat | xml_escape }} 28 | {% endfor %} 29 | 30 | {% endfor %} 31 | 32 | 33 | -------------------------------------------------------------------------------- /docs/images/output_18_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_18_0.png -------------------------------------------------------------------------------- /docs/images/output_42_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_42_0.png -------------------------------------------------------------------------------- /docs/predicttarg.html: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | title: predicttarg 4 | 5 | 6 | keywords: fastai 7 | sidebar: home_sidebar 8 | 9 | summary: "Rule set 3 target-site predictions" 10 | description: "Rule set 3 target-site predictions" 11 | nb_path: "03_predicttarg.ipynb" 12 | --- 13 | 22 | 23 |
24 | 25 | {% raw %} 26 | 27 |
28 | 29 |
30 | {% endraw %} 31 | 32 | {% raw %} 33 | 34 |
35 | 36 |
37 | {% endraw %} 38 | 39 | {% raw %} 40 | 41 |
42 |
43 | 44 |
45 |
46 |
import lightgbm
 47 | import pandas as pd
 48 | from rs3 import targetdata
 49 | from scipy import stats
 50 | import numpy as np
 51 | 
52 | 53 |
54 |
55 |
56 | 57 |
58 | {% endraw %} 59 | 60 | {% raw %} 61 | 62 |
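The import cell above leaves out a few names that later cells on this page rely on. A minimal sketch of the extra setup needed to run the page standalone; the grouping into one hidden setup cell is an assumption, but each import below points at where the name is actually defined in this repository:

import os  # used in the next cell to set __file__
from rs3 import targetfeat  # add_target_columns, get_aa_subseq_df, and the feature builders used below
from rs3.predicttarg import load_target_model, predict_target  # the functions documented on this page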
63 |
64 | 65 |
66 |
67 |
__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'
 68 | import multiprocessing
 69 | max_n_jobs = multiprocessing.cpu_count()
 70 | 
71 | 72 |
73 |
74 |
75 | 76 |
77 | {% endraw %} 78 | 79 | {% raw %} 80 | 81 |
82 | 83 |
84 |
85 | 86 |
87 | 88 | 89 |
90 |

load_target_model[source]

load_target_model(lite=False)

91 |
92 |

Load rule set 3 target model

93 | 94 |
95 | 96 |
97 | 98 |
99 |
100 | 101 |
102 | {% endraw %} 103 | 104 | {% raw %} 105 | 106 |
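As a quick, hedged illustration of the API above (a sketch, not an output cell from the notebook): the returned object is a pickled scikit-learn pipeline whose final step is a LightGBM regressor, and lite=True selects the smaller model shipped with the package.

full_model = load_target_model()           # loads rs3/target_model.pkl
lite_model = load_target_model(lite=True)  # loads rs3/target_lite_model.pkl
full_model['regressor']                    # LGBMRegressor, as the assertion below checks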
107 | 108 |
109 | {% endraw %} 110 | 111 | {% raw %} 112 | 113 |
114 |
115 | 116 |
117 |
118 |
assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor
119 | 
120 | 121 |
122 |
123 |
124 | 125 |
126 |
127 | 128 |
129 | 130 |
131 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
132 |   warnings.warn(
133 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
134 |   warnings.warn(
135 | 
136 |
137 |
138 | 139 |
140 |
141 | 142 |
143 | {% endraw %} 144 | 145 | {% raw %} 146 | 147 |
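The UserWarning output above is expected: the pickled pipeline was trained under scikit-learn 1.0.dev0 but is unpickled here with 0.24.2. If the warnings clutter your logs, they can be filtered with the standard library (a use-at-your-own-risk convenience, mirroring the warning's own caveat):

import warnings
warnings.filterwarnings('ignore', message='Trying to unpickle estimator')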
148 | 149 |
150 |
151 | 152 |
153 | 154 | 155 |
156 |

predict_target[source]

predict_target(design_df, aa_subseq_df, domain_feature_df=None, conservation_feature_df=None, id_cols=None)

157 |
158 |

Make predictions using the Rule Set 3 target model. Note that if domain_feature_df 159 | or conservation_feature_df is not supplied, then the lite model will be used; otherwise the full model is used.

160 |

:param design_df: DataFrame 161 | :param aa_subseq_df: DataFrame 162 | :param domain_feature_df: DataFrame 163 | :param id_cols: list or str 164 | :return: list

165 | 166 |
167 | 168 |
169 | 170 |
171 |
172 | 173 |
174 | {% endraw %} 175 | 176 | {% raw %} 177 | 178 |
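The rest of this page builds each of these inputs from a design file. Schematically, the two calling patterns look like this (a sketch reusing the variables constructed in the cells that follow):

# Full model: both optional feature frames supplied
target_scores = predict_target(design_df=design_df,
                               aa_subseq_df=aa_subseq_df,
                               domain_feature_df=domain_feature_df,
                               conservation_feature_df=conservation_feature_df)

# Lite model: omit either optional frame and the lite model is used instead
lite_scores = predict_target(design_df=design_df, aa_subseq_df=aa_subseq_df)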
179 | 180 |
181 | {% endraw %} 182 | 183 | {% raw %} 184 | 185 |
186 |
187 | 188 |
189 |
190 |
design_df = pd.read_table('test_data/sgrna-designs.txt')
191 | design_targ_df = targetfeat.add_target_columns(design_df)
192 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
193 | 
194 | 195 |
196 |
197 |
198 | 199 |
200 | {% endraw %} 201 | 202 | {% raw %} 203 | 204 |
205 |
206 | 207 |
208 |
209 |
aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)
210 | aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,
211 |                                            id_cols=id_cols)
212 | aa_subseq_df
213 | 
214 | 215 |
216 |
217 |
218 | 219 |
220 |
221 | 222 |
223 | 224 |
225 |
Getting amino acid sequences
226 | 
227 |
228 |
229 | 230 |
231 | 232 |
233 |
100%|██████████| 4/4 [00:04<00:00,  1.04s/it]
234 | 
235 |
236 |
237 | 238 |
239 | 240 | 241 |
242 |
243 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 |
Target TranscriptTarget Total LengthTranscript BasedescmoleculeseqidversionAA lensgRNA Context SequenceAA IndexTarget Cut LengthOrientationextended_seqAA 0-IndexedAA 0-Indexed paddedseq_startseq_endAA Subsequence
0ENST00000259457.8834ENST00000259457NoneproteinMAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...ENSP000002594573277TGGAGCAGATACAAGAGCAACTGAAGGGAT64191sense-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...63806496GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
1ENST00000259457.8834ENST00000259457NoneproteinMAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...ENSP000002594573277CCGGAAAACTGGCACGACCATCGCTGGGGT46137sense-----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...45624678AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
2ENST00000394249.81863ENST00000394249NoneproteinMRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...ENSP000003777933620TAGAAAAAGATTTGCGCACCCAAGTGGAAT106316sense-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...105122106138EEGETTILQLEKDLRTQVELMRKQKKERKQELK
3ENST00000394249.81863ENST00000394249NoneproteinMRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...ENSP000003777933620TGGCCTTTGACCCAGACATAATGGTGGCCA263787antisense-----------------MRRSEVLAEESIVCLQKALNHLREIWELI...262279263295WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
4ENST00000361337.32298ENST00000361337NoneproteinMSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...ENSP000003545222765AAATACTCACTCATCCTCATCTCGAGGTCT140420antisense-----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK...139156140172GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
............................................................
395ENST00000454402.71023ENST00000454402NoneproteinMETSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...ENSP000004082952340TGTCTTTATATAGCTGTTTCGCACAGGCTA74220antisense-----------------METSALKQQEQPAATKIRNLPWVEKYRPQ...739074106LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
396ENST00000254998.3423ENST00000254998NoneproteinMASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...ENSP000002549982140TTGTCAATGTCTACTACACCACCATGGATA2779sense-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...26432759DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
397ENST00000254998.3423ENST00000254998NoneproteinMASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...ENSP000002549982140GGCGTTTGCTGTCCCGCCTGTACATGGGCA39115sense-----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...38553971VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
398ENST00000381685.102067ENST00000381685NoneproteinMQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...ENSP000003711015688ACTAGCAATGGCTTATCAGATCGAAGGTCA259776antisense-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...258275259291TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
399ENST00000381685.102067ENST00000381685NoneproteinMQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...ENSP000003711015688AAATTTTGTCTGATGACTACTCAAAGGTAT108322sense-----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...107124108140CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ
526 |

400 rows × 19 columns

527 |
528 |
529 | 530 |
531 | 532 |
533 |
534 | 535 |
536 | {% endraw %} 537 | 538 | {% raw %} 539 | 540 |
541 |
542 | 543 |
544 |
545 |
domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)
546 | domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,
547 |                                                            id_cols=id_cols)
548 | 
549 | 550 |
551 |
552 |
553 | 554 |
555 |
556 | 557 |
558 | 559 |
560 |
Getting protein domains
561 | 
562 |
563 |
564 | 565 |
566 | 567 |
568 |
100%|██████████| 200/200 [00:48<00:00,  4.12it/s]
569 | 
570 |
571 |
572 | 573 |
574 |
575 | 576 |
577 | {% endraw %} 578 | 579 | {% raw %} 580 | 581 |
582 |
583 | 584 |
585 |
586 |
conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)
587 | conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,
588 |                                                              small_width=2, large_width=16,
589 |                                                              conservation_column='ranked_conservation',
590 |                                                              id_cols=id_cols)
591 | conservation_feature_df
592 | 
593 | 594 |
595 |
596 |
597 | 598 |
599 |
600 | 601 |
602 | 603 |
604 |
Getting conservation
605 | 
606 |
607 |
608 | 609 |
610 | 611 |
612 |
100%|██████████| 200/200 [03:53<00:00,  1.17s/it]
613 | 
614 |
615 |
616 | 617 |
618 | 619 | 620 |
621 |
622 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 |
sgRNA Context SequenceTarget Cut LengthTarget TranscriptOrientationcons_4cons_32
0AAAAGAATGATGAAAAGACACCACAGGGAG244ENST00000610426.5sense0.2182310.408844
1AAAAGAGCCATGAATCTAAACATCAGGAAT640ENST00000223073.6sense0.1298250.278180
2AAAAGCGCCAAATGGCCCGAGAATTGGGAG709ENST00000331923.9sense0.4709060.532305
3AAACAGAAAAAGTTAAAATCACCAAGGTGT496ENST00000283882.4sense0.5805560.602708
4AAACAGATGGAAGATGCTTACCGGGGGACC132ENST00000393047.8sense0.2834470.414293
.....................
395TTTGATTGCATTAAGGTTGGACTCTGGATT246ENST00000249269.9sense0.5806120.618707
396TTTGCCCACAGCTCCAAAGCATCGCGGAGA130ENST00000227618.8sense0.3237700.416368
397TTTTACAGTGCGATGTATGATGTATGGCTT119ENST00000338366.6sense0.7880000.537417
398TTTTGGATCTCGTAGTGATTCAAGAGGGAA233ENST00000629496.3sense0.2396300.347615
399TTTTTGTTACTACAGGTTCGCTGCTGGGAA201ENST00000395840.6sense0.6937670.639044
749 |

400 rows × 6 columns

750 |
751 |
752 | 753 |
754 | 755 |
756 |
757 | 758 |
759 | {% endraw %} 760 | 761 | {% raw %} 762 | 763 |
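Fetching amino acid sequences, protein domains, and conservation scores dominates the runtime of this walkthrough. A hedged sketch of caching the three tables to parquet so that rs3.predict.predict can read them back through its aa_seq_file, domain_file, and conservatin_file arguments; the paths mirror the files under test_data/target_data in this repository, and the frames are assumed to carry the Transcript Base column that predict filters on:

aa_seq_df.to_parquet('test_data/target_data/aa_seqs.pq', engine='pyarrow')
domain_df.to_parquet('test_data/target_data/protein_domains.pq', engine='pyarrow')
conservation_df.to_parquet('test_data/target_data/conservation.pq', engine='pyarrow')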
764 |
765 | 766 |
767 |
768 |
predictions = predict_target(design_df=design_df,
769 |                              aa_subseq_df=aa_subseq_df,
770 |                              domain_feature_df=domain_feature_df,
771 |                              conservation_feature_df=conservation_feature_df)
772 | design_df['Target Score'] = predictions
773 | 
774 | 775 |
776 |
777 |
778 | 779 |
780 |
781 | 782 |
783 | 784 |
785 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
786 |   warnings.warn(
787 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
788 |   warnings.warn(
789 | 
790 |
791 |
792 | 793 |
794 |
795 | 796 |
797 | {% endraw %} 798 | 799 | {% raw %} 800 | 801 |
802 |
803 | 804 |
805 |
806 |
lite_predictions = predict_target(design_df=design_df,
807 |                                   aa_subseq_df=aa_subseq_df)
808 | design_df['Target Score Lite'] = lite_predictions
809 | 
810 | 811 |
812 |
813 |
814 | 815 |
816 |
817 | 818 |
819 | 820 |
821 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
822 |   warnings.warn(
823 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
824 |   warnings.warn(
825 | 
826 |
827 |
828 | 829 |
830 |
831 | 832 |
833 | {% endraw %} 834 | 835 | {% raw %} 836 | 837 |
838 |
839 | 840 |
841 |
842 |
design_df['sgRNA Context Sequence']
843 | 
844 | 845 |
846 |
847 |
848 | 849 |
850 |
851 | 852 |
853 | 854 | 855 | 856 |
857 |
0      TGGAGCAGATACAAGAGCAACTGAAGGGAT
858 | 1      CCGGAAAACTGGCACGACCATCGCTGGGGT
859 | 2      TAGAAAAAGATTTGCGCACCCAAGTGGAAT
860 | 3      TGGCCTTTGACCCAGACATAATGGTGGCCA
861 | 4      AAATACTCACTCATCCTCATCTCGAGGTCT
862 |                     ...              
863 | 395    TGTCTTTATATAGCTGTTTCGCACAGGCTA
864 | 396    TTGTCAATGTCTACTACACCACCATGGATA
865 | 397    GGCGTTTGCTGTCCCGCCTGTACATGGGCA
866 | 398    ACTAGCAATGGCTTATCAGATCGAAGGTCA
867 | 399    AAATTTTGTCTGATGACTACTCAAAGGTAT
868 | Name: sgRNA Context Sequence, Length: 400, dtype: object
869 |
870 | 871 |
872 | 873 |
874 |
875 | 876 |
877 | {% endraw %} 878 | 879 | {% raw %} 880 | 881 |
882 |
883 | 884 |
885 |
886 |
assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7
887 | 
888 | 889 |
890 |
891 |
892 | 893 |
894 | {% endraw %} 895 | 896 | {% raw %} 897 | 898 |
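The assertion above only enforces a lower bound. To see the actual agreement between the full and lite target scores (using the scipy.stats import from the top of the page):

r, p_value = stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])
print(f'Pearson r between full and lite target scores: {r:.3f}')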
899 |
900 | 901 |
902 |
903 |
sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')
904 | gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')
905 | 
906 | sanger_designs = sanger_df.merge(design_df, how='inner',
907 |                                  on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
908 |                                      'Target Cut %'])
909 | gecko_designs = gecko_df.merge(design_df, how='inner',
910 |                                 on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',
911 |                                     'Target Cut %'])
912 | assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],
913 |                       sanger_designs['Target Score'])[0] > 0.2
914 | assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],
915 |                       gecko_designs['Target Score'])[0] > 0.05
916 | 
917 | 918 |
919 |
920 |
921 | 922 |
923 | {% endraw %} 924 | 925 | {% raw %} 926 | 927 |
928 |
929 | 930 |
931 |
932 |
rs_dev_target_lite_predictions = (pd.read_csv('test_data/target_lite_score_export.csv')
933 |                                   .rename({'Target Lite Score': 'Target Score Lite'}, axis=1))
934 | rs_dev_target_predictions = pd.read_csv('test_data/target_score_export.csv')
935 | merged_rs_dev_predictions = rs_dev_target_lite_predictions.merge(rs_dev_target_predictions,
936 |                                                                  how='inner')
937 | merged_rs_dev_rs3_predictions = (design_df
938 |                                  .merge(merged_rs_dev_predictions,
939 |                                         how='inner',
940 |                                         on=['sgRNA Context Sequence', 'Target Cut Length',
941 |                                             'Target Transcript', 'Orientation'],
942 |                                         suffixes=[' rs3', ' rs_dev']))
943 | assert np.allclose(merged_rs_dev_rs3_predictions['Target Score rs3'], merged_rs_dev_rs3_predictions['Target Score rs_dev'])
944 | assert np.allclose(merged_rs_dev_rs3_predictions['Target Score Lite rs3'], merged_rs_dev_rs3_predictions['Target Score Lite rs_dev'])
945 | 
946 | 947 |
948 |
949 |
950 | 951 |
952 | {% endraw %} 953 | 954 |
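Finally, the step-by-step pipeline on this page can be collapsed into one call with the predict function from the rs3.predict module (documented on the predict page). A sketch under the same inputs; the argument names follow that function's signature in this repository:

from rs3.predict import predict

scored_designs = predict(design_df,
                         tracr=['Hsu2013', 'Chen2013'],  # one sequence score per tracrRNA
                         target=True,                    # also compute target scores
                         lite=False,                     # use the full target model
                         n_jobs_min=1, n_jobs_max=max_n_jobs)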
955 | 956 | 957 | -------------------------------------------------------------------------------- /docs/sidebar.json: -------------------------------------------------------------------------------- 1 | { 2 | "rs3": { 3 | "Overview": "/", 4 | "seq": "seq.html", 5 | "targetdata": "targetdata.html", 6 | "targetfeat": "targetfeat.html", 7 | "predicttarg": "predicttarg.html", 8 | "predict": "predict.html" 9 | } 10 | } -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | --- 2 | layout: none 3 | search: exclude 4 | --- 5 | 6 | 7 | 8 | {% for post in site.posts %} 9 | {% unless post.search == "exclude" %} 10 | 11 | {{site.url}}{{post.url}} 12 | 13 | {% endunless %} 14 | {% endfor %} 15 | 16 | 17 | {% for page in site.pages %} 18 | {% unless page.search == "exclude" %} 19 | 20 | {{site.url}}{{ page.url}} 21 | 22 | {% endunless %} 23 | {% endfor %} 24 | -------------------------------------------------------------------------------- /rs3/RuleSet3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/RuleSet3.pkl -------------------------------------------------------------------------------- /rs3/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.16" 2 | -------------------------------------------------------------------------------- /rs3/_nbdev.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT! 2 | 3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"] 4 | 5 | index = {"load_seq_model": "00_seq.ipynb", 6 | "featurize_context": "00_seq.ipynb", 7 | "predict_seq": "00_seq.ipynb", 8 | "ensembl_post": "01_targetdata.ipynb", 9 | "chunks": "01_targetdata.ipynb", 10 | "post_transcript_sequence_chunk": "01_targetdata.ipynb", 11 | "post_transcript_sequence": "01_targetdata.ipynb", 12 | "build_transcript_aa_seq_df": "01_targetdata.ipynb", 13 | "ensembl_get": "01_targetdata.ipynb", 14 | "get_translation_overlap": "01_targetdata.ipynb", 15 | "build_translation_overlap_df": "01_targetdata.ipynb", 16 | "write_transcript_data": "01_targetdata.ipynb", 17 | "get_transcript_info": "01_targetdata.ipynb", 18 | "get_conservation": "01_targetdata.ipynb", 19 | "get_exon_conservation": "01_targetdata.ipynb", 20 | "get_transcript_conservation": "01_targetdata.ipynb", 21 | "get_transcript_conservation_safe": "01_targetdata.ipynb", 22 | "build_conservation_df": "01_targetdata.ipynb", 23 | "write_conservation_data": "01_targetdata.ipynb", 24 | "add_target_columns": "02_targetfeat.ipynb", 25 | "get_position_features": "02_targetfeat.ipynb", 26 | "get_one_aa_frac": "02_targetfeat.ipynb", 27 | "get_aa_aromaticity": "02_targetfeat.ipynb", 28 | "get_aa_hydrophobicity": "02_targetfeat.ipynb", 29 | "get_aa_ip": "02_targetfeat.ipynb", 30 | "get_aa_secondary_structure": "02_targetfeat.ipynb", 31 | "featurize_aa_seqs": "02_targetfeat.ipynb", 32 | "extract_amino_acid_subsequence": "02_targetfeat.ipynb", 33 | "get_aa_subseq_df": "02_targetfeat.ipynb", 34 | "get_amino_acid_features": "02_targetfeat.ipynb", 35 | "get_protein_domain_features": "02_targetfeat.ipynb", 36 | "get_conservation_ranges": "02_targetfeat.ipynb", 37 | "get_conservation_features": "02_targetfeat.ipynb", 38 | "merge_feature_dfs": "02_targetfeat.ipynb", 39 | 
"load_target_model": "03_predicttarg.ipynb", 40 | "predict_target": "03_predicttarg.ipynb", 41 | "predict_seq_tracr": "04_predict.ipynb", 42 | "combine_target_seq_scores": "04_predict.ipynb", 43 | "predict": "04_predict.ipynb"} 44 | 45 | modules = ["seq.py", 46 | "targetdata.py", 47 | "targetfeat.py", 48 | "predicttarg.py", 49 | "predict.py"] 50 | 51 | doc_url = "https://gpp-rnd.github.io/rs3/" 52 | 53 | git_url = "https://github.com/gpp-rnd/rs3/tree/master/" 54 | 55 | def custom_doc_links(name): return None 56 | -------------------------------------------------------------------------------- /rs3/predict.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 04_predict.ipynb (unless otherwise specified). 2 | 3 | __all__ = ['predict_seq_tracr', 'combine_target_seq_scores', 'predict'] 4 | 5 | # Cell 6 | import pandas as pd 7 | import warnings 8 | 9 | from .seq import predict_seq 10 | from .targetdata import (build_translation_overlap_df, 11 | build_transcript_aa_seq_df, 12 | build_conservation_df) 13 | from .targetfeat import (add_target_columns, 14 | get_aa_subseq_df, 15 | get_protein_domain_features, 16 | get_conservation_features) 17 | from .predicttarg import predict_target 18 | 19 | # Cell 20 | from pandas.api.types import is_list_like 21 | 22 | def predict_seq_tracr(design_df, tracr, context_col, ref_tracrs, n_jobs): 23 | if not tracr in ref_tracrs: 24 | raise ValueError('tracrRNA must be one of ' + ','.join(ref_tracrs)) 25 | design_df['RS3 Sequence Score (' + tracr + ' tracr)'] = predict_seq(design_df[context_col], sequence_tracr=tracr, 26 | n_jobs=n_jobs) 27 | 28 | def combine_target_seq_scores(design_df, tracr, target_score_col, lite): 29 | full_rs_name = 'RS3 Sequence (' + tracr + ' tracr) + Target Score' 30 | if lite: 31 | full_rs_name += 'Lite' 32 | design_df[full_rs_name] = \ 33 | design_df['RS3 Sequence Score (' + tracr + ' tracr)'] + \ 34 | design_df[target_score_col] 35 | 36 | def predict(design_df, tracr=None, target=False, 37 | aa_seq_file=None, domain_file=None, 38 | conservatin_file=None, 39 | id_cols=None, 40 | context_col='sgRNA Context Sequence', 41 | transcript_id_col='Target Transcript', 42 | transcript_base_col='Transcript Base', 43 | transcript_len_col='Target Total Length', 44 | n_jobs_min=1, n_jobs_max=1, lite=True): 45 | """Make predictions using RS3 46 | 47 | :param design_df: DataFrame 48 | :param tracr: str or list 49 | :param target: bool, whether to include target scores 50 | :param aa_seq_file: str, path to precomputed amino acid sequences 51 | :param domain_file: str, path to precomputed domain file 52 | :param id_cols: list or None 53 | :param context_col: str 54 | :param transcript_id_col: str 55 | :param transcript_base_col: str 56 | :param transcript_len_col: str 57 | :param n_jobs_min: int 58 | :return: DataFram 59 | """ 60 | if id_cols is None: 61 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'] 62 | out_df = design_df.copy() 63 | ref_tracrs = ['Hsu2013', 'Chen2013'] 64 | if type(tracr) is str: 65 | predict_seq_tracr(out_df, tracr, context_col, ref_tracrs, n_jobs=n_jobs_max) 66 | elif is_list_like(tracr): 67 | for t in tracr: 68 | predict_seq_tracr(out_df, t, context_col, ref_tracrs, n_jobs=n_jobs_max) 69 | else: 70 | raise ValueError('Could not recognize tracr input: ' + str(tracr)) 71 | if target: 72 | out_df = add_target_columns(out_df, 73 | transcript_base_col=transcript_base_col) 74 | transcript_bases = 
pd.Series(out_df[transcript_base_col].unique()) 75 | if aa_seq_file is None: 76 | aa_seq_df = build_transcript_aa_seq_df(out_df, 77 | transcript_id_col=transcript_id_col, 78 | transcript_len_col=transcript_len_col, 79 | n_jobs=n_jobs_min) 80 | else: 81 | aa_seq_df = pd.read_parquet(aa_seq_file, engine='pyarrow', 82 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 83 | missing_transcripts_aa = transcript_bases[~transcript_bases.isin(aa_seq_df[transcript_base_col])] 84 | if len(missing_transcripts_aa) > 0: 85 | warnings.warn('Missing amino acid sequences for transcripts: ' + 86 | ','.join(missing_transcripts_aa)) 87 | out_df['Missing translation information'] = out_df[transcript_base_col].isin(missing_transcripts_aa) 88 | aa_subseq_df = get_aa_subseq_df(sg_designs=out_df, aa_seq_df=aa_seq_df, width=16, 89 | id_cols=id_cols) 90 | if lite: 91 | target_score_col = 'Target Score Lite' 92 | out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df, 93 | id_cols=id_cols) 94 | else: 95 | if domain_file is None: 96 | domain_df = build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=n_jobs_min) 97 | else: 98 | domain_df = pd.read_parquet(domain_file, engine='pyarrow', 99 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 100 | # No warning for domain, since some transcripts aren't annotated with any domains 101 | domain_feature_df = get_protein_domain_features(out_df, domain_df, 102 | id_cols=id_cols, transcript_base_col=transcript_base_col) 103 | if conservatin_file is None: 104 | conservation_df = build_conservation_df(out_df, n_jobs=n_jobs_max) 105 | else: 106 | conservation_df = pd.read_parquet(conservatin_file, engine='pyarrow', 107 | filters=[[(transcript_base_col, 'in', transcript_bases)]]) 108 | missing_transcripts_cons = transcript_bases[~transcript_bases.isin(conservation_df[transcript_base_col])] 109 | if len(missing_transcripts_cons) > 0: 110 | warnings.warn('Missing conservation scores for transcripts: ' + 111 | ','.join(missing_transcripts_cons)) 112 | out_df['Missing conservation information'] = out_df[transcript_base_col].isin(missing_transcripts_cons) 113 | conservation_feature_df = get_conservation_features(out_df, conservation_df, 114 | small_width=2, large_width=16, 115 | conservation_column='ranked_conservation', 116 | id_cols=id_cols) 117 | target_score_col = 'Target Score' 118 | out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df, 119 | domain_feature_df=domain_feature_df, 120 | conservation_feature_df=conservation_feature_df, 121 | id_cols=id_cols) 122 | if type(tracr) is str: 123 | combine_target_seq_scores(out_df, tracr, target_score_col, lite) 124 | else: # list 125 | for t in tracr: 126 | combine_target_seq_scores(out_df, t, target_score_col, lite) 127 | return out_df -------------------------------------------------------------------------------- /rs3/predicttarg.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 03_predicttarg.ipynb (unless otherwise specified). 
2 | 
3 | __all__ = ['load_target_model', 'predict_target']
4 | 
5 | # Cell
6 | from rs3 import targetfeat
7 | import joblib
8 | import os
9 | 
10 | # Cell
11 | def load_target_model(lite=False):
12 |     """Load rule set 3 target model"""
13 |     if lite:
14 |         model_name = 'target_lite_model.pkl'
15 |     else:
16 |         model_name = 'target_model.pkl'
17 |     model = joblib.load(os.path.join(os.path.dirname(__file__), model_name))
18 |     return model
19 | 
20 | # Cell
21 | def predict_target(design_df, aa_subseq_df, domain_feature_df=None,
22 |                    conservation_feature_df=None, id_cols=None):
23 |     """Make predictions using the Rule Set 3 target model. Note that if domain_feature_df
24 |     or conservation_feature_df is not supplied, then the lite model is used; otherwise the full model is used.
25 | 
26 |     :param design_df: DataFrame
27 |     :param aa_subseq_df: DataFrame
28 |     :param domain_feature_df: DataFrame or None
29 |     :param conservation_feature_df: DataFrame or None
30 |     :param id_cols: list or str
31 |     :return: array-like, predictions
32 |     """
33 |     if (domain_feature_df is None) or (conservation_feature_df is None):
34 |         lite = True
35 |         domain_feature_df = None
36 |         conservation_feature_df = None
37 |     else:
38 |         lite = False
39 |     model = load_target_model(lite=lite)
40 |     if id_cols is None:
41 |         id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
42 |     target_feature_df, target_feature_cols = targetfeat.merge_feature_dfs(design_df,
43 |                                                                           aa_subseq_df=aa_subseq_df,
44 |                                                                           domain_df=domain_feature_df,
45 |                                                                           conservation_df=conservation_feature_df,
46 |                                                                           id_cols=id_cols)
47 |     X_target = target_feature_df[target_feature_cols]
48 |     predictions = model.predict(X_target)
49 |     return predictions
--------------------------------------------------------------------------------
/rs3/seq.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 00_seq.ipynb (unless otherwise specified).
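# Example (illustrative sketch, not part of the package file; the two 30-nt context
# sequences are made-up examples laid out as 4 nt 5' context, 20 nt protospacer,
# 3 nt PAM, 3 nt 3' context):
#
# from rs3.seq import predict_seq
#
# context_seqs = ['GACCTGCGGTTGCGCAGGCACAGAAGGCAT',
#                 'AGAAAACACTAGCATCCCCACCCGCGGACT']
# scores = predict_seq(context_seqs, sequence_tracr='Hsu2013', n_jobs=2)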
2 | 3 | __all__ = ['load_seq_model', 'featurize_context', 'predict_seq'] 4 | 5 | # Cell 6 | import joblib 7 | import sglearn 8 | import pandas as pd 9 | import os 10 | 11 | # Cell 12 | def load_seq_model(): 13 | """Load rule set 3 sequence model""" 14 | model = joblib.load(os.path.join(os.path.dirname(__file__), 'RuleSet3.pkl')) 15 | return model 16 | 17 | # Cell 18 | def featurize_context(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, 19 | n_jobs=1): 20 | """Featurize context sequences 21 | 22 | :param context_sequences: list-like 23 | :param sequence_tracr: list-like or str 24 | :return: DataFrame, feature matrix 25 | """ 26 | if ref_tracrs is None: 27 | ref_tracrs = ['Hsu2013', 'Chen2013'] 28 | context_series = pd.Series(context_sequences) 29 | if not (context_series.str.len() == 30).all(): 30 | raise ValueError('All context sequences must be 30 nucleotides') 31 | featurized_sgrnas = sglearn.featurize_guides(context_sequences, 32 | n_jobs=n_jobs) 33 | for tracr in ref_tracrs: 34 | if type(sequence_tracr) is str: 35 | featurized_sgrnas[tracr + ' tracr'] = int(sequence_tracr == tracr) 36 | else: # list-like 37 | featurized_sgrnas[tracr + ' tracr'] = ((pd.Series(sequence_tracr) == tracr) 38 | .astype(int) 39 | .to_list()) 40 | return featurized_sgrnas 41 | 42 | # Cell 43 | def predict_seq(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, n_jobs=1): 44 | """Predict the activity of context sequence for SpCas9 Knockout using sequence information only 45 | 46 | :param context_sequences: list of str 47 | :return: list of float, predictions 48 | """ 49 | model = load_seq_model() 50 | print('Calculating sequence-based features') 51 | featurized_sgrnas = featurize_context(context_sequences, sequence_tracr=sequence_tracr, ref_tracrs=ref_tracrs, 52 | n_jobs=n_jobs) 53 | seq_predictions = model.predict(featurized_sgrnas) 54 | return seq_predictions -------------------------------------------------------------------------------- /rs3/target_lite_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_lite_model.pkl -------------------------------------------------------------------------------- /rs3/target_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_model.pkl -------------------------------------------------------------------------------- /rs3/targetdata.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 01_targetdata.ipynb (unless otherwise specified). 
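# Example (illustrative sketch, not part of the package file: precompute target data
# once so that later predict() calls can read parquet files instead of re-querying
# Ensembl and UCSC; the design file is this repo's test data and is assumed to be
# tab-separated):
#
# import pandas as pd
# from rs3.targetdata import write_transcript_data, write_conservation_data
#
# design_df = pd.read_table('test_data/sgrna-designs.txt')
# write_transcript_data(design_df, n_jobs=2, filepath='./data/target_data/')
# write_conservation_data(design_df, n_jobs=2, filepath='./data/target_data/')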
2 | 3 | __all__ = ['ensembl_post', 'chunks', 'post_transcript_sequence_chunk', 'post_transcript_sequence', 4 | 'build_transcript_aa_seq_df', 'ensembl_get', 'get_translation_overlap', 'build_translation_overlap_df', 5 | 'write_transcript_data', 'get_transcript_info', 'get_conservation', 'get_exon_conservation', 6 | 'get_transcript_conservation', 'get_transcript_conservation_safe', 'build_conservation_df', 7 | 'write_conservation_data'] 8 | 9 | # Cell 10 | import requests 11 | import json 12 | import pandas as pd 13 | from joblib import Parallel, delayed 14 | from tqdm import tqdm 15 | import warnings 16 | import os 17 | from scipy import stats 18 | import multiprocessing 19 | 20 | # Cell 21 | def ensembl_post(ext, data, headers=None, params=None): 22 | """Generic wrapper for using POST requests to the ensembl rest API 23 | 24 | :param ext: str, url extension 25 | :param data: dict, query data 26 | :param headers: dict or None, meta-information for query 27 | :param params: dict or None, parameters for query 28 | :return: Response object 29 | """ 30 | if params is None: 31 | params = {} 32 | if headers is None: 33 | headers = {} 34 | data = json.dumps(data) 35 | r = requests.post("https://rest.ensembl.org"+ext, headers=headers, data=data, params=params) 36 | if not r.ok: 37 | r.raise_for_status() 38 | else: 39 | return r 40 | 41 | # Cell 42 | def chunks(lst, n): 43 | """Yield successive n-sized chunks from lst. 44 | 45 | lst: list 46 | n: int 47 | 48 | returns: generator of list chunks 49 | """ 50 | for i in range(0, len(lst), n): 51 | yield lst[i:i + n] 52 | 53 | def post_transcript_sequence_chunk(ids, params, headers): 54 | """Helper function for post_transcript_sequence 55 | 56 | :param ids: list 57 | :param params: dict 58 | :param headers: dict 59 | :return: dict 60 | """ 61 | data = {'ids': ids} 62 | r = ensembl_post("/sequence/id/", data=data, params=params, 63 | headers=headers) 64 | seq = r.json() 65 | return seq 66 | 67 | def post_transcript_sequence(ensembl_ids, seq_type='protein', max_queries=50, 68 | n_jobs=1, **kwargs): 69 | """Request multiple types of sequence by stable identifier. Supports feature masking and expand options. 
70 |     Uses https://rest.ensembl.org/documentation/info/sequence_id_post
71 | 
72 |     :param ensembl_ids: list of str
73 |     :param seq_type: str, one of [genomic, cds, cdna, protein]
74 |     :param max_queries: int, maximum number of queries for post
75 |     :param n_jobs: int, number of jobs to run in parallel
76 |     :param kwargs: additional parameter arguments
77 |     :return: list, dict of sequences 5' to 3' in the same orientation as the input transcript
78 |     """
79 |     headers = {"content-type": "application/json", "accept": "application/json"}
80 |     params = {'type': seq_type, **kwargs}
81 |     id_chunks = list(chunks(ensembl_ids, max_queries))
82 |     seqs = Parallel(n_jobs=n_jobs)(delayed(post_transcript_sequence_chunk)
83 |                                    (ids, params, headers) for ids in tqdm(id_chunks))
84 |     # flatten list
85 |     seqs = [item for sublist in seqs for item in sublist]
86 |     return seqs
87 | 
88 | # Cell
89 | def build_transcript_aa_seq_df(design_df, transcript_id_col='Target Transcript',
90 |                                transcript_len_col='Target Total Length', n_jobs=1):
91 |     """Get amino acid sequence for transcripts of interest
92 | 
93 |     :param design_df: DataFrame
94 |     :param transcript_id_col: str, column with ensembl transcript id
95 |     :param transcript_len_col: str, column with length of transcript
96 |     :param n_jobs: int, number of jobs to use to query transcripts
97 |     :return: DataFrame
98 |     """
99 |     unique_transcripts = design_df[[transcript_id_col, transcript_len_col]].drop_duplicates()
100 |     unique_transcripts['Transcript Base'] = unique_transcripts[transcript_id_col].str.split('.', expand=True)[0]
101 |     print("Getting amino acid sequences")
102 |     aa_seqs = post_transcript_sequence(unique_transcripts['Transcript Base'].to_list(),
103 |                                        n_jobs=n_jobs)
104 |     aa_seq_df = (pd.DataFrame(aa_seqs)
105 |                  .rename({'query': 'Transcript Base'}, axis=1))
106 |     missing_seqs = (unique_transcripts['Transcript Base'][~unique_transcripts['Transcript Base'].isin(
107 |         aa_seq_df['Transcript Base']
108 |     )])
109 |     if len(missing_seqs) > 0:
110 |         warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
111 |     aa_seq_len_df = (unique_transcripts.merge(aa_seq_df, on='Transcript Base'))
112 |     aa_seq_len_df['AA len'] = aa_seq_len_df['seq'].str.len()
113 |     filtered_aa_seq_len_df = (aa_seq_len_df[aa_seq_len_df[transcript_len_col] ==
114 |                                             (aa_seq_len_df['AA len'] + 1)*3]
115 |                               .reset_index(drop=True))
116 |     filtered_seqs = (aa_seq_len_df['Transcript Base'][~aa_seq_len_df['Transcript Base'].isin(
117 |         filtered_aa_seq_len_df['Transcript Base']
118 |     )])
119 |     if len(filtered_seqs) > 0:
120 |         warnings.warn('Filtered transcripts where the transcript length and amino acid ' +
121 |                       'sequence length did not agree: ' + ', '.join(filtered_seqs))
122 |     return filtered_aa_seq_len_df
123 | 
124 | # Cell
125 | def ensembl_get(ext, query=None, headers=None, params=None):
126 |     """Generic wrapper for using GET requests to the ensembl rest API
127 | 
128 |     ext: str, url extension
129 |     query: str or None, end of url extension specifying species, taxon, ensembl_id etc
130 |     headers: dict or None, meta-information for query
131 |     params: dict or None, parameters for query
132 | 
133 |     returns: Response object
134 |     """
135 |     if query is None:
136 |         query = ''
137 |     if params is None:
138 |         params = {}
139 |     if headers is None:
140 |         headers = {}
141 |     r = requests.get("https://rest.ensembl.org"+ext+query, params=params, headers=headers)
142 |     if not r.ok:
143 |         r.raise_for_status()
144 |     else:
145 |         return r
146 | 
147 | def get_translation_overlap(ensembl_id):
148 |     """Get features that overlap with translation, such as protein domains
149 | 
150 |     :param ensembl_id: str
151 |     :return: list of dict, decoded overlap features
152 |     """
153 |     headers = {'content-type': 'application/json'}
154 |     ext = '/overlap/translation/' + ensembl_id
155 |     r = ensembl_get(ext, headers=headers)
156 |     decoded = r.json()
157 |     return decoded
158 | 
159 | # Cell
160 | def build_translation_overlap_df(protein_ids, n_jobs=1):
161 |     """Get protein domain information
162 | 
163 |     :param protein_ids: list of str, Ensembl protein IDs
164 |     :param n_jobs: int
165 |     :return: DataFrame
166 |     """
167 |     print('Getting protein domains')
168 |     translation_overlap_list = Parallel(n_jobs=n_jobs)(delayed(get_translation_overlap)
169 |                                                        (id) for id in tqdm(protein_ids))
170 |     # flatten list
171 |     translation_overlap_list = [item for sublist in translation_overlap_list for item in sublist]
172 |     translation_overlap_df = pd.DataFrame(translation_overlap_list).rename({'Parent': 'Transcript Base'}, axis=1)
173 |     return translation_overlap_df
174 | 
175 | # Cell
176 | def write_transcript_data(design_df, transcript_id_col='Target Transcript',
177 |                           transcript_len_col='Target Total Length', n_jobs=1,
178 |                           overwrite=True, filepath='./data/target_data/',
179 |                           aa_seq_name='aa_seqs.pq',
180 |                           protein_domain_name='protein_domains.pq'):
181 |     """Write amino acid sequences and protein domain information to parquet files
182 | 
183 |     :param design_df: DataFrame
184 |     :param transcript_id_col: str
185 |     :param transcript_len_col: str
186 |     :param n_jobs: int
187 |     :param overwrite: bool, whether to overwrite existing file
188 |     :param filepath: str, directory for output sequences
189 |     :param aa_seq_name: str, name of amino acid sequence file
190 |     :param protein_domain_name: str, name of protein domain file
191 |     """
192 |     if (os.path.isfile(filepath + aa_seq_name) or os.path.isfile(filepath + protein_domain_name)) and (not overwrite):
193 |         raise ValueError('Transcript data already exists and cannot be overwritten')
194 |     else:
195 |         transcript_aa_seq_df = build_transcript_aa_seq_df(design_df, transcript_id_col=transcript_id_col,
196 |                                                           transcript_len_col=transcript_len_col,
197 |                                                           n_jobs=n_jobs)
198 |         translation_overlap_df = build_translation_overlap_df(transcript_aa_seq_df['id'],
199 |                                                               n_jobs=n_jobs)
200 |         if not os.path.isdir(filepath):
201 |             print('Creating new directory ' + filepath)
202 |             os.makedirs(filepath)
203 |         transcript_aa_seq_df.to_parquet(path=filepath + aa_seq_name, engine='pyarrow',
204 |                                         index=False)
205 |         translation_overlap_df.to_parquet(path=filepath + protein_domain_name, engine='pyarrow',
206 |                                           index=False)
207 | 
208 | # Cell
209 | def get_transcript_info(base_transcript):
210 |     """Using an Ensembl transcript ID, get exon positions and translation information
211 | 
212 |     :param base_transcript: str
213 |     :return: (exon_df, trans_sr, chr)
214 |         exon_df: DataFrame, with global exon start and end position
215 |         trans_sr: Series, with global translation start and stop positions for CDS and translation length
216 |         chr: str
217 | 
218 |     """
219 |     r = ensembl_get("/lookup/id/" + base_transcript,
220 |                     headers={"Content-Type": "application/json"}, params={'expand': '1'})
221 |     decoded = r.json()
222 |     exon_df = pd.DataFrame(decoded['Exon'])
223 |     trans_sr = pd.Series(decoded['Translation'])
224 |     chr = decoded['seq_region_name']
225 |     return exon_df, trans_sr, chr
226 | 
227 | # Cell
228 | def get_conservation(chr, start, end, genome):
229 |     """Get conservation scores for a given region of a genome
230 | 
231 |     :param chr: str, chromosome name without the 'chr' prefix
232 |     :param start: int
233 |     :param end: int
234 |     :param genome: str
235 |     :return: DataFrame
236 |     """
237 |     api_url = 'http://api.genome.ucsc.edu/getData/track'
238 |     if genome == 'hg38':
239 |         track = 'phyloP100way'
240 |     elif genome == 'mm39':
241 |         track = 'phyloP35way'
242 |     else:
243 |         raise ValueError('Genome not recognized')
244 |     chrom = 'chr' + chr
245 |     params = {
246 |         'genome': genome,
247 |         'track': track,
248 |         'start': start,
249 |         'end': end,
250 |         'chrom': chrom
251 |     }
252 |     results = requests.get(api_url, params=params)  # query parameters belong in the URL for a GET request
253 |     if results.ok:
254 |         value_df = (pd.DataFrame([pd.Series(x) for x in pd.read_json(results.content.decode('utf8'))[chrom].values])
255 |                     .rename(columns={'value': 'conservation'}))
256 |     else:
257 |         raise ValueError(results.reason)
258 |     return value_df
259 | 
260 | # Cell
261 | def get_exon_conservation(exon_df, chr, genome):
262 |     """Get conservation scores for each exon
263 | 
264 |     :param exon_df: DataFrame
265 |     :param chr: str
266 |     :param genome: str
267 |     :return: DataFrame
268 |     """
269 |     conservation_dict = {}
270 |     for i, row in exon_df.set_index('id').iterrows():
271 |         # subtract one since the nucleotide conservation corresponds to the "end" index
272 |         conservation_dict[i] = get_conservation(chr, row['start'] - 1, row['end'], genome)
273 |     # combine the per-exon scores into a single dataframe, keyed by exon id
274 |     conservation_df = (pd.concat(conservation_dict)
275 |                        .reset_index(level=0)
276 |                        .reset_index(drop=True)
277 |                        .rename({'level_0': 'exon_id',
278 |                                 'end': 'genomic position'}, axis=1)
279 |                        .drop('start', axis=1))
280 |     return conservation_df
281 | 
282 | 
283 | def get_transcript_conservation(transcript_id, target_strand, genome):
284 |     """Get conservation scores for a transcript
285 | 
286 |     :param transcript_id: str
287 |     :param target_strand: str, '+' or '-'
288 |     :param genome: str
289 |     :return: DataFrame
290 |     """
291 |     exon_df, trans_sr, chr = get_transcript_info(transcript_id)
292 |     # only include translated positions
293 |     exon_df['start'] = exon_df['start'].apply(lambda x: max(x, trans_sr['start']))
294 |     exon_df['end'] = exon_df['end'].apply(lambda x: min(x, trans_sr['end']))
295 |     exon_df = exon_df[exon_df['end'] > exon_df['start']].reset_index(drop=True)
296 |     conservation_df = get_exon_conservation(exon_df, chr, genome)
297 |     conservation_df['Transcript Base'] = transcript_id
298 |     if target_strand == '-':
299 |         ascending = False
300 |     else:
301 |         ascending = True
302 |     conservation_df = (conservation_df
303 |                        .sort_values('genomic position', ascending=ascending)
304 |                        .reset_index(drop=True))
305 |     conservation_df['target position'] = conservation_df.index + 1
306 |     conservation_df['chromosome'] = chr
307 |     conservation_df['genome'] = genome
308 |     conservation_df['translation length'] = trans_sr['length']
309 |     return conservation_df
310 | 
311 | # Cell
312 | def get_transcript_conservation_safe(transcript_id, target_strand, genome):
313 |     """Helper function for parallel queries. Return None when the conservation dataframe cannot be assembled"""
314 |     try:
315 |         return get_transcript_conservation(transcript_id, target_strand, genome)
316 |     except Exception:
317 |         return None
318 | 
319 | 
320 | def build_conservation_df(design_df, n_jobs=1):
321 |     transcript_refseq_df = (design_df[['Target Transcript', 'Strand of Target', 'Target Total Length']]
322 |                             .drop_duplicates())
323 |     if not (transcript_refseq_df['Target Transcript'].str.startswith('ENST') |
324 |             transcript_refseq_df['Target Transcript'].str.startswith('ENSMUST')).all():
325 |         raise ValueError('Must supply human or mouse Ensembl transcript IDs as input')
326 |     print('Getting conservation')
327 |     transcript_refseq_df['Transcript Base'] = (transcript_refseq_df['Target Transcript'].str.split('.', expand=True)[0])
328 |     transcript_refseq_df['genome'] = transcript_refseq_df['Transcript Base'].apply(lambda trans:
329 |                                                                                    'mm39' if 'MUS' in trans else 'hg38')
330 |     all_transcript_conservation_list = Parallel(n_jobs)(delayed(get_transcript_conservation_safe)
331 |                                                         (row['Transcript Base'],
332 |                                                          row['Strand of Target'],
333 |                                                          row['genome'])
334 |                                                         for _, row in tqdm(transcript_refseq_df.iterrows(),
335 |                                                                            total=transcript_refseq_df.shape[0]))
336 |     transcript_conservation_list = []
337 |     failed_list = []
338 |     transcript_list = transcript_refseq_df['Transcript Base'].to_list()
339 |     for i, conservation_df in enumerate(all_transcript_conservation_list):
340 |         if conservation_df is None:
341 |             failed_list.append(transcript_list[i])
342 |         else:
343 |             transcript_conservation_list.append(conservation_df)
344 |     if len(failed_list) > 0:
345 |         warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
346 |                       ' transcripts: ' + ', '.join(failed_list))
347 |     transcript_conservation_df = (pd.concat(transcript_conservation_list))
348 |     transcript_cons_designs = (transcript_conservation_df
349 |                                .merge(transcript_refseq_df, how='inner',
350 |                                       on=['Transcript Base', 'genome']))
351 |     filtered_transcript_conservation = transcript_cons_designs[
352 |         (transcript_cons_designs['translation length'] + 1)*3 == transcript_cons_designs['Target Total Length']].copy()
353 |     mismatched_transcripts = transcript_conservation_df['Transcript Base'][
354 |         ~transcript_conservation_df['Transcript Base'].isin(filtered_transcript_conservation['Transcript Base'])].drop_duplicates()
355 |     if len(mismatched_transcripts) > 0:
356 |         warnings.warn('Filtered: ' + str(len(mismatched_transcripts)) +
357 |                       ' transcripts with mismatched length: ' + ', '.join(mismatched_transcripts))
358 |     filtered_transcript_conservation['ranked_conservation'] = (filtered_transcript_conservation.groupby('Transcript Base')
359 |                                                                ['conservation']
360 |                                                                .rank(pct=True))
361 |     return filtered_transcript_conservation
362 | 
363 | # Cell
364 | def write_conservation_data(design_df, n_jobs=1,
365 |                             overwrite=True, filepath='./data/target_data/',
366 |                             cons_file_name='conservation.pq'):
367 |     """Write conservation scores to parquet files
368 | 
369 |     :param design_df: DataFrame
370 |     :param n_jobs: int
371 |     :param overwrite: bool, whether to overwrite existing file
372 |     :param filepath: str, directory for output sequences
373 |     :param cons_file_name: str, name of conservation file
374 |     """
375 |     if os.path.isfile(filepath + cons_file_name) and (not overwrite):
376 |         raise ValueError('Conservation data already exists and cannot be overwritten')
377 |     else:
378 |         conservation_df = build_conservation_df(design_df, n_jobs=n_jobs)
379 |         if not os.path.isdir(filepath):
380 |             print('Creating new directory ' +
filepath) 381 | os.makedirs(filepath) 382 | conservation_df.to_parquet(path=filepath + cons_file_name, engine='pyarrow', 383 | index=False) -------------------------------------------------------------------------------- /rs3/targetfeat.py: -------------------------------------------------------------------------------- 1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 02_targetfeat.ipynb (unless otherwise specified). 2 | 3 | __all__ = ['add_target_columns', 'get_position_features', 'get_one_aa_frac', 'get_aa_aromaticity', 4 | 'get_aa_hydrophobicity', 'get_aa_ip', 'get_aa_secondary_structure', 'featurize_aa_seqs', 5 | 'extract_amino_acid_subsequence', 'get_aa_subseq_df', 'get_amino_acid_features', 6 | 'get_protein_domain_features', 'get_conservation_ranges', 'get_conservation_features', 'merge_feature_dfs'] 7 | 8 | # Cell 9 | import pandas as pd 10 | from Bio.SeqUtils.ProtParam import ProteinAnalysis 11 | import warnings 12 | 13 | # Cell 14 | def add_target_columns(design_df, transcript_id_col='Target Transcript', 15 | cut_pos_col='Target Cut Length', 16 | transcript_base_col='Transcript Base'): 17 | """Add ['AA Index' and 'Transcript Base'] to design df 18 | 19 | :param design_df: DataFrame 20 | :return: DataFrame 21 | """ 22 | out_df = design_df.copy() 23 | out_df['AA Index'] = (out_df[cut_pos_col] - 1) // 3 + 1 24 | out_df[transcript_base_col] = out_df[transcript_id_col].str.split('.', expand=True)[0] 25 | return out_df 26 | 27 | # Cell 28 | def get_position_features(sg_df, id_cols): 29 | """Get features ['Target Cut %', 'sense'] 30 | 31 | :param sg_df: DataFrame 32 | :param id_cols: list 33 | :return: DataFrame 34 | """ 35 | position_df = sg_df[id_cols + ['Target Cut %']].copy() 36 | position_df['sense'] = sg_df['Orientation'] == 'sense' 37 | return position_df 38 | 39 | # Cell 40 | def get_one_aa_frac(feature_dict, aa_sequence, aas): 41 | """Get fraction of single aa 42 | 43 | :param feature_dict: dict, feature dictionary 44 | :param aa_sequence: str, amino acid sequence 45 | :param aas: list, list of amino acids 46 | """ 47 | for aa in aas: 48 | aa_frac = aa_sequence.count(aa) / len(aa_sequence) 49 | feature_dict[aa] = aa_frac 50 | 51 | # Cell 52 | def get_aa_aromaticity(feature_dict, analyzed_seq): 53 | """Get fraction of aromatic amino acids in a sequence. 54 | 55 | Phe (F) + Trp (W) + Tyr (Y) 56 | 57 | :param feature_dict: 58 | :param analyzed_seq: ProteinAnalysis object 59 | """ 60 | feature_dict['Aromaticity'] = analyzed_seq.aromaticity() 61 | 62 | 63 | def get_aa_hydrophobicity(feature_dict, analyzed_seq): 64 | """Grand Average of Hydropathy 65 | 66 | The GRAVY value is calculated by adding the hydropathy value for each residue and dividing 67 | by the length of the sequence (Kyte and Doolittle; 1982). 
The larger the number, the more hydrophobic the
68 |     amino acid.
69 | 
70 |     :param feature_dict: dict
71 |     :param analyzed_seq: ProteinAnalysis object
72 |     """
73 |     feature_dict['Hydrophobicity'] = analyzed_seq.gravy()
74 | 
75 | 
76 | def get_aa_ip(feature_dict, analyzed_seq):
77 |     """Get the Isoelectric Point of an amino acid sequence
78 | 
79 |     The pH at which the sequence carries no net charge
80 | 
81 |     :param feature_dict: dict
82 |     :param analyzed_seq: ProteinAnalysis object
83 |     """
84 |     feature_dict['Isoelectric Point'] = analyzed_seq.isoelectric_point()
85 | 
86 | 
87 | def get_aa_secondary_structure(feature_dict, analyzed_seq):
88 |     """Get the fraction of amino acids that tend to be in a helix, turn or sheet
89 | 
90 |     :param feature_dict: dict
91 |     :param analyzed_seq: ProteinAnalysis object
92 |     """
93 |     feature_dict['Helix'], feature_dict['Turn'], feature_dict['Sheet'] = analyzed_seq.secondary_structure_fraction()
94 | 
95 | 
96 | # Cell
97 | def featurize_aa_seqs(aa_sequences, features=None):
98 |     """Get feature DataFrame for a list of amino acid sequences
99 | 
100 |     :param aa_sequences: list of str
101 |     :param features: list or None
102 |     :return: DataFrame
103 |     """
104 |     if features is None:
105 |         features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',
106 |                     'Isoelectric Point', 'Secondary Structure']
107 |     aas = ['A', 'C', 'D', 'E', 'F',
108 |            'G', 'H', 'I', 'K', 'L',
109 |            'M', 'N', 'P', 'Q', 'R',
110 |            'S', 'T', 'V', 'W', 'Y', '*']
111 |     clean_aa_seqs = aa_sequences.str.replace(r'\*|-', '', regex=True)
112 |     feature_dict_list = []
113 |     for i, (aa_sequence, clean_sequence) in enumerate(zip(aa_sequences, clean_aa_seqs)):
114 |         analyzed_seq = ProteinAnalysis(clean_sequence)
115 |         feature_dict = {}
116 |         if 'Pos. Ind. 1mer' in features:
117 |             get_one_aa_frac(feature_dict, aa_sequence, aas)
118 |         if 'Hydrophobicity' in features:
119 |             get_aa_hydrophobicity(feature_dict, analyzed_seq)
120 |         if 'Aromaticity' in features:
121 |             get_aa_aromaticity(feature_dict, analyzed_seq)
122 |         if 'Isoelectric Point' in features:
123 |             get_aa_ip(feature_dict, analyzed_seq)
124 |         if 'Secondary Structure' in features:
125 |             get_aa_secondary_structure(feature_dict, analyzed_seq)
126 |         feature_dict_list.append(feature_dict)
127 |     feature_matrix = pd.DataFrame(feature_dict_list)
128 |     feature_matrix.index = aa_sequences
129 |     return feature_matrix
130 | 
131 | # Cell
132 | def extract_amino_acid_subsequence(sg_aas, width):
133 |     """Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index
134 | 
135 |     :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence
136 |     :param width: int
137 |     :return: DataFrame
138 |     """
139 |     # Pad the sequences at the beginning and end, so our index doesn't go over
140 |     l_padding = '-' * (width + 1)  # can cut just before the CDS
141 |     r_padding = '-' * width  # can cut the stop codon
142 |     # add stop codon at the end of the sequence
143 |     sg_aas_subseq = sg_aas.copy()
144 |     sg_aas_subseq['extended_seq'] = l_padding + sg_aas_subseq['seq'] + '*' + r_padding
145 |     sg_aas_subseq['AA 0-Indexed'] = sg_aas_subseq['AA Index'] - 1
146 |     sg_aas_subseq['AA 0-Indexed padded'] = sg_aas_subseq['AA 0-Indexed'] + len(l_padding)
147 |     sg_aas_subseq['seq_start'] = (sg_aas_subseq['AA 0-Indexed padded'] - width).astype(int)
148 |     sg_aas_subseq['seq_end'] = (sg_aas_subseq['AA 0-Indexed padded'] + width).astype(int)
149 |     sg_aas_subseq['AA Subsequence'] = sg_aas_subseq.apply(lambda row: row['extended_seq'][row['seq_start']:(row['seq_end'] + 1)],
150 |                                                           axis=1)
151 |     return
sg_aas_subseq 152 | 153 | 154 | # Cell 155 | def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols, 156 | transcript_base_col='Transcript Base', 157 | target_transcript_col='Target Transcript', 158 | aa_index_col='AA Index'): 159 | """Get the amino acid subsequences for a design dataframe 160 | 161 | :param sg_designs: DataFrame 162 | :param aa_seq_df: DataFrame, Transcript Base and (AA) seq 163 | :param width: int, length on each side of the cut site 164 | :param transcript_base_col: str 165 | :param target_transcript_col: str 166 | :param aa_index_col: str 167 | :return: DataFrame 168 | """ 169 | sg_aas = (aa_seq_df.merge(sg_designs[list(set(id_cols + 170 | [target_transcript_col, transcript_base_col, aa_index_col]))], 171 | how='inner', 172 | on=[target_transcript_col, transcript_base_col])) 173 | sg_aas_subseq = extract_amino_acid_subsequence(sg_aas, width) 174 | return sg_aas_subseq 175 | 176 | # Cell 177 | def get_amino_acid_features(aa_subseq_df, features, id_cols): 178 | """Featurize amino acid sequences 179 | 180 | :param aa_subseq_df: DataFrame 181 | :param features: list 182 | :param id_cols: list 183 | :return: DataFrame 184 | """ 185 | 186 | # Zero-indexed for python 187 | # filter out sequences without the canonical amino acids 188 | aa_set = set('ARNDCQEGHILKMFPSTWYV*-') 189 | filtered_sg_aas = (aa_subseq_df[aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= aa_set)] 190 | .reset_index(drop=True)) 191 | filtered_diff = (aa_subseq_df.shape[0] - filtered_sg_aas.shape[0]) 192 | if filtered_diff > 0: 193 | warnings.warn('Ignored ' + str(filtered_diff) + ' amino acid sequences with non-canonical amino acids') 194 | aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features) 195 | aa_features_annot = pd.concat([filtered_sg_aas[id_cols + ['AA Subsequence']] 196 | .reset_index(drop=True), 197 | aa_features.reset_index(drop=True)], axis=1) 198 | return aa_features_annot 199 | 200 | 201 | # Cell 202 | def get_protein_domain_features(sg_design_df, protein_domains, id_cols, 203 | sources=None, 204 | transcript_base_col='Transcript Base', 205 | aa_index_col='AA Index', 206 | domain_type_col='type', 207 | domain_start_col='start', 208 | domain_end_col='end'): 209 | """Get binary dataframe of protein domains 210 | 211 | :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col] 212 | :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col] 213 | :param id_cols: list 214 | :param sources: list. 
databases of domain annotations to include
215 |     :param transcript_base_col: str
216 |     :param aa_index_col: str
217 |     :param domain_type_col: str
218 |     :param domain_start_col: str
219 |     :param domain_end_col: str
220 |     :return: DataFrame, with binary features for protein domains
221 |     """
222 |     if sources is None:
223 |         sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',
224 |                    'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',
225 |                    'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']  # exclude sifts
226 |     protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]
227 |     clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()
228 |     designs_domains = clean_designs.merge(protein_domains,
229 |                                           how='inner', on=transcript_base_col)
230 |     # Note - not every sgRNA will be present in the feature df
231 |     filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],
232 |                                                                               designs_domains[domain_end_col])]
233 |                         .copy())
234 |     filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()
235 |     filtered_domains['present'] = 1
236 |     domain_feature_df = (filtered_domains.pivot_table(values='present',
237 |                                                       index=id_cols,
238 |                                                       columns=domain_type_col, fill_value=0)
239 |                          .reset_index())
240 |     # Ensure all domain columns are present for testing
241 |     full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int)  # empty
242 |     domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)
243 |     domain_feature_df[sources] = domain_feature_df[sources].astype(int)
244 |     return domain_feature_df
245 | 
246 | # Cell
247 | def get_conservation_ranges(cut_pos, small_width, large_width):
248 |     small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)
249 |     large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)
250 |     return small_range, large_range
251 | 
252 | 
253 | def get_conservation_features(sg_designs, conservation_df, conservation_column,
254 |                               small_width, large_width, id_cols):
255 |     """Get conservation features
256 | 
257 |     :param sg_designs: DataFrame
258 |     :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position
259 |     :param conservation_column: str, name of column to calculate scores with
260 |     :param small_width: int, small window length to average scores in one direction
261 |     :param large_width: int, large window length to average scores in one direction
262 |     :return: DataFrame of conservation features
263 |     """
264 |     sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()
265 |     sg_designs_width['target position small'], sg_designs_width['target position large'] = \
266 |         zip(*sg_designs_width['Target Cut Length']
267 |             .apply(get_conservation_ranges, small_width=small_width,
268 |                    large_width=large_width))
269 |     small_width_conservation = (sg_designs_width.drop('target position large', axis=1)
270 |                                 .rename({'target position small': 'target position'}, axis=1)
271 |                                 .explode('target position')
272 |                                 .merge(conservation_df, how='inner',
273 |                                        on=['Target Transcript', 'Transcript Base', 'target position'])
274 |                                 .groupby(id_cols)
275 |                                 .agg(cons=(conservation_column, 'mean'))
276 |                                 .rename({'cons': 'cons_' + str(small_width * 2)}, axis=1)
277 |                                 .reset_index())
278 |     large_width_conservation = (sg_designs_width.drop('target position small', axis=1)
279 |                                 .rename({'target position large': 'target position'}, axis=1)
280 |
.explode('target position') 281 | .merge(conservation_df, how='inner', 282 | on=['Target Transcript', 'Transcript Base', 'target position']) 283 | .groupby(id_cols) 284 | .agg(cons=(conservation_column, 'mean')) 285 | .rename({'cons': 'cons_' + str(large_width * 2)}, axis=1) 286 | .reset_index()) 287 | cons_feature_df = small_width_conservation.merge(large_width_conservation, how='outer', 288 | on=id_cols) 289 | return cons_feature_df 290 | 291 | # Cell 292 | def merge_feature_dfs(design_df, 293 | aa_subseq_df, aa_features=None, 294 | domain_df=None, 295 | conservation_df=None, 296 | id_cols=None): 297 | if id_cols is None: 298 | id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 299 | 'Target Transcript', 'Orientation'] 300 | if aa_features is None: 301 | aa_features = ['Pos. Ind. 1mer', 302 | 'Hydrophobicity', 'Aromaticity', 303 | 'Isoelectric Point', 'Secondary Structure'] 304 | if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]: 305 | raise ValueError('id_cols must uniquely identify rows of the design dataframe') 306 | feature_df_dict = dict() 307 | feature_list = list() 308 | position_feature_df = get_position_features(design_df, id_cols=id_cols) 309 | feature_df_dict['position'] = position_feature_df 310 | feature_list.extend(['Target Cut %', 'sense']) 311 | if domain_df is not None: 312 | feature_df_dict['domain'] = domain_df 313 | feature_list.extend(['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D', 314 | 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite', 315 | 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']) 316 | if conservation_df is not None: 317 | feature_df_dict['conservation'] = conservation_df 318 | # hardcoded 319 | feature_list.extend(['cons_4', 'cons_32']) 320 | aa_feature_df = get_amino_acid_features(aa_subseq_df, aa_features, id_cols) 321 | feature_list.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 322 | 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*', 323 | 'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn', 324 | 'Sheet']) 325 | feature_df_dict['aa'] = aa_feature_df 326 | feature_df = design_df[id_cols] 327 | for key, df in feature_df_dict.items(): 328 | feature_df = pd.merge(feature_df, df, how='left', on=id_cols) 329 | return feature_df, feature_list 330 | -------------------------------------------------------------------------------- /settings.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | host = github 3 | lib_name = rs3 4 | user = gpp-rnd 5 | description = Predict the activity of CRISPR sgRNAs 6 | keywords = rs3, CRISPR, sgrna 7 | author = Peter Deweirdt 8 | author_email = petedeweirdt@gmail.com 9 | copyright = Genetic Perturbation Platform, Broad Institute 10 | branch = master 11 | version = 0.0.16 12 | min_python = 3.7 13 | audience = Developers 14 | language = English 15 | custom_sidebar = False 16 | license = apache2 17 | status = 2 18 | requirements = joblib>=1.0.1 pandas>=1.0.0 lightgbm>=3.0.0,<=3.3.5 sglearn>=1.2.5 tqdm>=4.61.2 pyarrow>=4.0.1 biopython>=1.78 scikit-learn>=0.24.2 requests>=2.25.1 19 | dev_requirements = gpplot>=0.5.0 seaborn>=0.11.0 scipy>=1.0.1 jupyterlab>=3.0.0 nbdev>=1.1.14,<2.0.0 matplotlib>=3.3.4 tabulate>=0.8.9 jupyter-client<=6.1.12 20 | nbs_path = . 
21 | doc_path = docs 22 | recursive = False 23 | doc_host = https://gpp-rnd.github.io 24 | doc_baseurl = /rs3/ 25 | git_url = https://github.com/gpp-rnd/rs3/tree/master/ 26 | lib_path = rs3 27 | title = rs3 28 | 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from pkg_resources import parse_version 2 | from configparser import ConfigParser 3 | import setuptools 4 | import re 5 | import sys 6 | 7 | assert parse_version(setuptools.__version__) >= parse_version('36.2') 8 | 9 | # note: all settings are in settings.ini; edit there, not here 10 | config = ConfigParser(delimiters=['=']) 11 | config.read('settings.ini') 12 | cfg = config['DEFAULT'] 13 | 14 | cfg_keys = 'version description keywords author author_email'.split() 15 | expected = cfg_keys + "lib_name user branch license status min_python audience language".split() 16 | for o in expected: 17 | assert o in cfg, "missing expected setting: {}".format(o) 18 | setup_cfg = {o: cfg[o] for o in cfg_keys} 19 | 20 | if len(sys.argv) > 1 and sys.argv[1] == 'version': 21 | print(setup_cfg['version']) 22 | exit() 23 | 24 | licenses = { 25 | 'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'), 26 | 'mit': ('MIT License', 'OSI Approved :: MIT License'), 27 | 'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'), 28 | 'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'), 29 | 'bsd3': ('BSD License', 'OSI Approved :: BSD License'), 30 | } 31 | statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', 32 | '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ] 33 | py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split() 34 | 35 | lic = licenses.get(cfg['license'].lower(), (cfg['license'], None)) 36 | min_python = cfg['min_python'] 37 | 38 | requirements = ['pip', 'packaging'] 39 | if cfg.get('requirements'): 40 | requirements += cfg.get('requirements', '').split() 41 | if cfg.get('pip_requirements'): 42 | requirements += cfg.get('pip_requirements', '').split() 43 | dev_requirements = (cfg.get('dev_requirements') or '').split() 44 | 45 | long_description = open('README.md').read() 46 | # ![png](docs/images/output_13_0.png) 47 | for ext in ['png', 'svg']: 48 | long_description = re.sub(r'!\['+ext+'\]\((.*)\)', '!['+ext+']('+'https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1)', long_description) 49 | long_description = re.sub(r'src=\"(.*)\.'+ext+'\"', 'src=\"https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1.'+ext+'\"', long_description) 50 | 51 | setuptools.setup( 52 | name = cfg['lib_name'], 53 | license = lic[0], 54 | classifiers = [ 55 | 'Development Status :: ' + statuses[int(cfg['status'])], 56 | 'Intended Audience :: ' + cfg['audience'].title(), 57 | 'Natural Language :: ' + cfg['language'].title(), 58 | ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []), 59 | url = cfg['git_url'], 60 | packages = setuptools.find_packages(), 61 | include_package_data = True, 62 | install_requires = requirements, 63 | extras_require={ 'dev': dev_requirements }, 64 | python_requires = '>=' + cfg['min_python'], 65 | long_description = long_description, 66 | 
long_description_content_type = 'text/markdown', 67 | zip_safe = False, 68 | entry_points = { 'console_scripts': cfg.get('console_scripts','').split() }, 69 | **setup_cfg) 70 | 71 | -------------------------------------------------------------------------------- /target_lite_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_lite_model.pkl -------------------------------------------------------------------------------- /target_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_model.pkl -------------------------------------------------------------------------------- /test_data/codon_map.csv: -------------------------------------------------------------------------------- 1 | Codon,Amino Acid,Property 2 | TTT,F,Nonpolar 3 | TTC,F,Nonpolar 4 | TTA,L,Nonpolar 5 | TTG,L,Nonpolar 6 | CTT,L,Nonpolar 7 | CTC,L,Nonpolar 8 | CTA,L,Nonpolar 9 | CTG,L,Nonpolar 10 | ATT,I,Nonpolar 11 | ATC,I,Nonpolar 12 | ATA,I,Nonpolar 13 | ATG,M,Nonpolar 14 | GTT,V,Nonpolar 15 | GTC,V,Nonpolar 16 | GTA,V,Nonpolar 17 | GTG,V,Nonpolar 18 | TCT,S,Polar 19 | TCC,S,Polar 20 | TCA,S,Polar 21 | TCG,S,Polar 22 | CCT,P,Nonpolar 23 | CCC,P,Nonpolar 24 | CCA,P,Nonpolar 25 | CCG,P,Nonpolar 26 | ACT,T,Polar 27 | ACC,T,Polar 28 | ACA,T,Polar 29 | ACG,T,Polar 30 | GCT,A,Nonpolar 31 | GCC,A,Nonpolar 32 | GCA,A,Nonpolar 33 | GCG,A,Nonpolar 34 | TAT,Y,Polar 35 | TAC,Y,Polar 36 | TAA,*,Stop 37 | TAG,*,Stop 38 | CAT,H,Basic 39 | CAC,H,Basic 40 | CAA,Q,Polar 41 | CAG,Q,Polar 42 | AAT,N,Polar 43 | AAC,N,Polar 44 | AAA,K,Basic 45 | AAG,K,Basic 46 | GAT,D,Acidic 47 | GAC,D,Acidic 48 | GAA,E,Acidic 49 | GAG,E,Acidic 50 | TGT,C,Polar 51 | TGC,C,Polar 52 | TGA,*,Stop 53 | TGG,W,Nonpolar 54 | CGT,R,Basic 55 | CGC,R,Basic 56 | CGA,R,Basic 57 | CGG,R,Basic 58 | AGT,S,Polar 59 | AGC,S,Polar 60 | AGA,R,Basic 61 | AGG,R,Basic 62 | GGT,G,Nonpolar 63 | GGC,G,Nonpolar 64 | GGA,G,Nonpolar 65 | GGG,G,Nonpolar -------------------------------------------------------------------------------- /test_data/target_data/aa_seqs.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/aa_seqs.pq -------------------------------------------------------------------------------- /test_data/target_data/conservation.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/conservation.pq -------------------------------------------------------------------------------- /test_data/target_data/protein_domains.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/protein_domains.pq --------------------------------------------------------------------------------
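# Example (illustrative end-to-end sketch tying together the modules above via
# rs3.predict.predict; the path points at this repo's test data, which is assumed to
# be tab-separated, and the output column names follow rs3/predict.py):
#
# import pandas as pd
# from rs3.predict import predict
#
# design_df = pd.read_table('test_data/sgrna-designs.txt')
# scored = predict(design_df, tracr=['Hsu2013', 'Chen2013'], target=True, lite=True,
#                  n_jobs_min=1, n_jobs_max=4)
# scored[['sgRNA Context Sequence',
#         'RS3 Sequence Score (Hsu2013 tracr)',
#         'RS3 Sequence (Hsu2013 tracr) + Target Score Lite']].head()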