├── .devcontainer.json
├── .github
│   └── workflows
│       └── main.yml
├── .gitignore
├── 00_seq.ipynb
├── 01_targetdata.ipynb
├── 02_targetfeat.ipynb
├── 03_predicttarg.ipynb
├── 04_predict.ipynb
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── RuleSet3.pkl
├── docker-compose.yml
├── docs
├── .gitignore
├── Gemfile
├── Gemfile.lock
├── _config.yml
├── _data
│ ├── sidebars
│ │ └── home_sidebar.yml
│ └── topnav.yml
├── feed.xml
├── images
│ ├── output_18_0.png
│ └── output_42_0.png
├── index.html
├── predict.html
├── predicttarg.html
├── seq.html
├── sidebar.json
├── sitemap.xml
├── targetdata.html
└── targetfeat.html
├── index.ipynb
├── rs3
├── RuleSet3.pkl
├── __init__.py
├── _nbdev.py
├── predict.py
├── predicttarg.py
├── seq.py
├── target_lite_model.pkl
├── target_model.pkl
├── targetdata.py
└── targetfeat.py
├── settings.ini
├── setup.py
├── target_lite_model.pkl
├── target_model.pkl
└── test_data
├── Aguirre2016_activity.csv
├── Behan2019_activity.csv
├── codon_map.csv
├── sgrna-designs.txt
├── sgrna-designs_BCL2L1_MCL1_EEF2.txt
├── sgrna-designs_BCL2L1_MCL1_EEF2_na.txt
└── target_data
├── aa_seqs.pq
├── conservation.pq
└── protein_domains.pq
/.devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "nbdev_template-codespaces",
3 | "dockerComposeFile": "docker-compose.yml",
4 | "service": "watcher",
5 | "settings": {"terminal.integrated.shell.linux": "/bin/bash"},
6 | "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],
7 | "forwardPorts": [4000, 8080],
8 | "appPort": [4000, 8080],
9 | "extensions": ["ms-python.python",
10 | "ms-azuretools.vscode-docker"],
11 | "runServices": ["notebook", "jekyll", "watcher"],
12 | "postStartCommand": "pip install -e ."
13 | }
14 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push, pull_request]
3 | jobs:
4 | build:
5 | runs-on: ubuntu-latest
6 | steps:
7 | - uses: actions/checkout@v1
8 | - uses: actions/setup-python@v1
9 | with:
10 | python-version: '3.7'
11 | architecture: 'x64'
12 | - name: Install the library
13 | run: |
14 | pip install nbdev jupyter
15 | pip install -e .[dev]
16 | - name: Read all notebooks
17 | run: |
18 | nbdev_read_nbs
19 | - name: Check if all notebooks are cleaned
20 | run: |
21 | echo "Check we are starting with clean git checkout"
22 | if [ -n "$(git status -uno -s)" ]; then echo "git status is not clean"; false; fi
23 | echo "Trying to strip out notebooks"
24 | nbdev_clean_nbs
25 | echo "Check that strip out was unnecessary"
26 | git status -s # display the status to see which nbs need cleaning up
27 | if [ -n "$(git status -uno -s)" ]; then echo -e "!!! Detected unstripped out notebooks\n!!!Remember to run nbdev_install_git_hooks"; false; fi
28 | - name: Check if there is no diff library/notebooks
29 | run: |
30 | if [ -n "$(nbdev_diff_nbs)" ]; then echo -e "!!! Detected difference between the notebooks and the library"; false; fi
31 | - name: Run tests
32 | run: |
33 | nbdev_test_nbs --fname=index.ipynb --n_workers=1
34 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | .gitattributes
3 | .last_checked
4 | .gitconfig
5 | *.bak
6 | *.log
7 | *~
8 | ~*
9 | _tmp*
10 | tmp*
11 | tags
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | env/
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 |
40 | # PyInstaller
41 | # Usually these files are written by a python script from a template
42 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
43 | *.manifest
44 | *.spec
45 |
46 | # Installer logs
47 | pip-log.txt
48 | pip-delete-this-directory.txt
49 |
50 | # Unit test / coverage reports
51 | htmlcov/
52 | .tox/
53 | .coverage
54 | .coverage.*
55 | .cache
56 | nosetests.xml
57 | coverage.xml
58 | *.cover
59 | .hypothesis/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # celery beat schedule file
89 | celerybeat-schedule
90 |
91 | # SageMath parsed files
92 | *.sage.py
93 |
94 | # dotenv
95 | .env
96 |
97 | # virtualenv
98 | .venv
99 | venv/
100 | ENV/
101 |
102 | # Spyder project settings
103 | .spyderproject
104 | .spyproject
105 |
106 | # Rope project settings
107 | .ropeproject
108 |
109 | # mkdocs documentation
110 | /site
111 |
112 | # mypy
113 | .mypy_cache/
114 |
115 | .vscode
116 | *.swp
117 |
118 | # osx generated files
119 | .DS_Store
120 | .DS_Store?
121 | .Trashes
122 | ehthumbs.db
123 | Thumbs.db
124 | .idea
125 |
126 | # pytest
127 | .pytest_cache
128 |
129 | # tools/trust-doc-nbs
130 | docs_src/.last_checked
131 |
132 | # symlinks to fastai
133 | docs_src/fastai
134 | tools/fastai
135 |
136 | # link checker
137 | checklink/cookies.txt
138 |
139 | # .gitconfig is now autogenerated
140 | .gitconfig
141 |
142 |
--------------------------------------------------------------------------------
/02_targetfeat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# default_exp targetfeat"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# targetfeat\n",
17 | "> Module to generate target site features"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# export\n",
27 | "import pandas as pd\n",
28 | "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n",
29 | "import warnings"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "from rs3 import targetdata"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "import multiprocessing\n",
48 | "max_n_jobs = multiprocessing.cpu_count()"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# export\n",
58 | "def add_target_columns(design_df, transcript_id_col='Target Transcript',\n",
59 | " cut_pos_col='Target Cut Length',\n",
60 | " transcript_base_col='Transcript Base'):\n",
61 | " \"\"\"Add ['AA Index' and 'Transcript Base'] to design df\n",
62 | "\n",
63 | " :param design_df: DataFrame\n",
64 | " :return: DataFrame\n",
65 | " \"\"\"\n",
66 | " out_df = design_df.copy()\n",
67 | " out_df['AA Index'] = (out_df[cut_pos_col] - 1) // 3 + 1\n",
68 | " out_df[transcript_base_col] = out_df[transcript_id_col].str.split('.', expand=True)[0]\n",
69 | " return out_df"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n",
79 | "design_targ_df = add_target_columns(design_df)\n",
80 | "assert 'AA Index' in design_targ_df.columns"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Position Features\n",
88 | "\n",
89 | "The first feature class we consider is where the guide targets within the annotated transcript"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": null,
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "# export\n",
99 | "def get_position_features(sg_df, id_cols):\n",
100 | " \"\"\"Get features ['Target Cut %', 'sense']\n",
101 | "\n",
102 | " :param sg_df: DataFrame\n",
103 | " :param id_cols: list\n",
104 | " :return: DataFrame\n",
105 | " \"\"\"\n",
106 | " position_df = sg_df[id_cols + ['Target Cut %']].copy()\n",
107 | " position_df['sense'] = sg_df['Orientation'] == 'sense'\n",
108 | " return position_df"
109 | ]
110 | },
111 | {
112 | "cell_type": "markdown",
113 | "metadata": {},
114 | "source": [
115 | "## Amino Acid Features\n",
116 | "\n",
117 | "We calculate a set of features from the amino acid sequence around the cutsite itself"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "aas = ['A', 'C', 'D', 'E', 'F',\n",
127 | " 'G', 'H', 'I', 'K', 'L',\n",
128 | " 'M', 'N', 'P', 'Q', 'R',\n",
129 | " 'S', 'T', 'V', 'W', 'Y', '*']"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "# export\n",
139 | "def get_one_aa_frac(feature_dict, aa_sequence, aas):\n",
140 | " \"\"\"Get fraction of single aa\n",
141 | "\n",
142 | " :param feature_dict: dict, feature dictionary\n",
143 | " :param aa_sequence: str, amino acid sequence\n",
144 | " :param aas: list, list of amino acids\n",
145 | " \"\"\"\n",
146 | " for aa in aas:\n",
147 | " aa_frac = aa_sequence.count(aa) / len(aa_sequence)\n",
148 | " feature_dict[aa] = aa_frac"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "one_aa_ft = {}\n",
158 | "get_one_aa_frac(one_aa_ft, 'ACDG*-', aas)\n",
159 | "assert one_aa_ft['A'] == 1/6\n",
160 | "assert one_aa_ft['Q'] == 0"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": null,
166 | "metadata": {},
167 | "outputs": [],
168 | "source": [
169 | "# export\n",
170 | "def get_aa_aromaticity(feature_dict, analyzed_seq):\n",
171 | " \"\"\"Get fraction of aromatic amino acids in a sequence.\n",
172 | "\n",
173 | " Phe (F) + Trp (W) + Tyr (Y)\n",
174 | "\n",
175 | " :param feature_dict:\n",
176 | " :param analyzed_seq: ProteinAnalysis object\n",
177 | " \"\"\"\n",
178 | " feature_dict['Aromaticity'] = analyzed_seq.aromaticity()\n",
179 | "\n",
180 | "\n",
181 | "def get_aa_hydrophobicity(feature_dict, analyzed_seq):\n",
182 | " \"\"\"Grand Average of Hydropathy\n",
183 | "\n",
184 | " The GRAVY value is calculated by adding the hydropathy value for each residue and dividing\n",
185 | " by the length of the sequence (Kyte and Doolittle; 1982). The larger the number, the more hydrophobic the\n",
186 | " amino acid\n",
187 | "\n",
188 | " :param feature_dict: dict\n",
189 | " :param analyzed_seq: ProteinAnalysis object\n",
190 | " \"\"\"\n",
191 | " feature_dict['Hydrophobicity'] = analyzed_seq.gravy()\n",
192 | "\n",
193 | "\n",
194 | "def get_aa_ip(feature_dict, analyzed_seq):\n",
195 | " \"\"\"Get the Isoelectric Point of an amino acid sequence\n",
196 | "\n",
197 | " Charge of amino acid\n",
198 | "\n",
199 | " :param feature_dict: dict\n",
200 | " :param analyzed_seq: ProteinAnalysis object\n",
201 | " \"\"\"\n",
202 | " feature_dict['Isoelectric Point'] = analyzed_seq.isoelectric_point()\n",
203 | "\n",
204 | "\n",
205 | "def get_aa_secondary_structure(feature_dict, analyzed_seq):\n",
206 | "    \"\"\"Get the fraction of amino acids that tend to be in a helix, turn or sheet\n",
207 | "\n",
208 | " :param feature_dict: dict\n",
209 | " :param analyzed_seq: ProteinAnalysis object\n",
210 | " \"\"\"\n",
211 | " feature_dict['Helix'], feature_dict['Turn'], feature_dict['Sheet'] = analyzed_seq.secondary_structure_fraction()\n"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "aa_biochemical_fts1 = {}\n",
221 | "get_aa_aromaticity(aa_biochemical_fts1, ProteinAnalysis('FWYA'))\n",
222 | "aa_biochemical_fts2 = {}\n",
223 | "get_aa_aromaticity(aa_biochemical_fts2, ProteinAnalysis('AAAA'))\n",
224 | "assert aa_biochemical_fts1['Aromaticity'] > aa_biochemical_fts2['Aromaticity']"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": null,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "# export\n",
234 | "def featurize_aa_seqs(aa_sequences, features=None):\n",
235 | " \"\"\"Get feature DataFrame for a list of amino acid sequences\n",
236 | "\n",
237 | " :param aa_sequences: list of str\n",
238 | " :param features: list or None\n",
239 | " :return: DataFrame\n",
240 | " \"\"\"\n",
241 | " if features is None:\n",
242 | " features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',\n",
243 | " 'Isoelectric Point', 'Secondary Structure']\n",
244 | " aas = ['A', 'C', 'D', 'E', 'F',\n",
245 | " 'G', 'H', 'I', 'K', 'L',\n",
246 | " 'M', 'N', 'P', 'Q', 'R',\n",
247 | " 'S', 'T', 'V', 'W', 'Y', '*']\n",
248 | " clean_aa_seqs = aa_sequences.str.replace('\\*|-', '', regex=True)\n",
249 | " feature_dict_list = []\n",
250 | " for i, (aa_sequence, clean_sequence) in enumerate(zip(aa_sequences, clean_aa_seqs)):\n",
251 | " analyzed_seq = ProteinAnalysis(clean_sequence)\n",
252 | " feature_dict = {}\n",
253 | " if 'Pos. Ind. 1mer' in features:\n",
254 | " get_one_aa_frac(feature_dict, aa_sequence, aas)\n",
255 | " if 'Hydrophobicity' in features:\n",
256 | " get_aa_hydrophobicity(feature_dict, analyzed_seq)\n",
257 | " if 'Aromaticity' in features:\n",
258 | " get_aa_aromaticity(feature_dict, analyzed_seq)\n",
259 | " if 'Isoelectric Point' in features:\n",
260 | " get_aa_ip(feature_dict, analyzed_seq)\n",
261 | " if 'Secondary Structure' in features:\n",
262 | " get_aa_secondary_structure(feature_dict, analyzed_seq)\n",
263 | " feature_dict_list.append(feature_dict)\n",
264 | " feature_matrix = pd.DataFrame(feature_dict_list)\n",
265 | " feature_matrix.index = aa_sequences\n",
266 | " return feature_matrix"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {},
273 | "outputs": [],
274 | "source": [
275 | "ft_dict_df = featurize_aa_seqs(pd.Series(['ACDG*-', 'CDG*--', 'LLLLLL']))\n",
276 | "assert ft_dict_df.loc['LLLLLL', 'Hydrophobicity'] == ft_dict_df['Hydrophobicity'].max()"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": null,
282 | "metadata": {},
283 | "outputs": [],
284 | "source": [
285 | "# export\n",
286 | "def extract_amino_acid_subsequence(sg_aas, width):\n",
287 | " \"\"\" Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index\n",
288 | "\n",
289 | " :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence\n",
290 | " :param width: int\n",
291 | " :return: DataFrame\n",
292 | " \"\"\"\n",
293 | " # Pad the sequences at the beginning and end, so our index doesn't go over\n",
294 | " l_padding = '-' * (width + 1) # can cut just before the CDS\n",
295 | " r_padding = '-' * width # can cut the stop codon\n",
296 | " # add stop codon at the end of the sequence\n",
297 | " sg_aas_subseq = sg_aas.copy()\n",
298 | " sg_aas_subseq['extended_seq'] = l_padding + sg_aas_subseq['seq'] + '*' + r_padding\n",
299 | " sg_aas_subseq['AA 0-Indexed'] = sg_aas_subseq['AA Index'] - 1\n",
300 | " sg_aas_subseq['AA 0-Indexed padded'] = sg_aas_subseq['AA 0-Indexed'] + len(l_padding)\n",
301 | " sg_aas_subseq['seq_start'] = (sg_aas_subseq['AA 0-Indexed padded'] - width).astype(int)\n",
302 | " sg_aas_subseq['seq_end'] = (sg_aas_subseq['AA 0-Indexed padded'] + width).astype(int)\n",
303 | " sg_aas_subseq['AA Subsequence'] = sg_aas_subseq.apply(lambda row: row['extended_seq'][row['seq_start']:(row['seq_end'] + 1)],\n",
304 | " axis=1)\n",
305 | " return sg_aas_subseq\n"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "small_aa_seq_df = pd.DataFrame({'AA Index': [1, 5, 9],\n",
315 | " 'seq': ['MAVLKYSLW']*3})\n",
316 | "small_aa_subseq_df = extract_amino_acid_subsequence(small_aa_seq_df, 2)\n",
317 | "actual_subseqs = small_aa_subseq_df['AA Subsequence']\n",
318 | "expected_subseqs = ['--MAV', 'VLKYS', 'SLW*-']\n",
319 | "assert len(actual_subseqs) == len(expected_subseqs)\n",
320 | "assert all([a == b for a, b in zip(actual_subseqs, expected_subseqs)])"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {},
327 | "outputs": [],
328 | "source": [
329 | "# export\n",
330 | "def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols,\n",
331 | " transcript_base_col='Transcript Base',\n",
332 | " target_transcript_col='Target Transcript',\n",
333 | " aa_index_col='AA Index'):\n",
334 | " \"\"\"Get the amino acid subsequences for a design dataframe\n",
335 | "\n",
336 | " :param sg_designs: DataFrame\n",
337 | " :param aa_seq_df: DataFrame, Transcript Base and (AA) seq\n",
338 | " :param width: int, length on each side of the cut site\n",
339 | " :param transcript_base_col: str\n",
340 | " :param target_transcript_col: str\n",
341 | " :param aa_index_col: str\n",
342 | " :return: DataFrame\n",
343 | " \"\"\"\n",
344 | " sg_aas = (aa_seq_df.merge(sg_designs[list(set(id_cols +\n",
345 | " [target_transcript_col, transcript_base_col, aa_index_col]))],\n",
346 | " how='inner',\n",
347 | " on=[target_transcript_col, transcript_base_col]))\n",
348 | " sg_aas_subseq = extract_amino_acid_subsequence(sg_aas, width)\n",
349 | " return sg_aas_subseq"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": null,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "name": "stdout",
359 | "output_type": "stream",
360 | "text": [
361 | "Getting amino acid sequences\n"
362 | ]
363 | },
364 | {
365 | "name": "stderr",
366 | "output_type": "stream",
367 | "text": [
368 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.19s/it]\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_targ_df, n_jobs=2)"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "aa_subseq_df = get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n",
383 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation'])\n",
384 | "assert (aa_subseq_df['AA Subsequence'].str.len() == 33).all()\n",
385 | "assert aa_subseq_df.shape[0] == design_targ_df.shape[0]"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "codon_map_df = pd.read_csv('test_data/codon_map.csv')\n",
395 | "\n",
396 | "def get_rev_comp(sgrna):\n",
397 | "    \"\"\"Get reverse complement of a guide\"\"\"\n",
398 | " nt_map = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}\n",
399 | " rev_comp = ''\n",
400 | " for nt in sgrna:\n",
401 | " rev_comp += nt_map[nt]\n",
402 | " rev_comp = rev_comp[::-1]\n",
403 | " return rev_comp\n",
404 | "\n",
405 | "codon_map = pd.Series(codon_map_df['Amino Acid'].values, index=codon_map_df['Codon']).to_dict()\n",
406 | "row = aa_subseq_df.sample(1, random_state=1).iloc[0, :]\n",
407 | "subseq = row['AA Subsequence']\n",
408 | "context = row['sgRNA Context Sequence']\n",
409 | "rc_context = get_rev_comp(context)\n",
410 | "translations = dict()\n",
411 | "rc_translations = dict()\n",
412 | "for i in [0, 1, 2]:\n",
413 | " translations[i] = ''.join([codon_map[context[j:j+3]] for j in range(i, len(context), 3)\n",
414 | " if (j + 3) <= len(context)])\n",
415 | " rc_translations[i] = ''.join([codon_map[rc_context[j:j+3]] for j in range(i, len(rc_context), 3)\n",
416 | " if (j + 3) <= len(rc_context)])\n",
417 | "assert ((translations[0] in subseq) or (translations[1] in subseq) or (translations[2] in subseq) or\n",
418 | " (rc_translations[0] in subseq) or (rc_translations[1] in subseq) or (rc_translations[2] in subseq))"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": null,
424 | "metadata": {},
425 | "outputs": [],
426 | "source": [
427 | "# export\n",
428 | "def get_amino_acid_features(aa_subseq_df, features, id_cols):\n",
429 | " \"\"\"Featurize amino acid sequences\n",
430 | "\n",
431 | " :param aa_subseq_df: DataFrame\n",
432 | " :param features: list\n",
433 | " :param id_cols: list\n",
434 | " :return: DataFrame\n",
435 | " \"\"\"\n",
436 | "\n",
437 | " # Zero-indexed for python\n",
438 | " # filter out sequences without the canonical amino acids\n",
439 | " aa_set = set('ARNDCQEGHILKMFPSTWYV*-')\n",
440 | " filtered_sg_aas = (aa_subseq_df[aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= aa_set)]\n",
441 | " .reset_index(drop=True))\n",
442 | " filtered_diff = (aa_subseq_df.shape[0] - filtered_sg_aas.shape[0])\n",
443 | " if filtered_diff > 0:\n",
444 | " warnings.warn('Ignored ' + str(filtered_diff) + ' amino acid sequences with non-canonical amino acids')\n",
445 | " aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features)\n",
446 | " aa_features_annot = pd.concat([filtered_sg_aas[id_cols + ['AA Subsequence']]\n",
447 | " .reset_index(drop=True),\n",
448 | " aa_features.reset_index(drop=True)], axis=1)\n",
449 | " return aa_features_annot\n"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": null,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "aa_features = get_amino_acid_features(aa_subseq_df=aa_subseq_df,\n",
459 | " features=['Pos. Ind. 1mer',\n",
460 | " 'Hydrophobicity', 'Aromaticity',\n",
461 | " 'Isoelectric Point', 'Secondary Structure'],\n",
462 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n",
463 | " 'Target Transcript', 'Orientation'])\n",
464 | "assert aa_features['L'].idxmax() == aa_features['Hydrophobicity'].idxmax()"
465 | ]
466 | },
467 | {
468 | "cell_type": "markdown",
469 | "metadata": {},
470 | "source": [
471 | "## Protein Domain Features"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "#export\n",
481 | "def get_protein_domain_features(sg_design_df, protein_domains, id_cols,\n",
482 | " sources=None,\n",
483 | " transcript_base_col='Transcript Base',\n",
484 | " aa_index_col='AA Index',\n",
485 | " domain_type_col='type',\n",
486 | " domain_start_col='start',\n",
487 | " domain_end_col='end'):\n",
488 | " \"\"\"Get binary dataframe of protein domains\n",
489 | "\n",
490 | " :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col]\n",
491 | " :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col]\n",
492 | " :param id_cols: list\n",
493 | " :param sources: list. list of database types to include\n",
494 | " :param transcript_base_col: str\n",
495 | " :param aa_index_col: str\n",
496 | " :param domain_type_col: str\n",
497 | " :param domain_start_col: str\n",
498 | " :param domain_end_col: str\n",
499 | " :return: DataFrame, with binary features for protein domains\n",
500 | " \"\"\"\n",
501 | " if sources is None:\n",
502 | " sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n",
503 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n",
504 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'] # exclude sifts\n",
505 | " protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]\n",
506 | " clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()\n",
507 | " designs_domains = clean_designs.merge(protein_domains,\n",
508 | " how='inner', on=transcript_base_col)\n",
509 | " # Note - not every sgRNA will be present in the feature df\n",
510 | " filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],\n",
511 | " designs_domains[domain_end_col])]\n",
512 | " .copy())\n",
513 | " filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()\n",
514 | " filtered_domains['present'] = 1\n",
515 | " domain_feature_df = (filtered_domains.pivot_table(values='present',\n",
516 | " index=id_cols,\n",
517 | " columns='type', fill_value=0)\n",
518 | " .reset_index())\n",
519 | " # Ensure all domain columns are present for testing\n",
520 | " full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int) # empty\n",
521 | " domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)\n",
522 | " domain_feature_df[sources] = domain_feature_df[sources].astype(int)\n",
523 | " return domain_feature_df"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": null,
529 | "metadata": {},
530 | "outputs": [
531 | {
532 | "name": "stdout",
533 | "output_type": "stream",
534 | "text": [
535 | "Getting protein domains\n"
536 | ]
537 | },
538 | {
539 | "name": "stderr",
540 | "output_type": "stream",
541 | "text": [
542 | "100%|█████████████████████████████████████████| 200/200 [00:49<00:00, 4.02it/s]\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n",
548 | "protein_domain_feature_df = get_protein_domain_features(design_targ_df, domain_df, sources=None,\n",
549 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n",
550 | " 'AA Index', 'Target Transcript', 'Orientation'])"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": null,
556 | "metadata": {},
557 | "outputs": [],
558 | "source": [
559 | "assert protein_domain_feature_df.loc[protein_domain_feature_df['sgRNA Context Sequence'] == 'AAAAGAGCCATGAATCTAAACATCAGGAAT',\n",
560 | " ['PANTHER', 'ncoils', 'Seg', 'MobiDBLite']].sum(axis=1).values[0] == 4"
561 | ]
562 | },
563 | {
564 | "cell_type": "markdown",
565 | "metadata": {},
566 | "source": [
567 | "## Conservation Features"
568 | ]
569 | },
570 | {
571 | "cell_type": "code",
572 | "execution_count": null,
573 | "metadata": {},
574 | "outputs": [],
575 | "source": [
576 | "# export\n",
577 | "def get_conservation_ranges(cut_pos, small_width, large_width):\n",
578 | " small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)\n",
579 | " large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)\n",
580 | " return small_range, large_range\n",
581 | "\n",
582 | "\n",
583 | "def get_conservation_features(sg_designs, conservation_df, conservation_column,\n",
584 | " small_width, large_width, id_cols):\n",
585 | " \"\"\"Get conservation features\n",
586 | "\n",
587 | " :param sg_designs: DataFrame\n",
588 | " :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position\n",
589 | " :param conservation_column: str, name of column to calculate scores with\n",
590 | " :param small_width: int, small window length to average scores in one direction\n",
591 | "    :param large_width: int, large window length to average scores in one direction\n",
592 | " :return: DataFrame of conservation features\n",
593 | " \"\"\"\n",
594 | " sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()\n",
595 | " sg_designs_width['target position small'], sg_designs_width['target position large'] = \\\n",
596 | " zip(*sg_designs_width['Target Cut Length']\n",
597 | " .apply(get_conservation_ranges, small_width=small_width,\n",
598 | " large_width=large_width))\n",
599 | " small_width_conservation = (sg_designs_width.drop('target position large', axis=1)\n",
600 | " .rename({'target position small': 'target position'}, axis=1)\n",
601 | " .explode('target position')\n",
602 | " .merge(conservation_df, how='inner',\n",
603 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n",
604 | " .groupby(id_cols)\n",
605 | " .agg(cons=(conservation_column, 'mean'))\n",
606 | " .rename({'cons': 'cons_' + str(small_width * 2)}, axis=1)\n",
607 | " .reset_index())\n",
608 | " large_width_conservation = (sg_designs_width.drop('target position small', axis=1)\n",
609 | " .rename({'target position large': 'target position'}, axis=1)\n",
610 | " .explode('target position')\n",
611 | " .merge(conservation_df, how='inner',\n",
612 | " on=['Target Transcript', 'Transcript Base', 'target position'])\n",
613 | " .groupby(id_cols)\n",
614 | " .agg(cons=(conservation_column, 'mean'))\n",
615 | " .rename({'cons': 'cons_' + str(large_width * 2)}, axis=1)\n",
616 | " .reset_index())\n",
617 | " cons_feature_df = small_width_conservation.merge(large_width_conservation, how='outer',\n",
618 | " on=id_cols)\n",
619 | " return cons_feature_df"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "name": "stdout",
629 | "output_type": "stream",
630 | "text": [
631 | "Getting conservation\n"
632 | ]
633 | },
634 | {
635 | "name": "stderr",
636 | "output_type": "stream",
637 | "text": [
638 | "100%|█████████████████████████████████████████| 200/200 [06:28<00:00, 1.94s/it]\n"
639 | ]
640 | }
641 | ],
642 | "source": [
643 | "conservation_df = targetdata.build_conservation_df(design_targ_df, n_jobs=max_n_jobs)\n",
644 | "conservation_features = get_conservation_features(design_targ_df, conservation_df,\n",
645 | " small_width=2, large_width=16,\n",
646 | " conservation_column='ranked_conservation',\n",
647 | " id_cols=['sgRNA Context Sequence', 'Target Cut Length',\n",
648 | " 'Target Transcript', 'Orientation'])\n",
649 | "merged_features = protein_domain_feature_df.merge(conservation_features, how='inner', on=['sgRNA Context Sequence',\n",
650 | " 'Target Cut Length',\n",
651 | " 'Target Transcript',\n",
652 | " 'Orientation'])\n",
653 | "smart_avg_cons = merged_features.loc[merged_features['Smart'].astype(bool), 'cons_32'].mean()\n",
654 | "non_smart_avg_cons = merged_features.loc[~merged_features['Smart'].astype(bool), 'cons_32'].mean()\n",
655 | "assert smart_avg_cons > non_smart_avg_cons"
656 | ]
657 | },
658 | {
659 | "cell_type": "markdown",
660 | "metadata": {},
661 | "source": [
662 | "## Combining target features\n",
663 | "\n",
664 | "We'll combine the position, amino acid, and domain feature matrices into a single target feature matrix"
665 | ]
666 | },
667 | {
668 | "cell_type": "code",
669 | "execution_count": null,
670 | "metadata": {},
671 | "outputs": [],
672 | "source": [
673 | "# export\n",
674 | "def merge_feature_dfs(design_df,\n",
675 | " aa_subseq_df, aa_features=None,\n",
676 | " domain_df=None,\n",
677 | " conservation_df=None,\n",
678 | " id_cols=None):\n",
679 | " if id_cols is None:\n",
680 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length',\n",
681 | " 'Target Transcript', 'Orientation']\n",
682 | " if aa_features is None:\n",
683 | " aa_features = ['Pos. Ind. 1mer',\n",
684 | " 'Hydrophobicity', 'Aromaticity',\n",
685 | " 'Isoelectric Point', 'Secondary Structure']\n",
686 | " if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]:\n",
687 | " raise ValueError('id_cols must uniquely identify rows of the design dataframe')\n",
688 | " feature_df_dict = dict()\n",
689 | " feature_list = list()\n",
690 | " position_feature_df = get_position_features(design_df, id_cols=id_cols)\n",
691 | " feature_df_dict['position'] = position_feature_df\n",
692 | " feature_list.extend(['Target Cut %', 'sense'])\n",
693 | " if domain_df is not None:\n",
694 | " feature_df_dict['domain'] = domain_df\n",
695 | " feature_list.extend(['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',\n",
696 | " 'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',\n",
697 | " 'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles'])\n",
698 | " if conservation_df is not None:\n",
699 | " feature_df_dict['conservation'] = conservation_df\n",
700 |     "    # hardcoded: conservation feature columns for the small (4 aa) and large (32 aa) windows\n",
701 | " feature_list.extend(['cons_4', 'cons_32'])\n",
702 | " aa_feature_df = get_amino_acid_features(aa_subseq_df, aa_features, id_cols)\n",
703 | " feature_list.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',\n",
704 | " 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*',\n",
705 | " 'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn',\n",
706 | " 'Sheet'])\n",
707 | " feature_df_dict['aa'] = aa_feature_df\n",
708 | " feature_df = design_df[id_cols]\n",
709 | " for key, df in feature_df_dict.items():\n",
710 | " feature_df = pd.merge(feature_df, df, how='left', on=id_cols)\n",
711 | " return feature_df, feature_list\n"
712 | ]
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": null,
717 | "metadata": {},
718 | "outputs": [],
719 | "source": [
720 | "feature_df, feature_list = merge_feature_dfs(design_df=design_df,\n",
721 | " aa_subseq_df=aa_subseq_df,\n",
722 | " domain_df=protein_domain_feature_df,\n",
723 | " conservation_df=conservation_features)"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": null,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "assert feature_df[feature_list].shape[1] == len(feature_list)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": null,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": []
741 | }
742 | ],
743 | "metadata": {
744 | "kernelspec": {
745 | "display_name": "rs3_v2",
746 | "language": "python",
747 | "name": "rs3_v2"
748 | }
749 | },
750 | "nbformat": 4,
751 | "nbformat_minor": 4
752 | }
753 |
--------------------------------------------------------------------------------
/03_predicttarg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# default_exp predicttarg"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "# predicttarg\n",
17 | "\n",
18 | "> Rule set 3 target-site predictions"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "# export\n",
28 | "from rs3 import targetfeat\n",
29 | "import joblib\n",
30 | "import os"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "import lightgbm\n",
40 | "import pandas as pd\n",
41 | "from rs3 import targetdata\n",
42 | "from scipy import stats\n",
43 | "import numpy as np"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "__file__ = os.path.abspath('') + '/03_predicttarg.ipynb'\n",
53 | "import multiprocessing\n",
54 | "max_n_jobs = multiprocessing.cpu_count()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "# export\n",
64 | "def load_target_model(lite=False):\n",
65 | " \"\"\"Load rule set 3 target model\"\"\"\n",
66 | " if lite:\n",
67 | " model_name = 'target_lite_model.pkl'\n",
68 | " else:\n",
69 | " model_name = 'target_model.pkl'\n",
70 | " model = joblib.load(os.path.join(os.path.dirname(__file__), model_name))\n",
71 | " return model"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stderr",
81 | "output_type": "stream",
82 | "text": [
83 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
84 | " warnings.warn(\n",
85 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
86 | " warnings.warn(\n"
87 | ]
88 | }
89 | ],
90 | "source": [
91 | "assert type(load_target_model()['regressor']) == lightgbm.sklearn.LGBMRegressor"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "# export\n",
101 | "def predict_target(design_df, aa_subseq_df, domain_feature_df=None,\n",
102 | " conservation_feature_df=None, id_cols=None):\n",
103 |     "    \"\"\"Make predictions using the Rule Set 3 target model. Note that if the domain_feature_df\n",
104 |     "    or conservation_feature_df are not supplied, then the lite model will be used, otherwise the full model is used.\n",
105 | "\n",
106 | " :param design_df: DataFrame\n",
107 | " :param aa_subseq_df: DataFrame\n",
108 |     "    :param domain_feature_df: DataFrame\n    :param conservation_feature_df: DataFrame\n",
109 | " :param id_cols: list or str\n",
110 |     "    :return: array of predicted scores\n",
111 | " \"\"\"\n",
112 | " if (domain_feature_df is None) or (conservation_feature_df is None):\n",
113 | " lite = True\n",
114 | " domain_feature_df = None\n",
115 | " conservation_feature_df = None\n",
116 | " else:\n",
117 | " lite = False\n",
118 | " model = load_target_model(lite=lite)\n",
119 | " if id_cols is None:\n",
120 | " id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']\n",
121 | " target_feature_df, target_feature_cols = targetfeat.merge_feature_dfs(design_df,\n",
122 | " aa_subseq_df=aa_subseq_df,\n",
123 | " domain_df=domain_feature_df,\n",
124 | " conservation_df=conservation_feature_df,\n",
125 | " id_cols=id_cols)\n",
126 | " X_target = target_feature_df[target_feature_cols]\n",
127 | " predictions = model.predict(X_target)\n",
128 | " return predictions"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": [
137 | "design_df = pd.read_table('test_data/sgrna-designs.txt')\n",
138 | "design_targ_df = targetfeat.add_target_columns(design_df)\n",
139 | "id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Getting amino acid sequences\n"
152 | ]
153 | },
154 | {
155 | "name": "stderr",
156 | "output_type": "stream",
157 | "text": [
158 | "100%|█████████████████████████████████████████████| 4/4 [00:04<00:00, 1.12s/it]\n"
159 | ]
160 | },
161 | {
162 | "data": {
163 | "text/html": [
164 | "
\n",
165 | "\n",
178 | "
\n",
179 | " \n",
180 | " \n",
181 | " \n",
182 | " Target Transcript \n",
183 | " Target Total Length \n",
184 | " Transcript Base \n",
185 | " version \n",
186 | " molecule \n",
187 | " desc \n",
188 | " id \n",
189 | " seq \n",
190 | " AA len \n",
191 | " AA Index \n",
192 | " Orientation \n",
193 | " sgRNA Context Sequence \n",
194 | " Target Cut Length \n",
195 | " extended_seq \n",
196 | " AA 0-Indexed \n",
197 | " AA 0-Indexed padded \n",
198 | " seq_start \n",
199 | " seq_end \n",
200 | " AA Subsequence \n",
201 | " \n",
202 | " \n",
203 | " \n",
204 | " \n",
205 | " 0 \n",
206 | " ENST00000259457.8 \n",
207 | " 834 \n",
208 | " ENST00000259457 \n",
209 | " 3 \n",
210 | " protein \n",
211 | " None \n",
212 | " ENSP00000259457 \n",
213 | " MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... \n",
214 | " 277 \n",
215 | " 64 \n",
216 | " sense \n",
217 | " TGGAGCAGATACAAGAGCAACTGAAGGGAT \n",
218 | " 191 \n",
219 | " -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... \n",
220 | " 63 \n",
221 | " 80 \n",
222 | " 64 \n",
223 | " 96 \n",
224 | " GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI \n",
225 | " \n",
226 | " \n",
227 | " 1 \n",
228 | " ENST00000259457.8 \n",
229 | " 834 \n",
230 | " ENST00000259457 \n",
231 | " 3 \n",
232 | " protein \n",
233 | " None \n",
234 | " ENSP00000259457 \n",
235 | " MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... \n",
236 | " 277 \n",
237 | " 46 \n",
238 | " sense \n",
239 | " CCGGAAAACTGGCACGACCATCGCTGGGGT \n",
240 | " 137 \n",
241 | " -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... \n",
242 | " 45 \n",
243 | " 62 \n",
244 | " 46 \n",
245 | " 78 \n",
246 | " AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR \n",
247 | " \n",
248 | " \n",
249 | " 2 \n",
250 | " ENST00000394249.8 \n",
251 | " 1863 \n",
252 | " ENST00000394249 \n",
253 | " 3 \n",
254 | " protein \n",
255 | " None \n",
256 | " ENSP00000377793 \n",
257 | " MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... \n",
258 | " 620 \n",
259 | " 106 \n",
260 | " sense \n",
261 | " TAGAAAAAGATTTGCGCACCCAAGTGGAAT \n",
262 | " 316 \n",
263 | " -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... \n",
264 | " 105 \n",
265 | " 122 \n",
266 | " 106 \n",
267 | " 138 \n",
268 | " EEGETTILQLEKDLRTQVELMRKQKKERKQELK \n",
269 | " \n",
270 | " \n",
271 | " 3 \n",
272 | " ENST00000394249.8 \n",
273 | " 1863 \n",
274 | " ENST00000394249 \n",
275 | " 3 \n",
276 | " protein \n",
277 | " None \n",
278 | " ENSP00000377793 \n",
279 | " MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... \n",
280 | " 620 \n",
281 | " 263 \n",
282 | " antisense \n",
283 | " TGGCCTTTGACCCAGACATAATGGTGGCCA \n",
284 | " 787 \n",
285 | " -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... \n",
286 | " 262 \n",
287 | " 279 \n",
288 | " 263 \n",
289 | " 295 \n",
290 | " WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV \n",
291 | " \n",
292 | " \n",
293 | " 4 \n",
294 | " ENST00000361337.3 \n",
295 | " 2298 \n",
296 | " ENST00000361337 \n",
297 | " 2 \n",
298 | " protein \n",
299 | " None \n",
300 | " ENSP00000354522 \n",
301 | " MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... \n",
302 | " 765 \n",
303 | " 140 \n",
304 | " antisense \n",
305 | " AAATACTCACTCATCCTCATCTCGAGGTCT \n",
306 | " 420 \n",
307 | " -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... \n",
308 | " 139 \n",
309 | " 156 \n",
310 | " 140 \n",
311 | " 172 \n",
312 | " GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED \n",
313 | " \n",
314 | " \n",
315 | " ... \n",
316 | " ... \n",
317 | " ... \n",
318 | " ... \n",
319 | " ... \n",
320 | " ... \n",
321 | " ... \n",
322 | " ... \n",
323 | " ... \n",
324 | " ... \n",
325 | " ... \n",
326 | " ... \n",
327 | " ... \n",
328 | " ... \n",
329 | " ... \n",
330 | " ... \n",
331 | " ... \n",
332 | " ... \n",
333 | " ... \n",
334 | " ... \n",
335 | " \n",
336 | " \n",
337 | " 395 \n",
338 | " ENST00000454402.7 \n",
339 | " 1023 \n",
340 | " ENST00000454402 \n",
341 | " 2 \n",
342 | " protein \n",
343 | " None \n",
344 | " ENSP00000408295 \n",
345 | " METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... \n",
346 | " 340 \n",
347 | " 74 \n",
348 | " antisense \n",
349 | " TGTCTTTATATAGCTGTTTCGCACAGGCTA \n",
350 | " 220 \n",
351 | " -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... \n",
352 | " 73 \n",
353 | " 90 \n",
354 | " 74 \n",
355 | " 106 \n",
356 | " LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL \n",
357 | " \n",
358 | " \n",
359 | " 396 \n",
360 | " ENST00000254998.3 \n",
361 | " 423 \n",
362 | " ENST00000254998 \n",
363 | " 2 \n",
364 | " protein \n",
365 | " None \n",
366 | " ENSP00000254998 \n",
367 | " MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... \n",
368 | " 140 \n",
369 | " 27 \n",
370 | " sense \n",
371 | " TTGTCAATGTCTACTACACCACCATGGATA \n",
372 | " 79 \n",
373 | " -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... \n",
374 | " 26 \n",
375 | " 43 \n",
376 | " 27 \n",
377 | " 59 \n",
378 | " DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA \n",
379 | " \n",
380 | " \n",
381 | " 397 \n",
382 | " ENST00000254998.3 \n",
383 | " 423 \n",
384 | " ENST00000254998 \n",
385 | " 2 \n",
386 | " protein \n",
387 | " None \n",
388 | " ENSP00000254998 \n",
389 | " MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... \n",
390 | " 140 \n",
391 | " 39 \n",
392 | " sense \n",
393 | " GGCGTTTGCTGTCCCGCCTGTACATGGGCA \n",
394 | " 115 \n",
395 | " -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... \n",
396 | " 38 \n",
397 | " 55 \n",
398 | " 39 \n",
399 | " 71 \n",
400 | " VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ \n",
401 | " \n",
402 | " \n",
403 | " 398 \n",
404 | " ENST00000381685.10 \n",
405 | " 2067 \n",
406 | " ENST00000381685 \n",
407 | " 5 \n",
408 | " protein \n",
409 | " None \n",
410 | " ENSP00000371101 \n",
411 | " MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... \n",
412 | " 688 \n",
413 | " 259 \n",
414 | " antisense \n",
415 | " ACTAGCAATGGCTTATCAGATCGAAGGTCA \n",
416 | " 776 \n",
417 | " -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... \n",
418 | " 258 \n",
419 | " 275 \n",
420 | " 259 \n",
421 | " 291 \n",
422 | " TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI \n",
423 | " \n",
424 | " \n",
425 | " 399 \n",
426 | " ENST00000381685.10 \n",
427 | " 2067 \n",
428 | " ENST00000381685 \n",
429 | " 5 \n",
430 | " protein \n",
431 | " None \n",
432 | " ENSP00000371101 \n",
433 | " MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... \n",
434 | " 688 \n",
435 | " 108 \n",
436 | " sense \n",
437 | " AAATTTTGTCTGATGACTACTCAAAGGTAT \n",
438 | " 322 \n",
439 | " -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... \n",
440 | " 107 \n",
441 | " 124 \n",
442 | " 108 \n",
443 | " 140 \n",
444 | " CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ \n",
445 | " \n",
446 | " \n",
447 | "
\n",
448 | "
400 rows × 19 columns
\n",
449 | "
"
450 | ],
451 | "text/plain": [
452 | " Target Transcript Target Total Length Transcript Base version \\\n",
453 | "0 ENST00000259457.8 834 ENST00000259457 3 \n",
454 | "1 ENST00000259457.8 834 ENST00000259457 3 \n",
455 | "2 ENST00000394249.8 1863 ENST00000394249 3 \n",
456 | "3 ENST00000394249.8 1863 ENST00000394249 3 \n",
457 | "4 ENST00000361337.3 2298 ENST00000361337 2 \n",
458 | ".. ... ... ... ... \n",
459 | "395 ENST00000454402.7 1023 ENST00000454402 2 \n",
460 | "396 ENST00000254998.3 423 ENST00000254998 2 \n",
461 | "397 ENST00000254998.3 423 ENST00000254998 2 \n",
462 | "398 ENST00000381685.10 2067 ENST00000381685 5 \n",
463 | "399 ENST00000381685.10 2067 ENST00000381685 5 \n",
464 | "\n",
465 | " molecule desc id \\\n",
466 | "0 protein None ENSP00000259457 \n",
467 | "1 protein None ENSP00000259457 \n",
468 | "2 protein None ENSP00000377793 \n",
469 | "3 protein None ENSP00000377793 \n",
470 | "4 protein None ENSP00000354522 \n",
471 | ".. ... ... ... \n",
472 | "395 protein None ENSP00000408295 \n",
473 | "396 protein None ENSP00000254998 \n",
474 | "397 protein None ENSP00000254998 \n",
475 | "398 protein None ENSP00000371101 \n",
476 | "399 protein None ENSP00000371101 \n",
477 | "\n",
478 | " seq AA len AA Index \\\n",
479 | "0 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 64 \n",
480 | "1 MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI... 277 46 \n",
481 | "2 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 106 \n",
482 | "3 MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH... 620 263 \n",
483 | "4 MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK... 765 140 \n",
484 | ".. ... ... ... \n",
485 | "395 METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK... 340 74 \n",
486 | "396 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 27 \n",
487 | "397 MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV... 140 39 \n",
488 | "398 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 259 \n",
489 | "399 MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI... 688 108 \n",
490 | "\n",
491 | " Orientation sgRNA Context Sequence Target Cut Length \\\n",
492 | "0 sense TGGAGCAGATACAAGAGCAACTGAAGGGAT 191 \n",
493 | "1 sense CCGGAAAACTGGCACGACCATCGCTGGGGT 137 \n",
494 | "2 sense TAGAAAAAGATTTGCGCACCCAAGTGGAAT 316 \n",
495 | "3 antisense TGGCCTTTGACCCAGACATAATGGTGGCCA 787 \n",
496 | "4 antisense AAATACTCACTCATCCTCATCTCGAGGTCT 420 \n",
497 | ".. ... ... ... \n",
498 | "395 antisense TGTCTTTATATAGCTGTTTCGCACAGGCTA 220 \n",
499 | "396 sense TTGTCAATGTCTACTACACCACCATGGATA 79 \n",
500 | "397 sense GGCGTTTGCTGTCCCGCCTGTACATGGGCA 115 \n",
501 | "398 antisense ACTAGCAATGGCTTATCAGATCGAAGGTCA 776 \n",
502 | "399 sense AAATTTTGTCTGATGACTACTCAAAGGTAT 322 \n",
503 | "\n",
504 | " extended_seq AA 0-Indexed \\\n",
505 | "0 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 63 \n",
506 | "1 -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF... 45 \n",
507 | "2 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 105 \n",
508 | "3 -----------------MRRSEVLAEESIVCLQKALNHLREIWELI... 262 \n",
509 | "4 -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK... 139 \n",
510 | ".. ... ... \n",
511 | "395 -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ... 73 \n",
512 | "396 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 26 \n",
513 | "397 -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD... 38 \n",
514 | "398 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 258 \n",
515 | "399 -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK... 107 \n",
516 | "\n",
517 | " AA 0-Indexed padded seq_start seq_end \\\n",
518 | "0 80 64 96 \n",
519 | "1 62 46 78 \n",
520 | "2 122 106 138 \n",
521 | "3 279 263 295 \n",
522 | "4 156 140 172 \n",
523 | ".. ... ... ... \n",
524 | "395 90 74 106 \n",
525 | "396 43 27 59 \n",
526 | "397 55 39 71 \n",
527 | "398 275 259 291 \n",
528 | "399 124 108 140 \n",
529 | "\n",
530 | " AA Subsequence \n",
531 | "0 GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI \n",
532 | "1 AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR \n",
533 | "2 EEGETTILQLEKDLRTQVELMRKQKKERKQELK \n",
534 | "3 WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV \n",
535 | "4 GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED \n",
536 | ".. ... \n",
537 | "395 LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL \n",
538 | "396 DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA \n",
539 | "397 VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ \n",
540 | "398 TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI \n",
541 | "399 CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ \n",
542 | "\n",
543 | "[400 rows x 19 columns]"
544 | ]
545 | },
546 | "execution_count": null,
547 | "metadata": {},
548 | "output_type": "execute_result"
549 | }
550 | ],
551 | "source": [
552 | "## aa sequences\n",
553 | "aa_seq_df = targetdata.build_transcript_aa_seq_df(design_df, n_jobs=2)\n",
554 | "aa_subseq_df = targetfeat.get_aa_subseq_df(sg_designs=design_targ_df, aa_seq_df=aa_seq_df, width=16,\n",
555 | " id_cols=id_cols)\n",
556 | "aa_subseq_df"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": null,
562 | "metadata": {},
563 | "outputs": [
564 | {
565 | "name": "stdout",
566 | "output_type": "stream",
567 | "text": [
568 | "Getting protein domains\n"
569 | ]
570 | },
571 | {
572 | "name": "stderr",
573 | "output_type": "stream",
574 | "text": [
575 | "100%|█████████████████████████████████████████| 200/200 [00:53<00:00, 3.75it/s]\n"
576 | ]
577 | }
578 | ],
579 | "source": [
580 | "## domains\n",
581 | "domain_df = targetdata.build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=2)\n",
582 | "domain_feature_df = targetfeat.get_protein_domain_features(design_targ_df, domain_df, sources=None,\n",
583 | " id_cols=id_cols)"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": null,
589 | "metadata": {},
590 | "outputs": [
591 | {
592 | "name": "stdout",
593 | "output_type": "stream",
594 | "text": [
595 | "Getting conservation\n"
596 | ]
597 | },
598 | {
599 | "name": "stderr",
600 | "output_type": "stream",
601 | "text": [
602 | "100%|█████████████████████████████████████████| 200/200 [06:24<00:00, 1.92s/it]\n"
603 | ]
604 | },
605 | {
606 | "data": {
607 | "text/html": [
608 | "\n",
609 | "\n",
622 | "
\n",
623 | " \n",
624 | " \n",
625 | " \n",
626 | " sgRNA Context Sequence \n",
627 | " Target Cut Length \n",
628 | " Target Transcript \n",
629 | " Orientation \n",
630 | " cons_4 \n",
631 | " cons_32 \n",
632 | " \n",
633 | " \n",
634 | " \n",
635 | " \n",
636 | " 0 \n",
637 | " AAAAGAATGATGAAAAGACACCACAGGGAG \n",
638 | " 244 \n",
639 | " ENST00000610426.5 \n",
640 | " sense \n",
641 | " 0.218231 \n",
642 | " 0.408844 \n",
643 | " \n",
644 | " \n",
645 | " 1 \n",
646 | " AAAAGAGCCATGAATCTAAACATCAGGAAT \n",
647 | " 640 \n",
648 | " ENST00000223073.6 \n",
649 | " sense \n",
650 | " 0.129825 \n",
651 | " 0.278180 \n",
652 | " \n",
653 | " \n",
654 | " 2 \n",
655 | " AAAAGCGCCAAATGGCCCGAGAATTGGGAG \n",
656 | " 709 \n",
657 | " ENST00000331923.9 \n",
658 | " sense \n",
659 | " 0.470906 \n",
660 | " 0.532305 \n",
661 | " \n",
662 | " \n",
663 | " 3 \n",
664 | " AAACAGAAAAAGTTAAAATCACCAAGGTGT \n",
665 | " 496 \n",
666 | " ENST00000283882.4 \n",
667 | " sense \n",
668 | " 0.580556 \n",
669 | " 0.602708 \n",
670 | " \n",
671 | " \n",
672 | " 4 \n",
673 | " AAACAGATGGAAGATGCTTACCGGGGGACC \n",
674 | " 132 \n",
675 | " ENST00000393047.8 \n",
676 | " sense \n",
677 | " 0.283447 \n",
678 | " 0.414293 \n",
679 | " \n",
680 | " \n",
681 | " ... \n",
682 | " ... \n",
683 | " ... \n",
684 | " ... \n",
685 | " ... \n",
686 | " ... \n",
687 | " ... \n",
688 | " \n",
689 | " \n",
690 | " 395 \n",
691 | " TTTGATTGCATTAAGGTTGGACTCTGGATT \n",
692 | " 246 \n",
693 | " ENST00000249269.9 \n",
694 | " sense \n",
695 | " 0.580612 \n",
696 | " 0.618707 \n",
697 | " \n",
698 | " \n",
699 | " 396 \n",
700 | " TTTGCCCACAGCTCCAAAGCATCGCGGAGA \n",
701 | " 130 \n",
702 | " ENST00000227618.8 \n",
703 | " sense \n",
704 | " 0.323770 \n",
705 | " 0.416368 \n",
706 | " \n",
707 | " \n",
708 | " 397 \n",
709 | " TTTTACAGTGCGATGTATGATGTATGGCTT \n",
710 | " 119 \n",
711 | " ENST00000338366.6 \n",
712 | " sense \n",
713 | " 0.788000 \n",
714 | " 0.537417 \n",
715 | " \n",
716 | " \n",
717 | " 398 \n",
718 | " TTTTGGATCTCGTAGTGATTCAAGAGGGAA \n",
719 | " 233 \n",
720 | " ENST00000629496.3 \n",
721 | " sense \n",
722 | " 0.239630 \n",
723 | " 0.347615 \n",
724 | " \n",
725 | " \n",
726 | " 399 \n",
727 | " TTTTTGTTACTACAGGTTCGCTGCTGGGAA \n",
728 | " 201 \n",
729 | " ENST00000395840.6 \n",
730 | " sense \n",
731 | " 0.693767 \n",
732 | " 0.639044 \n",
733 | " \n",
734 | " \n",
735 | "
\n",
736 | "
400 rows × 6 columns
\n",
737 | "
"
738 | ],
739 | "text/plain": [
740 | " sgRNA Context Sequence Target Cut Length Target Transcript \\\n",
741 | "0 AAAAGAATGATGAAAAGACACCACAGGGAG 244 ENST00000610426.5 \n",
742 | "1 AAAAGAGCCATGAATCTAAACATCAGGAAT 640 ENST00000223073.6 \n",
743 | "2 AAAAGCGCCAAATGGCCCGAGAATTGGGAG 709 ENST00000331923.9 \n",
744 | "3 AAACAGAAAAAGTTAAAATCACCAAGGTGT 496 ENST00000283882.4 \n",
745 | "4 AAACAGATGGAAGATGCTTACCGGGGGACC 132 ENST00000393047.8 \n",
746 | ".. ... ... ... \n",
747 | "395 TTTGATTGCATTAAGGTTGGACTCTGGATT 246 ENST00000249269.9 \n",
748 | "396 TTTGCCCACAGCTCCAAAGCATCGCGGAGA 130 ENST00000227618.8 \n",
749 | "397 TTTTACAGTGCGATGTATGATGTATGGCTT 119 ENST00000338366.6 \n",
750 | "398 TTTTGGATCTCGTAGTGATTCAAGAGGGAA 233 ENST00000629496.3 \n",
751 | "399 TTTTTGTTACTACAGGTTCGCTGCTGGGAA 201 ENST00000395840.6 \n",
752 | "\n",
753 | " Orientation cons_4 cons_32 \n",
754 | "0 sense 0.218231 0.408844 \n",
755 | "1 sense 0.129825 0.278180 \n",
756 | "2 sense 0.470906 0.532305 \n",
757 | "3 sense 0.580556 0.602708 \n",
758 | "4 sense 0.283447 0.414293 \n",
759 | ".. ... ... ... \n",
760 | "395 sense 0.580612 0.618707 \n",
761 | "396 sense 0.323770 0.416368 \n",
762 | "397 sense 0.788000 0.537417 \n",
763 | "398 sense 0.239630 0.347615 \n",
764 | "399 sense 0.693767 0.639044 \n",
765 | "\n",
766 | "[400 rows x 6 columns]"
767 | ]
768 | },
769 | "execution_count": null,
770 | "metadata": {},
771 | "output_type": "execute_result"
772 | }
773 | ],
774 | "source": [
775 | "## conservation\n",
776 | "conservation_df = targetdata.build_conservation_df(design_df, n_jobs=max_n_jobs)\n",
777 | "conservation_feature_df = targetfeat.get_conservation_features(design_targ_df, conservation_df,\n",
778 | " small_width=2, large_width=16,\n",
779 | " conservation_column='ranked_conservation',\n",
780 | " id_cols=id_cols)\n",
781 | "conservation_feature_df"
782 | ]
783 | },
784 | {
785 | "cell_type": "code",
786 | "execution_count": null,
787 | "metadata": {},
788 | "outputs": [
789 | {
790 | "name": "stderr",
791 | "output_type": "stream",
792 | "text": [
793 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
794 | " warnings.warn(\n",
795 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
796 | " warnings.warn(\n"
797 | ]
798 | }
799 | ],
800 | "source": [
801 | "predictions = predict_target(design_df=design_df,\n",
802 | " aa_subseq_df=aa_subseq_df,\n",
803 | " domain_feature_df=domain_feature_df,\n",
804 | " conservation_feature_df=conservation_feature_df)\n",
805 | "design_df['Target Score'] = predictions"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": null,
811 | "metadata": {},
812 | "outputs": [
813 | {
814 | "name": "stderr",
815 | "output_type": "stream",
816 | "text": [
817 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
818 | " warnings.warn(\n",
819 | "/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.\n",
820 | " warnings.warn(\n"
821 | ]
822 | }
823 | ],
824 | "source": [
825 | "lite_predictions = predict_target(design_df=design_df,\n",
826 | " aa_subseq_df=aa_subseq_df)\n",
827 | "design_df['Target Score Lite'] = lite_predictions"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": null,
833 | "metadata": {},
834 | "outputs": [
835 | {
836 | "data": {
837 | "text/plain": [
838 | "0 TGGAGCAGATACAAGAGCAACTGAAGGGAT\n",
839 | "1 CCGGAAAACTGGCACGACCATCGCTGGGGT\n",
840 | "2 TAGAAAAAGATTTGCGCACCCAAGTGGAAT\n",
841 | "3 TGGCCTTTGACCCAGACATAATGGTGGCCA\n",
842 | "4 AAATACTCACTCATCCTCATCTCGAGGTCT\n",
843 | " ... \n",
844 | "395 TGTCTTTATATAGCTGTTTCGCACAGGCTA\n",
845 | "396 TTGTCAATGTCTACTACACCACCATGGATA\n",
846 | "397 GGCGTTTGCTGTCCCGCCTGTACATGGGCA\n",
847 | "398 ACTAGCAATGGCTTATCAGATCGAAGGTCA\n",
848 | "399 AAATTTTGTCTGATGACTACTCAAAGGTAT\n",
849 | "Name: sgRNA Context Sequence, Length: 400, dtype: object"
850 | ]
851 | },
852 | "execution_count": null,
853 | "metadata": {},
854 | "output_type": "execute_result"
855 | }
856 | ],
857 | "source": [
858 | "design_df['sgRNA Context Sequence']"
859 | ]
860 | },
861 | {
862 | "cell_type": "code",
863 | "execution_count": null,
864 | "metadata": {},
865 | "outputs": [],
866 | "source": [
867 | "assert stats.pearsonr(design_df['Target Score'], design_df['Target Score Lite'])[0] > 0.7"
868 | ]
869 | },
870 | {
871 | "cell_type": "code",
872 | "execution_count": null,
873 | "metadata": {},
874 | "outputs": [],
875 | "source": [
876 | "sanger_df = pd.read_csv('test_data/Behan2019_activity.csv')\n",
877 | "gecko_df = pd.read_csv('test_data/Aguirre2016_activity.csv')\n",
878 | "\n",
879 | "sanger_designs = sanger_df.merge(design_df, how='inner',\n",
880 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n",
881 | " 'Target Cut %'])\n",
882 | "gecko_designs = gecko_df.merge(design_df, how='inner',\n",
883 | " on=['sgRNA Sequence', 'sgRNA Context Sequence', 'Target Gene Symbol',\n",
884 | " 'Target Cut %'])\n",
885 | "assert stats.pearsonr(sanger_designs['avg_mean_centered_neg_lfc'],\n",
886 | " sanger_designs['Target Score'])[0] > 0.2\n",
887 | "assert stats.pearsonr(gecko_designs['avg_mean_centered_neg_lfc'],\n",
888 | " gecko_designs['Target Score'])[0] > 0.05"
889 | ]
890 | }
891 | ],
892 | "metadata": {
893 | "kernelspec": {
894 | "display_name": "rs3_v2",
895 | "language": "python",
896 | "name": "rs3_v2"
897 | }
898 | },
899 | "nbformat": 4,
900 | "nbformat_minor": 4
901 | }
902 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to contribute
2 |
3 | ## How to get started
4 |
5 | Before anything else, please install the git hooks that run automatic scripts during each commit and merge to strip the notebooks of superfluous metadata (and avoid merge conflicts). After cloning the repository, run the following command inside it:
6 | ```
7 | nbdev_install_git_hooks
8 | ```
9 |
10 | ## Did you find a bug?
11 |
12 | * Ensure the bug was not already reported by searching on GitHub under Issues.
13 | * If you're unable to find an open issue addressing the problem, open a new one. Be sure to include a title and clear description, as much relevant information as possible, and a code sample or an executable test case demonstrating the expected behavior that is not occurring.
14 | * Be sure to add the complete error messages.
15 |
16 | #### Did you write a patch that fixes a bug?
17 |
18 | * Open a new GitHub pull request with the patch.
19 | * Ensure that your PR includes a test that fails without your patch, and passes with it.
20 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
21 |
22 | ## PR submission guidelines
23 |
24 | * Keep each PR focused. While it's more convenient, do not combine several unrelated fixes together. Create as many branches as needed to keep each PR focused.
25 | * Do not mix style changes/fixes with "functional" changes. It's very difficult to review such PRs and they will most likely be rejected.
26 | * Do not add/remove vertical whitespace. Preserve the original style of the file you edit as much as you can.
27 | * Do not turn an already submitted PR into your development playground. If after you submitted PR, you discovered that more work is needed - close the PR, do the required work and then submit a new PR. Otherwise each of your commits requires attention from maintainers of the project.
28 | * If, however, you submitted a PR and received a request for changes, you should proceed with commits inside that PR, so that the maintainer can see the incremental fixes and won't need to review the whole PR again. In the exception case where you realize it'll take many many commits to complete the requests, then it's probably best to close the PR, do the work and then submit it again. Use common sense where you'd choose one way over another.
29 |
30 | ## Do you want to contribute to the documentation?
31 |
32 | * Docs are automatically created from the notebooks in the nbs folder.
33 |
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include settings.ini
2 | include LICENSE
3 | include CONTRIBUTING.md
4 | include README.md
5 | include rs3/RuleSet3.pkl
6 | include rs3/target_model.pkl
7 | include rs3/target_lite_model.pkl
8 | recursive-exclude * __pycache__
9 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .ONESHELL:
2 | SHELL := /bin/bash
3 | SRC = $(wildcard ./*.ipynb)
4 |
5 | all: rs3 docs
6 |
7 | rs3: $(SRC)
8 | nbdev_build_lib
9 | touch rs3
10 |
11 | sync:
12 | nbdev_update_lib
13 |
14 | docs_serve: docs
15 | cd docs && bundle exec jekyll serve
16 |
17 | docs: $(SRC)
18 | nbdev_build_docs
19 | touch docs
20 |
21 | test:
22 | nbdev_test_nbs
23 |
24 | release: pypi conda_release
25 | nbdev_bump_version
26 |
27 | conda_release:
28 | fastrelease_conda_package
29 |
30 | pypi: dist
31 | twine upload --repository pypi dist/*
32 |
33 | dist: clean
34 | python setup.py sdist bdist_wheel
35 |
36 | clean:
37 | rm -rf dist
--------------------------------------------------------------------------------
/RuleSet3.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/RuleSet3.pkl
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 | fastai: &fastai
4 | restart: unless-stopped
5 | working_dir: /data
6 | image: fastai/codespaces
7 | logging:
8 | driver: json-file
9 | options:
10 | max-size: 50m
11 | stdin_open: true
12 | tty: true
13 | volumes:
14 | - .:/data/
15 |
16 | notebook:
17 | <<: *fastai
18 | command: bash -c "pip install -e . && jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port=8080 --NotebookApp.token='' --NotebookApp.password=''"
19 | ports:
20 | - "8080:8080"
21 |
22 | watcher:
23 | <<: *fastai
24 | command: watchmedo shell-command --command nbdev_build_docs --pattern *.ipynb --recursive --drop
25 | network_mode: host # for GitHub Codespaces https://github.com/features/codespaces/
26 |
27 | jekyll:
28 | <<: *fastai
29 | ports:
30 | - "4000:4000"
31 | command: >
32 | bash -c "pip install .
33 | && nbdev_build_docs && cd docs
34 | && bundle i
35 | && chmod -R u+rwx . && bundle exec jekyll serve --host 0.0.0.0"
36 |
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _site/
2 |
--------------------------------------------------------------------------------
/docs/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | gem 'github-pages', group: :jekyll_plugins
4 |
5 | # Added at 2019-11-25 10:11:40 -0800 by jhoward:
6 | gem "nokogiri", "< 1.11.1"
7 | gem "jekyll", ">= 3.7"
8 | gem "kramdown", ">= 2.3.1"
9 | gem "jekyll-remote-theme"
10 |
--------------------------------------------------------------------------------
/docs/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | activesupport (6.0.3.6)
5 | concurrent-ruby (~> 1.0, >= 1.0.2)
6 | i18n (>= 0.7, < 2)
7 | minitest (~> 5.1)
8 | tzinfo (~> 1.1)
9 | zeitwerk (~> 2.2, >= 2.2.2)
10 | addressable (2.7.0)
11 | public_suffix (>= 2.0.2, < 5.0)
12 | coffee-script (2.4.1)
13 | coffee-script-source
14 | execjs
15 | coffee-script-source (1.11.1)
16 | colorator (1.1.0)
17 | commonmarker (0.17.13)
18 | ruby-enum (~> 0.5)
19 | concurrent-ruby (1.1.8)
20 | dnsruby (1.61.5)
21 | simpleidn (~> 0.1)
22 | em-websocket (0.5.2)
23 | eventmachine (>= 0.12.9)
24 | http_parser.rb (~> 0.6.0)
25 | ethon (0.12.0)
26 | ffi (>= 1.3.0)
27 | eventmachine (1.2.7)
28 | execjs (2.7.0)
29 | faraday (1.3.0)
30 | faraday-net_http (~> 1.0)
31 | multipart-post (>= 1.2, < 3)
32 | ruby2_keywords
33 | faraday-net_http (1.0.1)
34 | ffi (1.15.0)
35 | forwardable-extended (2.6.0)
36 | gemoji (3.0.1)
37 | github-pages (214)
38 | github-pages-health-check (= 1.17.0)
39 | jekyll (= 3.9.0)
40 | jekyll-avatar (= 0.7.0)
41 | jekyll-coffeescript (= 1.1.1)
42 | jekyll-commonmark-ghpages (= 0.1.6)
43 | jekyll-default-layout (= 0.1.4)
44 | jekyll-feed (= 0.15.1)
45 | jekyll-gist (= 1.5.0)
46 | jekyll-github-metadata (= 2.13.0)
47 | jekyll-mentions (= 1.6.0)
48 | jekyll-optional-front-matter (= 0.3.2)
49 | jekyll-paginate (= 1.1.0)
50 | jekyll-readme-index (= 0.3.0)
51 | jekyll-redirect-from (= 0.16.0)
52 | jekyll-relative-links (= 0.6.1)
53 | jekyll-remote-theme (= 0.4.3)
54 | jekyll-sass-converter (= 1.5.2)
55 | jekyll-seo-tag (= 2.7.1)
56 | jekyll-sitemap (= 1.4.0)
57 | jekyll-swiss (= 1.0.0)
58 | jekyll-theme-architect (= 0.1.1)
59 | jekyll-theme-cayman (= 0.1.1)
60 | jekyll-theme-dinky (= 0.1.1)
61 | jekyll-theme-hacker (= 0.1.2)
62 | jekyll-theme-leap-day (= 0.1.1)
63 | jekyll-theme-merlot (= 0.1.1)
64 | jekyll-theme-midnight (= 0.1.1)
65 | jekyll-theme-minimal (= 0.1.1)
66 | jekyll-theme-modernist (= 0.1.1)
67 | jekyll-theme-primer (= 0.5.4)
68 | jekyll-theme-slate (= 0.1.1)
69 | jekyll-theme-tactile (= 0.1.1)
70 | jekyll-theme-time-machine (= 0.1.1)
71 | jekyll-titles-from-headings (= 0.5.3)
72 | jemoji (= 0.12.0)
73 | kramdown (= 2.3.1)
74 | kramdown-parser-gfm (= 1.1.0)
75 | liquid (= 4.0.3)
76 | mercenary (~> 0.3)
77 | minima (= 2.5.1)
78 | nokogiri (>= 1.10.4, < 2.0)
79 | rouge (= 3.26.0)
80 | terminal-table (~> 1.4)
81 | github-pages-health-check (1.17.0)
82 | addressable (~> 2.3)
83 | dnsruby (~> 1.60)
84 | octokit (~> 4.0)
85 | public_suffix (>= 2.0.2, < 5.0)
86 | typhoeus (~> 1.3)
87 | html-pipeline (2.14.0)
88 | activesupport (>= 2)
89 | nokogiri (>= 1.4)
90 | http_parser.rb (0.6.0)
91 | i18n (0.9.5)
92 | concurrent-ruby (~> 1.0)
93 | jekyll (3.9.0)
94 | addressable (~> 2.4)
95 | colorator (~> 1.0)
96 | em-websocket (~> 0.5)
97 | i18n (~> 0.7)
98 | jekyll-sass-converter (~> 1.0)
99 | jekyll-watch (~> 2.0)
100 | kramdown (>= 1.17, < 3)
101 | liquid (~> 4.0)
102 | mercenary (~> 0.3.3)
103 | pathutil (~> 0.9)
104 | rouge (>= 1.7, < 4)
105 | safe_yaml (~> 1.0)
106 | jekyll-avatar (0.7.0)
107 | jekyll (>= 3.0, < 5.0)
108 | jekyll-coffeescript (1.1.1)
109 | coffee-script (~> 2.2)
110 | coffee-script-source (~> 1.11.1)
111 | jekyll-commonmark (1.3.1)
112 | commonmarker (~> 0.14)
113 | jekyll (>= 3.7, < 5.0)
114 | jekyll-commonmark-ghpages (0.1.6)
115 | commonmarker (~> 0.17.6)
116 | jekyll-commonmark (~> 1.2)
117 | rouge (>= 2.0, < 4.0)
118 | jekyll-default-layout (0.1.4)
119 | jekyll (~> 3.0)
120 | jekyll-feed (0.15.1)
121 | jekyll (>= 3.7, < 5.0)
122 | jekyll-gist (1.5.0)
123 | octokit (~> 4.2)
124 | jekyll-github-metadata (2.13.0)
125 | jekyll (>= 3.4, < 5.0)
126 | octokit (~> 4.0, != 4.4.0)
127 | jekyll-mentions (1.6.0)
128 | html-pipeline (~> 2.3)
129 | jekyll (>= 3.7, < 5.0)
130 | jekyll-optional-front-matter (0.3.2)
131 | jekyll (>= 3.0, < 5.0)
132 | jekyll-paginate (1.1.0)
133 | jekyll-readme-index (0.3.0)
134 | jekyll (>= 3.0, < 5.0)
135 | jekyll-redirect-from (0.16.0)
136 | jekyll (>= 3.3, < 5.0)
137 | jekyll-relative-links (0.6.1)
138 | jekyll (>= 3.3, < 5.0)
139 | jekyll-remote-theme (0.4.3)
140 | addressable (~> 2.0)
141 | jekyll (>= 3.5, < 5.0)
142 | jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
143 | rubyzip (>= 1.3.0, < 3.0)
144 | jekyll-sass-converter (1.5.2)
145 | sass (~> 3.4)
146 | jekyll-seo-tag (2.7.1)
147 | jekyll (>= 3.8, < 5.0)
148 | jekyll-sitemap (1.4.0)
149 | jekyll (>= 3.7, < 5.0)
150 | jekyll-swiss (1.0.0)
151 | jekyll-theme-architect (0.1.1)
152 | jekyll (~> 3.5)
153 | jekyll-seo-tag (~> 2.0)
154 | jekyll-theme-cayman (0.1.1)
155 | jekyll (~> 3.5)
156 | jekyll-seo-tag (~> 2.0)
157 | jekyll-theme-dinky (0.1.1)
158 | jekyll (~> 3.5)
159 | jekyll-seo-tag (~> 2.0)
160 | jekyll-theme-hacker (0.1.2)
161 | jekyll (> 3.5, < 5.0)
162 | jekyll-seo-tag (~> 2.0)
163 | jekyll-theme-leap-day (0.1.1)
164 | jekyll (~> 3.5)
165 | jekyll-seo-tag (~> 2.0)
166 | jekyll-theme-merlot (0.1.1)
167 | jekyll (~> 3.5)
168 | jekyll-seo-tag (~> 2.0)
169 | jekyll-theme-midnight (0.1.1)
170 | jekyll (~> 3.5)
171 | jekyll-seo-tag (~> 2.0)
172 | jekyll-theme-minimal (0.1.1)
173 | jekyll (~> 3.5)
174 | jekyll-seo-tag (~> 2.0)
175 | jekyll-theme-modernist (0.1.1)
176 | jekyll (~> 3.5)
177 | jekyll-seo-tag (~> 2.0)
178 | jekyll-theme-primer (0.5.4)
179 | jekyll (> 3.5, < 5.0)
180 | jekyll-github-metadata (~> 2.9)
181 | jekyll-seo-tag (~> 2.0)
182 | jekyll-theme-slate (0.1.1)
183 | jekyll (~> 3.5)
184 | jekyll-seo-tag (~> 2.0)
185 | jekyll-theme-tactile (0.1.1)
186 | jekyll (~> 3.5)
187 | jekyll-seo-tag (~> 2.0)
188 | jekyll-theme-time-machine (0.1.1)
189 | jekyll (~> 3.5)
190 | jekyll-seo-tag (~> 2.0)
191 | jekyll-titles-from-headings (0.5.3)
192 | jekyll (>= 3.3, < 5.0)
193 | jekyll-watch (2.2.1)
194 | listen (~> 3.0)
195 | jemoji (0.12.0)
196 | gemoji (~> 3.0)
197 | html-pipeline (~> 2.2)
198 | jekyll (>= 3.0, < 5.0)
199 | kramdown (2.3.1)
200 | rexml
201 | kramdown-parser-gfm (1.1.0)
202 | kramdown (~> 2.0)
203 | liquid (4.0.3)
204 | listen (3.5.1)
205 | rb-fsevent (~> 0.10, >= 0.10.3)
206 | rb-inotify (~> 0.9, >= 0.9.10)
207 | mercenary (0.3.6)
208 | mini_portile2 (2.5.0)
209 | minima (2.5.1)
210 | jekyll (>= 3.5, < 5.0)
211 | jekyll-feed (~> 0.9)
212 | jekyll-seo-tag (~> 2.1)
213 | minitest (5.14.4)
214 | multipart-post (2.1.1)
215 | nokogiri (1.11.0)
216 | mini_portile2 (~> 2.5.0)
217 | racc (~> 1.4)
218 | octokit (4.20.0)
219 | faraday (>= 0.9)
220 | sawyer (~> 0.8.0, >= 0.5.3)
221 | pathutil (0.16.2)
222 | forwardable-extended (~> 2.6)
223 | public_suffix (4.0.6)
224 | racc (1.5.2)
225 | rb-fsevent (0.10.4)
226 | rb-inotify (0.10.1)
227 | ffi (~> 1.0)
228 | rexml (3.2.5)
229 | rouge (3.26.0)
230 | ruby-enum (0.9.0)
231 | i18n
232 | ruby2_keywords (0.0.4)
233 | rubyzip (2.3.0)
234 | safe_yaml (1.0.5)
235 | sass (3.7.4)
236 | sass-listen (~> 4.0.0)
237 | sass-listen (4.0.0)
238 | rb-fsevent (~> 0.9, >= 0.9.4)
239 | rb-inotify (~> 0.9, >= 0.9.7)
240 | sawyer (0.8.2)
241 | addressable (>= 2.3.5)
242 | faraday (> 0.8, < 2.0)
243 | simpleidn (0.2.1)
244 | unf (~> 0.1.4)
245 | terminal-table (1.8.0)
246 | unicode-display_width (~> 1.1, >= 1.1.1)
247 | thread_safe (0.3.6)
248 | typhoeus (1.4.0)
249 | ethon (>= 0.9.0)
250 | tzinfo (1.2.9)
251 | thread_safe (~> 0.1)
252 | unf (0.1.4)
253 | unf_ext
254 | unf_ext (0.0.7.7)
255 | unicode-display_width (1.7.0)
256 | zeitwerk (2.4.2)
257 |
258 | PLATFORMS
259 | ruby
260 |
261 | DEPENDENCIES
262 | github-pages
263 | jekyll (>= 3.7)
264 | jekyll-remote-theme
265 | kramdown (>= 2.3.1)
266 | nokogiri (< 1.11.1)
267 |
268 | BUNDLED WITH
269 | 2.1.4
270 |
--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | repository: gpp-rnd/rs3
2 | output: web
3 | topnav_title: rs3
4 | site_title: rs3
5 | company_name: Genetic Perturbation Platform, Broad Institute
6 | description: Predict the activity of CRISPR sgRNAs
7 | # Set to false to disable KaTeX math
8 | use_math: true
9 | # Add Google analytics id if you have one and want to use it here
10 | google_analytics:
11 | # See http://nbdev.fast.ai/search for help with adding Search
12 | google_search:
13 |
14 | host: 127.0.0.1
15 | # the preview server used. Leave as is.
16 | port: 4000
17 | # the port where the preview is rendered.
18 |
19 | # Note: YAML keys must be unique — a second `exclude:` key would silently
20 | # override this list, so all excluded paths are kept in this single list.
21 | exclude:
22 |   - .idea/
23 |   - .gitignore
24 |   - vendor
25 |
26 | highlighter: rouge
27 | markdown: kramdown
28 | kramdown:
29 | input: GFM
30 | auto_ids: true
31 | hard_wrap: false
32 | syntax_highlighter: rouge
33 |
34 | collections:
35 | tooltips:
36 | output: false
37 |
38 | defaults:
39 | -
40 | scope:
41 | path: ""
42 | type: "pages"
43 | values:
44 | layout: "page"
45 | comments: true
46 | search: true
47 | sidebar: home_sidebar
48 | topnav: topnav
49 | -
50 | scope:
51 | path: ""
52 | type: "tooltips"
53 | values:
54 | layout: "page"
55 | comments: true
56 | search: true
57 | tooltip: true
58 |
59 | sidebars:
60 | - home_sidebar
61 |
62 | plugins:
63 | - jekyll-remote-theme
64 |
65 | remote_theme: fastai/nbdev-jekyll-theme
66 | baseurl: /rs3/
--------------------------------------------------------------------------------
/docs/_data/sidebars/home_sidebar.yml:
--------------------------------------------------------------------------------
1 |
2 | #################################################
3 | ### THIS FILE WAS AUTOGENERATED! DO NOT EDIT! ###
4 | #################################################
5 | # Instead edit ../../sidebar.json
6 | entries:
7 | - folders:
8 | - folderitems:
9 | - output: web,pdf
10 | title: Overview
11 | url: /
12 | - output: web,pdf
13 | title: seq
14 | url: seq.html
15 | - output: web,pdf
16 | title: targetdata
17 | url: targetdata.html
18 | - output: web,pdf
19 | title: targetfeat
20 | url: targetfeat.html
21 | - output: web,pdf
22 | title: predicttarg
23 | url: predicttarg.html
24 | - output: web,pdf
25 | title: predict
26 | url: predict.html
27 | output: web
28 | title: rs3
29 | output: web
30 | title: Sidebar
31 |
--------------------------------------------------------------------------------
/docs/_data/topnav.yml:
--------------------------------------------------------------------------------
1 | topnav:
2 | - title: Topnav
3 | items:
4 | - title: github
5 | external_url: https://github.com/gpp-rnd/rs3/tree/master/
6 |
7 | #Topnav dropdowns
8 | topnav_dropdowns:
9 | - title: Topnav dropdowns
10 | folders:
--------------------------------------------------------------------------------
/docs/feed.xml:
--------------------------------------------------------------------------------
1 | ---
2 | search: exclude
3 | layout: none
4 | ---
5 |
6 |
7 |
8 |
9 | {{ site.title | xml_escape }}
10 | {{ site.description | xml_escape }}
11 | {{ site.url }}/
12 |
13 | {{ site.time | date_to_rfc822 }}
14 | {{ site.time | date_to_rfc822 }}
15 | Jekyll v{{ jekyll.version }}
16 | {% for post in site.posts limit:10 %}
17 | -
18 |
{{ post.title | xml_escape }}
19 | {{ post.content | xml_escape }}
20 | {{ post.date | date_to_rfc822 }}
21 | {{ post.url | prepend: site.url }}
22 | {{ post.url | prepend: site.url }}
23 | {% for tag in post.tags %}
24 | {{ tag | xml_escape }}
25 | {% endfor %}
26 | {% for tag in page.tags %}
27 | {{ tag | xml_escape }}
28 | {% endfor %}
29 |
30 | {% endfor %}
31 |
32 |
33 |
--------------------------------------------------------------------------------
/docs/images/output_18_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_18_0.png
--------------------------------------------------------------------------------
/docs/images/output_42_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/docs/images/output_42_0.png
--------------------------------------------------------------------------------
/docs/predicttarg.html:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | title: predicttarg
4 |
5 |
6 | keywords: fastai
7 | sidebar: home_sidebar
8 |
9 | summary: "Rule set 3 target-site predictions"
10 | description: "Rule set 3 target-site predictions"
11 | nb_path: "03_predicttarg.ipynb"
12 | ---
13 |
22 |
23 |
24 |
25 | {% raw %}
26 |
27 |
28 |
29 |
30 | {% endraw %}
31 |
32 | {% raw %}
33 |
34 |
35 |
36 |
37 | {% endraw %}
38 |
39 | {% raw %}
40 |
41 |
58 | {% endraw %}
59 |
60 | {% raw %}
61 |
62 |
77 | {% endraw %}
78 |
79 | {% raw %}
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
load_target_model
(lite
=False
)
91 |
92 |
Load rule set 3 target model
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 | {% endraw %}
103 |
104 | {% raw %}
105 |
106 |
107 |
108 |
109 | {% endraw %}
110 |
111 | {% raw %}
112 |
113 |
114 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
132 | warnings.warn(
133 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
134 | warnings.warn(
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 | {% endraw %}
144 |
145 | {% raw %}
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
predict_target
(design_df
, aa_subseq_df
, domain_feature_df
=None
, conservation_feature_df
=None
, id_cols
=None
)
157 |
158 |
Make predictions using the Rule Set 3 target model. Note that if the domain_feature_df
159 | or conservation_feature_df are not supplied, then the lite model will be used, otherwise the full model is used.
160 |
:param design_df: DataFrame
161 | :param aa_subseq_df: DataFrame
162 | :param domain_feature_df: DataFrame
:param conservation_feature_df: DataFrame
163 | :param id_cols: list or str
164 | :return: list
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 | {% endraw %}
175 |
176 | {% raw %}
177 |
178 |
179 |
180 |
181 | {% endraw %}
182 |
183 | {% raw %}
184 |
185 |
200 | {% endraw %}
201 |
202 | {% raw %}
203 |
204 |
205 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
Getting amino acid sequences
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
100%|██████████| 4/4 [00:04<00:00, 1.04s/it]
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
256 |
257 |
258 |
259 |
260 | Target Transcript
261 | Target Total Length
262 | Transcript Base
263 | desc
264 | molecule
265 | seq
266 | id
267 | version
268 | AA len
269 | sgRNA Context Sequence
270 | AA Index
271 | Target Cut Length
272 | Orientation
273 | extended_seq
274 | AA 0-Indexed
275 | AA 0-Indexed padded
276 | seq_start
277 | seq_end
278 | AA Subsequence
279 |
280 |
281 |
282 |
283 | 0
284 | ENST00000259457.8
285 | 834
286 | ENST00000259457
287 | None
288 | protein
289 | MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...
290 | ENSP00000259457
291 | 3
292 | 277
293 | TGGAGCAGATACAAGAGCAACTGAAGGGAT
294 | 64
295 | 191
296 | sense
297 | -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...
298 | 63
299 | 80
300 | 64
301 | 96
302 | GVVYKDGIVLGADTRATEGMVVADKNCSKIHFI
303 |
304 |
305 | 1
306 | ENST00000259457.8
307 | 834
308 | ENST00000259457
309 | None
310 | protein
311 | MAAVSVYAPPVGGFSFDNCRRNAVLEADFAKRGYKLPKVRKTGTTI...
312 | ENSP00000259457
313 | 3
314 | 277
315 | CCGGAAAACTGGCACGACCATCGCTGGGGT
316 | 46
317 | 137
318 | sense
319 | -----------------MAAVSVYAPPVGGFSFDNCRRNAVLEADF...
320 | 45
321 | 62
322 | 46
323 | 78
324 | AKRGYKLPKVRKTGTTIAGVVYKDGIVLGADTR
325 |
326 |
327 | 2
328 | ENST00000394249.8
329 | 1863
330 | ENST00000394249
331 | None
332 | protein
333 | MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...
334 | ENSP00000377793
335 | 3
336 | 620
337 | TAGAAAAAGATTTGCGCACCCAAGTGGAAT
338 | 106
339 | 316
340 | sense
341 | -----------------MRRSEVLAEESIVCLQKALNHLREIWELI...
342 | 105
343 | 122
344 | 106
345 | 138
346 | EEGETTILQLEKDLRTQVELMRKQKKERKQELK
347 |
348 |
349 | 3
350 | ENST00000394249.8
351 | 1863
352 | ENST00000394249
353 | None
354 | protein
355 | MRRSEVLAEESIVCLQKALNHLREIWELIGIPEDQRLQRTEVVKKH...
356 | ENSP00000377793
357 | 3
358 | 620
359 | TGGCCTTTGACCCAGACATAATGGTGGCCA
360 | 263
361 | 787
362 | antisense
363 | -----------------MRRSEVLAEESIVCLQKALNHLREIWELI...
364 | 262
365 | 279
366 | 263
367 | 295
368 | WDRLQIPEEEREAVATIMSGSKAKVRKALQLEV
369 |
370 |
371 | 4
372 | ENST00000361337.3
373 | 2298
374 | ENST00000361337
375 | None
376 | protein
377 | MSGDHLHNDSQIEADFRLNDSHKHKDKHKDREHRHKEHKKEKDREK...
378 | ENSP00000354522
379 | 2
380 | 765
381 | AAATACTCACTCATCCTCATCTCGAGGTCT
382 | 140
383 | 420
384 | antisense
385 | -----------------MSGDHLHNDSQIEADFRLNDSHKHKDKHK...
386 | 139
387 | 156
388 | 140
389 | 172
390 | GYFVPPKEDIKPLKRPRDEDDADYKPKKIKTED
391 |
392 |
393 | ...
394 | ...
395 | ...
396 | ...
397 | ...
398 | ...
399 | ...
400 | ...
401 | ...
402 | ...
403 | ...
404 | ...
405 | ...
406 | ...
407 | ...
408 | ...
409 | ...
410 | ...
411 | ...
412 | ...
413 |
414 |
415 | 395
416 | ENST00000454402.7
417 | 1023
418 | ENST00000454402
419 | None
420 | protein
421 | METSALKQQEQPAATKIRNLPWVEKYRPQTLNDLISHQDILSTIQK...
422 | ENSP00000408295
423 | 2
424 | 340
425 | TGTCTTTATATAGCTGTTTCGCACAGGCTA
426 | 74
427 | 220
428 | antisense
429 | -----------------METSALKQQEQPAATKIRNLPWVEKYRPQ...
430 | 73
431 | 90
432 | 74
433 | 106
434 | LYGPPGTGKTSTILACAKQLYKDKEFGSMVLEL
435 |
436 |
437 | 396
438 | ENST00000254998.3
439 | 423
440 | ENST00000254998
441 | None
442 | protein
443 | MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...
444 | ENSP00000254998
445 | 2
446 | 140
447 | TTGTCAATGTCTACTACACCACCATGGATA
448 | 27
449 | 79
450 | sense
451 | -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...
452 | 26
453 | 43
454 | 27
455 | 59
456 | DQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTA
457 |
458 |
459 | 397
460 | ENST00000254998.3
461 | 423
462 | ENST00000254998
463 | None
464 | protein
465 | MASVDFKTYVDQACRAAEEFVNVYYTTMDKRRRLLSRLYMGTATLV...
466 | ENSP00000254998
467 | 2
468 | 140
469 | GGCGTTTGCTGTCCCGCCTGTACATGGGCA
470 | 39
471 | 115
472 | sense
473 | -----------------MASVDFKTYVDQACRAAEEFVNVYYTTMD...
474 | 38
475 | 55
476 | 39
477 | 71
478 | VYYTTMDKRRRLLSRLYMGTATLVWNGNAVSGQ
479 |
480 |
481 | 398
482 | ENST00000381685.10
483 | 2067
484 | ENST00000381685
485 | None
486 | protein
487 | MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...
488 | ENSP00000371101
489 | 5
490 | 688
491 | ACTAGCAATGGCTTATCAGATCGAAGGTCA
492 | 259
493 | 776
494 | antisense
495 | -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...
496 | 258
497 | 275
498 | 259
499 | 291
500 | TMAVGTTTGQVLLYDLRSDKPLLVKDHQYGLPI
501 |
502 |
503 | 399
504 | ENST00000381685.10
505 | 2067
506 | ENST00000381685
507 | None
508 | protein
509 | MQVSSLNEVKIYSLSCGKSLPEWLSDRKKRALQKKDVDVRRRIELI...
510 | ENSP00000371101
511 | 5
512 | 688
513 | AAATTTTGTCTGATGACTACTCAAAGGTAT
514 | 108
515 | 322
516 | sense
517 | -----------------MQVSSLNEVKIYSLSCGKSLPEWLSDRKK...
518 | 107
519 | 124
520 | 108
521 | 140
522 | CLDSEVVTFEILSDDYSKIVFLHNDRYIEFHSQ
523 |
524 |
525 |
526 |
400 rows × 19 columns
527 |
528 |
529 |
530 |
531 |
532 |
533 |
534 |
535 |
536 | {% endraw %}
537 |
538 | {% raw %}
539 |
540 |
541 |
553 |
554 |
555 |
556 |
557 |
558 |
559 |
560 |
Getting protein domains
561 |
562 |
563 |
564 |
565 |
566 |
567 |
568 |
100%|██████████| 200/200 [00:48<00:00, 4.12it/s]
569 |
570 |
571 |
572 |
573 |
574 |
575 |
576 |
577 | {% endraw %}
578 |
579 | {% raw %}
580 |
581 |
582 |
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
Getting conservation
605 |
606 |
607 |
608 |
609 |
610 |
611 |
612 |
100%|██████████| 200/200 [03:53<00:00, 1.17s/it]
613 |
614 |
615 |
616 |
617 |
618 |
619 |
620 |
621 |
622 |
635 |
636 |
637 |
638 |
639 | sgRNA Context Sequence
640 | Target Cut Length
641 | Target Transcript
642 | Orientation
643 | cons_4
644 | cons_32
645 |
646 |
647 |
648 |
649 | 0
650 | AAAAGAATGATGAAAAGACACCACAGGGAG
651 | 244
652 | ENST00000610426.5
653 | sense
654 | 0.218231
655 | 0.408844
656 |
657 |
658 | 1
659 | AAAAGAGCCATGAATCTAAACATCAGGAAT
660 | 640
661 | ENST00000223073.6
662 | sense
663 | 0.129825
664 | 0.278180
665 |
666 |
667 | 2
668 | AAAAGCGCCAAATGGCCCGAGAATTGGGAG
669 | 709
670 | ENST00000331923.9
671 | sense
672 | 0.470906
673 | 0.532305
674 |
675 |
676 | 3
677 | AAACAGAAAAAGTTAAAATCACCAAGGTGT
678 | 496
679 | ENST00000283882.4
680 | sense
681 | 0.580556
682 | 0.602708
683 |
684 |
685 | 4
686 | AAACAGATGGAAGATGCTTACCGGGGGACC
687 | 132
688 | ENST00000393047.8
689 | sense
690 | 0.283447
691 | 0.414293
692 |
693 |
694 | ...
695 | ...
696 | ...
697 | ...
698 | ...
699 | ...
700 | ...
701 |
702 |
703 | 395
704 | TTTGATTGCATTAAGGTTGGACTCTGGATT
705 | 246
706 | ENST00000249269.9
707 | sense
708 | 0.580612
709 | 0.618707
710 |
711 |
712 | 396
713 | TTTGCCCACAGCTCCAAAGCATCGCGGAGA
714 | 130
715 | ENST00000227618.8
716 | sense
717 | 0.323770
718 | 0.416368
719 |
720 |
721 | 397
722 | TTTTACAGTGCGATGTATGATGTATGGCTT
723 | 119
724 | ENST00000338366.6
725 | sense
726 | 0.788000
727 | 0.537417
728 |
729 |
730 | 398
731 | TTTTGGATCTCGTAGTGATTCAAGAGGGAA
732 | 233
733 | ENST00000629496.3
734 | sense
735 | 0.239630
736 | 0.347615
737 |
738 |
739 | 399
740 | TTTTTGTTACTACAGGTTCGCTGCTGGGAA
741 | 201
742 | ENST00000395840.6
743 | sense
744 | 0.693767
745 | 0.639044
746 |
747 |
748 |
749 |
400 rows × 6 columns
750 |
751 |
752 |
753 |
754 |
755 |
756 |
757 |
758 |
759 | {% endraw %}
760 |
761 | {% raw %}
762 |
763 |
764 |
778 |
779 |
780 |
781 |
782 |
783 |
784 |
785 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
786 | warnings.warn(
787 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
788 | warnings.warn(
789 |
790 |
791 |
792 |
793 |
794 |
795 |
796 |
797 | {% endraw %}
798 |
799 | {% raw %}
800 |
801 |
802 |
814 |
815 |
816 |
817 |
818 |
819 |
820 |
821 |
/Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator SimpleImputer from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
822 | warnings.warn(
823 | /Users/pdeweird/opt/anaconda3/envs/rs3/lib/python3.8/site-packages/sklearn/base.py:310: UserWarning: Trying to unpickle estimator Pipeline from version 1.0.dev0 when using version 0.24.2. This might lead to breaking code or invalid results. Use at your own risk.
824 | warnings.warn(
825 |
826 |
827 |
828 |
829 |
830 |
831 |
832 |
833 | {% endraw %}
834 |
835 | {% raw %}
836 |
837 |
838 |
848 |
849 |
850 |
851 |
852 |
853 |
854 |
855 |
856 |
857 |
0 TGGAGCAGATACAAGAGCAACTGAAGGGAT
858 | 1 CCGGAAAACTGGCACGACCATCGCTGGGGT
859 | 2 TAGAAAAAGATTTGCGCACCCAAGTGGAAT
860 | 3 TGGCCTTTGACCCAGACATAATGGTGGCCA
861 | 4 AAATACTCACTCATCCTCATCTCGAGGTCT
862 | ...
863 | 395 TGTCTTTATATAGCTGTTTCGCACAGGCTA
864 | 396 TTGTCAATGTCTACTACACCACCATGGATA
865 | 397 GGCGTTTGCTGTCCCGCCTGTACATGGGCA
866 | 398 ACTAGCAATGGCTTATCAGATCGAAGGTCA
867 | 399 AAATTTTGTCTGATGACTACTCAAAGGTAT
868 | Name: sgRNA Context Sequence, Length: 400, dtype: object
869 |
870 |
871 |
872 |
873 |
874 |
875 |
876 |
877 | {% endraw %}
878 |
879 | {% raw %}
880 |
881 |
894 | {% endraw %}
895 |
896 | {% raw %}
897 |
898 |
923 | {% endraw %}
924 |
925 | {% raw %}
926 |
927 |
952 | {% endraw %}
953 |
954 |
955 |
956 |
957 |
--------------------------------------------------------------------------------
/docs/sidebar.json:
--------------------------------------------------------------------------------
1 | {
2 | "rs3": {
3 | "Overview": "/",
4 | "seq": "seq.html",
5 | "targetdata": "targetdata.html",
6 | "targetfeat": "targetfeat.html",
7 | "predicttarg": "predicttarg.html",
8 | "predict": "predict.html"
9 | }
10 | }
--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
1 | ---
2 | layout: none
3 | search: exclude
4 | ---
5 |
6 |
7 |
8 | {% for post in site.posts %}
9 | {% unless post.search == "exclude" %}
10 |
11 | {{site.url}}{{post.url}}
12 |
13 | {% endunless %}
14 | {% endfor %}
15 |
16 |
17 | {% for page in site.pages %}
18 | {% unless page.search == "exclude" %}
19 |
20 | {{site.url}}{{ page.url}}
21 |
22 | {% endunless %}
23 | {% endfor %}
24 |
--------------------------------------------------------------------------------
/rs3/RuleSet3.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/RuleSet3.pkl
--------------------------------------------------------------------------------
/rs3/__init__.py:
--------------------------------------------------------------------------------
# Package version string, exposed as rs3.__version__
__version__ = "0.0.16"
2 |
--------------------------------------------------------------------------------
/rs3/_nbdev.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED BY NBDEV! DO NOT EDIT!
2 |
3 | __all__ = ["index", "modules", "custom_doc_links", "git_url"]
4 |
5 | index = {"load_seq_model": "00_seq.ipynb",
6 | "featurize_context": "00_seq.ipynb",
7 | "predict_seq": "00_seq.ipynb",
8 | "ensembl_post": "01_targetdata.ipynb",
9 | "chunks": "01_targetdata.ipynb",
10 | "post_transcript_sequence_chunk": "01_targetdata.ipynb",
11 | "post_transcript_sequence": "01_targetdata.ipynb",
12 | "build_transcript_aa_seq_df": "01_targetdata.ipynb",
13 | "ensembl_get": "01_targetdata.ipynb",
14 | "get_translation_overlap": "01_targetdata.ipynb",
15 | "build_translation_overlap_df": "01_targetdata.ipynb",
16 | "write_transcript_data": "01_targetdata.ipynb",
17 | "get_transcript_info": "01_targetdata.ipynb",
18 | "get_conservation": "01_targetdata.ipynb",
19 | "get_exon_conservation": "01_targetdata.ipynb",
20 | "get_transcript_conservation": "01_targetdata.ipynb",
21 | "get_transcript_conservation_safe": "01_targetdata.ipynb",
22 | "build_conservation_df": "01_targetdata.ipynb",
23 | "write_conservation_data": "01_targetdata.ipynb",
24 | "add_target_columns": "02_targetfeat.ipynb",
25 | "get_position_features": "02_targetfeat.ipynb",
26 | "get_one_aa_frac": "02_targetfeat.ipynb",
27 | "get_aa_aromaticity": "02_targetfeat.ipynb",
28 | "get_aa_hydrophobicity": "02_targetfeat.ipynb",
29 | "get_aa_ip": "02_targetfeat.ipynb",
30 | "get_aa_secondary_structure": "02_targetfeat.ipynb",
31 | "featurize_aa_seqs": "02_targetfeat.ipynb",
32 | "extract_amino_acid_subsequence": "02_targetfeat.ipynb",
33 | "get_aa_subseq_df": "02_targetfeat.ipynb",
34 | "get_amino_acid_features": "02_targetfeat.ipynb",
35 | "get_protein_domain_features": "02_targetfeat.ipynb",
36 | "get_conservation_ranges": "02_targetfeat.ipynb",
37 | "get_conservation_features": "02_targetfeat.ipynb",
38 | "merge_feature_dfs": "02_targetfeat.ipynb",
39 | "load_target_model": "03_predicttarg.ipynb",
40 | "predict_target": "03_predicttarg.ipynb",
41 | "predict_seq_tracr": "04_predict.ipynb",
42 | "combine_target_seq_scores": "04_predict.ipynb",
43 | "predict": "04_predict.ipynb"}
44 |
45 | modules = ["seq.py",
46 | "targetdata.py",
47 | "targetfeat.py",
48 | "predicttarg.py",
49 | "predict.py"]
50 |
51 | doc_url = "https://gpp-rnd.github.io/rs3/"
52 |
53 | git_url = "https://github.com/gpp-rnd/rs3/tree/master/"
54 |
55 | def custom_doc_links(name): return None
56 |
--------------------------------------------------------------------------------
/rs3/predict.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 04_predict.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['predict_seq_tracr', 'combine_target_seq_scores', 'predict']
4 |
5 | # Cell
6 | import pandas as pd
7 | import warnings
8 |
9 | from .seq import predict_seq
10 | from .targetdata import (build_translation_overlap_df,
11 | build_transcript_aa_seq_df,
12 | build_conservation_df)
13 | from .targetfeat import (add_target_columns,
14 | get_aa_subseq_df,
15 | get_protein_domain_features,
16 | get_conservation_features)
17 | from .predicttarg import predict_target
18 |
19 | # Cell
20 | from pandas.api.types import is_list_like
21 |
def predict_seq_tracr(design_df, tracr, context_col, ref_tracrs, n_jobs):
    """Add a sequence-score column for one tracrRNA to ``design_df`` in place

    :param design_df: DataFrame, gains a 'RS3 Sequence Score (<tracr> tracr)' column
    :param tracr: str, tracrRNA name; must be one of ref_tracrs
    :param context_col: str, column holding 30-nt context sequences
    :param ref_tracrs: list of str, recognized tracrRNA names
    :param n_jobs: int, parallelism passed through to predict_seq
    :raises ValueError: if tracr is not in ref_tracrs
    """
    if tracr not in ref_tracrs:
        raise ValueError('tracrRNA must be one of ' + ','.join(ref_tracrs))
    design_df['RS3 Sequence Score (' + tracr + ' tracr)'] = predict_seq(
        design_df[context_col], sequence_tracr=tracr, n_jobs=n_jobs)
27 |
def combine_target_seq_scores(design_df, tracr, target_score_col, lite):
    """Add the combined sequence + target score column to ``design_df`` in place

    :param design_df: DataFrame, must already hold the per-tracr sequence score column
    :param tracr: str, tracrRNA name used in the sequence score column label
    :param target_score_col: str, column with the target score to add
    :param lite: bool, appends 'Lite' to the combined column name
    """
    # NOTE(review): no space before 'Lite', so the column is named
    # '... + Target ScoreLite'; preserved because the name is a public contract
    suffix = 'Lite' if lite else ''
    combined_col = 'RS3 Sequence (' + tracr + ' tracr) + Target Score' + suffix
    seq_score_col = 'RS3 Sequence Score (' + tracr + ' tracr)'
    design_df[combined_col] = design_df[seq_score_col] + design_df[target_score_col]
35 |
def predict(design_df, tracr=None, target=False,
            aa_seq_file=None, domain_file=None,
            conservatin_file=None,
            id_cols=None,
            context_col='sgRNA Context Sequence',
            transcript_id_col='Target Transcript',
            transcript_base_col='Transcript Base',
            transcript_len_col='Target Total Length',
            n_jobs_min=1, n_jobs_max=1, lite=True):
    """Make predictions using RS3

    :param design_df: DataFrame of sgRNA designs
    :param tracr: str or list of str, tracrRNA(s) to score with; each must be 'Hsu2013' or 'Chen2013'
    :param target: bool, whether to include target scores
    :param aa_seq_file: str or None, path to precomputed amino acid sequences (parquet)
    :param domain_file: str or None, path to precomputed protein domain file (parquet)
    :param conservatin_file: str or None, path to precomputed conservation file (parquet).
        NOTE(review): name looks like a typo of "conservation_file", but it is part of
        the public interface, so renaming it would break existing callers
    :param id_cols: list or None, columns that uniquely identify an sgRNA design
    :param context_col: str, column with 30-nt context sequences
    :param transcript_id_col: str, column with the (possibly versioned) Ensembl transcript ID
    :param transcript_base_col: str, column with the unversioned transcript ID
    :param transcript_len_col: str, column with the transcript length
    :param n_jobs_min: int, parallelism for the Ensembl sequence/domain queries
    :param n_jobs_max: int, parallelism for sequence scoring and conservation queries
    :param lite: bool, use the lite target model (no domain/conservation features)
    :return: DataFrame, copy of design_df with added score columns
    :raises ValueError: if tracr is neither a string nor list-like
    """
    if id_cols is None:
        id_cols = ['sgRNA Context Sequence', 'Target Cut Length', 'Target Transcript', 'Orientation']
    out_df = design_df.copy()
    ref_tracrs = ['Hsu2013', 'Chen2013']
    # sequence scores: one column per requested tracrRNA
    if type(tracr) is str:
        predict_seq_tracr(out_df, tracr, context_col, ref_tracrs, n_jobs=n_jobs_max)
    elif is_list_like(tracr):
        for t in tracr:
            predict_seq_tracr(out_df, t, context_col, ref_tracrs, n_jobs=n_jobs_max)
    else:
        raise ValueError('Could not recognize tracr input: ' + str(tracr))
    if target:
        out_df = add_target_columns(out_df,
                                    transcript_base_col=transcript_base_col)
        transcript_bases = pd.Series(out_df[transcript_base_col].unique())
        # amino acid sequences: query Ensembl, or filter the precomputed parquet
        if aa_seq_file is None:
            aa_seq_df = build_transcript_aa_seq_df(out_df,
                                                   transcript_id_col=transcript_id_col,
                                                   transcript_len_col=transcript_len_col,
                                                   n_jobs=n_jobs_min)
        else:
            aa_seq_df = pd.read_parquet(aa_seq_file, engine='pyarrow',
                                        filters=[[(transcript_base_col, 'in', transcript_bases)]])
        missing_transcripts_aa = transcript_bases[~transcript_bases.isin(aa_seq_df[transcript_base_col])]
        if len(missing_transcripts_aa) > 0:
            warnings.warn('Missing amino acid sequences for transcripts: ' +
                          ','.join(missing_transcripts_aa))
        out_df['Missing translation information'] = out_df[transcript_base_col].isin(missing_transcripts_aa)
        aa_subseq_df = get_aa_subseq_df(sg_designs=out_df, aa_seq_df=aa_seq_df, width=16,
                                        id_cols=id_cols)
        if lite:
            target_score_col = 'Target Score Lite'
            out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df,
                                                      id_cols=id_cols)
        else:
            # the full model additionally needs protein domain and conservation features
            if domain_file is None:
                domain_df = build_translation_overlap_df(aa_seq_df['id'].unique(), n_jobs=n_jobs_min)
            else:
                domain_df = pd.read_parquet(domain_file, engine='pyarrow',
                                            filters=[[(transcript_base_col, 'in', transcript_bases)]])
            # No warning for domain, since some transcripts aren't annotated with any domains
            domain_feature_df = get_protein_domain_features(out_df, domain_df,
                                                            id_cols=id_cols, transcript_base_col=transcript_base_col)
            if conservatin_file is None:
                conservation_df = build_conservation_df(out_df, n_jobs=n_jobs_max)
            else:
                conservation_df = pd.read_parquet(conservatin_file, engine='pyarrow',
                                                  filters=[[(transcript_base_col, 'in', transcript_bases)]])
            missing_transcripts_cons = transcript_bases[~transcript_bases.isin(conservation_df[transcript_base_col])]
            if len(missing_transcripts_cons) > 0:
                warnings.warn('Missing conservation scores for transcripts: ' +
                              ','.join(missing_transcripts_cons))
            out_df['Missing conservation information'] = out_df[transcript_base_col].isin(missing_transcripts_cons)
            conservation_feature_df = get_conservation_features(out_df, conservation_df,
                                                                small_width=2, large_width=16,
                                                                conservation_column='ranked_conservation',
                                                                id_cols=id_cols)
            target_score_col = 'Target Score'
            out_df[target_score_col] = predict_target(design_df=out_df, aa_subseq_df=aa_subseq_df,
                                                      domain_feature_df=domain_feature_df,
                                                      conservation_feature_df=conservation_feature_df,
                                                      id_cols=id_cols)
        # combined score column(s): sequence score + target score
        if type(tracr) is str:
            combine_target_seq_scores(out_df, tracr, target_score_col, lite)
        else: # list
            for t in tracr:
                combine_target_seq_scores(out_df, t, target_score_col, lite)
    return out_df
--------------------------------------------------------------------------------
/rs3/predicttarg.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 03_predicttarg.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['load_target_model', 'predict_target']
4 |
5 | # Cell
6 | from rs3 import targetfeat
7 | import joblib
8 | import os
9 |
10 | # Cell
def load_target_model(lite=False):
    """Load the pickled Rule Set 3 target model shipped with the package

    :param lite: bool, load the lite model instead of the full model
    :return: unpickled model object
    """
    model_name = 'target_lite_model.pkl' if lite else 'target_model.pkl'
    model_path = os.path.join(os.path.dirname(__file__), model_name)
    return joblib.load(model_path)
19 |
20 | # Cell
def predict_target(design_df, aa_subseq_df, domain_feature_df=None,
                   conservation_feature_df=None, id_cols=None):
    """Make predictions using the Rule Set 3 target model.

    If either domain_feature_df or conservation_feature_df is missing, the lite
    model is used and both feature frames are ignored; otherwise the full model
    is used.

    :param design_df: DataFrame
    :param aa_subseq_df: DataFrame
    :param domain_feature_df: DataFrame or None
    :param conservation_feature_df: DataFrame or None
    :param id_cols: list or None, columns that uniquely identify an sgRNA design
    :return: predictions from the model
    """
    lite = (domain_feature_df is None) or (conservation_feature_df is None)
    if lite:
        # the lite model uses neither feature set, so drop any partial input
        domain_feature_df = None
        conservation_feature_df = None
    model = load_target_model(lite=lite)
    if id_cols is None:
        id_cols = ['sgRNA Context Sequence', 'Target Cut Length',
                   'Target Transcript', 'Orientation']
    feature_df, feature_cols = targetfeat.merge_feature_dfs(
        design_df,
        aa_subseq_df=aa_subseq_df,
        domain_df=domain_feature_df,
        conservation_df=conservation_feature_df,
        id_cols=id_cols)
    return model.predict(feature_df[feature_cols])
--------------------------------------------------------------------------------
/rs3/seq.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 00_seq.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['load_seq_model', 'featurize_context', 'predict_seq']
4 |
5 | # Cell
6 | import joblib
7 | import sglearn
8 | import pandas as pd
9 | import os
10 |
11 | # Cell
def load_seq_model():
    """Load the pickled Rule Set 3 sequence model shipped with the package

    :return: unpickled model object
    """
    model_path = os.path.join(os.path.dirname(__file__), 'RuleSet3.pkl')
    return joblib.load(model_path)
16 |
17 | # Cell
def featurize_context(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None,
                      n_jobs=1):
    """Featurize context sequences

    :param context_sequences: list-like of str, 30-nt context sequences
    :param sequence_tracr: str or list-like, tracrRNA label(s) for the guides
    :param ref_tracrs: list of str or None, recognized tracrRNAs
        (defaults to Hsu2013 and Chen2013)
    :param n_jobs: int, parallelism for featurization
    :return: DataFrame, feature matrix
    :raises ValueError: if any context sequence is not 30 nucleotides long
    """
    if ref_tracrs is None:
        ref_tracrs = ['Hsu2013', 'Chen2013']
    lengths = pd.Series(context_sequences).str.len()
    if (lengths != 30).any():
        raise ValueError('All context sequences must be 30 nucleotides')
    feature_matrix = sglearn.featurize_guides(context_sequences, n_jobs=n_jobs)
    # add a 0/1 indicator column per reference tracr
    for ref in ref_tracrs:
        if type(sequence_tracr) is str:
            feature_matrix[ref + ' tracr'] = int(sequence_tracr == ref)
        else:  # list-like: one tracr label per guide
            feature_matrix[ref + ' tracr'] = ((pd.Series(sequence_tracr) == ref)
                                              .astype(int)
                                              .to_list())
    return feature_matrix
41 |
42 | # Cell
def predict_seq(context_sequences, sequence_tracr='Hsu2013', ref_tracrs=None, n_jobs=1):
    """Predict SpCas9 knockout activity of context sequences using sequence information only

    :param context_sequences: list of str, 30-nt context sequences
    :param sequence_tracr: str or list-like, tracrRNA label(s)
    :param ref_tracrs: list of str or None, recognized tracrRNAs
    :param n_jobs: int, parallelism for featurization
    :return: predictions, one per context sequence
    """
    model = load_seq_model()
    print('Calculating sequence-based features')
    features = featurize_context(context_sequences,
                                 sequence_tracr=sequence_tracr,
                                 ref_tracrs=ref_tracrs,
                                 n_jobs=n_jobs)
    return model.predict(features)
--------------------------------------------------------------------------------
/rs3/target_lite_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_lite_model.pkl
--------------------------------------------------------------------------------
/rs3/target_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/rs3/target_model.pkl
--------------------------------------------------------------------------------
/rs3/targetdata.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 01_targetdata.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['ensembl_post', 'chunks', 'post_transcript_sequence_chunk', 'post_transcript_sequence',
4 | 'build_transcript_aa_seq_df', 'ensembl_get', 'get_translation_overlap', 'build_translation_overlap_df',
5 | 'write_transcript_data', 'get_transcript_info', 'get_conservation', 'get_exon_conservation',
6 | 'get_transcript_conservation', 'get_transcript_conservation_safe', 'build_conservation_df',
7 | 'write_conservation_data']
8 |
9 | # Cell
10 | import requests
11 | import json
12 | import pandas as pd
13 | from joblib import Parallel, delayed
14 | from tqdm import tqdm
15 | import warnings
16 | import os
17 | from scipy import stats
18 | import multiprocessing
19 |
20 | # Cell
def ensembl_post(ext, data, headers=None, params=None):
    """Generic wrapper for using POST requests to the ensembl rest API

    :param ext: str, url extension
    :param data: dict, query data (serialized to JSON before sending)
    :param headers: dict or None, meta-information for query
    :param params: dict or None, parameters for query
    :return: Response object
    :raises HTTPError: if the request is unsuccessful
    """
    headers = {} if headers is None else headers
    params = {} if params is None else params
    payload = json.dumps(data)
    r = requests.post("https://rest.ensembl.org" + ext, headers=headers,
                      data=payload, params=params)
    if not r.ok:
        r.raise_for_status()
    return r
40 |
41 | # Cell
def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    lst: list
    n: int, chunk size

    returns: generator of list slices, each of length n (last may be shorter)
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n
52 |
def post_transcript_sequence_chunk(ids, params, headers):
    """Helper for post_transcript_sequence: fetch sequences for one chunk of IDs

    :param ids: list of str, ensembl identifiers
    :param params: dict, query parameters
    :param headers: dict, request headers
    :return: decoded JSON response
    """
    response = ensembl_post("/sequence/id/", data={'ids': ids},
                            params=params, headers=headers)
    return response.json()
66 |
def post_transcript_sequence(ensembl_ids, seq_type='protein', max_queries=50,
                             n_jobs=1, **kwargs):
    """Request multiple types of sequence by stable identifier. Supports feature masking and expand options.
    Uses https://rest.ensembl.org/documentation/info/sequence_id_post

    :param ensembl_ids: list of str
    :param seq_type: str, one of [genomic, cds, cdna, protein]
    :param max_queries: int, maximum number of ids per POST request
    :param n_jobs: int, number of jobs to run in parallel
    :param kwargs: additional parameter arguments
    :return: list, dict of sequences 5' to 3' in the same orientation as the input transcript
    """
    headers = {"content-type": "application/json", "accept": "application/json"}
    params = {'type': seq_type, **kwargs}
    id_chunks = list(chunks(ensembl_ids, max_queries))
    chunked_seqs = Parallel(n_jobs=n_jobs)(
        delayed(post_transcript_sequence_chunk)(ids, params, headers)
        for ids in tqdm(id_chunks))
    # flatten the per-chunk lists into one list of sequence records
    return [seq for chunk in chunked_seqs for seq in chunk]
87 |
88 | # Cell
def build_transcript_aa_seq_df(design_df, transcript_id_col='Target Transcript',
                               transcript_len_col='Target Total Length', n_jobs=1):
    """Get amino acid sequence for transcripts of interest

    :param design_df: DataFrame, sgRNA designs with transcript ID and length columns
    :param transcript_id_col: str, column with ensembl transcript id (may carry a version suffix)
    :param transcript_len_col: str, column with length of transcript in nucleotides
    :param n_jobs: int, number of jobs to use to query transcripts
    :return: DataFrame, one row per transcript whose reported nucleotide length
        matches its translated amino acid sequence length
    """
    unique_transcripts = design_df[[transcript_id_col, transcript_len_col]].drop_duplicates()
    # strip the version suffix (e.g. ENST00000361337.3 -> ENST00000361337)
    unique_transcripts['Transcript Base'] = unique_transcripts[transcript_id_col].str.split('.', expand=True)[0]
    print("Getting amino acid sequences")
    aa_seqs = post_transcript_sequence(unique_transcripts['Transcript Base'].to_list(),
                                       n_jobs=n_jobs)
    aa_seq_df = (pd.DataFrame(aa_seqs)
                 .rename({'query': 'Transcript Base'}, axis=1))
    # warn about transcripts for which Ensembl returned no translation
    missing_seqs = (unique_transcripts['Transcript Base'][~unique_transcripts['Transcript Base'].isin(
        aa_seq_df['Transcript Base']
    )])
    if len(missing_seqs) > 0:
        warnings.warn('Unable to find translations for the following transcripts: ' + ', '.join(missing_seqs))
    aa_seq_len_df = (unique_transcripts.merge(aa_seq_df, on='Transcript Base'))
    aa_seq_len_df['AA len'] = aa_seq_len_df['seq'].str.len()
    # keep transcripts where nucleotide length equals (AA length + 1) * 3;
    # the +1 accounts for the stop codon, which contributes no amino acid
    filtered_aa_seq_len_df = (aa_seq_len_df[aa_seq_len_df[transcript_len_col] ==
                                            (aa_seq_len_df['AA len'] + 1)*3 ]
                              .reset_index(drop=True))
    filtered_seqs = (aa_seq_len_df['Transcript Base'][~aa_seq_len_df['Transcript Base'].isin(
        filtered_aa_seq_len_df['Transcript Base']
    )])
    if len(filtered_seqs) > 0:
        warnings.warn('Filtered transcripts where the transcript length and amino acid ' +
                      'sequence length did not agree: ' + ', '.join(filtered_seqs))
    return filtered_aa_seq_len_df
123 |
124 | # Cell
def ensembl_get(ext, query=None, headers=None, params=None):
    """Generic wrapper for using GET requests to the ensembl rest API

    :param ext: str, url extension
    :param query: str or None, end of url extension specifying species, taxon, ensembl_id etc.
    :param headers: dict or None, meta-information for query
    :param params: dict or None, parameters for query
    :return: Response object
    :raises HTTPError: if the request is unsuccessful
    """
    query = '' if query is None else query
    params = {} if params is None else params
    headers = {} if headers is None else headers
    r = requests.get("https://rest.ensembl.org" + ext + query,
                     params=params, headers=headers)
    if not r.ok:
        r.raise_for_status()
    return r
146 |
def get_translation_overlap(ensembl_id):
    """Get features that overlap with translation, such as protein domains

    :param ensembl_id: str, Ensembl protein identifier
    :return: decoded JSON response (list of feature dicts)
    """
    response = ensembl_get('/overlap/translation/' + ensembl_id,
                           headers={'content-type': 'application/json'})
    return response.json()
158 |
159 | # Cell
def build_translation_overlap_df(protein_ids, n_jobs=1):
    """Get protein domain information for a set of proteins

    :param protein_ids: list of str, ensemble protein IDs
    :param n_jobs: int, number of parallel jobs
    :return: DataFrame, one row per overlapping feature,
        with the 'Parent' column renamed to 'Transcript Base'
    """
    print('Getting protein domains')
    overlaps = Parallel(n_jobs=n_jobs)(delayed(get_translation_overlap)(id)
                                       for id in tqdm(protein_ids))
    # flatten the per-protein feature lists into one list
    flattened = [feature for features in overlaps for feature in features]
    return (pd.DataFrame(flattened)
            .rename({'Parent': 'Transcript Base'}, axis=1))
174 |
175 | # Cell
def write_transcript_data(design_df, transcript_id_col='Target Transcript',
                          transcript_len_col='Target Total Length', n_jobs=1,
                          overwrite=True, filepath='./data/target_data/',
                          aa_seq_name='aa_seqs.pq',
                          protein_domain_name='protein_domains.pq'):
    """Write amino acid sequences and protein domain information to parquet files

    :param design_df: DataFrame
    :param transcript_id_col: str
    :param transcript_len_col: str
    :param n_jobs: int
    :param overwrite: bool, whether to overwrite existing files
    :param filepath: str, directory for output files
    :param aa_seq_name: str, name of amino acid sequence file
    :param protein_domain_name: str, name of protein domain file
    :raises ValueError: if either output file exists and overwrite is False
    """
    # os.path.join (instead of string concatenation) keeps the paths valid
    # whether or not filepath ends with a separator
    aa_seq_path = os.path.join(filepath, aa_seq_name)
    protein_domain_path = os.path.join(filepath, protein_domain_name)
    if (os.path.isfile(aa_seq_path) or os.path.isfile(protein_domain_path)) and (not overwrite):
        raise ValueError('Transcript data already exists and cannot be overwritten')
    transcript_aa_seq_df = build_transcript_aa_seq_df(design_df, transcript_id_col=transcript_id_col,
                                                      transcript_len_col=transcript_len_col,
                                                      n_jobs=n_jobs)
    translation_overlap_df = build_translation_overlap_df(transcript_aa_seq_df['id'],
                                                          n_jobs=n_jobs)
    if not os.path.isdir(filepath):
        print('Creating new directory ' + filepath)
        os.makedirs(filepath)
    transcript_aa_seq_df.to_parquet(path=aa_seq_path, engine='pyarrow',
                                    index=False)
    translation_overlap_df.to_parquet(path=protein_domain_path, engine='pyarrow',
                                      index=False)
207 |
208 | # Cell
def get_transcript_info(base_transcript):
    """Look up exon structure and translation information for an Ensembl transcript

    :param base_transcript: str, Ensembl transcript ID (without version suffix)
    :return: (exon_df, trans_sr, chr)
        exon_df: DataFrame, with global exon start and end position
        trans_sr: Series, with global translation start and stop positions for CDS and translation length
        chr: str, seq region (chromosome) name
    """
    # the expand flag is passed once via params; previously it was also
    # hard-coded into the URL ("?expand=1"), sending the parameter twice
    r = ensembl_get("/lookup/id/" + base_transcript,
                    headers={"Content-Type": "application/json"}, params={'expand': '1'})
    decoded = r.json()
    exon_df = pd.DataFrame(decoded['Exon'])
    trans_sr = pd.Series(decoded['Translation'])
    chr = decoded['seq_region_name']
    return exon_df, trans_sr, chr
226 |
227 | # Cell
def get_conservation(chr, start, end, genome):
    """Get conservation scores for a given region of a genome

    :param chr: str, chromosome number (without the 'chr' prefix)
    :param start: int
    :param end: int
    :param genome: str, 'hg38' or 'mm39'
    :return: DataFrame, one row per position with a 'conservation' column
    :raises ValueError: if the genome is unrecognized or the API request fails
    """
    api_url = 'http://api.genome.ucsc.edu/getData/track'
    # phyloP conservation track names differ per genome build
    if genome == 'hg38':
        track = 'phyloP100way'
    elif genome == 'mm39':
        track = 'phyloP35way'
    else:
        raise ValueError('Genome not recognized')
    chrom = 'chr' + chr
    params = {
        'genome': genome,
        'track': track,
        'start': start,
        'end': end,
        'chrom': chrom
    }
    # NOTE(review): the parameters are sent as a request body (`data=`) on a GET;
    # `params=` (query string) is the conventional form — confirm the UCSC API
    # accepts both before changing, since this call demonstrably works as-is
    results = requests.get(api_url, data=params)
    if results.ok:
        value_df = (pd.DataFrame([pd.Series(x) for x in pd.read_json(results.content.decode('utf8'))[chrom].values])
                    .rename(columns={'value': 'conservation'}))
    else:
        raise ValueError(results.reason)
    return value_df
259 |
260 | # Cell
def get_exon_conservation(exon_df, chr, genome):
    """Get conservation scores for each exon

    :param exon_df: DataFrame, must contain 'id', 'start' and 'end' columns
    :param chr: str, chromosome name (without the 'chr' prefix)
    :param genome: str, e.g. 'hg38' or 'mm39'
    :return: DataFrame, per-nucleotide conservation with 'exon_id' and
        'genomic position' columns
    """
    conservation_dict = {}
    for i, row in exon_df.set_index('id').iterrows():
        # subtract one since the nucleotide conservation corresponds to the "end" index
        conservation_dict[i] = get_conservation(chr, row['start'] - 1, row['end'], genome)
    # get the conservation of i
    # concat keys become level_0 of the index, i.e. the exon id of each row
    conservation_df = (pd.concat(conservation_dict)
                       .reset_index(level=0)
                       .reset_index(drop=True)
                       .rename({'level_0': 'exon_id',
                                'end': 'genomic position'}, axis=1)
                       .drop('start', axis=1))
    return conservation_df
281 |
282 |
def get_transcript_conservation(transcript_id, target_strand, genome):
    """Get per-nucleotide conservation scores for the translated part of a transcript

    :param transcript_id: str, Ensembl transcript ID (without version suffix)
    :param target_strand: str, '+' or '-'
    :param genome: str, e.g. 'hg38' or 'mm39'
    :return: DataFrame, conservation scores ordered 5' to 3' along the target
    """
    exon_df, trans_sr, chr = get_transcript_info(transcript_id)
    # clip exon boundaries so only translated positions are included
    exon_df['start'] = exon_df['start'].apply(lambda pos: max(pos, trans_sr['start']))
    exon_df['end'] = exon_df['end'].apply(lambda pos: min(pos, trans_sr['end']))
    exon_df = exon_df[exon_df['end'] > exon_df['start']].reset_index(drop=True)
    conservation_df = get_exon_conservation(exon_df, chr, genome)
    conservation_df['Transcript Base'] = transcript_id
    # on the minus strand the target runs opposite to genomic coordinates
    ascending = target_strand != '-'
    conservation_df = (conservation_df
                       .sort_values('genomic position', ascending=ascending)
                       .reset_index(drop=True))
    conservation_df['target position'] = conservation_df.index + 1
    conservation_df['chromosome'] = chr
    conservation_df['genome'] = genome
    conservation_df['translation length'] = trans_sr['length']
    return conservation_df
310 |
311 | # Cell
def get_transcript_conservation_safe(transcript_id, target_strand, genome):
    """Helper function for parallel query.

    :param transcript_id: str, Ensembl transcript base ID
    :param target_strand: str, '+' or '-'
    :param genome: str, genome build, e.g. 'hg38' or 'mm39'
    :return: DataFrame of conservation scores, or None when the conservation
        dataframe cannot be assembled
    """
    try:
        return get_transcript_conservation(transcript_id, target_strand, genome)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit,
        # making the parallel run impossible to interrupt; only suppress
        # ordinary errors.
        return None
318 |
319 |
def build_conservation_df(design_df, n_jobs=1):
    """Assemble per-nucleotide conservation scores for all transcripts in a design file.

    :param design_df: DataFrame, with columns ['Target Transcript',
        'Strand of Target', 'Target Total Length']
    :param n_jobs: int, number of parallel workers for the per-transcript queries
    :return: DataFrame, tidy conservation scores with a within-transcript
        percentile column 'ranked_conservation'
    :raises ValueError: if any transcript ID is not a human (ENST) or mouse
        (ENSMUST) Ensembl ID
    """
    transcript_refseq_df = (design_df[['Target Transcript', 'Strand of Target', 'Target Total Length']]
                            .drop_duplicates())
    if not (transcript_refseq_df['Target Transcript'].str.startswith('ENST') |
            transcript_refseq_df['Target Transcript'].str.startswith('ENSMUST')).all():
        raise ValueError('Must supply human or mouse Ensembl transcript IDs as input')
    print('Getting conservation')
    # Strip the version suffix, e.g. ENST00000123456.7 -> ENST00000123456
    transcript_refseq_df['Transcript Base'] = (transcript_refseq_df['Target Transcript'].str.split('.', expand=True)[0])
    # Mouse IDs (ENSMUST...) map to mm39; everything else here is human hg38
    transcript_refseq_df['genome'] = transcript_refseq_df['Transcript Base'].apply(lambda trans:
                                                                                   'mm39' if 'MUS' in trans else 'hg38')
    # One parallel query per transcript; failures come back as None
    all_transcript_conservation_list = Parallel(n_jobs)(delayed(get_transcript_conservation_safe)
                                                        (row['Transcript Base'],
                                                         row['Strand of Target'],
                                                         row['genome'])
                                                        for _, row in tqdm(transcript_refseq_df.iterrows(),
                                                                           total=transcript_refseq_df.shape[0]))
    # Split results into successes and the transcripts whose query failed
    transcript_conservation_list = []
    failed_list = []
    transcript_list = transcript_refseq_df['Transcript Base'].to_list()
    for i, conservation_df in enumerate(all_transcript_conservation_list):
        if conservation_df is None:
            failed_list.append(transcript_list[i])
        else:
            transcript_conservation_list.append(conservation_df)
    if len(failed_list) > 0:
        # NOTE(review): message lacks a separator between the count and the ID list
        warnings.warn('Failed to get conservation scores for ' + str(len(failed_list)) +
                      ' transcripts' + ', '.join(failed_list))
    transcript_conservation_df = (pd.concat(transcript_conservation_list))
    transcript_cons_designs = (transcript_conservation_df
                               .merge(transcript_refseq_df, how='inner',
                                      on=['Transcript Base', 'genome']))
    # Sanity check: (aa length + 1) codons * 3 nt must equal the designed target
    # length (the +1 presumably accounts for the stop codon — TODO confirm)
    filtered_transcript_conservation = transcript_cons_designs[
        (transcript_cons_designs['translation length'] + 1)*3 == transcript_cons_designs['Target Total Length']].copy()
    mismatched_transcripts = transcript_conservation_df['Transcript Base'][
        ~transcript_conservation_df['Transcript Base'].isin(filtered_transcript_conservation['Transcript Base'])]
    if len(mismatched_transcripts) > 0:
        warnings.warn('Filtered: ' + str(len(mismatched_transcripts)) +
                      ' transcripts with mismatched length:' + ','.join(mismatched_transcripts))
    # Percentile rank of each nucleotide's conservation within its transcript
    filtered_transcript_conservation['ranked_conservation'] = (filtered_transcript_conservation.groupby('Transcript Base')
                                                               ['conservation']
                                                               .rank(pct=True))
    return filtered_transcript_conservation
362 |
363 | # Cell
def write_conservation_data(design_df, n_jobs=1,
                            overwrite=True, filepath='./data/target_data/',
                            cons_file_name='conservation.pq'):
    """Write conservation scores to parquet files

    :param design_df: DataFrame
    :param n_jobs: int, number of parallel workers for the conservation queries
    :param overwrite: bool, whether to overwrite existing file
    :param filepath: str, directory for output sequences; the file name is
        appended by string concatenation, so it should end with a path separator
    :param cons_file_name: str, name of conservation file
    :raises ValueError: if the output file exists and overwrite is False
    """
    out_file = filepath + cons_file_name
    if os.path.isfile(out_file) and (not overwrite):
        # fixed typo in the error message ('exits' -> 'exists')
        raise ValueError('Conservation data already exists and cannot be overwritten')
    else:
        conservation_df = build_conservation_df(design_df, n_jobs=n_jobs)
        if not os.path.isdir(filepath):
            print('Creating new directory ' + filepath)
            os.makedirs(filepath)
        conservation_df.to_parquet(path=out_file, engine='pyarrow',
                                   index=False)
--------------------------------------------------------------------------------
/rs3/targetfeat.py:
--------------------------------------------------------------------------------
1 | # AUTOGENERATED! DO NOT EDIT! File to edit: 02_targetfeat.ipynb (unless otherwise specified).
2 |
3 | __all__ = ['add_target_columns', 'get_position_features', 'get_one_aa_frac', 'get_aa_aromaticity',
4 | 'get_aa_hydrophobicity', 'get_aa_ip', 'get_aa_secondary_structure', 'featurize_aa_seqs',
5 | 'extract_amino_acid_subsequence', 'get_aa_subseq_df', 'get_amino_acid_features',
6 | 'get_protein_domain_features', 'get_conservation_ranges', 'get_conservation_features', 'merge_feature_dfs']
7 |
8 | # Cell
9 | import pandas as pd
10 | from Bio.SeqUtils.ProtParam import ProteinAnalysis
11 | import warnings
12 |
13 | # Cell
def add_target_columns(design_df, transcript_id_col='Target Transcript',
                       cut_pos_col='Target Cut Length',
                       transcript_base_col='Transcript Base'):
    """Add ['AA Index' and 'Transcript Base'] to design df

    :param design_df: DataFrame
    :param transcript_id_col: str, column holding versioned transcript IDs
    :param cut_pos_col: str, column holding the nucleotide cut position
    :param transcript_base_col: str, name for the new unversioned-ID column
    :return: DataFrame, copy of the input with the two columns added
    """
    annotated = design_df.copy()
    # 1-indexed amino acid containing the cut nucleotide
    annotated['AA Index'] = (annotated[cut_pos_col] - 1) // 3 + 1
    # Drop the version suffix, e.g. ENST00000123456.7 -> ENST00000123456
    base_ids = annotated[transcript_id_col].str.split('.', expand=True)[0]
    annotated[transcript_base_col] = base_ids
    return annotated
26 |
27 | # Cell
def get_position_features(sg_df, id_cols):
    """Get features ['Target Cut %', 'sense']

    :param sg_df: DataFrame, with 'Target Cut %' and 'Orientation' columns
    :param id_cols: list, columns that identify each sgRNA design
    :return: DataFrame, id columns plus the two position features
    """
    selected_cols = id_cols + ['Target Cut %']
    position_df = sg_df[selected_cols].copy()
    # True when the guide targets the sense strand
    position_df['sense'] = sg_df['Orientation'].eq('sense')
    return position_df
38 |
39 | # Cell
def get_one_aa_frac(feature_dict, aa_sequence, aas):
    """Get fraction of single aa

    :param feature_dict: dict, feature dictionary (mutated in place)
    :param aa_sequence: str, amino acid sequence
    :param aas: list, list of amino acids
    """
    seq_len = len(aa_sequence)
    for amino_acid in aas:
        feature_dict[amino_acid] = aa_sequence.count(amino_acid) / seq_len
50 |
51 | # Cell
def get_aa_aromaticity(feature_dict, analyzed_seq):
    """Get fraction of aromatic amino acids in a sequence.

    Phe (F) + Trp (W) + Tyr (Y)

    :param feature_dict: dict, feature dictionary (mutated in place)
    :param analyzed_seq: ProteinAnalysis object
    """
    aromatic_fraction = analyzed_seq.aromaticity()
    feature_dict['Aromaticity'] = aromatic_fraction
61 |
62 |
def get_aa_hydrophobicity(feature_dict, analyzed_seq):
    """Grand Average of Hydropathy

    The GRAVY value is calculated by adding the hydropathy value for each residue and dividing
    by the length of the sequence (Kyte and Doolittle; 1982). The larger the number, the more hydrophobic the
    amino acid

    :param feature_dict: dict, feature dictionary (mutated in place)
    :param analyzed_seq: ProteinAnalysis object
    """
    gravy_score = analyzed_seq.gravy()
    feature_dict['Hydrophobicity'] = gravy_score
74 |
75 |
def get_aa_ip(feature_dict, analyzed_seq):
    """Get the Isoelectric Point of an amino acid sequence

    Charge of amino acid

    :param feature_dict: dict, feature dictionary (mutated in place)
    :param analyzed_seq: ProteinAnalysis object
    """
    isoelectric_point = analyzed_seq.isoelectric_point()
    feature_dict['Isoelectric Point'] = isoelectric_point
85 |
86 |
def get_aa_secondary_structure(feature_dict, analyzed_seq):
    """Get the fraction of amino acids that tend to be in a helix, turn or sheet

    :param feature_dict: dict, feature dictionary (mutated in place)
    :param analyzed_seq: ProteinAnalysis object
    """
    helix, turn, sheet = analyzed_seq.secondary_structure_fraction()
    feature_dict['Helix'] = helix
    feature_dict['Turn'] = turn
    feature_dict['Sheet'] = sheet
94 |
95 |
96 | # Cell
def featurize_aa_seqs(aa_sequences, features=None):
    """Get feature DataFrame for a list of amino acid sequences

    :param aa_sequences: Series of str, amino acid sequences (may contain '*' and '-')
    :param features: list or None, feature groups to compute; defaults to all
    :return: DataFrame, one row per sequence, indexed by the original sequences
    """
    if features is None:
        features = ['Pos. Ind. 1mer', 'Hydrophobicity', 'Aromaticity',
                    'Isoelectric Point', 'Secondary Structure']
    aas = ['A', 'C', 'D', 'E', 'F',
           'G', 'H', 'I', 'K', 'L',
           'M', 'N', 'P', 'Q', 'R',
           'S', 'T', 'V', 'W', 'Y', '*']
    # Strip stop ('*') and padding ('-') characters before ProteinAnalysis,
    # presumably because it expects canonical residues only — TODO confirm.
    # (raw string avoids the invalid-escape warning the old '\*|-' literal raised)
    clean_aa_seqs = aa_sequences.str.replace(r'\*|-', '', regex=True)
    feature_dict_list = []
    # enumerate index was unused; iterate pairs directly
    for aa_sequence, clean_sequence in zip(aa_sequences, clean_aa_seqs):
        analyzed_seq = ProteinAnalysis(clean_sequence)
        feature_dict = {}
        if 'Pos. Ind. 1mer' in features:
            # fractions are computed on the raw sequence so '*' stays countable
            get_one_aa_frac(feature_dict, aa_sequence, aas)
        if 'Hydrophobicity' in features:
            get_aa_hydrophobicity(feature_dict, analyzed_seq)
        if 'Aromaticity' in features:
            get_aa_aromaticity(feature_dict, analyzed_seq)
        if 'Isoelectric Point' in features:
            get_aa_ip(feature_dict, analyzed_seq)
        if 'Secondary Structure' in features:
            get_aa_secondary_structure(feature_dict, analyzed_seq)
        feature_dict_list.append(feature_dict)
    feature_matrix = pd.DataFrame(feature_dict_list)
    feature_matrix.index = aa_sequences
    return feature_matrix
130 |
131 | # Cell
def extract_amino_acid_subsequence(sg_aas, width):
    """ Get the amino acid subsequence with a width of `width` on either side of the Amino Acid index

    :param sg_aas: DataFrame, sgRNA designs merged with amino acid sequence;
        requires 'seq' and 'AA Index' columns
    :param width: int, number of residues kept on each side of the cut position
    :return: DataFrame, copy of the input with an 'AA Subsequence' column
    """
    # Pad both ends so slicing near the boundaries cannot run out of range:
    # the cut can land just before the CDS (left) or on the stop codon (right)
    left_pad = '-' * (width + 1)
    right_pad = '-' * width
    out = sg_aas.copy()
    # append the stop codon before right-padding
    out['extended_seq'] = left_pad + out['seq'] + '*' + right_pad
    out['AA 0-Indexed'] = out['AA Index'] - 1
    out['AA 0-Indexed padded'] = out['AA 0-Indexed'] + len(left_pad)
    out['seq_start'] = (out['AA 0-Indexed padded'] - width).astype(int)
    out['seq_end'] = (out['AA 0-Indexed padded'] + width).astype(int)
    out['AA Subsequence'] = out.apply(
        lambda row: row['extended_seq'][row['seq_start']:row['seq_end'] + 1],
        axis=1)
    return out
152 |
153 |
154 | # Cell
def get_aa_subseq_df(sg_designs, aa_seq_df, width, id_cols,
                     transcript_base_col='Transcript Base',
                     target_transcript_col='Target Transcript',
                     aa_index_col='AA Index'):
    """Get the amino acid subsequences for a design dataframe

    :param sg_designs: DataFrame
    :param aa_seq_df: DataFrame, Transcript Base and (AA) seq
    :param width: int, length on each side of the cut site
    :param id_cols: list, columns identifying each design row
    :param transcript_base_col: str
    :param target_transcript_col: str
    :param aa_index_col: str
    :return: DataFrame
    """
    # deduplicate the column list while keeping required join/index columns
    design_cols = list(set(id_cols + [target_transcript_col, transcript_base_col,
                                      aa_index_col]))
    sg_aas = aa_seq_df.merge(sg_designs[design_cols],
                             how='inner',
                             on=[target_transcript_col, transcript_base_col])
    return extract_amino_acid_subsequence(sg_aas, width)
175 |
176 | # Cell
def get_amino_acid_features(aa_subseq_df, features, id_cols):
    """Featurize amino acid sequences

    :param aa_subseq_df: DataFrame, with an 'AA Subsequence' column
    :param features: list, feature groups to compute
    :param id_cols: list, identifier columns carried through to the output
    :return: DataFrame, identifiers plus amino acid features
    """
    # Keep only subsequences made of the 20 canonical residues plus '*' and '-'
    canonical = set('ARNDCQEGHILKMFPSTWYV*-')
    is_canonical = aa_subseq_df['AA Subsequence'].apply(lambda s: set(s) <= canonical)
    filtered_sg_aas = aa_subseq_df[is_canonical].reset_index(drop=True)
    n_removed = aa_subseq_df.shape[0] - filtered_sg_aas.shape[0]
    if n_removed > 0:
        warnings.warn('Ignored ' + str(n_removed) + ' amino acid sequences with non-canonical amino acids')
    aa_features = featurize_aa_seqs(filtered_sg_aas['AA Subsequence'], features=features)
    annotations = (filtered_sg_aas[id_cols + ['AA Subsequence']]
                   .reset_index(drop=True))
    return pd.concat([annotations, aa_features.reset_index(drop=True)], axis=1)
199 |
200 |
201 | # Cell
def get_protein_domain_features(sg_design_df, protein_domains, id_cols,
                                sources=None,
                                transcript_base_col='Transcript Base',
                                aa_index_col='AA Index',
                                domain_type_col='type',
                                domain_start_col='start',
                                domain_end_col='end'):
    """Get binary dataframe of protein domains

    :param sg_design_df: DataFrame, with columns [transcript_base_col, aa_index_col]
    :param protein_domains: DataFrame, with columns [transcript_base_col, domain_type_col]
    :param id_cols: list
    :param sources: list. list of database types to include
    :param transcript_base_col: str
    :param aa_index_col: str
    :param domain_type_col: str
    :param domain_start_col: str
    :param domain_end_col: str
    :return: DataFrame, with binary features for protein domains
    """
    if sources is None:
        sources = ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',
                   'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',
                   'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']  # exclude sifts
    protein_domains = protein_domains[protein_domains[domain_type_col].isin(sources)]
    clean_designs = sg_design_df[list(set(id_cols + [transcript_base_col, aa_index_col]))].copy()
    designs_domains = clean_designs.merge(protein_domains,
                                          how='inner', on=transcript_base_col)
    # Note - not every sgRNA will be present in the feature df
    filtered_domains = (designs_domains[designs_domains[aa_index_col].between(designs_domains[domain_start_col],
                                                                              designs_domains[domain_end_col])]
                        .copy())
    filtered_domains = filtered_domains[id_cols + [domain_type_col]].drop_duplicates()
    filtered_domains['present'] = 1
    # BUGFIX: pivot on the configurable domain_type_col instead of the
    # hard-coded 'type', so a non-default domain_type_col no longer raises
    domain_feature_df = (filtered_domains.pivot_table(values='present',
                                                      index=id_cols,
                                                      columns=domain_type_col, fill_value=0)
                         .reset_index())
    # Ensure all domain columns are present for testing
    full_column_df = pd.DataFrame(columns=id_cols + sources, dtype=int)  # empty
    domain_feature_df = pd.concat([full_column_df, domain_feature_df]).fillna(0)
    domain_feature_df[sources] = domain_feature_df[sources].astype(int)
    return domain_feature_df
245 |
246 | # Cell
def get_conservation_ranges(cut_pos, small_width, large_width):
    """Get the small and large windows of target positions around a cut site

    :param cut_pos: int, cut position (1-indexed target position)
    :param small_width: int, half-width of the small window
    :param large_width: int, half-width of the large window
    :return: (range, range), small and large position windows
    """
    small_range = range(cut_pos - small_width + 1, cut_pos + small_width + 1)
    large_range = range(cut_pos - large_width + 1, cut_pos + large_width + 1)
    return small_range, large_range


def _windowed_conservation(sg_designs_width, conservation_df, conservation_column,
                           position_col, width, id_cols):
    """Average conservation over the positions in `position_col` for each design row"""
    return (sg_designs_width.drop([c for c in ['target position small', 'target position large']
                                   if c != position_col], axis=1)
            .rename({position_col: 'target position'}, axis=1)
            # one row per position in the window
            .explode('target position')
            .merge(conservation_df, how='inner',
                   on=['Target Transcript', 'Transcript Base', 'target position'])
            .groupby(id_cols)
            .agg(cons=(conservation_column, 'mean'))
            .rename({'cons': 'cons_' + str(width * 2)}, axis=1)
            .reset_index())


def get_conservation_features(sg_designs, conservation_df, conservation_column,
                              small_width, large_width, id_cols):
    """Get conservation features

    :param sg_designs: DataFrame, must include id_cols, 'Transcript Base'
        and 'Target Cut Length'
    :param conservation_df: DataFrame, tidy conservation scores indexed by Transcript Base and target position
    :param conservation_column: str, name of column to calculate scores with
    :param small_width: int, small window length to average scores in one direction
    :param large_width: int, large window length to average scores in the one direction
    :param id_cols: list, columns that uniquely identify each design row
    :return: DataFrame of conservation features
    """
    sg_designs_width = sg_designs[id_cols + ['Transcript Base']].copy()
    sg_designs_width['target position small'], sg_designs_width['target position large'] = \
        zip(*sg_designs_width['Target Cut Length']
            .apply(get_conservation_ranges, small_width=small_width,
                   large_width=large_width))
    # identical aggregation for both window sizes, factored into one helper
    small_width_conservation = _windowed_conservation(
        sg_designs_width, conservation_df, conservation_column,
        'target position small', small_width, id_cols)
    large_width_conservation = _windowed_conservation(
        sg_designs_width, conservation_df, conservation_column,
        'target position large', large_width, id_cols)
    cons_feature_df = small_width_conservation.merge(large_width_conservation,
                                                     how='outer', on=id_cols)
    return cons_feature_df
290 |
291 | # Cell
def merge_feature_dfs(design_df,
                      aa_subseq_df, aa_features=None,
                      domain_df=None,
                      conservation_df=None,
                      id_cols=None):
    """Combine position, domain, conservation and amino acid features

    :param design_df: DataFrame, one row per sgRNA design
    :param aa_subseq_df: DataFrame, amino acid subsequences for featurization
    :param aa_features: list or None, amino acid feature groups
    :param domain_df: DataFrame or None, protein domain features
    :param conservation_df: DataFrame or None, conservation features
    :param id_cols: list or None, columns that uniquely identify design rows
    :return: (DataFrame, list), merged feature matrix and feature column names
    :raises ValueError: if id_cols do not uniquely identify design rows
    """
    if id_cols is None:
        id_cols = ['sgRNA Context Sequence', 'Target Cut Length',
                   'Target Transcript', 'Orientation']
    if aa_features is None:
        aa_features = ['Pos. Ind. 1mer',
                       'Hydrophobicity', 'Aromaticity',
                       'Isoelectric Point', 'Secondary Structure']
    if design_df[id_cols].drop_duplicates().shape[0] != design_df.shape[0]:
        raise ValueError('id_cols must uniquely identify rows of the design dataframe')
    feature_frames = []
    feature_list = []
    feature_frames.append(get_position_features(design_df, id_cols=id_cols))
    feature_list += ['Target Cut %', 'sense']
    if domain_df is not None:
        feature_frames.append(domain_df)
        feature_list += ['Pfam', 'PANTHER', 'HAMAP', 'SuperFamily', 'TIGRfam', 'ncoils', 'Gene3D',
                         'Prosite_patterns', 'Seg', 'SignalP', 'TMHMM', 'MobiDBLite',
                         'PIRSF', 'PRINTS', 'Smart', 'Prosite_profiles']
    if conservation_df is not None:
        feature_frames.append(conservation_df)
        # hardcoded
        feature_list += ['cons_4', 'cons_32']
    feature_frames.append(get_amino_acid_features(aa_subseq_df, aa_features, id_cols))
    feature_list += ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                     'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', '*',
                     'Hydrophobicity', 'Aromaticity', 'Isoelectric Point', 'Helix', 'Turn',
                     'Sheet']
    feature_df = design_df[id_cols]
    # left-join every feature frame onto the design identifiers, in order
    for frame in feature_frames:
        feature_df = pd.merge(feature_df, frame, how='left', on=id_cols)
    return feature_df, feature_list
330 |
--------------------------------------------------------------------------------
/settings.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | host = github
3 | lib_name = rs3
4 | user = gpp-rnd
5 | description = Predict the activity of CRISPR sgRNAs
6 | keywords = rs3, CRISPR, sgrna
7 | author = Peter Deweirdt
8 | author_email = petedeweirdt@gmail.com
9 | copyright = Genetic Perturbation Platform, Broad Institute
10 | branch = master
11 | version = 0.0.16
12 | min_python = 3.7
13 | audience = Developers
14 | language = English
15 | custom_sidebar = False
16 | license = apache2
17 | status = 2
18 | requirements = joblib>=1.0.1 pandas>=1.0.0 lightgbm>=3.0.0,<=3.3.5 sglearn>=1.2.5 tqdm>=4.61.2 pyarrow>=4.0.1 biopython>=1.78 scikit-learn>=0.24.2 requests>=2.25.1
19 | dev_requirements = gpplot>=0.5.0 seaborn>=0.11.0 scipy>=1.0.1 jupyterlab>=3.0.0 nbdev>=1.1.14,<2.0.0 matplotlib>=3.3.4 tabulate>=0.8.9 jupyter-client<=6.1.12
20 | nbs_path = .
21 | doc_path = docs
22 | recursive = False
23 | doc_host = https://gpp-rnd.github.io
24 | doc_baseurl = /rs3/
25 | git_url = https://github.com/gpp-rnd/rs3/tree/master/
26 | lib_path = rs3
27 | title = rs3
28 |
29 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from pkg_resources import parse_version
from configparser import ConfigParser
import setuptools
import re
import sys

# fail fast on very old setuptools
assert parse_version(setuptools.__version__) >= parse_version('36.2')

# note: all settings are in settings.ini; edit there, not here
config = ConfigParser(delimiters=['='])
config.read('settings.ini')
cfg = config['DEFAULT']

# cfg_keys are forwarded verbatim to setuptools.setup(); the rest of
# 'expected' are only validated for presence
cfg_keys = 'version description keywords author author_email'.split()
expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
for o in expected:
    assert o in cfg, "missing expected setting: {}".format(o)
setup_cfg = {o: cfg[o] for o in cfg_keys}

# `python setup.py version` prints the version and exits
if len(sys.argv) > 1 and sys.argv[1] == 'version':
    print(setup_cfg['version'])
    exit()

# map the short license key from settings.ini to (license name, trove classifier)
licenses = {
    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
    'mit': ('MIT License', 'OSI Approved :: MIT License'),
    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
}
# indexed by the integer 'status' setting below
statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
py_versions = '2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8'.split()

# unknown license keys fall back to (raw name, no classifier)
lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
min_python = cfg['min_python']

requirements = ['pip', 'packaging']
if cfg.get('requirements'):
    requirements += cfg.get('requirements', '').split()
if cfg.get('pip_requirements'):
    requirements += cfg.get('pip_requirements', '').split()
dev_requirements = (cfg.get('dev_requirements') or '').split()

# rewrite relative image links in README.md to absolute raw.githubusercontent URLs
long_description = open('README.md').read()
# 
for ext in ['png', 'svg']:
    long_description = re.sub(r'!\['+ext+'\]\((.*)\)', '+'/'+cfg['branch']+'/\\1)', long_description)
    long_description = re.sub(r'src=\"(.*)\.'+ext+'\"', 'src=\"https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1.'+ext+'\"', long_description)

setuptools.setup(
    name = cfg['lib_name'],
    license = lic[0],
    classifiers = [
        'Development Status :: ' + statuses[int(cfg['status'])],
        'Intended Audience :: ' + cfg['audience'].title(),
        'Natural Language :: ' + cfg['language'].title(),
    ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
    url = cfg['git_url'],
    packages = setuptools.find_packages(),
    include_package_data = True,
    install_requires = requirements,
    extras_require={ 'dev': dev_requirements },
    python_requires = '>=' + cfg['min_python'],
    long_description = long_description,
    long_description_content_type = 'text/markdown',
    zip_safe = False,
    entry_points = { 'console_scripts': cfg.get('console_scripts','').split() },
**setup_cfg)
70 |
71 |
--------------------------------------------------------------------------------
/target_lite_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_lite_model.pkl
--------------------------------------------------------------------------------
/target_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/target_model.pkl
--------------------------------------------------------------------------------
/test_data/codon_map.csv:
--------------------------------------------------------------------------------
1 | Codon,Amino Acid,Property
2 | TTT,F,Nonpolar
3 | TTC,F,Nonpolar
4 | TTA,L,Nonpolar
5 | TTG,L,Nonpolar
6 | CTT,L,Nonpolar
7 | CTC,L,Nonpolar
8 | CTA,L,Nonpolar
9 | CTG,L,Nonpolar
10 | ATT,I,Nonpolar
11 | ATC,I,Nonpolar
12 | ATA,I,Nonpolar
13 | ATG,M,Nonpolar
14 | GTT,V,Nonpolar
15 | GTC,V,Nonpolar
16 | GTA,V,Nonpolar
17 | GTG,V,Nonpolar
18 | TCT,S,Polar
19 | TCC,S,Polar
20 | TCA,S,Polar
21 | TCG,S,Polar
22 | CCT,P,Nonpolar
23 | CCC,P,Nonpolar
24 | CCA,P,Nonpolar
25 | CCG,P,Nonpolar
26 | ACT,T,Polar
27 | ACC,T,Polar
28 | ACA,T,Polar
29 | ACG,T,Polar
30 | GCT,A,Nonpolar
31 | GCC,A,Nonpolar
32 | GCA,A,Nonpolar
33 | GCG,A,Nonpolar
34 | TAT,Y,Polar
35 | TAC,Y,Polar
36 | TAA,*,Stop
37 | TAG,*,Stop
38 | CAT,H,Basic
39 | CAC,H,Basic
40 | CAA,Q,Polar
41 | CAG,Q,Polar
42 | AAT,N,Polar
43 | AAC,N,Polar
44 | AAA,K,Basic
45 | AAG,K,Basic
46 | GAT,D,Acidic
47 | GAC,D,Acidic
48 | GAA,E,Acidic
49 | GAG,E,Acidic
50 | TGT,C,Polar
51 | TGC,C,Polar
52 | TGA,*,Stop
53 | TGG,W,Nonpolar
54 | CGT,R,Basic
55 | CGC,R,Basic
56 | CGA,R,Basic
57 | CGG,R,Basic
58 | AGT,S,Polar
59 | AGC,S,Polar
60 | AGA,R,Basic
61 | AGG,R,Basic
62 | GGT,G,Nonpolar
63 | GGC,G,Nonpolar
64 | GGA,G,Nonpolar
65 | GGG,G,Nonpolar
--------------------------------------------------------------------------------
/test_data/target_data/aa_seqs.pq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/aa_seqs.pq
--------------------------------------------------------------------------------
/test_data/target_data/conservation.pq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/conservation.pq
--------------------------------------------------------------------------------
/test_data/target_data/protein_domains.pq:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gpp-rnd/rs3/9d6f5489bcbe1c78db833b94be5e714a4fd0440f/test_data/target_data/protein_domains.pq
--------------------------------------------------------------------------------