├── .github
└── workflows
│ └── build.yml
├── .gitignore
├── LICENSE.txt
├── Makefile
├── README.md
├── _config.yml
├── _static
├── ARG.png
├── tables_uml.svg
├── tskit_logo.svg
├── tute-constant-migration-svg-out.png
├── tute-divergence-1.png
├── tute-divergence-2.png
├── tute-divergence-svg-out.png
├── tute-mass-migration.png
├── tute-parameter-changes.png
└── tute-population-structure-1.png
├── _toc.yml
├── advanced_msprime.md
├── analysing_tree_sequences.md
├── analysing_trees.md
├── args.md
├── bottlenecks.md
├── build.sh
├── completing_forward_sims.md
├── counting_topologies.md
├── data
├── afs.trees
├── basics.trees
├── benchmarks_without_copy_longer_genome.txt
├── computing_statistics.trees
├── construction_example.trees
├── different_time_samples.trees
├── download.py
├── metadata.trees
├── parsimony_map.pickle
├── parsimony_map.trees
├── parsimony_simple.trees
├── storing_everyone.csv
├── tables_example.trees
├── tables_example_muts.trees
├── topologies_sim_speciestree.trees
├── topologies_sim_stdpopsim.trees
├── tree_traversals.trees
├── unified_genealogy_2q_108Mb-110Mb.tsz
├── viz_ts_full.trees
├── viz_ts_selection.trees
├── viz_ts_small.trees
├── viz_ts_small_mutated.trees
├── viz_ts_tiny.trees
├── whatis_example.trees
└── whatis_example.yml
├── demography.md
├── forward_sims.md
├── getting_started.md
├── incremental_algorithms.md
├── intro.md
├── introgression.md
├── metadata.md
├── more_forward_sims.md
├── no_mutations.md
├── old-content
├── README.md
├── docs
│ ├── README.md
│ ├── _config.yml
│ ├── _layouts
│ │ └── default.html
│ ├── bottlenecks.md
│ ├── bottlenecks_files
│ │ ├── bottlenecks_12_1.svg
│ │ ├── bottlenecks_16_0.svg
│ │ ├── bottlenecks_4_0.svg
│ │ └── bottlenecks_7_0.svg
│ ├── introgression.md
│ ├── introgression_files
│ │ ├── introgression_15_0.svg
│ │ └── introgression_9_0.svg
│ ├── msprime_out.md
│ ├── msprime_out_files
│ │ ├── msprime_out_15_0.svg
│ │ ├── msprime_out_19_0.svg
│ │ ├── msprime_out_28_0.svg
│ │ ├── msprime_out_32_0.svg
│ │ ├── msprime_out_39_0.png
│ │ ├── msprime_out_40_0.png
│ │ └── msprime_out_43_0.png
│ ├── wfcython.md
│ ├── wfcython_files
│ │ ├── wfcython_17_0.svg
│ │ ├── wfcython_17_1.svg
│ │ ├── wfcython_17_2.svg
│ │ └── wfcython_17_3.svg
│ ├── wfforward.md
│ └── wfforward_files
│ │ ├── wfforward_11_0.svg
│ │ ├── wfforward_18_0.svg
│ │ ├── wfforward_20_0.svg
│ │ ├── wfforward_53_0.svg
│ │ ├── wfforward_55_0.svg
│ │ ├── wfforward_59_0.svg
│ │ └── wfforward_65_0.svg
└── notebooks
│ ├── Makefile.wfcython
│ ├── Makefile.wfforward
│ ├── README.md
│ ├── execute.py
│ ├── wfcython.ipynb
│ └── wfforward.ipynb
├── parallelization.md
├── phylogen.md
├── popgen.md
├── references.bib
├── requirements-CI.txt
├── requirements.txt
├── simplification.md
├── simulation_overview.md
├── tables_and_editing.md
├── terminology_and_concepts.md
├── tskitr.md
├── tutorial_development.md
├── viz.md
└── what_is.md
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | # Based on https://github.com/executablebooks/github-action-demo
2 | name: build
3 |
4 | on:
5 | pull_request:
6 | push:
7 | branches: [main]
8 | tags:
9 | - '*'
10 | # This job installs dependencies, builds the book, and pushes it to `gh-pages`
11 | jobs:
12 | build:
13 | runs-on: ubuntu-24.04
14 | steps:
15 | - name: Cancel Previous Runs
16 | uses: styfle/cancel-workflow-action@0.12.1
17 | with:
18 | access_token: ${{ github.token }}
19 |
20 | - name: Checkout
21 | uses: actions/checkout@v4.2.2
22 |
23 | # Install dependencies
24 | - name: Install Graphviz
25 | run: |
26 | sudo apt-get install graphviz {lib,}graphviz-dev
27 |
28 | - name: Set up Python 3.11
29 | uses: actions/setup-python@v5.4.0
30 | with:
31 | python-version: "3.11"
32 | cache: "pip"
33 |
34 | - name: Install python dependencies
35 | run: |
36 | pip install --upgrade pip wheel
37 | pip install -r requirements-CI.txt
38 |
39 | - name: Install R support
40 | run: |
41 | # We need to remove R to pull in a version that's compatible with CRAN, weirdly.
42 | sudo apt-get update
43 | sudo apt-get remove r-base-core
44 | sudo apt-get install r-cran-reticulate r-cran-pbdzmq r-cran-uuid r-cran-ape
45 | sudo R -e 'install.packages("IRkernel")'
46 | R -e 'IRkernel::installspec()'
47 |
48 | # Build the book
49 | - name: Build the book
50 | run: ./build.sh
51 |
52 | - name: Copy files for users to run tutorials
53 | run: |
54 | cp ./requirements.txt ./_build/html/.
55 | cp -r data ./_build/html/examples
56 | ls data > ./_build/html/examples/files.txt
57 |
58 | # Push the book's HTML to github-pages
59 | - name: GitHub Pages action
60 | if: github.ref == 'refs/heads/main'
61 | uses: peaceiris/actions-gh-pages@v4.0.0
62 | with:
63 | github_token: ${{ secrets.GITHUB_TOKEN }}
64 | publish_dir: ./_build/html
65 |
66 | - name: Trigger docs site rebuild
67 | if: github.ref == 'refs/heads/main'
68 | run: |
69 | curl -X POST https://api.github.com/repos/tskit-dev/tskit-site/dispatches \
70 | -H 'Accept: application/vnd.github.everest-preview+json' \
71 | -u AdminBot-tskit:${{ secrets.ADMINBOT_TOKEN }} \
72 | --data '{"event_type":"build-docs"}'
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | _build
3 | .DS_Store
4 | # ignore files created by the user during a tut
5 | data/my_tree_sequence.trees
6 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | # Simple makefile for dev.
3 |
4 | all:
5 | # Use the local build wrapper to automate writing the report log to stdout.
6 | ./build.sh
7 |
8 | clean:
9 | rm -fR _build
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tutorials
2 |
3 | A set of tutorials for the tskit ecosystem built using
4 | [Jupyter Book](https://jupyterbook.org/), served up at
5 | [https://tskit.dev/tutorials/](https://tskit.dev/tutorials/).
6 |
7 | Merges to this repo will trigger a rebuild of the
8 | [tskit.dev web site](https://tskit.dev/) via an
9 | [action](https://github.com/tskit-dev/tskit-site/actions) on the
10 | [tskit-site repository](https://github.com/tskit-dev/tskit-site/):
11 | look there for any deployment issues.
12 |
13 | **Under construction**
14 |
15 | These are quick notes for developers while the real developers page is
16 | under construction.
17 |
18 | # Requirements
19 |
20 | Install the Python requirements from requirements.txt:
21 | ```
22 | $ python -m pip install -r requirements.txt
23 | ```
24 |
25 | You will also need a working R installation with reticulate and irkernel installed.
26 | This command should do the trick:
27 | ```
28 | $ R -e 'install.packages(c("reticulate", "IRkernel")); IRkernel::installspec()'
29 | ```
30 |
31 | # Building tutorials
32 |
33 | - To add a new tutorial, create a Markdown file and add its name to ``_toc.yml``.
34 | - If you are basing the tutorial on an existing notebook, use
35 | [jupytext](https://github.com/mwouts/jupytext) to convert the notebook into
36 | the right format.
37 | - To build locally, run ``make``. The output tells you where to find the
38 | built HTML.
39 | - Pages rendered at https://tskit.dev/tutorials
40 | - Pages might take a while to be updated after a new tutorial is merged.
41 |
42 | If you have an idea for a tutorial, please open an issue to discuss.
43 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | # Book settings
2 | # Learn more at https://jupyterbook.org/customize/config.html
3 |
4 | title: Tree Sequence Tutorials
5 | author: Tskit Developers
6 | logo: _static/tskit_logo.svg
7 |
8 | # Execute notebooks at build time and cache the outputs, so only changed notebooks are re-run.
9 | # See https://jupyterbook.org/content/execute.html
10 | execute:
11 | execute_notebooks: cache
12 | timeout: 300
13 |
14 | # Information about where the book exists on the web
15 | repository:
16 | url: https://github.com/tskit-dev/tutorials
17 | branch: main
18 |
19 | launch_buttons:
20 | binderhub_url: ""
21 |
22 | # Add GitHub buttons to your book
23 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository
24 | html:
25 | use_issues_button: true
26 | use_repository_button: true
27 |
28 | only_build_toc_files: true
29 |
30 | sphinx:
31 | extra_extensions:
32 | - sphinx.ext.todo
33 | - sphinx.ext.intersphinx
34 | - IPython.sphinxext.ipython_console_highlighting
35 | config:
36 | html_theme: sphinx_book_theme
37 | html_theme_options:
38 | pygments_dark_style: monokai
39 | todo_include_todos: true
40 | intersphinx_mapping:
41 | py: ["https://docs.python.org/3", null]
42 | tskit: ["https://tskit.dev/tskit/docs/stable", null]
43 | tszip: ["https://tskit.dev/tszip/docs/latest/", null]
44 | msprime: ["https://tskit.dev/msprime/docs/stable", null]
45 | pyslim: ["https://tskit.dev/pyslim/docs/stable", null]
46 | numpy: ["https://numpy.org/doc/stable/", null]
47 | ipython: ["https://ipython.readthedocs.io/en/stable/", null]
48 | myst_enable_extensions:
49 | - colon_fence
50 | - deflist
51 | - dollarmath
52 |
--------------------------------------------------------------------------------
/_static/ARG.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/ARG.png
--------------------------------------------------------------------------------
/_static/tskit_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/_static/tute-constant-migration-svg-out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-constant-migration-svg-out.png
--------------------------------------------------------------------------------
/_static/tute-divergence-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-1.png
--------------------------------------------------------------------------------
/_static/tute-divergence-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-2.png
--------------------------------------------------------------------------------
/_static/tute-divergence-svg-out.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-svg-out.png
--------------------------------------------------------------------------------
/_static/tute-mass-migration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-mass-migration.png
--------------------------------------------------------------------------------
/_static/tute-parameter-changes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-parameter-changes.png
--------------------------------------------------------------------------------
/_static/tute-population-structure-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-population-structure-1.png
--------------------------------------------------------------------------------
/_toc.yml:
--------------------------------------------------------------------------------
1 | format: jb-book
2 | root: intro
3 | parts:
4 | - caption:
5 | chapters:
6 | - file: what_is
7 | - caption: Tree sequence basics
8 | chapters:
9 | - file: terminology_and_concepts
10 | - file: getting_started
11 | - caption: Statistics and Analysis
12 | chapters:
13 | - file: analysing_tree_sequences
14 | - file: analysing_trees
15 | - file: incremental_algorithms
16 | - file: counting_topologies
17 | - file: parallelization
18 | - caption: Further tskit tutorials
19 | chapters:
20 | - file: tables_and_editing
21 | - file: simplification
22 | - file: viz
23 | - file: metadata
24 | - file: args
25 | - caption: Simulation
26 | chapters:
27 | - file: simulation_overview
28 | - file: no_mutations
29 | - file: advanced_msprime
30 | sections:
31 | - file: demography
32 | - file: bottlenecks
33 | - file: introgression
34 | - file: completing_forward_sims
35 | - file: forward_sims
36 | - file: more_forward_sims
37 | - caption: Other languages
38 | # TODO: add basic C and maybe Rust tutes
39 | chapters:
40 | - file: tskitr
41 | - caption: Development
42 | # TODO This would be a great place to put the general tskit development
43 | # guidelines.
44 | chapters:
45 | - file: tutorial_development
46 | - caption: Tskit for ...
47 | chapters:
48 | - file: popgen.md
49 | - file: phylogen.md
50 |
--------------------------------------------------------------------------------
/advanced_msprime.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | (sec_msprime)=
15 |
16 | # Advanced {program}`msprime` topics
17 |
18 | These are advanced topics in [msprime](https://tskit.dev/msprime) or examples of how to
19 | do some particular things with it. This chapter is broken down into the following
20 | sections:
21 |
22 | ```{tableofcontents}
23 | ```
24 |
25 | For discussion of other advanced msprime topics, you might also want to look at
26 | the [msprime discussion forums](https://github.com/tskit-dev/msprime/discussions).
27 |
--------------------------------------------------------------------------------
/analysing_tree_sequences.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | ```{code-cell} ipython3
18 | :tags: [remove-cell]
19 | import msprime
20 | import numpy as np
21 | import tskit
22 |
def computing_statistics():
    """Simulate the tree sequence used in the "Computing statistics" section.

    Runs a seeded msprime simulation of 10**4 samples over a sequence of
    length 10**7 and writes the result to ``data/computing_statistics.trees``.
    """
    sim_settings = dict(
        Ne=10**4,
        recombination_rate=1e-8,
        mutation_rate=1e-8,
        length=10**7,
        random_seed=42,
    )
    simulated = msprime.simulate(10**4, **sim_settings)
    simulated.dump("data/computing_statistics.trees")
27 |
def afs():
    """Simulate the six-sample tree sequence used in the AFS section.

    The mutation times are overwritten with ``tskit.UNKNOWN_TIME`` before the
    tree sequence is written to ``data/afs.trees``.
    """
    simulated = msprime.simulate(6, mutation_rate=1, random_seed=47)
    # Blank out the mutation times so that the plot is nicer.
    editable = simulated.dump_tables()
    unknown_times = np.full_like(editable.mutations.time, tskit.UNKNOWN_TIME)
    editable.mutations.time = unknown_times
    editable.tree_sequence().dump("data/afs.trees")
35 |
36 |
def create_notebook_data():
    """Regenerate, in order, each data file that this notebook creates."""
    for build_file in (computing_statistics, afs):
        build_file()
40 |
41 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
42 | ```
43 |
44 | (sec_analysing_tree_sequences)=
45 |
46 | # _Analysing tree sequences_
47 | % remove underscores in title when tutorial is complete or near-complete
48 |
49 | :::{note}
50 | This tutorial is a work in progress.
51 | :::
52 |
53 |
54 | (sec_tutorial_stats)=
55 |
56 | ## Computing statistics
57 |
58 | Tskit provides an extensive and flexible interface for computing population
59 | genetic statistics, which is documented in detail in the
60 | {ref}`general statistics <tskit:sec_stats>` section of the official documentation.
61 | This tutorial aims to give a quick overview of how the APIs work and how to use
62 | them effectively.
63 |
64 | First, let's load a tree sequence to work with which has roughly human
65 | parameters for 10 thousand samples and 10Mb chromosomes:
66 |
67 | ```{code-cell} ipython3
68 | ts = tskit.load("data/computing_statistics.trees")
69 | ts
70 | ```
71 |
72 | This tree sequence has ~36.6 thousand trees & ~39 thousand segregating sites.
73 | We'd now like to compute some statistics on this dataset.
74 |
75 | ### One-way statistics
76 |
77 | We refer to statistics that are defined with respect to a single set of
78 | samples as "one-way". An example of such a statistic is diversity, which
79 | is computed using the {meth}`TreeSequence.diversity` method:
80 |
81 | ```{code-cell} ipython3
# Average diversity over the whole sequence, using all samples.
d = ts.diversity()
# The f-prefix is needed so that {d:.3G} is replaced by the formatted value.
print(f"Average diversity per unit sequence length = {d:.3G}")
84 | ```
85 |
86 | This tells the average diversity across the whole sequence and returns a single
87 | number. We'll usually want to compute statistics in
88 | {ref}`windows <tskit:sec_stats_windows>` along the genome and we
89 | use the ``windows`` argument to do this:
90 |
91 | ```{code-cell} ipython3
92 | windows = np.linspace(0, ts.sequence_length, num=5)
93 | d = ts.diversity(windows=windows)
94 | print(windows, d, sep="\n")
95 | ```
96 |
97 | The ``windows`` argument takes a numpy array specifying the breakpoints
98 | along the genome. Here, we use numpy to create four equally spaced windows
99 | of size 2.5 megabases (the windows array contains k + 1 elements to define
100 | k windows). Because we have asked for values in windows, tskit now returns
101 | a numpy array rather than a single value. (See
102 | {ref}`tskit:sec_stats_output_dimensions` for a full description of how the output
103 | dimensions of statistics are determined by the ``windows`` argument.)
104 |
105 | Suppose we wanted to compute diversity within a specific subset of samples.
106 | We can do this using the ``sample_sets`` argument:
107 |
108 | ```{code-cell} ipython3
109 | A = ts.samples()[:100]
110 | d = ts.diversity(sample_sets=A)
111 | print(d)
112 | ```
113 |
114 | Here, we've computed the average diversity within the first hundred samples across
115 | the whole genome. As we've not specified any windows, this is again a single value.
116 |
117 | We can also compute diversity in *multiple* sample sets at the same time by providing
118 | a list of sample sets as an argument:
119 |
120 | ```{code-cell} ipython3
121 | A = ts.samples()[:100]
122 | B = ts.samples()[100:200]
123 | C = ts.samples()[200:300]
124 | d = ts.diversity(sample_sets=[A, B, C])
125 | print(d)
126 | ```
127 |
128 | Because we've computed multiple statistics concurrently, tskit returns a numpy array
129 | of these statistics. We have asked for diversity within three different sample sets,
130 | and tskit therefore returns an array with three values. (In general, the
131 | dimensions of the input determines the dimensions of the output: see
132 | {ref}`tskit:sec_stats_output_dimensions` for a detailed description of the rules.)
133 |
134 | We can also compute multiple statistics in multiple windows:
135 |
136 | ```{code-cell} ipython3
137 | d = ts.diversity(sample_sets=[A, B, C], windows=windows)
138 | print("shape = ", d.shape, "\n", d)
139 | ```
140 |
141 | We have computed diversity within three different sample sets across four
142 | genomic windows, and our output is therefore a 2D numpy array with four
143 | rows and three columns: each row contains the diversity values within
144 | A, B and C for a particular window.
145 |
146 | ### Multi-way statistics
147 |
148 | Many population genetic statistics compare multiple sets of samples to
149 | each other. For example, the {meth}`TreeSequence.divergence` method computes
150 | the divergence between two subsets of samples:
151 |
152 | ```{code-cell} ipython3
# Two disjoint sets of 100 samples each, so the divergence compares
# different groups rather than a set with itself.
A = ts.samples()[:100]
B = ts.samples()[100:200]
d = ts.divergence([A, B])
print(d)
157 | ```
158 |
159 | The divergence between two sets of samples A and B is a single number,
160 | and we again return a single floating point value as the result. We can also
161 | compute this in windows along the genome, as before:
162 |
163 | ```{code-cell} ipython3
164 | d = ts.divergence([A, B], windows=windows)
165 | print(d)
166 | ```
167 |
168 | Again, as we have defined four genomic windows along the sequence, the result is
169 | a numpy array with four values.
170 |
171 | A powerful feature of tskit's stats API is that we can compute the divergences
172 | between multiple sets of samples simultaneously using the ``indexes`` argument:
173 |
174 |
175 | ```{code-cell} ipython3
176 | d = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)])
177 | print(d)
178 | ```
179 |
180 | Here, we've specified three sample sets A, B and C and we've computed the
181 | divergences between A and B, and between A and C. The ``indexes`` argument is used
182 | to specify which pairs of sets we are interested in. In this example
183 | we've computed two different divergence values and the output is therefore
184 | a numpy array of length 2.
185 |
186 | As before, we can combine computing multiple statistics in multiple windows
187 | to return a 2D numpy array:
188 |
189 | ```{code-cell} ipython3
190 | windows = np.linspace(0, ts.sequence_length, num=5)
191 | d = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows)
192 | print(d)
193 | ```
194 |
195 | Each row again corresponds to a window, which contains the average divergence
196 | values between the chosen sets.
197 |
198 | If the ``indexes`` parameter is a 1D array, we interpret this as specifying
199 | a single statistic and remove the empty outer dimension:
200 |
201 | ```{code-cell} ipython3
202 | d = ts.divergence([A, B, C], indexes=(0, 1))
203 | print(d)
204 | ```
205 |
206 | It's important to note that we don't **have** to remove empty dimensions: tskit
207 | will only do this if you explicitly ask it to. Here, for example, we can keep the
208 | output as an array with one value if we wish:
209 |
210 | ```{code-cell} ipython3
211 | d = ts.divergence([A, B, C], indexes=[(0, 1)])
212 | print(d)
213 | ```
214 |
215 | Please see {ref}`tskit:sec_stats_sample_sets` for a
216 | full description of the ``sample_sets`` and ``indexes`` arguments.
217 |
218 | (sec_tutorial_afs)=
219 |
220 | ## Allele frequency spectra
221 |
222 | The allele frequency spectrum is a fundamental tool in population genetics, and
223 | tskit provides a flexible and powerful approach to computing such spectra.
224 | Suppose we have simulated the following tree sequence:
225 |
226 | ```{code-cell} ipython3
227 | from IPython.display import display
228 | ts = tskit.load("data/afs.trees")
229 | tree = ts.first()
230 | display(tree.draw_svg())
231 | ts.tables.sites
232 | ```
233 |
234 | Computing the allele frequency spectrum is then easy:
235 |
236 | ```{code-cell} ipython3
237 | afs = ts.allele_frequency_spectrum(polarised=True, span_normalise=False)
238 | print(afs)
239 | ```
240 |
241 | This tells us that we have two singletons, six doubletons and one 3-ton and
242 | one 4-ton. Note that the first element of the returned AFS array does *not* correspond
243 | to the singletons (see below for why). Because we have simulated these mutations,
244 | we know the ancestral and derived states, so we have set ``polarised`` to True. We
245 | can get the "folded" AFS by setting polarised to False. Because we want simple
246 | counts here and not averaged values, we set ``span_normalise=False``: by
247 | default, windowed statistics are divided by the sequence length, so they are
248 | comparable between windows.
249 |
250 | The returned value here is actually a 2D array, and this is because we can
251 | also perform these computations in windows along the genome:
252 |
253 | ```{code-cell} ipython3
254 | afs = ts.allele_frequency_spectrum(windows=[0, 0.5, 1], span_normalise=False, polarised=True)
255 | print(afs)
256 | ```
257 |
258 | This time, we've asked for the number of sites at each frequency in two
259 | equal windows. Now we can see that in the first half of the sequence we
260 | have three sites (compare with the site table above): one singleton,
261 | one doubleton and one tripleton.
262 |
263 | ### Joint spectra
264 |
265 | We can also compute allele frequencies within multiple sets of samples,
266 | the *joint allele frequency spectra*.
267 |
268 | ```{code-cell} ipython3
269 | node_colours = {0: "blue", 2: "blue", 3: "blue", 1: "green", 4: "green", 5: "green"}
270 | styles = [f".n{k} > .sym {{fill: {v}}}" for k, v in node_colours.items()]
271 | tree.draw_svg(style = "".join(styles))
272 | ```
273 |
274 | Here we've marked the samples as either blue or green (we can imagine
275 | these belonging to different populations, for example). We can then compute
276 | the joint AFS based on these two sets:
277 |
278 | ```{code-cell} ipython3
279 | afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True)
280 | print(afs)
281 | ```
282 |
283 | Now, each window in our AFS is a 2D numpy array, where each dimension
284 | corresponds to frequencies within the different sets. So, we see for example
285 | that there are six sites that are singletons in both sets, 1 site
286 | that is a doubleton in both sets, and 2 sites that are singletons in $[1, 4, 5]$
287 | and not present in the other sample set.
288 |
289 | ### Branch length spectra
290 |
291 | Up to now we've used the {meth}`~TreeSequence.allele_frequency_spectrum` method
292 | to summarise the number of sites that occur at different frequencies. We can also
293 | use this approach to compute the total branch lengths subtending a given
294 | number of samples by setting ``mode="branch"``:
295 |
296 | ```{code-cell} ipython3
297 | afs = ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False)
298 | print(afs)
299 | ```
300 |
301 | Thus, the total branch length over exactly one sample is 4.86, over two is
302 | 5.39, and so on.
303 |
304 |
305 | (sec_tutorial_afs_zeroth_entry)=
306 |
307 | ### Zeroth and final entries in the AFS
308 |
309 | The zeroth element of the AFS is significant when we are working with
310 | sample sets that are a subset of all samples in the tree sequence.
311 | For example, in the following we compute the AFS within the sample set
312 | [0, 1, 2]:
313 |
314 | ```{code-cell} ipython3
315 | afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True)
316 | print(afs)
317 | ```
318 |
319 | Thus, the total branch length over 0, 1 and 2 is 5.3, and over pairs from this set
320 | is 5.25. What does the zeroth value of 4.33 signify? This is the total branch length
321 | over all samples that are **not** in this sample set. By including this value, we
322 | maintain the property that for each tree, the sum of the AFS for any sample set
323 | is always equal to the total branch length. For example, here we compute:
324 |
325 | ```{code-cell} ipython3
326 | print("sum afs = ", np.sum(afs))
327 | print("total branch len = ", tree.total_branch_length)
328 | ```
329 |
330 | The final entry of the AFS is similar: it counts alleles (for mode="site") or
331 | branches (for mode="branch") that are ancestral to all of the given sample set,
332 | but are still polymorphic in the entire set of samples of the tree sequence.
333 | Note, however, that alleles fixed among all the samples, e.g., ones above
334 | the root of the tree, will not be included.
335 |
--------------------------------------------------------------------------------
/bottlenecks.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | (sec_msprime_bottlenecks)=
15 |
16 | # Instantaneous Bottlenecks
17 |
18 | **Konrad Lohse and Jerome Kelleher**
19 |
20 | A common approach to modelling the effect of demographic history on genealogies is to assume that effective population size ($N_e$) changes in discrete steps which define a series of epochs (Griffiths, 1994; Marth et al., 2004; Keightley & Eyre-Walker, 2007; Li & Durbin 2011). In this setting of piece-wise constant $N_e$, capturing a population bottleneck requires three epochs: $N_e$ is reduced by some fraction $b$ at the start of the bottleneck, $T_{start}$, and recovers to its initial value at time $T_{end}$ (Marth et al., 2004). If bottlenecks are short both on the timescale of coalescence and mutations, one may expect little information about the duration of a bottleneck
21 | in sequence data. Thus a simpler, alternative model is to assume that bottlenecks are instantaneous ($T_{end}-T_{start} \rightarrow 0$) and generate a sudden burst of coalescence events (a multiple merger event) in the genealogy. The strength of the bottleneck B can be thought of as an (imaginary) time period during which coalescence events are collapsed, i.e. there is no growth in genealogical branches during B and the probability that a single pair of lineages entering the bottleneck coalesce during the bottleneck is $1-e^{-B}$. Although this simple two parameter model of bottlenecks is attractive and both analytic results and empirical inference (Griffiths, 1994; Galtier et al., 2000; Bunnefeld et al., 2015) have been developed under this model, it has not been straightforward to simulate data under instantaneous bottleneck histories with ms. Instantaneous bottlenecks are implemented as a demographic event in msprime. Instantaneous bottlenecks are similar to coalescent approximations for selective sweeps in that they generate multiple merger events. However, unlike sweeps the whole genome is affected equally. Instantaneous bottlenecks differ from approximate models of sweeps (Durrett & Schweinsberg, 2004) in that they do not affect the exchangeability of lineages. A consequence of this is that all topologies are equally likely (before collapsing, the imaginary bottleneck time).
22 |
23 | Here we simulate a single sample of n=12 from a population that underwent an instantaneous bottleneck 0.4 * 2N generations ago. Note that since msprime is assuming a diploid population we set the initial population size to 1/2 to obtain coalescence times scaled in units of $2N_e$ generations:
24 |
25 | ```{code-cell} ipython3
26 | import msprime
27 | import tskit
28 | import numpy as np
29 | import matplotlib.pyplot as plt
30 | from IPython.display import SVG
31 |
def run_bott_sims(num_samp, time, strength, num_rep=None, seed=123):
    """Simulate ancestry for a single population with one instantaneous bottleneck.

    :param num_samp: Number of haploid genomes to sample.
    :param time: Time at which the bottleneck occurs.
    :param strength: Strength of the bottleneck.
    :param num_rep: Passed to msprime as ``num_replicates``; with the default
        of None the callers in this tutorial treat the result as a single tree
        sequence, otherwise they iterate over it.
    :param seed: Random seed passed to msprime.
    :return: Whatever ``msprime.sim_ancestry`` returns for these arguments.
    """
    # A single population of size 1/2, so that times are in units of 2N_e generations.
    bottleneck_model = msprime.Demography()
    bottleneck_model.add_population(initial_size=1/2)
    bottleneck_model.add_instantaneous_bottleneck(
        time=time, strength=strength, population=0
    )
    # Sample num_samp haploid genomes
    haploid_samples = [msprime.SampleSet(num_samp, ploidy=1)]
    return msprime.sim_ancestry(
        samples=haploid_samples,
        ploidy=2,
        num_replicates=num_rep,
        demography=bottleneck_model,
        random_seed=seed,
    )
42 |
43 | bottT = 0.4
44 | bottB = 2
45 |
46 | ts = run_bott_sims(12, time=bottT, strength=bottB)
47 |
48 | ts.draw_svg(y_axis=True, size=(400, 400))
49 | ```
50 |
51 | The genealogy shows several simultaneous coalescence events at the time of the bottleneck
52 | (T=0.4)
53 |
54 |
55 | ## Checking the SFS against analytic expectation
56 |
57 | Bunnefeld et al. (2015) derive the total length of n-ton branches under an instantaneous bottleneck using a recursion for the generating function of genealogies. Assuming a sample size of $n=4$, $B=4$ and $T=0.4$, the SFS is:
58 |
59 |
60 | ```{code-cell} ipython3
61 | T=bottT
62 | B = 4
63 | s=1-np.exp(-B)
64 | p=s*(-6 + 15*s - 20 * np.power(s,2) + 15 * np.power(s,3) - 6 * np.power(s,4) + np.power(s,5))
65 |
66 | expsfsBottlN= [2/15*(np.exp(-6*T)*(15 *np.exp(6*T) - 9 *np.exp(5*T)*s -
67 | 5*np.exp(3*T)*s*(3 - 3*s + np.power(s,2)) + p)),
68 | 1/5*np.exp(-6*T)*(5*np.exp(6*T) - 6*np.exp(5*T)*s - p),
69 | 2/15*np.exp(-6*T)*(5*np.exp(6*T) - 9*np.exp(5*T)*s + 5*np.exp(3*T)*s*(3-3*s + np.power(s,2))+ p)]
70 |
71 | expsfsBottlN/=np.sum(expsfsBottlN)
72 | print(expsfsBottlN)
73 | ```
74 |
75 | Checking against msprime (10,000 replicates) shows a close fit to this analytic expectation:
76 |
77 | ```{code-cell} ipython3
78 | nrep = 10_000
79 | nsamp = 4
80 | sims = run_bott_sims(nsamp, time=T, strength=B, num_rep=nrep)
81 | Blist=np.zeros((nrep, nsamp+1))
82 | for rep_index, ts in enumerate(sims):
83 | afs=ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False)
84 | Blist[rep_index]+= afs
85 |
86 | data=np.mean(Blist, axis=0)
87 | data/=np.sum(data)
88 |
89 | fig, ax = plt.subplots()
90 | index = np.arange(1,4)
91 | bar_width = 0.4
92 | opacity = 0.9
93 |
94 | expsfs = ax.bar(index+ bar_width, expsfsBottlN, bar_width, alpha=opacity, label='exp')
95 | simsfs = ax.bar(index+ 2*bar_width, data[1:4], bar_width, alpha=opacity, label='exp')
96 | ```
97 |
98 | ## Expected coalescence times
99 |
100 | The expected pairwise coalescence time is $E[t]= 1 + e^{-(B+T)} - e^{-T}$. Both the expected
101 | coalescence time and the probability that a pair of lineages survives the bottleneck fit
102 | the analytic expectation closely:
103 |
104 | ```{code-cell} ipython3
105 |
106 | def pairCoalBott(time, strength, num_rep=1000):
107 | """
108 | Simulates replicate 2-tip tree sequences under an instantaneous bottleneck returning the TMRCA
109 | """
110 | reps=run_bott_sims(2, time=time, strength=strength, num_rep=num_rep)
111 | B = np.zeros(num_rep)
112 | for j, ts in enumerate(reps):
113 | tree = next(ts.trees())
114 | B[j] = tree.time(tree.root)
115 | return(B)
116 |
117 | nrep=1000
118 |
119 | #Recording the mean pairwise coalescence times and the fraction of replicates with t>T for a grid of bottleneck strengths:
120 | bottBlist = np.arange(0.0,5,0.25)
121 | dat = np.zeros(len(bottBlist))
122 | prob = np.zeros(len(bottBlist))
123 | for j in range(len(bottBlist)):
124 | mrcas = pairCoalBott(bottT, bottBlist[j], nrep)
125 | dat[j]=np.mean(mrcas, axis=0)
126 | prob[j]=len(mrcas[mrcas>bottT])/nrep
127 |
128 | bottT2 = bottT * 2 # 0.8
129 | dat2 = np.zeros(len(bottBlist))
130 | prob2 = np.zeros(len(bottBlist))
131 | for j in range(len(bottBlist)):
132 | mrcas = pairCoalBott(bottT2, bottBlist[j], nrep)
133 | dat2[j]=np.mean(mrcas, axis=0)
134 | prob2[j]=len(mrcas[mrcas>bottT2])/nrep
135 |
136 | expProb = [np.exp(-(bottT+i)) for i in bottBlist]
137 | expProb2 = [np.exp(-(bottT2+i)) for i in bottBlist]
138 |
139 | expMean = [1+np.exp(-(bottT+i))-np.exp(-bottT) for i in bottBlist]
140 | expMean2 = [1+np.exp(-(bottT2+i))-np.exp(-bottT2) for i in bottBlist]
141 | ```
142 |
143 | ```{code-cell} ipython3
144 |
145 | plt.plot(bottBlist, expProb, c='brown', ls=":", lw=4, label=f"T={bottT}, theoretical")
146 | plt.plot(bottBlist, prob, c='red', marker='+', ms=8, label=f"T={bottT}, sim")
147 | plt.plot(bottBlist, expProb2, c='navy', ls=":", lw=4, label=f"T={bottT2}, theoretical")
148 | plt.plot(bottBlist, prob2, c='blue', marker='+', lw=1, ms=8, label=f"T={bottT2}, simulated")
149 | plt.xlabel("Bottleneck Strength B")
150 | plt.ylabel("p(t>T)")
151 | plt.legend()
152 | plt.show()
153 |
154 | plt.plot(bottBlist, expMean, c='brown', ls=":", lw=4, label=f"T={bottT}, theoretical")
155 | plt.plot(bottBlist, dat, c='red', marker='+', ms=8, label=f"T={bottT}, simulated")
156 | plt.plot(bottBlist, expMean2, c='navy', ls=":", lw=4, label=f"T={bottT2}, theoretical")
157 | plt.plot(bottBlist, dat2, c='blue', marker='+', ms=8, label=f"T={bottT2}, simulated")
158 | plt.xlabel("Bottleneck Strength B");
159 | plt.ylabel("E[t]")
160 | plt.legend()
161 | plt.show()
162 | ```
163 |
164 | ## The distribution of pairwise coalescence times
165 |
166 | The distribution of pairwise coalescence times has two maxima at $t=0$ and the bottleneck time $t=T$ (we have assumed $T=0.8$ below) as expected. The simulated distribution of pairwise coalescence times fits the analytic expectation:
167 |
168 | ```{code-cell} ipython3
169 | s = bottBlist[3]
170 | sprob = 1-np.exp(-s)
171 |
172 | coaldis = pairCoalBott(s, bottT2, 10000);
173 | coaldisFilt = coaldis[(coaldis < 3)];
174 |
175 | probtest = len(coaldis[coaldis>bottT2])/10000
176 | expprob2 = np.exp(-(bottT2+s))
177 |
178 | tlist = np.arange(0.0,3,0.25);
179 | coalEpx1 = [np.exp(-i) for i in tlist[0:4]];
180 | coalEpx2 = [np.exp(-i)-(sprob*np.exp(-i)) for i in tlist[3:]];
181 |
182 | plt.plot(tlist[0:4], coalEpx1, color = 'black', linewidth = 2);
183 | plt.plot(tlist[3:], coalEpx2, color = 'black', linewidth = 2);
184 | plt.hist(coaldisFilt, bins = 20, density=True);
185 | plt.xlabel("t");
186 | plt.ylabel("f(t)")
187 | plt.show()
188 | ```
189 |
190 | ## Approximating the site frequency spectrum
191 |
192 | Bottlenecks can have a substantial effect on the site frequency spectrum (SFS). The SFS is a fundamental summary of sequence variation that forms the basis of many modern inference approaches (e.g. sweepfinder, DFE-alpha, dadi). In the absence of linkage information the SFS is a lossless summary, i.e. any summary of sequence variation that ignores linkage (e.g. pairwise measures of diversity and divergence, $F_{st}$, Tajima's D etc) is a summary of the SFS (Achaz 2009). The SFS is convenient analytically, since it depends only on the mean length and frequency of genealogical branches. For many demographic models of interest the mean length of n-ton branches can be derived either using coalescent theory (Chen 2011) or diffusion equations (Gutenkunst 2009). A number of composite likelihood approaches have been developed based on either analytic results for the SFS or approximations obtained from coalescent simulations (Gutenkunst 2009, Excoffier 2002). We can use msprime to obtain the SFS for a sample of $n=12$ for a range of bottleneck strengths:
193 |
194 | ```{code-cell} ipython3
195 | def bottSFS(num_samp, time, strength, num_rep):
196 | reps = run_bott_sims(num_samp, time=time, strength=strength, num_rep=num_rep)
197 | Blist = np.zeros((num_rep, num_samp+1))
198 | for rep_index, ts in enumerate(reps):
199 | afs=ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False)
200 | Blist[rep_index]+= afs
201 | data = np.mean(Blist, axis=0)
202 | data /=np.sum(data)
203 | return data[0:num_samp]
204 |
205 | nrep = 5_000
206 | nsamp = 12
207 | bottT = 0.8
208 |
209 | bottBlist = np.arange(0.0,5,1)
210 | datalist = {}
211 | for s in bottBlist:
212 | datalist[s]= bottSFS(nsamp, bottT, s, nrep)
213 | ```
214 |
215 | With increasing bottleneck strength the SFS becomes increasingly skewed (the leftmost blue bars show the SFS for a population of constant size). However, bottlenecks have a complex effect on the different frequency classes of the SFS: while the relative frequency of singletons increases, other frequency classes (e.g. doubletons) have a non-monotonic relationship with B:
216 |
217 | ```{code-cell} ipython3
218 | bar_width=0.18
219 | index = np.arange(1, nsamp)
220 | j = 0
221 | for ss, y in datalist.items():
222 | plt.bar(index + j * bar_width, y[1:], bar_width, label=str(ss))
223 | j += 1
224 | ```
225 |
226 | ## The marginal distribution of n-ton branch lengths
227 |
228 | We may be interested in the marginal distributions (pdf) of branch lengths immediately
229 | above a node with n tips, which determines the expected distribution of n-ton mutations in a
230 | nonrecombining block of sequence (Bunnefeld et al., 2016). Like the distribution of
231 | pairwise coalescence times, we expect the pdf of n-ton branches to be discontinuous.
232 | Assuming n=4, B=0.75 and T=0.8 (as above):
233 |
234 | ```{code-cell} ipython3
235 | bottB = 0.8
236 | bottT = 0.75
237 | numrep = 20_000
238 | nsamp = 4
239 | sims = run_bott_sims(nsamp, bottT, bottB, num_rep=numrep)
240 | B = np.zeros((numrep, nsamp))
241 | for rep_index, ts in enumerate(sims):
242 | tree = next(ts.trees())
243 | for u in tree.nodes():
244 | nleaves = tree.num_samples(u)
245 | if tree.parent(u) != tskit.NULL:
246 | B[rep_index, nleaves] += tree.branch_length(u) # Branch length above this node
247 |
248 | Btrans=np.array(B).T.tolist()
249 | ```
250 |
251 | ```{code-cell} ipython3
252 | plt.hist([x for x in sorted(Btrans[2]) if x < 2], bins = 50, density=True, label="doubletons");
253 | plt.hist([x for x in sorted(Btrans[3]) if x < 2], bins = 50, density=True, label="tripletons");
254 | plt.ylim(0, 3)
255 | plt.ylabel("Probability")
256 | plt.xlabel("Branch length")
257 | plt.legend()
258 | plt.show()
259 | ```
260 |
261 | Again, this gives a good fit to the analytic expectation (see Bunnefeld et al. 2016).
262 |
263 | ## Multiple populations
264 |
265 | The ``InstantaneousBottleneck`` model does not work without specifying any populations. Measuring bottleneck strength in generations (i.e. an imaginary time of coalescence) has a subtle but important consequence when we consider samples from multiple populations: the effect of the bottleneck on lineages present in each population at time T depends on the size of each population: genealogies in small populations are more strongly affected.
266 |
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Jupyter-build doesn't have an option to automatically show the
4 | # saved reports, which makes it difficult to debug the reasons for
5 | # build failures in CI. This is a simple wrapper to handle that.
6 |
7 | REPORTDIR=_build/html/reports
8 |
9 | jupyter-book build -W -n --keep-going .
10 | RETVAL=$?
11 | if [ "$RETVAL" -ne 0 ]; then
12 |     if [ -e "$REPORTDIR" ]; then
13 |         echo "Error occured; showing saved reports"
14 |         cat "$REPORTDIR"/*
15 |     fi
16 | else
17 |     # Clear out any old reports
18 |     rm -f "$REPORTDIR"/*
19 | fi
20 | exit "$RETVAL"
21 |
--------------------------------------------------------------------------------
/completing_forward_sims.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | kernelspec:
7 | display_name: Python 3
8 | language: python
9 | name: python3
10 | ---
11 |
12 | (sec_completing_forwards_simulations)=
13 |
14 | # Completing forwards simulations
15 |
16 | The ``msprime`` simulator generates tree sequences using the backwards in
17 | time coalescent model. But it is also possible to output tree sequences
18 | from [forwards-time](https://doi.org/10.1371/journal.pcbi.1006581)
19 | simulators such as [SLiM](https://messerlab.org/slim)
20 | and [fwdpy11](https://fwdpy11.readthedocs.io/) (see the
21 | {ref}`sec_tskit_forward_simulations` tutorial).
22 | There are many advantages to using forward-time simulators, but they
23 | are usually quite slow compared to similar coalescent simulations. In this
24 | section we show how to combine the best of both approaches by simulating
25 | the recent past using a forwards-time simulator and then complete the
26 | simulation of the ancient past using ``msprime``. (We sometimes refer to this as
27 | "recapitation", as we can think of it as adding a "head" onto a tree sequence.)
28 |
29 | First, we define a simple Wright-Fisher simulator which returns a tree sequence
30 | with the properties that we require (please see the
31 | {ref}`msprime documentation `
32 | for a formal description of these properties):
33 |
34 | ```{code-cell} ipython3
35 | import tskit
36 | import msprime
37 | import random
38 | import numpy as np
39 |
40 |
41 | def wright_fisher(N, T, L=100, random_seed=None):
42 | """
43 | Simulate a Wright-Fisher population of N haploid individuals with L discrete
44 | loci for T generations, with one recombination per transmission event
45 | Based on Algorithm W from https://doi.org/10.1371/journal.pcbi.1006581
46 | """
47 | random.seed(random_seed)
48 | tables = tskit.TableCollection(L)
49 | tables.time_units = "generations"
50 | tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json()
51 | tables.populations.add_row()
52 | P = np.arange(N, dtype=int)
53 | for _ in range(N):
54 | tables.nodes.add_row(time=T, population=0)
55 | t = T
56 | while t > 0:
57 | t -= 1
58 | Pp = P.copy()
59 | for j in range(N):
60 | u = tables.nodes.add_row(time=t, population=0)
61 | Pp[j] = u
62 | a = random.randint(0, N - 1)
63 | b = random.randint(0, N - 1)
64 | x = random.randint(1, L - 1)
65 | tables.edges.add_row(0, x, P[a], u)
66 | tables.edges.add_row(x, L, P[b], u)
67 | P = Pp
68 |
69 | tables.sort()
70 | # Simplify with respect to nodes at time zero (the current generation), using
71 | # `keep_input_roots`` to keep the ancient nodes from the initial population.
72 | tables.simplify(np.where(tables.nodes.time == 0)[0], keep_input_roots=True)
73 | return tables.tree_sequence()
74 | ```
75 |
76 | We then run a tiny forward simulation of 10 two-locus individuals
77 | for 5 generations, and print out the resulting trees:
78 |
79 | ```{code-cell} ipython3
80 | num_loci = 2
81 | N = 10
82 | wf_ts = wright_fisher(N, 5, L=num_loci, random_seed=3)
83 | wf_ts.draw_svg()
84 | ```
85 |
86 | Because our Wright-Fisher simulation ran for only 5 generations, there has not
87 | been enough time for the trees to fully coalesce. Therefore, instead of having
88 | one root, the trees have several --- the first tree has 2 and the second 4.
89 | Nodes 16, 17, 18, and 19 in this simulation represent the members of the
90 | initial population of the simulation that have genetic descendants at the end
91 | of the simulation. These unary branches joining samples and coalesced subtrees
92 | to the nodes in the initial generation are essential as they allow us to
93 | correctly assemble the various fragments of ancestral material into chromosomes
94 | when creating the initial conditions for the coalescent simulation.
95 | (Please see the
96 | {ref}`msprime documentation `
97 | for more details on the
98 | required properties of input tree sequences.)
99 |
100 | The process of completing this tree sequence using a coalescent simulation
101 | begins by first examining the root segments on the input trees. We get the
102 | following segments:
103 |
104 | ```
105 | [(0, 2, 17), (0, 2, 18), (1, 2, 19), (1, 2, 16)]
106 | ```
107 |
108 | where each segment is a ``(left, right, node)`` tuple. As nodes 17 and 18 are
109 | present in both trees, they have segments spanning both loci. Nodes 16 and 19 are
110 | present only in the second tree, and so they have ancestral segments only for
111 | the second locus. Note that this means that we do *not* simulate the ancestry
112 | of the entire initial generation of the simulation, but rather the exact
113 | minimum that we need in order to complete the ancestry of the current
114 | generation. For instance, root ``19`` has not coalesced over the interval from
115 | ``1.0`` to ``2.0``, while root ``17`` has not coalesced over the entire segment
116 | from ``0.0`` to ``2.0``.
117 |
118 | We run the coalescent simulation to complete this tree sequence using the
119 | ``initial_state`` argument to {func}`msprime.sim_ancestry`. Because we have simulated a
120 | two locus system with a recombination rate of ``1 / num_loci`` per generation
121 | in the Wright-Fisher model, we want to use the same system in the coalescent simulation.
122 | Note that we set the ``ploidy`` argument to 1 here because our forward time simulation
123 | is haploid and msprime uses a diploid time scale by default.
124 |
125 |
126 | ```{code-cell} ipython3
127 | coalesced_ts = msprime.sim_ancestry(
128 | population_size=N,
129 | initial_state=wf_ts,
130 | recombination_rate=1 / num_loci,
131 | ploidy=1,
132 | random_seed=7)
133 | coalesced_ts.draw_svg()
134 | ```
135 |
136 | The trees have fully coalesced and we've successfully combined a forwards-time
137 | Wright-Fisher simulation with a coalescent simulation: hooray!
138 |
139 |
140 | ## Why keep input roots (i.e., the initial generation)?
141 |
142 | We can now see why it is essential that we take care to preserve the roots of all
143 | trees when we simplified the tree sequence (by passing ``keep_input_roots=True``),
144 | so that the initial generation can be properly used as the
145 | ``initial_state`` argument to {func}`msprime.sim_ancestry`. In the example above, if node
146 | ``18`` was not in the tree sequence, we would not know that the segment that
147 | node ``10`` inherits from on ``[0.0, 1.0)`` and the segment that node ``2``
148 | inherits from on ``[1.0, 2.0)`` both exist in the same node.
149 |
150 | Note that although portions of the initial generation (above, nodes ``16``, ``17``,
151 | ``18``, and ``19``) must be in the tree sequence, they do *not* have to be
152 | samples, and their entire genomes need not be represented (e.g., node ``19`` is
153 | only present on ``[1.0, 2.0)``). This allows {func}`msprime.sim_ancestry` to not simulate
154 | the entire history of the first generation, but only what is necessary to complete
155 | any uncoalesced trees. Happily, this is easily done with the ``keep_input_roots`` argument
156 | to {meth}`~tskit.TableCollection.simplify`. Note that this argument would need
157 | to be provided to the periodic {meth}`~tskit.TableCollection.simplify` steps
158 | which are essential in practical simulation, but that we skipped in the toy simulator above.
159 |
160 | In fact, this is precisely how tree sequence recording in [SLiM](https://messerlab.org/slim)
161 | works, and {func}`pyslim.recapitate` provides a front-end to
162 | the method presented here.
163 |
164 |
165 | ## Topology gotchas
166 |
167 | The trees that we output from this combined forwards and backwards simulation
168 | process have some slightly odd properties that are important to be aware of.
169 | In the example above, we can see that the old roots are still present in both trees,
170 | even though they have only one child and are clearly redundant.
171 | This is because the tables of ``initial_state`` have been retained, without modification,
172 | at the top of the tables of the output tree sequence. While this
173 | redundancy is not important for many tasks, there are some cases where
174 | they may cause problems:
175 |
176 | 1. When computing statistics on the number of nodes, edges or trees in a tree
177 | sequence, having these unary edges and redundant nodes will slightly
178 | inflate the values.
179 | 2. If you are computing the overall tree "height" by taking the time of the
180 | root node, you may overestimate the height because there is a unary edge
181 | above the "real" root (this would happen if one of the trees had already
182 | coalesced in the forwards-time simulation).
183 |
184 | For these reasons it may be better to remove this redundancy from your
185 | computed tree sequence which is easily done using the
186 | {meth}`simplify <tskit.TreeSequence.simplify>` method:
187 |
188 | ```{code-cell} ipython3
189 | final_ts = coalesced_ts.simplify()
190 | coalesced_ts.draw_svg()
191 | ```
192 |
193 | This final tree sequence is topologically identical to the original tree sequence,
194 | but has the redundant nodes and edges removed. Note also that the node IDs have been
195 | reassigned so that the samples are 0 to 9 --- if you need the IDs from the original
196 | tree sequence, please set ``map_nodes=True`` when calling ``simplify`` to get a
197 | mapping between the two sets of IDs.
198 |
199 |
200 |
--------------------------------------------------------------------------------
/counting_topologies.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | (sec_counting_topologies)=
18 |
19 | ```{code-cell} ipython3
20 | :tags: [remove-cell]
21 | import msprime
22 | import stdpopsim
23 |
24 | def topologies_sim_speciestree():
25 | newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)"
26 | demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100)
27 | ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321)
28 | ts.dump("data/topologies_sim_speciestree.trees")
29 |
30 | def topologies_sim_stdpopsim():
31 | species = stdpopsim.get_species("HomSap")
32 | model = species.get_demographic_model("OutOfAfrica_3G09")
33 | contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate)
34 | samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000}
35 | engine = stdpopsim.get_engine("msprime")
36 | ts = engine.simulate(model, contig, samples, seed=321)
37 | ts.dump("data/topologies_sim_stdpopsim.trees")
38 |
39 |
40 | def create_notebook_data():
41 | topologies_sim_speciestree()
42 | topologies_sim_stdpopsim()
43 |
44 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
45 | ```
46 |
47 | # Counting topologies
48 |
49 | **Yan Wong**
50 |
51 | This tutorial is intended to be a gentle introduction to the combinatorial
52 | treatment of tree topologies in `tskit`. For a more formal introduction,
53 | see the {ref}`sec_combinatorics` section of the
54 | official `tskit` {ref}`documentation`.
55 |
56 | The *topology* of a single tree is the term used to describe the branching pattern,
57 | regardless of the lengths of the branches. For example, both trees below have the
58 | same topology, although the branch lengths differ:
59 |
60 | ```{code-cell}
61 | import tskit
62 | node_labels = {0: "a", 1: "b", 2: "c"} # avoid confusion by using letters to label tips
63 | tree = tskit.Tree.generate_comb(3)
64 | display(tree.draw_svg(node_labels=node_labels, y_axis=True))
65 |
66 | deep_tree = tskit.Tree.generate_comb(10).tree_sequence.simplify([0, 1, 2]).first()
67 | display(deep_tree.draw_svg(node_labels=node_labels, y_axis=True))
68 | ```
69 |
70 | :::{note}
71 | The treatment of topologies in `tskit` is restricted to trees with a single defined root,
72 | without nodes with a single child (i.e. trees must consist of nodes that are either leaves,
73 | or internal nodes with two or more children). For convenience in the examples
74 | below, trees are drawn with the tips flagged as samples, although whether a node is a sample or
75 | not does not change the topology of the tree.
76 | :::
77 |
78 | ## Tree labellings and shapes
79 |
80 | The topology of a tree also takes into account the labelling of tips, so that
81 | the trees below, although they have the same *shape*, count as three
82 | different topologies:
83 |
84 | ```{code-cell}
85 | :tags: [hide-input]
86 | from string import ascii_lowercase
87 | from IPython.display import SVG
88 |
89 | def str_none(s, prefix=None):
90 | if s is not None:
91 | if prefix is None:
92 | return str(s)
93 | else:
94 | return prefix + " = " + str(s)
95 | return None
96 |
97 | def draw_svg_trees(trees, node_labels={}, x_lab_attr=None, width=100, height=150, space=10):
98 | w = width + space
99 | h = height + space
100 | trees = list(trees)
101 | s = f''
112 | return SVG(s)
113 |
114 | draw_svg_trees(tskit.all_tree_labellings(tree), node_labels={u: ascii_lowercase[u] for u in tree.samples()})
115 | ```
116 |
117 | These are, in fact, the only possible three labellings for a three-tip tree of that shape.
118 | There is only one other possible shape for a three-tip tree, and for this shape,
119 | all labelling orders are equivalent (in other words, there is only one
120 | possible labelling):
121 |
122 | ```{code-cell}
123 | :tags: [hide-input]
124 | tskit.Tree.generate_star(3).draw_svg(node_labels={})
125 | ```
126 |
127 | A 3-tip tree therefore has only four possible topologies.
128 | These can be generated with the {func}`~tskit.all_trees` function.
129 |
130 | ```{code-cell}
131 | generated_trees = tskit.all_trees(3)
132 | print("For a three-tip tree there are", len(list(generated_trees)), "labelled topologies.")
133 | ```
134 |
135 | Here they are, plotted out with their shapes enumerated from zero:
136 |
137 | ```{code-cell}
138 | :tags: [hide-input]
139 | draw_svg_trees(
140 | tskit.all_trees(3),
141 | node_labels={u: ascii_lowercase[u] for u in tree.samples()},
142 | x_lab_attr="shape"
143 | )
144 | ```
145 |
146 | ### Enumerating shapes and labellings
147 |
148 | For a tree with four tips, more topologies and shapes are possible. As before, we can generate the
149 | topologies using {func}`~tskit.all_trees`. Alternatively, if we only want the (unlabelled) shapes,
150 | we can use the {func}`~tskit.all_tree_shapes` function:
151 |
152 | ```{code-cell}
153 | print("For a four-tip tree there are", len(list(tskit.all_trees(4))), "labelled topologies.")
154 |
155 | generated_trees = tskit.all_tree_shapes(4)
156 | print("These can be categorised into", len(list(generated_trees)), "shapes.")
157 | ```
158 |
159 | Again, we can give each shape a number or *index*, starting from zero:
160 |
161 | ```{code-cell}
162 | :tags: [hide-input]
163 | draw_svg_trees(tskit.all_tree_shapes(4), x_lab_attr="shape")
164 | ```
165 |
166 | Each of these shapes will have a separate number of possible labellings, and trees with
167 | these labellings can be created using {func}`~tskit.all_tree_labellings`:
168 |
169 | ```{code-cell}
170 | for shape_index, tree in enumerate(tskit.all_tree_shapes(4)):
171 | labellings = tskit.all_tree_labellings(tree)
172 | num_labellings = len(list(labellings))
173 | print(
174 | f"Tree shape {shape_index} for a four-tip tree has "
175 | f"{num_labellings} labelling{'' if num_labellings==1 else 's'}."
176 | )
177 | ```
178 |
179 | Any tree topology for a tree of $N$ tips can therefore be described by a
180 | shape index combined with a labelling index. This is known as the
181 | *rank* of a tree, and it can be obtained using the
182 | {meth}`Tree.rank` method. For instance, here is the rank of a simulated tree
183 | of 10 tips:
184 |
185 | ```{code-cell}
186 | :tags: [hide-input]
187 | import msprime
188 | num_tips = 10
189 | simulated_ts = msprime.sim_ancestry(10, ploidy=1, random_seed=123)
190 | simulated_tree = simulated_ts.first()
191 | print("The topology of the simulated tree below can be described as", simulated_tree.rank())
192 | ascii_node_labels = {u: ascii_lowercase[u] for u in simulated_tree.samples()}
193 | simulated_tree.draw_svg(node_labels=ascii_node_labels)
194 | ```
195 |
196 |
197 | A tree with the same topology (i.e. the same shape and labelling, but ignoring
198 | the branch lengths) can be generated using the {meth}`Tree.unrank` method, by
199 | specifying the number of tips and the appropriate `(shape, labelling)` tuple:
200 |
201 | ```{code-cell}
202 | new_tree = tskit.Tree.unrank(num_tips, (1270, 21580))
203 | new_tree.draw_svg(node_labels=ascii_node_labels)
204 | ```
205 |
206 | Note that this method generates a single tree in a new tree sequence
207 | whose default sequence length is 1.0.
208 |
209 | ## Methods for large trees
210 |
211 | The number of possible topologies for a tree with $N$ tips
212 | grows very rapidly with $N$. For instance, with 10 tips, there are
213 | 282,137,824 possible topologies.
214 |
215 | For this reason, the {func}`~tskit.all_trees`, {func}`~tskit.all_tree_shapes` and
216 | {func}`~tskit.all_tree_labellings` methods do not return a list of trees
217 | but an iterator over the trees. This means it is perfectly possible to start
218 | iterating over (say) all tree shapes for a tree of 100 leaves, but
219 | the iterator will not finish before the death of our galaxy.
220 |
221 | ```{code-cell}
222 | for num_trees, tree in enumerate(tskit.all_tree_shapes(100)):
223 | shape = tree.rank().shape
224 | b2 = tree.b2_index()
225 | print(f"A 100-tip tree with shape index {shape} has a b2 balance index of {b2}")
226 | if num_trees > 5:
227 | break # better not let this run too long!
228 | ```
229 |
230 | For similar combinatorial reasons, the {meth}`Tree.rank` method can be
231 | inefficient for large trees. To compare the topology of two trees, you are
232 | therefore recommended to use e.g. the {meth}`Tree.kc_distance` method
233 | rather than comparing ranks directly.
234 |
235 | ```{code-cell}
236 | simulated_tree = simulated_ts.first(sample_lists=True) # kc_distance requires sample lists
237 | if simulated_ts.first(sample_lists=True).kc_distance(simulated_tree) == 0:
238 | print("Trees are identical")
239 | # To compare to the new_tree we need to fix
240 | # print("The simulated and topology-constructed trees have the same topology")
241 | ```
242 |
243 | Despite the combinatorial explosion associated with topologies of
244 | many-tip trees, it is still possible to efficiently count
245 | the number of *embedded topologies* in a large tree.
246 |
247 | ### Embedded topologies
248 |
249 | An embedded topology is a topology involving a subset of the tips of a tree.
250 | If the tips are classified into (say) three groups, red, green, and blue,
251 | we can efficiently count all the embedded three-tip trees which have
252 | one tip from each group using the {meth}`Tree.count_topologies` method.
253 |
254 | ```{code-cell}
255 | :tags: [hide-input]
256 | big_tree = tskit.load("data/topologies_sim_speciestree.trees").first()
257 | # Check all observed topologies have the same counts
258 | assert list(big_tree.count_topologies()[0, 1, 2].values()) == [32, 32]
259 | styles = [
260 | f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}"
261 | for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations())
262 | ]
263 | big_tree.draw_svg(style="".join(styles), node_labels={}, time_scale="rank", x_label="big_tree")
264 | ```
265 |
266 | In this tree, it is clear that the green and blue tips never cluster together.
267 | The {meth}`Tree.count_topologies` method exhaustively looks at all
268 | combinations of one red, one blue, and one green tip, and confirms that we never see
269 | the topology grouping green and blue. However, as might be expected from
270 | examination of the plot above, a red tip is equally likely to be a sister to a
271 | green tip as to a blue tip:
272 |
273 | ```{code-cell}
274 | # By default `count_topologies` chooses one tip from each population, like setting
275 | # sample_sets=[ts.samples(p.id) for p in ts.populations() if len(ts.samples(p.id)) > 0]
276 |
277 | topology_counter = big_tree.count_topologies()
278 |
279 | colours = ['red', 'green', 'blue']
280 | styles = [f".n{u}>.sym {{fill: {c} }}" for u, c in enumerate(colours)]
281 |
282 | embedded_counts = topology_counter[0, 1, 2]
283 | for embedded_tree in tskit.all_trees(3):
284 | rank = embedded_tree.rank()
285 | number_of_instances = embedded_counts[rank]
286 | label = f"{number_of_instances} instances embedded in big_tree"
287 | display(embedded_tree.draw_svg(style="".join(styles), node_labels={}, x_label=label))
288 | ```
289 |
290 | ## Methods over tree sequences
291 |
292 | It can be useful to count embedded topologies over an entire tree sequence.
293 | For instance, we might want to know the number of embedded topologies
294 | that support Neanderthals as a sister group to Europeans versus Africans.
295 | `Tskit` provides the efficient {meth}`TreeSequence.count_topologies` method to
296 | do this [incrementally](sec_incremental), without having to re-count the topologies
297 | independently in each tree.
298 |
299 | ```{code-cell}
300 | :tags: [hide-input]
from myst_nb import glue

# Load the stored stdpopsim tree sequence used for the rest of this section
ts = tskit.load("data/topologies_sim_stdpopsim.trees")
print(f"Loaded a stdpopsim of {ts.num_trees} African+European+Chinese trees, each with {ts.num_samples} tips")

# Expose the sequence length, in whole kilobases, to the page text as {glue:}`seq_len`
seq_len_kb = int(ts.sequence_length/1000)
glue("seq_len", seq_len_kb, display=False)
305 | ```
306 |
307 | Although the trees in this tree sequence are very large, counting the embedded topologies is
308 | quite doable (for speed in this demo we are only simulating {glue:}`seq_len` kilobases, but
309 | calculating the average over an entire chromosome simply takes a little longer):
310 |
311 | ```{code-cell}
from datetime import datetime

# Display name and plotting colour for each population, keyed by its metadata "id"
names = {"YRI": "African", "CEU": "European", "CHB": "Chinese"}
colours = {"YRI": "yellow", "CEU": "green", "CHB": "blue"}

# Map each population's metadata "id" (e.g. "YRI") to its numeric population id
population_map = {p.metadata["id"]: p.id for p in ts.populations()}
# Sorted numeric ids of the populations that contain at least one sample node
sample_populations = sorted({ts.node(u).population for u in ts.samples()})
# Span-weighted count of every possible embedded topology, keyed by tree rank
topology_span = {tree.rank(): 0 for tree in tskit.all_trees(len(sample_populations))}

start = datetime.now()
total = 0
# Pair each per-tree topology counter with the tree it was computed from
for topology_counter, tree in zip(ts.count_topologies(), ts.trees()):
    embedded_topologies = topology_counter[sample_populations]
    weight = tree.span / ts.sequence_length  # fraction of the genome covered by this tree
    for rank, count in embedded_topologies.items():
        topology_span[rank] += count * weight
        total += count

# A timedelta formats as H:MM:SS.ffffff, which reads wrongly when followed by the
# word "seconds", so convert it to a plain number of seconds before printing
elapsed = (datetime.now() - start).total_seconds()
print(f"Counted {total} embedded topologies in {elapsed:.2f} seconds")
329 | ```
330 |
331 | ```{code-cell}
332 | :tags: [hide-input]
ntips = len(sample_populations)
node_labels = {}
# First rule renders the sample labels as small superscript text
css_rules = [".sample text.lab {baseline-shift: super; font-size: 0.7em;}"]

# Tip i of each embedded tree stands for the i-th entry of sample_populations:
# label it with the population's display name and fill its symbol with that colour
for tip, pop_id in enumerate(sample_populations):
    name = ts.population(pop_id).metadata["id"]
    node_labels[tip] = names[name]
    css_rules.append(f".n{tip}>.sym {{fill: {colours[name]} }}")
styles = "".join(css_rules)

# Divide by the summed span so each topology is reported as a percentage of the total
total = sum(topology_span.values())
for rank, weight in topology_span.items():
    embedded_tree = tskit.Tree.unrank(ntips, rank)
    display(
        embedded_tree.draw_svg(
            size=(160, 150),
            style=styles,
            node_labels=node_labels,
            x_label=f"{weight/total *100:.1f}% of genome",
        )
    )
347 | ```
348 |
349 | Perhaps unsurprisingly, the most common topology is the one that groups the non-African
350 | populations together (although there are many trees of the other two topologies,
351 | mostly reflecting genetic divergence prior to the emergence of humans out of Africa).
352 |
353 | For an example with real data, see {ref}`sec_popgen_topological`
354 | in the {ref}`sec_intro_popgen` tutorial.
--------------------------------------------------------------------------------
/data/afs.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/afs.trees
--------------------------------------------------------------------------------
/data/basics.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/basics.trees
--------------------------------------------------------------------------------
/data/benchmarks_without_copy_longer_genome.txt:
--------------------------------------------------------------------------------
1 | toolkit nsam nmutations Nu nbytes seconds
2 | tskit 25 37743 0.25 943575 0.024380256014410406
3 | libseq 25 37743 0.25 943575 0.004416356037836522
4 | allel 25 37743 0.25 943575 0.006015182007104158
5 | tskit 38 41264 0.25 1568032 0.02725571603514254
6 | libseq 38 41264 0.25 1568032 0.007482443994376808
7 | allel 38 41264 0.25 1568032 0.008124893007334322
8 | tskit 60 45965 0.25 2757900 0.03147959499619901
9 | libseq 60 45965 0.25 2757900 0.010253532032947987
10 | allel 60 45965 0.25 2757900 0.011828921968117356
11 | tskit 94 52047 0.25 4892418 0.03646116703748703
12 | libseq 94 52047 0.25 4892418 0.014803380996454507
13 | allel 94 52047 0.25 4892418 0.018299699993804097
14 | tskit 146 56647 0.25 8270462 0.04079140804242343
15 | libseq 146 56647 0.25 8270462 0.021643130981829017
16 | allel 146 56647 0.25 8270462 0.02823396399617195
17 | tskit 227 60026 0.25 13625902 0.044762183038983494
18 | libseq 227 60026 0.25 13625902 0.032079764001537114
19 | allel 227 60026 0.25 13625902 0.04379657399840653
20 | tskit 353 63808 0.25 22524224 0.04979721503332257
21 | libseq 353 63808 0.25 22524224 0.049665857979562134
22 | allel 353 63808 0.25 22524224 0.06984564702725038
23 | tskit 549 68808 0.25 37775592 0.05479123996337876
24 | libseq 549 68808 0.25 37775592 0.08029428997542709
25 | allel 549 68808 0.25 37775592 0.11476215999573469
26 | tskit 854 72633 0.25 62028582 0.06038561399327591
27 | libseq 854 72633 0.25 62028582 0.12830477399984375
28 | allel 854 72633 0.25 62028582 0.18557889200747013
29 | tskit 1329 77876 0.25 103497204 0.06728244601981714
30 | libseq 1329 77876 0.25 103497204 0.21029049198841676
31 | allel 1329 77876 0.25 103497204 0.30698327702702954
32 | tskit 2067 81594 0.25 168654798 0.0740198030252941
33 | libseq 2067 81594 0.25 168654798 0.33949470898369327
34 | allel 2067 81594 0.25 168654798 0.497885801945813
35 | tskit 3215 85693 0.25 275502995 0.0821873809909448
36 | libseq 3215 85693 0.25 275502995 0.5518272669869475
37 | allel 3215 85693 0.25 275502995 0.8115627619554289
38 | tskit 4999 90771 0.25 453764229 0.0933208679780364
39 | libseq 4999 90771 0.25 453764229 0.9057928950060159
40 | allel 4999 90771 0.25 453764229 1.3362707490450703
41 | tskit 7775 95036 0.25 738904900 0.09812425600830466
42 | libseq 7775 95036 0.25 738904900 1.4732784099760465
43 | allel 7775 95036 0.25 738904900 2.174873666022904
44 | tskit 12091 99421 0.25 1202099311 0.1134186849812977
45 | libseq 12091 99421 0.25 1202099311 2.399354270019103
46 | allel 12091 99421 0.25 1202099311 3.5421803669887595
47 | tskit 18803 103851 0.25 1952710353 0.13057904096785933
48 | libseq 18803 103851 0.25 1952710353 3.9014300610288046
49 | allel 18803 103851 0.25 1952710353 5.784294551995117
50 | tskit 29240 108771 0.25 3180464040 0.14623118803137913
51 | libseq 29240 108771 0.25 3180464040 6.36328757496085
52 | allel 29240 108771 0.25 3180464040 9.451211793988477
53 | tskit 45470 113319 0.25 5152614930 0.16928912402363494
54 | libseq 45470 113319 0.25 5152614930 10.348922087985557
55 | allel 45470 113319 0.25 5152614930 15.338866573001724
56 | tskit 70710 117860 0.25 8333880600 0.1923062339774333
57 | libseq 70710 117860 0.25 8333880600 16.752991039014887
58 | allel 70710 117860 0.25 8333880600 25.193067331041675
59 | tskit 109960 122092 0.25 13425236320 0.2233057350385934
60 | libseq 109960 122092 0.25 13425236320 28.182809649966657
61 | allel 109960 122092 0.25 13425236320 41.492513177974615
62 | tskit 170997 126595 0.25 21647365215 0.26305200799833983
63 | libseq 170997 126595 0.25 21647365215 43.96894769597566
64 | allel 170997 126595 0.25 21647365215 69.50293152098311
65 | tskit 265914 131220 0.25 34893235080 0.32004916900768876
66 | libseq 265914 131220 0.25 34893235080 78.91534719598712
67 | allel 265914 131220 0.25 34893235080 120.64627647103043
68 | tskit 413518 134931 0.25 55796397258 0.39681092998944223
69 | libseq 413518 134931 0.25 55796397258 140.25325659103692
70 | allel 413518 134931 0.25 55796397258 194.81629185698694
71 | tskit 643054 139566 0.25 89748474564 0.5139753300463781
72 | libseq 643054 139566 0.25 89748474564 227.20759825699497
73 | allel 643054 139566 0.25 89748474564 316.33334937802283
74 | tskit 1000000 144293 0.25 144293000000 0.6793207000009716
75 | libseq 1000000 144293 0.25 144293000000 368.2131700209575
76 | allel 1000000 144293 0.25 144293000000 510.472775303002
77 |
--------------------------------------------------------------------------------
/data/computing_statistics.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/computing_statistics.trees
--------------------------------------------------------------------------------
/data/construction_example.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/construction_example.trees
--------------------------------------------------------------------------------
/data/different_time_samples.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/different_time_samples.trees
--------------------------------------------------------------------------------
/data/download.py:
--------------------------------------------------------------------------------
1 | """
2 | Simple script to download all .trees files within the `data` directory on GitHub,
3 | saving to a local `data` directory
4 | """
5 |
6 | import os
7 | import json
8 | from urllib.request import urlretrieve, urlopen
9 |
# Make a "data" directory within the current folder (a no-op if it already exists)
os.makedirs("data", exist_ok=True)
print(f"Downloading data files into {os.path.join(os.getcwd(), 'data')}")

base_url = "https://tskit.dev/tutorials/examples/"
# files.txt is read one line at a time, each line being treated as a file name;
# only names ending in ".trees" are fetched. The `with` block closes the HTTP
# response even if one of the downloads raises.
with urlopen(base_url + "files.txt") as response:
    # If the server declares no charset, get_content_charset() returns None and
    # bytes.decode(None) raises a TypeError, so fall back to UTF-8.
    charset = response.headers.get_content_charset() or "utf-8"
    for line in response:
        fn = line.decode(charset).strip()
        if fn.endswith(".trees"):
            # Save the data file to the data directory
            urlretrieve(base_url + fn, os.path.join("data", fn))
            # Flush so each progress dot appears as soon as its file is saved
            print(".", end="", flush=True)
print(" finished downloading")
--------------------------------------------------------------------------------
/data/metadata.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/metadata.trees
--------------------------------------------------------------------------------
/data/parsimony_map.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_map.pickle
--------------------------------------------------------------------------------
/data/parsimony_map.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_map.trees
--------------------------------------------------------------------------------
/data/parsimony_simple.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_simple.trees
--------------------------------------------------------------------------------
/data/storing_everyone.csv:
--------------------------------------------------------------------------------
1 | ,compressed,pbwt,pbwtz,sample_size,tsk_fit,tskz_fit,uncompressed,vcf,vcf_fit,vcfz,vcfz_fit
2 | 0,0.0016675414517521858,0.0018388191238045692,0.000494055449962616,10,0.01328652372466946,0.0014398649106981897,0.013348821550607681,0.005093865096569061,0.0038755249813772976,0.0006428053602576256,0.00024484777501409204
3 | 1,0.0042699361220002174,0.0037771547213196754,0.0013225525617599487,100,0.028833975141573032,0.004427909731197507,0.02880258485674858,0.044218966737389565,0.047833623496165135,0.002773575484752655,0.0024107513243057884
4 | 2,0.007236692123115063,0.006424359045922756,0.0028374912217259407,1000,0.044466600006542005,0.007421182366127227,0.04446660354733467,0.5665737707167864,0.5903859600357276,0.023386516608297825,0.023736061915643803
5 | 3,0.01037746760994196,0.010164554230868816,0.005389832891523838,10000,0.06095095935216508,0.010466733145360988,0.060782525688409805,7.290761827491224,7.286832071905484,0.2337631145492196,0.23370333952860764
6 | 4,0.014260578900575638,0.015963544137775898,0.009662624448537827,100000,0.085952663504329,0.014035065367635116,0.08610359206795692,89.93748273793608,89.93764289539862,2.3010212713852525,2.3010241168450483
7 | 5,0.022809751331806183,0.040422539226710796,0.021889440715312958,1000000,0.19612781572190113,0.022831212020312953,0.1961144097149372,1068.2399909570813,1110.0543459436492,21.772690244950354,22.655696734938648
8 | 6,0.07962783984839916,0.259388854727149,0.0981400953605771,10000000,1.1580374485935565,0.08390550297702788,1.15145118907094,0.0,13700.833280470879,0.0,223.06615162698623
9 | 7,0.0,0.0,0.0,100000000,10.637291888006041,0.6677612369741138,0.0,0.0,169102.3806764029,0.0,2196.291227933775
10 | 8,0.0,0.0,0.0,1000000000,105.28999439282681,6.479431401374908,0.0,0.0,2087144.2316714546,0.0,21624.50520940123
11 | 9,0.0,0.0,0.0,10000000000,1051.6771775517304,64.56924586981279,0.0,0.0,25760554.20612597,0.0,212913.12354388786
12 |
--------------------------------------------------------------------------------
/data/tables_example.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tables_example.trees
--------------------------------------------------------------------------------
/data/tables_example_muts.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tables_example_muts.trees
--------------------------------------------------------------------------------
/data/topologies_sim_speciestree.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/topologies_sim_speciestree.trees
--------------------------------------------------------------------------------
/data/topologies_sim_stdpopsim.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/topologies_sim_stdpopsim.trees
--------------------------------------------------------------------------------
/data/tree_traversals.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tree_traversals.trees
--------------------------------------------------------------------------------
/data/unified_genealogy_2q_108Mb-110Mb.tsz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/unified_genealogy_2q_108Mb-110Mb.tsz
--------------------------------------------------------------------------------
/data/viz_ts_full.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_full.trees
--------------------------------------------------------------------------------
/data/viz_ts_selection.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_selection.trees
--------------------------------------------------------------------------------
/data/viz_ts_small.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_small.trees
--------------------------------------------------------------------------------
/data/viz_ts_small_mutated.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_small_mutated.trees
--------------------------------------------------------------------------------
/data/viz_ts_tiny.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_tiny.trees
--------------------------------------------------------------------------------
/data/whatis_example.trees:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/whatis_example.trees
--------------------------------------------------------------------------------
/data/whatis_example.yml:
--------------------------------------------------------------------------------
1 | description:
2 | Asymmetric migration between two extant demes.
3 | time_units: generations
4 | defaults:
5 | epoch:
6 | start_size: 5000
7 | demes:
8 | - name: Ancestral_population
9 | epochs:
10 | - end_time: 1000
11 | - name: A
12 | ancestors: [Ancestral_population]
13 | - name: B
14 | ancestors: [Ancestral_population]
15 | epochs:
16 | - start_size: 2000
17 | end_time: 500
18 | - start_size: 400
19 | end_size: 10000
20 | migrations:
21 | - source: A
22 | dest: B
23 | rate: 1e-4
24 |
--------------------------------------------------------------------------------
/incremental_algorithms.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | (sec_incremental)=
18 | # _Incremental algorithms_
19 |
20 | Much of the [efficiency](sec_what_is_analysis)
21 | of the tskit approach comes from the use of incremental algorithms.
22 | By considering only the difference between adjacent trees,
23 | incremental algorithms avoid having to perform the same
24 | calculation multiple times on different trees.
25 |
26 | This tutorial will explain the philosophy behind incremental algorithms,
27 | and provide examples of how to create your own (e.g. using the
28 | {meth}`TreeSequence.edge_diffs` method).
29 |
30 | :::{todo}
31 | Create content. See [issue 233](https://github.com/tskit-dev/tutorials/issues/233)
32 | :::
--------------------------------------------------------------------------------
/intro.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | (sec_intro)=
15 |
16 | # Welcome!
17 |
18 | This site contains a number of tutorials to develop your understanding of
19 | genetic genealogies, ancestral recombination graphs, and the
20 | [succinct tree sequence](https://tskit.dev/learn/) storage format,
21 | as implemented in [`tskit`: the tree sequence toolkit](https://tskit.dev/tskit/docs/).
22 | Also included are a number of tutorials showing advanced use of
23 | [software programs](https://tskit.dev/software/),
24 | such as [`msprime`](https://tskit.dev/msprime/docs), that form part of the
25 | [`tskit` ecosystem](https://tskit.dev).
26 |
27 | ```{code-cell} ipython3
28 | :tags: [remove-input]
29 | import math
30 | import msprime
31 |
def make_7_tree_4_tip_ts():
    """
    Simulate the small tree sequence that is drawn at the top of the page.

    Ancestry is simulated for 4 haploid samples over a sequence of length 1000,
    and mutations are then added on top. Both random seeds are fixed, and the
    result is asserted to have exactly 7 trees, more than one distinct topology,
    and samples that appear in the order 0, 1, 2, 3 in every tree.
    """
    ancestry = msprime.sim_ancestry(
        4, ploidy=1, random_seed=889, sequence_length=1000, recombination_rate=0.001)
    mutated = msprime.sim_mutations(ancestry, rate=2e-3, random_seed=123)

    # For every tree, record the order in which its samples are visited by a
    # "minlex_postorder" traversal, together with the rank of its topology
    tip_orders = set()
    topologies = set()
    for tree in mutated.trees():
        visited_samples = []
        for u in tree.nodes(order="minlex_postorder"):
            if tree.is_sample(u):
                visited_samples.append(u)
        tip_orders.add(tuple(visited_samples))
        topologies.add(tree.rank())

    # Check we have picked random seeds that give a nice plot of 7 trees
    assert (
        tip_orders == {(0, 1, 2, 3)}
        and len(topologies) > 1
        and mutated.num_trees == 7
    )

    return mutated
46 |
47 |
ts = make_7_tree_4_tip_ts()
num_trees = ts.num_trees

# Layout parameters: these can be adjusted to your liking
tree_width = 80
height = 200  # Normal height for tree + x-axis
y_step = 20  # Stagger between trees (i.e. 0 for all trees in a horizontal line)
skew = 0.7  # How skewed the trees are, in radians

width = tree_width * num_trees + 20 + 20  # L & R margins in draw_svg = 20px
angle = math.atan(y_step/tree_width)
# (x, y) translation, in px, applied to the x-axis
ax_mv = y_step, (num_trees - 1) * y_step - 90 + math.tan(skew) * (tree_width * .9)

# CSS transforms: the first rule shifts and skews the axis, then one rule per tree
# staggers it vertically by a multiple of y_step and skews its "plotbox" container
axis_dx, axis_dy = ax_mv
css_rules = [
    f".x-axis {{transform: translate({axis_dx}px, {axis_dy}px) skewY(-{angle}rad)}}"
]
for i in range(num_trees):
    y_shift = (num_trees - i - 1) * y_step-85
    css_rules.append(
        f".tree.t{i} > .plotbox {{transform:translateY({y_shift}px) skewY({skew}rad)}}"
    )
style = "".join(css_rules)

# Use a canvas bigger than the drawing so the moved trees are not cropped
size = (width, height)
canvas_size = (width + y_step, height + math.tan(skew)*tree_width)

ts.draw_svg(size=size, x_scale="treewise", style=style, canvas_size=canvas_size)
74 | ```
75 |
76 | If you are new to the world of tree sequences, we suggest you start with the
77 | first tutorial: {ref}`sec_what_is`
78 |
79 | :::{note}
80 | Tutorials are under constant development. Those that are still a work in progress and
81 | not yet ready for use are shown in _italics_ in the list of tutorials.
82 |
83 | We very much welcome help developing existing tutorials or writing new ones. Please open
84 | or contribute to a [GitHub issue](https://github.com/tskit-dev/tutorials/issues) if you
85 | would like to help out.
86 | :::
87 |
88 | ## Other sources of help
89 |
90 | In addition to these tutorials, our [Learn page](https://tskit.dev/learn/) lists
91 | selected videos and publications to help you learn about tree sequences.
92 |
93 | We aim to be a friendly, welcoming open source community.
94 | Questions and discussion about using {program}`tskit`, the tree sequence toolkit,
95 | should be directed to the
96 | [GitHub discussion forum](https://github.com/tskit-dev/tskit/discussions), and there are
97 | similar forums for other software in the tree sequence [development community](https://github.com/tskit-dev),
98 | such as for [msprime](https://github.com/tskit-dev/msprime/discussions) and
99 | [tsinfer](https://github.com/tskit-dev/tsinfer/discussions).
100 |
101 |
102 | (sec_intro_running)=
103 |
104 | ## Running tutorial code
105 |
106 | It is possible to run the tutorial code on your own computer, if you wish.
107 | This will allow you to experiment with the examples provided.
108 | The recommended way to do this is from within a
109 | [Jupyter notebook](https://jupyter.org). As well as installing Jupyter, you will also
110 | need to install the various Python libraries, most importantly
111 | ``tskit``, ``msprime``, ``numpy``, and ``matplotlib``. These and other packages are
112 | listed in the [requirements.txt](https://tskit.dev/tutorials/requirements.txt)
113 | file; a shortcut to installing the necessary software is therefore:
114 |
115 | ```
116 | python3 -m pip install -r https://tskit.dev/tutorials/requirements.txt
117 | ```
118 |
119 | In addition, to run the {ref}`R tutorial` you will need to install the R
120 | [reticulate](https://rstudio.github.io/reticulate/) library, and if running it in a Jupyter
121 | notebook, the [IRkernel](https://irkernel.github.io) library. This can be done by
122 | running the following command within R:
123 |
124 | ```
125 | install.packages(c("reticulate", "IRkernel")); IRkernel::installspec()
126 | ```
127 |
128 | (sec_intro_downloading_datafiles)=
129 |
130 | ### Downloading tutorial datafiles
131 |
132 | Many of the tutorials use pre-existing tree sequences stored in the
133 | [``data``](https://github.com/tskit-dev/tutorials/tree/main/data) directory.
134 | These can be downloaded individually from that link, or you can
135 | download them all at once by running the script stored in
136 | [https://tskit.dev/tutorials/examples/download.py](https://tskit.dev/tutorials/examples/download.py).
137 | If you are running the code in the tutorials from within a Jupyter notebook
138 | then you can simply load this code into a new cell by using the
139 | [%load cell magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-load).
140 | Just run the following in a Jupyter code cell:
141 |
142 | ```
143 | %load https://tskit.dev/tutorials/examples/download.py
144 | ```
145 |
146 | Running the resulting Python code should download the data files, then print out
147 | ``finished downloading`` when all files are downloaded. You should then be able
148 | to successfully run code such as the following:
149 |
150 | ```{code-cell} ipython3
151 | import tskit
152 | ts = tskit.load("data/basics.trees")
153 | print(f"The file 'data/basics.trees' exists, and contains {ts.num_trees} trees")
154 | ```
155 |
--------------------------------------------------------------------------------
/metadata.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 | ```{currentmodule} tskit
14 | ```
15 |
16 |
17 | (sec_tutorial_metadata)=
18 |
19 | # Working with Metadata
20 |
21 | Metadata is information associated with entities that {program}`tskit` doesn't use or
22 | interpret, but which is useful to pass on to downstream analysis, such as sample ids,
23 | dates etc. (see {ref}`sec_metadata` for a full discussion). Each
24 | {ref}`table` has a {class}`MetadataSchema` which details the
25 | contents and encoding of the metadata for each row. A metadata schema is a JSON document
26 | that conforms to [JSON Schema](https://json-schema.org/understanding-json-schema/)
27 | (The full schema for tskit is at {ref}`sec_metadata_schema_schema`). Here we use an
28 | {ref}`example tree sequence<sec_intro_downloading_datafiles>`
29 | which contains some demonstration metadata:
30 |
31 | ```{code-cell} ipython3
32 | :tags: [remove-cell]
33 | import msprime
34 | import tskit
35 |
def metadata():
    """
    Create the demonstration tree sequence saved as ``data/metadata.trees``.

    Runs ``msprime.sim_ancestry(4)``, gives the individuals table a JSON metadata
    schema (a required ``accession`` string and ``pcr`` boolean, with no other
    properties allowed), re-adds the individual rows with example metadata,
    and dumps the resulting tree sequence to disk.
    """
    tables = msprime.sim_ancestry(4).dump_tables()
    individuals = tables.individuals

    schema_dict = {
        'additionalProperties': False,
        'codec': 'json',
        'properties': {
            'accession': {
                'description': 'ENA accession number',
                'type': 'string',
            },
            'pcr': {
                'description': 'Was PCR used on this sample',
                'name': 'PCR Used',
                'type': 'boolean',
            },
        },
        'required': ['accession', 'pcr'],
        'type': 'object',
    }
    individuals.metadata_schema = tskit.MetadataSchema(schema_dict)

    example_metadata = [
        {'accession': 'ERS0001', 'pcr': True},
        {'accession': 'ERS0002', 'pcr': True},
        {'accession': 'ERS0003', 'pcr': True},
        {'accession': 'ERS0004', 'pcr': False},
    ]

    # Snapshot the existing rows, empty the table, then append each row again
    # with its metadata replaced by the matching example entry
    original_rows = individuals.copy()
    individuals.clear()
    for row, row_metadata in zip(original_rows, example_metadata):
        individuals.append(row.replace(metadata=row_metadata))

    tables.tree_sequence().dump("data/metadata.trees")
62 |
def create_notebook_data():
    """Regenerate every data file used by this notebook (currently just the metadata example)."""
    metadata()
65 |
66 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
67 | ```
68 |
69 |
70 | ```{code-cell} ipython3
71 | import tskit
72 | import json
73 |
74 | ts = tskit.load("data/metadata.trees")
75 | ```
76 | (sec_tutorial_metadata_reading)=
77 |
78 | ## Reading metadata and schemas
79 |
80 | Metadata is automatically decoded using the schema when accessed via a
81 | {class}`TreeSequence` or {class}`TableCollection` Python API. For example:
82 |
83 | ```{code-cell} ipython3
84 | print("Metadata for individual 0:", ts.individual(0).metadata) # Tree sequence access
85 | print("Metadata for individual 0:", ts.tables.individuals[0].metadata) # Table access
86 | ```
87 |
88 | Viewing the {class}`MetadataSchema` for a table can help with understanding
89 | its metadata, as it can contain descriptions and constraints:
90 |
91 | ```{code-cell} ipython3
92 | ts.table_metadata_schemas.individual
93 | ```
94 |
95 | The same schema can be accessed via a {attr}`~IndividualTable.metadata_schema` attribute
96 | on each table (pretty-printed here using ``json.dumps``):
97 |
98 | ```{code-cell} ipython3
99 | schema = ts.tables.individuals.metadata_schema
100 | print(json.dumps(schema.asdict(), indent=4)) # Print with indentations
101 | ```
102 |
103 | The top-level metadata and schemas for the entire tree sequence are similarly
104 | accessed with {attr}`TreeSequence.metadata` and {attr}`TreeSequence.metadata_schema`.
105 |
106 | :::{note}
107 | If there is no schema (i.e. it is equal to ``MetadataSchema(None)``) for a table
108 | or top-level metadata, then no decoding is performed and ``bytes`` will be returned.
109 | :::
110 |
111 | (sec_tutorial_metadata_modifying)=
112 |
113 | ## Modifying metadata and schemas
114 |
115 | If you are creating or modifying a tree sequence by changing the underlying tables,
116 | you may want to record or add to the metadata. If the change fits into the same schema,
117 | this is relatively simple: you can follow the
118 | {ref}`description of minor table edits` in the
119 | {ref}`sec_tables` tutorial. However if it requires a change to the schema, this must be
120 | done first, as it is then used to validate and encode the metadata.
121 |
122 | Schemas in tskit are held in a {class}`MetadataSchema`.
123 | A Python dict representation of the schema is passed to its constructor, which
124 | will validate the schema. Here are a few examples: the first one allows arbitrary fields
125 | to be added, the second one (which will construct the schema we printed above) does not:
126 |
127 | ```{code-cell} ipython3
128 | basic_schema = tskit.MetadataSchema({'codec': 'json'})
129 |
130 | complex_schema = tskit.MetadataSchema({
131 | 'codec': 'json',
132 | 'additionalProperties': False,
133 | 'properties': {'accession': {'description': 'ENA accession number',
134 | 'type': 'string'},
135 | 'pcr': {'description': 'Was PCR used on this sample',
136 | 'name': 'PCR Used',
137 | 'type': 'boolean'}},
138 | 'required': ['accession', 'pcr'],
139 | 'type': 'object',
140 | })
141 | ```
142 |
143 | This {class}`MetadataSchema` can then be assigned to a table or the top-level
144 | tree sequence e.g. {attr}`~IndividualTable.metadata_schema`:
145 |
146 | ```{code-cell} ipython3
147 | tables = tskit.TableCollection(sequence_length=1) # make a new, empty set of tables
148 | tables.individuals.metadata_schema = complex_schema
149 | ```
150 |
151 | This will overwrite any existing schema. Note that this will not validate any existing
152 | metadata against the new schema. Now that the table has a schema, calls to
153 | {meth}`~IndividualTable.add_row` will validate and encode the metadata:
154 |
155 | ```{code-cell} ipython3
156 | row_id = tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": True})
157 | print(f"Row {row_id} added to the individuals table")
158 | ```
159 |
160 | If we try to add metadata that doesn't fit the schema, such as accidentally using a
161 | string instead of a proper Python boolean, we'll get an error:
162 |
163 | ```{code-cell} ipython3
164 | :tags: [raises-exception, output_scroll]
165 | tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": "false"})
166 | ```
167 |
168 | and because we set ``additionalProperties`` to ``False`` in the schema, an error is
169 | also raised if we attempt to add new fields:
170 |
171 | ```{code-cell} ipython3
172 | :tags: [raises-exception, output_scroll]
173 | tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": True, "newKey": 25})
174 | ```
175 |
176 |
177 | To set the top-level metadata, just assign it. Validation and encoding happen as
178 | specified by the top-level metadata schema:
179 |
180 | ```{code-cell} ipython3
181 | tables.metadata_schema = basic_schema # Allows new fields to be added that are not validated
182 | tables.metadata = {"mean_coverage": 200.5}
183 | print(tables.metadata)
184 | ```
185 |
186 | :::{note}
187 | *Provenance* information, detailing the origin of the data, modification timestamps,
188 | and (ideally) how the tree sequence can be reconstructed, should go in
189 | {ref}`sec_provenance`, not metadata.
190 | :::
191 |
192 | To modify a schema --- for example to add a key --- first get the dict representation,
193 | modify, then write back:
194 |
195 | ```{code-cell} ipython3
196 | schema_dict = tables.individuals.metadata_schema.schema
197 | schema_dict["properties"]["newKey"] = {"type": "integer"}
198 | tables.individuals.metadata_schema = tskit.MetadataSchema(schema_dict)
199 | # Now this will work:
200 | new_id = tables.individuals.add_row(metadata={'accession': 'abc123', 'pcr': False, 'newKey': 25})
201 | print(tables.individuals[new_id].metadata)
202 | ```
203 |
204 | To modify the metadata of rows in tables use the {ref}`sec_tutorial_metadata_bulk`.
205 |
206 | (sec_tutorial_metadata_viewing_raw)=
207 |
208 | ## Viewing raw metadata
209 |
210 | If you need to see the raw (i.e. bytes) metadata, you just need to remove the
211 | schema, for instance:
212 |
213 | ```{code-cell} ipython3
214 | individual_table = tables.individuals.copy() # work on a copy, leaving tables.individuals unchanged
215 |
216 | print("Metadata:\n", individual_table[0].metadata)
217 |
218 | individual_table.metadata_schema = tskit.MetadataSchema(None)
219 | print("\nRaw metadata:\n", individual_table[0].metadata)
220 | ```
221 |
222 | (sec_tutorial_metadata_bulk)=
223 |
224 | ## Metadata for bulk table methods
225 |
226 | In the interests of efficiency each table's {meth}`~NodeTable.packset_metadata` method,
227 | as well as the more general {meth}`~NodeTable.set_columns` and
228 | {meth}`~NodeTable.append_columns` methods, do not attempt to validate or encode metadata.
229 | You can call {meth}`MetadataSchema.validate_and_encode_row` directly to prepare metadata
230 | for these methods:
231 |
232 | ```{code-cell} ipython3
233 | metadata_column = [
234 | {"accession": "etho1234", "pcr": True},
235 | {"accession": "richard1235", "pcr": False},
236 | {"accession": "albert1236", "pcr": True},
237 | ]
238 | encoded_metadata_column = [
239 | tables.individuals.metadata_schema.validate_and_encode_row(r) for r in metadata_column
240 | ]
241 | md, md_offset = tskit.pack_bytes(encoded_metadata_column)
242 | tables.individuals.set_columns(flags=[0, 0, 0], metadata=md, metadata_offset=md_offset)
243 | tables.individuals
244 | ```
245 |
246 | Or, if not all columns need to be set:
247 |
248 | ```{code-cell} ipython3
249 | tables.individuals.packset_metadata(
250 | [tables.individuals.metadata_schema.validate_and_encode_row(r) for r in metadata_column]
251 | )
252 | ```
253 |
254 | (sec_tutorial_metadata_binary)=
255 |
256 | ## Binary metadata
257 |
258 | To disable the validation and encoding of metadata and store raw bytes, pass ``None`` to
259 | {class}`MetadataSchema`:
260 |
261 | ```{code-cell} ipython3
262 | tables.populations.metadata_schema = tskit.MetadataSchema(None)
263 | tables.populations.add_row(metadata=b"SOME CUSTOM BYTES #!@")
264 | print(tables.populations[0].metadata)
265 | ```
266 |
--------------------------------------------------------------------------------
/more_forward_sims.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | (sec_tskit_more_forward_simulations)=
18 |
19 | # _Advanced forward simulations_
20 |
21 | % remove underscores in title when tutorial is complete or near-complete
22 |
23 | :::{todo}
24 | Add further details on building a forward simulator
25 | (see issue [#14](https://github.com/tskit-dev/tutorials/issues/14))
26 | :::
27 |
28 | In the {ref}`previous tutorial<sec_tskit_forward_simulations>`, we developed a
29 | basic forward-time Wright-Fisher (WF) simulator (refer back to that tutorial for a
30 | detailed run through of the hidden code):
31 |
32 | ```{code-cell}
33 | :tags: ["hide-cell"]
34 | import tskit
35 | import numpy as np
36 |
37 | random_seed = 6
38 | random = np.random.default_rng(random_seed) # A random number generator for general use
39 |
40 | L = 50_000 # The sequence length: 50 Kb
41 |
def add_inheritance_paths(tables, parent_genomes, child_genome, recombination_rate):
    """
    Add paths from parent genomes to the child genome, with crossover recombination.

    The number of recombination breaks is Poisson distributed with mean
    ``recombination_rate * L``, where ``L`` is ``tables.sequence_length``. Break
    positions are drawn uniformly from the interior integer positions
    ``1, ..., L - 1``; a position hit an even number of times does not produce a
    crossover. One edge is added to ``tables.edges`` per resulting segment,
    alternating between the two entries of ``parent_genomes``, starting from a
    randomly chosen one.

    Uses the module-level random number generator ``random``.
    """
    L = tables.sequence_length
    num_recombinations = random.poisson(recombination_rate * L)
    # ``high`` is exclusive, so this draws from 1..L-1. Position 0 must be excluded:
    # a break there would yield a zero-length edge [0, 0), and tskit requires
    # left < right for every edge.
    breakpoints = random.integers(1, L, size=num_recombinations)
    break_pos, counts = np.unique(breakpoints, return_counts=True)
    crossovers = break_pos[counts % 2 == 1]  # no crossover if e.g. 2 breaks at same pos
    left_positions = np.insert(crossovers, 0, 0)
    right_positions = np.append(crossovers, L)

    inherit_from = random.integers(2)
    for left, right in zip(left_positions, right_positions):
        tables.edges.add_row(
            left, right, parent_genomes[inherit_from], child_genome)
        inherit_from = 1 - inherit_from  # switch to other parent genome
57 |
def make_diploid(tables, time, parent_individuals=None):
    """
    Add one individual and its two genomes (nodes) to the tables.

    A row is appended to ``tables.individuals`` (recording ``parent_individuals``
    as its parents), followed by two rows in ``tables.nodes`` that share the given
    ``time`` and point back at the new individual.

    Returns a pair ``(individual_id, (first_node_id, second_node_id))``.
    """
    new_individual = tables.individuals.add_row(parents=parent_individuals)
    first_genome = tables.nodes.add_row(time=time, individual=new_individual)
    second_genome = tables.nodes.add_row(time=time, individual=new_individual)
    return new_individual, (first_genome, second_genome)
64 |
def new_population(tables, time, prev_pop, recombination_rate):
    """
    Create the next generation, the same size as ``prev_pop``.

    ``prev_pop`` maps individual IDs to their pair of genome (node) IDs. For each
    child, two parent individuals are drawn (with replacement) from ``prev_pop``
    using the module-level generator ``random``; the child is registered through
    ``make_diploid`` and each of its two genomes receives inheritance paths from
    the genomes of one of the chosen parents.

    Returns a dict mapping each new individual ID to its pair of genome IDs.
    """
    candidate_parents = np.array(list(prev_pop.keys()), dtype=np.int32)
    next_pop = {}
    for _ in range(len(prev_pop)):
        chosen_parents = random.choice(candidate_parents, 2, replace=True)
        child_id, child_genomes = make_diploid(tables, time, chosen_parents)
        next_pop[child_id] = child_genomes  # store the genome IDs
        for genome, parent in zip(child_genomes, chosen_parents):
            add_inheritance_paths(tables, prev_pop[parent], genome, recombination_rate)
    return next_pop
76 |
def initialise_population(tables, time, size) -> dict:
    """
    Create ``size`` parentless diploids at the given ``time``.

    Returns a dict mapping each new individual ID to its pair of genome (node)
    IDs, as produced by ``make_diploid``.
    """
    founders = {}
    for _ in range(size):
        individual_id, genome_ids = make_diploid(tables, time)
        founders[individual_id] = genome_ids
    return founders
79 |
80 | ```
81 |
82 | The main simulation function, as below, returned an unsimplified tree sequence,
83 | which we subsequently {meth}`simplified<TreeSequence.simplify>`:
84 |
85 | ```{code-cell} ipython3
86 |
def forward_WF(num_diploids, seq_len, generations, recombination_rate=0, random_seed=7):
    """
    Run a forward-time Wright-Fisher simulation and return the unsimplified result.

    The module-level generator ``random`` is re-seeded with ``random_seed``. A
    founding population of ``num_diploids`` individuals is created at time
    ``generations``, then one new population is produced per generation, counting
    the time down to 0. The tables are sorted before being converted into a tree
    sequence.
    """
    global random
    random = np.random.default_rng(random_seed)
    tables = tskit.TableCollection(seq_len)
    tables.time_units = "generations"

    time_ago = generations
    pop = initialise_population(tables, time_ago, num_diploids)
    while time_ago > 0:
        time_ago -= 1
        pop = new_population(tables, time_ago, pop, recombination_rate)

    tables.sort()
    return tables.tree_sequence()
100 | ```
101 |
102 | ## Repeated simplification
103 |
104 | We can perform simplification directly on the tables within the `forward_WF()` function,
105 | using {meth}`TableCollection.simplify`. More importantly, we can carry this out at
106 | repeated intervals. It is helpful to think of this as regular "garbage collection",
107 | as what we're really doing is getting rid of extinct lineages while also "trimming"
108 | extant lineages down to a minimal representation.
109 |
110 | :::{caution}
111 | Regular garbage collection forces us to reckon with the fact that simplification
112 | {ref}`changes the node IDs `.
113 | We therefore need to remap any node (and individual) IDs that are used outside of
114 | `tskit`. In the implementation described here, those IDs are stored in the `pop`
115 | variable.
116 | :::
117 |
118 | ```{code-cell}
def simplify_tables(tables, samples, pop) -> dict[int, tuple[int, int]]:
    """
    Sort and simplify ``tables`` in place with respect to ``samples``, then
    translate ``pop`` to the post-simplification IDs.

    ``pop`` maps individual IDs to pairs of node IDs. Each pair is passed through
    the node map returned by ``tables.simplify`` (called with
    ``keep_input_roots=True``), and the new individual ID is looked up from the
    simplified node table.

    Returns a new population dict keyed by the remapped individual IDs.
    """
    tables.sort()
    old_to_new = tables.simplify(samples, keep_input_roots=True)

    individual_of = tables.nodes.individual  # read after simplify: IDs have changed
    new_pop = {}
    for old_first, old_second in pop.values():
        first, second = old_to_new[[old_first, old_second]]  # remap
        # sanity check: both genomes must still belong to one individual
        assert individual_of[first] == individual_of[second]
        new_pop[individual_of[first]] = (first, second)
    return new_pop
135 |
136 |
def forward_WF(
    num_diploids,
    seq_len,
    generations,
    recombination_rate=0,
    simplification_interval=None,  # default to simplifying only at end
    show=None,
    random_seed=7,
):
    """
    Run a forward-time Wright-Fisher simulation with periodic simplification.

    The module-level generator ``random`` is re-seeded with ``random_seed``. A
    founding population of ``num_diploids`` individuals is created at time
    ``generations`` and one new population is produced per generation, counting
    the time down to 0. Every ``simplification_interval`` generations the tables
    are simplified with respect to the genomes of the current population, and the
    population dict is remapped to the new IDs. A final simplification is always
    performed before the tree sequence is returned.

    If ``simplification_interval`` is None, simplification happens only at the end.
    If ``show`` is truthy, a message is printed at each simplification.

    Raises ``ValueError`` if ``simplification_interval`` is not positive.
    """
    global random
    random = np.random.default_rng(random_seed)
    tables = tskit.TableCollection(seq_len)
    tables.time_units = "generations"  # optional, but helpful when plotting
    if simplification_interval is None:
        # max(..., 1) keeps the modulo below well defined when generations == 0
        simplification_interval = max(generations, 1)
    if simplification_interval <= 0:
        raise ValueError("simplification_interval must be a positive number of generations")
    simplify_mod = generations % simplification_interval

    pop = initialise_population(tables, generations, num_diploids)
    while generations > 0:
        generations = generations - 1
        pop = new_population(tables, generations, pop, recombination_rate)
        # generation 0 is skipped here because it is handled by the final simplification
        if generations > 0 and generations % simplification_interval == simplify_mod:
            current_nodes = [u for nodes in pop.values() for u in nodes]
            pop = simplify_tables(tables, current_nodes, pop)
            if show:
                print("Simplified", generations, "generations before end")

    pop = simplify_tables(tables, [u for nodes in pop.values() for u in nodes], pop)
    if show:
        print("Final simplification")
    return tables.tree_sequence()
168 |
169 | ts = forward_WF(6, L, generations=100, simplification_interval=25, show=True)
170 | ts.draw_svg(size=(800, 200))
171 | ```
172 |
173 | ### Invariance to simplification interval
174 | A critical concept to keep in mind is that the simulation itself is the only random component.
175 | The simplification algorithm is deterministic given a set of (nodes, edges) satisfying
176 | `tskit`'s sorting requirements. Therefore, the results of our new `forward_WF()` function
177 | must be the same for all simplification intervals.
178 |
179 | :::{note}
180 | This invariance property only holds in some cases.
181 | We discuss this in more detail below when we add in mutation.
182 | :::
183 |
184 | ```{code-cell}
185 | ts = forward_WF(10, L, 500, simplification_interval=1, random_seed=42)
186 |
187 | # Iterate over a range of odd and even simplification intervals.
188 | print("Testing invariance to simplification interval")
189 | test_intervals = list(range(2, 500, 33))
190 | for i in test_intervals:
191 | # Make sure each new sim starts with same random seed!
192 | ts_test = forward_WF(10, L, 500, simplification_interval=i, show=False, random_seed=42)
193 | assert ts.equals(ts_test, ignore_provenance=True)
194 | print(f"Intervals {test_intervals} passed")
195 | ```
196 |
197 | :::{tip}
198 | Testing your own code using loops like the one above is a very
199 | good way to identify subtle bugs in book-keeping.
200 | :::
201 |
202 | ### Summary
203 |
204 | * Simplifying during a simulation changes IDs in the tree sequence tables, so we need to remap
205 | entities that store any of these IDs between generations.
206 | * Our code to carry out simplification gets called both during the simulation and at the end.
207 | It's therefore worth encapsulating it into a class or function for easier code re-use and testing.
208 |
209 | #### Technical notes
210 |
211 | We have found that it is possible to write a simulation where the results differ
212 | by simplification interval, but appear correct in distribution.
213 | By this we mean that looking at distributions of numbers of mutations, their frequencies, etc.,
214 | match predictions from analytical theory. However, our experience is that such simulations
215 | contain bugs and that the summaries being used for testing are too crude to catch them.
216 | For example, they may affect the variance in a subtle way that would require millions
217 | of simulations to catch. Often what is going on is that parent/offspring relationships
218 | are not being properly recorded, resulting in lineages that either persist too long or
219 | not long enough. (In other words, the variance in offspring number per diploid is no
220 | longer what it should be, meaning you've changed the effective population size.)
221 | Thus, please make sure you get the **same** `tskit` tables out of a simulation for
222 | any simplification interval.
223 |
224 |
225 | ## Mutations
226 |
227 | In this section, we will add mutation to our simulation. Mutations will occur according to the
228 | infinitely-many sites model, which means that a new mutation cannot arise at a currently-mutated
229 | position. $\theta = 4N\mu$ is the scaled mutation rate, and is equal to twice the expected number
230 | of new mutations per generation. The parameter $\mu$ is the expected number of new mutations
231 | per gamete, per generation. Mutation positions will be uniformly distributed along the genome.
232 |
233 | Adding mutations changes the complexity of the simulation quite a bit, because now we must
234 | add to and simplify [site tables](sec_site_table_definition) and
235 | [mutation tables](sec_mutation_table_definition). We might also
236 | want to add *metadata* to the sites or mutations, recording details such as
237 | the selection coefficient of a mutation, or the type of mutation (e.g., synonymous vs. non-synonymous).
238 |
239 | We will write a mutation function here which we will re-use in future examples.
240 |
241 | :::{note}
242 | We will be treating mutations as neutral. Doing so is odd, as one big
243 | selling point of `tskit` is the ability to skip the tracking of neutral mutations
244 | in forward simulations. However, tracking neutral mutations plus metadata is the
245 | same as tracking selected mutations and their metadata, and being able to do neat
246 | things like put your selected mutations onto a figure of the genealogy
247 | is one of several possible use cases.
248 | :::
249 |
250 | :::{todo}
251 | The rest of this tutorial is still under construction, and needs porting from
252 | [this workbook](https://github.com/tskit-dev/tutorials/blob/main/old-content/notebooks/wfforward.ipynb).
253 | This will primarily deal with sites and mutations (and mutational metadata).
254 | We could also include details on selection, if that seems sensible.
255 |
256 | The section in that workbook on "Starting with a prior history" should be put in
257 | the {ref}`sec_completing_forwards_simulations` tutorial.
258 | :::
--------------------------------------------------------------------------------
/no_mutations.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | (sec_tskit_no_mutations)=
18 |
19 | # Do you really need mutations?
20 |
21 | In tree sequences, the genetic genealogy exists independently of the mutations that
22 | generate genetic variation, and often we are primarily interested in genetic variation
23 | because of what it can tell us about those genealogies. This tutorial aims to illustrate
24 | when we can leave mutations and genetic variation aside and study the genealogies directly.
25 |
26 | In simulations we know the true genealogies, and so it can be very helpful to work
27 | with these directly.
28 | In real data, we might infer the trees and then work with the resulting genealogies.
29 | (Of course, mutations add additional noise, and would be necessary
30 | to produce data directly comparable to sequencing data.)
31 | If you're wondering whether you need to add mutations at all,
32 | it's worth considering the following points:
33 |
34 | 1. Neutral mutations and sites can always be added to a genealogy later
35 | 2. Simulating sites and mutations increases memory requirements and tree
36 | sequence file size somewhat, as well as adding to CPU time (although usually this is
37 | inconsequential)
38 | 3. Quantities of interest can often be inferred equally well (or better!) on tree sequences
39 | that have no sites or mutations.
40 |
41 | To illustrate the first two points, we can use the [msprime](https://tskit.dev/msprime)
42 | {func}`~msprime.sim_mutations` function to add neutral sites and mutations onto a
43 | simulated mutationless tree sequence of 20 diploid individuals:
44 |
45 | ```{code-cell} ipython3
import msprime
L = 1_000_000  # simulate 1 megabase length (could increase for a larger example)
rho = mu = 1e-8  # Human-like recombination and mutation parameters
n_subpops = 2
subpop_size = 1e4
migration_rate = 1e-4
# Create a mutationless diploid tree sequence of n_subpops demes
ts_no_mut = msprime.sim_ancestry(
    samples={f"pop_{i}": 10 for i in range(n_subpops)},  # 10 samples from each subpop
    demography=msprime.Demography.island_model([subpop_size] * n_subpops, migration_rate),
    ploidy=2,
    recombination_rate=rho,
    sequence_length=L,
    random_seed=123,
)

# Optionally, add neutral mutations later, after simulating. This takes some CPU time
# (although it is usually fast compared to simulating the original tree sequence)
ts_mutated = msprime.sim_mutations(ts_no_mut, rate=mu, random_seed=456)
# Report the relative *increase* (ratio minus one), not the ratio itself: a tree
# sequence that is 1.5x the original size has increased by 50%, not by 150%.
size_increase = ts_mutated.nbytes / ts_no_mut.nbytes - 1
print(
    "Adding mutations has increased the tree sequence file size by "
    f"{size_increase * 100:.0f}%",
)
69 | ```
70 |
71 | :::{note}
72 | Above we have overlaid sites and mutations onto an existing tree sequence by simulation.
73 | It is also possible to overlay mutations in such a way as to generate a known pattern of
74 | genetic variation, using {meth}`tskit.Tree.map_mutations`.
75 | :::
76 |
77 |
78 | Although including mutations has increased the file size a fair bit in this example,
79 | unless you are running simulations with vast numbers of mutations, tree sequences are
80 | usually compact enough that the cost of including them shouldn't be prohibitive.
81 | Nevertheless, it may not be obvious that it is perfectly possible, indeed sometimes
82 | preferable, to perform genetic analyses on tree sequences that do not contain mutations
83 | and variable sites. This is the focus of the remainder of this tutorial.
84 |
85 |
86 | ## Analysis in the absence of genetic variation
87 |
88 | ### Patterns of relationship
89 |
90 | Some genetic analyses are primarily focussed on patterns or degrees of relationship
91 | between genomes. In this case, the genealogy tells you all you need to know.
92 | This includes:
93 |
94 | * analyses of local ancestry, global ancestry and identity-by-descent
95 | * identification of most recent common ancestors and their descendants (including e.g.
96 | genealogical nearest neighbour analysis)
97 |
98 | Although these methods are primarily described elsewhere, the code below illustrates
99 | how a tree sequence without mutations can be used to find the average time to the
100 | most recent common ancestor (tMRCA) of two sample genomes. The genomes have been
101 | picked at random from the two different populations, and the times averaged over
102 | the entire 1Mb genome.
103 |
104 | ```{code-cell} ipython3
105 | import numpy as np
106 | np.random.seed(10)
107 | sample_a = np.random.choice(ts_no_mut.samples(population=0), size=1)[0]
108 | sample_b = np.random.choice(ts_no_mut.samples(population=1), size=1)[0]
109 | av_tMRCA = 0
110 | for tree in ts_no_mut.trees():
111 | # weight the tMRCA by the span of genome covered
112 | av_tMRCA += tree.tmrca(sample_a, sample_b) * tree.span/ts_no_mut.sequence_length
113 | print(f"Average tMRCA between sample {sample_a} (pop_0) and",
114 | f"{sample_b} (pop_1) is {av_tMRCA:.2f} {ts_no_mut.time_units}")
115 | ```
116 |
117 | ### Genetic statistics and branch-length equivalents
118 |
119 | Although many genetic analyses are based on patterns of genetic variation, for many
120 | purposes the genetic variation can be thought of as a measure of the relative length of
121 | branches on the local trees in a tree sequence. So while mutations are necessary to
122 | generate realistically variable genetic sequences, some statistical analyses do not
123 | necessarily require them to be present in a tree sequence (see
124 | [this paper](https://doi.org/10.1534/genetics.120.303253) which explains the duality
125 | between statistics based on genetic variation and their branch length equivalents).
126 | Such statistics include all those based on the allele frequency spectrum, such as genetic
127 | diversity and Tajima's D, and those based on genetic divergence between populations
128 | such as Fst and Patterson's f statistics.
129 |
130 | A simple example of a statistic that is normally calculated by looking at variable sites
131 | is the genetic divergence. This is usually defined as the
132 | proportion of the genome that differs between two sample genomes. In a tree sequence with
133 | mutations, it can be calculated using the {meth}`~TreeSequence.divergence` method:
134 |
135 | ```{code-cell} ipython3
136 | # By default, estimating average "genetic divergence" requires mutations to be present
137 | print(
138 | f"Genetic divergence between samples {sample_a} and {sample_b} is",
139 | f"{ts_mutated.divergence([[sample_a], [sample_b]]):.6f}"
140 | )
141 | ```
142 |
143 | If mutation rates are low (i.e. when the "infinite sites" model of mutation is
144 | a good approximation), each genetic difference between sequences corresponds to a
145 | mutation on the lineage connecting the two samples. Since the number of mutations is
146 | expected to be proportional to the length of the lineage, we can use the
147 | lineage length directly to measure divergence (in this case adding the branch length
148 | from the first sample to the MRCA and the branch from the MRCA to the second sample).
149 | The {ref}`general statistics framework` allows you to switch to
150 | these "branch length" measures by using the `mode="branch"` parameter:
151 |
152 | ```{code-cell} ipython3
153 | # By default, statistics are calculated based on variable sites (mode="site"), but
154 | # we can switch to the branch-length equivalent using mode="branch"
155 | sample_sets = [sample_a], [sample_b]
156 | ab_dist = ts_no_mut.divergence(sample_sets, mode="branch")
157 | print(
158 | f"Av. genealogical distance between samples {sample_a} and {sample_b} is",
159 | f"{ab_dist:.1f} {ts_no_mut.time_units}"
160 | )
161 | print("With both samples at time 0, this is twice the previously calculated av tMRCA:")
162 | print(f" av_tMRCA was {av_tMRCA:.1f} (2 * {av_tMRCA:.1f} = {2 * av_tMRCA:.1f})")
163 |
164 | # To compare it to the standard genetic divergence, simply multiply by the mutation rate
165 | print(
166 | f"Estimated genetic divergence from the genealogy is",
167 | f"{ts_no_mut.divergence(sample_sets, mode='branch') * mu:.6f}"
168 | )
169 | ```
170 |
171 | #### Genealogy-based measures are less noisy
172 |
173 | Analyses based on observed genetic variation have a random component due to the
174 | stochastic nature of the mutational process. This "random mutational noise" is missing
175 | from analyses that use the genealogy directly: something that is particularly evident
176 | when the analysis is dependent on a small number of mutations. Here's an example
177 | which contrasts conventional and branch length versions of the well-known $F_{st}$
178 | statistic, across the two populations we previously simulated.
179 |
180 | ```{code-cell} ipython3
import matplotlib_inline
import matplotlib.pyplot as plt

n_reps = 20
ts_reps = list(msprime.sim_ancestry(
    samples={f"pop_{i}": 10 for i in range(n_subpops)},
    demography=msprime.Demography.island_model([subpop_size] * n_subpops, migration_rate),
    ploidy=2,
    recombination_rate=rho,
    sequence_length=L,
    random_seed=123,
    num_replicates=n_reps,
))

ts_mutated_reps = [
    # Decrease the mutation rate to exaggerate effect of random mutational noise
    msprime.sim_mutations(ts, rate=mu/100, random_seed=i+4)
    for i, ts in enumerate(ts_reps)
]

# Return sample sets as all samples from each population (uses all pairwise comparisons)
def sample_sets(ts):
    return [ts.samples(population=p.id) for p in ts.populations()]

Fst_genealogy = np.array([ts.Fst(sample_sets(ts), mode="branch") for ts in ts_reps])
Fst_genetic_var = np.array([ts.Fst(sample_sets(ts)) for ts in ts_mutated_reps])

# For the theoretical expectation see e.g. Crow and Aoki (1984) PNAS 81: 6073, eqn 7
Fst_theory = 1/(4*subpop_size*migration_rate*(n_subpops/(n_subpops-1))**2 + 1)

# Use n_reps (not a literal 20) so the x values always match the number of
# replicate estimates, and the axis label stays truthful if n_reps is changed.
plt.scatter(["Genetic variation"] * n_reps, Fst_genetic_var)
plt.scatter(["Genealogy"] * n_reps, Fst_genealogy)
plt.xlabel("Basis of estimate")
plt.ylabel(f"Fst\n({n_reps} replicates)")

plt.axhline(y=Fst_theory, ls=":", c="grey")
plt.text(0.5, Fst_theory, 'theoretical prediction', ha='center')
plt.show()
219 | ```
220 |
221 | ::::{margin}
222 | :::{note}
223 | In real data there is additional noise introduced by
224 | inference of the underlying tree sequence which is not accounted for in these examples
225 | :::
226 | ::::
227 |
228 | Therefore, if your ultimate goal is to compare or estimate genetic statistics of this
229 | sort (rather than to examine the genetic sequence itself), then using the
230 | genealogy-based approach should give you more statistical power.
231 |
232 | As with genetic diversity, there also exists a "branch length" version of the allele
233 | frequency spectrum (the AFS), which measures the length of branches subtending 1 sample,
234 | 2 samples, 3 samples, etc. This is a slightly less noisy version of the AFS based on
235 | actual allele frequencies, and it too can be calculated on a tree sequence with no
236 | mutations:
237 |
238 | ```{code-cell} ipython3
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3))

# Left panel: conventional AFS, counted from the variable sites
afs1 = ts_mutated.allele_frequency_spectrum(polarised=True, mode="site")
ax1.bar(np.arange(ts_mutated.num_samples+1), afs1)
ax1.set_title("AFS using variable sites (ts with mutations)")

# Right panel: branch-length AFS from the mutationless tree sequence.
# This must plot afs2; plotting afs1 again would just duplicate the left panel.
afs2 = ts_no_mut.allele_frequency_spectrum(polarised=True, mode="branch")
ax2.bar(np.arange(ts_no_mut.num_samples+1), afs2)
ax2.set_title("Branch length AFS (ts without mutations)")

plt.show()
250 | ```
251 |
252 | In this case, the plots are almost identical because there are thousands of
253 | mutations over the entire sequence, so the mutational noise has been smoothed out
254 | (the remaining unevenness in the AFS plots is due to stochasticity in genealogy, rather
255 | than the mutations).
256 |
257 | However, if we are doing a windowed analysis, and the windows over
258 | the genome are small, each window will contain relatively few mutations, and statistics
259 | based on the genetic variation generated by mutations will be subject to greater
260 | mutational noise than those based on branch lengths in the genealogy. Here's an
261 | example using the basic {meth}`genetic diversity <TreeSequence.diversity>`
262 | in 1Kb windows along our simulated genome:
263 |
264 | ```{code-cell} ipython3
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3), sharey=True)
# L//1_000 windows of exactly 1 kb need L//1_000 + 1 breakpoints (both ends included)
w = np.linspace(0, L, num=L//1_000 + 1)
ax1.stairs(ts_mutated.diversity(windows=w), w/1_000, baseline=None)
ax1.set_ylabel("Diversity")
ax1.set_xlabel("Genome position (kb)")
ax1.set_title("Site-based calculation")
ax1.set_yscale("log")
# Branch-mode diversity is in units of time; multiplying by the mutation rate puts
# it on the same scale as the site-based value
ax2.stairs(ts_no_mut.diversity(windows=w, mode="branch") * mu, w/1_000, baseline=None)
ax2.set_xlabel("Genome position (kb)")
ax2.set_title("Branch-length-based calculation")
ax2.set_yscale("log")
plt.show()
277 | ```
278 |
279 | ## Summary
280 |
281 | In summary, if you are analysing tree sequences, especially those produced by simulation,
282 | think carefully. Do you really need to analyse information about alleles and mutations?
283 | If not, you may be able to omit sites and mutations from the tree sequence,
284 | yet still retain the ability to calculate parameters of interest. This genealogical
285 | approach can provide more accurate descriptors of the quantities of interest
286 | than those explicitly based on genetic variation.
287 |
--------------------------------------------------------------------------------
/old-content/README.md:
--------------------------------------------------------------------------------
1 |
2 | Source repository for the tskit tutorials site,
3 | [https://tskit-dev.github.io/tutorials](https://tskit-dev.github.io/tutorials/)
4 |
5 | **UNDER CONSTRUCTION:** This is a very early version, and really just a way to
6 | explore some options for presenting this content.
7 |
8 | ## Organisation
9 |
10 | The ``docs`` directory is a [GitHub pages](https://pages.github.com/) site. This
11 | means that all the Markdown files in this directory are automatically converted to
12 | HTML and made available on the website.
13 | The source content for each 'chapter' is a Jupyter notebook in the ``notebooks``
14 | directory. Notebooks are then converted to Markdown using ``nbconvert``, and placed
15 | in the ``docs`` directory.
16 |
17 | ## Converting a notebook
18 |
19 | To convert a notebook to markdown, use the following:
20 |
21 | ```shell
22 | $ jupyter nbconvert --to markdown --output-dir docs/ notebooks/NOTEBOOK_NAME.ipynb
23 | ```
24 |
25 | When adding a new notebook to the site, you need to then add the files to
26 | git:
27 |
28 | ```shell
29 | $ git add docs/NOTEBOOK_NAME*
30 | ```
31 |
32 | Finally, update the ``docs/README.md`` to insert a link to the new page.
33 |
34 | ## TODO
35 |
36 | - Need standardised titles including authorship.
37 | - Need some sort of citation mechanism. Perhaps [this](https://github.com/takluyver/cite2c)?
38 | - Main page needs some content explaining what the site is for.
39 | - Better template? We can use any Jekyll template, so it's quite flexible.
40 | - It would also be nice to have a download link to the original notebook.
41 |
42 |
--------------------------------------------------------------------------------
/old-content/docs/README.md:
--------------------------------------------------------------------------------
1 |
2 | This is a collection of tutorials for using ``tskit`` and ``msprime``.
3 | Each tutorial is an indepth exploration of a particular narrow topic,
4 | and is written in the form of a Jupyter notebook.
5 |
6 | ## Writing your own simulators
7 |
8 | These tutorials show how it is possible to write your own simulators
9 | using the ``tskit`` Tables API.
10 |
11 | - A simple forwards-time [Wright-Fisher](wfforward.md) simulator.
12 | - The simple Wright-Fisher example implemented using [Cython](wfcython.md)
13 |
14 |
15 | ## Advanced topics in coalescent simulation
16 |
17 | These tutorials show how to use the ``msprime`` simulator to simulate
18 | specific scenarios and how to analyse the resulting tree sequences.
19 |
20 | **NOTE: these are currently a work in progress**
21 |
22 | - Simulating [introgression](introgression.html) with msprime.
23 | - Simulating [bottlenecks](bottlenecks.html) with msprime
24 |
25 | ## Slides from workshops, etc.
26 |
27 | - [Slides](msprime_out.md) from a workshop given by Kevin Thornton at UC Davis in December 2018.
28 |
--------------------------------------------------------------------------------
/old-content/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-slate
2 | title: tskit tutorials
3 | description: Detailed tutorials for msprime and tskit
4 |
--------------------------------------------------------------------------------
/old-content/docs/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
16 |
24 |
25 |
26 | {% seo %}
27 |
28 |
29 |
30 |
31 |
32 |
54 |
55 |
56 |
64 |
65 | {% if site.google_analytics %}
66 |
74 | {% endif %}
75 |
76 |
77 |
--------------------------------------------------------------------------------
/old-content/docs/bottlenecks.md:
--------------------------------------------------------------------------------
1 |
2 | # Bottlenecks
3 |
4 | **Konrad Lohse and Jerome Kelleher**
5 |
6 | The site frequency spectrum (SFS) summarises variants by their frequency in a sample and is a fundamental summary of sequence variation that forms the basis of many modern inference approaches (e.g. sweepfinder, DFE-alpha, dadi). First, the SFS is a lossless summary of unlinked variants, so any summary of sequence variation that ignores linkage (e.g. pairwise measures of diversity and divergence, F_st, Tajima's D and D) is itself a summary of the SFS.
7 |
8 | The SFS is convenient analytically, because it only depends on the mean length and frequency of genealogical branches. For many demographic models of interest the means can be derived analytically either using coalescent theory (cite Huang, TPB) or diffusion equations (cite dadi). A number of composite likelihood approaches have been developed based on either analytic results for the SFS (cite dadi Excoffier, Jaada). However, analytic expectations for the SFS break down for large samples and/or complex demographic models.
9 |
10 | In the following section we show how the SFS can be approximated using coalescence simulations and compare such approximations to analytic results. We will assume a simple toy history of a single panmictic population that is affected by an instantaneous bottleneck at time T with strength s (cite Galtier et al). The effect of this bottleneck is to induce a sudden burst of coalescence, which results in simultaneous multiple mergers. We measure bottleneck strength as the probability that a pair of lineages coalesces during the bottleneck (we could of course convert s into an (imaginary) time period that would give the same probability of coalescence, $s=1-e^{-T}$).
11 |
12 | We assume a sample of size 10 and use msprime to simulate 10,000 replicate genealogies. For each genealogy the function ``approx_SFS`` records the unfolded SFS as the mean length of branches with n leaf nodes (normalized by the total length of the genealogy) by iterating through all nodes in the tree sequence. Note that we are simulating genealogies only, i.e. we do not need to simulate mutations.
13 |
14 | We use a for loop to record the SFS for a range of bottleneck strength parameters in a dictionary:
15 |
16 |
17 | ```python
18 | %matplotlib inline
19 | %config InlineBackend.figure_format = 'svg'
20 | import msprime
21 | import numpy as np
22 | import seaborn as sns
23 | import matplotlib.pyplot as plt
24 | ```
25 |
26 |
27 | ```python
def run_bott_sims(num_rep, num_samp, T, s):
    """
    Simulate replicate genealogies that experience one instantaneous bottleneck.

    A single ``msprime.InstantaneousBottleneck`` of strength ``s`` is placed in
    population 0 at time ``T``. The effective population size is not a
    parameter: it is read from the notebook-level variable ``Ne``.

    :param num_rep: number of replicates, passed as ``num_replicates``.
    :param num_samp: sample size, passed as ``sample_size``.
    :param T: time of the bottleneck.
    :param s: strength of the bottleneck.
    :return: the object returned by ``msprime.simulate`` for these settings.
    """
    bottleneck = msprime.InstantaneousBottleneck(time=T, strength=s, population=0)
    return msprime.simulate(
        sample_size=num_samp,
        Ne=Ne,
        num_replicates=num_rep,
        demographic_events=[bottleneck],
    )
35 |
def approx_SFS(reps):
    """
    Approximate the normalised branch-length SFS from replicate genealogies.

    For every replicate (each must contain exactly one tree) the lengths of
    all branches below a node that has a parent are summed according to the
    number of samples beneath that node. The per-class totals are averaged
    over the replicates and then scaled so that they sum to one.

    The size of the result is taken from the replicates themselves
    (``ts.num_samples``) rather than from the notebook globals ``num_rep`` and
    ``num_samp``, so the function stays correct when those globals are changed
    or do not match the simulations that were actually passed in.

    :param reps: iterable of single-tree tree sequences, e.g. the result of
        ``msprime.simulate(..., num_replicates=...)``.
    :return: 1-D numpy array ``data`` of length ``num_samples`` where
        ``data[k]`` is the relative total length of branches subtending ``k``
        samples.
    """
    per_replicate = []
    for ts in reps:
        assert ts.num_trees == 1
        tree = ts.first()
        lengths = np.zeros(ts.num_samples)
        for u in tree.nodes():
            # Roots have no branch above them, so they are skipped.
            if tree.parent(u) != msprime.NULL_NODE:
                lengths[tree.num_samples(u)] += tree.branch_length(u)
        per_replicate.append(lengths)
    data = np.mean(per_replicate, axis=0)
    data /= np.sum(data)
    return data
48 |
49 | num_rep = 10000
50 | num_samp = 10
51 | Ne = 1
52 | T = 0.5
53 | taulist= np.array([0,1,2,3])
54 | datalist = {}
55 | for tau in taulist:
56 | datalist[tau]= approx_SFS(run_bott_sims(num_rep, num_samp, T, tau))
57 |
58 | # My guess/assumption is that currently bottleneck strength in msprime is scaled as an (imaginary) time tau (in units of 4N_e generations).
59 | # It makes a lot more sense to express the bottleneck strength as the probability of pairwise coalescence
60 | # during the bottleneck s=1-np.exp(-tau/2)
61 | ```
62 |
63 | With increasing bottleneck strength the SFS becomes increasingly skewed (the leftmost blue bars show the SFS for a population of constant size). However, bottlenecks have a complex effect on the different frequency classes of the SFS: while the relative frequency of singletons increases, other frequency classes (e.g. doubletons) have a non-monotonic relationship with bottleneck strength:
64 |
65 |
66 | ```python
67 | bar_width=0.2
68 | index = np.arange(1,num_samp)
69 | j = 0
70 | for s, B in datalist.items():
71 | plt.bar(index + j * bar_width, B[1:], bar_width, label=str(s))
72 | j += 1
73 | ```
74 |
75 |
76 | 
77 |
78 |
79 | ### Comparison with analytic predictions
80 |
81 | How does the approximate SFS compare to analytic expectations? For a population of constant size, the SFS is simply given by Watterson's correction factor, that is, the total length of branches with i leaf nodes is given by 1/i. Reassuringly, in the limit of s=0 (no bottleneck), our SFS approximation based on simulated genealogies agrees with this prediction:
82 |
83 |
84 | ```python
85 | expsfs=[(1/i) for i in range(1,10)]
86 | expsfs/=np.sum(expsfs)
87 |
88 | fig, ax = plt.subplots()
89 | index = np.arange(1,10)
90 | bar_width = 0.4
91 | opacity = 0.9
92 |
93 | simsfs = ax.bar(index, datalist[0][1:], bar_width, alpha=opacity, label='sim')
94 | expextsfs = ax.bar(index+ bar_width, expsfs, bar_width, alpha=opacity, label='exp')
95 |
96 | fig.tight_layout()
97 | plt.show()
98 | ```
99 |
100 |
101 | 
102 |
103 |
104 | The analytic prediction for the SFS under a bottleneck model is more complicated (Bunnefeld et al. 2015, Appendix). For a sample of n=4 lineages the SFS is:
105 |
106 |
107 | ```python
108 | #We are assuming a bottleneck of strength tau = 4 N_e generations
109 | #and a bottleneck time of T=1 (2 in units of 4 Ne)
110 | #I am pretty sure the analytic prediction for the SFS is correct: the limit for s->0 is correct and
111 | #it matches the automatically generated expression in the Mathematica .nb...
112 |
113 | T=2
114 | slist=[1-np.exp(-tau) for tau in taulist]
115 |
116 | for s in slist:
117 | p=s*(-6 + 15*s - 20 * np.power(s,2) + 15 * np.power(s,3) - 6 * np.power(s,4) + np.power(s,5))
118 | expsfsBottlN= [2/15*(np.exp(-6*T)*(15 *np.exp(6*T) - 9 *np.exp(5*T)*s -
119 | 5*np.exp(3*T)*s*(3 - 3*s + np.power(s,2)) + p)),
120 | 1/5*np.exp(-6*T)*(5*np.exp(6*T) - 6*np.exp(5*T)*s - p),
121 | 2/15*np.exp(-6*T)*(5*np.exp(6*T) - 9*np.exp(5*T)*s + 5*np.exp(3*T)*s*(3-3*s + np.power(s,2)) + p)]
122 |
123 | expsfsBottlN/=np.sum(expsfsBottlN)
124 | print(expsfsBottlN)
125 | ```
126 |
127 | [ 0.54545455 0.27272727 0.18181818]
128 | [ 0.5644372 0.26717043 0.16839237]
129 | [ 0.57248098 0.26486068 0.16265835]
130 | [ 0.57559102 0.26396986 0.16043912]
131 |
132 |
133 | The fit between the SFS simulated with msprime and the analytic prediction is not convincing (given the 100,000 replicates):
134 |
135 |
136 | ```python
137 | num_samp = 4
138 | num_rep = 100000
139 | data4 = {}
140 | T = 1
141 | for tau in taulist:
142 | data4[tau]= approx_SFS(run_bott_sims(num_rep, num_samp, T, tau/2))
143 | ```
144 |
145 |
146 | ```python
147 | fig, ax = plt.subplots()
148 | index = np.arange(1,4)
149 | bar_width = 0.4
150 | print(data4[0][1:])
151 | print(data4[1][1:])
152 | print(data4[2][1:])
153 | print(data4[3][1:])
154 |
155 | simsfs = ax.bar(index, data4[3][1:], bar_width, alpha=opacity, label='sim')
156 | expextsfs = ax.bar(index+ bar_width, expsfsBottlN, bar_width, label='exp')
157 |
158 | fig.tight_layout()
159 | plt.show()
160 | ```
161 |
162 | [ 0.54474183 0.27407036 0.18118781]
163 | [ 0.55186469 0.26647432 0.181661 ]
164 | [ 0.56819938 0.25679356 0.17500706]
165 | [ 0.58875024 0.24738596 0.16386379]
166 |
167 |
168 |
169 | 
170 |
171 |
172 | ## The distribution of nton branches
173 |
174 | Given that the SFS only depends on mean branch lengths, it is interesting to inspect the probability density distribution of the underlying genealogical branches. Given the discrete event, the pdf of nton branches is discontinuous.
175 |
176 |
177 | ```python
178 | s=1
179 | demographic_events = [msprime.InstantaneousBottleneck(time=T, strength=s, population=0)]
180 | reps = msprime.simulate(
181 | sample_size=num_samp, Ne=Ne, num_replicates=num_rep,
182 | demographic_events=demographic_events)
183 | B = np.zeros((num_rep, num_samp))
184 | for rep_index, ts in enumerate(reps):
185 | tree = next(ts.trees())
186 | for u in tree.nodes():
187 | nleaves = tree.num_samples(u)
188 | if tree.parent(u) != msprime.NULL_NODE:
189 | B[rep_index, nleaves]+=tree.branch_length(u)
190 | ```
191 |
192 |
193 | ```python
194 | Btrans=np.array(B).T.tolist()
195 | sns.distplot(Btrans[1],axlabel="f(t)")
196 | sns.distplot(Btrans[2],axlabel="f(t)")
197 | sns.distplot(Btrans[3],axlabel="f(t)");
198 | ```
199 |
200 |
201 | 
202 |
203 |
204 | ### To Do
205 |
206 | 1) Fix the scaling of the strength in msprime
207 | 2) Fix the pdf plot above:
208 | - Label axes on the pdf plot above: y-> f(t), x -> t
209 | - Restrict X range to 15
210 | 3) Fix the x axes on all the barplots so that these correspond to classes in the SFS
211 |
--------------------------------------------------------------------------------
/old-content/docs/introgression.md:
--------------------------------------------------------------------------------
1 |
2 | # Introgression
3 |
4 | **Jerome Kelleher and Konrad Lohse**
5 |
6 | There has been great interest in understanding the contributions past populations have made to the genetic diversity of current populations via admixture. In particular, the availability of whole genome sequence data from archaic hominins (Green et al 2010) has allowed geneticists to identify admixture tracts (Sankararaman 2016). In the simplest case, admixture tracts can be defined heuristically as regions of the genome that show excessive similarity between a putative source and a recipient population (usually quantified relative to some non-admixed reference population, ref Durand et al 2010). Because recombination breaks down admixture tracts, their length distribution gives a clock for calibrating admixture, and this information has been used to date the admixture contributions Neanderthals and other archaic hominins have made to non-African humans (Sankararaman 2016) and to reconstruct the admixture history between different modern human populations (ref).
7 |
8 | Crucially, the power to identify admixture depends on the relative time between the admixture event and the earlier divergence between the source and the recipient population: the shorter this interval, the harder it becomes to detect admixture. This is because ancestral material is increasingly likely to trace its ancestry back to the common ancestral population regardless of whether it has been involved in any recent admixture or not. In other words, it becomes increasingly difficult to distinguish admixture from incomplete lineage sorting (ILS).
9 |
10 | In the following section we use msprime simulations to ask what fraction of admixture tracts are identifiable as such.
11 |
12 | To illustrate this, we simulate ancestral recombination graphs (ARGs) under a simple toy history of divergence and admixture which is loosely motivated by the demographic history of modern humans and Neandertals. As in previous sections, we will first examine properties of the ARG directly rather than use it to simulate mutations. We assume a minimal sample of a single (haploid) genome from modern human populations in Africa and Eurasia as well as an ancient Neandertal sample.
13 |
14 | Considering a rooted ARG, we want to distinguish three categories of segments: i) tracts actually involved in Neandertal admixture, ii) the subset of those tracts that coalesces in the Neandertal population and iii) segments at which Eurasians are more closely related to Neandertals than either are to Africans. This latter category must include all of ii) but also an additional set of short tracts that are due to incomplete lineage sorting (ILS). The last category is interesting because it is the only one that can be unambiguously detected in data (via derived mutations that are shared by Neandertals and Eurasians).
15 |
16 | First we set up a highly simplified model of human and Neandertal demography and simulate a single chromosome of 100Mb length:
17 |
18 |
19 | ```python
20 | %matplotlib inline
21 | %config InlineBackend.figure_format = 'svg'
22 | import random
23 | import collections
24 | import msprime
25 | import numpy as np
26 | import seaborn as sns
27 | import multiprocessing
28 | import matplotlib.pyplot as plt
29 |
30 | from IPython.display import SVG
31 | ```
32 |
33 |
34 | ```python
35 | # Population IDs: Africa, Eurasia, Neanderthal
36 | AFR, EUR, NEA = 0, 1, 2
37 |
def run_simulation(random_seed=None):
    """
    Simulate the toy Africa / Eurasia / Neanderthal history used in this tutorial.

    One sample is taken from each of the three populations (the Neanderthal
    sample 30 kya) and a 100 Mb sequence is simulated with migration records
    switched on. Times are specified in kya and converted to generations with
    a factor of 1000 / 25.

    :param random_seed: forwarded unchanged to ``msprime.simulate``.
    :return: the object returned by ``msprime.simulate``.
    """
    gens_per_kya = 1000 / 25  # Conversion factor for kya to generations

    samples = [
        msprime.Sample(time=0, population=AFR),
        msprime.Sample(time=0, population=EUR),
        # Neanderthal sample taken 30 kya
        msprime.Sample(time=30 * gens_per_kya, population=NEA),
    ]
    # One default configuration each for Africa, Eurasia and Neanderthal.
    population_configurations = [msprime.PopulationConfiguration() for _ in range(3)]

    # 2% introgression 50 kya
    introgression = msprime.MassMigration(
        time=50 * gens_per_kya, source=EUR, dest=NEA, proportion=0.02)
    # Eurasian & Africa populations merge 70 kya
    eur_afr_merge = msprime.MassMigration(
        time=70 * gens_per_kya, source=EUR, dest=AFR, proportion=1)
    # Neanderthal and African populations merge 300 kya
    nea_afr_merge = msprime.MassMigration(
        time=300 * gens_per_kya, source=NEA, destination=AFR, proportion=1)

    return msprime.simulate(
        Ne=10**4,  # The same for all populations; highly unrealistic!
        recombination_rate=1e-8,
        length=100 * 10**6,  # 100 Mb
        samples=samples,
        population_configurations=population_configurations,
        demographic_events=[introgression, eur_afr_merge, nea_afr_merge],
        record_migrations=True,  # Needed for tracking segments.
        random_seed=random_seed,
    )
73 |
74 | ts = run_simulation(1)
75 | ```
76 |
77 | Here we run our simulation in the usual way, but including the ``record_migrations`` option. This allows us to track segments of ancestral material that migrate from the European population into the Neanderthal population (backwards in time). We can then examine the length distributions of these segments and compare them with the length of the segments that also go on to coalesce within the Neanderthal population.
78 |
79 |
80 | ```python
def get_migrating_tracts(ts):
    """
    Collect the genomic intervals of every migration record into NEA.

    :param ts: tree sequence simulated with migration records.
    :return: numpy array with one ``(left, right)`` row per migration record
        whose ``dest`` equals ``NEA``.
    """
    return np.array([
        (migration.left, migration.right)
        for migration in ts.migrations()
        if migration.dest == NEA
    ])
88 |
def get_coalescing_tracts(ts):
    """
    Find maximal runs of adjacent trees in which samples 1 and 2 coalesce in NEA.

    A tract is opened at the left edge of the first tree whose MRCA of samples
    1 (Eurasian) and 2 (Neanderthal) lies in population ``NEA`` and closed at
    the left edge of the next tree where it does not. A tract that is still
    open after the last tree ends at ``ts.sequence_length``.

    :param ts: tree sequence to scan.
    :return: numpy array of ``(left, right)`` rows, one per tract.
    """
    tracts = []
    start = None  # left edge of the currently open tract, if any
    for tree in ts.trees():
        in_nea = tree.population(tree.mrca(1, 2)) == NEA
        here = tree.interval[0]
        if start is None:
            if in_nea:
                # Start a new tract
                start = here
        elif not in_nea:
            # End the last tract
            tracts.append((start, here))
            start = None
    if start is not None:
        tracts.append((start, ts.sequence_length))
    return np.array(tracts)
106 |
def get_eur_nea_tracts(ts):
    """
    Find maximal runs of adjacent trees in which samples 1 and 2 are sister lineages.

    In a tree, the Eurasian (1) and Neanderthal (2) samples are more closely
    related to each other than to the remaining sample exactly when their MRCA
    is not the root. A tract is opened at the first such tree and closed at
    the left edge of the next tree in which the MRCA *is* the root. (The
    previous version closed a tract on the same ``mrca != tree.root``
    condition that opens one, which cut every tract short after a single tree
    and dropped alternate trees.)

    :param ts: tree sequence to scan.
    :return: numpy array of shape ``(k, 2)`` holding the ``(left, right)``
        coordinates of each tract; ``k`` may be 0.
    """
    tracts = []
    tract_left = None
    for tree in ts.trees():
        # 1 is the Eurasian sample and 2 is the Neanderthal
        mrca = tree.mrca(1, 2)
        left = tree.interval[0]
        if mrca != tree.root and tract_left is None:
            # Start a new tract
            tract_left = left
        elif mrca == tree.root and tract_left is not None:
            # End the last tract
            tracts.append((tract_left, left))
            tract_left = None
    if tract_left is not None:
        tracts.append((tract_left, ts.sequence_length))
    # Keep two columns even when no tract was found so that callers can
    # always slice the result with [:, 0] and [:, 1].
    return np.array(tracts).reshape(-1, 2)
124 |
125 |
126 | migrating = get_migrating_tracts(ts)
127 | within_nea = get_coalescing_tracts(ts)
128 | eur_nea = get_eur_nea_tracts(ts)
129 | ```
130 |
131 | We build three different lists. The first is the set of tracts that have migrated from the Eurasian population into the Neanderthal population, and is done simply by finding all migration records in which the destination population is equal to NEA. The second list (which must contain a subset of the segments in the first list) is the ancestral segments that went on to coalesce within the Neanderthal population. The third list contains all segments in which the Eurasian and Neanderthal sample coalesce before their ancestor coalesces with the African sample. The third list includes both Eurasian segments that migrated to the Neanderthal population and segments that did not migrate and did not coalesce until after the Neanderthal-Human population split 300kya.
132 |
133 |
134 | ```python
135 | nea_total = np.sum(eur_nea[:,1] - eur_nea[:,0])
136 | migrating_total = np.sum(migrating[:,1] - migrating[:,0])
137 | within_nea_total = np.sum(within_nea[:,1] - within_nea[:,0])
138 | print([nea_total, migrating_total, within_nea_total])
139 | ```
140 |
141 | [51464256.237136059, 1533972.029931426, 630462.28620933369]
142 |
143 |
144 | Although $f=0.02$ the total length of admixed segments is 5% of the chromosome. Presumably this excess is just due to coalescence variance? We expect a proportion $1-e^{-(T_{split}-T_{ad})}$ of admixed lineages to coalesce. Given our time parameters, $T_{split}-T_{ad}= 1/2$ (in units of $2 N_e$ generations), so we expect $1 - e^{-\frac{1}{2}}=0.39$ of admixed sequence to coalesce in the Neandertal population. Why is the observed fraction just 0.029?
145 |
146 |
147 | ```python
# Histogram of tract lengths (in kb) for the three tract categories.
kb = 1 / 1000
plt.hist(
    [
        # The datasets are listed in the same order as the labels below.
        # Previously the EUR-NEA lengths came first, so the "Migrating" and
        # "EUR-NEA" legend entries were attached to each other's data.
        (migrating[:, 1] - migrating[:, 0]) * kb,
        (eur_nea[:, 1] - eur_nea[:, 0]) * kb,
        (within_nea[:, 1] - within_nea[:, 0]) * kb,
    ],
    label=["Migrating", "EUR-NEA", "Within NEA"],
)
plt.yscale('log')
plt.legend()
plt.xlabel("Tract length (KB)");
158 | ```
159 |
160 |
161 | 
162 |
163 |
164 | Plotting these tract lengths for a single replicate shows that, as expected, admixture tracts are large initially (blue). We can also see that there is extensive ILS which has two effects: First, only a subset of the admixed material has ancestry in the Neandertal population (it would be good to check whether this fits the total lengths). Second, there are many more tracts at which Neandertals and non-African humans are more closely related to each other due to incomplete lineage sorting (ILS) than there are admixture tracts. Finally, ILS tracts, because they are old by definition, are substantially shorter than admixture tracts.
165 |
166 | ## Locating mutations
167 |
168 | We are interested in finding the population in which mutations arose. Because mutations are just associated with a specific tree node in msprime, we must simulate some extra information in order to make this question answerable. This is quite straightforward to do, since we can generate a time for each mutation uniformly along a branch and therefore unambiguously locate it in time (and, therefore, space).
169 |
170 |
171 | ```python
def simulate_mutation_times(ts, random_seed=None):
    """
    Draw a time for every mutation uniformly along the branch it sits on.

    For each mutation the time is sampled between the time of the mutation's
    node and the time of that node's parent in the tree being visited.

    :param ts: tree sequence carrying the mutations.
    :param random_seed: seed for the private ``random.Random`` generator.
    :return: numpy array of length ``ts.num_mutations`` indexed by mutation ID.
    """
    rng = random.Random(random_seed)
    times = np.zeros(ts.num_mutations)
    for tree in ts.trees():
        for mut in tree.mutations():
            branch_bottom = tree.time(mut.node)
            branch_top = tree.time(tree.parent(mut.node))
            times[mut.id] = rng.uniform(branch_bottom, branch_top)
    return times
181 |
182 | pop_configs = [
183 | msprime.PopulationConfiguration(sample_size=3),
184 | msprime.PopulationConfiguration(sample_size=1),
185 | msprime.PopulationConfiguration(sample_size=1)]
186 | M = [
187 | [0, 1, 1],
188 | [1, 0, 1],
189 | [1, 1, 0]]
190 | ts = msprime.simulate(
191 | population_configurations=pop_configs, migration_matrix=M,
192 | record_migrations=True, mutation_rate=0.5, random_seed=25)
193 | mutation_time = simulate_mutation_times(ts, random_seed=25)
194 |
195 | ```
196 |
197 | Once we have run our simulation and assigned times to each mutation, we can then assign populations to each of these mutations. The following function takes a simple approach, by first gathering the migrations for each node into a list. Then, for every mutation, we sequentially examine each migration that affects the mutation's node and intersects with the site position. Because we know that the migration records are sorted in increasing time order, we can simply apply the effects of each migration while the migration record's time is less than the time of the mutation. At the end of this process, we then return the computed mapping of mutation IDs to the populations in which they arose.
198 |
199 |
200 | ```python
def get_mutation_population(ts, mutation_time):
    """
    Work out the population in which each mutation arose.

    Each mutation starts in the population of its node and then follows every
    migration record for that node which covers the site position and is
    older than... rather, has a time smaller than the mutation's time.

    :param ts: tree sequence with migration records.
    :param mutation_time: sequence of mutation times indexed by mutation ID.
    :return: integer numpy array of population IDs indexed by mutation ID.
    """
    # Group the migration records by the node they apply to.
    records_by_node = {}
    for record in ts.migrations():
        records_by_node.setdefault(record.node, []).append(record)

    populations = np.zeros(ts.num_mutations, dtype=int)
    for tree in ts.trees():
        for site in tree.sites():
            for mut in site.mutations:
                current = tree.population(mut.node)
                # Stepping through all migrations will be inefficient for large
                # simulations. Should use an interval tree (e.g.
                # https://pypi.python.org/pypi/intervaltree) to find all
                # intervals intersecting with site.position.
                for record in records_by_node.get(mut.node, ()):
                    if not (record.left <= site.position < record.right):
                        continue
                    # Note that we assume that we see the migration records in
                    # increasing order of time!
                    if record.time < mutation_time[mut.id]:
                        assert current == record.source
                        current = record.dest
                populations[mut.id] = current
    return populations
222 |
223 | mutation_population = get_mutation_population(ts, mutation_time)
224 | ```
225 |
226 |
227 | ```python
228 | tree = ts.first()
229 | colour_map = {0:"red", 1:"blue", 2: "green"}
230 | node_colours = {u: colour_map[tree.population(u)] for u in tree.nodes()}
231 | mutation_colours = {mut.id: colour_map[mutation_population[mut.id]] for mut in tree.mutations()}
232 | SVG(tree.draw(node_colours=node_colours, mutation_colours=mutation_colours))
233 |
234 | ```
235 |
236 |
237 |
238 |
239 | 
240 |
241 |
242 |
243 | This example shows the populations in which the mutations along the tree branches arose. We show a single tree here for simplicity, but the method also works when we have recombination.
244 |
--------------------------------------------------------------------------------
/old-content/docs/introgression_files/introgression_15_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_15_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_19_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_28_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_32_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_39_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_39_0.png
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_40_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_40_0.png
--------------------------------------------------------------------------------
/old-content/docs/msprime_out_files/msprime_out_43_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_43_0.png
--------------------------------------------------------------------------------
/old-content/docs/wfforward_files/wfforward_11_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/wfforward_files/wfforward_18_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/docs/wfforward_files/wfforward_65_0.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/old-content/notebooks/Makefile.wfcython:
--------------------------------------------------------------------------------
1 | all: md
2 |
3 | MD=wfcython.md
4 |
5 | md: $(MD)
6 |
7 | clean:
8 | rm -f *.output.ipynb
9 | rm -rf *.output_files
10 |
11 | reallyclean: clean
12 | for i in $(MD) ; do \
13 | rm -f ../docs/$$i ; \
14 | done
15 | for i in $(basename $(MD)) ; do \
16 | rm -fr ../docs/$$i"_files" ; \
17 | done
18 |
19 | %.output.ipynb: %.ipynb
20 | python execute.py $< $@ --timeout 2000
21 |
22 | %.md: %.output.ipynb
23 | # Set --output so that supporting
24 | # file folder (foo_files) gets
25 | # generated with correct
26 | # output name and location
27 | jupyter nbconvert --to markdown --output-dir=../docs --output=$@ $<
28 |
29 | .PHONY: md
30 | .PRECIOUS: %.output.ipynb
31 |
--------------------------------------------------------------------------------
/old-content/notebooks/Makefile.wfforward:
--------------------------------------------------------------------------------
1 | all: md
2 |
3 | MD=wfforward.md
4 |
5 | md: $(MD)
6 |
7 | clean:
8 | rm -f *.output.ipynb
9 | rm -rf *.output_files
10 |
11 | reallyclean: clean
12 | for i in $(MD) ; do \
13 | rm -f ../docs/$$i ; \
14 | done
15 | for i in $(basename $(MD)) ; do \
16 | rm -fr ../docs/$$i"_files" ; \
17 | done
18 |
19 | %.output.ipynb: %.ipynb
20 | python execute.py $< $@
21 |
22 | %.md: %.output.ipynb
23 | # Set --output so that supporting
24 | # file folder (foo_files) gets
25 | # generated with correct
26 | # output name and location
27 | jupyter nbconvert --to markdown --output-dir=../docs --output=$@ $<
28 |
29 | .PHONY: md
30 | .PRECIOUS: %.output.ipynb
31 |
--------------------------------------------------------------------------------
/old-content/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Makefiles
2 |
3 | Makefiles are provided per-notebook, in order to support long-running tutorials. Use `make -f Makefile.foo` to build
4 | the desired output.
5 |
--------------------------------------------------------------------------------
/old-content/notebooks/execute.py:
--------------------------------------------------------------------------------
1 | import nbformat
2 | import nbconvert
3 | from nbconvert.preprocessors import ExecutePreprocessor
4 | import argparse
5 | import sys
6 |
7 |
def make_parser():
    """
    Build the argument parser for the optional notebook-execution settings.

    The only option is ``--timeout``, an integer number of seconds that
    defaults to 600.
    """
    parser = argparse.ArgumentParser(description="Options for converting notebooks")
    parser.add_argument(
        '--timeout',
        type=int,
        default=600,
        help='Execution timeout (seconds)',
    )
    return parser
15 |
16 |
# Usage: python execute.py INPUT.ipynb OUTPUT.ipynb [--timeout SECONDS]
# The two notebook paths are positional; everything after them is handed to
# the option parser built by make_parser().
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit("usage: execute.py INPUT.ipynb OUTPUT.ipynb [--timeout SECONDS]")
nbfile = sys.argv[1]
nboutfile = sys.argv[2]
parser = make_parser()
args = parser.parse_args(sys.argv[3:])
# Notebooks are JSON documents stored as UTF-8, so the encoding is given
# explicitly rather than relying on the platform default.
with open(nbfile, encoding="utf-8") as f:
    nb = nbformat.read(f, as_version=4)
ep = ExecutePreprocessor(timeout=args.timeout, kernel_name='python3')
# Execute the notebook in place; 'path' is passed as '.' in the resources
# metadata (presumably the directory the notebook is run in; see nbconvert docs).
ep.preprocess(nb, {'metadata': {'path': '.'}})
with open(nboutfile, 'wt', encoding="utf-8") as f:
    nbformat.write(nb, f)
27 |
--------------------------------------------------------------------------------
/parallelization.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | ```{code-cell} ipython3
18 | :tags: [remove-cell]
19 | import msprime
20 | import numpy as np
21 | import tskit
22 |
23 | def create_notebook_data():
24 | pass
25 |
26 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
27 | ```
28 |
29 | (sec_parallelization)=
30 |
31 | # _Parallelization_
32 | % remove underscores in title when tutorial is complete or near-complete
33 |
34 | When performing large calculations it's often useful to split the
35 | work over multiple processes or threads. The ``tskit`` API can
36 | be used without issues across multiple processes, and the Python
37 | {mod}`multiprocessing` module often provides a very effective way to
38 | work with many replicate simulations in parallel.
39 |
40 | When we wish to work with a single very large dataset, however, threads can
41 | offer better resource usage because of the shared memory space. The Python
42 | {mod}`threading` library gives a very simple interface to lightweight CPU
43 | threads and allows us to perform several CPU intensive tasks in parallel. The
44 | ``tskit`` API is designed to allow multiple threads to work in parallel when
45 | CPU intensive tasks are being undertaken.
46 |
47 | :::{note}
48 | In the CPython implementation the
49 | [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock) ensures that
50 | only one thread executes Python bytecode at one time. This means that
51 | Python code does not parallelise well across threads, but avoids a large
52 | number of nasty pitfalls associated with multiple threads updating
53 | data structures in parallel. Native C extensions like ``numpy`` and ``tskit``
54 | release the GIL while expensive tasks are being performed, therefore
55 | allowing these calculations to proceed in parallel.
56 | :::
57 |
58 |
59 | :::{todo}
60 | This tutorial previously used code with an old interface, and hence has been removed.
61 | We must recreate an example of parallel processing, giving examples of both
62 | threads and processes (but see
63 | [this stackoverflow post](https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3)
64 | for why it may be difficult to get {mod}`multiprocessing` working in this notebook).
65 | A reasonable example might be to calculate many pairwise statistics between sample sets
66 | in parallel.
67 |
68 | We should also show that, for large tree sequences, it is better to pass the filenames
69 | to each subprocess and load the tree sequence there, rather than transferring the entire
70 | tree sequence (via pickle) to the subprocesses.
71 | :::
72 |
73 |
--------------------------------------------------------------------------------
/popgen.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.13
7 | jupytext_version: 1.10.3
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | ```{code-cell}
18 | :tags: [remove-cell]
19 | import urllib.request
20 |
21 | import tqdm
22 | import tskit
23 | import tszip
24 |
25 | class DownloadProgressBar(tqdm.tqdm):
26 | def update_to(self, b=1, bsize=1, tsize=None):
27 | if tsize is not None:
28 | self.total = tsize
29 | self.update(b * bsize - self.n)
30 |
31 | def download(url, progress=True):
32 | with DownloadProgressBar(
33 | unit='B',
34 | unit_scale=True,
35 | miniters=1,
36 | desc=url.split('/')[-1],
37 | disable=not progress,
38 | ) as t:
39 | tmp_fn, _ = urllib.request.urlretrieve(url, reporthook=t.update_to)
40 | try:
41 | ts = tskit.load(tmp_fn)
42 | except tskit.FileFormatError:
43 | # could be a tsz file
44 | ts = tszip.decompress(tmp_fn)
45 | urllib.request.urlcleanup() # Remove tmp_fn
46 | return ts
47 |
48 | def download_unified_genealogy():
49 | keep_span = [108_000_000, 110_000_000] # cut down to this genome region
50 | keep_regions = {"EastAsia", "EAST_ASIA", "AFRICA", "Africa"}
51 |
52 | # Downloads 138 Mb of data - this may take a while
53 | tables = download(
54 | "https://zenodo.org/record/5512994/files/"
55 | "hgdp_tgp_sgdp_high_cov_ancients_chr2_q.dated.trees.tsz"
56 | ).dump_tables()
57 | tables.keep_intervals([keep_span])
58 | tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json()
59 | tables.sites.metadata_schema = tskit.MetadataSchema.permissive_json()
60 | ts = tables.tree_sequence()
61 | ts = ts.simplify([
62 | u
63 | for u in ts.samples()
64 | if (
65 | ts.population(ts.node(u).population).metadata.get("region") in keep_regions
66 | or ts.population(ts.node(u).population).metadata.get("name") == "Denisovan"
67 | )
68 | ])
69 | tszip.compress(ts, "data/unified_genealogy_2q_108Mb-110Mb.tsz")
70 |
71 | def create_notebook_data():
72 | download_unified_genealogy()
73 |
74 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
75 | ```
76 |
77 | (sec_intro_popgen)=
78 |
79 | # `Tskit` for population genetics
80 |
81 | {ref}`Tskit`, the tree sequence toolkit, brings the power of
82 | evolutionary trees to the field of population genetics. The
83 | {ref}`succinct tree sequence` format
84 | is designed to store DNA sequences jointly with their ancestral history (the
85 | "genetic genealogy" or {ref}`ARG`). Storing population genetic data in this
86 | form enables highly efficient computation and analysis.
87 |
88 | The core `tskit` library provides methods for storing genetic data, a flexible
89 | analysis framework, and APIs to build your own efficient population genetic algorithms.
90 | Because of its speed and scalability, `tskit` is well-suited to interactive analysis of
91 | large genomic datasets.
92 |
93 | ## Population genetic simulation
94 |
95 | Several simulation tools output tree sequences. Below we use the
96 | standard library for population genetic simulation models
97 | ([stdpopsim](https://popsim-consortium.github.io/stdpopsim-docs/)) to generate a model of
98 | *Homo sapiens*, in which African, Eurasian,
99 | and Asian populations combine to generate a mixed American population. We can use the
100 | [demesdraw](https://pypi.org/project/demesdraw/) package to plot a schematic of the
101 | migrations and population size changes that define this model.
102 |
103 |
104 | ```{code-cell}
105 | import stdpopsim
106 | import demesdraw
107 | from matplotlib import pyplot as plt
108 |
109 | species = stdpopsim.get_species("HomSap")
110 | model = species.get_demographic_model("AmericanAdmixture_4B18")
111 |
112 | # Plot a schematic of the model
113 | demesdraw.tubes(model.model.to_demes(), ax=plt.gca(), seed=1, log_time=True)
114 | plt.show()
115 | ```
116 |
117 | Genomic data in tree sequence format can be generated via the widely-used
118 | [msprime](https://tskit.dev/software/msprime.html) simulator. Here we simulate 20
119 | kilobases of genome sequence at the start of human chromosome 1 under this model,
120 | together with its evolutionary history. We generate 16 diploid genomes: 4 from each of
121 | the populations in the model. The DNA sequences and their ancestry are stored in a
122 | succinct tree sequence named `ts`:
123 |
124 | ```{code-cell}
125 | contig = species.get_contig("chr1", mutation_rate=model.mutation_rate, right=20_000)
126 | samples = {"AFR": 4, "EUR": 4, "ASIA": 4, "ADMIX": 4} # 16 diploid samples
127 | engine = stdpopsim.get_engine("msprime")
128 | ts = engine.simulate(model, contig, samples, seed=9).trim() # trim to first 20kb simulated
129 | print(f"Simulated a tree sequence of {ts.num_samples} haploid genomes:")
130 | print(f"{ts.num_sites} variable sites over {ts.sequence_length} base pairs")
131 | ```
132 |
133 | We can now inspect alleles and their frequencies at the variable sites we have simulated
134 | along the genome:
135 |
136 | ```{code-cell}
137 | for v in ts.variants():
138 | display(v)
139 | if v.site.id >= 2: # Only show sites 0, 1, and 2, for brevity
140 | break
141 | ```
142 |
143 | Or we can display the {meth}`~TreeSequence.haplotypes` (i.e. the variable sites) for
144 | each sample:
145 |
146 | ```{code-cell}
147 | samples = ts.samples()
148 | for sample_id, h in zip(samples, ts.haplotypes(samples=samples)):
149 | pop = ts.node(sample_id).population
150 | print(f"Sample {sample_id:<2} ({ts.population(pop).metadata['name']:^5}): {h}")
151 | ```
152 |
153 | From the tree sequence it is easy to obtain the
154 | {meth}`TreeSequence.allele_frequency_spectrum` for the entire region (or for
155 | {ref}`windowed regions`):
156 |
157 | ```{code-cell}
158 | afs = ts.allele_frequency_spectrum()
159 | plt.bar(range(ts.num_samples + 1), afs)
160 | plt.title("Allele frequency spectrum")
161 | plt.show()
162 | ```
163 |
164 | Similarly, `tskit` allows fast and easy
165 | {ref}`calculation of statistics` along the genome. Here is
166 | a plot of windowed $F_{st}$ between Africans and admixed Americans over this short
167 | region of the chromosome:
168 |
169 | ```{code-cell}
170 | # Define the samples between which Fst will be calculated
171 | pop_id = {p.metadata["name"]: p.id for p in ts.populations()}
172 | sample_sets=[ts.samples(pop_id["AFR"]), ts.samples(pop_id["ADMIX"])]
173 |
174 | # Do the windowed calculation, using windows of 2 kilobases
175 | windows = list(range(0, int(ts.sequence_length + 1), 2_000))
176 | F_st = ts.Fst(sample_sets, windows=windows)
177 |
178 | # Plot
179 | plt.stairs(F_st, windows, baseline=None)
180 | plt.ylabel("AFR-ADMIX Fst")
181 | plt.xlabel("Genome position")
182 | plt.show()
183 | ```
184 |
185 | Extracting the genetic tree at a specific genomic location is easy using `tskit`, which
186 | also provides methods to {ref}`plot` these trees. Here we
187 | grab the tree at position 10kb, and colour the samples from each population
188 | differently, as described in the {ref}`viz tutorial`:
189 |
190 | ```{code-cell}
191 | tree = ts.at(10_000)
192 |
193 | colours = dict(AFR="yellow", EUR="cyan", ASIA="green", ADMIX="red")
194 | styles = [
195 | f".leaf.p{pop.id} > .sym {{fill: {colours[pop.metadata['name']]}}}"
196 | for pop in ts.populations()
197 | ]
198 |
199 | styles += [ # rotate the population labels, etc
200 | ".leaf > .lab {text-anchor: start; transform: rotate(90deg) translate(6px)}",
201 | ".leaf > .sym {stroke: black}"
202 | ]
203 |
204 | labels = { # Label samples by population
205 | u: ts.population(ts.node(u).population).metadata["name"].capitalize()
206 | for u in ts.samples()
207 | }
208 |
209 | tree.draw_svg(
210 | size=(800, 500),
211 | canvas_size=(800, 520),
212 | node_labels=labels,
213 | style="".join(styles),
214 | y_axis=True,
215 | y_ticks=range(0, 30_000, 10_000)
216 | )
217 |
218 | ```
219 |
220 | ## Population genetic inference
221 |
222 | If, instead of simulations, you want to analyse existing genomic data (for example
223 | stored in a VCF file), you will need to infer a tree sequence from it, using e.g.
224 | [tsinfer](https://tskit.dev/tsinfer/docs/stable/). Here we load an illustrative portion
225 | of an [inferred tree sequence](https://zenodo.org/record/5512994)
226 | based on about 7500 public human genomes, including genomes from the
227 | [Thousand Genomes Project](https://www.internationalgenome.org/data-portal/data-collection/grch38) and
228 | [Human Genome Diversity Project](https://www.internationalgenome.org/data-portal/data-collection/hgdp).
229 | The genomic region encoded in this tree sequence has been cut down to
230 | span positions 108Mb-110Mb of human chromosome 2, which contains the
231 | [EDAR](https://en.wikipedia.org/wiki/Ectodysplasin_A_receptor) gene.
232 |
233 | Note that tree sequence files are usually imported using {func}`load`,
234 | but because this file has been additionally compressed, we load it via
235 | {func}`tszip:tszip.decompress`:
236 |
237 | ```{code-cell}
238 | import tszip
239 | ts = tszip.decompress("data/unified_genealogy_2q_108Mb-110Mb.tsz")
240 |
241 | # The ts encompasses a region on chr 2 with an interesting SNP (rs3827760) in the EDAR gene
242 | edar_gene_bounds = [108_894_471, 108_989_220] # In base pairs from the start of chromosome 2
243 | focal_variant = [v for v in ts.variants() if v.site.metadata.get("ID") == "rs3827760"].pop()
244 | print("An interesting SNP within the EDAR gene:")
245 | focal_variant
246 | ```
247 |
248 | For simplicity, this tree sequence has been {ref}`simplified` to
249 | include only those samples from the African and East Asian regions. These belong to a
250 | number of populations. The population information, as well as information describing the
251 | variable sites, is stored in tree sequence {ref}`metadata`:
252 |
253 | ```{code-cell}
254 | import pandas as pd
255 |
256 | print(ts.num_populations, "populations defined in the tree sequence:")
257 |
258 | pop_names_regions = [
259 | [p.metadata.get("name"), p.metadata.get("region")]
260 | for p in ts.populations()
261 | ]
262 | display(pd.DataFrame(pop_names_regions, columns=["population name", "region"]))
263 | ```
264 |
265 | You can see that there are multiple African and East Asian populations, grouped by
266 | region. Here we collect two lists of IDs for the sample
267 | {ref}`nodes` from the African region and from the East Asian
268 | region:
269 |
270 | ```{code-cell}
271 |
272 | sample_lists = {}
273 | for n, rgns in {"Africa": {'AFRICA', 'Africa'}, "East asia": {'EAST_ASIA', 'EastAsia'}}.items():
274 | pop_ids = [p.id for p in ts.populations() if p.metadata.get("region") in rgns]
275 | sample_lists[n] = [u for p in pop_ids for u in ts.samples(population=p)]
276 | ```
277 |
278 |
279 | With these lists we can calculate different windowed statistics
280 | (here {meth}`genetic diversity` and
281 | {meth}`Tajima's D`) within each of these regions:
282 |
283 | ```{code-cell}
284 | edar_ts = ts.trim() # remove regions with no data (changes the coordinate system)
285 | windows = list(range(0, int(edar_ts.sequence_length)+1, 10_000))
286 | data = {
287 | "Genetic diversity": {
288 | region: edar_ts.diversity(samples, windows=windows)
289 | for region, samples in sample_lists.items()
290 | },
291 | "Tajima's D": {
292 | region: edar_ts.Tajimas_D(samples, windows=windows)
293 | for region, samples in sample_lists.items()
294 | },
295 | }
296 |
297 | # Plot the `data`
298 | fig, axes = plt.subplots(ncols=2, figsize=(15, 3))
299 | start = ts.edges_left.min() # the empty amount at the start of the tree sequence
300 |
301 | for (title, plot_data), ax in zip(data.items(), axes):
302 | ax.set_title(title)
303 | ax.axvspan(edar_gene_bounds[0], edar_gene_bounds[1], color="lightgray")
304 | ax.axvline(focal_variant.site.position, ls=":")
305 | for label, stat in plot_data.items():
306 | ax.stairs(stat, windows+start, baseline=None, label=label)
307 | ax.text(edar_gene_bounds[0], 0, "EDAR")
308 | ax.legend()
309 | plt.show()
310 | ```
311 |
312 | Other population genetic libraries such as
313 | [scikit-allel](https://scikit-allel.readthedocs.io/en/stable/) (which is
314 | {ref}`interoperable` with `tskit`)
315 | could also have been used to produce the plot above. In this case, the advantage of
316 | using tree sequences is simply that they allow these sorts of analysis to
317 | {ref}`scale` to datasets of millions of whole genomes.
318 |
319 | (sec_popgen_topological)=
320 |
321 | ### Topological analysis
322 |
323 | As this inferred tree sequence stores (an estimate of) the underlying
324 | genealogy, we can also derive statistics based on genealogical relationships. For
325 | example, this tree sequence also contains a sample genome based on an ancient
326 | genome, a [Denisovan](https://en.wikipedia.org/wiki/Denisovan) individual. We can
327 | look at the closeness of relationship between samples from the different geographical
328 | regions and the Denisovan:
329 |
330 | :::{todo}
331 | Show an example of looking at topological relationships between the Denisovan and
332 | various East Asian groups, using the {ref}`sec_counting_topologies` functionality.
333 | :::
334 |
335 | See {ref}`sec_counting_topologies` for an introduction to topological methods in
336 | `tskit`.
337 |
338 | ## Further information
339 |
340 | This brief introduction is meant as a simple taster. Many other efficient population
341 | genetic {ref}`analyses` are possible when you have
342 | genomic data stored as a tree sequence.
343 |
344 | The rest of the {ref}`tutorials` contain a large number of examples which
345 | are relevant to population genetic analysis and research. You can also visit the
346 | [learning section](https://tskit.dev/learn/) of the [tskit website](https://tskit.dev/).
347 |
--------------------------------------------------------------------------------
/references.bib:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 |
4 | @article{kelleher2016efficient,
5 | title={Efficient coalescent simulation and genealogical analysis for large sample sizes},
6 | author={Kelleher, Jerome and Etheridge, Alison M and McVean, Gilean},
7 | journal={PLoS computational biology},
8 | volume={12},
9 | number={5},
10 | pages={e1004842},
11 | year={2016},
12 | publisher={Public Library of Science}
13 | }
14 |
15 | @InCollection{kelleher2020coalescent,
16 | author={Kelleher, Jerome and Lohse, Konrad},
17 | editor={Dutheil, Julien Y.},
18 | title={Coalescent Simulation with msprime},
19 | bookTitle={Statistical Population Genomics},
20 | year={2020},
21 | publisher={Springer US},
22 | address={New York, NY},
23 | pages={191--230},
24 | }
25 |
26 | @article{nelson2020accounting,
27 | title={Accounting for long-range correlations in genome-wide simulations
28 | of large cohorts},
29 | author={Nelson, Dominic and Kelleher, Jerome and Ragsdale, Aaron P and
30 | Moreau, Claudia and McVean, Gil and Gravel, Simon},
31 | journal={PLoS genetics},
32 | volume={16},
33 | number={5},
34 | pages={e1008619},
35 | year={2020},
36 | publisher={Public Library of Science San Francisco, CA USA}
37 | }
38 |
39 | @article{adrion2019community,
40 | title={A community-maintained standard library of population genetic models},
41 | author={Adrion, Jeffrey R and Cole, Christopher B and Dukler, Noah and Galloway, Jared G and Gladstein, Ariella L and Gower, Graham and Kyriazis, Christopher C and Ragsdale, Aaron P and Tsambos, Georgia and Baumdicker, Franz and others},
42 | journal={eLife},
43 | year={2020},
44 | volume={9},
45 | pages={e54967}
46 | }
47 |
48 |
--------------------------------------------------------------------------------
/requirements-CI.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/tskit-dev/tsconvert@e99c837e4e26ccbf4f480a4c48626338eeff7dc3
2 | demes==0.2.3
3 | demesdraw==0.4.0
4 | jupyter-book==1.0.2
5 | jupyter-cache==0.6.1
6 | msprime==1.3.2
7 | networkx==3.3
8 | numpy==1.26.4
9 | pandas==2.2.2
10 | pygraphviz==1.13
11 | scikit-allel==1.3.8
12 | stdpopsim==0.3.0
13 | tqdm==4.66.3
14 | tskit==0.5.8
15 | tskit_arg_visualizer==0.0.1
16 | tszip==0.2.4
17 | jsonschema==4.18.6 # Pinned due to 4.19 "AttributeError module jsonschema has no attribute _validators"
18 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/tskit-dev/tsconvert@e99c837e4e26ccbf4f480a4c48626338eeff7dc3
2 | demes
3 | demesdraw
4 | jupyter-book>=0.12.0
5 | jupyter-cache
6 | msprime>=1.0
7 | networkx
8 | pandas
9 | pygraphviz
10 | scikit-allel
11 | stdpopsim>=0.3
12 | tqdm
13 | tskit>=0.5.4
14 | tskit_arg_visualizer
15 | tszip
16 |
--------------------------------------------------------------------------------
/simplification.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | ```{currentmodule} tskit
15 | ```
16 |
17 | ```{code-cell} ipython3
18 | :tags: [remove-cell]
19 | def create_notebook_data():
20 | pass
21 |
22 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook
23 | ```
24 |
25 | (sec_simplification)=
26 |
27 | # _Simplification_
28 | % remove underscores in title when tutorial is complete or near-complete
29 |
30 | :::{todo}
31 | Create content. See https://github.com/tskit-dev/tutorials/issues/52
32 | :::
33 |
--------------------------------------------------------------------------------
/simulation_overview.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | (sec_simulation_overview)=
15 |
16 | # Tree sequences and simulation
17 |
18 | **Yan Wong, Georgia Tsambos, and Peter Ralph**
19 |
20 | Simulations are important in population genetics for many reasons:
21 |
22 | ::::{margin}
23 | :::{todo}
24 | Add links to papers that illustrate each of the following points
25 | :::
26 | ::::
27 |
28 | Exploration
29 | : Simulations allow us to explore the influence of complex historical scenarios on
30 | observed patterns of genetic variation and inheritance.
31 |
32 | Benchmarking and evaluating methodologies
33 | : To assess the accuracy of inferential methods, we need test datasets for which the
34 | true values of important parameters are known.
35 |
36 | Model training
37 | : Some methods for ancestry inference are trained on simulated data (e.g. Approximate
38 | Bayesian Computation). This is especially important in studies of complex demographies,
39 | where there are many potential parameters and models, making it impractical to specify
40 | likelihood functions.
41 |
42 | Compare to expectations
43 | : It is often useful to compare data to what is expected under a simpler situation
44 | (e.g. for use as a null model). For instance, comparison to *neutral* simulations
45 | can be used to identify regions subject to selection.
46 |
47 | There are two major forms of population genetic simulation: **forwards-time**
48 | and **backwards-time**. In general, forwards-time simulation is more detailed and
49 | realistic, while backwards-time simulation is faster and more efficient.
50 |
51 | More specifically, apart from a
52 | {ref}`few exceptions `,
53 | backwards-time simulations are primarily focused on neutral simulations, while
54 | forward simulation is better suited to complex simulations, including those involving
55 | selection and continuous space.
56 |
57 | ## Advantages of tree sequences
58 |
59 | Some forwards-time ([SLiM](http://messerlab.org/slim/),
60 | [fwdpy](http://molpopgen.github.io/fwdpy/)) and backwards-time
61 | ([msprime](https://tskit.dev/msprime)) simulators have a built-in capacity to output
62 | tree sequences. This can have several benefits:
63 |
64 | 1. Neutral mutations, which often account for the majority of genetic variation, do not
65 | need to be tracked during the simulation, but can be added afterwards. See
66 | "{ref}`sec_tskit_no_mutations`".
67 | 2. Tree sequences can be used as an interchange format to combine backwards and
68 | forwards simulations, allowing you to benefit from the advantages of both
69 | approaches. This is detailed in {ref}`sec_completing_forwards_simulations`.
70 |
71 | ## Some tips on simulation
72 |
73 | Even with fast modern software, simulating full genome sequences of entire populations
74 | can take some time. If you are finding your simulations too slow, it is worth
75 | benchmarking them by running on a range of shorter chromosomes or sample sizes, then
76 | extrapolating to figure out how long the simulations you actually want to run would take.
77 |
78 | :::{todo}
79 | Add an example with a matplotlib fitted curve for some msprime simulations with
80 | e.g. a high recombination rate.
81 |
82 | Collecting data from simulations that take minutes to a few hours and looking at
83 | the msprime paper for suggestions of what curve to fit to the data should give you
84 | good predictions. See [issue #104](https://github.com/tskit-dev/tutorials/issues/104)
85 | :::
86 |
--------------------------------------------------------------------------------
/tskitr.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: md:myst,ipynb
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: 0.13
8 | jupytext_version: 1.10.3
9 | kernelspec:
10 | display_name: R
11 | language: R
12 | name: ir
13 | ---
14 |
15 | ```{currentmodule} tskit
16 | ```
17 |
18 | (sec_tskit_r)=
19 |
20 | # Tskit and R
21 |
22 | To interface with `tskit` in R, we can use the [reticulate](https://rstudio.github.io/reticulate/) R package, which lets you call Python functions within an R session. In this tutorial, we'll go through a couple of examples to show you how to get started. If you haven't done so already, you'll need to install `reticulate` in your R session via `install.packages("reticulate")`.
23 |
24 | We'll begin by simulating a small tree sequence using `msprime`.
25 |
26 | ```{code-cell}
27 | msprime <- reticulate::import("msprime")
28 |
29 | ts <- msprime$sim_ancestry(80, sequence_length=1e4, recombination_rate=1e-4, random_seed=42)
30 | ts # See "Jupyter notebook tips", below for how to render this nicely
31 | ```
32 |
33 | ## Attributes and methods
34 |
35 | `reticulate` allows us to access a Python object's attributes via
36 | the `$` operator. For example, we can access (and assign to a variable) the number of
37 | samples in the tree sequence:
38 |
39 | ```{code-cell}
40 | n <- ts$num_samples
41 | n
42 | ```
43 |
44 | The `$` operator can also be used to call methods, for example, the
45 | {meth}`~TreeSequence.simplify` method associated with the tree sequence.
46 | The method parameters are given as native R objects
47 | (but note that object IDs still use tskit's 0-based indexing system).
48 |
49 | ```{code-cell}
50 | reduced_ts <- ts$simplify(0:7) # only keep samples with ids 0, 1, 2, 3, 4, 5, 6, 7
51 | reduced_ts <- reduced_ts$delete_intervals(list(c(6000, 10000))) # delete data after 6kb
52 | reduced_ts <- reduced_ts$trim() # remove the deleted region
53 | paste(
54 | "Reduced from", ts$num_trees, "trees over", ts$sequence_length/1e3, "kb to",
55 | reduced_ts$num_trees, "trees over", reduced_ts$sequence_length/1e3, "kb.")
56 | ```
57 |
58 | ### IDs and indexes
59 |
60 | Note that if a bare digit is provided to one of these methods, it will be treated as a
61 | floating point number. This is useful to know when calling `tskit` methods that
62 | require integers (e.g. object IDs). For example, the following will not work:
63 |
64 | ```{code-cell}
65 | :tags: [raises-exception, remove-output]
66 | ts$node(0) # Will raise an error
67 | ```
68 |
69 | In this case, to force the `0` to be passed as an integer, you can either coerce it
70 | using `as.integer` or simply prepend the letter `L`:
71 |
72 | ```{code-cell}
73 | ts$node(as.integer(0))
74 | # or
75 | ts$node(0L)
76 | ```
77 |
78 | Coercing in this way is only necessary when passing parameters to those underlying
79 | `tskit` methods that expect integers. It is not needed e.g. to index into numeric arrays.
80 | _However_, when using arrays, very careful attention must be paid to the fact that
81 | `tskit` IDs start at zero, whereas R indexes start at one:
82 |
83 | ```{code-cell}
84 | root_id <- ts$first()$root
85 | paste("Root time via tskit method:", ts$node(root_id)$time)
86 | # When indexing into tskit arrays in R, add 1 to the ID
87 | paste("Root time via array access:", ts$nodes_time[root_id + 1])
88 | ```
89 |
90 | ## Analysis
91 |
92 | From within R we can use `tskit`'s powerful
93 | [Statistics](https://tskit.dev/tskit/docs/stable/stats.html) framework to efficiently
94 | compute many different summary statistics from a tree sequence. To illustrate this,
95 | we'll first add some mutations to our tree sequence with the
96 | {func}`msprime:msprime.sim_mutations` function, and then compute the genetic diversity
97 | among the tree sequence's sample nodes:
98 |
99 | ```{code-cell}
100 | ts_mut = msprime$sim_mutations(reduced_ts, rate=1e-4, random_seed=321)
101 |
102 | paste(ts_mut$num_mutations, "mutations, genetic diversity is", ts_mut$diversity())
103 | ```
104 |
105 | Numerical arrays and matrices work as expected. For instance, we can use the tree
106 | sequence {meth}`~TreeSequence.genotype_matrix()` method to return the genotypes of
107 | the tree sequence as a matrix object in R.
108 |
109 | ```{code-cell}
110 | G = ts_mut$genotype_matrix()
111 | G
112 | ```
113 |
114 | We can then use R functions directly on the genotype matrix:
115 |
116 | ```{code-cell}
117 | allele_frequency = rowMeans(G)
118 | allele_frequency
119 | ```
120 |
121 | ## Jupyter notebook tips
122 |
123 | When running R within a [Jupyter notebook](https://jupyter.org), a few magic functions
124 | can be defined that allow tskit objects to be rendered within the notebook:
125 |
126 | ```{code-cell}
127 | # Define some magic functions to allow objects to be displayed in R Jupyter notebooks
128 | repr_html.tskit.trees.TreeSequence <- function(obj, ...){obj$`_repr_html_`()}
129 | repr_html.tskit.trees.Tree <- function(obj, ...){obj$`_repr_html_`()}
130 | repr_svg.tskit.drawing.SVGString <- function(obj, ...){obj$`__str__`()}
131 | ```
132 |
133 | This leads to much nicer tabular summaries:
134 |
135 | ```{code-cell}
136 | ts_mut
137 | ```
138 |
139 | It also allows trees and tree sequences to be plotted inline:
140 |
141 | ```{code-cell}
142 | ts_mut$draw_svg(y_axis=TRUE, y_ticks=0:10)
143 | ```
144 |
145 |
146 | ## Interaction with R libraries
147 |
148 | R has a number of libraries to deal with genomic data and trees. Below we focus on the
149 | phylogenetic tree representation defined in the popular
150 | [ape](http://ape-package.ird.fr) package, taking all the trees
151 | {meth}`exported in Nexus format`, or
152 | individual trees {meth}`exported in Newick format`:
153 |
154 | ```{code-cell}
155 | file = tempfile()
156 | ts_mut$write_nexus(file)
157 | # Warning - ape trees are stored independently, so this will use much more memory than tskit
158 | trees <- ape::read.nexus(file, force.multi = TRUE) # return a set of trees
159 |
160 | # Or simply read in a single tree
161 | tree <- ape::read.tree(text=ts_mut$first()$as_newick())
162 |
163 | # Now we can plot the tree in tskit style, but using the ape library
164 | plot(tree, direction="downward", srt=90, adj=0.5) # or equivalently use trees[[1]]
165 | ```
166 |
167 | Note that nodes are labelled with the prefix `n`, so that nodes `0`, `1`, `2`, ...
168 | become `n0`, `n1`, `n2` ... etc. This helps to avoid
169 | confusion between the zero-based counting system used natively
170 | by `tskit`, and the one-based counting system used in `R`.
171 |
172 | ## Further information
173 |
174 | Be sure to check out the [reticulate](https://rstudio.github.io/reticulate/)
175 | documentation, in particular on
176 | [Calling Python from R](https://rstudio.github.io/reticulate/articles/calling_python.html),
177 | which includes important information on how R data types are converted to their
178 | equivalent Python types.
179 |
--------------------------------------------------------------------------------
/tutorial_development.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | text_representation:
4 | extension: .md
5 | format_name: myst
6 | format_version: 0.12
7 | jupytext_version: 1.9.1
8 | kernelspec:
9 | display_name: Python 3
10 | language: python
11 | name: python3
12 | ---
13 |
14 | (sec_development)=
15 |
16 | # _Developing new tutorials_
17 | % remove underscores in title when tutorial is complete or near-complete
18 |
19 | :::{note}
20 | Add content for helping developers to add more tutorials.
21 | :::
22 |
--------------------------------------------------------------------------------