├── .github └── workflows │ └── build.yml ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── _config.yml ├── _static ├── ARG.png ├── tables_uml.svg ├── tskit_logo.svg ├── tute-constant-migration-svg-out.png ├── tute-divergence-1.png ├── tute-divergence-2.png ├── tute-divergence-svg-out.png ├── tute-mass-migration.png ├── tute-parameter-changes.png └── tute-population-structure-1.png ├── _toc.yml ├── advanced_msprime.md ├── analysing_tree_sequences.md ├── analysing_trees.md ├── args.md ├── bottlenecks.md ├── build.sh ├── completing_forward_sims.md ├── counting_topologies.md ├── data ├── afs.trees ├── basics.trees ├── benchmarks_without_copy_longer_genome.txt ├── computing_statistics.trees ├── construction_example.trees ├── different_time_samples.trees ├── download.py ├── metadata.trees ├── parsimony_map.pickle ├── parsimony_map.trees ├── parsimony_simple.trees ├── storing_everyone.csv ├── tables_example.trees ├── tables_example_muts.trees ├── topologies_sim_speciestree.trees ├── topologies_sim_stdpopsim.trees ├── tree_traversals.trees ├── unified_genealogy_2q_108Mb-110Mb.tsz ├── viz_ts_full.trees ├── viz_ts_selection.trees ├── viz_ts_small.trees ├── viz_ts_small_mutated.trees ├── viz_ts_tiny.trees ├── whatis_example.trees └── whatis_example.yml ├── demography.md ├── forward_sims.md ├── getting_started.md ├── incremental_algorithms.md ├── intro.md ├── introgression.md ├── metadata.md ├── more_forward_sims.md ├── no_mutations.md ├── old-content ├── README.md ├── docs │ ├── README.md │ ├── _config.yml │ ├── _layouts │ │ └── default.html │ ├── bottlenecks.md │ ├── bottlenecks_files │ │ ├── bottlenecks_12_1.svg │ │ ├── bottlenecks_16_0.svg │ │ ├── bottlenecks_4_0.svg │ │ └── bottlenecks_7_0.svg │ ├── introgression.md │ ├── introgression_files │ │ ├── introgression_15_0.svg │ │ └── introgression_9_0.svg │ ├── msprime_out.md │ ├── msprime_out_files │ │ ├── msprime_out_15_0.svg │ │ ├── msprime_out_19_0.svg │ │ ├── msprime_out_28_0.svg │ │ ├── msprime_out_32_0.svg │ │ ├── msprime_out_39_0.png │ │ ├── msprime_out_40_0.png │ │ └── msprime_out_43_0.png │ ├── wfcython.md │ ├── wfcython_files │ │ ├── wfcython_17_0.svg │ │ ├── wfcython_17_1.svg │ │ ├── wfcython_17_2.svg │ │ └── wfcython_17_3.svg │ ├── wfforward.md │ └── wfforward_files │ │ ├── wfforward_11_0.svg │ │ ├── wfforward_18_0.svg │ │ ├── wfforward_20_0.svg │ │ ├── wfforward_53_0.svg │ │ ├── wfforward_55_0.svg │ │ ├── wfforward_59_0.svg │ │ └── wfforward_65_0.svg └── notebooks │ ├── Makefile.wfcython │ ├── Makefile.wfforward │ ├── README.md │ ├── execute.py │ ├── wfcython.ipynb │ └── wfforward.ipynb ├── parallelization.md ├── phylogen.md ├── popgen.md ├── references.bib ├── requirements-CI.txt ├── requirements.txt ├── simplification.md ├── simulation_overview.md ├── tables_and_editing.md ├── terminology_and_concepts.md ├── tskitr.md ├── tutorial_development.md ├── viz.md └── what_is.md /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # Based on https://github.com/executablebooks/github-action-demo 2 | name: build 3 | 4 | on: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | tags: 9 | - '*' 10 | # This job installs dependencies, build the book, and pushes it to `gh-pages` 11 | jobs: 12 | build: 13 | runs-on: ubuntu-24.04 14 | steps: 15 | - name: Cancel Previous Runs 16 | uses: styfle/cancel-workflow-action@0.12.1 17 | with: 18 | access_token: ${{ github.token }} 19 | 20 | - name: Checkout 21 | uses: actions/checkout@v4.2.2 22 | 23 | # Install dependencies 24 | - name: 
Install Graphviz 25 | run: | 26 | sudo apt-get install graphviz {lib,}graphviz-dev 27 | 28 | - name: Set up Python 3.11 29 | uses: actions/setup-python@v5.4.0 30 | with: 31 | python-version: "3.11" 32 | cache: "pip" 33 | 34 | - name: Install python dependencies 35 | run: | 36 | pip install --upgrade pip wheel 37 | pip install -r requirements-CI.txt 38 | 39 | - name: Install R support 40 | run: | 41 | # We need to remove R to pull in a version that's compatible with CRAN, weirdly. 42 | sudo apt-get update 43 | sudo apt-get remove r-base-core 44 | sudo apt-get install r-cran-reticulate r-cran-pbdzmq r-cran-uuid r-cran-ape 45 | sudo R -e 'install.packages("IRkernel")' 46 | R -e 'IRkernel::installspec()' 47 | 48 | # Build the book 49 | - name: Build the book 50 | run: ./build.sh 51 | 52 | - name: Copy files for users to run tutorials 53 | run: | 54 | cp ./requirements.txt ./_build/html/. 55 | cp -r data ./_build/html/examples 56 | ls data > ./_build/html/examples/files.txt 57 | 58 | # Push the book's HTML to github-pages 59 | - name: GitHub Pages action 60 | if: github.ref == 'refs/heads/main' 61 | uses: peaceiris/actions-gh-pages@v4.0.0 62 | with: 63 | github_token: ${{ secrets.GITHUB_TOKEN }} 64 | publish_dir: ./_build/html 65 | 66 | - name: Trigger docs site rebuild 67 | if: github.ref == 'refs/heads/main' 68 | run: | 69 | curl -X POST https://api.github.com/repos/tskit-dev/tskit-site/dispatches \ 70 | -H 'Accept: application/vnd.github.everest-preview+json' \ 71 | -u AdminBot-tskit:${{ secrets.ADMINBOT_TOKEN }} \ 72 | --data '{"event_type":"build-docs"}' 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | _build 3 | .DS_Store 4 | # ignore files created by the user during a tut 5 | data/my_tree_sequence.trees 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Simple makefile for dev. 3 | 4 | all: 5 | # Use the local build wrapper to automate writing the report log to stdout. 6 | ./build.sh 7 | 8 | clean: 9 | rm -fR _build 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tutorials 2 | 3 | A set of tutorials for the tskit ecosystem build using 4 | [Jupyter Book](https://jupyterbook.org/), served up at 5 | [https://tskit.dev/tutorials/](https://tskit.dev/tutorials/). 6 | 7 | Merges to this repo will trigger a rebuild of the 8 | [tskit.dev web site](https://tskit.dev/) via an 9 | [action](https://github.com/tskit-dev/tskit-site/actions) on the 10 | [tskit-site repository](https://github.com/tskit-dev/tskit-site/): 11 | look there for any deployment issues. 12 | 13 | **Under construction** 14 | 15 | These are quick notes for developers while the real developers page is 16 | under construction. 17 | 18 | # Requirements 19 | 20 | Install the Python requirements from requirements.txt: 21 | ``` 22 | $ python -m pip install -r requirements.txt 23 | ``` 24 | 25 | You will also need a working R installation with reticulate and irkernel installed. 
26 | This command should do the trick: 27 | ``` 28 | $ R -e 'install.packages(c("reticulate", "IRkernel")); IRkernel::installspec()' 29 | ``` 30 | 31 | # Building tutorials 32 | 33 | - To add a new tutorial, create a Markdown file and add its name to ``_toc.yml``. 34 | - If you are basing the tutorial on an existing notebook, use 35 | [jupytext](https://github.com/mwouts/jupytext) to convert the notebook into 36 | the right format. 37 | - To build locally, run ``make``. The output tells you where to find the 38 | built HTML. 39 | - Pages rendered at https://tskit.dev/tutorials 40 | - Pages might take a while to be updated after a new tutorial is merged. 41 | 42 | If you have an idea for a tutorial, please open an issue to discuss. 43 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Book settings 2 | # Learn more at https://jupyterbook.org/customize/config.html 3 | 4 | title: Tree Sequence Tutorials 5 | author: Tskit Developers 6 | logo: _static/tskit_logo.svg 7 | 8 | # Force re-execution of notebooks on each build. 9 | # See https://jupyterbook.org/content/execute.html 10 | execute: 11 | execute_notebooks: cache 12 | timeout: 300 13 | 14 | # Information about where the book exists on the web 15 | repository: 16 | url: https://github.com/tskit-dev/tutorials 17 | branch: main 18 | 19 | launch_buttons: 20 | binderhub_url: "" 21 | 22 | # Add GitHub buttons to your book 23 | # See https://jupyterbook.org/customize/config.html#add-a-link-to-your-repository 24 | html: 25 | use_issues_button: true 26 | use_repository_button: true 27 | 28 | only_build_toc_files: true 29 | 30 | sphinx: 31 | extra_extensions: 32 | - sphinx.ext.todo 33 | - sphinx.ext.intersphinx 34 | - IPython.sphinxext.ipython_console_highlighting 35 | config: 36 | html_theme: sphinx_book_theme 37 | html_theme_options: 38 | pygments_dark_style: monokai 39 | todo_include_todos: true 40 | intersphinx_mapping: 41 | py: ["https://docs.python.org/3", null] 42 | tskit: ["https://tskit.dev/tskit/docs/stable", null] 43 | tszip: ["https://tskit.dev/tszip/docs/latest/", null] 44 | msprime: ["https://tskit.dev/msprime/docs/stable", null] 45 | pyslim: ["https://tskit.dev/pyslim/docs/stable", null] 46 | numpy: ["https://numpy.org/doc/stable/", null] 47 | ipython: ["https://ipython.readthedocs.io/en/stable/", null] 48 | myst_enable_extensions: 49 | - colon_fence 50 | - deflist 51 | - dollarmath 52 | -------------------------------------------------------------------------------- /_static/ARG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/ARG.png -------------------------------------------------------------------------------- /_static/tskit_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | image/svg+xml -------------------------------------------------------------------------------- /_static/tute-constant-migration-svg-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-constant-migration-svg-out.png -------------------------------------------------------------------------------- /_static/tute-divergence-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-1.png -------------------------------------------------------------------------------- /_static/tute-divergence-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-2.png -------------------------------------------------------------------------------- /_static/tute-divergence-svg-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-divergence-svg-out.png -------------------------------------------------------------------------------- /_static/tute-mass-migration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-mass-migration.png -------------------------------------------------------------------------------- /_static/tute-parameter-changes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-parameter-changes.png -------------------------------------------------------------------------------- /_static/tute-population-structure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/_static/tute-population-structure-1.png -------------------------------------------------------------------------------- /_toc.yml: -------------------------------------------------------------------------------- 1 | format: jb-book 2 | root: intro 3 | parts: 4 | - caption: 5 | chapters: 6 | - file: what_is 7 | - caption: Tree sequence basics 8 | chapters: 9 | - file: terminology_and_concepts 10 | - file: getting_started 11 | - caption: Statistics and Analysis 12 | chapters: 13 | - file: analysing_tree_sequences 14 | - file: analysing_trees 15 | - file: incremental_algorithms 16 | - file: counting_topologies 17 | - file: parallelization 18 | - caption: Further tskit tutorials 19 | chapters: 20 | - file: tables_and_editing 21 | - file: simplification 22 | - file: viz 23 | - file: metadata 24 | - file: args 25 | - caption: Simulation 26 | chapters: 27 | - file: simulation_overview 28 | - file: no_mutations 29 | - file: advanced_msprime 30 | sections: 31 | - file: demography 32 | - file: bottlenecks 33 | - file: introgression 34 | - file: completing_forward_sims 35 | - file: forward_sims 36 | - file: more_forward_sims 37 | - caption: Other languages 38 | # TODO: add basic C and maybe Rust tutes 39 | chapters: 40 | - file: tskitr 41 | - caption: Development 42 | # TODO This would be a great place to put the general tskit development 43 | # guidelines. 44 | chapters: 45 | - file: tutorial_development 46 | - caption: Tskit for ... 
47 | chapters: 48 | - file: popgen.md 49 | - file: phylogen.md 50 | -------------------------------------------------------------------------------- /advanced_msprime.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | (sec_msprime)= 15 | 16 | # Advanced {program}`msprime` topics 17 | 18 | These are advanced topics in [msprime](https://tskit.dev/msprime) or examples of how to 19 | do some particular things with it. This chapter is broken down into the following 20 | sections: 21 | 22 | ```{tableofcontents} 23 | ``` 24 | 25 | For discussion of other advanced msprime topics, you might also want to look at 26 | the [msprime discussion forums](https://github.com/tskit-dev/msprime/discussions). 27 | -------------------------------------------------------------------------------- /analysing_tree_sequences.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | ```{code-cell} ipython3 18 | :tags: [remove-cell] 19 | import msprime 20 | import numpy as np 21 | import tskit 22 | 23 | def computing_statistics(): 24 | ts = msprime.simulate( 25 | 10**4, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10**7, random_seed=42) 26 | ts.dump("data/computing_statistics.trees") 27 | 28 | def afs(): 29 | ts = msprime.simulate(6, mutation_rate=1, random_seed=47) 30 | # remove the mutation times so the plot is nicer 31 | tables = ts.dump_tables() 32 | tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME) 33 | ts = tables.tree_sequence() 34 | ts.dump("data/afs.trees") 35 | 36 | 37 | def create_notebook_data(): 38 | computing_statistics() 39 | afs() 40 | 41 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 42 | ``` 43 | 44 | (sec_analysing_tree_sequences)= 45 | 46 | # _Analysing tree sequences_ 47 | % remove underscores in title when tutorial is complete or near-complete 48 | 49 | :::{note} 50 | This tutorial is a work in progress. 51 | ::: 52 | 53 | 54 | (sec_tutorial_stats)= 55 | 56 | ## Computing statistics 57 | 58 | Tskit provides an extensive and flexible interface for computing population 59 | genetic statistics, which is documented in detail in the 60 | {ref}`general statistics ` section of the offical documentation. 61 | This tutorial aims to give a quick overview of how the APIs work and how to use 62 | them effectively. 63 | 64 | First, let's load a tree sequence to work with which has roughly human 65 | parameters for 10 thousand samples and 10Mb chromosomes: 66 | 67 | ```{code-cell} ipython3 68 | ts = tskit.load("data/computing_statistics.trees") 69 | ts 70 | ``` 71 | 72 | This tree sequence has ~36.6 thousand trees & ~39 thousand segregating sites. 73 | We'd now like to compute some statistics on this dataset. 74 | 75 | ### One-way statistics 76 | 77 | We refer to statistics that are defined with respect to a single set of 78 | samples as "one-way". 
An example of such a statistic is diversity, which
79 | is computed using the {meth}`TreeSequence.diversity` method:
80 |
81 | ```{code-cell} ipython3
82 | d = ts.diversity()
83 | print(f"Average diversity per unit sequence length = {d:.3G}")
84 | ```
85 |
86 | This tells us the average diversity across the whole sequence and returns a single
87 | number. We'll usually want to compute statistics in
88 | {ref}`windows ` along the genome and we
89 | use the ``windows`` argument to do this:
90 |
91 | ```{code-cell} ipython3
92 | windows = np.linspace(0, ts.sequence_length, num=5)
93 | d = ts.diversity(windows=windows)
94 | print(windows, d, sep="\n")
95 | ```
96 |
97 | The ``windows`` argument takes a numpy array specifying the breakpoints
98 | along the genome. Here, we use numpy to create four equally spaced windows
99 | of size 2.5 megabases (the windows array contains k + 1 elements to define
100 | k windows). Because we have asked for values in windows, tskit now returns
101 | a numpy array rather than a single value. (See
102 | {ref}`sec_stats_output_dimensions` for a full description of how the output
103 | dimensions of statistics are determined by the ``windows`` argument.)
104 |
105 | Suppose we wanted to compute diversity within a specific subset of samples.
106 | We can do this using the ``sample_sets`` argument:
107 |
108 | ```{code-cell} ipython3
109 | A = ts.samples()[:100]
110 | d = ts.diversity(sample_sets=A)
111 | print(d)
112 | ```
113 |
114 | Here, we've computed the average diversity within the first hundred samples across
115 | the whole genome. As we've not specified any windows, this is again a single value.
116 |
117 | We can also compute diversity in *multiple* sample sets at the same time by providing
118 | a list of sample sets as an argument:
119 |
120 | ```{code-cell} ipython3
121 | A = ts.samples()[:100]
122 | B = ts.samples()[100:200]
123 | C = ts.samples()[200:300]
124 | d = ts.diversity(sample_sets=[A, B, C])
125 | print(d)
126 | ```
127 |
128 | Because we've computed multiple statistics concurrently, tskit returns a numpy array
129 | of these statistics. We have asked for diversity within three different sample sets,
130 | and tskit therefore returns an array with three values. (In general, the
131 | dimensions of the input determine the dimensions of the output: see
132 | {ref}`tskit:sec_stats_output_dimensions` for a detailed description of the rules.)
133 |
134 | We can also compute multiple statistics in multiple windows:
135 |
136 | ```{code-cell} ipython3
137 | d = ts.diversity(sample_sets=[A, B, C], windows=windows)
138 | print("shape = ", d.shape, "\n", d)
139 | ```
140 |
141 | We have computed diversity within three different sample sets across four
142 | genomic windows, and our output is therefore a 2D numpy array with four
143 | rows and three columns: each row contains the diversity values within
144 | A, B and C for a particular window.
145 |
146 | ### Multi-way statistics
147 |
148 | Many population genetic statistics compare multiple sets of samples to
149 | each other. For example, the {meth}`TreeSequence.divergence` method computes
150 | the divergence between two subsets of samples:
151 |
152 | ```{code-cell} ipython3
153 | A = ts.samples()[:100]
154 | B = ts.samples()[100:200]
155 | d = ts.divergence([A, B])
156 | print(d)
157 | ```
158 |
159 | The divergence between two sets of samples A and B is a single number,
160 | and we again return a single floating point value as the result.
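
Other multi-way statistics use the same sample-sets interface. For instance, here is a minimal sketch computing {meth}`TreeSequence.Fst` between the two sample sets defined above:

```{code-cell} ipython3
fst = ts.Fst([A, B])
print(fst)
```
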
We can also 161 | compute this in windows along the genome, as before: 162 | 163 | ```{code-cell} ipython3 164 | d = ts.divergence([A, B], windows=windows) 165 | print(d) 166 | ``` 167 | 168 | Again, as we have defined four genomic windows along the sequence, the result is 169 | numpy array with four values. 170 | 171 | A powerful feature of tskit's stats API is that we can compute the divergences 172 | between multiple sets of samples simultaneously using the ``indexes`` argument: 173 | 174 | 175 | ```{code-cell} ipython3 176 | d = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)]) 177 | print(d) 178 | ``` 179 | 180 | Here, we've specified three sample sets A, B and C and we've computed the 181 | divergences between A and B, and between A and C. The ``indexes`` argument is used 182 | to specify which pairs of sets we are interested in. In this example 183 | we've computed two different divergence values and the output is therefore 184 | a numpy array of length 2. 185 | 186 | As before, we can combine computing multiple statistics in multiple windows 187 | to return a 2D numpy array: 188 | 189 | ```{code-cell} ipython3 190 | windows = np.linspace(0, ts.sequence_length, num=5) 191 | d = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows) 192 | print(d) 193 | ``` 194 | 195 | Each row again corresponds to a window, which contains the average divergence 196 | values between the chosen sets. 197 | 198 | If the ``indexes`` parameter is 1D array, we interpret this as specifying 199 | a single statistic and remove the empty outer dimension: 200 | 201 | ```{code-cell} ipython3 202 | d = ts.divergence([A, B, C], indexes=(0, 1)) 203 | print(d) 204 | ``` 205 | 206 | It's important to note that we don't **have** to remove empty dimensions: tskit 207 | will only do this if you explicitly ask it to. Here, for example, we can keep the 208 | output as an array with one value if we wish: 209 | 210 | ``` 211 | d = ts.divergence([A, B, C], indexes=[(0, 1)]) 212 | print(d) 213 | ``` 214 | 215 | Please see {ref}`tskit:sec_stats_sample_sets` for a 216 | full description of the ``sample_sets`` and ``indexes`` arguments. 217 | 218 | (sec_tutorial_afs)= 219 | 220 | ## Allele frequency spectra 221 | 222 | The allele frequency spectrum is a fundamental tool in population genetics, and 223 | tskit provides a flexible and powerful approach to computing such spectra. 224 | Suppose we have simulated the following tree sequence: 225 | 226 | ```{code-cell} ipython3 227 | from IPython.display import display 228 | ts = tskit.load("data/afs.trees") 229 | tree = ts.first() 230 | display(tree.draw_svg()) 231 | ts.tables.sites 232 | ``` 233 | 234 | Computing the allele frequency spectrum is then easy: 235 | 236 | ```{code-cell} ipython3 237 | afs = ts.allele_frequency_spectrum(polarised=True, span_normalise=False) 238 | print(afs) 239 | ``` 240 | 241 | This tells us that we have two singletons, six doubletons and one 3-ton and 242 | one 4-ton. Note that the first element of the returned AFS array does *not* correspond 243 | to the singletons (see below for why). Because we have simulated these mutations, 244 | we know the ancestral and derived states we have set ``polarised`` to True. We 245 | can get the "folded" AFS by setting polarised to False. Because we want simple 246 | counts here and not averaged values, we set ``span_normalise=False``: by 247 | default, windowed statistics are divided by the sequence length, so they are 248 | comparable between windows. 
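
The folded spectrum mentioned above can be obtained in the same way; since ``polarised=False`` is the default, the argument can simply be omitted (a minimal example):

```{code-cell} ipython3
folded_afs = ts.allele_frequency_spectrum(span_normalise=False)  # polarised=False, i.e. the folded AFS
print(folded_afs)
```
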
249 | 250 | The returned value here is actually a 2D array, and this is because we can 251 | also perform these computations in windows along the genome: 252 | 253 | ```{code-cell} ipython3 254 | afs = ts.allele_frequency_spectrum(windows=[0, 0.5, 1], span_normalise=False, polarised=True) 255 | print(afs) 256 | ``` 257 | 258 | This time, we've asked for the number of sites at each frequency in two 259 | equal windows. Now we can see that in the first half of the sequence we 260 | have three sites (compare with the site table above): one singleton, 261 | one doubleton and one tripleton. 262 | 263 | ### Joint spectra 264 | 265 | We can also compute allele frequencies within multiple sets of samples, 266 | the *joint allele frequency spectra*. 267 | 268 | ```{code-cell} ipython3 269 | node_colours = {0: "blue", 2: "blue", 3: "blue", 1: "green", 4: "green", 5: "green"} 270 | styles = [f".n{k} > .sym {{fill: {v}}}" for k, v in node_colours.items()] 271 | tree.draw_svg(style = "".join(styles)) 272 | ``` 273 | 274 | Here we've marked the samples as either blue or green (we can imagine 275 | these belonging to different populations, for example). We can then compute 276 | the joint AFS based on these two sets: 277 | 278 | ```{code-cell} ipython3 279 | afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True) 280 | print(afs) 281 | ``` 282 | 283 | Now, each window in our AFS is a 2D numpy array, where each dimension 284 | corresponds to frequencies within the different sets. So, we see for example 285 | that there are six sites that are singletons in both sets, 1 site 286 | that is a doubleton in both sets, and 2 sites that singletons in $[1, 4, 5]$ 287 | and not present in the other sample set. 288 | 289 | ### Branch length spectra 290 | 291 | Up to now we've used the {meth}`~TreeSequence.allele_frequency_spectrum` method 292 | to summarise the number of sites that occur at different frequencies. We can also 293 | use this approach to compute the total branch lengths subtending a given 294 | number of samples by setting ``mode="branch"``: 295 | 296 | ```{code-cell} ipython3 297 | afs = ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False) 298 | print(afs) 299 | ``` 300 | 301 | Thus, the total branch length over example one sample is 4.86, over two is 302 | 5.39, and so on. 303 | 304 | 305 | (sec_tutorial_afs_zeroth_entry)= 306 | 307 | ### Zeroth and final entries in the AFS 308 | 309 | The zeroth element of the AFS is significant when we are working with 310 | sample sets that are a subset of all samples in the tree sequence. 311 | For example, in the following we compute the AFS within the sample set 312 | [0, 1, 2]: 313 | 314 | ```{code-cell} ipython3 315 | afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True) 316 | print(afs) 317 | ``` 318 | 319 | Thus, the total branch length over 0, 1 and 2 is 5.3, and over pairs from this set 320 | is 5.25. What does the zeroth value of 4.33 signify? This is the total branch length 321 | over all samples that are **not** in this sample set. By including this value, we 322 | maintain the property that for each tree, the sum of the AFS for any sample set 323 | is always equal to the total branch length. 
For example, here we compute: 324 | 325 | ```{code-cell} ipython3 326 | print("sum afs = ", np.sum(afs)) 327 | print("total branch len = ", tree.total_branch_length) 328 | ``` 329 | 330 | The final entry of the AFS is similar: it counts alleles (for mode="site") or 331 | branches (for mode="branch") that are ancestral to all of the given sample set, 332 | but are still polymorphic in the entire set of samples of the tree sequence. 333 | Note, however, that alleles fixed among all the samples, e.g., ones above 334 | the root of the tree, will not be included. 335 | -------------------------------------------------------------------------------- /bottlenecks.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | (sec_msprime_bottlenecks)= 15 | 16 | # Instantaneous Bottlenecks 17 | 18 | **Konrad Lohse and Jerome Kelleher** 19 | 20 | A common approach to modelling the effect of demographic history on genealogies is to assume that effective population size ($N_e$) changes in discrete steps which define a series of epochs (Griffiths, 1994; Marth et al., 2004; Keightley & Eyre-Walker, 2007; Li & Durbin 2011). In this setting of piece-wise constant $N_e$, capturing a population bottleneck requires three epochs: $N_e$ is reduced by some fraction $b$ at the start of the bottleneck, $T_{start}$, and recovers to its initial value at time $T_{end}$ (Marth et al., 2004). If bottlenecks are short both on the timescale of coalescence and mutations, one may expect little information about the duration of a bottleneck 21 | in sequence data. Thus a simpler, alternative model is to assume that bottlenecks are instantaneous ($T_{end}-T_{start} \rightarrow 0$) and generate a sudden burst of coalescence events (a multiple merger event) in the genealogy. The strength of the bottleneck B can be thought of as an (imaginary) time period during which coalescence events are collapsed, i.e. there is no growth in genealogical branches during B and the probability that a single pair of lineages entering the bottleneck coalesce during the bottleneck is $1-e^{-B}$. Although this simple two parameter model of bottlenecks is attractive and both analytic results and empirical inference (Griffiths, 1994; Galtier et al., 2000; Bunnefeld et al., 2015) have been developed under this model, it has not been straightforward to simulate data under instantaneous bottleneck histories with ms. Instantaneous bottlenecks are implemented as a demographic event in msprime. Instanteous bottlenecks are similar to coalescent approximations for selective sweeps in that they generate multiple merger events. However, unlike sweeps the whole genome is effected equally. Instantanous bottlenecks differ from approximate models of sweeps (Durret & Scheinsberg, 2004) in that they do not affect the exchangeability of lineages. A consequence of this is that all topologies are equally likely (before collapsing, the imaginary bottleneck time). 22 | 23 | Here we simulate a single sample of n=12 from a population that underwent an instantaneous bottleneck 0.4 * 2N generations ago. 
Note that since msprime is assuming a diploid population we set the initial population size to 1/2 to obtain coalescence times scaled in units of $2N_e$ generations: 24 | 25 | ```{code-cell} ipython3 26 | import msprime 27 | import tskit 28 | import numpy as np 29 | import matplotlib.pyplot as plt 30 | from IPython.display import SVG 31 | 32 | def run_bott_sims(num_samp, time, strength, num_rep=None, seed=123): 33 | demography = msprime.Demography() 34 | demography.add_population(initial_size=1/2) 35 | demography.add_instantaneous_bottleneck(time=time, strength=strength, population=0) 36 | return msprime.sim_ancestry( 37 | samples=[msprime.SampleSet(num_samp, ploidy=1)], # Sample num_samp haploid genomes 38 | ploidy=2, 39 | num_replicates=num_rep, 40 | demography=demography, 41 | random_seed=seed) 42 | 43 | bottT = 0.4 44 | bottB = 2 45 | 46 | ts = run_bott_sims(12, time=bottT, strength=bottB) 47 | 48 | ts.draw_svg(y_axis=True, size=(400, 400)) 49 | ``` 50 | 51 | The genealogy shows several simultaneous coalescence events at the time of the bottleneck 52 | (T=0.4) 53 | 54 | 55 | ## Checking the SFS against analytic expectation 56 | 57 | Bunnefeld et al (2016) derive the total length of n-ton branches under an instantaneous bottleneck using a recursion for the generating function of genealogies. Assuming a sample size of $n=4$, $B=4$ and $T=0.4$, the SFS is: 58 | 59 | 60 | ```{code-cell} ipython3 61 | T=bottT 62 | B = 4 63 | s=1-np.exp(-B) 64 | p=s*(-6 + 15*s - 20 * np.power(s,2) + 15 * np.power(s,3) - 6 * np.power(s,4) + np.power(s,5)) 65 | 66 | expsfsBottlN= [2/15*(np.exp(-6*T)*(15 *np.exp(6*T) - 9 *np.exp(5*T)*s - 67 | 5*np.exp(3*T)*s*(3 - 3*s + np.power(s,2)) + p)), 68 | 1/5*np.exp(-6*T)*(5*np.exp(6*T) - 6*np.exp(5*T)*s - p), 69 | 2/15*np.exp(-6*T)*(5*np.exp(6*T) - 9*np.exp(5*T)*s + 5*np.exp(3*T)*s*(3-3*s + np.power(s,2))+ p)] 70 | 71 | expsfsBottlN/=np.sum(expsfsBottlN) 72 | print(expsfsBottlN) 73 | ``` 74 | 75 | Checking against msprime (10,000 replicates) shows a close fit to this analytic expectation: 76 | 77 | ```{code-cell} ipython3 78 | nrep = 10_000 79 | nsamp = 4 80 | sims = run_bott_sims(nsamp, time=T, strength=B, num_rep=nrep) 81 | Blist=np.zeros((nrep, nsamp+1)) 82 | for rep_index, ts in enumerate(sims): 83 | afs=ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False) 84 | Blist[rep_index]+= afs 85 | 86 | data=np.mean(Blist, axis=0) 87 | data/=np.sum(data) 88 | 89 | fig, ax = plt.subplots() 90 | index = np.arange(1,4) 91 | bar_width = 0.4 92 | opacity = 0.9 93 | 94 | expsfs = ax.bar(index+ bar_width, expsfsBottlN, bar_width, alpha=opacity, label='exp') 95 | simsfs = ax.bar(index+ 2*bar_width, data[1:4], bar_width, alpha=opacity, label='exp') 96 | ``` 97 | 98 | ## Expected coalescence times 99 | 100 | The expected pairwise coalescence time is $E[t]= 1 + e^{-(B+T)} - e^{-T}$. 
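
This follows by conditioning on whether the pair coalesces before the bottleneck, is collapsed by it at time $T$, or survives it. As a quick numerical check, we can evaluate the formula for the values $T=0.4$ and $B=2$ used in the first simulation above:

```{code-cell} ipython3
1 + np.exp(-(bottB + bottT)) - np.exp(-bottT)  # expected pairwise coalescence time for T=0.4, B=2
```
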
Both the expected 101 | coalescence time and the probability that a pair of lineages survives the bottleneck fit 102 | the analytic expectation closely: 103 | 104 | ```{code-cell} ipython3 105 | 106 | def pairCoalBott(time, strength, num_rep=1000): 107 | """ 108 | Simulates replicate 2-tip tree sequences under an instantaneous bottleneck returning the TMRCA 109 | """ 110 | reps=run_bott_sims(2, time=time, strength=strength, num_rep=num_rep) 111 | B = np.zeros(num_rep) 112 | for j, ts in enumerate(reps): 113 | tree = next(ts.trees()) 114 | B[j] = tree.time(tree.root) 115 | return(B) 116 | 117 | nrep=1000 118 | 119 | #Recording the mean pairwise coalescence times and the fraction of replicates with t>T for a grid of bottleneck strengths: 120 | bottBlist = np.arange(0.0,5,0.25) 121 | dat = np.zeros(len(bottBlist)) 122 | prob = np.zeros(len(bottBlist)) 123 | for j in range(len(bottBlist)): 124 | mrcas = pairCoalBott(bottT, bottBlist[j], nrep) 125 | dat[j]=np.mean(mrcas, axis=0) 126 | prob[j]=len(mrcas[mrcas>bottT])/nrep 127 | 128 | bottT2 = bottT * 2 # 0.8 129 | dat2 = np.zeros(len(bottBlist)) 130 | prob2 = np.zeros(len(bottBlist)) 131 | for j in range(len(bottBlist)): 132 | mrcas = pairCoalBott(bottT2, bottBlist[j], nrep) 133 | dat2[j]=np.mean(mrcas, axis=0) 134 | prob2[j]=len(mrcas[mrcas>bottT2])/nrep 135 | 136 | expProb = [np.exp(-(bottT+i)) for i in bottBlist] 137 | expProb2 = [np.exp(-(bottT2+i)) for i in bottBlist] 138 | 139 | expMean = [1+np.exp(-(bottT+i))-np.exp(-bottT) for i in bottBlist] 140 | expMean2 = [1+np.exp(-(bottT2+i))-np.exp(-bottT2) for i in bottBlist] 141 | ``` 142 | 143 | ```{code-cell} ipython3 144 | 145 | plt.plot(bottBlist, expProb, c='brown', ls=":", lw=4, label=f"T={bottT}, theoretical") 146 | plt.plot(bottBlist, prob, c='red', marker='+', ms=8, label=f"T={bottT}, sim") 147 | plt.plot(bottBlist, expProb2, c='navy', ls=":", lw=4, label=f"T={bottT2}, theoretical") 148 | plt.plot(bottBlist, prob2, c='blue', marker='+', lw=1, ms=8, label=f"T={bottT2}, simulated") 149 | plt.xlabel("Bottleneck Strength B") 150 | plt.ylabel("p(t>T)") 151 | plt.legend() 152 | plt.show() 153 | 154 | plt.plot(bottBlist, expMean, c='brown', ls=":", lw=4, label=f"T={bottT}, theoretical") 155 | plt.plot(bottBlist, dat, c='red', marker='+', ms=8, label=f"T={bottT}, simulated") 156 | plt.plot(bottBlist, expMean2, c='navy', ls=":", lw=4, label=f"T={bottT2}, theoretical") 157 | plt.plot(bottBlist, dat2, c='blue', marker='+', ms=8, label=f"T={bottT2}, simulated") 158 | plt.xlabel("Bottleneck Strength B"); 159 | plt.ylabel("E[t]") 160 | plt.legend() 161 | plt.show() 162 | ``` 163 | 164 | ## The distribution of pairwise coalescence times 165 | 166 | The distribution of pairwise coalescence times has two maxima at $t=0$ and the bottleneck time $t=T$ (we have assumed $T=0.8$ below) as expected. 
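
In outline, the reasoning is as follows (a sketch using the same time scale as above): a pair of lineages coalesces at rate 1 before the bottleneck, any pair still distinct at $t=T$ coalesces there with probability $1-e^{-B}$, and surviving pairs continue to coalesce at rate 1 afterwards. The density of the pairwise coalescence time is therefore

$$
f(t) =
\begin{cases}
e^{-t} & t < T,\\
e^{-B}\, e^{-t} & t > T,
\end{cases}
$$

together with a point mass of $e^{-T}\bigl(1-e^{-B}\bigr)$ at $t = T$; this is the piecewise form of the analytic curve plotted below.
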
The simulated distribution of pairwise coalescence times fits the analytic expectation: 167 | 168 | ```{code-cell} ipython3 169 | s = bottBlist[3] 170 | sprob = 1-np.exp(-s) 171 | 172 | coaldis = pairCoalBott(s, bottT2, 10000); 173 | coaldisFilt = coaldis[(coaldis < 3)]; 174 | 175 | probtest = len(coaldis[coaldis>bottT2])/10000 176 | expprob2 = np.exp(-(bottT2+s)) 177 | 178 | tlist = np.arange(0.0,3,0.25); 179 | coalEpx1 = [np.exp(-i) for i in tlist[0:4]]; 180 | coalEpx2 = [np.exp(-i)-(sprob*np.exp(-i)) for i in tlist[3:]]; 181 | 182 | plt.plot(tlist[0:4], coalEpx1, color = 'black', linewidth = 2); 183 | plt.plot(tlist[3:], coalEpx2, color = 'black', linewidth = 2); 184 | plt.hist(coaldisFilt, bins = 20, density=True); 185 | plt.xlabel("t"); 186 | plt.ylabel("f(t)") 187 | plt.show() 188 | ``` 189 | 190 | ## Approximating the site frequency spectrum 191 | 192 | Bottlenecks can have a substantial effect on the site frequency spectrum (SFS). The SFS is a fundamental summary of sequence variation that forms the basis of many modern inference approaches (e.g. sweepfinder, DFE-alpha, dadi). in the absence of linkage information the SFS is a lossless summary, i.e. any summary of sequence variation that ignores linkage (e.g. pairwise measures of diversity and divergence, $F_{st}$, Tajima's D etc) are summaries of the SFS (Achaz 2009). The SFS is convenient analytically, since it depends only on the mean length and frequency of genealogical branches. For many demographic models of interest the mean length of n-ton branches can be derived either using coalescent theory (Chen 2011) or diffusion equations (Gutenkunst 2009). A number of composite likelihood approaches have been developed based on either analytic results for the SFS or approximate obtained from coalescent simulations (Gutenkunst 2009, Excoffier 2002). We can use msprime to obtain the SFS for a sample of $n=20$ for a range of bottleneck strengths: 193 | 194 | ```{code-cell} ipython3 195 | def bottSFS(num_samp, time, strength, num_rep): 196 | reps = run_bott_sims(num_samp, time=time, strength=strength, num_rep=num_rep) 197 | Blist = np.zeros((num_rep, num_samp+1)) 198 | for rep_index, ts in enumerate(reps): 199 | afs=ts.allele_frequency_spectrum(mode="branch", polarised=True, span_normalise=False) 200 | Blist[rep_index]+= afs 201 | data = np.mean(Blist, axis=0) 202 | data /=np.sum(data) 203 | return data[0:num_samp] 204 | 205 | nrep = 5_000 206 | nsamp = 12 207 | bottT = 0.8 208 | 209 | bottBlist = np.arange(0.0,5,1) 210 | datalist = {} 211 | for s in bottBlist: 212 | datalist[s]= bottSFS(nsamp, bottT, s, nrep) 213 | ``` 214 | 215 | With increasing bottleneck strength the SFS becomes increasingly skewed (the leftmost blue bars show the SFS for a population of constant size). However, bottlenecks have a complex effect on the different frequency classes of the SFS: while the relative frequency of singletons increases, other frequency classes (e.g. 
doubletons) have a non-monotonic relationship with B: 216 | 217 | ```{code-cell} ipython3 218 | bar_width=0.18 219 | index = np.arange(1, nsamp) 220 | j = 0 221 | for ss, y in datalist.items(): 222 | plt.bar(index + j * bar_width, y[1:], bar_width, label=str(ss)) 223 | j += 1 224 | ``` 225 | 226 | ## The marginal distribution of n-ton branch lengths 227 | 228 | We may be interested in the marginal distributions (pdf) of branch lengths immediately 229 | above a node with n tips, which determines the expected distribution of n-ton mutations in a 230 | nonrecombining block of sequence (Bunnefeld et al., 2016). Like the distribution of 231 | pairwise coalescence times, we expect the pdf of n-ton branches to be discontinuous. 232 | Assuming n=4, B=0.75 and T=0.8 (as above): 233 | 234 | ```{code-cell} ipython3 235 | bottB = 0.8 236 | bottT = 0.75 237 | numrep = 20_000 238 | nsamp = 4 239 | sims = run_bott_sims(nsamp, bottT, bottB, num_rep=numrep) 240 | B = np.zeros((numrep, nsamp)) 241 | for rep_index, ts in enumerate(sims): 242 | tree = next(ts.trees()) 243 | for u in tree.nodes(): 244 | nleaves = tree.num_samples(u) 245 | if tree.parent(u) != tskit.NULL: 246 | B[rep_index, nleaves] += tree.branch_length(u) # Branch length above this node 247 | 248 | Btrans=np.array(B).T.tolist() 249 | ``` 250 | 251 | ```{code-cell} ipython3 252 | plt.hist([x for x in sorted(Btrans[2]) if x < 2], bins = 50, density=True, label="doubletons"); 253 | plt.hist([x for x in sorted(Btrans[3]) if x < 2], bins = 50, density=True, label="tripletons"); 254 | plt.ylim(0, 3) 255 | plt.ylabel("Probability") 256 | plt.xlabel("Branch length") 257 | plt.legend() 258 | plt.show() 259 | ``` 260 | 261 | Again, this gives a good fit to the analytic expectation (see Bunnefeld at al 2016). 262 | 263 | ## Multiple populations 264 | 265 | The ``InstantaneousBottleneck`` model does not work without specifying any populations. Measuring bottleneck strength in generations (i.e. an imaginary time of coalescence) has a subtle but important consequence when we consider samples from multiple populations: the effect of the bottleneck on lineages present in each population at time T depends on the size of each population: genealogies in small populations are more strongly affected. 266 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Jupyter-build doesn't have an option to automatically show the 4 | # saved reports, which makes it difficult to debug the reasons for 5 | # build failures in CI. This is a simple wrapper to handle that. 6 | 7 | REPORTDIR=_build/html/reports 8 | 9 | jupyter-book build -W -n --keep-going . 10 | RETVAL=$? 
11 | if [ $RETVAL -ne 0 ]; then 12 | if [ -e $REPORTDIR ]; then 13 | echo "Error occured; showing saved reports" 14 | cat $REPORTDIR/* 15 | fi 16 | else 17 | # Clear out any old reports 18 | rm -f $REPORTDIR/* 19 | fi 20 | exit $RETVAL 21 | -------------------------------------------------------------------------------- /completing_forward_sims.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | kernelspec: 7 | display_name: Python 3 8 | language: python 9 | name: python3 10 | --- 11 | 12 | (sec_completing_forwards_simulations)= 13 | 14 | # Completing forwards simulations 15 | 16 | The ``msprime`` simulator generates tree sequences using the backwards in 17 | time coalescent model. But it is also possible to output tree sequences 18 | from [forwards-time](https://doi.org/10.1371/journal.pcbi.1006581) 19 | simulators such as [SLiM](https://messerlab.org/slim) 20 | and [fwdpy11](https://fwdpy11.readthedocs.io/) (see the 21 | {ref}`sec_tskit_forward_simulations` tutorial). 22 | There are many advantages to using forward-time simulators, but they 23 | are usually quite slow compared to similar coalescent simulations. In this 24 | section we show how to combine the best of both approaches by simulating 25 | the recent past using a forwards-time simulator and then complete the 26 | simulation of the ancient past using ``msprime``. (We sometimes refer to this 27 | "recapitation", as we can think of it as adding a "head" onto a tree sequence.) 28 | 29 | First, we define a simple Wright-Fisher simulator which returns a tree sequence 30 | with the properties that we require (please see the 31 | {ref}`msprime documentation ` 32 | for a formal description of these properties): 33 | 34 | ```{code-cell} ipython3 35 | import tskit 36 | import msprime 37 | import random 38 | import numpy as np 39 | 40 | 41 | def wright_fisher(N, T, L=100, random_seed=None): 42 | """ 43 | Simulate a Wright-Fisher population of N haploid individuals with L discrete 44 | loci for T generations, with one recombination per transmission event 45 | Based on Algorithm W from https://doi.org/10.1371/journal.pcbi.1006581 46 | """ 47 | random.seed(random_seed) 48 | tables = tskit.TableCollection(L) 49 | tables.time_units = "generations" 50 | tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json() 51 | tables.populations.add_row() 52 | P = np.arange(N, dtype=int) 53 | for _ in range(N): 54 | tables.nodes.add_row(time=T, population=0) 55 | t = T 56 | while t > 0: 57 | t -= 1 58 | Pp = P.copy() 59 | for j in range(N): 60 | u = tables.nodes.add_row(time=t, population=0) 61 | Pp[j] = u 62 | a = random.randint(0, N - 1) 63 | b = random.randint(0, N - 1) 64 | x = random.randint(1, L - 1) 65 | tables.edges.add_row(0, x, P[a], u) 66 | tables.edges.add_row(x, L, P[b], u) 67 | P = Pp 68 | 69 | tables.sort() 70 | # Simplify with respect to nodes at time zero (the current generation), using 71 | # `keep_input_roots`` to keep the ancient nodes from the initial population. 
72 |     tables.simplify(np.where(tables.nodes.time == 0)[0], keep_input_roots=True)
73 |     return tables.tree_sequence()
74 | ```
75 |
76 | We then run a tiny forward simulation of 10 two-locus individuals
77 | for 5 generations, and print out the resulting trees:
78 |
79 | ```{code-cell} ipython3
80 | num_loci = 2
81 | N = 10
82 | wf_ts = wright_fisher(N, 5, L=num_loci, random_seed=3)
83 | wf_ts.draw_svg()
84 | ```
85 |
86 | Because our Wright-Fisher simulation ran for only 5 generations, there has not
87 | been enough time for the trees to fully coalesce. Therefore, instead of having
88 | one root, the trees have several --- the first tree has 2 and the second 4.
89 | Nodes 16, 17, 18, and 19 in this simulation represent the members of the
90 | initial population of the simulation that have genetic descendants at the end
91 | of the simulation. These unary branches joining samples and coalesced subtrees
92 | to the nodes in the initial generation are essential as they allow us to
93 | correctly assemble the various fragments of ancestral material into chromosomes
94 | when creating the initial conditions for the coalescent simulation.
95 | (Please see the
96 | {ref}`msprime documentation `
97 | for more details on the
98 | required properties of input tree sequences.)
99 |
100 | The process of completing this tree sequence using a coalescent simulation
101 | begins by first examining the root segments on the input trees. We get the
102 | following segments:
103 |
104 | ```
105 | [(0, 2, 17), (0, 2, 18), (1, 2, 19), (1, 2, 16)]
106 | ```
107 |
108 | where each segment is a ``(left, right, node)`` tuple. As nodes 17 and 18 are
109 | present in both trees, they have segments spanning both loci. Nodes 16 and 19 are
110 | present only in the second tree, and so they have ancestral segments only for
111 | the second locus. Note that this means that we do *not* simulate the ancestry
112 | of the entire initial generation of the simulation, but rather the exact
113 | minimum that we need in order to complete the ancestry of the current
114 | generation. For instance, root ``19`` has not coalesced over the interval from
115 | ``1.0`` to ``2.0``, while root ``17`` has not coalesced over the entire segment
116 | from ``0.0`` to ``2.0``.
117 |
118 | We run the coalescent simulation to complete this tree sequence using the
119 | ``initial_state`` argument to {func}`msprime.sim_ancestry`. Because we have simulated a
120 | two-locus system with a recombination rate of ``1 / num_loci`` per generation
121 | in the Wright-Fisher model, we want to use the same system in the coalescent simulation.
122 | Note that we set the ``ploidy`` argument to 1 here because our forward-time simulation
123 | is haploid and msprime uses a diploid time scale by default.
124 |
125 |
126 | ```{code-cell} ipython3
127 | coalesced_ts = msprime.sim_ancestry(
128 |     population_size=N,
129 |     initial_state=wf_ts,
130 |     recombination_rate=1 / num_loci,
131 |     ploidy=1,
132 |     random_seed=7)
133 | coalesced_ts.draw_svg()
134 | ```
135 |
136 | The trees have fully coalesced and we've successfully combined a forwards-time
137 | Wright-Fisher simulation with a coalescent simulation: hooray!
138 |
139 |
140 | ## Why keep input roots (i.e., the initial generation)?
141 |
142 | We can now see why it is essential that we take care to preserve the roots of all
143 | trees when we simplified the tree sequence (by passing ``keep_input_roots=True``),
144 | so that the initial generation can be properly used as the
145 | ``initial_state`` argument to {func}`msprime.sim_ancestry`. In the example above, if node
146 | ``18`` was not in the tree sequence, we would not know that the segment that
147 | node ``10`` inherits from on ``[0.0, 1.0)`` and the segment that node ``2``
148 | inherits from on ``[1.0, 2.0)`` both exist in the same node.
149 |
150 | Note that although the portions of the initial generation (above, nodes ``16``, ``17``,
151 | ``18``, and ``19``) must be in the tree sequence, they do *not* have to be
152 | samples, and their entire genomes need not be represented (e.g., node ``19`` is
153 | only present on ``[1.0, 2.0)``). This allows {func}`msprime.sim_ancestry` to not simulate
154 | the entire history of the first generation, but only what is necessary to complete
155 | any uncoalesced trees. Happily, this is easily done with the ``keep_input_roots`` argument
156 | to {meth}`~tskit.TableCollection.simplify`. Note that this argument would need
157 | to be provided to the periodic {meth}`~tskit.TableCollection.simplify` steps
158 | which are essential in practical simulation, but that we skipped in the toy simulator above.
159 |
160 | In fact, this is precisely how tree sequence recording in [SLiM](https://messerlab.org/slim)
161 | works, and {func}`pyslim.recapitate` provides a front-end to
162 | the method presented here.
163 |
164 |
165 | ## Topology gotchas
166 |
167 | The trees that we output from this combined forwards and backwards simulation
168 | process have some slightly odd properties that are important to be aware of.
169 | In the example above, we can see that the old roots are still present in both trees,
170 | even though they have only one child and are clearly redundant.
171 | This is because the tables of ``initial_state`` have been retained, without modification,
172 | at the top of the tables of the output tree sequence. While this
173 | redundancy is not important for many tasks, there are some cases where
174 | it may cause problems:
175 |
176 | 1. When computing statistics on the number of nodes, edges or trees in a tree
177 |    sequence, having these unary edges and redundant nodes will slightly
178 |    inflate the values.
179 | 2. If you are computing the overall tree "height" by taking the time of the
180 |    root node, you may overestimate the height because there is a unary edge
181 |    above the "real" root (this would happen if one of the trees had already
182 |    coalesced in the forwards-time simulation).
183 |
184 | For these reasons it may be better to remove this redundancy from your
185 | computed tree sequence, which is easily done using the
186 | {meth}`simplify ` method:
187 |
188 | ```{code-cell} ipython3
189 | final_ts = coalesced_ts.simplify()
190 | final_ts.draw_svg()
191 | ```
192 |
193 | This final tree sequence is topologically identical to the original tree sequence,
194 | but has the redundant nodes and edges removed. Note also that the node IDs have been
195 | reassigned so that the samples are 0 to 9 --- if you need the IDs from the original
196 | tree sequence, please set ``map_nodes=True`` when calling ``simplify`` to get a
197 | mapping between the two sets of IDs.
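
For instance (a minimal sketch; the variable names here are purely illustrative):

```{code-cell} ipython3
simplified_ts, node_map = coalesced_ts.simplify(map_nodes=True)
# node_map[u] is the node ID in the simplified tree sequence that corresponds to
# node u in coalesced_ts, or tskit.NULL (-1) if that node was removed.
print(node_map)
```
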
198 | 199 | 200 | -------------------------------------------------------------------------------- /counting_topologies.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | (sec_counting_topologies)= 18 | 19 | ```{code-cell} ipython3 20 | :tags: [remove-cell] 21 | import msprime 22 | import stdpopsim 23 | 24 | def topologies_sim_speciestree(): 25 | newick_species_tree = "((A:100.0,B:100.0):100.0,C:200.0)" 26 | demography = msprime.Demography.from_species_tree(newick_species_tree, initial_size=100) 27 | ts = msprime.sim_ancestry({0: 2, 1: 2, 2: 2}, demography=demography, random_seed=321) 28 | ts.dump("data/topologies_sim_speciestree.trees") 29 | 30 | def topologies_sim_stdpopsim(): 31 | species = stdpopsim.get_species("HomSap") 32 | model = species.get_demographic_model("OutOfAfrica_3G09") 33 | contig = species.get_contig("chr1", length_multiplier=0.0002, mutation_rate=model.mutation_rate) 34 | samples = {"YRI": 1000, "CEU": 1000, "CHB": 1000} 35 | engine = stdpopsim.get_engine("msprime") 36 | ts = engine.simulate(model, contig, samples, seed=321) 37 | ts.dump("data/topologies_sim_stdpopsim.trees") 38 | 39 | 40 | def create_notebook_data(): 41 | topologies_sim_speciestree() 42 | topologies_sim_stdpopsim() 43 | 44 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 45 | ``` 46 | 47 | # Counting topologies 48 | 49 | **Yan Wong** 50 | 51 | This tutorial is intended to be a gentle introduction to the combinatorial 52 | treatment of tree topologies in `tskit`. For a more formal introduction, 53 | see the {ref}`sec_combinatorics` section of the 54 | official `tskit` {ref}`documentation`. 55 | 56 | The *topology* of a single tree is the term used to describe the branching pattern, 57 | regardless of the lengths of the branches. For example, both trees below have the 58 | same topology, although the branch lengths differ: 59 | 60 | ```{code-cell} 61 | import tskit 62 | node_labels = {0: "a", 1: "b", 2: "c"} # avoid confusion by using letters to label tips 63 | tree = tskit.Tree.generate_comb(3) 64 | display(tree.draw_svg(node_labels=node_labels, y_axis=True)) 65 | 66 | deep_tree = tskit.Tree.generate_comb(10).tree_sequence.simplify([0, 1, 2]).first() 67 | display(deep_tree.draw_svg(node_labels=node_labels, y_axis=True)) 68 | ``` 69 | 70 | :::{note} 71 | The treatment of topologies in `tskit` is restricted to trees with a single defined root, 72 | without nodes with a single child (i.e. trees must consist of nodes that are either leaves, 73 | or internal nodes with two or more children). For convenience in the examples 74 | below, trees are drawn with the tips flagged as samples, although whether a node is a sample or 75 | not does not change the topology of the tree. 
76 | ::: 77 | 78 | ## Tree labellings and shapes 79 | 80 | The topology of a tree also takes into account the labelling of tips, so that 81 | the trees below, although they have the same *shape*, count as three 82 | different topologies: 83 | 84 | ```{code-cell} 85 | :tags: [hide-input] 86 | from string import ascii_lowercase 87 | from IPython.display import SVG 88 | 89 | def str_none(s, prefix=None): 90 | if s is not None: 91 | if prefix is None: 92 | return str(s) 93 | else: 94 | return prefix + " = " + str(s) 95 | return None 96 | 97 | def draw_svg_trees(trees, node_labels={}, x_lab_attr=None, width=100, height=150, space=10): 98 | w = width + space 99 | h = height + space 100 | trees = list(trees) 101 | s = f'' 102 | s += f'' 103 | for i, tree in enumerate(trees): 104 | s += tree.draw_svg( 105 | size=(width, height), 106 | canvas_size=(w, h), 107 | root_svg_attributes={"x": i * w}, 108 | node_labels=node_labels, 109 | x_label=str_none(getattr(tree.rank(), x_lab_attr or "", None), x_lab_attr) 110 | ) 111 | s += '' 112 | return SVG(s) 113 | 114 | draw_svg_trees(tskit.all_tree_labellings(tree), node_labels={u: ascii_lowercase[u] for u in tree.samples()}) 115 | ``` 116 | 117 | These are, in fact, the only possible three labellings for a three-tip tree of that shape. 118 | There is only one other possible shape for a three-tip tree, and for this shape, 119 | all labelling orders are equivalent (in other words, there is only one 120 | possible labelling): 121 | 122 | ```{code-cell} 123 | :tags: [hide-input] 124 | tskit.Tree.generate_star(3).draw_svg(node_labels={}) 125 | ``` 126 | 127 | A 3-tip tree therefore has only four possible topologies. 128 | These can be generated with the {func}`~tskit.all_trees` function. 129 | 130 | ```{code-cell} 131 | generated_trees = tskit.all_trees(3) 132 | print("For a three-tip tree there are", len(list(generated_trees)), "labelled topologies.") 133 | ``` 134 | 135 | Here they are, plotted out with their shapes enumerated from zero: 136 | 137 | ```{code-cell} 138 | :tags: [hide-input] 139 | draw_svg_trees( 140 | tskit.all_trees(3), 141 | node_labels={u: ascii_lowercase[u] for u in tree.samples()}, 142 | x_lab_attr="shape" 143 | ) 144 | ``` 145 | 146 | ### Enumerating shapes and labellings 147 | 148 | For a tree with four tips, more topologies and shapes are possible. As before, we can generate the 149 | topologies using {func}`~tskit.all_trees`. Alternatively, if we only want the (unlabelled) shapes, 150 | we can use the {func}`~tskit.all_tree_shapes` function: 151 | 152 | ```{code-cell} 153 | print("For a four-tip tree there are", len(list(tskit.all_trees(4))), "labelled topologies.") 154 | 155 | generated_trees = tskit.all_tree_shapes(4) 156 | print("These can be categorised into", len(list(generated_trees)), "shapes.") 157 | ``` 158 | 159 | Again, we can give each shape a number or *index*, starting from zero: 160 | 161 | ```{code-cell} 162 | :tags: [hide-input] 163 | draw_svg_trees(tskit.all_tree_shapes(4), x_lab_attr="shape") 164 | ``` 165 | 166 | Each of these shapes will have a separate number of possible labellings, and trees with 167 | these labellings can be created using {func}`~tskit.all_tree_labellings`: 168 | 169 | ```{code-cell} 170 | for shape_index, tree in enumerate(tskit.all_tree_shapes(4)): 171 | labellings = tskit.all_tree_labellings(tree) 172 | num_labellings = len(list(labellings)) 173 | print( 174 | f"Tree shape {shape_index} for a four-tip tree has " 175 | f"{num_labellings} labelling{'' if num_labellings==1 else 's'}." 
176 | ) 177 | ``` 178 | 179 | Any tree topology for a tree of $N$ tips can therefore be described by a 180 | shape index combined with a labelling index. This is known as the 181 | *rank* of a tree, and it can be obtained using the 182 | {meth}`Tree.rank` method. For instance, here is the rank of a simulated tree 183 | of 10 tips: 184 | 185 | ```{code-cell} 186 | :tags: [hide-input] 187 | import msprime 188 | num_tips = 10 189 | simulated_ts = msprime.sim_ancestry(10, ploidy=1, random_seed=123) 190 | simulated_tree = simulated_ts.first() 191 | print("The topology of the simulated tree below can be described as", simulated_tree.rank()) 192 | ascii_node_labels = {u: ascii_lowercase[u] for u in simulated_tree.samples()} 193 | simulated_tree.draw_svg(node_labels=ascii_node_labels) 194 | ``` 195 | 196 | 197 | A tree with the same topology (i.e. the same shape and labelling, but ignoring 198 | the branch lengths) can be generated using the {meth}`Tree.unrank` method, by 199 | specifying the number of tips and the appropriate `(shape, labelling)` tuple: 200 | 201 | ```{code-cell} 202 | new_tree = tskit.Tree.unrank(num_tips, (1270, 21580)) 203 | new_tree.draw_svg(node_labels=ascii_node_labels) 204 | ``` 205 | 206 | Note that this method generates a single tree in a new tree sequence 207 | whose default sequence length is 1.0. 208 | 209 | ## Methods for large trees 210 | 211 | The number of possible topologies for a tree with $N$ tips 212 | grows very rapidly with $N$. For instance, with 10 tips, there are 213 | 282,137,824 possible topologies. 214 | 215 | For this reason, the {func}`~tskit.all_trees`, {func}`~tskit.all_tree_shapes` and 216 | {func}`~tskit.all_tree_labellings` methods do not return a list of trees 217 | but an iterator over the trees. This means it is perfectly possible to start 218 | iterating over (say) all tree shapes for a tree of 100 leaves, but 219 | the iterator will not finish before the death of our galaxy. 220 | 221 | ```{code-cell} 222 | for num_trees, tree in enumerate(tskit.all_tree_shapes(100)): 223 | shape = tree.rank().shape 224 | b2 = tree.b2_index() 225 | print(f"A 100-tip tree with shape index {shape} has a b2 balance index of {b2}") 226 | if num_trees > 5: 227 | break # better not let this run too long! 228 | ``` 229 | 230 | For similar combinatorial reasons, the {meth}`Tree.rank` method can be 231 | inefficient for large trees. To compare the topology of two trees, you are 232 | therefore recommended to use e.g. the {meth}`Tree.kc_distance` method 233 | rather than comparing ranks directly. 234 | 235 | ```{code-cell} 236 | simulated_tree = simulated_ts.first(sample_lists=True) # kc_distance requires sample lists 237 | if simulated_tree.kc_distance(new_tree) == 0: 238 | print("The simulated and topology-constructed trees have the same topology") 239 | # With the default lambda_=0, kc_distance compares topology only (branch lengths are 240 | # ignored), so a distance of zero means the two trees have the same topology 241 | ``` 242 | 243 | Despite the combinatorial explosion associated with topologies of 244 | many-tip trees, it is still possible to efficiently count 245 | the number of *embedded topologies* in a large tree. 246 | 247 | ### Embedded topologies 248 | 249 | An embedded topology is a topology involving a subset of the tips of a tree. 250 | If the tips are classified into (say) three groups, red, green, and blue, 251 | we can efficiently count all the embedded three-tip trees which have 252 | one tip from each group using the {meth}`Tree.count_topologies` method.
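
Before turning to a concrete example, here is a minimal sketch of the general pattern (not executed here). It assumes that `tree` is any single-root {class}`Tree`, and the node IDs below are hypothetical sample IDs standing in for the red, green and blue groups:

```python
# Hypothetical sample IDs for the red, green and blue groups (one list per group)
red, green, blue = [0, 1], [2, 3], [4, 5]
counter = tree.count_topologies(sample_sets=[red, green, blue])
# Counts of each embedded three-tip topology, keyed by its (shape, labelling) rank
for rank, count in counter[0, 1, 2].items():
    print(f"Embedded topology with rank {rank}: {count} instances")
```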
253 | 254 | ```{code-cell} 255 | :tags: [hide-input] 256 | big_tree = tskit.load("data/topologies_sim_speciestree.trees").first() 257 | # Check all observed topologies have the same counts 258 | assert list(big_tree.count_topologies()[0, 1, 2].values()) == [32, 32] 259 | styles = [ 260 | f".node.sample.p{p.id} > .sym " + "{" + f"fill: {colour}" + "}" 261 | for colour, p in zip(['red', 'green', 'blue'], big_tree.tree_sequence.populations()) 262 | ] 263 | big_tree.draw_svg(style="".join(styles), node_labels={}, time_scale="rank", x_label="big_tree") 264 | ``` 265 | 266 | In this tree, it is clear that the green and blue tips never cluster together. 267 | The {meth}`Tree.count_topologies` method exhaustively looks at all 268 | combinations of one red, one blue, and one green tip, and confirms that we never see 269 | the topology grouping green and blue. However, as might be expected from 270 | examination of the plot above, a red tip is equally likely to be a sister to a 271 | green tip as to a blue tip: 272 | 273 | ```{code-cell} 274 | # By default `count_topologies` chooses one tip from each population, like setting 275 | # sample_sets=[ts.samples(p.id) for p in ts.populations() if len(ts.samples(p.id)) > 0] 276 | 277 | topology_counter = big_tree.count_topologies() 278 | 279 | colours = ['red', 'green', 'blue'] 280 | styles = [f".n{u}>.sym {{fill: {c} }}" for u, c in enumerate(colours)] 281 | 282 | embedded_counts = topology_counter[0, 1, 2] 283 | for embedded_tree in tskit.all_trees(3): 284 | rank = embedded_tree.rank() 285 | number_of_instances = embedded_counts[rank] 286 | label = f"{number_of_instances} instances embedded in big_tree" 287 | display(embedded_tree.draw_svg(style="".join(styles), node_labels={}, x_label=label)) 288 | ``` 289 | 290 | ## Methods over tree sequences 291 | 292 | It can be useful to count embedded topologies over an entire tree sequence. 293 | For instance, we might want to know the number of embedded topologies 294 | that support Neanderthals as a sister group to Europeans versus Africans. 295 | `Tskit` provides the efficient {meth}`TreeSequence.count_topologies` method to 296 | do this [incrementally](sec_incremental), without having to re-count the topologies 297 | independently in each tree.
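
In outline, the difference is between recounting from scratch in every tree and letting `tskit` update the counts as it steps from one tree to the next. The following sketch is illustrative only (it assumes `ts` is any tree sequence with populations assigned to its samples, like the one loaded below) and mirrors the pattern used in the worked example that follows:

```python
# Naive: recount the embedded topologies from scratch in every tree (slow for long sequences)
for tree in ts.trees():
    counter = tree.count_topologies()

# Incremental: a single pass in which counts are updated at each change between adjacent trees
for counter, tree in zip(ts.count_topologies(), ts.trees()):
    pass  # use `counter` for this tree here, e.g. weighting by tree.span as below
```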
298 | 299 | ```{code-cell} 300 | :tags: [hide-input] 301 | from myst_nb import glue 302 | ts = tskit.load("data/topologies_sim_stdpopsim.trees") 303 | print(f"Loaded a stdpopsim simulation of {ts.num_trees} African+European+Chinese trees, each with {ts.num_samples} tips") 304 | glue("seq_len", int(ts.sequence_length/1000), display=False) 305 | ``` 306 | 307 | Although the trees in this tree sequence are very large, counting the embedded topologies is 308 | quite doable (for speed, in this demo we have only simulated {glue:}`seq_len` kilobases, but 309 | calculating the average over an entire chromosome simply takes a little longer). 310 | 311 | ```{code-cell} 312 | from datetime import datetime 313 | names = {"YRI": "African", "CEU": "European", "CHB": "Chinese"} 314 | colours = {"YRI": "yellow", "CEU": "green", "CHB": "blue"} 315 | 316 | population_map = {p.metadata["id"]: p.id for p in ts.populations()} 317 | sample_populations = list(sorted({ts.node(u).population for u in ts.samples()})) 318 | topology_span = {tree.rank(): 0 for tree in tskit.all_trees(len(sample_populations))} 319 | 320 | start = datetime.now() 321 | total = 0 322 | for topology_counter, tree in zip(ts.count_topologies(), ts.trees()): 323 | embedded_topologies = topology_counter[sample_populations] 324 | weight = tree.span / ts.sequence_length 325 | for rank, count in embedded_topologies.items(): 326 | topology_span[rank] += count * weight 327 | total += count 328 | print(f"Counted {total} embedded topologies in {(datetime.now() - start).total_seconds():.2f} seconds") 329 | ``` 330 | 331 | ```{code-cell} 332 | :tags: [hide-input] 333 | ntips = len(sample_populations) 334 | styles = ".sample text.lab {baseline-shift: super; font-size: 0.7em;}" 335 | node_labels = {} 336 | 337 | for p in range(ntips): 338 | name = ts.population(sample_populations[p]).metadata["id"] 339 | node_labels[p] = names[name] 340 | styles += f".n{p}>.sym {{fill: {colours[name]} }}" 341 | 342 | total = sum(topology_span.values()) 343 | for rank, weight in topology_span.items(): 344 | label = f"{weight/total *100:.1f}% of genome" 345 | embedded_tree = tskit.Tree.unrank(ntips, rank) 346 | display(embedded_tree.draw_svg(size=(160, 150), style="".join(styles), node_labels=node_labels, x_label=label)) 347 | ``` 348 | 349 | Perhaps unsurprisingly, the most common topology is the one that groups the non-African 350 | populations together (although there are many trees of the other two topologies, 351 | mostly reflecting genetic divergence prior to the emergence of humans out of Africa). 352 | 353 | For an example with real data, see {ref}`sec_popgen_topological` 354 | in the {ref}`sec_intro_popgen` tutorial.
-------------------------------------------------------------------------------- /data/afs.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/afs.trees -------------------------------------------------------------------------------- /data/basics.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/basics.trees -------------------------------------------------------------------------------- /data/benchmarks_without_copy_longer_genome.txt: -------------------------------------------------------------------------------- 1 | toolkit nsam nmutations Nu nbytes seconds 2 | tskit 25 37743 0.25 943575 0.024380256014410406 3 | libseq 25 37743 0.25 943575 0.004416356037836522 4 | allel 25 37743 0.25 943575 0.006015182007104158 5 | tskit 38 41264 0.25 1568032 0.02725571603514254 6 | libseq 38 41264 0.25 1568032 0.007482443994376808 7 | allel 38 41264 0.25 1568032 0.008124893007334322 8 | tskit 60 45965 0.25 2757900 0.03147959499619901 9 | libseq 60 45965 0.25 2757900 0.010253532032947987 10 | allel 60 45965 0.25 2757900 0.011828921968117356 11 | tskit 94 52047 0.25 4892418 0.03646116703748703 12 | libseq 94 52047 0.25 4892418 0.014803380996454507 13 | allel 94 52047 0.25 4892418 0.018299699993804097 14 | tskit 146 56647 0.25 8270462 0.04079140804242343 15 | libseq 146 56647 0.25 8270462 0.021643130981829017 16 | allel 146 56647 0.25 8270462 0.02823396399617195 17 | tskit 227 60026 0.25 13625902 0.044762183038983494 18 | libseq 227 60026 0.25 13625902 0.032079764001537114 19 | allel 227 60026 0.25 13625902 0.04379657399840653 20 | tskit 353 63808 0.25 22524224 0.04979721503332257 21 | libseq 353 63808 0.25 22524224 0.049665857979562134 22 | allel 353 63808 0.25 22524224 0.06984564702725038 23 | tskit 549 68808 0.25 37775592 0.05479123996337876 24 | libseq 549 68808 0.25 37775592 0.08029428997542709 25 | allel 549 68808 0.25 37775592 0.11476215999573469 26 | tskit 854 72633 0.25 62028582 0.06038561399327591 27 | libseq 854 72633 0.25 62028582 0.12830477399984375 28 | allel 854 72633 0.25 62028582 0.18557889200747013 29 | tskit 1329 77876 0.25 103497204 0.06728244601981714 30 | libseq 1329 77876 0.25 103497204 0.21029049198841676 31 | allel 1329 77876 0.25 103497204 0.30698327702702954 32 | tskit 2067 81594 0.25 168654798 0.0740198030252941 33 | libseq 2067 81594 0.25 168654798 0.33949470898369327 34 | allel 2067 81594 0.25 168654798 0.497885801945813 35 | tskit 3215 85693 0.25 275502995 0.0821873809909448 36 | libseq 3215 85693 0.25 275502995 0.5518272669869475 37 | allel 3215 85693 0.25 275502995 0.8115627619554289 38 | tskit 4999 90771 0.25 453764229 0.0933208679780364 39 | libseq 4999 90771 0.25 453764229 0.9057928950060159 40 | allel 4999 90771 0.25 453764229 1.3362707490450703 41 | tskit 7775 95036 0.25 738904900 0.09812425600830466 42 | libseq 7775 95036 0.25 738904900 1.4732784099760465 43 | allel 7775 95036 0.25 738904900 2.174873666022904 44 | tskit 12091 99421 0.25 1202099311 0.1134186849812977 45 | libseq 12091 99421 0.25 1202099311 2.399354270019103 46 | allel 12091 99421 0.25 1202099311 3.5421803669887595 47 | tskit 18803 103851 0.25 1952710353 0.13057904096785933 48 | libseq 18803 103851 0.25 1952710353 3.9014300610288046 49 | allel 18803 103851 0.25 1952710353 5.784294551995117 50 | tskit 29240 108771 0.25 3180464040 
0.14623118803137913 51 | libseq 29240 108771 0.25 3180464040 6.36328757496085 52 | allel 29240 108771 0.25 3180464040 9.451211793988477 53 | tskit 45470 113319 0.25 5152614930 0.16928912402363494 54 | libseq 45470 113319 0.25 5152614930 10.348922087985557 55 | allel 45470 113319 0.25 5152614930 15.338866573001724 56 | tskit 70710 117860 0.25 8333880600 0.1923062339774333 57 | libseq 70710 117860 0.25 8333880600 16.752991039014887 58 | allel 70710 117860 0.25 8333880600 25.193067331041675 59 | tskit 109960 122092 0.25 13425236320 0.2233057350385934 60 | libseq 109960 122092 0.25 13425236320 28.182809649966657 61 | allel 109960 122092 0.25 13425236320 41.492513177974615 62 | tskit 170997 126595 0.25 21647365215 0.26305200799833983 63 | libseq 170997 126595 0.25 21647365215 43.96894769597566 64 | allel 170997 126595 0.25 21647365215 69.50293152098311 65 | tskit 265914 131220 0.25 34893235080 0.32004916900768876 66 | libseq 265914 131220 0.25 34893235080 78.91534719598712 67 | allel 265914 131220 0.25 34893235080 120.64627647103043 68 | tskit 413518 134931 0.25 55796397258 0.39681092998944223 69 | libseq 413518 134931 0.25 55796397258 140.25325659103692 70 | allel 413518 134931 0.25 55796397258 194.81629185698694 71 | tskit 643054 139566 0.25 89748474564 0.5139753300463781 72 | libseq 643054 139566 0.25 89748474564 227.20759825699497 73 | allel 643054 139566 0.25 89748474564 316.33334937802283 74 | tskit 1000000 144293 0.25 144293000000 0.6793207000009716 75 | libseq 1000000 144293 0.25 144293000000 368.2131700209575 76 | allel 1000000 144293 0.25 144293000000 510.472775303002 77 | -------------------------------------------------------------------------------- /data/computing_statistics.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/computing_statistics.trees -------------------------------------------------------------------------------- /data/construction_example.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/construction_example.trees -------------------------------------------------------------------------------- /data/different_time_samples.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/different_time_samples.trees -------------------------------------------------------------------------------- /data/download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple script to download all .trees files within the `data` directory on GitHub, 3 | saving to a local `data` directory 4 | """ 5 | 6 | import os 7 | import json 8 | from urllib.request import urlretrieve, urlopen 9 | 10 | if not os.path.isdir("data"): 11 | os.mkdir("data") # Make a "data" directory within the current folder 12 | print(f"Downloading data files into {os.path.join(os.getcwd(), 'data')}") 13 | # Save the data files to the data directory 14 | response = urlopen("https://tskit.dev/tutorials/examples/files.txt") 15 | for fn in response: 16 | fn = fn.decode(response.headers.get_content_charset()).strip() 17 | if fn.endswith(".trees"): 18 | urlretrieve("https://tskit.dev/tutorials/examples/" + fn, os.path.join("data", fn)) 19 | print(".", end="") 20 | print(" 
finished downloading") -------------------------------------------------------------------------------- /data/metadata.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/metadata.trees -------------------------------------------------------------------------------- /data/parsimony_map.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_map.pickle -------------------------------------------------------------------------------- /data/parsimony_map.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_map.trees -------------------------------------------------------------------------------- /data/parsimony_simple.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/parsimony_simple.trees -------------------------------------------------------------------------------- /data/storing_everyone.csv: -------------------------------------------------------------------------------- 1 | ,compressed,pbwt,pbwtz,sample_size,tsk_fit,tskz_fit,uncompressed,vcf,vcf_fit,vcfz,vcfz_fit 2 | 0,0.0016675414517521858,0.0018388191238045692,0.000494055449962616,10,0.01328652372466946,0.0014398649106981897,0.013348821550607681,0.005093865096569061,0.0038755249813772976,0.0006428053602576256,0.00024484777501409204 3 | 1,0.0042699361220002174,0.0037771547213196754,0.0013225525617599487,100,0.028833975141573032,0.004427909731197507,0.02880258485674858,0.044218966737389565,0.047833623496165135,0.002773575484752655,0.0024107513243057884 4 | 2,0.007236692123115063,0.006424359045922756,0.0028374912217259407,1000,0.044466600006542005,0.007421182366127227,0.04446660354733467,0.5665737707167864,0.5903859600357276,0.023386516608297825,0.023736061915643803 5 | 3,0.01037746760994196,0.010164554230868816,0.005389832891523838,10000,0.06095095935216508,0.010466733145360988,0.060782525688409805,7.290761827491224,7.286832071905484,0.2337631145492196,0.23370333952860764 6 | 4,0.014260578900575638,0.015963544137775898,0.009662624448537827,100000,0.085952663504329,0.014035065367635116,0.08610359206795692,89.93748273793608,89.93764289539862,2.3010212713852525,2.3010241168450483 7 | 5,0.022809751331806183,0.040422539226710796,0.021889440715312958,1000000,0.19612781572190113,0.022831212020312953,0.1961144097149372,1068.2399909570813,1110.0543459436492,21.772690244950354,22.655696734938648 8 | 6,0.07962783984839916,0.259388854727149,0.0981400953605771,10000000,1.1580374485935565,0.08390550297702788,1.15145118907094,0.0,13700.833280470879,0.0,223.06615162698623 9 | 7,0.0,0.0,0.0,100000000,10.637291888006041,0.6677612369741138,0.0,0.0,169102.3806764029,0.0,2196.291227933775 10 | 8,0.0,0.0,0.0,1000000000,105.28999439282681,6.479431401374908,0.0,0.0,2087144.2316714546,0.0,21624.50520940123 11 | 9,0.0,0.0,0.0,10000000000,1051.6771775517304,64.56924586981279,0.0,0.0,25760554.20612597,0.0,212913.12354388786 12 | -------------------------------------------------------------------------------- /data/tables_example.trees: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tables_example.trees -------------------------------------------------------------------------------- /data/tables_example_muts.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tables_example_muts.trees -------------------------------------------------------------------------------- /data/topologies_sim_speciestree.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/topologies_sim_speciestree.trees -------------------------------------------------------------------------------- /data/topologies_sim_stdpopsim.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/topologies_sim_stdpopsim.trees -------------------------------------------------------------------------------- /data/tree_traversals.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/tree_traversals.trees -------------------------------------------------------------------------------- /data/unified_genealogy_2q_108Mb-110Mb.tsz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/unified_genealogy_2q_108Mb-110Mb.tsz -------------------------------------------------------------------------------- /data/viz_ts_full.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_full.trees -------------------------------------------------------------------------------- /data/viz_ts_selection.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_selection.trees -------------------------------------------------------------------------------- /data/viz_ts_small.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_small.trees -------------------------------------------------------------------------------- /data/viz_ts_small_mutated.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_small_mutated.trees -------------------------------------------------------------------------------- /data/viz_ts_tiny.trees: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/viz_ts_tiny.trees -------------------------------------------------------------------------------- /data/whatis_example.trees: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/data/whatis_example.trees -------------------------------------------------------------------------------- /data/whatis_example.yml: -------------------------------------------------------------------------------- 1 | description: 2 | Asymmetric migration between two extant demes. 3 | time_units: generations 4 | defaults: 5 | epoch: 6 | start_size: 5000 7 | demes: 8 | - name: Ancestral_population 9 | epochs: 10 | - end_time: 1000 11 | - name: A 12 | ancestors: [Ancestral_population] 13 | - name: B 14 | ancestors: [Ancestral_population] 15 | epochs: 16 | - start_size: 2000 17 | end_time: 500 18 | - start_size: 400 19 | end_size: 10000 20 | migrations: 21 | - source: A 22 | dest: B 23 | rate: 1e-4 24 | -------------------------------------------------------------------------------- /incremental_algorithms.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | (sec_incremental)= 18 | # _Incremental algorithms_ 19 | 20 | Much of the [efficiency](sec_what_is_analysis) 21 | of the tskit approach comes from the use of incremental algorithms. 22 | By considering only the difference between adjacent trees, 23 | incremental algorithms avoid having to perform the same 24 | calculation multiple times on different trees. 25 | 26 | This tutorial will explain the philosophy behind incremental algorithms, 27 | and provide examples of how to create your own (e.g. using the 28 | {meth}`TreeSequence.edge_diffs` method). 29 | 30 | :::{todo} 31 | Create content. See [issue 233](https://github.com/tskit-dev/tutorials/issues/233) 32 | ::: -------------------------------------------------------------------------------- /intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | (sec_intro)= 15 | 16 | # Welcome! 17 | 18 | This site contains a number of tutorials to develop your understanding of 19 | genetic genealogies, ancestral recombination graphs, and the 20 | [succinct tree sequence](https://tskit.dev/learn/) storage format, 21 | as implemented in [`tskit`: the tree sequence toolkit](https://tskit.dev/tskit/docs/). 22 | Also included are a number of tutorials showing advanced use of 23 | [software programs](https://tskit.dev/software/), 24 | such as [`msprime`](https://tskit.dev/msprime/docs), that form part of the 25 | [`tskit` ecosystem](https://tskit.dev). 
26 | 27 | ```{code-cell} ipython3 28 | :tags: [remove-input] 29 | import math 30 | import msprime 31 | 32 | def make_7_tree_4_tip_ts(): 33 | ts = msprime.sim_ancestry( 34 | 4, ploidy=1, random_seed=889, sequence_length=1000, recombination_rate=0.001) 35 | ts = msprime.sim_mutations(ts, rate=2e-3, random_seed=123) 36 | 37 | # Check we have picked a random seed that gives a nice plot of 7 trees 38 | tip_orders = { 39 | tuple(u for u in t.nodes(order="minlex_postorder") if t.is_sample(u)) 40 | for t in ts.trees() 41 | } 42 | topologies = {tree.rank() for tree in ts.trees()} 43 | assert tip_orders == {(0, 1, 2, 3)} and len(topologies) > 1 and ts.num_trees == 7 44 | 45 | return ts 46 | 47 | 48 | ts = make_7_tree_4_tip_ts() 49 | 50 | # Set some parameters: these can be adjusted to your liking 51 | tree_width = 80 52 | height = 200 # Normal height for tree + x-axis 53 | y_step = 20 # Stagger between trees (i.e. 0 for all trees in a horizontal line) 54 | skew = 0.7 # How skewed the trees are, in radians 55 | 56 | width = tree_width * ts.num_trees + 20 + 20 # L & R margins in draw_svg = 20px 57 | angle = math.atan(y_step/tree_width) 58 | ax_mv = y_step, (ts.num_trees - 1) * y_step - 90 + math.tan(skew) * (tree_width * .9) 59 | 60 | # CSS transforms used to skew the axis and stagger + skew the trees 61 | style = f".x-axis {{transform: translate({ax_mv[0]}px, {ax_mv[1]}px) skewY(-{angle}rad)}}" 62 | for i in range(ts.num_trees): 63 | # Stagger each tree vertically by y_step, transforming the "plotbox" tree container 64 | style += ( 65 | f".tree.t{i} > .plotbox " + "{transform:" + 66 | f"translateY({(ts.num_trees - i - 1) * y_step-85}px) skewY({skew}rad)" + "}" 67 | ) 68 | 69 | # Define a bigger canvas size so we don't crop the moved trees from the drawing 70 | size = (width, height) 71 | canvas_size = (width + y_step, height + math.tan(skew)*tree_width) 72 | 73 | ts.draw_svg(size=size, x_scale="treewise", style=style, canvas_size=canvas_size) 74 | ``` 75 | 76 | If you are new to the world of tree sequences, we suggest you start with the 77 | first tutorial: {ref}`sec_what_is` 78 | 79 | :::{note} 80 | Tutorials are under constant development. Those that are still a work in progress and 81 | not yet ready for use are shown in _italics_ in the list of tutorials. 82 | 83 | We very much welcome help developing existing tutorials or writing new ones. Please open 84 | or contribute to a [GitHub issue](https://github.com/tskit-dev/tutorials/issues) if you 85 | would like to help out. 86 | ::: 87 | 88 | ## Other sources of help 89 | 90 | In addition to these tutorials, our [Learn page](https://tskit.dev/learn/) lists 91 | selected videos and publications to help you learn about tree sequences. 92 | 93 | We aim to be a friendly, welcoming open source community. 94 | Questions and discussion about using {program}`tskit`, the tree sequence toolkit 95 | should be directed to the 96 | [GitHub discussion forum](https://github.com/tskit-dev/tskit/discussions), and there are 97 | similar forums for other software in the tree sequence [development community](https://github.com/tskit-dev), 98 | such as for [msprime](https://github.com/tskit-dev/msprime/discussions) and 99 | [tsinfer](https://github.com/tskit-dev/tsinfer/discussions). 100 | 101 | 102 | (sec_intro_running)= 103 | 104 | ## Running tutorial code 105 | 106 | It is possible to run the tutorial code on your own computer, if you wish. 107 | This will allow you to experiment with the examples provided. 
108 | The recommended way to do this is from within a 109 | [Jupyter notebook](https://jupyter.org). As well as installing Jupyter, you will also 110 | need to install the various Python libraries, most importantly 111 | ``tskit``, ``msprime``, ``numpy``, and ``matplotlib``. These and other packages are 112 | listed in the [requirements.txt](https://tskit.dev/tutorials/requirements.txt) 113 | file; a shortcut to installing the necessary software is therefore: 114 | 115 | ``` 116 | python3 -m pip install -r https://tskit.dev/tutorials/requirements.txt 117 | ``` 118 | 119 | In addition, to run the {ref}`R tutorial` you will need to install the R 120 | [reticulate](https://rstudio.github.io/reticulate/) library, and if running it in a Jupyter 121 | notebook, the [IRkernel](https://irkernel.github.io) library. This can be done by 122 | running the following command within R: 123 | 124 | ``` 125 | install.packages(c("reticulate", "IRkernel")); IRkernel::installspec() 126 | ``` 127 | 128 | (sec_intro_downloading_datafiles)= 129 | 130 | ### Downloading tutorial datafiles 131 | 132 | Many of the tutorials use pre-existing tree sequences stored in the 133 | [``data``](https://github.com/tskit-dev/tutorials/tree/main/data) directory. 134 | These can be downloaded individually from that link, or you can 135 | download them all at once by running the script stored in 136 | [https://tskit.dev/tutorials/examples/download.py](https://tskit.dev/tutorials/examples/download.py). 137 | If you are running the code in the tutorials from within a Jupyter notebook 138 | then you can simply load this code into a new cell by using the 139 | [%load cell magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-load). 140 | Just run the following in a Jupyter code cell: 141 | 142 | ``` 143 | %load https://tskit.dev/tutorials/examples/download.py 144 | ``` 145 | 146 | Running the resulting Python code should download the data files, then print out 147 | ``finished downloading`` when all files are downloaded. You should then be able 148 | to successfully run code such as the following: 149 | 150 | ```{code-cell} ipython3 151 | import tskit 152 | ts = tskit.load("data/basics.trees") 153 | print(f"The file 'data/basics.trees' exists, and contains {ts.num_trees} trees") 154 | ``` 155 | -------------------------------------------------------------------------------- /metadata.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | ```{currentmodule} tskit 14 | ``` 15 | 16 | 17 | (sec_tutorial_metadata)= 18 | 19 | # Working with Metadata 20 | 21 | Metadata is information associated with entities that {program}`tskit` doesn't use or 22 | interpret, but which is useful to pass on to downstream analysis such as sample ids, 23 | dates etc. (see {ref}`sec_metadata` for a full discussion). Each 24 | {ref}`table` has a {class}`MetadataSchema` which details the 25 | contents and encoding of the metadata for each row. A metadata schema is a JSON document 26 | that conforms to [JSON Schema](https://json-schema.org/understanding-json-schema/) 27 | (The full schema for tskit is at {ref}`sec_metadata_schema_schema`). 
Here we use an 28 | {ref}`example tree sequence` 29 | which contains some demonstration metadata: 30 | 31 | ```{code-cell} ipython3 32 | :tags: [remove-cell] 33 | import msprime 34 | import tskit 35 | 36 | def metadata(): 37 | tables = msprime.sim_ancestry(4).dump_tables() 38 | tables.individuals.metadata_schema = tskit.MetadataSchema( 39 | {'additionalProperties': False, 40 | 'codec': 'json', 41 | 'properties': {'accession': {'description': 'ENA accession number', 42 | 'type': 'string'}, 43 | 'pcr': {'description': 'Was PCR used on this sample', 44 | 'name': 'PCR Used', 45 | 'type': 'boolean'}}, 46 | 'required': ['accession', 'pcr'], 47 | 'type': 'object'} 48 | ) 49 | md = [ 50 | {'accession': 'ERS0001', 'pcr': True}, 51 | {'accession': 'ERS0002', 'pcr': True}, 52 | {'accession': 'ERS0003', 'pcr': True}, 53 | {'accession': 'ERS0004', 'pcr': False}, 54 | ] 55 | table = tables.individuals 56 | copy = table.copy() 57 | table.clear() 58 | for m, row in zip(md, copy): 59 | table.append(row.replace(metadata=m)) 60 | ts = tables.tree_sequence() 61 | ts.dump("data/metadata.trees") 62 | 63 | def create_notebook_data(): 64 | metadata() 65 | 66 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 67 | ``` 68 | 69 | 70 | ```{code-cell} ipython3 71 | import tskit 72 | import json 73 | 74 | ts = tskit.load("data/metadata.trees") 75 | ``` 76 | (sec_tutorial_metadata_reading)= 77 | 78 | ## Reading metadata and schemas 79 | 80 | Metadata is automatically decoded using the schema when accessed via a 81 | {class}`TreeSequence` or {class}`TableCollection` Python API. For example: 82 | 83 | ```{code-cell} ipython3 84 | print("Metadata for individual 0:", ts.individual(0).metadata) # Tree sequence access 85 | print("Metadata for individual 0:", ts.tables.individuals[0].metadata) # Table access 86 | ``` 87 | 88 | Viewing the {class}`MetadataSchema` for a table can help with understanding 89 | its metadata, as it can contain descriptions and constraints: 90 | 91 | ```{code-cell} ipython3 92 | ts.table_metadata_schemas.individual 93 | ``` 94 | 95 | The same schema can be accessed via a {attr}`~IndividualTable.metadata_schema` attribute 96 | on each table (printed prettily here using ``json.dumps``) 97 | 98 | ```{code-cell} ipython3 99 | schema = ts.tables.individuals.metadata_schema 100 | print(json.dumps(schema.asdict(), indent=4)) # Print with indentations 101 | ``` 102 | 103 | The top-level metadata and schemas for the entire tree sequence are similarly 104 | accessed with {attr}`TreeSequence.metadata` and {attr}`TreeSequence.metadata_schema`. 105 | 106 | :::{note} 107 | If there is no schema (i.e. it is equal to ``MetadataSchema(None)``) for a table 108 | or top-level metadata, then no decoding is performed and ``bytes`` will be returned. 109 | ::: 110 | 111 | (sec_tutorial_metadata_modifying)= 112 | 113 | ## Modifying metadata and schemas 114 | 115 | If you are creating or modifying a tree sequence by changing the underlying tables, 116 | you may want to record or add to the metadata. If the change fits into the same schema, 117 | this is relatively simple, you can follow the 118 | {ref}`description of minor table edits` in the 119 | {ref}`sec_tables` tutorial. However if it requires a change to the schema, this must be 120 | done first, as it is then used to validate and encode the metadata. 121 | 122 | Schemas in tskit are held in a {class}`MetadataSchema`. 123 | A Python dict representation of the schema is passed to its constructor, which 124 | will validate the schema. 
Here are a few examples: the first one allows arbitrary fields 125 | to be added, the second one (which will construct the schema we printed above) does not: 126 | 127 | ```{code-cell} ipython3 128 | basic_schema = tskit.MetadataSchema({'codec': 'json'}) 129 | 130 | complex_schema = tskit.MetadataSchema({ 131 | 'codec': 'json', 132 | 'additionalProperties': False, 133 | 'properties': {'accession': {'description': 'ENA accession number', 134 | 'type': 'string'}, 135 | 'pcr': {'description': 'Was PCR used on this sample', 136 | 'name': 'PCR Used', 137 | 'type': 'boolean'}}, 138 | 'required': ['accession', 'pcr'], 139 | 'type': 'object', 140 | }) 141 | ``` 142 | 143 | This {class}`MetadataSchema` can then be assigned to a table or the top-level 144 | tree sequence e.g. {attr}`~IndividualTable.metadata_schema`: 145 | 146 | ```{code-cell} ipython3 147 | tables = tskit.TableCollection(sequence_length=1) # make a new, empty set of tables 148 | tables.individuals.metadata_schema = complex_schema 149 | ``` 150 | 151 | This will overwrite any existing schema. Note that this will not validate any existing 152 | metadata against the new schema. Now that the table has a schema, calls to 153 | {meth}`~IndividualTable.add_row` will validate and encode the metadata: 154 | 155 | ```{code-cell} ipython3 156 | row_id = tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": True}) 157 | print(f"Row {row_id} added to the individuals table") 158 | ``` 159 | 160 | If we try to add metadata that doesn't fit the schema, such as accidentally using a 161 | string instead of a proper Python boolean, we'll get an error: 162 | 163 | ```{code-cell} ipython3 164 | :tags: [raises-exception, output_scroll] 165 | tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": "false"}) 166 | ``` 167 | 168 | and because we set ``additionalProperties`` to ``False`` in the schema, an error is 169 | also raised if we attempt to add new fields: 170 | 171 | ```{code-cell} ipython3 172 | :tags: [raises-exception, output_scroll] 173 | tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": True, "newKey": 25}) 174 | ``` 175 | 176 | 177 | To set the top-level metadata, just assign it. Validation and encoding happen as 178 | specified by the top-level metadata schema 179 | 180 | ```{code-cell} ipython3 181 | tables.metadata_schema = basic_schema # Allows new fields to be added that are not validated 182 | tables.metadata = {"mean_coverage": 200.5} 183 | print(tables.metadata) 184 | ``` 185 | 186 | :::{note} 187 | *Provenance* information, detailing the origin of the data, modification timestamps, 188 | and (ideally) how the tree sequence can be reconstructed, should go in 189 | {ref}`sec_provenance`, not metadata. 190 | ::: 191 | 192 | To modify a schema --- for example to add a key --- first get the dict representation, 193 | modify, then write back: 194 | 195 | ```{code-cell} ipython3 196 | schema_dict = tables.individuals.metadata_schema.schema 197 | schema_dict["properties"]["newKey"] = {"type": "integer"} 198 | tables.individuals.metadata_schema = tskit.MetadataSchema(schema_dict) 199 | # Now this will work: 200 | new_id = tables.individuals.add_row(metadata={'accession': 'abc123', 'pcr': False, 'newKey': 25}) 201 | print(tables.individuals[new_id].metadata) 202 | ``` 203 | 204 | To modify the metadata of rows in tables use the {ref}`sec_tutorial_metadata_bulk`. 205 | 206 | (sec_tutorial_metadata_viewing_raw)= 207 | 208 | ## Viewing raw metadata 209 | 210 | If you need to see the raw (i.e. 
bytes) metadata, you just need to remove the 211 | schema, for instance: 212 | 213 | ```{code-cell} ipython3 214 | individual_table = tables.individuals.copy() # don't change the original tables.individual 215 | 216 | print("Metadata:\n", individual_table[0].metadata) 217 | 218 | individual_table.metadata_schema = tskit.MetadataSchema(None) 219 | print("\nRaw metadata:\n", individual_table[0].metadata) 220 | ``` 221 | 222 | (sec_tutorial_metadata_bulk)= 223 | 224 | ## Metadata for bulk table methods 225 | 226 | In the interests of efficiency, each table's {meth}`~NodeTable.packset_metadata` method, 227 | as well as the more general {meth}`~NodeTable.set_columns` and 228 | {meth}`~NodeTable.append_columns` methods, does not attempt to validate or encode metadata. 229 | You can call {meth}`MetadataSchema.validate_and_encode_row` directly to prepare metadata 230 | for these methods: 231 | 232 | ```{code-cell} ipython3 233 | metadata_column = [ 234 | {"accession": "etho1234", "pcr": True}, 235 | {"accession": "richard1235", "pcr": False}, 236 | {"accession": "albert1236", "pcr": True}, 237 | ] 238 | encoded_metadata_column = [ 239 | tables.individuals.metadata_schema.validate_and_encode_row(r) for r in metadata_column 240 | ] 241 | md, md_offset = tskit.pack_bytes(encoded_metadata_column) 242 | tables.individuals.set_columns(flags=[0, 0, 0], metadata=md, metadata_offset=md_offset) 243 | tables.individuals 244 | ``` 245 | 246 | Or, if you do not need to set all the columns: 247 | 248 | ```{code-cell} ipython3 249 | tables.individuals.packset_metadata( 250 | [tables.individuals.metadata_schema.validate_and_encode_row(r) for r in metadata_column] 251 | ) 252 | ``` 253 | 254 | (sec_tutorial_metadata_binary)= 255 | 256 | ## Binary metadata 257 | 258 | To disable the validation and encoding of metadata and store raw bytes, pass ``None`` to 259 | {class}`MetadataSchema`: 260 | 261 | ```{code-cell} ipython3 262 | tables.populations.metadata_schema = tskit.MetadataSchema(None) 263 | tables.populations.add_row(metadata=b"SOME CUSTOM BYTES #!@") 264 | print(tables.populations[0].metadata) 265 | ``` 266 | -------------------------------------------------------------------------------- /more_forward_sims.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | (sec_tskit_more_forward_simulations)= 18 | 19 | # _Advanced forward simulations_ 20 | 21 | % remove underscores in title when tutorial is complete or near-complete 22 | 23 | :::{todo} 24 | Add further details on building a forward simulator 25 | (see issue [#14](https://github.com/tskit-dev/tutorials/issues/14)) 26 | ::: 27 | 28 | In the {ref}`previous tutorial`, we developed a 29 | basic forward-time Wright-Fisher (WF) simulator (refer back to that tutorial for a 30 | detailed run-through of the hidden code): 31 | 32 | ```{code-cell} 33 | :tags: ["hide-cell"] 34 | import tskit 35 | import numpy as np 36 | 37 | random_seed = 6 38 | random = np.random.default_rng(random_seed) # A random number generator for general use 39 | 40 | L = 50_000 # The sequence length: 50 Kb 41 | 42 | def add_inheritance_paths(tables, parent_genomes, child_genome, recombination_rate): 43 | "Add paths from parent genomes to the child genome, with crossover recombination"
44 | L = tables.sequence_length 45 | num_recombinations = random.poisson(recombination_rate * L) 46 | breakpoints = random.integers(0, L - 1, size=num_recombinations) 47 | break_pos, counts = np.unique(breakpoints, return_counts=True) 48 | crossovers = break_pos[counts % 2 == 1] # no crossover if e.g. 2 breaks at same pos 49 | left_positions = np.insert(crossovers, 0, 0) 50 | right_positions = np.append(crossovers, L) 51 | 52 | inherit_from = random.integers(2) 53 | for left, right in zip(left_positions, right_positions): 54 | tables.edges.add_row( 55 | left, right, parent_genomes[inherit_from], child_genome) 56 | inherit_from = 1 - inherit_from # switch to other parent genome 57 | 58 | def make_diploid(tables, time, parent_individuals=None): 59 | individual_id = tables.individuals.add_row(parents=parent_individuals) 60 | return individual_id, ( 61 | tables.nodes.add_row(time=time, individual=individual_id), 62 | tables.nodes.add_row(time=time, individual=individual_id), 63 | ) 64 | 65 | def new_population(tables, time, prev_pop, recombination_rate): 66 | pop = {} 67 | prev_individuals = np.array([i for i in prev_pop.keys()], dtype=np.int32) 68 | for _ in range(len(prev_pop)): 69 | mother_and_father = random.choice(prev_individuals, 2, replace=True) 70 | child_id, child_genomes = make_diploid(tables, time, mother_and_father) 71 | pop[child_id] = child_genomes # store the genome IDs 72 | for child_genome, parent_individual in zip(child_genomes, mother_and_father): 73 | parent_genomes = prev_pop[parent_individual] 74 | add_inheritance_paths(tables, parent_genomes, child_genome, recombination_rate) 75 | return pop 76 | 77 | def initialise_population(tables, time, size) -> dict: 78 | return dict(make_diploid(tables, time) for _ in range(size)) 79 | 80 | ``` 81 | 82 | The main simulation function, as below, returned an unsimplified tree sequence, 83 | which we subsequently {meth}`simplified`: 84 | 85 | ```{code-cell} ipython3 86 | 87 | def forward_WF(num_diploids, seq_len, generations, recombination_rate=0, random_seed=7): 88 | global random 89 | random = np.random.default_rng(random_seed) 90 | tables = tskit.TableCollection(seq_len) 91 | tables.time_units = "generations" 92 | 93 | pop = initialise_population(tables, generations, num_diploids) 94 | while generations > 0: 95 | generations = generations - 1 96 | pop = new_population(tables, generations, pop, recombination_rate) 97 | 98 | tables.sort() 99 | return tables.tree_sequence() 100 | ``` 101 | 102 | ## Repeated simplification 103 | 104 | We can perform simplification directly on the tables within the `forward_WF()` function, 105 | using {meth}`TableCollection.simplify`. More importantly, we can carry this out at 106 | repeated intervals. It is helpful to think of this as regular "garbage collection", 107 | as what we're really doing is getting rid of extinct lineages while also "trimming" 108 | extant lineages down to a minimal representation. 109 | 110 | :::{caution} 111 | Regular garbage collection forces us to reckon with the fact that simplification 112 | {ref}`changes the node IDs `. 113 | We therefore need to remap any node (and individual) IDs that are used outside of 114 | `tskit`. In the implementation described here, those IDs are stored in the `pop` 115 | variable. 
116 | ::: 117 | 118 | ```{code-cell} 119 | def simplify_tables(tables, samples, pop) -> dict[int, tuple[int, int]]: 120 | """ 121 | Simplify the tables with respect to the given samples, returning a 122 | population dict in which individual and nodes have been remapped to their 123 | new ID numbers 124 | """ 125 | tables.sort() 126 | node_map = tables.simplify(samples, keep_input_roots=True) 127 | 128 | nodes_individual = tables.nodes.individual 129 | remapped_pop = {} 130 | for node1, node2 in pop.values(): 131 | node1, node2 = node_map[[node1, node2]] # remap 132 | assert nodes_individual[node1] == nodes_individual[node2] # sanity check 133 | remapped_pop[nodes_individual[node1]] = (node1, node2) 134 | return remapped_pop 135 | 136 | 137 | def forward_WF( 138 | num_diploids, 139 | seq_len, 140 | generations, 141 | recombination_rate=0, 142 | simplification_interval=None, # default to simplifying only at end 143 | show=None, 144 | random_seed=7, 145 | ): 146 | global random 147 | random = np.random.default_rng(random_seed) 148 | tables = tskit.TableCollection(seq_len) 149 | tables.time_units = "generations" # optional, but helpful when plotting 150 | if simplification_interval is None: 151 | simplification_interval = generations 152 | simplify_mod = generations % simplification_interval 153 | 154 | pop = initialise_population(tables, generations, num_diploids) 155 | while generations > 0: 156 | generations = generations - 1 157 | pop = new_population(tables, generations, pop, recombination_rate) 158 | if generations > 0 and generations % simplification_interval == simplify_mod: 159 | current_nodes = [u for nodes in pop.values() for u in nodes] 160 | pop = simplify_tables(tables, current_nodes, pop) 161 | if show: 162 | print("Simplified", generations, "generations before end") 163 | 164 | pop = simplify_tables(tables, [u for nodes in pop.values() for u in nodes], pop) 165 | if show: 166 | print("Final simplification") 167 | return tables.tree_sequence() 168 | 169 | ts = forward_WF(6, L, generations=100, simplification_interval=25, show=True) 170 | ts.draw_svg(size=(800, 200)) 171 | ``` 172 | 173 | ### Invariance to simplification interval 174 | A critical concept to keep in mind is that the simulation itself is the only random component. 175 | The simplification algorithm is deterministic given a set of (nodes, edges) satisfying 176 | `tskit`'s sorting requirements. Therefore, the results of our new `forward_WF()` function 177 | must be the same for all simplification intervals 178 | 179 | :::{note} 180 | This invariance property only holds in some cases. 181 | We discuss this in more detail below when we add in mutation. 182 | ::: 183 | 184 | ```{code-cell} 185 | ts = forward_WF(10, L, 500, simplification_interval=1, random_seed=42) 186 | 187 | # Iterate over a range of odd and even simplification intervals. 188 | print("Testing invariance to simplification interval") 189 | test_intervals = list(range(2, 500, 33)) 190 | for i in test_intervals: 191 | # Make sure each new sim starts with same random seed! 192 | ts_test = forward_WF(10, L, 500, simplification_interval=i, show=False, random_seed=42) 193 | assert ts.equals(ts_test, ignore_provenance=True) 194 | print(f"Intervals {test_intervals} passed") 195 | ``` 196 | 197 | :::{tip} 198 | Testing your own code using loops like the one above is a very 199 | good way to identify subtle bugs in book-keeping. 
200 | ::: 201 | 202 | ### Summary 203 | 204 | * Simplifying during a simulation changes IDs in the tree sequence tables, so we need to remap 205 | entities that store any of these IDs between generations. 206 | * Our code to carry out simplification gets called both during the simulation and at the end. 207 | It's therefore worth encapsulating it into a class or function for easier code re-use and testing. 208 | 209 | #### Technical notes 210 | 211 | We have found that it is possible to write a simulation where the results differ 212 | by simplification interval, but appear correct in distribution. 213 | By this we mean that the distributions of the numbers of mutations, their frequencies, etc., 214 | match predictions from analytical theory. However, our experience is that such simulations 215 | contain bugs and that the summaries being used for testing are too crude to catch them. 216 | For example, they may affect the variance in a subtle way that would require millions 217 | of simulations to catch. Often what is going on is that parent/offspring relationships 218 | are not being properly recorded, resulting in lineages that either persist too long or 219 | not long enough. (In other words, the variance in offspring number per diploid is no 220 | longer what it should be, meaning you've changed the effective population size.) 221 | Thus, please make sure you get the **same** `tskit` tables out of a simulation for 222 | any simplification interval. 223 | 224 | 225 | ## Mutations 226 | 227 | In this section, we will add mutations to our simulation. Mutations will occur according to the 228 | infinitely-many sites model, which means that a new mutation cannot arise at a currently-mutated 229 | position. $\theta = 4N\mu$ is the scaled mutation rate, and is equal to twice the expected number 230 | of new mutations per generation. The parameter $\mu$ is the expected number of new mutations 231 | per gamete, per generation. Mutation positions will be uniformly distributed along the genome. 232 | 233 | Adding mutations changes the complexity of the simulation quite a bit, because now we must 234 | add to and simplify [site table](sec_site_table_definition) and 235 | [mutation table](sec_mutation_table_definition) instances. We might also 236 | want to add *metadata* to the sites or mutations, recording details such as 237 | the selection coefficient of a mutation, or the type of mutation (e.g., synonymous vs. non-synonymous). 238 | 239 | We will write a mutation function here which we will re-use in future examples. 240 | 241 | :::{note} 242 | We will be treating mutations as neutral. Doing so is odd, as one big 243 | selling point of `tskit` is the ability to skip the tracking of neutral mutations 244 | in forward simulations. However, tracking neutral mutations plus metadata is the 245 | same as tracking selected mutations and their metadata, and being able to do neat 246 | things like put your selected mutations onto a figure of the genealogy 247 | is one of several possible use cases. 248 | ::: 249 | 250 | :::{todo} 251 | The rest of this tutorial is still under construction, and needs porting from 252 | [this workbook](https://github.com/tskit-dev/tutorials/blob/main/old-content/notebooks/wfforward.ipynb). 253 | This will primarily deal with sites and mutations (and mutational metadata). 254 | We could also include details on selection, if that seems sensible.
255 | 256 | The section in that workbook on "Starting with a prior history" should be put in 257 | the {ref}`sec_completing_forwards_simulations` tutorial. 258 | ::: -------------------------------------------------------------------------------- /no_mutations.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | (sec_tskit_no_mutations)= 18 | 19 | # Do you really need mutations? 20 | 21 | In tree sequences, the genetic genealogy exists independently of the mutations that 22 | generate genetic variation, and often we are primarily interested in genetic variation 23 | because of what it can tell us about those genealogies. This tutorial aims to illustrate 24 | when we can leave mutations and genetic variation aside and study the genealogies directly. 25 | 26 | In simulations we know the true genealogies, and so it can be very helpful to work 27 | with these directly. 28 | In real data, we might infer the trees and then work with the resulting genealogies. 29 | (Of course, mutations add additional noise, and would be necessary 30 | to produce data directly comparable to sequencing data.) 31 | If you're wondering whether you need to add mutations at all, 32 | it's worth considering the following points: 33 | 34 | 1. Neutral mutations and sites can always be added to a genealogy later 35 | 2. Simulating sites and mutations increases memory requirements and tree 36 | sequence file size somewhat, as well as adding to CPU time (although usually this is 37 | inconsequential) 38 | 3. Quantities of interest can often be inferred equally well (or better!) on tree sequences 39 | that have no sites or mutations. 40 | 41 | To illustrate the first two points, we can use the [msprime](https://tskit.dev/msprime) 42 | {func}`~msprime.sim_mutations` function to add neutral sites and mutations onto a 43 | simulated mutationless tree sequence of 20 diploid individuals: 44 | 45 | ```{code-cell} ipython3 46 | import msprime 47 | L = 1_000_000 # simulate 1 megabase length (could increase for a larger example) 48 | rho = mu = 1e-8 # Human-like recombination and mutation parameters 49 | n_subpops = 2 50 | subpop_size = 1e4 51 | migration_rate = 1e-4 52 | # Create a mutationless diploid tree sequence of n_subpops demes 53 | ts_no_mut = msprime.sim_ancestry( 54 | samples={f"pop_{i}": 10 for i in range(n_subpops)}, # 10 samples from each subpop 55 | demography=msprime.Demography.island_model([subpop_size] * n_subpops, migration_rate), 56 | ploidy=2, 57 | recombination_rate=rho, 58 | sequence_length=L, 59 | random_seed=123, 60 | ) 61 | 62 | # Optionally, add neutral mutations later, after simulating. This takes some CPU time 63 | # (although it is usually fast compared to simulating the original tree sequence) 64 | ts_mutated = msprime.sim_mutations(ts_no_mut, rate=mu, random_seed=456) 65 | print( 66 | "Adding mutations has increased the tree sequence file size by " 67 | f"{(ts_mutated.nbytes / ts_no_mut.nbytes - 1) * 100:.0f}%", 68 | ) 69 | ``` 70 | 71 | :::{note} 72 | Above we have overlaid sites and mutations onto an existing tree sequence by simulation.
73 | It is also possible to overlay mutations in such a way as to generate a known pattern of 74 | genetic variation, using {meth}`tskit.Tree.map_mutations`. 75 | ::: 76 | 77 | 78 | Although including mutations has increased the file size a fair bit in this example, 79 | unless you are running simulations with vast numbers of mutations, tree sequences are 80 | usually compact enough that the cost of including them shouldn't be prohibitive. 81 | Nevertheless, it may not be obvious that it is perfectly possible, indeed sometimes 82 | preferable, to perform genetic analyses on tree sequences that do not contain mutations 83 | and variable sites. This is the focus of the remainder of this tutorial. 84 | 85 | 86 | ## Analysis in the absence of genetic variation 87 | 88 | ### Patterns of relationship 89 | 90 | Some genetic analyses are primarily focussed on patterns or degrees of relationship 91 | between genomes. In this case, the genealogy tells you all you need to know. 92 | This includes: 93 | 94 | * analyses of local ancestry, global ancestry and identity-by-descent 95 | * identification of most recent common ancestors and their descendants (including e.g. 96 | genealogical nearest neighbour analysis) 97 | 98 | Although these methods are primarily described elsewhere, the code below illustrates 99 | how a tree sequence without mutations can be used to find the average time to the 100 | most recent common ancestor (tMRCA) of two sample genomes. The genomes have been 101 | picked at random from the two different populations, and the times averaged over 102 | the entire 1Mb genome. 103 | 104 | ```{code-cell} ipython3 105 | import numpy as np 106 | np.random.seed(10) 107 | sample_a = np.random.choice(ts_no_mut.samples(population=0), size=1)[0] 108 | sample_b = np.random.choice(ts_no_mut.samples(population=1), size=1)[0] 109 | av_tMRCA = 0 110 | for tree in ts_no_mut.trees(): 111 | # weight the tMRCA by the span of genome covered 112 | av_tMRCA += tree.tmrca(sample_a, sample_b) * tree.span/ts_no_mut.sequence_length 113 | print(f"Average tMRCA between sample {sample_a} (pop_0) and", 114 | f"{sample_b} (pop_1) is {av_tMRCA:.2f} {ts_no_mut.time_units}") 115 | ``` 116 | 117 | ### Genetic statistics and branch-length equivalents 118 | 119 | Although many genetic analyses are based on patterns of genetic variation, for many 120 | purposes the genetic variation can be thought of as a measure of the relative length of 121 | branches on the local trees in a tree sequence. So while mutations are necessary to 122 | generate realistically variable genetic sequences, some statistical analyses do not 123 | necessarily require them to be present in a tree sequence (see 124 | [this paper](https://doi.org/10.1534/genetics.120.303253) which explains the duality 125 | between statistics based on genetic variation and their branch length equivalents). 126 | Such statistics include all those based on the allele frequency spectrum, such as genetic 127 | diversity and Tajima's D, and those based on genetic divergence between populations 128 | such as Fst and Patterson's f statistics. 129 | 130 | A simple example of a statistic that is normally calculated by looking at variable sites 131 | is the genetic divergence. This is usually defined as the 132 | proportion of the genome that differs between two sample genomes. 
In a tree sequence with 133 | mutations, it can be calculated using the {meth}`~TreeSequence.divergence` method: 134 | 135 | ```{code-cell} ipython3 136 | # By default, estimating average "genetic divergence" requires mutations to be present 137 | print( 138 | f"Genetic divergence between samples {sample_a} and {sample_b} is", 139 | f"{ts_mutated.divergence([[sample_a], [sample_b]]):.6f}" 140 | ) 141 | ``` 142 | 143 | If mutation rates are low (i.e. when the "infinite sites" model of mutation is 144 | a good approximation), each genetic difference between sequences corresponds to a 145 | mutation on the lineage connecting the two samples. Since the number of mutations is 146 | expected to be proportional to the length of the lineage, we can use the 147 | lineage length directly to measure divergence (in this case adding the branch length 148 | from the first sample to the MRCA and the branch from the MRCA to the second sample). 149 | The {ref}`general statistics framework` allows you to switch to 150 | these "branch length" measures by using the `mode="branch"` parameter: 151 | 152 | ```{code-cell} ipython3 153 | # By default, statistics are calculated based on variable sites (mode="site"), but 154 | # we can switch to the branch-length equivalent using mode="branch" 155 | sample_sets = [sample_a], [sample_b] 156 | ab_dist = ts_no_mut.divergence(sample_sets, mode="branch") 157 | print( 158 | f"Av. genealogical distance between samples {sample_a} and {sample_b} is", 159 | f"{ab_dist:.1f} {ts_no_mut.time_units}" 160 | ) 161 | print("With both samples at time 0, this is twice the previously calculated av tMRCA:") 162 | print(f" av_tMRCA was {av_tMRCA:.1f} (2 * {av_tMRCA:.1f} = {2 * av_tMRCA:.1f})") 163 | 164 | # To compare it to the standard genetic divergence, simply multiply by the mutation rate 165 | print( 166 | "Estimated genetic divergence from the genealogy is", 167 | f"{ab_dist * mu:.6f}" 168 | ) 169 | ``` 170 | 171 | #### Genealogy-based measures are less noisy 172 | 173 | Analyses based on observed genetic variation have a random component due to the 174 | stochastic nature of the mutational process. This "random mutational noise" is missing 175 | from analyses that use the genealogy directly: something that is particularly evident 176 | when the analysis is dependent on a small number of mutations. Here's an example 177 | which contrasts the conventional and branch-length versions of the well-known $F_{st}$ 178 | statistic, across the two populations we previously simulated.
179 | 180 | ```{code-cell} ipython3 181 | import matplotlib_inline 182 | import matplotlib.pyplot as plt 183 | 184 | n_reps = 20 185 | ts_reps = list(msprime.sim_ancestry( 186 | samples={f"pop_{i}": 10 for i in range(n_subpops)}, 187 | demography=msprime.Demography.island_model([subpop_size] * n_subpops, migration_rate), 188 | ploidy=2, 189 | recombination_rate=rho, 190 | sequence_length=L, 191 | random_seed=123, 192 | num_replicates=n_reps, 193 | )) 194 | 195 | ts_mutated_reps = [ 196 | # Decrease the mutation rate to exaggerate the effect of random mutational noise 197 | msprime.sim_mutations(ts, rate=mu/100, random_seed=i+4) 198 | for i, ts in enumerate(ts_reps) 199 | ] 200 | 201 | # Return sample sets as all samples from each population (uses all pairwise comparisons) 202 | def sample_sets(ts): 203 | return [ts.samples(population=p.id) for p in ts.populations()] 204 | 205 | Fst_genealogy = np.array([ts.Fst(sample_sets(ts), mode="branch") for ts in ts_reps]) 206 | Fst_genetic_var = np.array([ts.Fst(sample_sets(ts)) for ts in ts_mutated_reps]) 207 | 208 | # For the theoretical expectation see e.g. Crow and Aoki (1984) PNAS 81: 6073, eqn 7 209 | Fst_theory = 1/(4*subpop_size*migration_rate*(n_subpops/(n_subpops-1))**2 + 1) 210 | 211 | plt.scatter(["Genetic variation"] * n_reps, Fst_genetic_var) 212 | plt.scatter(["Genealogy"] * n_reps, Fst_genealogy) 213 | plt.xlabel("Basis of estimate") 214 | plt.ylabel("Fst\n(20 replicates)") 215 | 216 | plt.axhline(y=Fst_theory, ls=":", c="grey") 217 | plt.text(0.5, Fst_theory, 'theoretical prediction', ha='center') 218 | plt.show() 219 | ``` 220 | 221 | ::::{margin} 222 | :::{note} 223 | In real data there is additional noise introduced by 224 | inference of the underlying tree sequence, which is not accounted for in these examples. 225 | ::: 226 | :::: 227 | 228 | Therefore, if your ultimate goal is to compare or estimate genetic statistics of this 229 | sort (rather than to examine the genetic sequence itself), then using the 230 | genealogy-based approach should give you more statistical power. 231 | 232 | As with genetic diversity, there also exists a "branch length" version of the allele 233 | frequency spectrum (the AFS), which measures the length of branches subtending 1 sample, 234 | 2 samples, 3 samples, etc. This is a slightly less noisy version of the AFS based on 235 | actual allele frequencies, and it too can be calculated on a tree sequence with no 236 | mutations: 237 | 238 | ```{code-cell} ipython3 239 | fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3)) 240 | 241 | afs1 = ts_mutated.allele_frequency_spectrum(polarised=True, mode="site") 242 | ax1.bar(np.arange(ts_mutated.num_samples+1), afs1) 243 | ax1.set_title("AFS using variable sites (ts with mutations)") 244 | 245 | afs2 = ts_no_mut.allele_frequency_spectrum(polarised=True, mode="branch") 246 | ax2.bar(np.arange(ts_no_mut.num_samples+1), afs2) 247 | ax2.set_title("Branch length AFS (ts without mutations)") 248 | 249 | plt.show() 250 | ``` 251 | 252 | In this case, the plots are almost identical in shape because there are thousands of 253 | mutations over the entire sequence, so the mutational noise has been smoothed out 254 | (the remaining unevenness in the AFS plots is due to stochasticity in the genealogy, rather 255 | than the mutations).
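We can check this correspondence numerically (a quick check, relying on the expectation that, under the infinite-sites approximation, each site-based AFS entry is roughly the corresponding branch-length AFS entry multiplied by the mutation rate):

```{code-cell} ipython3
# Compare the two spectra directly, ignoring the (empty) 0 and n frequency classes
ratio = afs1[1:-1] / (afs2[1:-1] * mu)
print(f"Mean ratio of site AFS to mu * branch AFS: {ratio.mean():.2f}")
```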
256 | 257 | However, if we are doing a windowed analysis, and the windows over 258 | the genome are small, each window will contain relatively few mutations, and statistics 259 | based on the genetic variation generated by mutations will be subject to greater 260 | mutational noise than those based on branch lengths in the genealogy. Here's an 261 | example using the basic {meth}`genetic diversity <TreeSequence.diversity>` 262 | in 1 kb windows along our simulated genome: 263 | 264 | ```{code-cell} ipython3 265 | fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 3), sharey=True) 266 | w = np.linspace(0, L, num=L//1_000) 267 | ax1.stairs(ts_mutated.diversity(windows=w), w/1_000, baseline=None) 268 | ax1.set_ylabel("Diversity") 269 | ax1.set_xlabel("Genome position (kb)") 270 | ax1.set_title("Site-based calculation") 271 | ax1.set_yscale("log") 272 | ax2.stairs(ts_no_mut.diversity(windows=w, mode="branch") * mu, w/1_000, baseline=None) 273 | ax2.set_xlabel("Genome position (kb)") 274 | ax2.set_title("Branch-length-based calculation") 275 | ax2.set_yscale("log") 276 | plt.show() 277 | ``` 278 | 279 | ## Summary 280 | 281 | If you are analysing tree sequences, especially those produced by simulation, 282 | think carefully: do you really need to analyse information about alleles and mutations? 283 | If not, you may be able to omit sites and mutations from the tree sequence, 284 | yet still retain the ability to calculate the parameters of interest. This genealogical 285 | approach can provide more accurate estimates of the quantities of interest 286 | than those explicitly based on genetic variation. 287 | -------------------------------------------------------------------------------- /old-content/README.md: -------------------------------------------------------------------------------- 1 | 2 | Source repository for the tskit tutorials site, 3 | [https://tskit-dev.github.io/tutorials](https://tskit-dev.github.io/tutorials/) 4 | 5 | **UNDER CONSTRUCTION:** This is a very early version, and really just a way to 6 | explore some options for presenting this content. 7 | 8 | ## Organisation 9 | 10 | The ``docs`` directory is a [GitHub pages](https://pages.github.com/) site. This 11 | means that all the Markdown files in this directory are automatically converted to 12 | HTML and made available on the website. 13 | The source content for each 'chapter' is a Jupyter notebook in the ``notebooks`` 14 | directory. Notebooks are then converted to Markdown using ``nbconvert``, and placed 15 | in the ``docs`` directory. 16 | 17 | ## Converting a notebook 18 | 19 | To convert a notebook to markdown, use the following: 20 | 21 | ```shell 22 | $ jupyter nbconvert --to markdown --output-dir docs/ notebooks/NOTEBOOK_NAME.ipynb 23 | ``` 24 | 25 | When adding a new notebook to the site, you need to then add the files to 26 | git: 27 | 28 | ```shell 29 | $ git add docs/NOTEBOOK_NAME* 30 | ``` 31 | 32 | Finally, update the ``docs/README.md`` to insert a link to the new page. 33 | 34 | ## TODO 35 | 36 | - Need standardised titles including authorship. 37 | - Need some sort of citation mechanism. Perhaps [this](https://github.com/takluyver/cite2c)? 38 | - Main page needs some content explaining what the site is for. 39 | - Better template? We can use any Jekyll template, so it's quite flexible. 40 | - It would also be nice to have a download link to the original notebook.
41 | 42 | -------------------------------------------------------------------------------- /old-content/docs/README.md: -------------------------------------------------------------------------------- 1 | 2 | This is a collection of tutorials for using ``tskit`` and ``msprime``. 3 | Each tutorial is an indepth exploration of a particular narrow topic, 4 | and is written in the form of a Jupyter notebook. 5 | 6 | ## Writing your own simulators 7 | 8 | These tutorials show how it is possible to write your own simulators 9 | using the ``tskit`` Tables API. 10 | 11 | - A simple forwards-time [Wright-Fisher](wfforward.md) simulator. 12 | - The simple Wright-Fisher example implemented using [Cython](wfcython.md) 13 | 14 | 15 | ## Advanced topics in coalescent simulation 16 | 17 | These tutorials show how to use the ``msprime`` simulator to simulate 18 | specific scenarios and how to analyse the resulting tree sequences. 19 | 20 | **NOTE: these are currently a work in progress** 21 | 22 | - Simulating [introgression](introgression.html) with msprime. 23 | - Simulating [bottlenecks](bottlenecks.html) with msprime 24 | 25 | ## Slides from workshops, etc.. 26 | 27 | - [Slides](msprime_out.md) from a workshop given by Kevin Thornton at UC Davis in December 2018. 28 | -------------------------------------------------------------------------------- /old-content/docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate 2 | title: tskit tutorials 3 | description: Detailed tutorials for msprime and tskit 4 | -------------------------------------------------------------------------------- /old-content/docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 16 | 24 | 25 | 26 | {% seo %} 27 | 28 | 29 | 30 | 31 | 32 |
33 |
34 | View on GitHub 35 | 36 |

{{ site.title | default: site.github.repository_name }}

37 |

{{ site.description | default: site.github.project_tagline }}

38 | 39 | {% if site.show_downloads %} 40 |
41 | Download this project as a .zip file 42 | Download this project as a tar.gz file 43 |
44 | {% endif %} 45 |
46 |
47 | 48 | 49 |
50 |
51 | {{ content }} 52 |
53 |
54 | 55 | 56 | 64 | 65 | {% if site.google_analytics %} 66 | 74 | {% endif %} 75 | 76 | 77 | -------------------------------------------------------------------------------- /old-content/docs/bottlenecks.md: -------------------------------------------------------------------------------- 1 | 2 | # Bottlenecks 3 | 4 | **Konrad Lohse and Jerome Kelleher** 5 | 6 | The site frequency spectrum (SFS) summarises variants by their frequency in a sample and is a fundamental summary of sequence variation that forms the basis of many modern inference approaches (e.g. sweepfinder, DFE-alpha, dadi). The SFS is a lossless summary of unlinked variants, so any summary of sequence variation that ignores linkage (e.g. pairwise measures of diversity and divergence, F_st, Tajima's D and D) is itself a summary of the SFS. 7 | 8 | The SFS is convenient analytically, because it only depends on the mean length and frequency of genealogical branches. For many demographic models of interest the means can be derived analytically, either using coalescent theory (cite Huang, TPB) or diffusion equations (cite dadi). A number of composite likelihood approaches have been developed based on analytic results for the SFS (cite dadi Excoffier, Jaada). However, analytic expectations for the SFS break down for large samples and/or complex demographic models. 9 | 10 | In the following section we show how the SFS can be approximated using coalescent simulations and compare such approximations to analytic results. We will assume a simple toy history of a single panmictic population that is affected by an instantaneous bottleneck at time T with strength s (cite Galtier et al). The effect of this bottleneck is to induce a sudden burst of coalescence, which can involve simultaneous multiple mergers. We measure bottleneck strength as the probability that a pair of lineages coalesces during the bottleneck (we could of course convert s into an (imaginary) time period that would give the same probability of coalescence, $s=1-e^{-T}$). 11 | 12 | We assume a sample of size 10 and use msprime to simulate 10,000 replicate genealogies. For each genealogy the function approx_SFS records the unfolded SFS as the mean length of branches with n leaf nodes (normalized by the total length of the genealogy) by iterating through all nodes in the tree sequence. Note that we are simulating genealogies only, i.e. we do not need to simulate mutations.
13 | 14 | We use a for loop to record the SFS for a range of bottleneck strengths parameters in a dictionary: 15 | 16 | 17 | ```python 18 | %matplotlib inline 19 | %config InlineBackend.figure_format = 'svg' 20 | import msprime 21 | import numpy as np 22 | import seaborn as sns 23 | import matplotlib.pyplot as plt 24 | ``` 25 | 26 | 27 | ```python 28 | def run_bott_sims(num_rep, num_samp, T, s): 29 | demographic_events = [ 30 | msprime.InstantaneousBottleneck(time=T, strength=s, population=0)] 31 | reps = msprime.simulate( 32 | sample_size=num_samp, Ne=Ne, num_replicates=num_rep, 33 | demographic_events=demographic_events) 34 | return reps 35 | 36 | def approx_SFS(reps): 37 | B = np.zeros((num_rep, num_samp)) 38 | for rep_index, ts in enumerate(reps): 39 | assert ts.num_trees == 1 40 | tree = ts.first() 41 | for u in tree.nodes(): 42 | nleaves = tree.num_samples(u) 43 | if tree.parent(u) != msprime.NULL_NODE: 44 | B[rep_index, nleaves] += tree.branch_length(u) 45 | data = np.mean(B, axis=0) 46 | data /= np.sum(data) 47 | return data 48 | 49 | num_rep = 10000 50 | num_samp = 10 51 | Ne = 1 52 | T = 0.5 53 | taulist= np.array([0,1,2,3]) 54 | datalist = {} 55 | for tau in taulist: 56 | datalist[tau]= approx_SFS(run_bott_sims(num_rep, num_samp, T, tau)) 57 | 58 | # My guess/assumption is that currently bottleneck strength in msprime is scaled as an (imaginary) time tau (in units of 4N_e) generations. 59 | # It makes a lot more sense to express the bottleneck strength as the probability of pairwise coalescence 60 | # during the bottelenck s=1-np.exp(-tau/2) 61 | ``` 62 | 63 | With increasing bottleneck strength the SFS becomes increasingly skewed (the leftmost blue bars show the SFS for a population of constant size). However, bottlenecks have a complex effect on the different frequency classes of the SFS: while the relative frequency of singletons increases, other frequency classes (e.g. doubletons) have a non-monotonic relationship with bottleneck strength: 64 | 65 | 66 | ```python 67 | bar_width=0.2 68 | index = np.arange(1,num_samp) 69 | j = 0 70 | for s, B in datalist.items(): 71 | plt.bar(index + j * bar_width, B[1:], bar_width, label=str(s)) 72 | j += 1 73 | ``` 74 | 75 | 76 | ![svg](bottlenecks_files/bottlenecks_4_0.svg) 77 | 78 | 79 | ### Comparison with analytic predictions 80 | 81 | How does the approximate SFS compare to analytic expectations? For a population of constant size, the SFS is simply given by Watterson's correction factor, that is the total length branches with i leafnodes is given is 1/i. Reassuringly, in the limit of s=0 (no bottleneck), our SFS approximation based on simulated genealogies agrees with this prediction: 82 | 83 | 84 | ```python 85 | expsfs=[(1/i) for i in range(1,10)] 86 | expsfs/=np.sum(expsfs) 87 | 88 | fig, ax = plt.subplots() 89 | index = np.arange(1,10) 90 | bar_width = 0.4 91 | opacity = 0.9 92 | 93 | simsfs = ax.bar(index, datalist[0][1:], bar_width, alpha=opacity, label='sim') 94 | expextsfs = ax.bar(index+ bar_width, expsfs, bar_width, alpha=opacity, label='exp') 95 | 96 | fig.tight_layout() 97 | plt.show() 98 | ``` 99 | 100 | 101 | ![svg](bottlenecks_files/bottlenecks_7_0.svg) 102 | 103 | 104 | The analytic prediction for the SFS under a bottleneck model is more complicated (Bunnefeld et al. 2015, Appendix). 
For a sample of n=4 lineages the SFS is: 105 | 106 | 107 | ```python 108 | # We are assuming a bottleneck of strength tau = 4 N_e generations 109 | # and a bottleneck time of T=1 (2 in units of 4 Ne) 110 | # I am pretty sure the analytic prediction for the SFS is correct: the limit for s->0 is correct and 111 | # it matches the automatically generated expression in the Mathematica .nb... 112 | 113 | T=2 114 | slist=[1-np.exp(-tau) for tau in taulist] 115 | 116 | for s in slist: 117 | p=s*(-6 + 15*s - 20 * np.power(s,2) + 15 * np.power(s,3) - 6 * np.power(s,4) + np.power(s,5)) 118 | expsfsBottlN= [2/15*(np.exp(-6*T)*(15 *np.exp(6*T) - 9 *np.exp(5*T)*s - 119 | 5*np.exp(3*T)*s*(3 - 3*s + np.power(s,2)) + p)), 120 | 1/5*np.exp(-6*T)*(5*np.exp(6*T) - 6*np.exp(5*T)*s - p), 121 | 2/15*np.exp(-6*T)*(5*np.exp(6*T) - 9*np.exp(5*T)*s + 5*np.exp(3*T)*s*(3-3*s + np.power(s,2)) + p)] 122 | 123 | expsfsBottlN/=np.sum(expsfsBottlN) 124 | print(expsfsBottlN) 125 | ``` 126 | 127 | [ 0.54545455 0.27272727 0.18181818] 128 | [ 0.5644372 0.26717043 0.16839237] 129 | [ 0.57248098 0.26486068 0.16265835] 130 | [ 0.57559102 0.26396986 0.16043912] 131 | 132 | 133 | The fit between the SFS simulated with msprime and the analytic prediction is not convincing (given the 100,000 replicates): 134 | 135 | 136 | ```python 137 | num_samp = 4 138 | num_rep = 100000 139 | data4 = {} 140 | T = 1 141 | for tau in taulist: 142 | data4[tau]= approx_SFS(run_bott_sims(num_rep, num_samp, T, tau/2)) 143 | ``` 144 | 145 | 146 | ```python 147 | fig, ax = plt.subplots() 148 | index = np.arange(1,4) 149 | bar_width = 0.4 150 | print(data4[0][1:]) 151 | print(data4[1][1:]) 152 | print(data4[2][1:]) 153 | print(data4[3][1:]) 154 | 155 | simsfs = ax.bar(index, data4[3][1:], bar_width, alpha=opacity, label='sim') 156 | expextsfs = ax.bar(index+ bar_width, expsfsBottlN, bar_width, label='exp') 157 | 158 | fig.tight_layout() 159 | plt.show() 160 | ``` 161 | 162 | [ 0.54474183 0.27407036 0.18118781] 163 | [ 0.55186469 0.26647432 0.181661 ] 164 | [ 0.56819938 0.25679356 0.17500706] 165 | [ 0.58875024 0.24738596 0.16386379] 166 | 167 | 168 | ![svg](bottlenecks_files/bottlenecks_12_1.svg) 169 | 170 | 171 | ## The distribution of n-ton branches 172 | 173 | Given that the SFS only depends on mean branch lengths, it is interesting to inspect the probability density distribution of the underlying genealogical branches. Because of the discrete bottleneck event, the pdfs of n-ton branch lengths are discontinuous.
175 | 176 | 177 | ```python 178 | s=1 179 | demographic_events = [msprime.InstantaneousBottleneck(time=T, strength=s, population=0)] 180 | reps = msprime.simulate( 181 | sample_size=num_samp, Ne=Ne, num_replicates=num_rep, 182 | demographic_events=demographic_events) 183 | B = np.zeros((num_rep, num_samp)) 184 | for rep_index, ts in enumerate(reps): 185 | tree = next(ts.trees()) 186 | for u in tree.nodes(): 187 | nleaves = tree.num_samples(u) 188 | if tree.parent(u) != msprime.NULL_NODE: 189 | B[rep_index, nleaves]+=tree.branch_length(u) 190 | ``` 191 | 192 | 193 | ```python 194 | Btrans=np.array(B).T.tolist() 195 | sns.distplot(Btrans[1],axlabel="f(t)") 196 | sns.distplot(Btrans[2],axlabel="f(t)") 197 | sns.distplot(Btrans[3],axlabel="f(t)"); 198 | ``` 199 | 200 | 201 | ![svg](bottlenecks_files/bottlenecks_16_0.svg) 202 | 203 | 204 | ### To Do 205 | 206 | 1) Fix the scaling of the strength in msprime 207 | 2) Fix the pdf plot above: 208 | - Label axes on the pdf plot above: y-> f(t), x -> t 209 | - Restrict X range to 15 210 | 3) Fix the x axes on all the barplots so that these correspond to classes in the SFS 211 | -------------------------------------------------------------------------------- /old-content/docs/introgression.md: -------------------------------------------------------------------------------- 1 | 2 | # Introgression 3 | 4 | **Jerome Kelleher and Konrad Lohse** 5 | 6 | There has been great interest in understanding the contributions past populations have made to the genetic diversity of current populations via admixture. In particular, the availability of whole genome sequence data from archaic hominins (Green et al 2010) has allowed geneticists to identify admixture tracts (Sankararaman 2016). In the simplest case, admixture tracts can be defined heuristically as regions of the genome that show excessive similarity between a putative source and a recipient population (usually quantified relative to some non-admixed reference population, ref Durand et al 2010). Because recombination breaks down admixture tracts, their length distribution gives a clock for calibrating admixture, and this information has been used to date the admixture contributions that Neanderthals and other archaic hominins have made to non-African humans (Sankararaman 2016) and to reconstruct the admixture history between different modern human populations (ref). 7 | 8 | Crucially, the power to identify admixture depends on the relative time between the admixture event and the earlier divergence between the source and the recipient population: the shorter this interval, the harder it becomes to detect admixture. This is because ancestral material is increasingly likely to trace its ancestry back to the common ancestral population regardless of whether it has been involved in any recent admixture or not. In other words, it becomes increasingly difficult to distinguish admixture from incomplete lineage sorting (ILS). 9 | 10 | In the following section we use msprime simulations to ask what fraction of admixture tracts are identifiable as such. 11 | 12 | To illustrate this, we simulate ancestral recombination graphs (ARGs) under a simple toy history of divergence and admixture which is loosely motivated by the demographic history of modern humans and Neandertals. As in previous sections, we will first examine properties of the ARG directly rather than use it to simulate mutations.
We assume a minimal sample of a single (haploid) genome from a modern human population in African and Eurasia as well as an ancient Neandertal sample. 13 | 14 | Considering a rooted ARG, we want to distinguish three categories of segments: i) tracts actually involved in Neandertal admixture, ii) the subset of those tracts that coalesces in the Neandertal population and iii) segments at which Eurasians are more closely related to Neandertals than either are to Africans. This latter category must include all of ii) but also an additional set of short tracts that are due to incomplete lineage sorting (ILS). The last category is interesting because it is the only one that can be unambiguously detected in data (via derived mutations that are shared by Neandertals and Eurasians). 15 | 16 | First we set up a highly simplified demographic history of human neandertal demography and simulate a single chromosome of 100Mb length: 17 | 18 | 19 | ```python 20 | %matplotlib inline 21 | %config InlineBackend.figure_format = 'svg' 22 | import random 23 | import collections 24 | import msprime 25 | import numpy as np 26 | import seaborn as sns 27 | import multiprocessing 28 | import matplotlib.pyplot as plt 29 | 30 | from IPython.display import SVG 31 | ``` 32 | 33 | 34 | ```python 35 | # Population IDs: Africa, Eurasia, Neanderthal 36 | AFR, EUR, NEA = 0, 1, 2 37 | 38 | def run_simulation(random_seed=None): 39 | time_units = 1000 / 25 # Conversion factor for kya to generations 40 | ts = msprime.simulate( 41 | Ne=10**4, # The same for all populations; highly unrealistic! 42 | recombination_rate=1e-8, 43 | length=100*10**6, # 100 Mb 44 | samples=[ 45 | msprime.Sample(time=0, population=AFR), 46 | msprime.Sample(time=0, population=EUR), 47 | # Neanderthal sample taken 30 kya 48 | msprime.Sample(time=30 * time_units, population=NEA), 49 | ], 50 | population_configurations = [ 51 | msprime.PopulationConfiguration(), # Africa 52 | msprime.PopulationConfiguration(), # Eurasia 53 | msprime.PopulationConfiguration(), # Neanderthal 54 | ], 55 | demographic_events = [ 56 | msprime.MassMigration( 57 | # 2% introgression 50 kya 58 | time=50 * time_units, 59 | source=EUR, dest=NEA, proportion=0.02), 60 | msprime.MassMigration( 61 | # Eurasian & Africa populations merge 70 kya 62 | time=70 * time_units, 63 | source=EUR, dest=AFR, proportion=1), 64 | msprime.MassMigration( 65 | # Neanderthal and African populations merge 300 kya 66 | time=300 * time_units, 67 | source=NEA, destination=AFR, proportion=1), 68 | ], 69 | record_migrations=True, # Needed for tracking segments. 70 | random_seed=random_seed, 71 | ) 72 | return ts 73 | 74 | ts = run_simulation(1) 75 | ``` 76 | 77 | Here we run our simulation in the usual way, but including the ``record_migrations`` option. This allows us to track segments of ancestral material that migrate from the European population into the Neanderthal population (backwards in time). We can then examine the length distributions of these segments and compare them with the length of the segments that also go on to coalesce within the Neanderthal population. 
78 | 79 | 80 | ```python 81 | def get_migrating_tracts(ts): 82 | migrating_tracts = [] 83 | # Get all tracts that migrated into the neanderthal population 84 | for migration in ts.migrations(): 85 | if migration.dest == NEA: 86 | migrating_tracts.append((migration.left, migration.right)) 87 | return np.array(migrating_tracts) 88 | 89 | def get_coalescing_tracts(ts): 90 | coalescing_tracts = [] 91 | tract_left = None 92 | for tree in ts.trees(): 93 | # 1 is the Eurasian sample and 2 is the Neanderthal 94 | mrca_pop = tree.population(tree.mrca(1, 2)) 95 | left = tree.interval[0] 96 | if mrca_pop == NEA and tract_left is None: 97 | # Start a new tract 98 | tract_left = left 99 | elif mrca_pop != NEA and tract_left is not None: 100 | # End the last tract 101 | coalescing_tracts.append((tract_left, left)) 102 | tract_left = None 103 | if tract_left is not None: 104 | coalescing_tracts.append((tract_left, ts.sequence_length)) 105 | return np.array(coalescing_tracts) 106 | 107 | def get_eur_nea_tracts(ts): 108 | tracts = [] 109 | tract_left = None 110 | for tree in ts.trees(): 111 | # 1 is the Eurasian sample and 2 is the Neanderthal 112 | mrca = tree.mrca(1, 2) 113 | left = tree.interval[0] 114 | if mrca != tree.root and tract_left is None: 115 | # Start a new tract 116 | tract_left = left 117 | elif mrca == tree.root and tract_left is not None: 118 | # End the last tract 119 | tracts.append((tract_left, left)) 120 | tract_left = None 121 | if tract_left is not None: 122 | tracts.append((tract_left, ts.sequence_length)) 123 | return np.array(tracts) 124 | 125 | 126 | migrating = get_migrating_tracts(ts) 127 | within_nea = get_coalescing_tracts(ts) 128 | eur_nea = get_eur_nea_tracts(ts) 129 | ``` 130 | 131 | We build three different lists. The first is the set of tracts that have migrated from the Eurasian population into the Neanderthal population, and is obtained simply by finding all migration records in which the destination population is equal to NEA. The second list (which must contain a subset of the segments in the first list) is the set of ancestral segments that went on to coalesce within the Neanderthal population. The third list contains all segments in which the Eurasian and Neanderthal samples coalesce before their ancestor coalesces with the African sample. The third list includes both Eurasian segments that migrated to the Neanderthal population and segments that did not migrate and did not coalesce until after the Neanderthal-human population split 300 kya. 132 | 133 | 134 | ```python 135 | nea_total = np.sum(eur_nea[:,1] - eur_nea[:,0]) 136 | migrating_total = np.sum(migrating[:,1] - migrating[:,0]) 137 | within_nea_total = np.sum(within_nea[:,1] - within_nea[:,0]) 138 | print([nea_total, migrating_total, within_nea_total]) 139 | ``` 140 | 141 | [51464256.237136059, 1533972.029931426, 630462.28620933369] 142 | 143 | 144 | Although $f=0.02$, the total length of admixed segments is 5% of the chromosome. Presumably this excess is just due to coalescence variance? We expect a proportion $1-e^{-(T_{split}-T_{ad})}$ of admixed lineages to coalesce. Given our time parameters, $T_{split}-T_{ad} = 1/2$ (in units of $2N_e$ generations), we expect $1-e^{-\frac{1}{2}} \approx 0.39$ of the admixed sequence to coalesce in the Neanderthal population. Why is the observed fraction just 0.029?
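One quick way to examine this (a rough check that simply re-uses the totals computed above; note that the ratio of total tract lengths is only an approximation to the per-lineage coalescence probability) is to compare that expectation with the fraction of migrating sequence whose EUR-NEA ancestor actually coalesces in the Neanderthal population:

```python
# Expected pair coalescence probability over (T_split - T_ad) = 1/2 (in units of 2 Ne)
expected = 1 - np.exp(-0.5)
# Fraction of the migrating (admixed) sequence that coalesces in the Neanderthal population
observed = within_nea_total / migrating_total
print("expected:", expected, "observed:", observed)
```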
145 | 146 | 147 | ```python 148 | kb = 1 / 1000 149 | plt.hist([ 150 | (eur_nea[:,1] - eur_nea[:,0]) * kb, 151 | (migrating[:,1] - migrating[:,0]) * kb, 152 | (within_nea[:,1] - within_nea[:,0]) * kb,], 153 | label=["Migrating", "EUR-NEA", "Within NEA"] 154 | ) 155 | plt.yscale('log') 156 | plt.legend() 157 | plt.xlabel("Tract length (KB)"); 158 | ``` 159 | 160 | 161 | ![svg](introgression_files/introgression_9_0.svg) 162 | 163 | 164 | Plotting these tract lengths for a single replicate shows that, as expected, admixture tracts are large initially (blue). We can also see that there is extensive ILS which has two effects: First, only a subset of the admixted material has ancestry in the Neandertal population, it would to check whether this fits the total lengths}). Second there many more tracts at which Neandertals and non-African humans are more closely related to each other due to incomplete lineage sorting (ILS) than there are admixture tracts. Finally ILS tracts because they are old by definition are substantially shorter than admixture tracts. 165 | 166 | ## Locating mutations 167 | 168 | We are interested in finding the population in which mutations arose. Because mutations are just associated with a specific tree node in msprime, we must simulate some extra information in order to make this question answerable. This is quite straightforward to do, since we can generate a time for each mutation uniformly along a branch and therefore unambiguously locate it time (and, therefore, space). 169 | 170 | 171 | ```python 172 | def simulate_mutation_times(ts, random_seed=None): 173 | rng = random.Random(random_seed) 174 | mutation_time = np.zeros(ts.num_mutations) 175 | for tree in ts.trees(): 176 | for mutation in tree.mutations(): 177 | a = tree.time(mutation.node) 178 | b = tree.time(tree.parent(mutation.node)) 179 | mutation_time[mutation.id] = rng.uniform(a, b) 180 | return mutation_time 181 | 182 | pop_configs = [ 183 | msprime.PopulationConfiguration(sample_size=3), 184 | msprime.PopulationConfiguration(sample_size=1), 185 | msprime.PopulationConfiguration(sample_size=1)] 186 | M = [ 187 | [0, 1, 1], 188 | [1, 0, 1], 189 | [1, 1, 0]] 190 | ts = msprime.simulate( 191 | population_configurations=pop_configs, migration_matrix=M, 192 | record_migrations=True, mutation_rate=0.5, random_seed=25) 193 | mutation_time = simulate_mutation_times(ts, random_seed=25) 194 | 195 | ``` 196 | 197 | Once we have run our simulation and assigned times to each mutation, we can then assign populations to each of these mutations. The following function takes a simple approach, but first gathering the migrations for each node into a list. Then, for every mutation, we sequentially examine each migration that affects the mutation's node and intersects with the site position. Because we know that the migration records are sorted in increasing time order, we can simply apply the effects of each migration while the migration record's time is less than the time of the mutation. At the end of this process, we then return the computed mapping of mutation IDs to the populations in which they arose. 
198 | 199 | 200 | ```python 201 | def get_mutation_population(ts, mutation_time): 202 | node_migrations = collections.defaultdict(list) 203 | for migration in ts.migrations(): 204 | node_migrations[migration.node].append(migration) 205 | mutation_population = np.zeros(ts.num_mutations, dtype=int) 206 | for tree in ts.trees(): 207 | for site in tree.sites(): 208 | for mutation in site.mutations: 209 | mutation_population[mutation.id] = tree.population(mutation.node) 210 | for mig in node_migrations[mutation.node]: 211 | # Stepping through all migrations will be inefficient for large 212 | # simulations. Should use an interval tree (e.g. 213 | # https://pypi.python.org/pypi/intervaltree) to find all 214 | # intervals intersecting with site.position. 215 | if mig.left <= site.position < mig.right: 216 | # Note that we assume that we see the migration records in 217 | # increasing order of time! 218 | if mig.time < mutation_time[mutation.id]: 219 | assert mutation_population[mutation.id] == mig.source 220 | mutation_population[mutation.id] = mig.dest 221 | return mutation_population 222 | 223 | mutation_population = get_mutation_population(ts, mutation_time) 224 | ``` 225 | 226 | 227 | ```python 228 | tree = ts.first() 229 | colour_map = {0:"red", 1:"blue", 2: "green"} 230 | node_colours = {u: colour_map[tree.population(u)] for u in tree.nodes()} 231 | mutation_colours = {mut.id: colour_map[mutation_population[mut.id]] for mut in tree.mutations()} 232 | SVG(tree.draw(node_colours=node_colours, mutation_colours=mutation_colours)) 233 | 234 | ``` 235 | 236 | 237 | 238 | 239 | ![svg](introgression_files/introgression_15_0.svg) 240 | 241 | 242 | 243 | This example shows the populations in which the mutations arose along the tree branches. We show a single tree here for simplicity, but the method also works when we have recombination.
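As a simple use of this mapping (a rough sketch that just re-uses the `mutation_population` array computed above), we can tally how many of the simulated mutations arose in each population:

```python
# Count mutations by the population in which they arose (0=AFR, 1=EUR, 2=NEA)
collections.Counter(mutation_population.tolist())
```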
244 | -------------------------------------------------------------------------------- /old-content/docs/introgression_files/introgression_15_0.svg: -------------------------------------------------------------------------------- 1 | 5768103240911315276481012 -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_15_0.svg: -------------------------------------------------------------------------------- 1 | 7175210495455212246126001272123207024936815 -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_19_0.svg: -------------------------------------------------------------------------------- 1 | 5462103 -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_28_0.svg: -------------------------------------------------------------------------------- 1 | 0.01170.01071.16300.01000.01230.01170.01320.01213.57660.00000.00000.00000.00000.00000.00000.00000.00000.00000.0000 -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_32_0.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_39_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_39_0.png -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_40_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_40_0.png -------------------------------------------------------------------------------- /old-content/docs/msprime_out_files/msprime_out_43_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tskit-dev/tutorials/eb3b947e03bc9608ce06db08ae3ad9d46ef3fc07/old-content/docs/msprime_out_files/msprime_out_43_0.png -------------------------------------------------------------------------------- /old-content/docs/wfforward_files/wfforward_11_0.svg: -------------------------------------------------------------------------------- 1 | 14231621178792018191501011113234122726242925285622 -------------------------------------------------------------------------------- /old-content/docs/wfforward_files/wfforward_18_0.svg: -------------------------------------------------------------------------------- 1 | 14->716->819->69->925->128->426->227->324->029->5 -------------------------------------------------------------------------------- /old-content/docs/wfforward_files/wfforward_65_0.svg: -------------------------------------------------------------------------------- 1 | 131181012914370126451302 -------------------------------------------------------------------------------- /old-content/notebooks/Makefile.wfcython: -------------------------------------------------------------------------------- 1 | all: md 2 | 3 | MD=wfcython.md 4 | 5 | md: $(MD) 6 | 7 | clean: 8 | rm -f *.output.ipynb 9 | rm -rf *.output_files 10 | 11 | 
reallyclean: clean 12 | for i in $(MD) ; do \ 13 | rm -f ../docs/$$i ; \ 14 | done 15 | for i in $(basename $(MD)) ; do \ 16 | rm -fr ../docs/$$i"_files" ; \ 17 | done 18 | 19 | %.output.ipynb: %.ipynb 20 | python execute.py $< $@ --timeout 2000 21 | 22 | %.md: %.output.ipynb 23 | # Set --output so that supporting 24 | # file folder (foo_files) gets 25 | # generated with correct 26 | # output name and location 27 | jupyter nbconvert --to markdown --output-dir=../docs --output=$@ $< 28 | 29 | .PHONY: md 30 | .PRECIOUS: %.output.ipynb 31 | -------------------------------------------------------------------------------- /old-content/notebooks/Makefile.wfforward: -------------------------------------------------------------------------------- 1 | all: md 2 | 3 | MD=wfforward.md 4 | 5 | md: $(MD) 6 | 7 | clean: 8 | rm -f *.output.ipynb 9 | rm -rf *.output_files 10 | 11 | reallyclean: clean 12 | for i in $(MD) ; do \ 13 | rm -f ../docs/$$i ; \ 14 | done 15 | for i in $(basename $(MD)) ; do \ 16 | rm -fr ../docs/$$i"_files" ; \ 17 | done 18 | 19 | %.output.ipynb: %.ipynb 20 | python execute.py $< $@ 21 | 22 | %.md: %.output.ipynb 23 | # Set --output so that supporting 24 | # file folder (foo_files) gets 25 | # generated with correct 26 | # output name and location 27 | jupyter nbconvert --to markdown --output-dir=../docs --output=$@ $< 28 | 29 | .PHONY: md 30 | .PRECIOUS: %.output.ipynb 31 | -------------------------------------------------------------------------------- /old-content/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Makefiles 2 | 3 | Makefiles are provided per-notebook, in order to support long-running tutorials. Use `make -f Makefile.foo` to build 4 | the desired output. 5 | -------------------------------------------------------------------------------- /old-content/notebooks/execute.py: -------------------------------------------------------------------------------- 1 | import nbformat 2 | import nbconvert 3 | from nbconvert.preprocessors import ExecutePreprocessor 4 | import argparse 5 | import sys 6 | 7 | 8 | def make_parser(): 9 | parser = argparse.ArgumentParser( 10 | description="Options for converting notebooks") 11 | 12 | parser.add_argument('--timeout', type=int, default=600, 13 | help='Execution timeout (seconds)') 14 | return parser 15 | 16 | 17 | nbfile = sys.argv[1] 18 | nboutfile = sys.argv[2] 19 | parser=make_parser() 20 | args=parser.parse_args(sys.argv[3:]) 21 | with open(nbfile) as f: 22 | nb = nbformat.read(f, as_version=4) 23 | ep = ExecutePreprocessor(timeout=args.timeout, kernel_name='python3') 24 | ep.preprocess(nb, {'metadata': {'path': '.'}}) 25 | with open(nboutfile, 'wt') as f: 26 | nbformat.write(nb, f) 27 | -------------------------------------------------------------------------------- /parallelization.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | ```{code-cell} ipython3 18 | :tags: [remove-cell] 19 | import msprime 20 | import numpy as np 21 | import tskit 22 | 23 | def create_notebook_data(): 24 | pass 25 | 26 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 27 | ``` 28 | 29 | (sec_parallelization)= 30 | 31 | # _Parallelization_ 32 
| % remove underscores in title when tutorial is complete or near-complete 33 | 34 | When performing large calculations it's often useful to split the 35 | work over multiple processes or threads. The ``tskit`` API can 36 | be used without issues across multiple processes, and the Python 37 | {mod}`multiprocessing` module often provides a very effective way to 38 | work with many replicate simulations in parallel. 39 | 40 | When we wish to work with a single very large dataset, however, threads can 41 | offer better resource usage because of the shared memory space. The Python 42 | {mod}`threading` library gives a very simple interface to lightweight CPU 43 | threads and allows us to perform several CPU-intensive tasks in parallel. The 44 | ``tskit`` API is designed to allow multiple threads to work in parallel when 45 | CPU-intensive tasks are being undertaken. 46 | 47 | :::{note} 48 | In the CPython implementation the 49 | [Global Interpreter Lock](https://wiki.python.org/moin/GlobalInterpreterLock) ensures that 50 | only one thread executes Python bytecode at one time. This means that 51 | Python code does not parallelise well across threads, but avoids a large 52 | number of nasty pitfalls associated with multiple threads updating 53 | data structures in parallel. Native C extensions like ``numpy`` and ``tskit`` 54 | release the GIL while expensive tasks are being performed, therefore 55 | allowing these calculations to proceed in parallel. 56 | ::: 57 | 58 | 59 | :::{todo} 60 | This tutorial previously used code with an old interface, which has therefore been removed. 61 | We must recreate an example of parallel processing, giving examples of both 62 | threads and processes (but see 63 | [this stackoverflow post](https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3) 64 | for why it may be difficult to get {mod}`multiprocessing` working in this notebook). 65 | A reasonable example might be to calculate many pairwise statistics between sample sets 66 | in parallel; see the sketch below. 67 | 68 | We should also show that, for large tree sequences, it is better to pass filenames 69 | to each subprocess and load the tree sequence there, rather than transferring the entire 70 | tree sequence (via pickle) to the subprocesses.
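As a starting point, below is a rough, untested sketch of the kind of threaded example we might include here (it assumes a large tree sequence ``ts`` is already loaded, and that every population in it contains sampled genomes):

```python
import concurrent.futures
import itertools

def all_pairwise_divergences(ts, num_threads=8):
    # One sample set per population (each assumed to be non-empty)
    sample_sets = [ts.samples(population=p.id) for p in ts.populations()]
    pairs = list(itertools.combinations(range(len(sample_sets)), 2))

    def worker(pair):
        # tskit releases the GIL during the calculation, so threads run in parallel
        return pair, ts.divergence(sample_sets, indexes=[pair])[0]

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        return dict(executor.map(worker, pairs))
```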
71 | ::: 72 | 73 | -------------------------------------------------------------------------------- /popgen.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.13 7 | jupytext_version: 1.10.3 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | ```{code-cell} 18 | :tags: [remove-cell] 19 | import urllib.request 20 | 21 | import tqdm 22 | import tskit 23 | import tszip 24 | 25 | class DownloadProgressBar(tqdm.tqdm): 26 | def update_to(self, b=1, bsize=1, tsize=None): 27 | if tsize is not None: 28 | self.total = tsize 29 | self.update(b * bsize - self.n) 30 | 31 | def download(url, progress=True): 32 | with DownloadProgressBar( 33 | unit='B', 34 | unit_scale=True, 35 | miniters=1, 36 | desc=url.split('/')[-1], 37 | disable=not progress, 38 | ) as t: 39 | tmp_fn, _ = urllib.request.urlretrieve(url, reporthook=t.update_to) 40 | try: 41 | ts = tskit.load(tmp_fn) 42 | except tskit.FileFormatError: 43 | # could be a tsz file 44 | ts = tszip.decompress(tmp_fn) 45 | urllib.request.urlcleanup() # Remove tmp_fn 46 | return ts 47 | 48 | def download_unified_genealogy(): 49 | keep_span = [108_000_000, 110_000_000] # cut down to this genome region 50 | keep_regions = {"EastAsia", "EAST_ASIA", "AFRICA", "Africa"} 51 | 52 | # Downloads 138 Mb of data - this may take a while 53 | tables = download( 54 | "https://zenodo.org/record/5512994/files/" 55 | "hgdp_tgp_sgdp_high_cov_ancients_chr2_q.dated.trees.tsz" 56 | ).dump_tables() 57 | tables.keep_intervals([keep_span]) 58 | tables.populations.metadata_schema = tskit.MetadataSchema.permissive_json() 59 | tables.sites.metadata_schema = tskit.MetadataSchema.permissive_json() 60 | ts = tables.tree_sequence() 61 | ts = ts.simplify([ 62 | u 63 | for u in ts.samples() 64 | if ( 65 | ts.population(ts.node(u).population).metadata.get("region") in keep_regions 66 | or ts.population(ts.node(u).population).metadata.get("name") == "Denisovan" 67 | ) 68 | ]) 69 | tszip.compress(ts, "data/unified_genealogy_2q_108Mb-110Mb.tsz") 70 | 71 | def create_notebook_data(): 72 | download_unified_genealogy() 73 | 74 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 75 | ``` 76 | 77 | (sec_intro_popgen)= 78 | 79 | # `Tskit` for population genetics 80 | 81 | {ref}`Tskit`, the tree sequence toolkit, brings the power of 82 | evolutionary trees to the field of population genetics. The 83 | {ref}`succinct tree sequence` format 84 | is designed to store DNA sequences jointly with their ancestral history (the 85 | "genetic genealogy" or {ref}`ARG`). Storing population genetic data in this 86 | form enables highly efficient computation and analysis. 87 | 88 | The core `tskit` library provides methods for storing genetic data, a flexible 89 | analysis framework, and APIs to build your own efficient population genetic algorithms. 90 | Because of its speed and scalability, `tskit` is well-suited to interactive analysis of 91 | large genomic datasets. 92 | 93 | ## Population genetic simulation 94 | 95 | Several simulation tools output tree sequences. 
Below we use the 96 | standard library for population genetic simulation models 97 | ([stdpopsim](https://popsim-consortium.github.io/stdpopsim-docs/)) to generate a model of 98 | *Homo sapiens*, in which African, Eurasian, 99 | and Asian populations combine to generate a mixed American population. We can use the 100 | [demesdraw](https://pypi.org/project/demesdraw/) package to plot a schematic of the 101 | migrations and population size changes that define this model. 102 | 103 | 104 | ```{code-cell} 105 | import stdpopsim 106 | import demesdraw 107 | from matplotlib import pyplot as plt 108 | 109 | species = stdpopsim.get_species("HomSap") 110 | model = species.get_demographic_model("AmericanAdmixture_4B18") 111 | 112 | # Plot a schematic of the model 113 | demesdraw.tubes(model.model.to_demes(), ax=plt.gca(), seed=1, log_time=True) 114 | plt.show() 115 | ``` 116 | 117 | Genomic data in tree sequence format can be generated via the widely-used 118 | [msprime](https://tskit.dev/software/msprime.html) simulator. Here we simulate 20 119 | kilobases of genome sequence at the start of human chromosome 1 under this model, 120 | together with its evolutionary history. We generate 16 diploid genomes: 4 from each of 121 | the populations in the model. The DNA sequences and their ancestry are stored in a 122 | succinct tree sequence named `ts`: 123 | 124 | ```{code-cell} 125 | contig = species.get_contig("chr1", mutation_rate=model.mutation_rate, right=20_000) 126 | samples = {"AFR": 4, "EUR": 4, "ASIA": 4, "ADMIX": 4} # 16 diploid samples 127 | engine = stdpopsim.get_engine("msprime") 128 | ts = engine.simulate(model, contig, samples, seed=9).trim() # trim to first 20kb simulated 129 | print(f"Simulated a tree sequence of {ts.num_samples} haploid genomes:") 130 | print(f"{ts.num_sites} variable sites over {ts.sequence_length} base pairs") 131 | ``` 132 | 133 | We can now inspect alleles and their frequencies at the variable sites we have simulated 134 | along the genome: 135 | 136 | ```{code-cell} 137 | for v in ts.variants(): 138 | display(v) 139 | if v.site.id >= 2: # Only show site 0, 1, and 2, for brevity 140 | break 141 | ``` 142 | 143 | Or we can display the {meth}`~TreeSequence.haplotypes` (i.e. the variable sites) for 144 | each sample 145 | 146 | ```{code-cell} 147 | samples = ts.samples() 148 | for sample_id, h in zip(samples, ts.haplotypes(samples=samples)): 149 | pop = ts.node(sample_id).population 150 | print(f"Sample {sample_id:<2} ({ts.population(pop).metadata['name']:^5}): {h}") 151 | ``` 152 | 153 | From the tree sequence it is easy to obtain the 154 | {meth}`TreeSequence.allele_frequency_spectrum` for the entire region (or for 155 | {ref}`windowed regions`) 156 | 157 | ```{code-cell} 158 | afs = ts.allele_frequency_spectrum() 159 | plt.bar(range(ts.num_samples + 1), afs) 160 | plt.title("Allele frequency spectrum") 161 | plt.show() 162 | ``` 163 | 164 | Similarly `tskit` allows fast and easy 165 | {ref}`calculation of statistics` along the genome. 
Here is 166 | a plot of windowed $F_{st}$ between Africans and admixed Americans over this short 167 | region of chromosome: 168 | 169 | ```{code-cell} 170 | # Define the samples between which Fst will be calculated 171 | pop_id = {p.metadata["name"]: p.id for p in ts.populations()} 172 | sample_sets=[ts.samples(pop_id["AFR"]), ts.samples(pop_id["ADMIX"])] 173 | 174 | # Do the windowed calculation, using windows of 2 kilobases 175 | windows = list(range(0, int(ts.sequence_length + 1), 2_000)) 176 | F_st = ts.Fst(sample_sets, windows=windows) 177 | 178 | # Plot 179 | plt.stairs(F_st, windows, baseline=None) 180 | plt.ylabel("AFR-ADMIX Fst") 181 | plt.xlabel("Genome position") 182 | plt.show() 183 | ``` 184 | 185 | Extracting the genetic tree at a specific genomic location is easy using `tskit`, which 186 | also provides methods to {ref}`plot` these trees. Here we 187 | grab the tree at position 10kb, and colour the different populations by 188 | different colours, as described in the {ref}`viz tutorial`: 189 | 190 | ```{code-cell} 191 | tree = ts.at(10_000) 192 | 193 | colours = dict(AFR="yellow", EUR="cyan", ASIA="green", ADMIX="red") 194 | styles = [ 195 | f".leaf.p{pop.id} > .sym {{fill: {colours[pop.metadata['name']]}}}" 196 | for pop in ts.populations() 197 | ] 198 | 199 | styles += [ # rotate the population labels, etc 200 | ".leaf > .lab {text-anchor: start; transform: rotate(90deg) translate(6px)}", 201 | ".leaf > .sym {stroke: black}" 202 | ] 203 | 204 | labels = { # Label samples by population 205 | u: ts.population(ts.node(u).population).metadata["name"].capitalize() 206 | for u in ts.samples() 207 | } 208 | 209 | tree.draw_svg( 210 | size=(800, 500), 211 | canvas_size=(800, 520), 212 | node_labels=labels, 213 | style="".join(styles), 214 | y_axis=True, 215 | y_ticks=range(0, 30_000, 10_000) 216 | ) 217 | 218 | ``` 219 | 220 | ## Population genetic inference 221 | 222 | If, instead of simulations, you want to analyse existing genomic data (for example 223 | stored in a VCF file), you will need to infer a tree sequence from it, using e.g. 224 | [tsinfer](https://tskit.dev/tsinfer/docs/stable/). Here we load an illustrative portion 225 | of an [inferred tree sequence](https://zenodo.org/record/5512994) 226 | based on about 7500 public human genomes, including genomes from the 227 | [Thousand Genomes Project](https://www.internationalgenome.org/data-portal/data-collection/grch38) and 228 | [Human Genome Diversity Project](https://www.internationalgenome.org/data-portal/data-collection/hgdp). 229 | The genomic region encoded in this tree sequence has been cut down to 230 | span positions 108Mb-110Mb of human chromosome 2, which spans the 231 | [EDAR](https://en.wikipedia.org/wiki/Ectodysplasin_A_receptor) gene. 
232 | 233 | Note that tree sequence files are usually imported using {func}`load`, 234 | but because this file has been additionally compressed, we load it via 235 | {func}`tszip:tszip.decompress`: 236 | 237 | ```{code-cell} 238 | import tszip 239 | ts = tszip.decompress("data/unified_genealogy_2q_108Mb-110Mb.tsz") 240 | 241 | # The ts encompasses a region on chr 2 with an interesting SNP (rs3827760) in the EDAR gene 242 | edar_gene_bounds = [108_894_471, 108_989_220] # In Mb from the start of chromosome 2 243 | focal_variant = [v for v in ts.variants() if v.site.metadata.get("ID") == "rs3827760"].pop() 244 | print("An interesting SNP within the EDAR gene:") 245 | focal_variant 246 | ``` 247 | 248 | For simplicity, this tree sequence has been {ref}`simplified` to 249 | include only those samples from the African and East Asian regions. These belong to a 250 | number of populations. The population information, as well as information describing the 251 | variable sites, is stored in tree sequence {ref}`metadata`: 252 | 253 | ```{code-cell} 254 | import pandas as pd 255 | 256 | print(ts.num_populations, "populations defined in the tree sequence:") 257 | 258 | pop_names_regions = [ 259 | [p.metadata.get("name"), p.metadata.get("region")] 260 | for p in ts.populations() 261 | ] 262 | display(pd.DataFrame(pop_names_regions, columns=["population name", "region"])) 263 | ``` 264 | 265 | You can see that there are multiple African and East asian populations, grouped by 266 | region. Here we collect two lists of IDs for the sample 267 | {ref}`nodes` from the African region and from the East asian 268 | region: 269 | 270 | ```{code-cell} 271 | 272 | sample_lists = {} 273 | for n, rgns in {"Africa": {'AFRICA', 'Africa'}, "East asia": {'EAST_ASIA', 'EastAsia'}}.items(): 274 | pop_ids = [p.id for p in ts.populations() if p.metadata.get("region") in rgns] 275 | sample_lists[n] = [u for p in pop_ids for u in ts.samples(population=p)] 276 | ``` 277 | 278 | 279 | With these lists we can calculate different windowed statistics 280 | (here {meth}`genetic diversity` and 281 | {meth}`Tajima's D`) within each of these regions: 282 | 283 | ```{code-cell} 284 | edar_ts = ts.trim() # remove regions with no data (changes the coordinate system) 285 | windows = list(range(0, int(edar_ts.sequence_length)+1, 10_000)) 286 | data = { 287 | "Genetic diversity": { 288 | region: edar_ts.diversity(samples, windows=windows) 289 | for region, samples in sample_lists.items() 290 | }, 291 | "Tajima's D": { 292 | region: edar_ts.Tajimas_D(samples, windows=windows) 293 | for region, samples in sample_lists.items() 294 | }, 295 | } 296 | 297 | # Plot the `data` 298 | fig, axes = plt.subplots(ncols=2, figsize=(15, 3)) 299 | start = ts.edges_left.min() # the empty amount at the start of the tree sequence 300 | 301 | for (title, plot_data), ax in zip(data.items(), axes): 302 | ax.set_title(title) 303 | ax.axvspan(edar_gene_bounds[0], edar_gene_bounds[1], color="lightgray") 304 | ax.axvline(focal_variant.site.position, ls=":") 305 | for label, stat in plot_data.items(): 306 | ax.stairs(stat, windows+start, baseline=None, label=label) 307 | ax.text(edar_gene_bounds[0], 0, "EDAR") 308 | ax.legend() 309 | plt.show() 310 | ``` 311 | 312 | Other population genetic libraries such as 313 | [scikit-allel](https://scikit-allel.readthedocs.io/en/stable/) (which is 314 | {ref}`interoperable` with `tskit`) 315 | could also have been used to produce the plot above. 
In this case, the advantage of 316 | using tree sequences is simply that they allow these sorts of analysis to 317 | {ref}`scale` to datasets of millions of whole genomes. 318 | 319 | (sec_popgen_topological)= 320 | 321 | ### Topological analysis 322 | 323 | As this inferred tree sequence stores (an estimate of) the underlying 324 | genealogy, we can also derive statistics based on genealogical relationships. For 325 | example, this tree sequence also contains a sample genome based on an ancient 326 | genome, a [Denisovan](https://en.wikipedia.org/wiki/Denisovan) individual. We can 327 | look at the closeness of relationship between samples from the different geographical 328 | regions and the Denisovan: 329 | 330 | :::{todo} 331 | Show an example of looking at topological relationships between the Denisovan and 332 | various East Asian groups, using the {ref}`sec_counting_topologies` functionality. 333 | ::: 334 | 335 | See {ref}`sec_counting_topologies` for an introduction to topological methods in 336 | `tskit`. 337 | 338 | ## Further information 339 | 340 | This brief introduction is meant as a simple taster. Many other efficient population 341 | genetic {ref}`analyses` are possible when you have 342 | genomic data stored as a tree sequence. 343 | 344 | The rest of the {ref}`tutorials` contain a large number of examples which 345 | are relevant to population genetic analysis and research. You can also visit the 346 | [learning section](https://tskit.dev/learn/) of the [tskit website](https://tskit.dev/). 347 | -------------------------------------------------------------------------------- /references.bib: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @article{kelleher2016efficient, 5 | title={Efficient coalescent simulation and genealogical analysis for large sample sizes}, 6 | author={Kelleher, Jerome and Etheridge, Alison M and McVean, Gilean}, 7 | journal={PLoS computational biology}, 8 | volume={12}, 9 | number={5}, 10 | pages={e1004842}, 11 | year={2016}, 12 | publisher={Public Library of Science} 13 | } 14 | 15 | @InCollection{kelleher2020coalescent, 16 | author={Kelleher, Jerome and Lohse, Konrad}, 17 | editor={Dutheil, Julien Y.}, 18 | title={Coalescent Simulation with msprime}, 19 | bookTitle={Statistical Population Genomics}, 20 | year={2020}, 21 | publisher={Springer US}, 22 | address={New York, NY}, 23 | pages={191--230}, 24 | } 25 | 26 | @article{nelson2020accounting, 27 | title={Accounting for long-range correlations in genome-wide simulations 28 | of large cohorts}, 29 | author={Nelson, Dominic and Kelleher, Jerome and Ragsdale, Aaron P and 30 | Moreau, Claudia and McVean, Gil and Gravel, Simon}, 31 | journal={PLoS genetics}, 32 | volume={16}, 33 | number={5}, 34 | pages={e1008619}, 35 | year={2020}, 36 | publisher={Public Library of Science San Francisco, CA USA} 37 | } 38 | 39 | @article{adrion2019community, 40 | title={A community-maintained standard library of population genetic models}, 41 | author={Adrion, Jeffrey R and Cole, Christopher B and Dukler, Noah and Galloway, Jared G and Gladstein, Ariella L and Gower, Graham and Kyriazis, Christopher C and Ragsdale, Aaron P and Tsambos, Georgia and Baumdicker, Franz and others}, 42 | journal={eLife}, 43 | year={2020}, 44 | volume={9}, 45 | pages={e54967} 46 | } 47 | 48 | -------------------------------------------------------------------------------- /requirements-CI.txt: -------------------------------------------------------------------------------- 1 | 
git+https://github.com/tskit-dev/tsconvert@e99c837e4e26ccbf4f480a4c48626338eeff7dc3 2 | demes==0.2.3 3 | demesdraw==0.4.0 4 | jupyter-book==1.0.2 5 | jupyter-cache==0.6.1 6 | msprime==1.3.2 7 | networkx==3.3 8 | numpy==1.26.4 9 | pandas==2.2.2 10 | pygraphviz==1.13 11 | scikit-allel==1.3.8 12 | stdpopsim==0.3.0 13 | tqdm==4.66.3 14 | tskit==0.5.8 15 | tskit_arg_visualizer==0.0.1 16 | tszip==0.2.4 17 | jsonschema==4.18.6 # Pinned due to 4.19 "AttributeError module jsonschema has no attribute _validators" 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/tskit-dev/tsconvert@e99c837e4e26ccbf4f480a4c48626338eeff7dc3 2 | demes 3 | demesdraw 4 | jupyter-book>=0.12.0 5 | jupyter-cache 6 | msprime>=1.0 7 | networkx 8 | pandas 9 | pygraphviz 10 | scikit-allel 11 | stdpopsim>=0.3 12 | tqdm 13 | tskit>=0.5.4 14 | tskit_arg_visualizer 15 | tszip 16 | -------------------------------------------------------------------------------- /simplification.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | ```{currentmodule} tskit 15 | ``` 16 | 17 | ```{code-cell} ipython3 18 | :tags: [remove-cell] 19 | def create_notebook_data(): 20 | pass 21 | 22 | # create_notebook_data() # uncomment to recreate the tree seqs used in this notebook 23 | ``` 24 | 25 | (sec_simplification)= 26 | 27 | # _Simplification_ 28 | % remove underscores in title when tutorial is complete or near-complete 29 | 30 | :::{todo} 31 | Create content. See https://github.com/tskit-dev/tutorials/issues/52 32 | ::: 33 | -------------------------------------------------------------------------------- /simulation_overview.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | text_representation: 4 | extension: .md 5 | format_name: myst 6 | format_version: 0.12 7 | jupytext_version: 1.9.1 8 | kernelspec: 9 | display_name: Python 3 10 | language: python 11 | name: python3 12 | --- 13 | 14 | (sec_simulation_overview)= 15 | 16 | # Tree sequences and simulation 17 | 18 | **Yan Wong, Georgia Tsambos, and Peter Ralph** 19 | 20 | Simulations are important in population genetics for many reasons: 21 | 22 | ::::{margin} 23 | :::{todo} 24 | Add links to papers that illustrate each of the following points 25 | ::: 26 | :::: 27 | 28 | Exploration 29 | : Simulations allow us to explore the influence of complex historical scenarios on 30 | observed patterns of genetic variation and inheritance. 31 | 32 | Benchmarking and evaluating methodologies 33 | : To assess the accuracy of inferential methods, we need test datasets for which the 34 | true values of important parameters are known. 35 | 36 | Model training 37 | : Some methods for ancestry inference are trained on simulated data (eg. Approximate 38 | Bayesian Computation). This is especially important in studies of complex demographies, 39 | where there are many potential parameters and models, making it impractical to specify 40 | likelihood functions. 41 | 42 | Compare to expectations 43 | : It is often useful to compare data to what is expected under a simpler situation 44 | (e.g. for use as a null model). 
For instance, comparison to *neutral* simulations
45 | can be used to identify regions subject to selection.
46 | 
47 | There are two major forms of population genetic simulation: **forwards-time**
48 | and **backwards-time**. In general, forwards-time simulation is more detailed and
49 | realistic, while backwards-time simulation is faster and more efficient.
50 | 
51 | More specifically, apart from a
52 | {ref}`few exceptions `,
53 | backwards-time simulations are primarily focused on neutral simulations, while
54 | forwards-time simulation is better suited to complex simulations, including those involving
55 | selection and continuous space.
56 | 
57 | ## Advantages of tree sequences
58 | 
59 | Some forwards-time ([SLiM](http://messerlab.org/slim/),
60 | [fwdpy](http://molpopgen.github.io/fwdpy/)) and backwards-time
61 | ([msprime](https://tskit.dev/msprime)) simulators have a built-in capacity to output
62 | tree sequences. This can have several benefits:
63 | 
64 | 1. Neutral mutations, which often account for the majority of genetic variation, do not
65 |    need to be tracked during the simulation, but can be added afterwards. See
66 |    "{ref}`sec_tskit_no_mutations`".
67 | 2. Tree sequences can be used as an interchange format to combine backwards and
68 |    forwards simulations, allowing you to take advantage of the strengths of both
69 |    approaches. This is detailed in {ref}`sec_completing_forwards_simulations`.
70 | 
71 | ## Some tips on simulation
72 | 
73 | Even with fast modern software, simulating full genome sequences of entire populations
74 | can take some time. If you are finding your simulations too slow, it is worth
75 | benchmarking them by running on a range of shorter chromosomes or sample sizes, then
76 | extrapolating to figure out how long the simulations you actually want to run would take.
77 | 
78 | :::{todo}
79 | Add an example with a matplotlib fitted curve for some msprime simulations with
80 | e.g. a high recombination rate.
81 | 
82 | Collecting data from simulations that take minutes to a few hours and looking at
83 | the msprime paper for suggestions of what curve to fit to the data should give you
84 | good predictions. See [issue #104](https://github.com/tskit-dev/tutorials/issues/104)
85 | :::
86 | 
-------------------------------------------------------------------------------- /tskitr.md: --------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 |   formats: md:myst,ipynb
4 |   text_representation:
5 |     extension: .md
6 |     format_name: myst
7 |     format_version: 0.13
8 |     jupytext_version: 1.10.3
9 | kernelspec:
10 |   display_name: R
11 |   language: R
12 |   name: ir
13 | ---
14 | 
15 | ```{currentmodule} tskit
16 | ```
17 | 
18 | (sec_tskit_r)=
19 | 
20 | # Tskit and R
21 | 
22 | To interface with `tskit` in R, we can use the [reticulate](https://rstudio.github.io/reticulate/) R package, which lets you call Python functions within an R session. In this tutorial, we'll go through a couple of examples to show you how to get started. If you haven't done so already, you'll need to install `reticulate` in your R session via `install.packages("reticulate")`.
23 | 
24 | We'll begin by simulating a small tree sequence using `msprime`.
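Before doing so, note that `reticulate` will use whichever Python installation it finds by default, and that Python needs the `msprime` and `tskit` modules available. If they are missing, a one-off setup along the following lines should work (a sketch rather than an executed cell; the virtual environment name `"r-tskit"` is just an illustrative choice):

```r
# One-off setup: create a virtualenv for reticulate and install the Python
# modules used in this tutorial (the name "r-tskit" is an arbitrary example)
reticulate::virtualenv_create("r-tskit")
reticulate::py_install(c("msprime", "tskit"), envname = "r-tskit")

# In each new session, point reticulate at that environment before importing
reticulate::use_virtualenv("r-tskit", required = TRUE)
reticulate::py_config()  # check which Python is being used
```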
25 | 
26 | ```{code-cell}
27 | msprime <- reticulate::import("msprime")
28 | 
29 | ts <- msprime$sim_ancestry(80, sequence_length=1e4, recombination_rate=1e-4, random_seed=42)
30 | ts # See "Jupyter notebook tips" below for how to render this nicely
31 | ```
32 | 
33 | ## Attributes and methods
34 | 
35 | `reticulate` allows us to access a Python object's attributes via
36 | the `$` operator. For example, we can access (and assign to a variable) the number of
37 | samples in the tree sequence:
38 | 
39 | ```{code-cell}
40 | n <- ts$num_samples
41 | n
42 | ```
43 | 
44 | The `$` operator can also be used to call methods, for example, the
45 | {meth}`~TreeSequence.simplify` method associated with the tree sequence.
46 | The method parameters are given as native R objects
47 | (but note that object IDs still use tskit's 0-based indexing system).
48 | 
49 | ```{code-cell}
50 | reduced_ts <- ts$simplify(0:7) # only keep samples with ids 0, 1, 2, 3, 4, 5, 6, 7
51 | reduced_ts <- reduced_ts$delete_intervals(list(c(6000, 10000))) # delete data after 6kb
52 | reduced_ts <- reduced_ts$trim() # remove the deleted region
53 | paste(
54 |     "Reduced from", ts$num_trees, "trees over", ts$sequence_length/1e3, "kb to",
55 |     reduced_ts$num_trees, "trees over", reduced_ts$sequence_length/1e3, "kb.")
56 | ```
57 | 
58 | ### IDs and indexes
59 | 
60 | Note that if a bare number is provided to one of these methods, it will be treated as a
61 | floating point value. This is useful to know when calling `tskit` methods that
62 | require integers (e.g. object IDs). For example, the following will not work:
63 | 
64 | ```{code-cell}
65 | :tags: [raises-exception, remove-output]
66 | ts$node(0) # Will raise an error
67 | ```
68 | 
69 | In this case, to force the `0` to be passed as an integer, you can either coerce it
70 | using `as.integer` or simply append the letter `L`:
71 | 
72 | ```{code-cell}
73 | ts$node(as.integer(0))
74 | # or
75 | ts$node(0L)
76 | ```
77 | 
78 | Coercing in this way is only necessary when passing parameters to those underlying
79 | `tskit` methods that expect integers. It is not needed e.g. to index into numeric arrays.
80 | _However_, when using arrays, very careful attention must be paid to the fact that
81 | `tskit` IDs start at zero, whereas R indexes start at one:
82 | 
83 | ```{code-cell}
84 | root_id <- ts$first()$root
85 | paste("Root time via tskit method:", ts$node(root_id)$time)
86 | # When indexing into tskit arrays in R, add 1 to the ID
87 | paste("Root time via array access:", ts$nodes_time[root_id + 1])
88 | ```
89 | 
90 | ## Analysis
91 | 
92 | From within R we can use `tskit`'s powerful
93 | [Statistics](https://tskit.dev/tskit/docs/stable/stats.html) framework to efficiently
94 | compute many different summary statistics from a tree sequence. To illustrate this,
95 | we'll first add some mutations to our tree sequence with the
96 | {func}`msprime:msprime.sim_mutations` function, and then compute the average genetic
97 | diversity across the tree sequence's sample nodes:
98 | 
99 | ```{code-cell}
100 | ts_mut <- msprime$sim_mutations(reduced_ts, rate=1e-4, random_seed=321)
101 | 
102 | paste(ts_mut$num_mutations, "mutations, genetic diversity is", ts_mut$diversity())
103 | ```
104 | 
105 | Numerical arrays and matrices work as expected. For instance, we can use the tree
106 | sequence {meth}`~TreeSequence.genotype_matrix()` method to return the genotypes of
107 | the tree sequence as a matrix object in R.
108 | 
109 | ```{code-cell}
110 | G <- ts_mut$genotype_matrix()
111 | G
112 | ```
113 | 
114 | We can then use R functions directly on the genotype matrix:
115 | 
116 | ```{code-cell}
117 | allele_frequency <- rowMeans(G)
118 | allele_frequency
119 | ```
120 | 
121 | ## Jupyter notebook tips
122 | 
123 | When running R within a [Jupyter notebook](https://jupyter.org), a few magic functions
124 | can be defined that allow tskit objects to be rendered within the notebook:
125 | 
126 | ```{code-cell}
127 | # Define some magic functions to allow objects to be displayed in R Jupyter notebooks
128 | repr_html.tskit.trees.TreeSequence <- function(obj, ...){obj$`_repr_html_`()}
129 | repr_html.tskit.trees.Tree <- function(obj, ...){obj$`_repr_html_`()}
130 | repr_svg.tskit.drawing.SVGString <- function(obj, ...){obj$`__str__`()}
131 | ```
132 | 
133 | This leads to much nicer tabular summaries:
134 | 
135 | ```{code-cell}
136 | ts_mut
137 | ```
138 | 
139 | It also allows trees and tree sequences to be plotted inline:
140 | 
141 | ```{code-cell}
142 | ts_mut$draw_svg(y_axis=TRUE, y_ticks=0:10)
143 | ```
144 | 
145 | 
146 | ## Interaction with R libraries
147 | 
148 | R has a number of libraries to deal with genomic data and trees. Below we focus on the
149 | phylogenetic tree representation defined in the popular
150 | [ape](http://ape-package.ird.fr) package, taking all the trees
151 | {meth}`exported in Nexus format`, or
152 | individual trees {meth}`exported in Newick format`:
153 | 
154 | ```{code-cell}
155 | file <- tempfile()
156 | ts_mut$write_nexus(file)
157 | # Warning - ape trees are stored independently, so this will use much more memory than tskit
158 | trees <- ape::read.nexus(file, force.multi = TRUE) # return a set of trees
159 | 
160 | # Or simply read in a single tree
161 | tree <- ape::read.tree(text=ts_mut$first()$as_newick())
162 | 
163 | # Now we can plot the tree in tskit style, but using the ape library
164 | plot(tree, direction="downward", srt=90, adj=0.5) # or equivalently use trees[[1]]
165 | ```
166 | 
167 | Note that nodes are labelled with the prefix `n`, so that nodes `0`, `1`, `2`, ...
168 | become `n0`, `n1`, `n2`, and so on. This helps to avoid
169 | confusion between the zero-based counting system used natively
170 | by `tskit`, and the one-based counting system used in `R`.
171 | 
172 | ## Further information
173 | 
174 | Be sure to check out the [reticulate](https://rstudio.github.io/reticulate/)
175 | documentation, in particular on
176 | [Calling Python from R](https://rstudio.github.io/reticulate/articles/calling_python.html),
177 | which includes important information on how R data types are converted to their
178 | equivalent Python types.
179 | 
-------------------------------------------------------------------------------- /tutorial_development.md: --------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 |   text_representation:
4 |     extension: .md
5 |     format_name: myst
6 |     format_version: 0.12
7 |     jupytext_version: 1.9.1
8 | kernelspec:
9 |   display_name: Python 3
10 |   language: python
11 |   name: python3
12 | ---
13 | 
14 | (sec_development)=
15 | 
16 | # _Developing new tutorials_
17 | % remove underscores in title when tutorial is complete or near-complete
18 | 
19 | :::{todo}
20 | Add content for helping developers to add more tutorials.
21 | :::
22 | 
--------------------------------------------------------------------------------