├── .github
    └── workflows
    │   └── build-mkdocs.yml
├── .gitpod.Dockerfile
├── .gitpod.yml
├── LICENSE
├── README.md
├── docs
    └── README.md
└── mkdocs.yml


/.github/workflows/build-mkdocs.yml:
--------------------------------------------------------------------------------
 1 | # Builds the MkDocs site and publishes it with `mkdocs gh-deploy`.
 2 | name: Build mkdocs
 3 | 
 4 | # Deploy only from the default branch: a push to any other branch would
 5 | # otherwise force-overwrite the published site with that branch's docs.
 6 | on:
 7 |   push:
 8 |     branches:
 9 |       - main
10 |       - master
11 | 
12 | # The deploy step pushes the built site back to the repository.
13 | permissions:
14 |   contents: write
15 | 
16 | jobs:
17 |   deploy:
18 |     runs-on: ubuntu-latest
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       - uses: actions/setup-python@v5
22 |         with:
23 |           python-version: "3.x"
24 |       # %V is the ISO week number, so the cache key rotates once a week.
25 |       - run: echo "cache_id=$(date --utc '+%V')" >> "$GITHUB_ENV"
26 |       - uses: actions/cache@v4
27 |         with:
28 |           key: mkdocs-material-${{ env.cache_id }}
29 |           path: .cache
30 |           restore-keys: |
31 |             mkdocs-material-
32 |       - run: pip install mkdocs-material
33 |       - run: mkdocs gh-deploy --force


--------------------------------------------------------------------------------
/.gitpod.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM gitpod/workspace-full
2 | 
3 | # Install R and the viridis and argparse packages.
4 | RUN brew install R
5 | # A non-interactive R session has no CRAN mirror selected, so install.packages()
6 | # would stop with an error; name the mirror explicitly.
7 | RUN R -e 'install.packages(c("viridis", "argparse"), repos = "https://cloud.r-project.org")'
8 | 


--------------------------------------------------------------------------------
/.gitpod.yml:
--------------------------------------------------------------------------------
 1 | image: gitpod/workspace-base
 2 | 
 3 | tasks:
 4 |   # Installs conda/mamba under /workspace and creates the smudgeplot environment.
 5 |   - name: install mamba
 6 |     init: |
 7 |       # Mambaforge installers are discontinued; Miniforge3 ships conda and mamba.
 8 |       # -f makes curl fail on an HTTP error instead of saving the error page.
 9 |       curl -fL -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
10 |       bash "Miniforge3-$(uname)-$(uname -m).sh" -b -p /workspace/conda && rm "Miniforge3-$(uname)-$(uname -m).sh"
11 |       # NOTE(review): confirm `mamba init bash` is still accepted by the mamba version Miniforge3 ships.
12 |       /workspace/conda/bin/mamba init bash
13 | 
14 |       source ${HOME}/.bashrc
15 |       mamba create -n smudgeplot -c conda-forge r-base r-argparse r-viridis -y
16 | 
17 |   # Re-initialises the shell and activates the environment in the terminal.
18 |   # NOTE(review): tasks start in parallel, so without a prebuild this may run
19 |   # before the install task's init has finished - confirm.
20 |   - name: activate mamba
21 |     command: |
22 |       /workspace/conda/bin/mamba init bash
23 |       source ${HOME}/.bashrc
24 |       mamba activate smudgeplot
25 | 
26 |       export PATH=/workspace/bin:"$PATH"
27 | 
28 | vscode:
29 |   extensions:
30 |     - anwar.papyrus-pdf
31 | 
32 | workspaceLocation: "/workspace"
33 | 
34 | github:
35 |   prebuilds:
36 |     # enable for the master/default branch (defaults to true)
37 |     master: true
38 |     # enable for all branches in this repo (defaults to false)
39 |     branches: true
40 |     # enable for pull requests coming from this repo (defaults to true)
41 |     pullRequests: true
42 |     # enable for pull requests coming from forks (defaults to false)
43 |     pullRequestsFromForks: true
44 |     # add a "Review in Gitpod" button as a comment to pull requests (defaults to true)
45 |     addComment: true
46 |     # add a "Review in Gitpod" button to pull requests (defaults to false)
47 |     addBadge: false
48 |     # add a label once the prebuild is ready to pull requests (defaults to false)
49 |     addLabel: prebuilt-in-gitpod


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Biodiversity Genomics Academy 2023
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Understanding k-mers and ploidy using Smudgeplot
  2 | 
  3 | This session is part of [**Biodiversity Genomics Academy 2023**](https://BGA23.org)
  4 | 
  5 | ## Session Leader(s)
  6 | 
  7 | Kamil S. Jaron; Amjad Khalaf
  8 | 
  9 | Tree of Life, Wellcome Sanger Institute
 10 | 
 11 | ## Description
 12 | 
 13 | By the end of this session you will be able to:
 14 | 
 15 | 1. Understand how Smudgeplot estimates ploidy
 16 | 2. Appreciate strengths and weaknesses of Smudgeplot
 17 | 3. Run Smudgeplot and understand the input parameters
 18 | 4. Critically evaluate a Smudgeplot
 19 | 
 20 | ## Prerequisites
 21 | 
 22 | 1. Understanding of linux command line basics
 23 | 2. Knowledge of basic genome biology
 24 | 3. (optional) read the smudgeplot sections of <https://www.nature.com/articles/s41467-020-14998-3>
 25 | 
 26 | !!! warning "Please make sure you MEET THE PREREQUISITES and READ THE DESCRIPTION above"
 27 | 
 28 |     You will get the most out of this session if you meet the prerequisites above.
 29 | 
 30 |     Please also read the description carefully to see if this session is relevant to you.
 31 |     
 32 |     If you don't meet the prerequisites or change your mind based on the description or are no longer available at the session time, please email tol-training at sanger.ac.uk to cancel your slot so that someone else on the waitlist might attend.
 33 | 
 34 | ## Tutorial
 35 | 
 36 | 
 37 | Have you ever sequenced something not well studied? Something that might show strange genomic signatures? Smudgeplot is a visualisation technique for whole-genome sequencing reads from a single individual. The visualisation technique is based on the idea of het-mers. Het-mers are k-mer pairs that are exactly one nucleotide away from each other, while forming a unique pair in the sequencing dataset. These k-mers are assumed to mostly represent two alleles of a heterozygous locus, but can potentially also show pairing of imperfect paralogs, or sequencing errors paired up with a homozygous genomic k-mer. Nevertheless, the ploidy predicted by smudgeplot is simply the ploidy with the highest number of k-mer pairs (whether that is a reasonable estimate must be evaluated for each individual case!).
 38 | 
 39 | ### Installing the software & getting some data
 40 | 
 41 | Open Gitpod and install the development version of smudgeplot (branch sploidyplot) & FastK.
 42 | 
 43 | ```
 44 | mkdir src bin && cd src # create directories for source code & binaries
 45 | git clone -b sploidyplot https://github.com/KamilSJaron/smudgeplot
 46 | git clone https://github.com/thegenemyers/FastK
 47 | ```
 48 | 
 49 | The next commands install the smudgeplot R package, compile the C kernel for searching for k-mer pairs and copy all the executables to `/workspace/bin/` (which will be our dedicated spot for executables).
 50 | 
 51 | ```
 52 | cd smudgeplot
 53 | make -s INSTALL_PREFIX=/workspace
 54 | R -e 'install.packages(".", repos = NULL, type="source")' # install the R package
 55 | cd ..
 56 | smudgeplot.py -h # test the installation worked out nice
 57 | cd FastK && make
 58 | install -c FastK Fastrm Fastmv Fastcp Fastmerge Histex Tabex Profex Logex Vennex Symmex Haplex Homex Fastcat /workspace/bin/
 59 | FastK # test the installation worked out nice
 60 | cd ..
 61 | ```
 62 | 
 63 | Now that the software we need is installed, all we need is to download some data. There are 8 datasets for the 8 breakout sessions; here is a [table of accessions](https://docs.google.com/document/d/13SEd0cIx8BATqDtbLFHnwUwRrSfDYmVniVCqXptK6d0/edit?usp=sharing), and we will use the same document to upload our results too. Pick the one that corresponds to your breakout room, and replace the example accession with yours. Fetch the data using the `wget` command, e.g.
 64 | 
 65 | ```
 66 | wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR926/SRR926341/SRR926341_[12].fastq.gz
 67 | ```
 68 | 
 69 | ### Constructing a database
 70 | 
 71 | The whole process operates on raw or trimmed sequencing reads. From those we generate a k-mer database using [FastK](https://github.com/thegenemyers/FASTK). FastK is currently the fastest k-mer counter out there and the only one supported by the latest version of smudgeplot*. This database contains an index of all the k-mers and their coverages in the sequencing readset. Within this set the user must choose a threshold for excluding low-frequency k-mers that will be considered errors. That choice is not too difficult to make by looking at the k-mer spectrum. Among all the retained k-mers we find all the het-mers. Then we plot a 2d histogram.
 72 | 
 73 | 
 74 | *Note: The previous versions of smudgeplot (up to 2.5.0) were operating on k-mer "dumps" flat files you can generate with any counter you like. You can imagine that text files are very inefficient to operate on. The new version is operating directly on the optimised k-mer database instead.
 75 | 
 76 | So construct the database using FastK. This will take some minutes (15-20):
 77 | 
 78 | ```
 79 | FastK -v -t4 -k31 -M16 -T4 SRR926341_[12].fastq.gz -NSRR926341
 80 | ```
 81 | 
 82 | Now that you have a database, you can search for k-mer pairs, but I would advise taking a moment to look at the k-mer spectrum first. You can get the k-mer spectrum from the database using `Histex`, a different tool from the same suite.
 83 | 
 84 | ```
 85 | Histex -G SRR926341 > SRR926341_k31.hist
 86 | ```
 87 | 
 88 | You can visualize this histogram in any way; one fairly easy option is uploading it to the GenomeScope2 webserver: http://qb.cshl.edu/genomescope/genomescope2.0/ (use the default ploidy parameter).
 89 | 
 90 | Note: `.hist` is quite often used for text histogram files, but `FastK` also generates a binary `.hist` file; don't mix them up!
 91 | 
 92 | Looking at a k-mer histogram; you should be able to see what is the coverage of the possible genomic k-mers. Also, upload your histogram to [the document](https://docs.google.com/document/d/13SEd0cIx8BATqDtbLFHnwUwRrSfDYmVniVCqXptK6d0/edit?usp=sharing) for our shared results, please. If we look at this example
 93 | 
 94 | ![example](
 95 | http://qb.cshl.edu/genomescope/genomescope2.0/user_data/QQbW8zXct8FErXPSt9Mw/linear_plot.png)
 96 | 
 97 | In this example, a meaningful error threshold would be 40x. As a rule of thumb, no dataset should have this threshold <10, and it is not the end of the world if we lose a bit of the real genomic k-mers (as long as there is enough signal). However, these are just some guidelines; what is sensible really depends on each individual dataset!
 98 | 
 99 | ### Run smudgeplot
100 | 
101 | The error threshold is specified by the `-L` parameter. We have 4 cores in the GitPod, so you can also run the k-mer pair search in parallel (parameter `-t`). This command will internally call a C kernel optimised for the search, designed by Gene Myers. When executing, don't forget to use the name of YOUR sample, not the example one.
102 | 
103 | ```
104 | smudgeplot.py hetmers -L 40 -t 4 --verbose -o SRR926341_k31_pairs SRR926341.ktab
105 | ```
106 | 
107 | Finally, once the k-mer pair search is done, a `*_text.smu` file should be generated. It's a 2d histogram: for each combination of covA and covB there is the frequency with which these two coverages occur among the het-mers (the k-mer pairs one away from each other).
108 | 
109 | ```
110 | head SRR926341_k31_pairs_text.smu
111 | ```
112 | 
113 | If you see three columns, it's a good sign. You can proceed to finally plot the smudgeplot. I would encourage you to run `smudgeplot.py plot -h` to see all the options and understand what they mean, but a minimalistic command like this should do:
114 | 
115 | ```
116 | smudgeplot.py plot -t SRR926341 -o SRR926341_k31_smudgeplot SRR926341_k31_pairs_text.smu
117 | ```
118 | 
119 | How does the smudgeplot look? You should see something like this:
120 | 
121 | ![smudgeplot](https://user-images.githubusercontent.com/8181573/267332563-1b9d8bc1-6241-4ebb-a92a-32d02c7c38d1.png)
122 | 
123 | A plot with a bunch of smudges, and annotations that are overlapping the smudges. In the top right panel you see proportions of k-mer pairs in the individual smudges sorted by frequency. In the bottom right corner you see the 1n coverage estimate for the dataset. This should be the same 1n coverage as was inferred by GenomeScope; these two numbers need to be the same for the model and smudgeplot to be telling the same story. If they are substantially different, one should investigate why. In different genomes either smudgeplot or GenomeScope is better at figuring out the coverage, and usually the differences are by a factor of 2. If you think it's the smudgeplot coverage estimate that is off, rerun smudgeplot with the parameter `-n` and provide a number corresponding to the 1n peak in your GenomeScope plot, such as `-n 50`.
124 | 
125 | Once you are happy with your smudgeplot, upload it to shared docs with results.
126 | 
127 | We will discuss the results and then hear from Amjad, what is the actual biological story.
128 | 
129 | ### Where to go next
130 | 
131 | - [Smudgeplot v2.5.0](https://github.com/KamilSJaron/smudgeplot/wiki) documentation: most of it applies the same for this development version (2.9.9) and there are plenty of useful things to learn in there
132 | - original [Genomescope & Smudgeplot paper](https://www.nature.com/articles/s41467-020-14998-3): this describes a much older version of the software (0.1.3), but the general idea applies.
133 | - [OH-KNOW k-mer workshop learning materials](https://github.com/KamilSJaron/oh-know/wiki/Characterization-of-polyploid-genomes-using-k-mer-spectra-analysis).
134 | - If you would be interested in finding out more about research in the Jaron group, visit our [website](https://www.sanger.ac.uk/group/jaron-group/).
135 | 
136 | 
137 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # Understanding k-mers and ploidy using Smudgeplot
 2 | 
 3 | This session is part of [**Biodiversity Genomics Academy 2023**](https://BGA23.org)
 4 | 
 5 | ## Session Leader(s)
 6 | 
 7 | Kamil S. Jaron  
 8 | Group leader, Tree of Life  
 9 | Wellcome Sanger Institute
10 | 
11 | Website: <https://www.sanger.ac.uk/group/jaron-group/>  
12 | Twitter: <https://twitter.com/KamilSJaron>  
13 | 
14 | K-mer learning materials: <https://github.com/KamilSJaron/oh-know/wiki/Characterization-of-polyploid-genomes-using-k-mer-spectra-analysis>
15 | 
16 | ## Description
17 | 
18 | By the end of this session you will be able to:
19 | 
20 | 1. Understand how Smudgeplot estimates ploidy
21 | 2. Appreciate strengths and weaknesses of Smudgeplot
22 | 3. Run Smudgeplot and understand the input parameters
23 | 4. Critically evaluate a Smudgeplot
24 | 
25 | ## Session Materials
26 | 
27 | - [Slides](https://docs.google.com/presentation/d/1cZXcdeurt3YGVvNdSTlFaIBhFRHWXFbaQHcu7y64zJY/edit?usp=sharing)
28 | - [Gitpod](https://gitpod.io/#https://github.com/BGAcademy23/smudgeplot)
29 | 
30 | ## Prerequisites
31 | 
32 | 1. Understanding of linux command line basics
33 | 2. Knowledge of basic genome biology
34 | 3. (optional) read the smudgeplot sections of <https://www.nature.com/articles/s41467-020-14998-3>
35 | 
36 | !!! warning "Please make sure you MEET THE PREREQUISITES and READ THE DESCRIPTION above"
37 | 
38 |     You will get the most out of this session if you meet the prerequisites above.
39 | 
40 |     Please also read the description carefully to see if this session is relevant to you.
41 |     
42 |     If you don't meet the prerequisites or change your mind based on the description or are no longer available at the session time, please email tol-training at sanger.ac.uk to cancel your slot so that someone else on the waitlist might attend.
43 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | # MkDocs configuration for the session site (Material theme).
 2 | site_name: smudgeplot
 3 | edit_uri: https://github.com/bgacademy23/smudgeplot/blob/main/docs
 4 | 
 5 | nav:
 6 |     - Home: README.md
 7 |     - BGA23: https://bga23.org
 8 | 
 9 | theme:
10 |     name: material
11 |     language: en
12 |     palette:
13 |         # Palette toggle for dark mode
14 |         - media: "(prefers-color-scheme: dark)"
15 |           scheme: slate
16 |           toggle:
17 |               icon: material/weather-night
18 |               name: Switch to light mode
19 |         # Palette toggle for light mode
20 |         - media: "(prefers-color-scheme: light)"
21 |           scheme: default
22 |           toggle:
23 |               icon: material/weather-sunny
24 |               name: Switch to dark mode
25 |     font:
26 |         text: Inter
27 |     features:
28 |         - content.action.edit
29 |         - content.code.annotate
30 |         - content.code.copy
31 |         - navigation.footer
32 |         # - navigation.tabs
33 |         - navigation.top
34 |         - navigation.tracking
35 |         - search.highlight
36 |         - search.share
37 |         - search.suggest
38 |         - toc.follow
39 | 
40 | extra:
41 |     social:
42 |         - icon: fontawesome/brands/twitter
43 |           link: https://twitter.com/BGAcademy23
44 |           name: Biodiversity Genomics Academy 2023 on Twitter
45 |         - icon: fontawesome/brands/mastodon
46 |           link: https://genomic.social/@BGAcademy23
47 |           name: Biodiversity Genomics Academy 2023 on Mastodon - Genomic.Social
48 | 
49 | markdown_extensions:
50 |     - admonition
51 |     - attr_list
52 |     - md_in_html
53 |     # The materialx.emoji namespace is deprecated in Material for MkDocs;
54 |     # material.extensions.emoji is its replacement.
55 |     - pymdownx.emoji:
56 |           emoji_index: !!python/name:material.extensions.emoji.twemoji
57 |           emoji_generator: !!python/name:material.extensions.emoji.to_svg
58 |     - pymdownx.details
59 |     - pymdownx.highlight:
60 |           anchor_linenums: true
61 |     - pymdownx.inlinehilite
62 |     - pymdownx.keys
63 |     # Listed once, together with its options.
64 |     - pymdownx.snippets:
65 |           base_path: ["."]
66 |     - pymdownx.superfences
67 |     - tables
68 |     - toc:
69 |           title: On this page
70 |           permalink: true
71 | 
72 | plugins:
73 |     - search
74 | 


--------------------------------------------------------------------------------