├── config ├── runtime.exs ├── config.exs ├── prod.exs └── dev.exs ├── test ├── test_helper.exs └── clinvar_checker_test.exs ├── .tool-versions ├── .formatter.exs ├── lib ├── clinvar_checker │ ├── application.ex │ ├── memory_profiler.ex │ └── cli.ex └── clinvar_checker.ex ├── .gitignore ├── mix.exs ├── LICENSE ├── mix.lock └── README.md /config/runtime.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | erlang 27.1.2 2 | elixir 1.17.3-otp-27 3 | zig 0.13.0 4 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | import_config "#{Mix.env()}.exs" 4 | -------------------------------------------------------------------------------- /config/prod.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :clinvar_checker, env: :prod 4 | -------------------------------------------------------------------------------- /config/dev.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :logger, level: :debug 4 | 5 | config :clinvar_checker, env: :dev 6 | -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /test/clinvar_checker_test.exs: -------------------------------------------------------------------------------- 1 | defmodule ClinvarCheckerTest do 2 | use ExUnit.Case 3 | doctest ClinvarChecker 4 | 5 | test "greets the world" do 6 | assert ClinvarChecker.hello() == :world 7 | end 8 | end 9 | -------------------------------------------------------------------------------- /lib/clinvar_checker/application.ex: -------------------------------------------------------------------------------- 1 | defmodule ClinvarChecker.Application do 2 | @moduledoc false 3 | 4 | use Application 5 | 6 | @impl true 7 | def start(_type, _args) do 8 | # children = [ 9 | # Starts a worker by calling: ClinvarChecker.Worker.start_link(arg) 10 | # {ClinvarChecker.Worker, arg} 11 | # ] 12 | 13 | # See https://hexdocs.pm/elixir/Supervisor.html 14 | # for other strategies and supported options 15 | # opts = [strategy: :one_for_one, name: ClinvarChecker.Supervisor] 16 | # Supervisor.start_link(children, opts) 17 | # 18 | args = Burrito.Util.Args.get_arguments() 19 | ClinvarChecker.Cli.main(args) 20 | System.halt(0) 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where third-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | clinvar_checker-*.tar 24 | 25 | # Temporary files - downloads and results 26 | /tmp/ 27 | *.vcf 28 | *.txt 29 | 30 | .elixir-tools/ 31 | .elixir_ls/ 32 | .lexical/ 33 | 34 | *.DS_Store 35 | 36 | burrito_out/ 37 | -------------------------------------------------------------------------------- /lib/clinvar_checker/memory_profiler.ex: -------------------------------------------------------------------------------- 1 | defmodule ClinvarChecker.MemoryProfiler do 2 | require Logger 3 | 4 | @spec profile((-> any())) :: any() 5 | def profile(func) do 6 | initial_memory = :erlang.memory() 7 | 8 | {time, result} = :timer.tc(func) 9 | 10 | final_memory = :erlang.memory() 11 | 12 | memory_diff = 13 | for {type, after_bytes} <- final_memory do 14 | {type, after_bytes - initial_memory[type]} 15 | end 16 | 17 | Logger.info(""" 18 | Memory Usage: 19 | Total: #{Sizeable.filesize(memory_diff[:total])} 20 | Processes: #{Sizeable.filesize(memory_diff[:processes])} 21 | System: #{Sizeable.filesize(memory_diff[:system])} 22 | Atom: #{Sizeable.filesize(memory_diff[:atom])} 23 | Binary: #{Sizeable.filesize(memory_diff[:binary])} 24 | Code: #{Sizeable.filesize(memory_diff[:code])} 25 | ETS: #{Sizeable.filesize(memory_diff[:ets])} 26 | 27 | Execution time: #{time / 1_000_000} seconds 28 | """) 29 | 30 | result 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule ClinvarChecker.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :clinvar_checker, 7 | version: "1.2.0", 8 | elixir: "~> 1.17", 9 | start_permanent: Mix.env() == :prod, 10 | deps: deps(), 11 | releases: releases() 12 | ] 13 | end 14 | 15 | def application do 16 | [ 17 | extra_applications: [:logger], 18 | mod: {ClinvarChecker.Application, []} 19 | ] 20 | end 21 | 22 | defp deps do 23 | [ 24 | {:burrito, "~> 1.0"}, 25 | {:req, "~> 0.5.8"}, 26 | {:flow, "~> 1.2"}, 27 | {:sizeable, "~> 1.0"} 28 | ] 29 | end 30 | 31 | defp releases do 32 | [ 33 | clinvar_checker: [ 34 | steps: [:assemble, &Burrito.wrap/1], 35 | burrito: [ 36 | targets: [ 37 | macos_intel: [os: :darwin, cpu: :x86_64], 38 | macos_arm: [os: :darwin, cpu: :aarch64], 39 | linux: [os: :linux, cpu: :x86_64], 40 | windows: [os: :windows, cpu: :x86_64] 41 | ] 42 | ] 43 | ] 44 | ] 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Austin Saunders 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/clinvar_checker/cli.ex: -------------------------------------------------------------------------------- 1 | defmodule ClinvarChecker.Cli do 2 | @moduledoc """ 3 | CLI interface for ClinVarChecker. 4 | 5 | Handles parsing arguments and executing the appropriate command. 6 | """ 7 | @spec main([String.t(), ...]) :: any() 8 | def main(args) do 9 | args 10 | |> sanitize_args() 11 | |> parse_args() 12 | |> validate_args() 13 | |> then(fn args -> 14 | if config_env() == :dev do 15 | IO.inspect(args, label: "Parsed command and args") 16 | else 17 | args 18 | end 19 | end) 20 | |> parse_command() 21 | end 22 | 23 | def parse_command({["check", input | _cmd], args}) do 24 | {:ok, _count_matches} = ClinvarChecker.run(input, args) 25 | end 26 | 27 | def parse_command({["download" | _cmd], _args}) do 28 | IO.puts("Downloading ClinVar data...\n") 29 | 30 | case ClinvarChecker.download_clinvar_data() do 31 | {:ok, _} -> IO.puts("ClinVar data downloaded successfully!\n") 32 | {:error, error} -> IO.puts("Error downloading ClinVar data: #{error}\n") 33 | end 34 | end 35 | 36 | def parse_command({["help" | _cmd], _args}) do 37 | print_help() 38 | end 39 | 40 | def parse_command({["version" | _cmd], _args}) do 41 | IO.puts("ClinVar Checker v#{Application.spec(:clinvar_checker, :vsn)}\n") 42 | end 43 | 44 | def parse_command({cmd, _args}) do 45 | IO.puts( 46 | "Error: Unknown command `#{cmd}`. Please use `clinvar-checker help` for intended usage.\n" 47 | ) 48 | 49 | print_help() 50 | System.halt(1) 51 | end 52 | 53 | defp parse_args(args) do 54 | {args, command, _} = 55 | OptionParser.parse(args, 56 | strict: [ 57 | help: :boolean, 58 | output: :string, 59 | clinical_significance: :string 60 | ], 61 | aliases: [ 62 | help: :h, 63 | output: :o, 64 | clinical_significance: :cs 65 | ] 66 | ) 67 | 68 | {command, args} 69 | end 70 | 71 | defp validate_args({command, args}) do 72 | case Keyword.get(args, :clinical_significance) do 73 | nil -> 74 | {command, args} 75 | 76 | clinical_significance -> 77 | parsed_cs = clinical_significance |> String.split(",") |> MapSet.new() 78 | cs_diff = MapSet.difference(parsed_cs, ClinvarChecker.valid_clinical_significances()) 79 | 80 | if MapSet.size(cs_diff) > 0 do 81 | IO.puts( 82 | "Error: Invalid clinical significance(s) provided: #{inspect(cs_diff |> MapSet.to_list())}" 83 | ) 84 | 85 | System.halt() 86 | else 87 | {command, Keyword.put(args, :clinical_significance, parsed_cs)} 88 | end 89 | end 90 | end 91 | 92 | defp sanitize_args(args) do 93 | Enum.drop_while(args, &(&1 == "start" || &1 == "app.start" || String.ends_with?(&1, "mix"))) 94 | end 95 | 96 | defp config_env(), do: Application.get_env(:clinvar_checker, :env) 97 | 98 | defp print_help() do 99 | IO.puts(""" 100 | Usage: clinvar-checker analyze /tmp/path_to_23andme_data 101 | 102 | Commands: 103 | check Cross-checks ClinVar variants against provided 23andMe data 104 | download Downloads the latest ClinVar database 105 | help Shows this help message 106 | version Shows version information 107 | 108 | Options: 109 | -h, --help Shows this help message 110 | -o, --output Write ouput to a file instead of stdout 111 | -cs, --clinical-significance Only shows variants with that match the specified clinical significance, accepts a comma separated list of supported values (pathogenic, likely_pathogenic, uncertain_significance, likely_benign, benign) 112 | 113 | Examples: 114 | clinvar-checker download 115 | clinvar-checker check 23andme_data.txt -o clinvar_report.txt -cs pathogenic,likely_pathogenic\n 116 | """) 117 | end 118 | end 119 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "burrito": {:hex, :burrito, "1.2.0", "88f973469edcb96bd984498fb639d3fc4dbf01b52baab072b40229f03a396789", [:mix], [{:jason, "~> 1.2", [hex: :jason, repo: "hexpm", optional: false]}, {:req, ">= 0.4.0", [hex: :req, repo: "hexpm", optional: false]}, {:typed_struct, "~> 0.2.0 or ~> 0.3.0", [hex: :typed_struct, repo: "hexpm", optional: false]}], "hexpm", "7e22158023c6558de615795ab135d27f0cbd9a0602834e3e474fe41b448afba9"}, 3 | "finch": {:hex, :finch, "0.19.0", "c644641491ea854fc5c1bbaef36bfc764e3f08e7185e1f084e35e0672241b76d", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.6.2 or ~> 1.7", [hex: :mint, repo: "hexpm", optional: false]}, {:nimble_options, "~> 0.4 or ~> 1.0", [hex: :nimble_options, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 1.1", [hex: :nimble_pool, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "fc5324ce209125d1e2fa0fcd2634601c52a787aff1cd33ee833664a5af4ea2b6"}, 4 | "flow": {:hex, :flow, "1.2.4", "1dd58918287eb286656008777cb32714b5123d3855956f29aa141ebae456922d", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "874adde96368e71870f3510b91e35bc31652291858c86c0e75359cbdd35eb211"}, 5 | "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"}, 6 | "hpax": {:hex, :hpax, "1.0.0", "28dcf54509fe2152a3d040e4e3df5b265dcb6cb532029ecbacf4ce52caea3fd2", [:mix], [], "hexpm", "7f1314731d711e2ca5fdc7fd361296593fc2542570b3105595bb0bc6d0fad601"}, 7 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, 8 | "mime": {:hex, :mime, "2.0.6", "8f18486773d9b15f95f4f4f1e39b710045fa1de891fada4516559967276e4dc2", [:mix], [], "hexpm", "c9945363a6b26d747389aac3643f8e0e09d30499a138ad64fe8fd1d13d9b153e"}, 9 | "mint": {:hex, :mint, "1.6.2", "af6d97a4051eee4f05b5500671d47c3a67dac7386045d87a904126fd4bbcea2e", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0 or ~> 1.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "5ee441dffc1892f1ae59127f74afe8fd82fda6587794278d924e4d90ea3d63f9"}, 10 | "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, 11 | "nimble_pool": {:hex, :nimble_pool, "1.1.0", "bf9c29fbdcba3564a8b800d1eeb5a3c58f36e1e11d7b7fb2e084a643f645f06b", [:mix], [], "hexpm", "af2e4e6b34197db81f7aad230c1118eac993acc0dae6bc83bac0126d4ae0813a"}, 12 | "req": {:hex, :req, "0.5.8", "50d8d65279d6e343a5e46980ac2a70e97136182950833a1968b371e753f6a662", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 2.0.6 or ~> 2.1", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "d7fc5898a566477e174f26887821a3c5082b243885520ee4b45555f5d53f40ef"}, 13 | "sizeable": {:hex, :sizeable, "1.0.2", "625fe06a5dad188b52121a140286f1a6ae1adf350a942cf419499ecd8a11ee29", [:mix], [], "hexpm", "4bab548e6dfba777b400ca50830a9e3a4128e73df77ab1582540cf5860601762"}, 14 | "telemetry": {:hex, :telemetry, "1.3.0", "fedebbae410d715cf8e7062c96a1ef32ec22e764197f70cda73d82778d61e7a2", [:rebar3], [], "hexpm", "7015fc8919dbe63764f4b4b87a95b7c0996bd539e0d499be6ec9d7f3875b79e6"}, 15 | "typed_struct": {:hex, :typed_struct, "0.3.0", "939789e3c1dca39d7170c87f729127469d1315dcf99fee8e152bb774b17e7ff7", [:mix], [], "hexpm", "c50bd5c3a61fe4e198a8504f939be3d3c85903b382bde4865579bc23111d1b6d"}, 16 | } 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # clinvar-checker 2 | The ClinvarChecker is a CLI tool that can cross-check your raw 23andme genetic data against the open-source ClinVar database. This will allow you to see if you have any genetic variants that are associated with diseases or other conditions, since most of this data has been pay-walled by 23andMe. 3 | 4 | It is important to note that the ClinVar database is not a diagnostic tool. It's a research tool that aggregates information about genetic variants and their clinical significance - just because a variant matches doesn't necessarily mean that the associated conditions will manifest. This tool is meant to be a starting point for further research and discussion with a healthcare provider. 5 | 6 | Additionally, The data in the ClinVar database is constantly being updated, so it's important to keep that in mind when interpreting the results - clinical significance can change over time and be promoted or demoted [1](https://youtu.be/7mnFnoEBOW8). While ClinVar is a comprehensive resource, it does not list of all genetic variants - it's a collection of variants that have been submitted by researchers and clinicians, this field is constantly evolving and **vast** - variants are just one piece of the puzzle when it comes to understanding genetic risk, there are also other factors like family history, lifestyle, and environment that could play a role in determining risk. 7 | 8 | **TLDR; This is just a tool to help you understand your genetic data better, it's not a diagnostic tool. Take it with a grain of salt, and don't freak out.** 9 | 10 | ## Installation & Usage 11 | 1. Navigate to the [releases page](https://github.com/ssaunderss/clinvar-checker/releases) 12 | 2. Download the latest release for your operating system 13 | 3. Make the binary executable `chmod +x clinvar_checker_{$OS}` 14 | 4. Optionally move the binary to a directory in your `$PATH` - `mv clinvar_checker_{$OS} /usr/local/bin/clinvar_checker` 15 | 5. Now you can run the binary, some example usage: 16 | ```bash 17 | > clinvar_checker --help 18 | > clinvar_checker download 19 | > clinvar_checker check path/to/23_and_Me_genome.txt 20 | > clinvar_checker check path/to/23_and_Me_genome.txt --clinical-significance pathogenic 21 | ``` 22 | 23 | ## How are single-nucleotide matches determined? 24 | The ClinVar database provides information about genetic variants and their clinical significance with the following relevant pieces of information: `chromosome`, `position`, `reference allele`, and `alternate allele`. The 23andMe data provides information about genetic variants with the following relevant pieces of information: `chromosome`, `position`, and `genotype`. The order of the genotype doesn't matter - `AG` is the same as `GA`. Essentially what we're looking for are matches to the `alternate allele`. Here's a logic table outlining the possible match outcomes for a `reference allele=A` and `alternate allele=G`: 25 | 26 | | 23andMe Genotype | Contains Reference Allele (A)? | Contains Alternate Allele (G)? | Match Interpretation | Variant Status | 27 | |-------------------|---------------------------------|---------------------------------|----------------------|----------------| 28 | | AA | Yes (2 copies) | No | Perfect match to reference | No variant present | 29 | | AG | Yes (1 copy) | Yes (1 copy) | Heterozygous match | Heterozygous for variant | 30 | | GA | Yes (1 copy) | Yes (1 copy) | Heterozygous match | Heterozygous for variant | 31 | | GG | No | Yes (2 copies) | Homozygous alt | Homozygous for variant | 32 | | AT | Yes (1 copy) | No | Novel alternate T | No ClinVar variant present | 33 | | TA | Yes (1 copy) | No | Novel alternate T | No ClinVar variant present | 34 | | TT | No | No | Different variant entirely | No ClinVar variant present | 35 | | GT | No | Yes (1 copy) | Mixed variant profile | Heterozygous for G variant with novel alternate, specific variant not present | 36 | | TG | No | Yes (1 copy) | Mixed variant profile | Heterozygous for G variant with novel alternate, specific variant not present | 37 | | AC | Yes (1 copy) | No | Novel alternate C | No ClinVar variant present | 38 | | CA | Yes (1 copy) | No | Novel alternate C | No ClinVar variant present | 39 | | CC | No | No | Different variant entirely | No ClinVar variant present | 40 | | GC | No | Yes (1 copy) | Mixed variant profile | Heterozygous for G variant with novel alternate, specific variant not present | 41 | | CG | No | Yes (1 copy) | Mixed variant profile | Heterozygous for G variant with novel alternate, specific variant not present | 42 | | TC | No | No | Different variant entirely | No ClinVar variant present | 43 | | CT | No | No | Different variant entirely | No ClinVar variant present | 44 | 45 | ## Limitations 46 | - This tool only works with raw 23andMe data, it doesn't work with Ancestry or other genetic testing services. 47 | - The format of the 23andMe data does not lend itself to analyzing multi-nucleotide variants (MNVs), e.g. and insertion or deletion of multiple consecutive nucleotides `alt=AG` vs `alt=A` - this tool will not be able to detect these types of variants. 48 | 49 | ## Development 50 | 1. `asdf install` to install all the local dependencies (zig is needed to build the burrito binaries). 51 | 2. `brew install p7zip` - only needed if you plan on using burrito to make Windows executables. 52 | 3. Unlike traditional Elixir applications, this project doesn't stay running, it only stays alive for the duration of the command. So instead of the typical `iex -S mix` for local development, you'll want to run the commands directly `mix app.start check tmp/23_and_Me_genome.txt --clinical-significance pathogenic`. 53 | 54 | ## Releases 55 | 1. Bump the version in `mix.exs` 56 | 2. `MIX_ENV=prod mix release` 57 | 3. Binaries are built in `.burrito_out/` 58 | 4. `git tag -a v{$VERSION} -m {$MESSAGE} && git push --tags` 59 | 5. Create a new release on the [releases page](https://github.com/ssaunderss/clinvar-checker/releases) 60 | 61 | ## TODOs 62 | - [ ] Support different output formats (JSON, CSV, etc.) 63 | -------------------------------------------------------------------------------- /lib/clinvar_checker.ex: -------------------------------------------------------------------------------- 1 | defmodule ClinvarChecker do 2 | @moduledoc """ 3 | Module for analyzing 23andMe genetic data against ClinVar database entries. 4 | Handles downloading, parsing, and comparing genetic variants. 5 | """ 6 | 7 | @type args :: [memory_profile: boolean(), clinical_significance: String.t(), output: String.t()] 8 | 9 | @clinvar_download "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz" 10 | @clinvar_ets_table :clinvar_variants 11 | @clinvar_file "tmp/clinvar.vcf" 12 | @clinical_significances MapSet.new([ 13 | "benign", 14 | "likely_benign", 15 | "uncertain", 16 | "likely_pathogenic", 17 | "pathogenic", 18 | "uncertain", 19 | "drug_response" 20 | ]) 21 | @default_output "tmp/variant_analysis_report.txt" 22 | 23 | def valid_clinical_significances(), do: @clinical_significances 24 | defp stages, do: System.schedulers_online() 25 | defp microseconds_to_seconds(microseconds), do: microseconds / 1_000_000 26 | 27 | def run(input, args) do 28 | if File.exists?(@clinvar_file) do 29 | # Parse both datasets 30 | {time, personal_variants} = :timer.tc(fn -> parse_23andme_file(input) end) 31 | IO.puts("23andMe data parsed in #{microseconds_to_seconds(time)} seconds") 32 | 33 | {time, _} = :timer.tc(fn -> parse_clinvar_file(@clinvar_file) end) 34 | IO.puts("ClinVar data parsed in #{microseconds_to_seconds(time)} seconds") 35 | 36 | # Find matches and generate report 37 | {time, matches} = :timer.tc(fn -> analyze_variants(personal_variants, args) end) 38 | 39 | IO.puts( 40 | "Analysis completed in #{microseconds_to_seconds(time)} seconds, found #{Enum.count(matches)} matches" 41 | ) 42 | 43 | # Generate report 44 | generate_report(matches, args[:output]) 45 | else 46 | IO.puts( 47 | "Error: ClinVar data not found. Please run `clinvar-checker download` to download the data first.\n" 48 | ) 49 | 50 | System.halt(1) 51 | end 52 | end 53 | 54 | @spec download_clinvar_data() :: 55 | {:ok, file_name :: String.t()} | {:error, error_message :: String.t()} 56 | def download_clinvar_data() do 57 | ensure_tmp_dir() 58 | 59 | case Req.get!(@clinvar_download, decode_body: false) do 60 | %Req.Response{status: 200, body: body} -> 61 | # Save and decompress 62 | compressed = @clinvar_file <> ".gz" 63 | File.write!(compressed, body) 64 | System.cmd("gunzip", ["-f", compressed]) 65 | 66 | {:ok, @clinvar_file} 67 | 68 | %Req.Response{status: status} -> 69 | {:error, "Failed to download ClinVar data. Status: #{status}"} 70 | end 71 | end 72 | 73 | @spec parse_clinvar_file(file_path :: String.t()) :: ets_table_name :: atom() 74 | def parse_clinvar_file(path) do 75 | clinvar_table = 76 | :ets.new(@clinvar_ets_table, [ 77 | :ordered_set, 78 | :public, 79 | :named_table, 80 | read_concurrency: true, 81 | write_concurrency: true 82 | ]) 83 | 84 | path 85 | |> File.stream!([], :line) 86 | |> Flow.from_enumerable(max_demand: 4_000, stages: stages()) 87 | |> Flow.map(&parse_clinvar_line/1) 88 | |> Flow.partition( 89 | key: fn 90 | {key, _variant} -> 91 | :erlang.phash2(elem(key, 0), stages()) 92 | 93 | val -> 94 | :erlang.phash2(val, stages()) 95 | end, 96 | stages: stages() 97 | ) 98 | # functionally acts like Flow.each/2 used to without accumulating results 99 | |> Flow.reduce(fn -> [] end, fn 100 | {key, variant}, _acc -> 101 | :ets.insert(clinvar_table, {key, variant}) 102 | [] 103 | 104 | _, _acc -> 105 | [] 106 | end) 107 | |> Flow.run() 108 | 109 | @clinvar_ets_table 110 | end 111 | 112 | defp parse_clinvar_line("#" <> _rest), do: nil 113 | 114 | defp parse_clinvar_line(line) do 115 | with [chrom, pos, _id, ref, alt, _qual, _filter, info] <- 116 | :binary.split(line, "\t", [:global]), 117 | {position, _} <- 118 | Integer.parse(pos), 119 | # We cannot confidently analyze multi-nucleotide variants 120 | 1 <- String.length(ref), 121 | 1 <- String.length(alt) do 122 | parsed_info = parse_clinvar_info(info) 123 | normalized_chromosme = normalize_chromosome(chrom) 124 | 125 | variant = %{ 126 | chromosome: normalized_chromosme, 127 | position: position, 128 | reference: ref, 129 | alternate: alt, 130 | processed_significances: parsed_info.processed_significances, 131 | clinical_significance: parsed_info.clinical_significance, 132 | condition: parsed_info.condition 133 | } 134 | 135 | key = 136 | {variant.chromosome, variant.position, variant.reference, variant.alternate} 137 | 138 | {key, variant} 139 | else 140 | _ -> nil 141 | end 142 | end 143 | 144 | defp parse_clinvar_info(info) when is_binary(info) do 145 | info_map = 146 | info 147 | |> :binary.split(";", [:global]) 148 | |> Enum.reduce(%{}, fn item, acc -> 149 | case :binary.split(item, "=", [:global]) do 150 | [key, value] -> Map.put(acc, key, value) 151 | _ -> acc 152 | end 153 | end) 154 | 155 | raw_significance = Map.get(info_map, "CLNSIG", "unknown") 156 | 157 | processed_significances = 158 | raw_significance 159 | |> String.split(["|", "/", ","]) 160 | |> Enum.map(&String.downcase/1) 161 | |> MapSet.new() 162 | 163 | %{ 164 | clinical_significance: raw_significance, 165 | processed_significances: processed_significances, 166 | condition: Map.get(info_map, "CLNDN", "unknown") 167 | } 168 | end 169 | 170 | @spec parse_23andme_file(file_path :: String.t()) :: %{tuple() => map()} 171 | def parse_23andme_file(path) do 172 | if File.exists?(path) do 173 | path 174 | |> File.stream!([], :line) 175 | |> Flow.from_enumerable(max_demand: 4_000, stages: stages()) 176 | |> Flow.map(&parse_23andme_line/1) 177 | |> Flow.reject(&is_nil/1) 178 | |> Enum.to_list() 179 | else 180 | IO.puts("Error: 23andMe data file not found. Please use `clinvar-checker help` for help.\n") 181 | System.halt(1) 182 | end 183 | end 184 | 185 | defp parse_23andme_line("#" <> _), do: nil 186 | 187 | defp parse_23andme_line(line) do 188 | with [rsid, chromosome, position, genotype] <- :binary.split(line, "\t", [:global]), 189 | trimmed_genotype <- String.trim(genotype) do 190 | # Skip "no call" genotypes - 23andMe data not confident enough to call this genotype 191 | if trimmed_genotype == "--" do 192 | nil 193 | else 194 | {normalize_chromosome(chromosome), String.to_integer(position), String.trim(genotype), 195 | rsid} 196 | end 197 | else 198 | _ -> nil 199 | end 200 | end 201 | 202 | defp normalize_chromosome("MT"), do: "M" 203 | defp normalize_chromosome(chrom), do: chrom 204 | 205 | def analyze_variants(personal_data, args) do 206 | personal_data 207 | |> Stream.map(fn {chrom, pos, genotype, _rsid} = genotype_call -> 208 | {genotype_call, matching_variants_for_genotype(chrom, pos, genotype)} 209 | end) 210 | |> Stream.reject(fn {_genotype_call, matches} -> is_nil(matches) end) 211 | |> Stream.map(fn {genotype_call, matches} -> 212 | {genotype_call, 213 | Enum.filter(matches, &matches_significance?(&1, args[:clinical_significance]))} 214 | end) 215 | |> Enum.flat_map(fn {{chrom, pos, genotype, rsid}, clinvar_entries} -> 216 | Enum.map(clinvar_entries, fn clinvar_entry -> 217 | build_report_entry(chrom, pos, genotype, rsid, clinvar_entry) 218 | end) 219 | end) 220 | end 221 | 222 | defp build_report_entry(chrom, pos, genotype, rsid, clinvar_entry) do 223 | %{ 224 | chromosome: chrom, 225 | position: pos, 226 | rsid: rsid, 227 | genotype: genotype, 228 | clinical_significance: clinvar_entry.clinical_significance, 229 | condition: clinvar_entry.condition 230 | } 231 | end 232 | 233 | @spec matching_variants_for_genotype( 234 | chromosome :: String.t(), 235 | position :: integer(), 236 | genotype :: String.t() 237 | ) :: [map()] | nil 238 | defp matching_variants_for_genotype(chromosome, position, genotype) do 239 | case String.graphemes(genotype) do 240 | [a1, a2] -> 241 | if a1 == a2 do 242 | homozygous_matches(chromosome, position, a1) 243 | else 244 | heterozygous_matches(chromosome, position, a1, a2) 245 | end 246 | 247 | # this is for males - the X chromosome will only have one allele 248 | [a1] -> 249 | homozygous_matches(chromosome, position, a1) 250 | end 251 | |> Enum.map(&elem(&1, 1)) 252 | |> then(fn result -> if Enum.empty?(result), do: nil, else: result end) 253 | end 254 | 255 | # wildcard match alternate alleles when genotype is homozygous 256 | defp homozygous_matches(chromosome, position, allele) do 257 | :ets.select(@clinvar_ets_table, [{{{chromosome, position, :_, allele}, :_}, [], [:"$_"]}]) 258 | end 259 | 260 | # since the genotype is heterozygous, we need to check both combinations of alleles for matches 261 | defp heterozygous_matches(chromosome, position, allele1, allele2) do 262 | :ets.select(@clinvar_ets_table, [ 263 | {{{chromosome, position, allele1, allele2}, :_}, [], [:"$_"]} 264 | ]) ++ 265 | :ets.select(@clinvar_ets_table, [ 266 | {{{chromosome, position, allele2, allele1}, :_}, [], [:"$_"]} 267 | ]) 268 | end 269 | 270 | defp matches_significance?(_clinvar_entry, nil), do: true 271 | 272 | defp matches_significance?(clinvar_entry, clinical_significance) do 273 | MapSet.intersection(clinvar_entry.processed_significances, clinical_significance) 274 | |> MapSet.size() > 0 275 | end 276 | 277 | defp generate_report(matches, output) do 278 | report = 279 | matches 280 | |> Enum.sort_by(fn match -> {match.chromosome, match.position} end) 281 | |> Enum.map(fn match -> 282 | """ 283 | Variant found: 284 | Chromosome: #{match.chromosome} 285 | Position: #{match.position} 286 | rsID: #{match.rsid} 287 | Your genotype: #{match.genotype} 288 | Clinical significance: #{match.clinical_significance} 289 | Associated condition: #{match.condition} 290 | """ 291 | end) 292 | |> Enum.join("\n") 293 | 294 | output 295 | |> validate_output() 296 | |> File.write!(report) 297 | 298 | {:ok, Enum.count(matches)} 299 | end 300 | 301 | defp validate_output(nil = _output), do: @default_output 302 | 303 | defp validate_output(output) do 304 | if String.ends_with?(output, ".txt") do 305 | output 306 | else 307 | IO.puts("Warning: Output file invalid, writing results to #{@default_output}\n") 308 | 309 | @default_output 310 | end 311 | end 312 | 313 | defp ensure_tmp_dir() do 314 | case File.mkdir_p("tmp") do 315 | :ok -> 316 | :ok 317 | 318 | {:error, reason} -> 319 | IO.puts("Error creating tmp directory: #{reason}") 320 | System.halt(1) 321 | end 322 | end 323 | end 324 | --------------------------------------------------------------------------------