├── flatgfa
    ├── tests
    │   ├── test.t
    │   └── turnt.toml
    ├── src
    │   ├── cli
    │   │   ├── mod.rs
    │   │   └── main.rs
    │   ├── ops
    │   │   ├── mod.rs
    │   │   ├── bench.rs
    │   │   ├── position.rs
    │   │   ├── depth.rs
    │   │   ├── chop.rs
    │   │   └── extract.rs
    │   ├── lib.rs
    │   ├── namemap.rs
    │   ├── memfile.rs
    │   ├── flatbed.rs
    │   ├── print.rs
    │   ├── parse.rs
    │   └── gfaline.rs
    ├── Cargo.toml
    └── README.md
├── tests
    ├── subset-paths
    │   ├── ex1.paths
    │   ├── ex2.paths
    │   ├── ex1.txt
    │   └── ex2.txt
    ├── depth
    │   ├── subset-paths
    │   │   ├── ex1.txt
    │   │   ├── ex2.txt
    │   │   ├── ex1.paths
    │   │   ├── ex2.paths
    │   │   └── turnt.toml
    │   ├── basic
    │   │   ├── ex1.gfa
    │   │   └── ex2.gfa
    │   └── turnt.toml
    ├── basic
    │   ├── ex1.gfa
    │   └── ex2.gfa
    ├── handmade
    │   ├── no-test-flip4.gfa
    │   ├── flip1.gfa
    │   ├── flip2.gfa
    │   ├── flip3.gfa
    │   └── crush1.gfa
    ├── .gitignore
    └── turnt.toml
├── bench
    ├── .gitignore
    ├── Makefile
    ├── sizes.py
    ├── bar.vl.json
    ├── config.toml
    ├── summary.py
    └── graphs.toml
├── mygfa
    ├── .gitignore
    ├── mygfa
    │   ├── __init__.py
    │   ├── __main__.py
    │   └── preprocess.py
    ├── pyproject.toml
    ├── docs
    │   ├── conf.py
    │   ├── Makefile
    │   └── index.rst
    ├── example.py
    └── README.md
├── .gitattributes
├── flatgfa-py
    ├── .gitignore
    ├── test
    │   ├── tiny.gaf
    │   ├── tiny.gfa
    │   ├── test_gaf.py
    │   └── test_flatgfa.py
    ├── docs
    │   ├── conf.py
    │   ├── Makefile
    │   └── index.rst
    ├── Cargo.toml
    ├── examples
    │   ├── depth.py
    │   └── gaf.py
    ├── pyproject.toml
    ├── README.md
    └── flatgfa.pyi
├── pyrightconfig.json
├── pollen_icon.png
├── pollen_icon_transparent.png
├── slow_odgi
    ├── slow_odgi
    │   ├── __init__.py
    │   ├── paths.py
    │   ├── norm.py
    │   ├── validate_setup.py
    │   ├── matrix.py
    │   ├── degree.py
    │   ├── somepaths.py
    │   ├── depth.py
    │   ├── crush.py
    │   ├── inject_setup.py
    │   ├── validate.py
    │   ├── proofs.py
    │   ├── overlap.py
    │   ├── flatten.py
    │   ├── chop.py
    │   ├── flip.py
    │   ├── inject.py
    │   └── __main__.py
    ├── pyproject.toml
    └── Makefile
├── pollen_py
    ├── pollen
    │   ├── __init__.py
    │   ├── main.py
    │   ├── argparse_custom.py
    │   └── depth
    │   │   ├── python_depth.py
    │   │   ├── processing-elements
    │   │       └── parse_data.py
    │   │   └── main.py
    ├── pyproject.toml
    └── README.md
├── pollen_data_gen
    ├── pollen_data_gen
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── depth.py
    │   └── simple.py
    └── pyproject.toml
├── Cargo.toml
├── .github
    ├── odgi.sh
    ├── tap-matcher.json
    └── workflows
    │   ├── code-quality.yml
    │   ├── docs.yml
    │   ├── build.yml
    │   └── flatgfa-py.yml
├── .zed
    └── settings.json
├── .gitignore
├── pyproject.toml
├── process.py
├── LICENSE
├── Makefile
├── Dockerfile
└── README.md


/flatgfa/tests/test.t:
--------------------------------------------------------------------------------
1 | ACTGG
2 | 


--------------------------------------------------------------------------------
/tests/subset-paths/ex1.paths:
--------------------------------------------------------------------------------
1 | path1


--------------------------------------------------------------------------------
/tests/subset-paths/ex2.paths:
--------------------------------------------------------------------------------
1 | path0


--------------------------------------------------------------------------------
/bench/.gitignore:
--------------------------------------------------------------------------------
1 | graphs/
2 | results/
3 | 


--------------------------------------------------------------------------------
/flatgfa/src/cli/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod cmds;
2 | 


--------------------------------------------------------------------------------
/mygfa/.gitignore:
--------------------------------------------------------------------------------
1 | docs/_build/
2 | dist/
3 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.gfa -text
2 | *.gaf -text
3 | 


--------------------------------------------------------------------------------
/tests/subset-paths/ex1.txt:
--------------------------------------------------------------------------------
1 | ARGS: basic/ex1.og


--------------------------------------------------------------------------------
/tests/subset-paths/ex2.txt:
--------------------------------------------------------------------------------
1 | ARGS: basic/ex2.og


--------------------------------------------------------------------------------
/flatgfa-py/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | docs/_build/
3 | 


--------------------------------------------------------------------------------
/tests/depth/subset-paths/ex1.txt:
--------------------------------------------------------------------------------
1 | ../../subset-paths/ex1.txt


--------------------------------------------------------------------------------
/tests/depth/subset-paths/ex2.txt:
--------------------------------------------------------------------------------
1 | ../../subset-paths/ex2.txt


--------------------------------------------------------------------------------
/tests/depth/subset-paths/ex1.paths:
--------------------------------------------------------------------------------
1 | ../../subset-paths/ex1.paths


--------------------------------------------------------------------------------
/tests/depth/subset-paths/ex2.paths:
--------------------------------------------------------------------------------
1 | ../../subset-paths/ex2.paths


--------------------------------------------------------------------------------
/pyrightconfig.json:
--------------------------------------------------------------------------------
1 | {
2 |   "venvPath": ".",
3 |   "venv": ".venv"
4 | }
5 | 


--------------------------------------------------------------------------------
/pollen_icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cucapra/pollen/HEAD/pollen_icon.png


--------------------------------------------------------------------------------
/pollen_icon_transparent.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cucapra/pollen/HEAD/pollen_icon_transparent.png


--------------------------------------------------------------------------------
/tests/basic/ex1.gfa:
--------------------------------------------------------------------------------
1 | H	VN:Z:1.0
2 | S	1	A
3 | L	1	+	2	+	0M
4 | S	2	C
5 | L	2	+	2	+	0M
6 | P	path1	1+,2+,2+	*
7 | 


--------------------------------------------------------------------------------
/tests/handmade/no-test-flip4.gfa:
--------------------------------------------------------------------------------
1 | H	VN:Z:1.0
2 | S	1	A
3 | S	2	TTT
4 | S	3	G
5 | P	x	1+,2+,3+	*
6 | L	1	+	2	+	0M


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/__init__.py:
--------------------------------------------------------------------------------
1 | """A reference implementation of the odgi library."""
2 | 
3 | __version__ = "0.1"
4 | 


--------------------------------------------------------------------------------
/tests/depth/basic/ex1.gfa:
--------------------------------------------------------------------------------
1 | H	VN:Z:1.0
2 | S	1	A
3 | L	1	+	2	+	0M
4 | S	2	C
5 | L	2	+	2	+	0M
6 | P	path1	1+,2+,2+	*
7 | 


--------------------------------------------------------------------------------
/bench/Makefile:
--------------------------------------------------------------------------------
1 | %.svg: %.csv bar.vl.json
2 | 	jq '.data.url |= "$<"' bar.vl.json | npx -p vega -p vega-lite vl2svg > $@
3 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/__init__.py:
--------------------------------------------------------------------------------
1 | """A collection of pangenome graph query accelerator tools"""
2 | 
3 | __version__ = "1"
4 | 


--------------------------------------------------------------------------------
/flatgfa-py/test/tiny.gaf:
--------------------------------------------------------------------------------
1 | foo	12	0	12	+	>1>2<4	38	5	17	12	12	0	cg:Z:150M
2 | bar	20	0	20	+	>1>2>3	30	7	27	20	20	0	cg:Z:150M
3 | 


--------------------------------------------------------------------------------
/tests/handmade/flip1.gfa:
--------------------------------------------------------------------------------
1 | H	VN:Z:1.0
2 | S	1	A
3 | S	2	TTT
4 | S	3	G
5 | P	x	1+,2-,3+	*
6 | L	1	+	2	+	0M
7 | L	2	+	3	+	0M
8 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod bench;
2 | pub mod chop;
3 | pub mod depth;
4 | pub mod extract;
5 | pub mod gaf;
6 | pub mod position;
7 | 


--------------------------------------------------------------------------------
/pollen_data_gen/pollen_data_gen/__init__.py:
--------------------------------------------------------------------------------
1 | """Converts GFA graphs into calyx-friendly .data files."""
2 | 
3 | __version__ = "0.1"
4 | 


--------------------------------------------------------------------------------
/tests/handmade/flip2.gfa:
--------------------------------------------------------------------------------
1 | H	VN:Z:1.0
2 | S	1	A
3 | S	2	TTT
4 | S	3	G
5 | P	x	1+,2-,3+	*
6 | P	y	1+,2-,3+	*
7 | L	1	+	2	+	0M
8 | L	2	+	3	+	0M
9 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [workspace]
2 | resolver = "2"
3 | members = ["flatgfa", "flatgfa-py"]
4 | 
5 | [profile.profiling]
6 | inherits = "release"
7 | debug = true
8 | 


--------------------------------------------------------------------------------
/mygfa/mygfa/__init__.py:
--------------------------------------------------------------------------------
1 | """Simple GFA parsing, printing, and pre-processing in Python."""
2 | 
3 | from .gfa import *  # noqa
4 | 
5 | __version__ = "0.1"
6 | 


--------------------------------------------------------------------------------
/flatgfa/tests/turnt.toml:
--------------------------------------------------------------------------------
1 | command = "../../target/debug/fgfa seq-export {filename} packed.seq ; cargo run seq-import packed.seq ; rm packed.seq"
2 | output.t = "-"
3 | 


--------------------------------------------------------------------------------
/.github/odgi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Run odgi inside Docker, mounting the workspace at /work and starting in the
3 | # container directory that corresponds to the caller's current directory.
4 | # Expansions are quoted so paths and arguments containing spaces stay intact.
5 | path=$(realpath -s --relative-to="$GITHUB_WORKSPACE" "$PWD")
6 | exec docker run -i --rm -v "$GITHUB_WORKSPACE:/work" --workdir "/work/$path" odgi odgi "$@"
7 | 


--------------------------------------------------------------------------------
/tests/depth/turnt.toml:
--------------------------------------------------------------------------------
1 | [envs.baseline]
2 | binary = true
3 | command = "odgi depth -i {filename} -d"
4 | 
5 | [envs.calyx]
6 | binary = true
7 | command = "exine depth -a -r {filename}"


--------------------------------------------------------------------------------
/tests/depth/subset-paths/turnt.toml:
--------------------------------------------------------------------------------
1 | [envs.baseline]
2 | command = "odgi depth -i ../{args} -d -s {base}.paths"
3 | 
4 | [envs.calyx-depth]
5 | command = "exine depth -a -r ../{args} -s {base}.paths"


--------------------------------------------------------------------------------
/flatgfa-py/test/tiny.gfa:
--------------------------------------------------------------------------------
 1 | H	VN:Z:1.0
 2 | S	1	CAAATAAG
 3 | S	2	AAATTTTCTGGAGTTCTAT
 4 | S	3	TTG
 5 | S	4	CCAACTCTCTG
 6 | P	one	1+,2+,4-	*
 7 | P	two	1+,2+,3+,4-	*
 8 | L	1	+	2	+	0M
 9 | L	2	+	4	-	0M
10 | L	2	+	3	+	0M
11 | L	3	+	4	-	0M
12 | 


--------------------------------------------------------------------------------
/tests/basic/ex2.gfa:
--------------------------------------------------------------------------------
 1 | H	VN:Z:1.0
 2 | S	1	AA
 3 | L	1	+	2	+	0M
 4 | S	2	AC
 5 | L	2	+	3	+	0M
 6 | L	2	+	4	+	0M
 7 | S	3	AG
 8 | L	3	+	2	+	0M
 9 | L	3	+	5	+	0M
10 | S	4	AT
11 | L	4	+	5	+	0M
12 | S	5	CG
13 | P	path0	1+,2+,3+,2+,4+,5+	*
14 | P	path1	1+,2+,3+,5+	*
15 | 


--------------------------------------------------------------------------------
/tests/depth/basic/ex2.gfa:
--------------------------------------------------------------------------------
 1 | H	VN:Z:1.0
 2 | S	1	AA
 3 | L	1	+	2	+	0M
 4 | S	2	AC
 5 | L	2	+	3	+	0M
 6 | L	2	+	4	+	0M
 7 | S	3	AG
 8 | L	3	+	2	+	0M
 9 | L	3	+	5	+	0M
10 | S	4	AT
11 | L	4	+	5	+	0M
12 | S	5	CG
13 | P	path0	1+,2+,3+,2+,4+,5+	*
14 | P	path1	1+,2+,3+,5+	*
15 | 


--------------------------------------------------------------------------------
/flatgfa/src/lib.rs:
--------------------------------------------------------------------------------
 1 | pub mod cli;
 2 | pub mod file;
 3 | pub mod flatbed;
 4 | pub mod flatgfa;
 5 | pub mod gfaline;
 6 | pub mod memfile;
 7 | pub mod namemap;
 8 | pub mod ops;
 9 | pub mod packedseq;
10 | pub mod parse;
11 | pub mod pool;
12 | pub mod print;
13 | 
14 | pub use flatgfa::*;
15 | 


--------------------------------------------------------------------------------
/.zed/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language_overrides": {
 3 |     "Python": {
 4 |       "formatter": {
 5 |         "external": {
 6 |           "command": "bash",
 7 |           "arguments": ["-c", "ruff format --stdin-filename {buffer_path}"]
 8 |         }
 9 |       }
10 |     }
11 |   }
12 | }
13 | 


--------------------------------------------------------------------------------
/mygfa/mygfa/__main__.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from .gfa import Graph
 3 | 
 4 | 
 5 | if __name__ == "__main__":
 6 |     mygraph = Graph.parse(sys.stdin)
 7 |     if len(sys.argv) > 1 and sys.argv[1] == "--nl":
 8 |         mygraph.emit(sys.stdout, False)
 9 |     else:
10 |         mygraph.emit(sys.stdout)
11 | 


--------------------------------------------------------------------------------
/flatgfa-py/docs/conf.py:
--------------------------------------------------------------------------------
 1 | project = "flatgfa"
 2 | copyright = "2024, Capra Lab"
 3 | author = "Capra Lab"
 4 | 
 5 | extensions = ["sphinx.ext.autodoc"]
 6 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
 7 | 
 8 | html_theme = "alabaster"
 9 | 
10 | autodoc_member_order = "bysource"
11 | autodoc_typehints_format = "short"
12 | 


--------------------------------------------------------------------------------
/flatgfa-py/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "flatgfa-py"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [lib]
 7 | name = "flatgfa"
 8 | crate-type = ["cdylib"]
 9 | 
10 | [dependencies]
11 | pyo3 = { version = "0.25.0", features = ["abi3-py38", "multiple-pymethods"] }
12 | flatgfa = { path = "../flatgfa" }
13 | memmap = "0.7.0"
14 | 


--------------------------------------------------------------------------------
/flatgfa-py/examples/depth.py:
--------------------------------------------------------------------------------
 1 | import flatgfa
 2 | from collections import Counter
 3 | 
 4 | graph = flatgfa.parse("../tests/k.gfa")
 5 | depths = Counter()
 6 | for path in graph.paths:
 7 |     for step in path:
 8 |         depths[step.segment.id] += 1
 9 | 
10 | print("#node.id\tdepth")
11 | for seg in graph.segments:
12 |     print("{}\t{}".format(seg.name, depths[seg.id]))
13 | 


--------------------------------------------------------------------------------
/pollen_py/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["flit_core >=3.2,<4"]
 3 | build-backend = "flit_core.buildapi"
 4 | 
 5 | [project]
 6 | name = "pollen"
 7 | authors = [
 8 |     { name = "Susan Garry", email = "shg64@cs.cornell.edu" }
 9 | ]
10 | readme = "README.md"
11 | dynamic = ["version", "description"]
12 | 
13 | [project.scripts]
14 | exine = "pollen.main:main"
15 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/paths.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import mygfa
 3 | 
 4 | 
 5 | def paths(graph: mygfa.Graph) -> mygfa.Graph:
 6 |     """Print the names of the paths found in `graph`."""
 7 |     pathnames = graph.paths.keys()
 8 |     print("\n".join(pathnames))
 9 |     return graph
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     paths(mygfa.Graph.parse(open(sys.argv[1], "r", encoding="utf-8")))
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | *.py[cod]
 3 | .DS_Store
 4 | **/*.chop
 5 | **/*.crush
 6 | **/*.degree
 7 | **/*.depth
 8 | **/*.emit
 9 | **/*.extract
10 | **/*.flatten
11 | **/*.flip
12 | **/*.matrix
13 | **/*.overlap
14 | **/*.validate
15 | **/*.og
16 | **/*.out
17 | **/*.flatgfa
18 | og_to_gfa.py
19 | compute_maxes.py
20 | 
21 | target/
22 | pollen/*.rlib
23 | 
24 | slow_odgi/dist/
25 | 
26 | .vscode/
27 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/norm.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import mygfa
 3 | 
 4 | 
 5 | def norm(graph: mygfa.Graph) -> mygfa.Graph:
 6 |     """Return `graph` as-is. The stable entry order (headers, segments, paths,
 7 |     links) is not produced here; presumably parse/emit supply it -- confirm.
 8 |     """
 9 |     return graph
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     newgraph = norm(mygfa.Graph.parse(sys.stdin))
14 |     newgraph.emit(sys.stdout, "--nl" not in sys.argv[1:])
15 | 


--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gfa
 2 | *.og
 3 | *.bed
 4 | *.out
 5 | *.chop
 6 | *.crush
 7 | *.degree
 8 | *.depth
 9 | *.depthpaths
10 | *.flatten
11 | *.flip
12 | *.inj
13 | *.matrix
14 | *.norm
15 | *.overlap
16 | *.overlappaths
17 | *.paths
18 | *.validate
19 | *.flatten.fasta
20 | 
21 | temp.*
22 | 
23 | basic/*.og
24 | basic/*.out
25 | 
26 | subset-paths/*.out
27 | 
28 | depth/*.out
29 | depth/basic/*.out
30 | depth/subset-paths/*.out
31 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/bench.rs:
--------------------------------------------------------------------------------
 1 | use crate::memfile;
 2 | use rayon::iter::ParallelIterator;
 3 | 
 4 | /// Count the lines in a file, like `wc -l`.
 5 | ///
 6 | /// The file is mapped with `memfile::map_file` and split on `b'\n'` by
 7 | /// `memfile::MemchrSplit`; `parallel` chooses whether the resulting pieces are
 8 | /// counted through `ParallelIterator` or through the sequential `Iterator`.
 9 | pub fn line_count(filename: &str, parallel: bool) -> usize {
10 |     let mapped = memfile::map_file(filename);
11 |     let pieces = memfile::MemchrSplit::new(b'\n', &mapped);
12 |     if !parallel {
13 |         return Iterator::count(pieces);
14 |     }
15 |     ParallelIterator::count(pieces)
16 | }
17 | 


--------------------------------------------------------------------------------
/mygfa/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["flit_core >=3.2,<4"]
 3 | build-backend = "flit_core.buildapi"
 4 | 
 5 | [project]
 6 | name = "mygfa"
 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}]
 8 | classifiers = ["License :: OSI Approved :: MIT License"]
 9 | dynamic = ["version", "description"]
10 | readme = "README.md"
11 | 
12 | [project.urls]
13 | Home = "https://github.com/cucapra/pollen/tree/main/mygfa"
14 | 


--------------------------------------------------------------------------------
/mygfa/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # Make mygfa module available to autodoc.
 2 | import sys
 3 | import os
 4 | 
 5 | sys.path.insert(0, os.path.abspath(".."))
 6 | 
 7 | project = "mygfa"
 8 | copyright = "2024, Capra Lab"
 9 | author = "Capra Lab"
10 | 
11 | extensions = ["sphinx.ext.autodoc"]
12 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
13 | 
14 | html_theme = "alabaster"
15 | 
16 | autodoc_member_order = "bysource"
17 | autodoc_typehints_format = "short"
18 | 


--------------------------------------------------------------------------------
/flatgfa/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "flatgfa"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [[bin]]
 7 | name = "fgfa"
 8 | path = "src/cli/main.rs"
 9 | 
10 | [dependencies]
11 | argh = "0.1.13"
12 | atoi = "2.0.0"
13 | bit-vec = "0.8.0"
14 | bstr = "1.12.0"
15 | memchr = "2.7.4"
16 | memmap = "0.7.0"
17 | num_enum = "0.7.3"
18 | rayon = "1.10.0"
19 | tinyvec = "1.9.0"
20 | zerocopy = { version = "0.8.25", features = ["derive"] }
21 | 
22 | [dev-dependencies]
23 | rand = "0.8"
24 | 


--------------------------------------------------------------------------------
/slow_odgi/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["flit_core >=3.2,<4"]
 3 | build-backend = "flit_core.buildapi"
 4 | 
 5 | [project]
 6 | name = "slow_odgi"
 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}]
 8 | readme = "README.md"
 9 | dynamic = ["version", "description"]
10 | dependencies = ["mygfa"]
11 | 
12 | [project.urls]
13 | Home = "https://github.com/cucapra/pollen/tree/main/slow_odgi"
14 | 
15 | [project.scripts]
16 | slow_odgi = "slow_odgi.__main__:main"


--------------------------------------------------------------------------------
/pollen_data_gen/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["flit_core >=3.2,<4"]
 3 | build-backend = "flit_core.buildapi"
 4 | 
 5 | [project]
 6 | name = "pollen_data_gen"
 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}]
 8 | dynamic = ["version", "description"]
 9 | dependencies = ["mygfa"]
10 | 
11 | [project.urls]
12 | Home = "https://github.com/cucapra/pollen/tree/main/pollen_data_gen"
13 | 
14 | [project.scripts]
15 | pollen_data_gen = "pollen_data_gen.__main__:main"


--------------------------------------------------------------------------------
/flatgfa-py/examples/gaf.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import flatgfa
 3 | 
 4 | TEST_DIR = pathlib.Path(__file__).parent
 5 | TEST_GFA = TEST_DIR / "../test/tiny.gfa"
 6 | TEST_GAF = TEST_DIR / "../test/tiny.gaf"
 7 | graph = flatgfa.parse(str(TEST_GFA))
 8 | gaf = str(TEST_GAF)
 9 | gaf_parser = graph.all_reads(gaf)
10 | for lines in gaf_parser:
11 |     print(lines.name)
12 |     print(lines.sequence())
13 |     print(lines.segment_ranges())
14 |     for element in lines:
15 |         print(element.handle)
16 |         print(element.range)
17 | 


--------------------------------------------------------------------------------
/.github/tap-matcher.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "problemMatcher": [
 3 |         {
 4 |             "owner": "turnt-msg",
 5 |             "pattern": [{
 6 |                 "regexp": "^not ok \\d+ - ([^#\\s]+)\\s+#\\s+(.*)",
 7 |                 "message": 1,
 8 |                 "file": 2
 9 |             }]
10 |         },
11 |         {
12 |             "owner": "turnt-nomsg",
13 |             "pattern": [{
14 |                 "regexp": "^not ok \\d+ - ([^#\\s]+)",
15 |                 "message": 0,
16 |                 "file": 1
17 |             }]
18 |         }
19 |     ]
20 | }
21 | 


--------------------------------------------------------------------------------
/mygfa/example.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import mygfa
 3 | 
 4 | 
 5 | def print_depth(graph: mygfa.Graph) -> None:
 6 |     # Count the number of times that any path passes through a segment.
 7 |     seg_depths = {name: 0 for name in graph.segments}
 8 |     for path in graph.paths.values():
 9 |         for step in path.segments:
10 |             seg_depths[step.name] += 1
11 | 
12 |     # Print the counts.
13 |     print("seg\tdepth")
14 |     for name, depth in seg_depths.items():
15 |         print(f"{name}\t{depth}")
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     print_depth(mygfa.Graph.parse(sys.stdin))
20 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "pollen_workspace"
 3 | version = "0.0.0"
 4 | requires-python = ">=3.8"
 5 | dependencies = [
 6 |     "mygfa",
 7 |     "slow_odgi",
 8 |     "pollen",
 9 |     "pollen_data_gen",
10 |     "turnt>=1.11.0",
11 | ]
12 | 
13 | [tool.uv.sources]
14 | mygfa = { workspace = true }
15 | slow_odgi = { workspace = true }
16 | pollen = { workspace = true }
17 | pollen_data_gen = { workspace = true }
18 | flatgfa = { workspace = true }
19 | 
20 | [tool.uv.workspace]
21 | members = [
22 |     "mygfa",
23 |     "slow_odgi",
24 |     "pollen_py",
25 |     "pollen_data_gen",
26 |     "flatgfa-py",
27 | ]
28 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/position.rs:
--------------------------------------------------------------------------------
 1 | use crate::flatgfa;
 2 | 
 3 | /// Locate the step of `path` that covers base `offset`.
 4 | ///
 5 | /// Walks the path's steps in order while accumulating segment lengths. Returns
 6 | /// the first step whose cumulative end lies past `offset`, paired with how far
 7 | /// `offset` is beyond the combined length of the steps before it, or `None`
 8 | /// when `offset` is at or past the path's total length.
 9 | pub fn position(
10 |     gfa: &flatgfa::FlatGFA,
11 |     path: &flatgfa::Path,
12 |     offset: usize,
13 | ) -> Option<(flatgfa::Handle, usize)> {
14 |     // `start` is the combined length of all steps already passed.
15 |     let mut start = 0;
16 |     for &handle in &gfa.steps[path.steps] {
17 |         let end = start + gfa.get_handle_seg(handle).len();
18 |         if offset < end {
19 |             return Some((handle, offset - start));
20 |         }
21 |         start = end;
22 |     }
23 | 
24 |     None
25 | }
26 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/validate_setup.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import mygfa
 4 | 
 5 | 
 6 | def drop_some_links(graph: mygfa.Graph) -> mygfa.Graph:
 7 |     """Given a graph, (pseudo)randomly drop 90% of the Links of the graph.
 8 |     This serves as a starting point from which to test `validate`.
 9 |     """
10 |     random.seed(4)
11 |     links = list(sorted(graph.links))
12 |     links[:] = random.sample(links, int(0.1 * len(links)))
13 |     return mygfa.Graph(graph.headers, graph.segments, links, graph.paths)
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     newgraph = drop_some_links(mygfa.Graph.parse(sys.stdin))
18 |     newgraph.emit(sys.stdout)
19 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/matrix.py:
--------------------------------------------------------------------------------
 1 | import mygfa
 2 | import mygfa.preprocess
 3 | 
 4 | 
 5 | def matrix(graph: mygfa.Graph) -> mygfa.Graph:
 6 |     """Print the graph in sparse matrix format."""
 7 | 
 8 |     # Just keeping up with the odgi header format...
 9 |     topseg = max([int(i) for i in graph.segments.keys()])
10 |     print(" ".join(str(i) for i in [topseg, topseg, 2 * len(graph.links)]))
11 | 
12 |     _, outs = mygfa.preprocess.adjlist(graph)
13 |     for seg, neighbors in outs.items():
14 |         for neighbor in neighbors:
15 |             print(" ".join([seg.name, neighbor.name, "1"]))
16 |             print(" ".join([neighbor.name, seg.name, "1"]))
17 |     return graph
18 | 


--------------------------------------------------------------------------------
/tests/handmade/flip3.gfa:
--------------------------------------------------------------------------------
 1 | H	VN:Z:1.0
 2 | S	1	CAAATAAG
 3 | S	2	A
 4 | S	3	G
 5 | S	4	T
 6 | S	5	C
 7 | S	6	TTG
 8 | S	7	A
 9 | S	8	G
10 | S	9	AAATTTTCTGGAGTTCTAT
11 | S	10	A
12 | S	11	T
13 | S	12	ATAT
14 | S	13	A
15 | S	14	T
16 | S	15	CCAACTCTCTG
17 | P	x	1+,3+,5+,6+,8+,9-,11+,12+,14+,15-	8M,1M,1M,3M,1M,19M,1M,4M,1M,11M
18 | L	1	+	2	+	0M
19 | L	1	+	3	+	0M
20 | L	10	+	12	+	0M
21 | L	11	+	12	+	0M
22 | L	12	+	13	+	0M
23 | L	12	+	14	+	0M
24 | L	13	+	15	+	0M
25 | L	14	+	15	+	0M
26 | L	2	+	4	+	0M
27 | L	2	+	5	+	0M
28 | L	3	+	4	+	0M
29 | L	3	+	5	+	0M
30 | L	4	+	6	+	0M
31 | L	5	+	6	+	0M
32 | L	6	+	7	+	0M
33 | L	6	+	8	+	0M
34 | L	7	+	9	+	0M
35 | L	8	+	9	+	0M
36 | L	9	+	10	+	0M
37 | L	9	+	11	+	0M
38 | 


--------------------------------------------------------------------------------
/tests/handmade/crush1.gfa:
--------------------------------------------------------------------------------
 1 | H	VN:Z:1.0
 2 | S	1	CNNATNNG
 3 | S	2	A
 4 | S	3	G
 5 | S	4	N
 6 | S	5	N
 7 | S	6	NNG
 8 | S	7	A
 9 | S	8	G
10 | S	9	NNANNNNCTGGAGNNCTAT
11 | S	10	A
12 | S	11	T
13 | S	12	NNNN
14 | S	13	A
15 | S	14	T
16 | S	15	CCNNCTCTCTG
17 | P	x	1+,3+,5+,6+,8+,9+,11+,12+,14+,15+	*
18 | P	y	1+,2+,4+,6+,7+,9+,10+,12+,13+,15+	*
19 | L	1	+	2	+	0M
20 | L	1	+	3	+	0M
21 | L	2	+	4	+	0M
22 | L	2	+	5	+	0M
23 | L	3	+	4	+	0M
24 | L	3	+	5	+	0M
25 | L	4	+	6	+	0M
26 | L	5	+	6	+	0M
27 | L	6	+	7	+	0M
28 | L	6	+	8	+	0M
29 | L	7	+	9	+	0M
30 | L	8	+	9	+	0M
31 | L	9	+	10	+	0M
32 | L	9	+	11	+	0M
33 | L	10	+	12	+	0M
34 | L	11	+	12	+	0M
35 | L	12	+	13	+	0M
36 | L	12	+	14	+	0M
37 | L	13	+	15	+	0M
38 | L	14	+	15	+	0M
39 | 


--------------------------------------------------------------------------------
/mygfa/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/flatgfa-py/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/degree.py:
--------------------------------------------------------------------------------
 1 | import mygfa
 2 | import mygfa.preprocess
 3 | 
 4 | 
 5 | def degree(graph: mygfa.Graph) -> mygfa.Graph:
 6 |     """The degree of a node is just the cardinality of adjlist for that node."""
 7 |     print("\t".join(["#node.id", "node.degree"]))
 8 |     ins, outs = mygfa.preprocess.adjlist(graph)
 9 |     for seg in graph.segments.values():
10 |         segname = seg.name
11 |         out_degree = len(outs[mygfa.Handle(segname, True)]) + len(
12 |             outs[mygfa.Handle(segname, False)]
13 |         )
14 |         in_degree = len(ins[mygfa.Handle(segname, True)]) + len(
15 |             ins[mygfa.Handle(segname, False)]
16 |         )
17 |         print("\t".join([segname, str(in_degree + out_degree)]))
18 |     return graph
19 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/somepaths.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import mygfa
 4 | 
 5 | 
 6 | def somepaths(graph: mygfa.Graph, droprate: int = 0) -> mygfa.Graph:
 7 |     """Print the names of the paths found in `graph`.
 8 |     The droprate represents the percentage of paths to drop.
 9 |     """
10 |     pathnames = list(graph.paths.keys())
11 |     if droprate > 0:
12 |         random.seed(4)
13 |         pathnames[:] = random.sample(
14 |             pathnames, int((100 - droprate) / 100 * len(pathnames))
15 |         )
16 |     for name in pathnames:
17 |         print(name)
18 |     return graph
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     somepaths(
23 |         mygfa.Graph.parse(open(sys.argv[1], "r", encoding="utf-8")), int(sys.argv[2])
24 |     )
25 | 


--------------------------------------------------------------------------------
/flatgfa-py/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "flatgfa"
 3 | version = "0.2.0"
 4 | description = "efficient processing of pangenomes in Graphical Fragment Assembly (GFA) format"
 5 | readme = "README.md"
 6 | requires-python = ">=3.8"
 7 | license = "MIT"
 8 | keywords = ["genomics", "pangenomics", "gfa"]
 9 | classifiers = [
10 |     "Topic :: Scientific/Engineering :: Bio-Informatics",
11 |     "License :: OSI Approved :: MIT License",
12 | ]
13 | 
14 | [project.urls]
15 | repository = "https://github.com/cucapra/pollen"
16 | documentation = "https://cucapra.github.io/pollen/flatgfa/"
17 | 
18 | [dependency-groups]
19 | dev = ["pytest>=8.3.0"]
20 | 
21 | [build-system]
22 | requires = ["maturin>=1.0,<2.0"]
23 | build-backend = "maturin"
24 | 
25 | [tool.maturin]
26 | features = ["pyo3/extension-module"]
27 | 


--------------------------------------------------------------------------------
/flatgfa-py/test/test_gaf.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import flatgfa
 3 | 
 4 | TEST_DIR = pathlib.Path(__file__).parent
 5 | TEST_GFA = TEST_DIR / "tiny.gfa"
 6 | TEST_GAF = TEST_DIR / "tiny.gaf"
 7 | 
 8 | 
 9 | def test_gaf_seqs():
10 |     gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes())
11 |     gaf = gfa.all_reads(str(TEST_GAF))
12 |     seqs = ["".join(e.sequence() for e in line) for line in gaf]
13 |     assert seqs == [
14 |         "AAGAAATTTTCT",
15 |         "GAAATTTTCTGGAGTTCTAT",
16 |     ]
17 | 
18 | 
def test_gaf_ranges():
    """Each GAF read exposes the expected `range` pair for every event."""
    graph = flatgfa.parse_bytes(TEST_GFA.read_bytes())
    observed = []
    for read in graph.all_reads(str(TEST_GAF)):
        observed.append([event.range for event in read])
    assert observed == [
        [(5, 8), (0, 9), (1, 0)],
        [(7, 8), (0, 18), (0, 0)],
    ]
27 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from sys import exit
 3 | 
 4 | import pollen.depth.main as depth
 5 | 
 6 | 
 7 | def main():
 8 |     # Parse commandline input
 9 |     parser = argparse.ArgumentParser()
10 | 
11 |     subparsers = parser.add_subparsers()
12 | 
13 |     depth_parser = subparsers.add_parser(
14 |         "depth", help="Compute node depth", conflict_handler="resolve"
15 |     )
16 |     depth.config_parser(depth_parser)
17 |     depth_parser.set_defaults(command="depth")
18 | 
19 |     args = parser.parse_args()
20 | 
21 |     if "command" not in args:
22 |         parser.print_help()
23 |         exit(-1)
24 | 
25 |     if args.command == "depth":
26 |         depth.run(args)
27 | 
28 |     else:
29 |         raise Exception("Command not recognized")
30 | 
31 | 
32 | if __name__ == "__main__":
33 |     main()
34 | 


--------------------------------------------------------------------------------
/mygfa/README.md:
--------------------------------------------------------------------------------
 1 | mygfa
 2 | =====
 3 | 
 4 | This is a simple Python library for parsing, manipulating, and emitting pangenomic graphs in the [GFA][] format.
 5 | It prioritizes simplicity and clarity over performance and functionality.
 6 | 
 7 | As demonstrated in [`example.py`](./example.py), this is what it looks like to compute the node depth for a GFA file:
 8 | 
 9 |     import mygfa
10 |     import sys
11 |     graph = mygfa.Graph.parse(sys.stdin)
12 |     seg_depths = {name: 0 for name in graph.segments}
13 |     for path in graph.paths.values():
14 |         for step in path.segments:
15 |             seg_depths[step.name] += 1
16 | 
17 | Type `pip install mygfa` to get started.
18 | Then check out the [API documentation][docs].
19 | 
20 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
21 | [docs]: http://cucapra.github.io/pollen/mygfa/
22 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/depth.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | import mygfa
 3 | import mygfa.preprocess
 4 | 
 5 | 
 6 | def depth(graph: mygfa.Graph, inputpaths: Optional[List[str]]) -> mygfa.Graph:
 7 |     """The depth of a node is the cardinality of node_step for that node."""
 8 |     print("\t".join(["#node.id", "depth", "depth.uniq"]))
 9 |     for seg, crossings in mygfa.preprocess.node_steps(graph).items():
10 |         # Each crossing is a (path name, index on path, direction) tuple.
11 |         # We only want to count crossings that are on input paths.
12 |         crossings = [c for c in crossings if inputpaths is None or c[0] in inputpaths]
13 |         # For depth.uniq, we need to know how many unique path-names there are.
14 |         uniq_path_names = set(c[0] for c in crossings)
15 |         print("\t".join([seg, str(len(crossings)), str(len(uniq_path_names))]))
16 |     return graph
17 | 


--------------------------------------------------------------------------------
/bench/sizes.py:
--------------------------------------------------------------------------------
 1 | import tomllib
 2 | import requests
 3 | import os
 4 | 
 5 | GRAPHS_TOML = os.path.join(os.path.dirname(__file__), "graphs.toml")
 6 | SIZE_NAMES = {
 7 |     0: "",
 8 |     3: "k",
 9 |     6: "M",
10 |     9: "G",
11 |     12: "T",
12 | }
13 | 
14 | 
15 | def fmt_size(count):
16 |     for scale, name in reversed(SIZE_NAMES.items()):
17 |         unit = 10**scale
18 |         if count > unit:
19 |             return "{:.0f}{}B".format(count / unit, name)
20 | 
21 | 
def show_sizes():
    """Print the category, name, and formatted download size of each graph.

    The graph URLs are read from the TOML catalog at `GRAPHS_TOML`. Each size
    comes from the `Content-Length` header of an HTTP HEAD request.
    """
    with open(GRAPHS_TOML, "rb") as toml_file:
        catalog = tomllib.load(toml_file)

    for category in catalog:
        for name, url in catalog[category].items():
            response = requests.head(url)
            n_bytes = int(response.headers["Content-Length"])
            print(category, name, fmt_size(n_bytes))
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     show_sizes()
35 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/argparse_custom.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | 
 4 | class store_const_and_arg(argparse.Action):
 5 |     """
 6 |     An argparse action which stores a constant and stores the argument(s)
 7 |     passed to this flag. Useful when using flags as mutually exclusive
 8 |     switches that also need to accept an argument.
 9 |     'dest' is the destination for option arguments (as usual), and
10 |     'dest2' is the destination where 'const' is stored.
11 |     """
12 | 
13 |     def __init__(self, option_strings, dest, dest2=None, nargs=None, **kwargs):
14 |         if dest2 == None:
15 |             raise Exception("dest2 must be defined")
16 |         self.const_dest = dest2
17 | 
18 |         super().__init__(option_strings, dest, **kwargs)
19 | 
20 |     def __call__(self, parser, namespace, values, option_string):
21 |         setattr(namespace, self.const_dest, self.const)
22 |         setattr(namespace, self.dest, values)
23 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/crush.py:
--------------------------------------------------------------------------------
 1 | import mygfa
 2 | import mygfa.preprocess
 3 | 
 4 | 
 5 | def crush_seg(seg: mygfa.Segment) -> mygfa.Segment:
 6 |     """Compact any "runs" of N down to a single N."""
 7 |     new_seq = ""
 8 |     in_n = False
 9 |     for char in str(seg.seq):
10 |         if char == "N":
11 |             if in_n:
12 |                 continue
13 |             in_n = True
14 |         else:
15 |             in_n = False
16 |         new_seq += char
17 |     return mygfa.Segment(seg.name, mygfa.Strand(new_seq))
18 | 
19 | 
def crush(graph: mygfa.Graph) -> mygfa.Graph:
    """Build a new graph whose segments have all been crushed.

    Headers and links are carried over as-is. Path overlaps are dropped,
    because odgi drops them too.
    """
    crushed_segs = {}
    for name, seg in graph.segments.items():
        crushed_segs[name] = crush_seg(seg)
    paths_without_overlaps = mygfa.preprocess.drop_all_overlaps(graph.paths)
    return mygfa.Graph(
        graph.headers, crushed_segs, graph.links, paths_without_overlaps
    )
30 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/inject_setup.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import mygfa
 4 | import mygfa.preprocess
 5 | 
 6 | 
 7 | def print_bed(graph: mygfa.Graph) -> None:
 8 |     """Creates a reasonable query for `inject`.
 9 |     Each entry of the output is a BED where:
10 |       `name` is the name of an existing path.
11 |       `lo`/`hi` are the start/end points that we should walk over; lo <= hi.
12 |       `new` is the name of the path we wish to create.
13 |     """
14 |     random.seed(4)
15 |     for path in graph.paths.values():
16 |         length = len(mygfa.preprocess.pathseq(graph)[path.name])
17 |         for i in range(random.randint(0, 5)):
18 |             low = random.randint(0, length - 1)
19 |             high = random.randint(low + 1, length)
20 |             bed = mygfa.Bed(path.name, low, high, f"{path.name}_{i}")
21 |             print(bed)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     print_bed(mygfa.Graph.parse(sys.stdin))
26 | 


--------------------------------------------------------------------------------
/process.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import sys
 3 | 
 4 | 
 5 | def format_graph_depth_table(node_depths):
 6 |     """
 7 |     Reads a graph depth table from the commandline and removes the depth.uniq column
 8 |     """
 9 |     for row in node_depths:
10 |         print(row[: row.rfind("\t")])
11 | 
12 | 
def format_json_data(node_depths, mem="segments0"):
    """
    Print the `mem` memory of a JSON data file (calyx output) as a two-column
    node-id/depth table, numbering the nodes from 1.
    """
    depths = node_depths["memories"][mem]
    print("#node.id\tdepth")
    for node_id, node_depth in enumerate(depths, start=1):
        print(f"{node_id}\t{node_depth}")
21 | 
22 | 
23 | if __name__ == "__main__":
24 |     """
25 |     Take a commandline arg, gdt or json, to specify which file to convert
26 |     """
27 |     format = sys.argv[1]
28 |     if format == "gdt":
29 |         format_graph_depth_table(sys.stdin.readlines())
30 |     elif format == "json":
31 |         data = json.load(sys.stdin)
32 |         format_json_data(data)
33 | 


--------------------------------------------------------------------------------
/bench/bar.vl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "data": {
 3 |     "url": "FILE.csv",
 4 |     "format": {
 5 |       "type": "csv",
 6 |       "parse": {"mean": "number", "stddev": "number"}
 7 |     }
 8 |   },
 9 |   "layer": [
10 |     {
11 |       "mark": "bar",
12 |       "encoding": {
13 |         "x": { "field": "graph", "type": "nominal", "title": null },
14 |         "xOffset": { "field": "cmd" },
15 |         "y": { "field": "mean", "type": "quantitative",
16 |                "title": "running time (seconds)" },
17 |         "color": { "field": "cmd", "title": null }
18 |       }
19 |     },
20 |     {
21 |       "mark": {"type": "errorbar", "ticks": {"color": "black"}},
22 |       "encoding": {
23 |         "x": { "field": "graph", "type": "nominal" },
24 |         "xOffset": { "field": "cmd" },
25 |         "y": { "field": "mean", "type": "quantitative",
26 |                "title": "running time (seconds)" },
27 |         "yError": { "field": "stddev" }
28 |       }
29 |     }
30 |   ]
31 | }
32 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/validate.py:
--------------------------------------------------------------------------------
 1 | import mygfa
 2 | import mygfa.preprocess
 3 | 
 4 | 
 5 | def validate(graph: mygfa.Graph) -> mygfa.Graph:
 6 |     """Does the underlying set of Links support the paths that the graph has?"""
 7 |     _, outs = mygfa.preprocess.adjlist(graph)
 8 | 
 9 |     for path in graph.paths.values():
10 |         length = len(path.segments)
11 |         if length < 2:
12 |             continue  # Success: done with this path.
13 |         for i in range(length - 1):
14 |             seg_from = path.segments[i]
15 |             seg_to = path.segments[i + 1]
16 |             if (
17 |                 seg_to not in outs[seg_from]
18 |                 and seg_from.rev() not in outs[seg_to.rev()]
19 |             ):
20 |                 print(
21 |                     f"[odgi::validate] error: the path {path.name} "
22 |                     "does not respect the graph topology: the link "
23 |                     f"{seg_from},{seg_to} is missing."
24 |                 )
25 |     return graph
26 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/proofs.py:
--------------------------------------------------------------------------------
 1 | import mygfa
 2 | import mygfa.preprocess
 3 | 
 4 | 
 5 | def paths_logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool:
 6 |     """Are the paths in g1 logically "less than or equal to" those in g2?
 7 |     That is, for all paths p in g1, does the sequence charted by
 8 |     p in g1 match the sequence charted by p in g2?
 9 |     """
10 |     pathseqs_g1 = mygfa.preprocess.pathseq(g1)
11 |     pathseqs_g2 = mygfa.preprocess.pathseq(g2)
12 |     for p in g1.paths.keys():
13 |         if p not in g2.paths.keys() or pathseqs_g1[p] != pathseqs_g2[p]:
14 |             return False
15 |     return True
16 | 
17 | 
def logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool:
    """Decide whether `g1` is logically "less than or equal to" `g2`.

    In other words: could a user of `g1` switch to `g2` without a hitch?
    `g2` is allowed to contain more than `g1` does.

    Only the paths are compared for now; more checks will be added as we
    think of them.
    """
    paths_ok = paths_logically_le(g1, g2)
    return paths_ok
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Cornell University
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/overlap.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import mygfa
 3 | import mygfa.preprocess
 4 | 
 5 | 
 6 | def touches(path1: str, path2: str, graph: mygfa.Graph) -> bool:
 7 |     """Are these two paths different,
 8 |     and if so, do they have any segments in common?
 9 |     """
10 |     if path1 == path2:
11 |         return False
12 |     segs1 = set(graph.paths[path1].segments)
13 |     segs2 = set(graph.paths[path2].segments)
14 |     return bool(segs1 & segs2)
15 | 
16 | 
def overlap(graph: mygfa.Graph, inputpaths: List[str]) -> mygfa.Graph:
    """Which paths touch these input paths?

    Prints one tab-separated row per (input path, touched path) pair. The
    header row is printed only if at least one pair is found. Every name in
    `inputpaths` must be a path of `graph`. The graph is returned unchanged.
    """
    header_printed = False
    # Computed on the first hit and then reused for every row.
    pathseqs = None
    for ip in inputpaths:
        assert ip in graph.paths
        for path in graph.paths.keys():
            if not touches(ip, path, graph):
                continue
            if not header_printed:
                print("\t".join(["#path", "start", "end", "path.touched"]))
                header_printed = True
            if pathseqs is None:
                pathseqs = mygfa.preprocess.pathseq(graph)
            print("\t".join([ip, "0", str(len(pathseqs[ip])), path]))
    return graph
33 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | TEST_FILES := t k note5 overlap q.chop LPA DRB1-3123 chr6.C4
 2 | GFA_URL := https://raw.githubusercontent.com/pangenome/odgi/ebc493f2622f49f1e67c63c1935d68967cd16d85/test
 3 | 
 4 | # A smaller set of test inputs for faster runs.
 5 | ifdef SMALL
 6 | TEST_FILES := t k note5 overlap q.chop DRB1-3123
 7 | endif
 8 | 
 9 | tests/%.gfa:
10 | 	curl -Lo ./$@ $(GFA_URL)/$*.gfa
11 | 
12 | tests/%.og: tests/%.gfa
13 | 	odgi build -g $< -o $@
14 | 
# Download every test GFA file.
.PHONY: fetch
fetch: $(TEST_FILES:%=tests/%.gfa)

# Build an odgi .og file for every test GFA file.
.PHONY: fetch-og
fetch-og: $(TEST_FILES:%=tests/%.og)

# Use $(MAKE) so that flags and the jobserver propagate to the sub-make.
.PHONY: test-slow-odgi
test-slow-odgi: fetch
	$(MAKE) -C slow_odgi test
23 | 
24 | .PHONY: test-flatgfa
25 | test-flatgfa: fetch
26 | 	cd flatgfa ; cargo build
27 | 
28 | 	turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa
29 | 
30 | 	-turnt --save -v -e chop_oracle_fgfa tests/*.gfa
31 | 	turnt -v -e flatgfa_chop tests/*.gfa
32 | 
33 | 	-turnt --save -v -e odgi_depth tests/*.gfa
34 | 	turnt -v -e flatgfa_depth tests/*.gfa
35 | 
36 | 	-turnt --save -v -e odgi_extract tests/*.gfa
37 | 	turnt -v -e flatgfa_extract tests/*.gfa
38 | 
# Remove downloaded inputs and generated test artifacts. Declared phony so a
# stray file named `clean` cannot disable the rule.
.PHONY: clean
clean:
	-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.extract tests/*.gfa tests/*.og


--------------------------------------------------------------------------------
/bench/config.toml:
--------------------------------------------------------------------------------
 1 | [tools]
 2 | odgi = "odgi"
 3 | fgfa = "../flatgfa/target/release/fgfa"
 4 | slow_odgi = "../.venv/bin/slow_odgi"
 5 | gfatools = "gfatools"
 6 | 
 7 | [graph_sets]
 8 | smoke = ["test.k"]
 9 | mini = ["test.lpa", "test.chr6c4", "hprc.chrM"]
10 | med = ["hprc.chr20", "hprc.chrX", "1000gont.chr16"]
11 | 
12 | [modes.paths]
13 | cmd.odgi = '{odgi} paths -i {files[og]} -L'
14 | cmd.flatgfa = '{fgfa} -i {files[flatgfa]} paths'
15 | cmd.slow_odgi = '{slow_odgi} paths {files[gfa]}'
16 | 
17 | [modes.convert]
18 | convert = false
19 | cmd.odgi = '{odgi} build -g {files[gfa]} -o {files[og]}'
20 | cmd.flatgfa = '{fgfa} -I {files[gfa]} -o {files[flatgfa]}'
21 | 
22 | [modes.roundtrip]
23 | convert = false
24 | cmd.flatgfa = '{fgfa} -I {files[gfa]}'
25 | cmd.slow_odgi = '{slow_odgi} norm {files[gfa]}'
26 | cmd.odgi = '{odgi} view -g -i {files[gfa]}'
27 | cmd.gfatools = '{gfatools} view {files[gfa]}'
28 | 
29 | [modes.depth]
30 | cmd.flatgfa = '{fgfa} -i {files[flatgfa]} depth'
31 | cmd.odgi = '{odgi} depth -i {files[og]} -d'
32 | cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}'
33 | 
34 | [modes.chop]
35 | cmd.flatgfa = '{fgfa} -i {files[flatgfa]} chop -c 3'
36 | cmd.odgi = '{odgi} chop -i {files[og]} -c 3 -o -'
37 | cmd.slow_odgi = '{slow_odgi} chop {files[gfa]} -n 3'


--------------------------------------------------------------------------------
/.github/workflows/code-quality.yml:
--------------------------------------------------------------------------------
 1 | name: quality
 2 | on:
 3 |   push:
 4 |   pull_request:
 5 |     branches: [main]
 6 | 
 7 | jobs:
 8 |   python:
 9 |     runs-on: ubuntu-latest
10 |     name: Python
11 |     steps:
12 |     - uses: actions/checkout@v4
13 | 
14 |     - name: ruff check
15 |       uses: astral-sh/ruff-action@v3
16 |       with:
17 |         src: >-
18 |           mygfa
19 |           slow_odgi
20 |           pollen_data_gen
21 |           flatgfa-py
22 | 
23 |     - name: ruff format
24 |       uses: astral-sh/ruff-action@v3
25 |       with:
26 |         args: "format --check --diff"
27 |         src: >-
28 |           mygfa
29 |           slow_odgi
30 |           pollen_data_gen
31 |           flatgfa-py
32 | 
33 |     - name: Install uv
34 |       uses: astral-sh/setup-uv@v5
35 |     - name: mypy
36 |       run: MYPYPATH=mygfa uv tool run mypy --no-namespace-packages --disallow-untyped-defs mygfa slow_odgi pollen_data_gen
37 | 
38 |   rust:
39 |     runs-on: ubuntu-latest
40 |     name: Rust
41 |     env:
42 |       RUSTFLAGS: "-Dwarnings"
43 |     steps:
44 |     - uses: actions/checkout@v4
45 |     - run: rustup toolchain install stable --no-self-update
46 |     - uses: Swatinem/rust-cache@v2
47 |     - run: cargo check
48 |     - run: cargo clippy
49 |     - run: cargo fmt --check
50 | 


--------------------------------------------------------------------------------
/mygfa/docs/index.rst:
--------------------------------------------------------------------------------
 1 | mygfa: A Basic GFA Data Model
 2 | =============================
 3 | 
 4 | This library parses, represents, and emits pangenomic variation graphs in the
 5 | `GFA`_ format. Basic use looks like this::
 6 | 
 7 |     import mygfa
 8 |     import sys
 9 |     graph = mygfa.Graph.parse(sys.stdin)
10 |     seg_depths = {name: 0 for name in graph.segments}
11 |     for path in graph.paths.values():
12 |         for step in path.segments:
13 |             seg_depths[step.name] += 1
14 | 
15 | The :class:`mygfa.Graph` class represents an entire GFA file.
16 | You can work down the object hierarchy from there to see everything that the
17 | file contains.
18 | 
19 | mygfa is `on PyPI`_, so you can install it with ``pip install mygfa``.
20 | 
21 | .. _GFA: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
22 | .. _on PyPI: https://pypi.org/project/mygfa/
23 | 
24 | API Reference
25 | -------------
26 | 
27 | .. automodule:: mygfa
28 | 
29 |     .. autoclass:: Graph
30 |        :members:
31 | 
32 |     .. autoclass:: Segment
33 |        :members:
34 | 
35 |     .. autoclass:: Link
36 |        :members:
37 | 
38 |     .. autoclass:: Path
39 |        :members:
40 | 
41 |     .. autoclass:: Handle
42 |        :members:
43 | 
44 |     .. autoclass:: Strand
45 |        :members:
46 | 
47 |     .. autoclass:: Alignment
48 |        :members:
49 | 
50 |     .. autoclass:: AlignOp
51 |        :members:
52 | 
53 | .. toctree::
54 |    :maxdepth: 2
55 |    :caption: Contents:
56 | 


--------------------------------------------------------------------------------
/flatgfa/src/namemap.rs:
--------------------------------------------------------------------------------
 1 | use crate::flatgfa::{FlatGFA, Segment};
 2 | use crate::pool::Id;
 3 | use std::collections::HashMap;
 4 | 
 5 | /// A fast way to look up segment IDs by their (integer) names.
 6 | #[derive(Default)]
 7 | pub struct NameMap {
 8 |     /// Names at most this are assigned *sequential* IDs, i.e., the ID is just the name
 9 |     /// minus one.
10 |     sequential_max: usize,
11 | 
12 |     /// Non-sequential names go here.
13 |     others: HashMap<usize, u32>,
14 | }
15 | 
16 | impl NameMap {
17 |     pub fn insert(&mut self, name: usize, id: Id<Segment>) {
18 |         // Is this the next sequential name? If so, no need to record it in our hash table;
19 |         // just bump the number of sequential names we've seen.
20 |         if (name - 1) == self.sequential_max && (name - 1) == id.index() {
21 |             self.sequential_max += 1;
22 |         } else {
23 |             self.others.insert(name, id.into());
24 |         }
25 |     }
26 | 
27 |     pub fn get(&self, name: usize) -> Id<Segment> {
28 |         if name <= self.sequential_max {
29 |             ((name - 1) as u32).into()
30 |         } else {
31 |             self.others[&name].into()
32 |         }
33 |     }
34 | 
35 |     /// Construct a name map for all the segments in a GFA.
36 |     pub fn build(gfa: &FlatGFA) -> Self {
37 |         let mut name_map = NameMap::default();
38 |         for (id, seg) in gfa.segs.items() {
39 |             name_map.insert(seg.name, id);
40 |         }
41 |         name_map
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/depth.rs:
--------------------------------------------------------------------------------
 1 | use crate::flatgfa;
 2 | use bit_vec::BitVec;
 3 | 
 4 | /// Compute the *depth* of each segment in the variation graph.
 5 | ///
 6 | /// The depth is defined to be the number of times that a path traverses a given
 7 | /// segment. We return two values: the ordinary depth and the *unique* depth,
 8 | /// which only counts each path that tarverses a given segment once.
 9 | ///
10 | /// Both outputs are depth values indexed by segment ID.
11 | pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec<usize>, Vec<usize>) {
12 |     // Our output vectors: the ordinary and unique depths of each segment.
13 |     let mut depths = vec![0; gfa.segs.len()];
14 |     let mut uniq_depths = vec![0; gfa.segs.len()];
15 | 
16 |     // This bit vector keeps track of whether the current path has already
17 |     // traversed a given segment, and therefore whether we should ignore
18 |     // subsequent traversals (for the purpose of counting unique depth).
19 |     let mut seen = BitVec::from_elem(gfa.segs.len(), false);
20 | 
21 |     for path in gfa.paths.all().iter() {
22 |         seen.clear(); // All segments are unseen.
23 |         for step in &gfa.steps[path.steps] {
24 |             let seg_id = step.segment().index();
25 |             depths[seg_id] += 1;
26 |             if !seen[seg_id] {
27 |                 // The first traversal of this path over this segment.
28 |                 uniq_depths[seg_id] += 1;
29 |                 seen.set(seg_id, true);
30 |             }
31 |         }
32 |     }
33 | 
34 |     (depths, uniq_depths)
35 | }
36 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # USAGE
 2 | # With this Dockerfile in working directory,
 3 | # docker build -t username/imagename .
 4 | # (note the period at the end)
 5 | # docker run -it --rm username/imagename
 6 | 
 7 | # Start with latest Calyx image
 8 | FROM ghcr.io/cucapra/calyx:latest
 9 | 
10 | # Go to the root directory
11 | WORKDIR /root
12 | 
13 | # Install ODGI
14 | # Dependencies:
15 | RUN apt install -y build-essential cmake python3-distutils python3-dev libjemalloc-dev
16 | # Clone:
17 | RUN git clone --recursive https://github.com/pangenome/odgi.git
18 | # Build:
19 | WORKDIR /root/odgi
20 | RUN cmake -H. -Bbuild && cmake --build build -- -j7
21 | # Return to root directory
22 | WORKDIR /root
23 | 
24 | # Add ODGI to paths
25 | ENV PATH="/root/odgi/bin:$PATH"
26 | ENV PYTHONPATH=$PYTHONPATH:/root/odgi/lib
27 | ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
28 | ENV FLIT_ROOT_INSTALL=1
29 | 
30 | # Install Pollen's dependencies:
31 | RUN git clone https://github.com/cucapra/turnt.git
32 | WORKDIR /root/turnt
33 | RUN flit install -s --user
34 | WORKDIR /root
35 | 
36 | # Good to have:
37 | RUN apt install emacs -y
38 | RUN apt install vim -y
39 | 
# Clone and build Pollen:
RUN git clone https://github.com/cucapra/pollen.git
WORKDIR /root/pollen
# Download the test GFA files.
RUN make fetch
# Build the odgi .og files; the top-level Makefile names this target `fetch-og`.
RUN make fetch-og
# Install the Python packages in editable mode.
WORKDIR /root/pollen/pollen_py
RUN flit install -s --user
WORKDIR /root/pollen/mygfa
RUN flit install -s --user
WORKDIR /root/pollen/slow_odgi
RUN flit install -s --user
51 | 
52 | # return to the Pollen directory
53 | WORKDIR /root/pollen


--------------------------------------------------------------------------------
/flatgfa-py/README.md:
--------------------------------------------------------------------------------
 1 | Python Bindings for FlatGFA
 2 | ===========================
 3 | 
 4 | This is a Python interface for the [FlatGFA][] library, which provides an efficient representation for pangenomic variation graphs in the [Graphical Fragment Assembly (GFA)][gfa] format.
 5 | 
 6 | You can install it [from PyPI][flatgfa-pypi]:
 7 | 
 8 |     $ pip install flatgfa
 9 | 
10 | Then, read [the API documentation][flatgfa-py-docs] for details about what it can do so far.
11 | 
12 | Development
13 | -----------
14 | 
15 | The easiest way to get started is with [uv][]:
16 | 
17 |     $ uv run --package flatgfa python example.py
18 | 
19 | That should build and install the package and then run our `example.py` script.
20 | 
21 | Or run the tests:
22 | 
23 |     $ uv run --package flatgfa pytest
24 | 
25 | During development, you'll want to rebuild the module using [Maturin][].
26 | One way to do it is to install the necessary command-line tools into the virtualenv, like this:
27 | 
28 |     $ . .venv/bin/activate
29 |     $ cd flatgfa-py
30 |     $ uv pip install maturin pip
31 |     $ maturin develop
32 | 
33 | Then, just type `maturin develop` and `pytest` while you work.
34 | 
35 | [maturin]: https://www.maturin.rs
36 | [flatgfa-py-docs]: https://cucapra.github.io/pollen/flatgfa/
37 | [flatgfa]: https://github.com/cucapra/pollen/tree/main/flatgfa
38 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
39 | [flatgfa-pypi]: https://pypi.org/project/flatgfa/
40 | [example]: https://github.com/cucapra/pollen/blob/main/flatgfa-py/example.py
41 | [uv]: https://docs.astral.sh/uv/
42 | 


--------------------------------------------------------------------------------
/bench/summary.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import sys
 3 | from collections import defaultdict
 4 | from statistics import harmonic_mean
 5 | 
 6 | 
 7 | def summary():
 8 |     reader = csv.DictReader(sys.stdin)
 9 |     by_graph = defaultdict(dict)
10 |     for row in reader:
11 |         by_graph[row["graph"]][row["cmd"]] = row
12 | 
13 |     # Guess a suitable baseline by taking the fastest time on the first graph.
14 |     first_res = next(iter(by_graph.values()))
15 |     min_row = min(first_res.values(), key=lambda r: r["mean"])
16 |     baseline = min_row["cmd"]
17 | 
18 |     # Show each graph's times.
19 |     ratios = defaultdict(list)
20 |     for graph, cmds in by_graph.items():
21 |         baseline_time = float(cmds[baseline]["mean"])
22 | 
23 |         print(graph)
24 |         for cmd, row in cmds.items():
25 |             mean = float(row["mean"])
26 |             stddev = float(row["stddev"])
27 |             ratio = mean / baseline_time
28 |             ratios[cmd].append(ratio)
29 | 
30 |             if mean > 80:
31 |                 mins = int(mean / 60)
32 |                 secs = int(mean % 60)
33 |                 print(f"  {cmd}: {mins}m{secs}s ± {stddev:.1f}", end="")
34 |             else:
35 |                 if mean < 0.2:
36 |                     mean *= 1000
37 |                     stddev *= 1000
38 |                     unit = "ms"
39 |                 else:
40 |                     unit = "s"
41 |                 print(f"  {cmd}: {mean:.1f} ± {stddev:.1f} {unit}", end="")
42 | 
43 |             print(f" ({ratio:.1f}× {baseline})")
44 | 
45 |     # Show the average across graphs.
46 |     print("harmonic mean")
47 |     for cmd, cmd_ratios in ratios.items():
48 |         hmean = harmonic_mean(cmd_ratios)
49 |         print(f"  {cmd}: {hmean:.1f}× {baseline}")
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     summary()
54 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: docs
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [main]
 6 |   pull_request:
 7 |     branches: [main]
 8 | 
 9 | # Permissions for GitHub pages deployment.
10 | permissions:
11 |   contents: read
12 |   pages: write
13 |   id-token: write
14 | 
15 | jobs:
16 |   build:
17 |     runs-on: ubuntu-latest
18 |     steps:
19 |       - uses: actions/checkout@v4
20 | 
21 |       # Set up Sphinx.
22 |       - uses: actions/setup-python@v5
23 |         with:
24 |             python-version: '3.12'
25 |       - run: pip install Sphinx
26 |       - name: "Sphinx problem matcher"
27 |         uses: sphinx-doc/github-problem-matcher@master
28 | 
29 |       # Docs for mygfa.
30 |       - name: Build mygfa docs
31 |         run: |
32 |           cd mygfa/docs
33 |           make html
34 | 
35 |       # Docs for flatgfa-py.
36 |       - name: Build flatgfa-py
37 |         uses: PyO3/maturin-action@v1
38 |         with:
39 |           command: build
40 |           args: --out dist --manifest-path flatgfa-py/Cargo.toml
41 |       - name: Install flatgfa-py
42 |         run: pip install dist/flatgfa-*.whl
43 |       - name: Build flatgfa-py docs
44 |         run: |
45 |           cd flatgfa-py/docs
46 |           make html
47 | 
48 |       # Create site.
49 |       - name: Assemble site directory
50 |         run: |
51 |           mkdir -p site
52 |           cp -r mygfa/docs/_build/html site/mygfa
53 |           cp -r flatgfa-py/docs/_build/html site/flatgfa
54 |       - name: Pages artifact
55 |         uses: actions/upload-pages-artifact@v3
56 |         with:
57 |           path: "site"
58 | 
59 |   deploy:
60 |     environment:
61 |       name: github-pages
62 |       url: ${{ steps.deploy.outputs.page_url }}
63 |     runs-on: ubuntu-latest
64 |     needs: build
65 |     if: ${{github.event_name=='push' && github.ref=='refs/heads/main' && github.repository_owner=='cucapra'}}
66 |     steps:
67 |       - id: deploy
68 |         uses: actions/deploy-pages@v4
69 | 


--------------------------------------------------------------------------------
/slow_odgi/Makefile:
--------------------------------------------------------------------------------
 1 | # We use the small set of tests by default, because larger files make
 2 | # slow_odgi go *really* slow.
 3 | TESTS := t k note5 overlap q.chop DRB1-3123
 4 | 
 5 | GFA := $(TESTS:%=../tests/%.gfa)
 6 | OG := $(TESTS:%=../tests/%.og)
 7 | 
 8 | %.og: %.gfa
 9 | 	odgi build -g $^ -o $@
10 | 
11 | # Sets up all the odgi-oracles and then tests slow_odgi against them.
12 | test: setup oracles slow-odgi
13 | 
14 | # Produce some input files that are necessary for the slow_odgi tests.
15 | setup: $(OG)
16 | 	-turnt -j --save --env depth_setup --env inject_setup \
17 | 		--env overlap_setup --env validate_setup $(GFA)
18 | 
19 | # Produce the oracle output (from "real" odgi) for each test input. Run this
20 | # once, noisily, to obtain the expected outputs. Then run `slow-odgi` to
21 | # compare against these expected outputs.
22 | # In reality, this depends on the setup stage above. Run this by itself ONLY
23 | # if you know that the setup stages don't need to be run afresh.
24 | ORACLES := chop_oracle crush_oracle degree_oracle depth_oracle \
25 | 	flip_oracle flatten_oracle inject_oracle matrix_oracle overlap_oracle \
26 | 	paths_oracle validate_oracle
27 | oracles: $(OG)
28 | 	-turnt -j --save $(ORACLES:%=--env %) $(OG)
29 | 	-turnt -j --save --env validate_oracle_err ../tests/invalid/*.gfa
30 | 	-turnt -j --save --env crush_oracle ../tests/handmade/crush*.gfa
31 | 	-turnt -j --save --env flip_oracle ../tests/handmade/flip*.gfa
32 | 
33 | # Test slow_odgi against the output files generated by the `oracles`
34 | # target above. Be sure to rerun that before this if the inputs or odgi
35 | # behavior change.
36 | TEST_ENVS := chop_test crush_test degree_test depth_test flip_test \
37 | 	 flatten_test inject_test matrix_test overlap_test paths_test validate_test
38 | slow-odgi:
39 | 	-turnt -j $(TEST_ENVS:%=--env %) $(GFA)
40 | 	-turnt -j --env validate_test ../tests/invalid/*.gfa
41 | 	-turnt -j --env crush_test ../tests/handmade/crush*.gfa
42 | 	-turnt -j --env flip_test ../tests/handmade/flip*.gfa
43 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/flatten.py:
--------------------------------------------------------------------------------
 1 | from typing import Tuple
 2 | import mygfa
 3 | 
 4 | 
 5 | def get_fasta_legend(graph: mygfa.Graph) -> Tuple[str, mygfa.LegendType]:
 6 |     """The main deliverable is the FASTA:
 7 |     Simply traverse the segments in order and glue their seqs together.
 8 |     However, it pays to do some bookkeeping now.
 9 |     legend[segname] stores the [start, end) of the spot in the FASTA that
10 |     segname's seq is featured.
11 |     """
12 |     ans = ""
13 |     legend = {}
14 |     ptr = 0
15 |     for segment in graph.segments.values():
16 |         ans += str(segment.seq)
17 |         length = len(segment.seq)
18 |         legend[segment.name] = (ptr, ptr + length)
19 |         ptr += length
20 |     return ans, legend
21 | 
22 | 
def print_bed(graph: mygfa.Graph, legend: mygfa.LegendType, name: str) -> None:
    """Print a tab-separated BED table with one row per step of every path.

    After a header row, each row gives `name`, the [start, end) interval that
    `legend` records for the step's segment, the path's name, the strand
    ("+" when the handle's orientation is truthy, "-" otherwise), and the
    step's index within its path.
    """
    header = ["#name", "start", "end", "path.name", "strand", "step.rank"]
    print("\t".join(header))

    for path in graph.paths.values():
        for rank, step in enumerate(path.segments):
            start, end = legend[step.name]
            strand = "+" if step.ori else "-"
            row = [name, str(start), str(end), path.name, strand, str(rank)]
            print("\t".join(row))
42 | 
43 | 
def insert_newlines(string: str, every: int = 80) -> str:
    """Split `string` into chunks of `every` characters joined by newlines.

    The last chunk may be shorter. odgi's output does this for this
    algorithm, so we follow it.
    """
    lines = []
    for start in range(0, len(string), every):
        lines.append(string[start : start + every])
    return "\n".join(lines)
47 | 
48 | 
def flatten(graph: mygfa.Graph, name: str) -> mygfa.Graph:
    """Print the flattened graph as a FASTA record followed by a BED table.

    `name` becomes the FASTA header line and the first column of every BED
    row. The graph is returned as it was passed in.
    """
    # This is a bit hardcoded for files living in test/file.gfa.
    # Would be nice to neaten this up and make it less brittle.
    print(f">{name}")
    fasta, legend = get_fasta_legend(graph)
    wrapped = insert_newlines(fasta)
    print(wrapped)
    print_bed(graph, legend, name)
    return graph
58 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/chop.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, Tuple
 2 | import mygfa
 3 | 
 4 | 
 5 | def chop_segs(
 6 |     graph: mygfa.Graph, choplength: int
 7 | ) -> Tuple[Dict[str, mygfa.Segment], mygfa.LegendType]:
 8 |     """Chop all the sequences of the graph into length n or lower."""
 9 | 
10 |     legend: mygfa.LegendType = {}
11 |     # If a segment is chopped, its sequence will be spread out over
12 |     # up among a series of contiguous new segments.
13 | 
14 |     # While not important for segment-chopping itself, it will serve us well to
15 |     # maintain a dict that bookkeeps this chopping.
16 | 
17 |     # For example, if
18 |     #     S 3 = ATGGCCC
19 |     # gets chopped into
20 |     #     S 7 = AT
21 |     #     S 8 = GG
22 |     #     S 9 = CC
23 |     #     S 10 = C
24 |     # then legend[3] = (7,11).
25 | 
26 |     # Later, if 3+ occurs in a path, we will replace it with 7+,8+,9+,10+.
27 |     # If 3- occurs in a path, we will replace it with 10-,9-,8-,7-.
28 | 
29 |     seg_count = 1  # To generate names for the new segments.
30 |     new_segs: Dict[str, mygfa.Segment] = {}
31 | 
32 |     for segment in graph.segments.values():
33 |         chopped_segs = {}
34 |         chopped_seqs = segment.seq.chop(choplength)
35 |         seg_count_start = seg_count
36 |         for chopped_seq in chopped_seqs:  # Going from seqs to segs.
37 |             seg_name = str(seg_count)
38 |             chopped_segs[seg_name] = mygfa.Segment(seg_name, chopped_seq)
39 |             seg_count += 1
40 |         legend[segment.name] = (seg_count_start, seg_count)
41 |         new_segs = new_segs | chopped_segs
42 | 
43 |     return new_segs, legend
44 | 
45 | 
def chop_paths(graph: mygfa.Graph, legend: mygfa.LegendType) -> Dict[str, mygfa.Path]:
    """Rewrite every path of the graph in terms of the chopped segments.

    Each step is replaced by handles for the run of new segments that
    `legend` records for its segment, all with the step's orientation. For a
    step whose orientation is falsy, the run is emitted in reverse order.
    """
    new_paths = {}
    for path in graph.paths.values():
        steps = []
        for handle in path.segments:
            first, stop = legend[handle.name]
            pieces = [mygfa.Handle(str(num), handle.ori) for num in range(first, stop)]
            if not handle.ori:
                pieces.reverse()
            steps.extend(pieces)
        # odgi drops overlaps, so we do too.
        new_paths[path.name] = mygfa.Path(path.name, steps, None)
    return new_paths
59 | 
60 | 
def chop(graph: mygfa.Graph, choplength: int) -> mygfa.Graph:
    """Build a new graph with chopped segments and paths regenerated to match.

    The headers are carried over from the input graph.
    """
    segs, legend = chop_segs(graph, choplength)
    paths = chop_paths(graph, legend)
    # We are choosing to drop links for now, hence the empty list.
    no_links = []
    return mygfa.Graph(graph.headers, segs, no_links, paths)
67 | 


--------------------------------------------------------------------------------
/flatgfa-py/flatgfa.pyi:
--------------------------------------------------------------------------------
 1 | from collections.abc import Iterator
 2 | from typing import Optional, overload
 3 | 
 4 | class Segment:
 5 |     id: int
 6 |     name: int
 7 | 
 8 |     def sequence(self) -> bytes: ...
 9 |     def __len__(self) -> int: ...
10 | 
11 | class Handle:
12 |     seg_id: int
13 |     segment: Segment
14 |     is_forward: bool
15 | 
16 | class StepList:
17 |     def __iter__(self) -> Iterator[Handle]: ...
18 |     def __len__(self) -> int: ...
19 |     @overload
20 |     def __getitem__(self, idx: int) -> Handle: ...
21 |     @overload
22 |     def __getitem__(self, slice: slice) -> StepList: ...
23 | 
24 | class Path:
25 |     id: int
26 |     name: bytes
27 | 
28 |     def __iter__(self) -> Iterator[Handle]: ...
29 |     @overload
30 |     def __getitem__(self, idx: int) -> Handle: ...
31 |     @overload
32 |     def __getitem__(self, slice: slice) -> StepList: ...
33 | 
34 | class Link:
35 |     id: int
36 |     from_: Handle
37 |     to: Handle
38 | 
39 | class SegmentList:
40 |     @overload
41 |     def __getitem__(self, idx: int) -> Segment: ...
42 |     @overload
43 |     def __getitem__(self, slice: slice) -> SegmentList: ...
44 |     def __iter__(self) -> Iterator[Segment]: ...
45 |     def __len__(self) -> int: ...
46 |     def find(self, name: int) -> Optional[Segment]: ...
47 | 
48 | class PathList:
49 |     @overload
50 |     def __getitem__(self, idx: int) -> Path: ...
51 |     @overload
52 |     def __getitem__(self, slice: slice) -> PathList: ...
53 |     def __iter__(self) -> Iterator[Path]: ...
54 |     def __len__(self) -> int: ...
55 |     def find(self, name: bytes) -> Optional[Path]: ...
56 | 
57 | class LinkList:
58 |     @overload
59 |     def __getitem__(self, idx: int) -> Link: ...
60 |     @overload
61 |     def __getitem__(self, slice: slice) -> LinkList: ...
62 |     def __iter__(self) -> Iterator[Link]: ...
63 |     def __len__(self) -> int: ...
64 | 
65 | class ChunkEvent:
66 |     handle: Handle
67 |     range: tuple[int, int]
68 |     def sequence(self) -> str: ...
69 | 
70 | class GAFLine:
71 |     name: str
72 |     chunks: list[ChunkEvent]
73 |     def segment_ranges(self) -> str: ...
74 |     def sequence(self) -> str: ...
75 |     def __iter__(self) -> Iterator[ChunkEvent]: ...
76 | 
77 | class GAFParser:
78 |     def __iter__(self) -> Iterator[GAFLine]: ...
79 | 
80 | class FlatGFA:
81 |     segments: SegmentList
82 |     paths: PathList
83 |     links: LinkList
84 | 
85 |     def write_flatgfa(self, filename: str) -> None: ...
86 |     def write_gfa(self, filename: str) -> None: ...
87 |     def all_reads(self, gaf: str) -> GAFParser: ...
88 |     def print_gaf_lookup(self, gaf: str) -> None: ...
89 | 
90 | def parse(filename: str) -> FlatGFA: ...
91 | def load(filename: str) -> FlatGFA: ...
92 | def parse_bytes(gfa: bytes) -> FlatGFA: ...
93 | 


--------------------------------------------------------------------------------
/flatgfa/README.md:
--------------------------------------------------------------------------------
 1 | FlatGFA
 2 | =======
 3 | 
 4 | This is an experimental [odgi][]-like tool for manipulating pangenome graphs in the popular [GFA][] format. It works by converting the GFA to a "flat," pointer-free representation that can be stored directly on disk for zero-copy reads and writes.
 5 | 
 6 | [odgi]: https://odgi.readthedocs.io/en/latest/
 7 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
 8 | 
 9 | Build
10 | -----
11 | 
12 | It's a Rust project, so all you need to do is:
13 | 
14 |     $ cargo build --release
15 | 
16 | Then you might like to do something like this to put a symlink on your `$PATH`:
17 | 
18 |     $ ln -s `pwd`/target/release/fgfa ~/.local/bin
19 | 
20 | Now see what's available:
21 | 
22 |     $ fgfa --help
23 | 
24 | Convert GFA Files
25 | -----------------
26 | 
27 | This tool can run queries directly on GFA text files, but you can amortize that cost by converting to the native FlatGFA format. Try this:
28 | 
29 |     $ fgfa -I chr22.hprc-v1.0-pggb.gfa -o chr22.flatgfa
30 | 
31 | In general, you will want to remember these flags for input and output:
32 | 
33 | * `-i` or `-o`: Read or write our native FlatGFA binary format.
34 | * `-I` or `-O`: Read or write the standard GFAv1 text format. Or, just omit the relevant flag to use standard input or standard output.
35 | 
36 | So combining `-I` and `-o` as above does the conversion you want. FlatGFA files should be a little smaller than their text counterparts. Now that we have one, we can convert it back to a GFA text file like this:
37 | 
38 |     $ fgfa -i chr22.flatgfa | less
39 | 
40 | Simple Queries
41 | --------------
42 | 
43 | Here are some things we can do with FlatGFA files. See some basic statistics about the graph:
44 | 
45 |     $ fgfa -i chr22.flatgfa stats -S
46 | 
47 | Or use `-L` instead to see information about self-loops. This output should match [`odgi stats`][odgi-stats].
48 | 
49 | Get a list of all the path names in the graph---or, in this case, just the first few:
50 | 
51 |     $ fgfa -i chr22.flatgfa paths | head
52 | 
53 | Find the graph position of a given base-pair offset within a certain path, just like [`odgi position -v`][odgi-position]:
54 | 
55 |     $ fgfa -i chr22.flatgfa position -p chm13#chr22,12345,+
56 | 
57 | Extract a subgraph from a larger graph around a specific segment:
58 | 
59 |     $ fgfa -i chr22.flatgfa -o chr22.sub.flatgfa extract -n 25 -c
60 |     $ fgfa -i chr22.sub.flatgfa stats -S
61 | 
62 | Unfortunately, this extraction doesn't quite match [`odgi extract`][odgi-extract] yet (because I haven't quite been able to figure out how it's supposed to work).
63 | 
64 | [odgi-stats]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_stats.html
65 | [odgi-position]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_position.html
66 | [odgi-extract]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_extract.html
67 | 


--------------------------------------------------------------------------------
/mygfa/mygfa/preprocess.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict
 2 | from . import gfa as mygfa
 3 | 
 4 | 
 5 | def node_steps(graph: mygfa.Graph) -> Dict[str, List[Tuple[str, int, bool]]]:
 6 |     """For each segment in the graph,
 7 |     list the times the segment was crossed by a path"""
 8 |     # segment name, (path name, index on path, direction) list
 9 |     crossings: Dict[str, List[Tuple[str, int, bool]]] = {}
10 |     for segname in graph.segments.keys():
11 |         crossings[segname] = []
12 | 
13 |     for path in graph.paths.values():
14 |         for index, handle in enumerate(path.segments):
15 |             crossings[handle.name].append((path.name, index, handle.ori))
16 | 
17 |     return crossings
18 | 
19 | 
20 | HandleMap = Dict[mygfa.Handle, List[mygfa.Handle]]
21 | 
22 | 
def adjlist(graph: mygfa.Graph) -> Tuple[HandleMap, HandleMap]:
    """Build adjacency lists for the graph as a pair (ins, outs).

    Both dicts are keyed by Handle and hold lists of neighboring Handles:
    `ins[h]` lists the `from_` handle of every link whose `to_` is `h`, and
    `outs[h]` lists the `to_` handle of every link whose `from_` is `h`.
    Every segment gets an entry in both orientations, whether or not it
    appears on a path.
    """
    ins: HandleMap = {}
    outs: HandleMap = {}
    for segname in graph.segments.keys():
        for ori in (True, False):
            ins[mygfa.Handle(segname, ori)] = []
            outs[mygfa.Handle(segname, ori)] = []

    for link in graph.links:
        source, target = link.from_, link.to_
        ins[target].append(source)
        outs[source].append(target)

    return ins, outs
44 | 
45 | 
def handle_seq(graph: mygfa.Graph, handle: mygfa.Handle) -> mygfa.Strand:
    """Return the sequence a handle refers to.

    For a handle whose orientation is truthy this is the segment's own
    sequence; otherwise it is the sequence of the segment's `revcomp()`.
    """
    seg = graph.segments[handle.name]
    if handle.ori:
        return seg.seq
    return seg.revcomp().seq
50 | 
51 | 
def pathseq(graph: mygfa.Graph) -> Dict[str, str]:
    """Precompute the sequence spelled out by each of the graph's paths.

    The result maps each path's key in `graph.paths` to the concatenation of
    the sequences of the path's handles, in path order.
    """
    ans: Dict[str, str] = {}
    for name, path in graph.paths.items():
        bases = [str(handle_seq(graph, handle)) for handle in path.segments]
        ans[name] = "".join(bases)
    return ans
62 | 
63 | 
def get_maxes(graph: mygfa.Graph) -> Tuple[int, int, int]:
    """Given a graph, returns:
    - the number of nodes (segments),
    - the maximum number of path steps that cross any single node
      (0 if the graph has no nodes), and
    - the number of paths in the graph.
    """
    max_nodes = len(graph.segments)
    # `default=0` keeps a graph without segments from raising ValueError.
    max_steps = max((len(steps) for steps in node_steps(graph).values()), default=0)
    max_paths = len(graph.paths)
    return max_nodes, max_steps, max_paths
74 | 
75 | 
def drop_all_overlaps(paths: Dict[str, mygfa.Path]) -> Dict[str, mygfa.Path]:
    """Return a new dict mapping each name to its path's `drop_overlaps()` result."""
    stripped = {}
    for name, path in paths.items():
        stripped[name] = path.drop_overlaps()
    return stripped
79 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <h1>
 2 | <p align="center">
 3 | <img src="https://github.com/cucapra/pollen/blob/main/pollen_icon_transparent.png"></p>
 4 | </h1>
 5 | 
 6 | Accelerated Pangenome Graph Queries
 7 | ===================================
 8 | 
 9 | Pollen is a nascent project to accelerate queries on pangenomic graphs.
10 | We are designing a graph-manipulating DSL that exposes functionality that pangenomicists care about.
11 | Our DSL will support graph queries in the vein of the [odgi][] project.
12 | We will compile programs written in this DSL into fast query code.
13 | Eventually, we aim to generate custom hardware accelerators for these queries via the [Calyx][] compiler.
14 | 
15 | There are several things in this repository:
16 | 
17 | * [mygfa](./mygfa), a simple Python library for parsing, processing, and emitting [GFA][] files. See [its documentation][mygfa-docs].
18 | * [slow_odgi](./slow_odgi), a reference implementation of several GFA queries from the [odgi][] tool using `mygfa`.
19 | * [FlatGFA](./flatgfa), an experimental fast binary format for representing and analyzing GFA files. There are also [Python bindings](./flatgfa-py) for this library; check out [their documentation][flatgfa-py-docs].
20 | * A proof-of-concept Calyx-based [hardware accelerator generator](./pollen_py) for a single GFA query (`odgi depth`) and a data generator for this hardware.
21 | 
22 | [calyx]: https://calyxir.org
23 | [odgi]: https://odgi.readthedocs.io/en/latest/
24 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
25 | [flatgfa-py-docs]: https://cucapra.github.io/pollen/flatgfa/
26 | 
27 | 
28 | `mygfa` and `slow_odgi`
29 | -----------------------
30 | 
31 | The `mygfa` library is an extremely simple Python library for representing (and parsing and emitting) GFA files. It emphasizes clarity over efficiency. Use `pip install mygfa` to get started, and read the [API documentation][mygfa-docs] for details.
32 | 
33 | Similarly, `slow_odgi` is a set of GFA analyses based on `mygfa`; it's meant to act as a *reference implementation* of the much faster functionality in [odgi][]. Check out [the slow_odgi README](slow_odgi/) for more details.
34 | 
35 | To set up both of them from this repository, try using [uv][]:
36 | 
37 |     $ uv run slow_odgi --help
38 | 
39 | Or, alternatively, you can set up and activate the environment manually:
40 | 
41 |     $ uv sync
42 |     $ source .venv/bin/activate
43 |     $ slow_odgi --help
44 | 
45 | [uv]: https://github.com/astral-sh/uv
46 | [mygfa-docs]: http://cucapra.github.io/pollen/mygfa/
47 | 
48 | 
49 | FlatGFA
50 | -------
51 | 
52 | [FlatGFA](./flatgfa) is an efficient representation for GFA files. It is implemented in Rust and available with [Python bindings](./flatgfa-py). The latter is [on PyPI][flatgfa-pypi], so you can get started with:
53 | 
54 |     $ pip install flatgfa
55 | 
56 | Then read the [API documentation][flatgfa-py-docs] to see what's available. Or see [the included example](./flatgfa-py/example.py) for a synopsis.
57 | 
58 | [flatgfa-pypi]: https://pypi.org/project/flatgfa/
59 | 
60 | 
61 | Credits
62 | -------
63 | 
64 | This is a project of the [Capra][] lab at Cornell.
65 | The license is [MIT][].
66 | 
67 | [capra]: https://capra.cs.cornell.edu
68 | [mit]: https://choosealicense.com/licenses/mit/
69 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/flip.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict
 2 | from collections.abc import Callable
 3 | import mygfa
 4 | 
 5 | 
 6 | def path_is_rev(path: mygfa.Path, graph: mygfa.Graph) -> bool:
 7 |     """Is this path more reverse-oriented than it is forward-oriented?"""
 8 |     fwd = 0
 9 |     rev = 0
10 |     for seg in path.segments:
11 |         length = len(graph.segments[seg.name].seq)
12 |         if seg.ori:
13 |             fwd += length
14 |         else:
15 |             rev += length
16 |     return rev > fwd
17 | 
18 | 
def flip_path(path: mygfa.Path, graph: mygfa.Graph) -> Tuple[mygfa.Path, bool]:
    """Flip the given path if it is more reverse- than forward-oriented.

    Returns a pair (path, flipped). When the path is flipped, the new path
    visits the steps in reverse order with each orientation inverted, and its
    name gains an "_inv" suffix. Otherwise the result of the path's
    `drop_overlaps()` is returned.
    """
    if not path_is_rev(path, graph):
        # odgi drops overlaps, so we do too.
        return path.drop_overlaps(), False

    flipped_steps = [
        mygfa.Handle(step.name, not step.ori) for step in reversed(path.segments)
    ]
    return mygfa.Path(f"{path.name}_inv", flipped_steps, None), True
31 | 
32 | 
def dedup(mylist: List[mygfa.Link]) -> List[mygfa.Link]:
    """Remove duplicate links, keeping the first occurrence of each.

    A link is dropped when either it or its `rev()` has already been kept.
    """
    kept: List[mygfa.Link] = []
    for link in mylist:
        # odgi seems to consider a link's reverse its own duplicate.
        if link in kept or link.rev() in kept:
            continue
        kept.append(link)
    return kept
41 | 
42 | 
def gen_links(
    paths_dec: Dict[str, Tuple[mygfa.Path, bool]], pred: Callable[[bool], bool]
) -> List[mygfa.Link]:
    """Generate the links needed by the paths whose decoration satisfies `pred`.

    For every such path, one link is produced for each pair of consecutive
    steps, going from the earlier step to the later one. Paths with fewer
    than two steps contribute nothing.

    This is the spiritual reverse of `validate`: once these links have been
    added to the graph, `validate` will be happy with the selected paths.
    """
    # A "no-op" alignment, shared by every generated link.
    noop = mygfa.Alignment([(0, mygfa.AlignOp("M"))])

    links = []
    for path, dec in paths_dec.values():
        if pred(dec):
            steps = path.segments
            # An empty range for short paths means they are simply skipped.
            for i in range(1, len(steps)):
                links.append(mygfa.Link(steps[i - 1], steps[i], noop))
    return links
69 | 
70 | 
def flip(graph: mygfa.Graph) -> mygfa.Graph:
    """Flip the graph's paths and add the links that make the result valid.

    Headers and segments are carried over. The links are the original links
    plus those generated for the flipped paths, de-duplicated.
    """
    # Pair each path with a flag saying whether it has just been flipped.
    decorated = {}
    for name, path in graph.paths.items():
        decorated[name] = flip_path(path, graph)

    # Only the paths that were flipped need new links.
    new_links = gen_links(decorated, lambda flipped: flipped)

    # Strip the flag to get back a plain Dict[str, Path].
    paths = {name: pair[0] for name, pair in decorated.items()}

    all_links = dedup(graph.links + new_links)
    return mygfa.Graph(graph.headers, graph.segments, all_links, paths)
83 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
  1 | name: build
  2 | 
  3 | on:
  4 |   push:
  5 |     branches:
  6 |       - main
  7 |   pull_request:
  8 |     branches:
  9 |       - main
 10 | 
 11 | jobs:
 12 |   test-py:
 13 |     name: test Python tools
 14 |     runs-on: ubuntu-latest
 15 |     steps:
 16 |       - uses: actions/checkout@v4
 17 |       - uses: actions/setup-python@v5
 18 |         with:
 19 |           python-version: "3.12"
 20 | 
 21 |       # Set up and use uv.
 22 |       - uses: actions/cache@v4
 23 |         id: cache-uv
 24 |         with:
 25 |           path: ~/.cache/uv
 26 |           key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
 27 |       - name: uv sync and activate
 28 |         run: |
 29 |           curl -LsSf https://astral.sh/uv/install.sh | sh
 30 |           uv sync
 31 |           echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
 32 |           echo "$PWD/.venv/bin" >> $GITHUB_PATH
 33 | 
 34 |       # Set up for tests.
 35 |       - name: Problem matcher
 36 |         run: echo '::add-matcher::.github/tap-matcher.json'
 37 |       - name: Fetch test data
 38 |         run: make fetch SMALL=1
 39 | 
 40 |       - name: Pull odgi container
 41 |         run: |
 42 |           docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
 43 |           docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
 44 |       - name: Install odgi alias
 45 |         run: |
 46 |           mkdir -p $HOME/.local/bin
 47 |           cp .github/odgi.sh $HOME/.local/bin/odgi
 48 |           chmod a+x $HOME/.local/bin/odgi
 49 | 
 50 |       # Test slow_odgi.
 51 |       - name: Set up for slow_odgi tests
 52 |         run: make -C slow_odgi setup oracles SMALL=1
 53 |       - name: Test slow_odgi
 54 |         run: make -C slow_odgi test SMALL=1
 55 | 
 56 |   test-flatgfa:
 57 |     name: test FlatGFA
 58 |     runs-on: ubuntu-latest
 59 |     steps:
 60 |       - uses: actions/checkout@v4
 61 |       - run: rustup toolchain install stable --no-self-update
 62 | 
 63 |       # Install slow-odgi.
 64 |       - uses: actions/cache@v4
 65 |         id: cache-uv
 66 |         with:
 67 |           path: ~/.cache/uv
 68 |           key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
 69 |       - name: uv sync and activate
 70 |         run: |
 71 |           curl -LsSf https://astral.sh/uv/install.sh | sh
 72 |           uv sync
 73 |           echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
 74 |           echo "$PWD/.venv/bin" >> $GITHUB_PATH
 75 | 
 76 |       # Install odgi
 77 |       - name: Pull odgi container
 78 |         run: |
 79 |           docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
 80 |           docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
 81 |       - name: Install odgi alias
 82 |         run: |
 83 |           mkdir -p $HOME/.local/bin
 84 |           cp .github/odgi.sh $HOME/.local/bin/odgi
 85 |           chmod a+x $HOME/.local/bin/odgi
 86 | 
 87 |       # Install Turnt.
 88 |       - uses: actions/setup-python@v5
 89 |         with:
 90 |           python-version: "3.12"
 91 |       - name: Install Turnt
 92 |         run: pip install turnt
 93 |       - name: Problem matcher
 94 |         run: echo '::add-matcher::.github/tap-matcher.json'
 95 | 
 96 |       # We need the test data.
 97 |       - name: Fetch test data
 98 |         run: make fetch SMALL=1
 99 | 
100 |       # Build and test.
101 |       - run: cargo build
102 |         working-directory: ./flatgfa
103 |       - run: cargo test
104 |         working-directory: ./flatgfa
105 |       - run: make test-flatgfa
106 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/depth/python_depth.py:
--------------------------------------------------------------------------------
 1 | """ A node depth computation for .og files implemented using odgi's
 2 | Python bindings. While this implementation reuses odgi's data structures, it
 3 | does not reuse its node depth computation algorithm and instead implements
 4 | it from scratch.
 5 | 
 6 | The documentation for the odgi module can be found at
 7 | https://odgi.readthedocs.io/en/latest/rst/binding/glossary.html
 8 | """
 9 | 
10 | import argparse
11 | import sys
12 | import odgi
13 | 
14 | 
def get_depth_table(graph, subset_paths=None):
    """
    Compute the node depth table of an odgi graph.

    Input:
        graph: an odgi.graph object
        subset_paths: an optional collection of path names; when it is None
            or empty, every path in the graph is considered
    Output: the node depth table, a dictionary that maps from a node's id to
        its (depth, uniq_depth), where depth is the total number of steps that
        the considered paths take on the node, and uniq_depth is the number of
        distinct considered paths that cross the node
    """

    # The membership test below runs once per step in the graph, so hash the
    # requested names once instead of scanning a list every time.
    wanted = frozenset(subset_paths) if subset_paths else None

    ndt = dict()  # node depth table map from node.id -> (node.depth, node.uniq_depth)

    def get_node_depth(handle):
        """
        Input: [handle] is an odgi.handle object which represents a node
        Inserts (depth, uniq_depth) into ndt for the node associated with
            [handle]
        """

        # Note: a node can have multiple handles, but only one id
        node_id = graph.get_id(handle)

        paths = set()  # names of the considered paths that cross this node
        depth = 0  # number of considered steps on this node

        def for_step(step):
            """Account for one step on the node if its path is considered."""
            nonlocal depth  # modify the 'depth' variable in the outer scope
            path_h = graph.get_path_handle_of_step(step)
            path = graph.get_path_name(path_h)
            if wanted is None or path in wanted:
                paths.add(path)
                depth += 1

        graph.for_each_step_on_handle(handle, for_step)

        ndt[node_id] = (depth, len(paths))

    graph.for_each_handle(get_node_depth)
    return ndt
56 | 
57 | 
def parse_paths_file(filename):
    """Read path names from a file, one name per line.

    Returns None when no filename is given (the "use every path" default);
    otherwise returns the list of lines of the file, without line endings.
    """
    if filename is not None:
        with open(filename, "r") as handle:
            return handle.read().splitlines()
    return None  # No file requested: fall back to the default value
68 | 
69 | 
if __name__ == "__main__":
    # Describe and read the command line
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "filename",
        help="A .og file representing a pangenome whose node depth we want to calculate",
    )
    cli.add_argument(
        "-s",
        "--subset-paths",
        help="Specify a file containing a subset of all paths in the graph. See the odgi documentation for more details",
    )
    options = cli.parse_args()

    # Load the graph from the .og file
    og_graph = odgi.graph()
    og_graph.load(options.filename)

    # Compute the node depths, restricted to the paths named in the given
    # file (or over all paths when no file was given)
    depth_table = get_depth_table(og_graph, parse_paths_file(options.subset_paths))

    # Emit the table as TSV on standard output, ordered by node id
    print("#node.id\tdepth\tdepth.uniq")
    for node_id in sorted(depth_table):
        node_depth, node_uniq = depth_table[node_id]
        print(f"{node_id}\t{node_depth}\t{node_uniq}")
97 | 


--------------------------------------------------------------------------------
/flatgfa/src/memfile.rs:
--------------------------------------------------------------------------------
  1 | use memmap::{Mmap, MmapMut};
  2 | use rayon::iter::{
  3 |     plumbing::{bridge_unindexed, UnindexedConsumer, UnindexedProducer},
  4 |     ParallelIterator,
  5 | };
  6 | 
  7 | pub fn map_file(name: &str) -> Mmap {
  8 |     let file = std::fs::File::open(name).unwrap();
  9 |     unsafe { Mmap::map(&file) }.unwrap()
 10 | }
 11 | 
 12 | pub fn map_new_file(name: &str, size: u64) -> MmapMut {
 13 |     let file = std::fs::OpenOptions::new()
 14 |         .read(true)
 15 |         .write(true)
 16 |         .truncate(true)
 17 |         .create(true)
 18 |         .open(name)
 19 |         .unwrap();
 20 |     file.set_len(size).unwrap();
 21 |     unsafe { MmapMut::map_mut(&file) }.unwrap()
 22 | }
 23 | 
 24 | pub fn map_file_mut(name: &str) -> MmapMut {
 25 |     let file = std::fs::OpenOptions::new()
 26 |         .read(true)
 27 |         .write(true)
 28 |         .open(name)
 29 |         .unwrap();
 30 |     unsafe { MmapMut::map_mut(&file) }.unwrap()
 31 | }
 32 | 
/// An iterator that splits a byte buffer into chunks at every occurrence of
/// a delimiter byte, using `memchr` to locate the delimiters.
///
/// It can also be driven in parallel through its `UnindexedProducer` and
/// `ParallelIterator` implementations.
pub struct MemchrSplit<'a> {
    /// The delimiter byte that separates chunks.
    needle: u8,
    /// The buffer being split.
    haystack: &'a [u8],
    /// Iterator over the delimiter positions in the buffer.
    memchr: memchr::Memchr<'a>,
    /// Offset in `haystack` at which the next yielded chunk starts.
    pub pos: usize,
}
 39 | 
 40 | impl MemchrSplit<'_> {
 41 |     pub fn new(needle: u8, haystack: &[u8]) -> MemchrSplit<'_> {
 42 |         MemchrSplit {
 43 |             needle,
 44 |             haystack,
 45 |             memchr: memchr::memchr_iter(needle, haystack),
 46 |             pos: 0,
 47 |         }
 48 |     }
 49 | }
 50 | 
 51 | impl<'a> Iterator for MemchrSplit<'a> {
 52 |     type Item = &'a [u8];
 53 | 
 54 |     fn next(&mut self) -> Option<Self::Item> {
 55 |         if self.pos >= self.haystack.len() {
 56 |             return None;
 57 |         }
 58 |         let start = self.pos;
 59 |         let end = self.memchr.next()?;
 60 |         self.pos = end + 1;
 61 |         Some(&self.haystack[start..end])
 62 |     }
 63 | }
 64 | 
 65 | impl<'a> UnindexedProducer for MemchrSplit<'a> {
 66 |     type Item = &'a [u8];
 67 | 
 68 |     fn split(self) -> (Self, Option<Self>) {
 69 |         // Roughly chop the buffer in half. Maybe this should give up if the current
 70 |         // size is already below a threshold.
 71 |         let mid = self.pos + (self.haystack.len() - self.pos) / 2;
 72 |         if mid >= self.haystack.len() || mid == 0 {
 73 |             return (self, None);
 74 |         };
 75 | 
 76 |         // Advance the midpoint to a needle boundary.
 77 |         let mid_nl = memchr::memchr(self.needle, &self.haystack[mid..]);
 78 |         let right_start = match mid_nl {
 79 |             Some(mid_nl) => mid + mid_nl + 1,
 80 |             None => return (self, None),
 81 |         };
 82 | 
 83 |         // Create two sub-iterators.
 84 |         let left = Self {
 85 |             needle: self.needle,
 86 |             haystack: &self.haystack[..right_start],
 87 |             memchr: self.memchr,
 88 |             pos: self.pos,
 89 |         };
 90 |         let right_buf = &self.haystack[right_start..];
 91 |         let right = Self {
 92 |             needle: self.needle,
 93 |             haystack: right_buf,
 94 |             memchr: memchr::memchr_iter(self.needle, right_buf),
 95 |             pos: 0,
 96 |         };
 97 |         (left, Some(right))
 98 |     }
 99 | 
100 |     fn fold_with<F>(self, folder: F) -> F
101 |     where
102 |         F: rayon::iter::plumbing::Folder<Self::Item>,
103 |     {
104 |         folder.consume_iter(self)
105 |     }
106 | }
107 | 
/// Parallel iteration over the chunks, built on the `UnindexedProducer`
/// implementation for this type.
impl<'a> ParallelIterator for MemchrSplit<'a> {
    type Item = &'a [u8];

    /// Drive `consumer` by bridging this splitter as an unindexed producer.
    fn drive_unindexed<C>(self, consumer: C) -> C::Result
    where
        C: UnindexedConsumer<Self::Item>,
    {
        bridge_unindexed(self, consumer)
    }
}
118 | 


--------------------------------------------------------------------------------
/pollen_data_gen/pollen_data_gen/__main__.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import argparse
  3 | import mygfa
  4 | from typing import List
  5 | 
  6 | from . import depth, simple
  7 | 
  8 | 
  9 | def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]:
 10 |     """Parse command line arguments and run the appropriate subcommand."""
 11 |     parser = argparse.ArgumentParser()
 12 | 
 13 |     subparsers = parser.add_subparsers(
 14 |         title="pollen-data-gen commands", metavar="COMMAND", dest="command"
 15 |     )
 16 | 
 17 |     simple_parser = subparsers.add_parser(
 18 |         "simple", help="Produces a simple JSON serialization of the graph."
 19 |     )
 20 |     # Optional arguments - argparse automatically infers flags beginning with '-' as optional
 21 |     simple_parser.add_argument(
 22 |         "-n",
 23 |         help="The max number of nodes.",
 24 |     )
 25 |     simple_parser.add_argument(
 26 |         "-e",
 27 |         help="The max number of steps per node.",
 28 |     )
 29 |     simple_parser.add_argument(
 30 |         "-p",
 31 |         help="The max number of paths.",
 32 |     )
 33 |     simple_parser.add_argument(
 34 |         "-s",
 35 |         "--subset-paths",
 36 |         help="A file where each line is a path of the graph to consider when calculating node depth",
 37 |     )
 38 | 
 39 |     _ = subparsers.add_parser(
 40 |         "roundtrip",
 41 |         help="Checks that we can serialize the deserilize the graph losslessly.",
 42 |     )
 43 | 
 44 |     depth_parser = subparsers.add_parser(
 45 |         "depth", help="Produces a `depth`-specific JSON of the graph."
 46 |     )
 47 |     depth_parser.add_argument(
 48 |         "-n",
 49 |         help="The max number of nodes.",
 50 |     )
 51 |     depth_parser.add_argument(
 52 |         "-e",
 53 |         help="The max number of steps per node.",
 54 |     )
 55 |     depth_parser.add_argument(
 56 |         "-p",
 57 |         help="The max number of paths.",
 58 |     )
 59 |     depth_parser.add_argument(
 60 |         "-s",
 61 |         "--subset-paths",
 62 |         help="A file where each line is a path of the graph to consider when calculating node depth",
 63 |     )
 64 | 
 65 |     # Add the graph argument to all subparsers.
 66 |     # Doing it this way means that the graph argument is sought _after_ the
 67 |     # command name.
 68 |     for subparser in subparsers.choices.values():
 69 |         subparser.add_argument("graph", help="Input GFA file", metavar="GRAPH")
 70 | 
 71 |     args = parser.parse_args()
 72 | 
 73 |     return parser, args
 74 | 
 75 | 
 76 | def parse_subset_paths(filename: str) -> List[str]:
 77 |     """
 78 |     Return a list of the names of paths in [filename]
 79 |     """
 80 | 
 81 |     if filename is None:  # Return the default value
 82 |         return []
 83 | 
 84 |     with open(filename, "r", encoding="utf-8") as paths_file:
 85 |         text = paths_file.read()
 86 |         return text.splitlines()
 87 | 
 88 | 
 89 | def dispatch(args: argparse.Namespace) -> None:
 90 |     """Parse the graph from filename,
 91 |     then dispatch to the appropriate pollen_data_gen command.
 92 |     """
 93 |     subset_paths = parse_subset_paths(args.subset_paths)
 94 |     name_to_func = {
 95 |         "depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p, subset_paths),
 96 |         "simple": lambda g: simple.dump(
 97 |             g, sys.stdout, args.n, args.e, args.p, subset_paths
 98 |         ),
 99 |         "roundtrip": simple.roundtrip_test,
100 |     }
101 |     graph = mygfa.Graph.parse(open(args.graph, "r", encoding="utf-8"))
102 |     name_to_func[args.command](graph)
103 | 
104 | 
def main() -> None:
    """Entry point: parse the command line and dispatch to the chosen command.

    If no graph argument was parsed (for example, no command was given),
    print the usage help and exit with status -1.
    """
    parser, arguments = parse_args()
    if "graph" not in arguments or not arguments.graph:
        parser.print_help()
        # Use sys.exit: the bare `exit` helper is installed by the `site`
        # module and is not guaranteed to exist in every interpreter setup.
        sys.exit(-1)
    dispatch(arguments)
112 | 
113 | 
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
116 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/inject.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional, Tuple
 2 | import mygfa
 3 | from . import chop
 4 | 
 5 | 
 6 | def track_path(graph: mygfa.Graph, bed: mygfa.Bed) -> List[mygfa.Handle]:
 7 |     """Given a BED entry, make a list of the Segments traversed _in full_."""
 8 |     walk = 0
 9 |     segs_walked = []
10 |     for handle in graph.paths[bed.name].segments:
11 |         length = len(graph.segments[handle.name].seq)
12 |         if walk < bed.low:
13 |             # Skipping over segments that are not of interest.
14 |             walk = walk + length
15 |             continue
16 |         if walk + length <= bed.high:
17 |             walk = walk + length
18 |             segs_walked.append(handle)
19 |         else:
20 |             return segs_walked
21 |     return segs_walked  # Given a legal BED, I should never reach this point.
22 | 
23 | 
def handle_pos(handle: mygfa.Handle, length: int, index: int) -> Tuple[str, int]:
    """Translate an offset along a handle into a (segment name, position) pair.

    When `handle.ori` is truthy the position is `index` itself; otherwise the
    offset is measured from the other end, giving `length - index`.
    """
    if handle.ori:
        position = index
    else:
        position = length - index
    return handle.name, position
29 | 
30 | 
def where_chop(
    graph: mygfa.Graph, pathname: str, index: int
) -> Optional[Tuple[str, int]]:
    """Locate the segment (and the position inside it) that must be chopped so
    that `index` on path `pathname` falls on a seam between segments.

    Returns None when no chop is needed because `index` is already at a seam.
    """
    consumed = 0  # Length of the path walked before the current segment.
    for step in graph.paths[pathname].segments:
        if consumed == index:
            return None  # Already on a seam.
        seg_len = len(graph.segments[step.name].seq)
        seg_end = consumed + seg_len
        if index < seg_end:
            # The index falls strictly inside this segment.
            return handle_pos(step, seg_len, index - consumed)
        consumed = seg_end
    return None  # Given a legal path, I should never reach this point.
47 | 
48 | 
def chop_if_needed(graph: mygfa.Graph, pathname: str, index: int) -> mygfa.Graph:
    """Return a graph in which `index` on path `pathname` lies on a segment-seam.

    If it already does, the input graph is returned untouched. Otherwise a new
    graph is built in which the one offending segment is split in two, every
    higher-numbered segment is renamed to its successor number, and the paths
    are redone through `chop.chop_paths`. Headers and links are passed through.
    """
    chop_site = where_chop(graph, pathname, index)
    if not chop_site:
        return graph  # We were already on a seam.
    target, pos = chop_site

    new_segments = {}
    # Old segment name -> (first new number, one past the last new number);
    # this is the legend handed to `chop_paths`.
    renumbering = {}

    for seg in graph.segments.values():
        number = int(seg.name)
        shifted_name = str(number + 1)

        if number < int(target):
            # Below the chop: keep verbatim.
            new_segments[seg.name] = seg
            renumbering[seg.name] = number, number + 1
            continue

        if seg.name == target:
            # The one segment that gets split into two.
            new_segments[seg.name] = mygfa.Segment(
                target, mygfa.Strand(str(seg.seq)[:pos])
            )
            new_segments[shifted_name] = mygfa.Segment(
                shifted_name, mygfa.Strand(str(seg.seq)[pos:])
            )
            renumbering[seg.name] = number, number + 2
            continue

        # Above the chop: same sequence, name bumped by one.
        new_segments[shifted_name] = mygfa.Segment(shifted_name, seg.seq)
        renumbering[seg.name] = number + 1, number + 2

    new_paths = chop.chop_paths(graph, renumbering)
    return mygfa.Graph(graph.headers, new_segments, graph.links, new_paths)
82 | 
83 | 
def inject(graph: mygfa.Graph, p2i: List[mygfa.Bed]) -> mygfa.Graph:
    """Add one new path to the graph per BED entry in `p2i` and return the graph.

    Entries that name a path missing from the graph are skipped. For the
    others, the graph is chopped (if needed) at the entry's low and high ends,
    and the handles covered by the entry become a new path named `bed.new`.
    """
    for bed in p2i:
        if bed.name not in graph.paths.keys():
            continue  # odgi is silent if path was absent.
        # Make sure both ends of the interval sit on segment seams.
        graph = chop_if_needed(graph, bed.name, bed.low)
        graph = chop_if_needed(graph, bed.name, bed.high)
        # In-place update of whichever graph object we now hold!
        graph.paths[bed.new] = mygfa.Path(bed.new, track_path(graph, bed), None)
    return graph
94 | 


--------------------------------------------------------------------------------
/.github/workflows/flatgfa-py.yml:
--------------------------------------------------------------------------------
  1 | # This file is autogenerated by maturin v1.5.1
  2 | # To update, run
  3 | #
  4 | #    maturin generate-ci github --pytest -m flatgfa-py/Cargo.toml -o .github/workflows/flatgfa-py.yml
  5 | #
  6 | name: flatgfa-py CI
  7 | 
  8 | on:
  9 |   push:
 10 |     branches: [main]
 11 |     tags: ["flatgfa-*"]
 12 |   pull_request:
 13 |     paths:
 14 |       - "flatgfa/**"
 15 |       - "flatgfa-py/**"
 16 | 
 17 | permissions:
 18 |   contents: read
 19 | 
 20 | jobs:
 21 |   linux:
 22 |     runs-on: ${{ matrix.platform.runner }}
 23 |     strategy:
 24 |       matrix:
 25 |         platform:
 26 |           - runner: ubuntu-latest
 27 |             target: x86_64
 28 |     steps:
 29 |       - uses: actions/checkout@v4
 30 |       - uses: actions/setup-python@v5
 31 |         with:
 32 |           python-version: "3.10"
 33 |       - name: Build wheels
 34 |         uses: PyO3/maturin-action@v1
 35 |         with:
 36 |           target: ${{ matrix.platform.target }}
 37 |           args: --release --out dist --manifest-path flatgfa-py/Cargo.toml
 38 |           sccache: "true"
 39 |           manylinux: auto
 40 |       - name: Upload wheels
 41 |         uses: actions/upload-artifact@v4
 42 |         with:
 43 |           name: wheels-linux-${{ matrix.platform.target }}
 44 |           path: dist
 45 |       - name: pytest
 46 |         if: ${{ startsWith(matrix.platform.target, 'x86_64') }}
 47 |         shell: bash
 48 |         run: |
 49 |           set -e
 50 |           pip install ./dist/flatgfa*.whl
 51 |           pip install pytest
 52 |           cd flatgfa-py && pytest
 53 | 
 54 |   windows:
 55 |     runs-on: ${{ matrix.platform.runner }}
 56 |     strategy:
 57 |       matrix:
 58 |         platform:
 59 |           - runner: windows-latest
 60 |             target: x64
 61 |     steps:
 62 |       - uses: actions/checkout@v4
 63 |       - uses: actions/setup-python@v5
 64 |         with:
 65 |           python-version: "3.10"
 66 |           architecture: ${{ matrix.platform.target }}
 67 |       - name: Build wheels
 68 |         uses: PyO3/maturin-action@v1
 69 |         with:
 70 |           target: ${{ matrix.platform.target }}
 71 |           args: --release --out dist --manifest-path flatgfa-py/Cargo.toml
 72 |           sccache: "true"
 73 |       - name: Upload wheels
 74 |         uses: actions/upload-artifact@v4
 75 |         with:
 76 |           name: wheels-windows-${{ matrix.platform.target }}
 77 |           path: dist
 78 |       - name: pytest
 79 |         if: ${{ !startsWith(matrix.platform.target, 'aarch64') }}
 80 |         shell: bash
 81 |         run: |
 82 |           set -e
 83 |           pip install ./dist/flatgfa*.whl
 84 |           pip install pytest
 85 |           cd flatgfa-py && pytest
 86 | 
 87 |   macos:
 88 |     runs-on: ${{ matrix.platform.runner }}
 89 |     strategy:
 90 |       matrix:
 91 |         platform:
 92 |           - runner: macos-latest
 93 |             target: x86_64
 94 |           - runner: macos-latest
 95 |             target: aarch64
 96 |     steps:
 97 |       - uses: actions/checkout@v4
 98 |       - uses: actions/setup-python@v5
 99 |         with:
100 |           python-version: "3.10"
101 |       - name: Build wheels
102 |         uses: PyO3/maturin-action@v1
103 |         with:
104 |           target: ${{ matrix.platform.target }}
105 |           args: --release --out dist --manifest-path flatgfa-py/Cargo.toml
106 |           sccache: "true"
107 |       - name: Upload wheels
108 |         uses: actions/upload-artifact@v4
109 |         with:
110 |           name: wheels-macos-${{ matrix.platform.target }}
111 |           path: dist
112 |       - name: pytest
113 |         if: ${{ startsWith(matrix.platform.target, 'aarch64') }}
114 |         shell: bash
115 |         run: |
116 |           set -e
117 |           pip install ./dist/flatgfa*.whl
118 |           pip install pytest
119 |           cd flatgfa-py && pytest
120 | 
121 |   sdist:
122 |     runs-on: ubuntu-latest
123 |     steps:
124 |       - uses: actions/checkout@v4
125 |       - name: Build sdist
126 |         uses: PyO3/maturin-action@v1
127 |         with:
128 |           command: sdist
129 |           args: --out dist --manifest-path flatgfa-py/Cargo.toml
130 |       - name: Upload sdist
131 |         uses: actions/upload-artifact@v4
132 |         with:
133 |           name: wheels-sdist
134 |           path: dist
135 | 
136 |   release:
137 |     name: Release
138 |     runs-on: ubuntu-latest
139 |     environment: release
140 |     if: "startsWith(github.ref, 'refs/tags/')"
141 |     needs: [linux, windows, macos, sdist]
142 |     permissions:
143 |       id-token: write
144 |     steps:
145 |       - uses: actions/download-artifact@v4
146 |       - name: Publish to PyPI
147 |         uses: PyO3/maturin-action@v1
148 |         with:
149 |           command: upload
150 |           args: --non-interactive --skip-existing wheels-*/*
151 | 


--------------------------------------------------------------------------------
/flatgfa-py/docs/index.rst:
--------------------------------------------------------------------------------
  1 | FlatGFA: An Efficient Pangenome Representation
  2 | ==============================================
  3 | 
  4 | .. py:module:: flatgfa
  5 | 
  6 | `FlatGFA`_ is an efficient on-disk and in-memory way to represent
  7 | pangenomic variation graphs. It can losslessly represent `GFA`_ files.
  8 | Here's a quick example::
  9 | 
 10 |     import flatgfa
 11 |     from collections import Counter
 12 | 
 13 |     graph = flatgfa.parse("something.gfa")
 14 |     depths = Counter()
 15 |     for path in graph.paths:
 16 |         for step in path:
 17 |             depths[step.segment.id] += 1
 18 | 
 19 |     print('#node.id\tdepth')
 20 |     for seg in graph.segments:
 21 |         print('{}\t{}'.format(seg.name, depths[seg.id]))
 22 | 
 23 | This example computes the `node depth`_ for every segment in a graph.
 24 | It starts by parsing a GFA text file, but FlatGFA also has its own efficient
 25 | binary representation---you can read and write this format with
 26 | :func:`load` and :meth:`FlatGFA.write_flatgfa`.
 27 | 
 28 | The library is on `PyPI`_, so you can get started by typing
 29 | ``pip install flatgfa``.
 30 | 
 31 | .. _GFA: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md
 32 | .. _node depth: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_depth.html
 33 | .. _FlatGFA: https://github.com/cucapra/pollen/tree/main/flatgfa
 34 | .. _PyPI: https://pypi.org/project/flatgfa/
 35 | 
 36 | API Reference
 37 | -------------
 38 | 
 39 | Loading Data
 40 | ''''''''''''
 41 | 
 42 | The FlatGFA library can both read and write files in two formats: the standard
 43 | `GFA`_ text format, and its own efficient binary representation (called
"FlatGFA" files). Each of the functions below returns a :class:`FlatGFA`
 45 | object. Parsing GFA text can take some time, but loading a binary FlatGFA file
 46 | should be very fast.
 47 | 
 48 | .. autofunction:: parse
 49 | 
 50 | .. autofunction:: parse_bytes
 51 | 
 52 | .. autofunction:: load
 53 | 
 54 | GFA Graphs
 55 | ''''''''''
 56 | 
 57 | The :class:`FlatGFA` class provides the entry point to access the data either
 58 | loaded from a FlatGFA binary file or parsed from a GFA text file. Most
 59 | importantly, you can iterate over the :class:`Segment`, :class:`Path`, and
 60 | :class:`Link` objects that it contains. The :class:`FlatGFA` class exposes
 61 | :class:`list`-like containers for each of these types::
 62 | 
 63 |     for seg in graph.segments:
 64 |         print(seg.name)
 65 |     print(graph.segments[0].sequence())
 66 | 
 67 | These containers support both iteration (like the ``for`` above) and random
 68 | access (like ``graph.segments[0]`` above).
 69 | 
 70 | You can also write graphs out to disk using :meth:`FlatGFA.write_gfa`
 71 | (producing a standard GFA text file) and :meth:`FlatGFA.write_flatgfa` (our
binary format). If you just want a GFA string, use ``str(graph)``.
 73 | 
 74 | .. autoclass:: FlatGFA
 75 |    :members:
 76 | 
 77 | The GFA Data Model
 78 | ''''''''''''''''''
 79 | 
 80 | These classes represent the core data model for GFA graphs:
 81 | :class:`Segment` for vertices in the graph,
 82 | :class:`Path` for walks through the graph,
 83 | and :class:`Link` for edges in the graph.
 84 | Internally, all of these objects only contain references to the underlying
 85 | data stored in a :class:`FlatGFA`, so they are very small, but accessing any
of the associated data (such as the nucleotide sequence for a segment) requires
 87 | further lookups.
 88 | 
 89 | The :class:`Handle` class is a segment--orientation pair: both paths and links
 90 | traverse these handles.
 91 | 
 92 | To get a GFA text representation of any of these objects, use ``str(obj)``.
 93 | All these objects are equatable (so you can compare them with ``==``) and
 94 | hashable (so you can store them in dicts and sets). This reflects equality on
 95 | the underlying references to the data store, so two objects are equal if they
 96 | refer to the same index in the same :class:`FlatGFA`.
 97 | 
 98 | .. autoclass:: Segment
 99 |    :members:
100 | 
101 | .. autoclass:: Path
102 |    :members:
103 | 
104 | .. autoclass:: Link
105 |    :members:
106 | 
107 | .. autoclass:: Handle
108 |    :members:
109 | 
110 | .. toctree::
111 |    :maxdepth: 2
112 |    :caption: Contents:
113 | 
114 | Iteration
115 | '''''''''
116 | 
117 | The FlatGFA library exposes special container classes to access the
118 | :class:`Segment`, :class:`Path`, and :class:`Link` objects that make up a GFA
119 | graph. These classes are meant to behave sort of like Python :class:`list`
120 | objects while supporting efficient iteration over FlatGFA's internal
121 | representation.
122 | 
123 | All of these container objects support subscripting (like
124 | ``graph.segments[i]`` where ``i`` is an integer index) and iteration.
125 | 
126 | .. autoclass:: SegmentList
127 |    :members:
128 | 
129 | .. autoclass:: PathList
130 |    :members:
131 | 
132 | .. autoclass:: LinkList
133 |    :members:
134 | 
135 | .. autoclass:: StepList
136 |    :members:
137 | 


--------------------------------------------------------------------------------
/pollen_data_gen/pollen_data_gen/depth.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from typing import Any, Collection, Dict, Union, Optional, List
  3 | import json
  4 | from json import JSONEncoder
  5 | import mygfa
  6 | import mygfa.preprocess
  7 | 
  8 | 
  9 | FormatType = Dict[str, Union[bool, str, int]]
 10 | OutputType = Dict[str, Dict[str, Collection[object]]]
 11 | 
 12 | 
 13 | def format_gen(width: int) -> FormatType:
 14 |     """Generates a format object for a bitvector of length `width`."""
 15 |     return {"is_signed": False, "numeric_type": "bitnum", "width": width}
 16 | 
 17 | 
 18 | def paths_viewed_from_nodes(
 19 |     graph: mygfa.Graph, max_n: int, max_e: int, max_p: int
 20 | ) -> OutputType:
 21 |     """Given a graph, return a dict representing the paths
 22 |     viewed from the PoV of each node.
 23 |     """
 24 |     path2id = {path: id for id, path in enumerate(graph.paths, start=1)}
 25 |     output = {}
 26 |     json_format = format_gen(max_p.bit_length())
 27 |     # segment name, (path name, index on path, direction) list
 28 |     for seg, crossings in mygfa.preprocess.node_steps(graph).items():
 29 |         data = list(path2id[c[0]] for c in crossings)
 30 |         data = data + [0] * (max_e - len(data))
 31 |         output[f"path_ids{seg}"] = {"data": data, "format": json_format}
 32 |     data = [0] * max_e
 33 |     for i in range(len(graph.segments) + 1, max_n + 1):
 34 |         output[f"path_ids{i}"] = {"data": data, "format": json_format}
 35 |     return output
 36 | 
 37 | 
 38 | def paths_to_consider(
 39 |     subset_paths_idx: List[int], max_n: int, max_p: int
 40 | ) -> OutputType:
 41 |     """Currently just a stub; later we will populate this with a
 42 |     bitvector of length MAX_PATHS, where the i'th index will be 1 if
 43 |     the i'th path is to be considered during depth calculation.
 44 | 
 45 |     Somewhat annoyingly, we need as many copies of this bitvector as there
 46 |     are nodes in the graph.
 47 |     """
 48 |     output = {}
 49 |     data = []
 50 |     if subset_paths_idx:
 51 |         data = [0] * (max_p + 1)
 52 |         for path_idx in subset_paths_idx:
 53 |             data[path_idx] = 1
 54 |     else:
 55 |         data = [0] + ([1] * max_p)
 56 | 
 57 |     for i in range(1, max_n + 1):
 58 |         output[f"paths_to_consider{i}"] = {"data": data, "format": format_gen(1)}
 59 |     return output
 60 | 
 61 | 
 62 | class NodeDepthEncoder(JSONEncoder):
 63 |     """Encodes the entire graph as a JSON object, for the purpose of node depth.
 64 | 
 65 |     The exine command `depth` is the oracle for this encoding.
 66 |     """
 67 | 
 68 |     def __init__(
 69 |         self,
 70 |         max_n: int,
 71 |         max_e: int,
 72 |         max_p: int,
 73 |         subset_paths: Optional[List[str]],
 74 |         **kwargs: Any,
 75 |     ) -> None:
 76 |         super(NodeDepthEncoder, self).__init__(**kwargs)
 77 |         self.max_n = max_n
 78 |         self.max_e = max_e
 79 |         self.max_p = max_p
 80 |         self.subset_paths = subset_paths
 81 | 
 82 |     def paths_to_idxs(self, o: mygfa.Graph) -> List[int]:
 83 |         if not self.subset_paths:
 84 |             return []
 85 |         path2id = {path: id for id, path in enumerate(o.paths, start=1)}
 86 |         return list(map(lambda p: path2id[p], self.subset_paths))
 87 | 
 88 |     def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
 89 |         answer_field = {
 90 |             "depth_output": {
 91 |                 "data": list([0] * self.max_n),
 92 |                 "format": format_gen(self.max_e.bit_length()),
 93 |             }
 94 |         }
 95 |         answer_field_uniq = {
 96 |             "uniq_output": {
 97 |                 "data": list([0] * self.max_n),
 98 |                 "format": format_gen(self.max_p.bit_length()),
 99 |             }
100 |         }
101 |         subset_paths_idx = self.paths_to_idxs(o)
102 |         paths = paths_viewed_from_nodes(
103 |             o, self.max_n, self.max_e, self.max_p
104 |         ) | paths_to_consider(subset_paths_idx, self.max_n, self.max_p)
105 | 
106 |         return answer_field | paths | answer_field_uniq
107 | 
108 | 
109 | def depth_json(
110 |     graph: mygfa.Graph,
111 |     max_n: Optional[int],
112 |     max_e: Optional[int],
113 |     max_p: Optional[int],
114 |     subset_paths: Optional[List[str]],
115 | ) -> str:
116 |     """Returns a JSON representation of `graph`
117 |     that is specific to the exine command `depth`.
118 |     """
119 |     n_tight, e_tight, p_tight = mygfa.preprocess.get_maxes(graph)
120 |     # These values have been calculated automatically, and are likely optimal.
121 |     # However, they are only to be used when the user-does not supply them via CLI.
122 |     if not max_n:
123 |         max_n = n_tight
124 |     if not max_e:
125 |         max_e = e_tight
126 |     if not max_p:
127 |         max_p = p_tight
128 | 
129 |     return NodeDepthEncoder(
130 |         max_n=int(max_n), max_e=int(max_e), max_p=int(max_p), subset_paths=subset_paths
131 |     ).encode(graph)
132 | 
133 | 
134 | def depth_stdout(
135 |     graph: mygfa.Graph, max_n: int, max_e: int, max_p: int, subset_paths: List[str]
136 | ) -> None:
137 |     """Prints a JSON representation of `graph` to stdout."""
138 |     encoding = depth_json(graph, max_n, max_e, max_p, subset_paths)
139 | 
140 |     json.dump(
141 |         json.loads(encoding),
142 |         sys.stdout,
143 |         indent=2,
144 |         sort_keys=True,
145 |     )
146 | 


--------------------------------------------------------------------------------
/flatgfa-py/test/test_flatgfa.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | import flatgfa
  3 | import pathlib
  4 | 
  5 | TEST_DIR = pathlib.Path(__file__).parent
  6 | TEST_GFA = TEST_DIR / "tiny.gfa"
  7 | 
  8 | 
  9 | @pytest.fixture
 10 | def gfa():
 11 |     return flatgfa.parse_bytes(TEST_GFA.read_bytes())
 12 | 
 13 | 
 14 | def test_segs(gfa):
 15 |     # `gfa.segments` acts like a list.
 16 |     assert len(gfa.segments) == 4
 17 |     seg = gfa.segments[0]
 18 | 
 19 |     # An individual segment exposes its name and nucleotide sequence.
 20 |     assert seg.name == 1
 21 |     assert seg.sequence() == b"CAAATAAG"
 22 |     assert len(seg) == 8
 23 | 
 24 |     # You can also pull out the entire sequence of segments.
 25 |     seg = list(gfa.segments)[2]
 26 |     assert seg.name == 3
 27 | 
 28 |     # Use `str()` to get a GFA representation.
 29 |     assert str(seg) == "S	3	TTG"
 30 | 
 31 | 
 32 | def test_segs_find(gfa):
 33 |     # There is a method to find a segment by its name (with linear search).
 34 |     seg = gfa.segments.find(3)
 35 |     assert seg.id == 2
 36 |     assert seg.sequence() == b"TTG"
 37 | 
 38 | 
 39 | def test_paths(gfa):
 40 |     # `gfa.paths` similarly acts like a list.
 41 |     assert len(gfa.paths) == 2
 42 |     assert len(list(gfa.paths)) == 2
 43 | 
 44 |     # Individual paths expose their name (a string, compared against `str` here).
 45 |     path = gfa.paths[0]
 46 |     assert path.name == "one"
 47 | 
 48 |     # GFA representation.
 49 |     assert str(path) == "P	one	1+,2+,4-	*"
 50 | 
 51 | 
 52 | def test_paths_find(gfa):
 53 |     # There is a method to find a path by its name.
 54 |     path = gfa.paths.find("two")
 55 |     assert path.id == 1
 56 |     assert path.name == "two"
 57 | 
 58 | 
 59 | def test_path_steps(gfa):
 60 |     # When you get a path, the path itself acts as a list of steps (handles).
 61 |     path = gfa.paths[1]
 62 |     assert len(path) == 4
 63 |     assert len(list(path)) == 4
 64 |     step = path[0]
 65 | 
 66 |     # A step (handle) is a reference to a segment and an orientation.
 67 |     assert step.segment.name == 1
 68 |     assert step.is_forward
 69 | 
 70 |     # GFA representation.
 71 |     assert str(step) == "1+"
 72 | 
 73 | 
 74 | def test_links(gfa):
 75 |     # You guessed it: `gfa.links` behaves as a list too.
 76 |     assert len(gfa.links) == 4
 77 |     assert len(list(gfa.links)) == 4
 78 |     link = gfa.links[1]
 79 | 
 80 |     # A link has a "from" handle and a "to" handle.
 81 |     assert link.from_.segment.name == 2
 82 |     assert link.from_.is_forward
 83 |     assert link.to.segment.name == 4
 84 |     assert not link.to.is_forward
 85 | 
 86 |     # GFA representation.
 87 |     assert str(link) == "L	2	+	4	-	0M"
 88 | 
 89 | 
 90 | def test_gfa_str(gfa):
 91 |     with open(TEST_GFA, "r") as f:
 92 |         orig_gfa = f.read()
 93 | 
 94 |     # You can serialize a graph as GFA text.
 95 |     assert str(gfa) == orig_gfa
 96 | 
 97 | 
 98 | def test_read_write_gfa(gfa, tmp_path):
 99 |     # You can write FlatGFA objects as GFA text files.
100 |     gfa_path = str(tmp_path / "tiny.gfa")
101 |     gfa.write_gfa(gfa_path)
102 |     with open(TEST_GFA, "rb") as orig_f:
103 |         with open(gfa_path, "rb") as written_f:
104 |             assert orig_f.read() == written_f.read()
105 | 
106 |     # You can also parse GFA text files from the filesystem.
107 |     new_gfa = flatgfa.parse(gfa_path)
108 |     assert len(new_gfa.segments) == len(gfa.segments)
109 | 
110 | 
111 | def test_read_write_flatgfa(gfa, tmp_path):
112 |     # You can write FlatGFA graphs in our native binary format too.
113 |     flatgfa_path = str(tmp_path / "tiny.flatgfa")
114 |     gfa.write_flatgfa(flatgfa_path)
115 | 
116 |     # And read them back, which should be very fast indeed.
117 |     new_gfa = flatgfa.load(flatgfa_path)
118 |     assert len(new_gfa.segments) == len(gfa.segments)
119 | 
120 | 
121 | def test_eq(gfa):
122 |     # The various data components are equatable.
123 |     assert gfa.segments[0] == gfa.segments[0]
124 |     assert gfa.segments[0] != gfa.segments[1]
125 |     assert gfa.paths[0] == gfa.paths[0]
126 |     assert gfa.paths[0] != gfa.paths[1]
127 |     assert gfa.links[0] == gfa.links[0]
128 |     assert gfa.links[0] != gfa.links[1]
129 | 
130 |     # Including handles, which do not have their own identity.
131 |     assert gfa.links[1].from_ == gfa.links[2].from_
132 |     assert gfa.links[1].from_ != gfa.links[1].to
133 | 
134 | 
135 | def test_hash(gfa):
136 |     # The objects are also hashable, so you can put them in dicts and sets.
137 |     d = {
138 |         gfa.segments[0]: "foo",
139 |         gfa.paths[0]: "bar",
140 |         gfa.links[0]: "baz",
141 |         gfa.links[1].from_: "qux",
142 |     }
143 |     assert d[gfa.segments[0]] == "foo"
144 |     assert d[gfa.paths[0]] == "bar"
145 |     assert d[gfa.links[0]] == "baz"
146 |     assert d[gfa.links[1].from_] == "qux"
147 | 
148 | 
149 | def test_slice(gfa):
150 |     # The various container types can be sliced to get narrower ranges.
151 |     assert len(gfa.segments[1:3]) == 2
152 |     assert len(gfa.segments[2:]) == len(gfa.segments) - 2
153 |     assert gfa.segments[1:3][0].name == gfa.segments[1].name
154 | 
155 |     assert len(gfa.paths[1:]) == 1
156 |     assert len(gfa.links[2:100]) == 2
157 | 
158 |     assert len(list(gfa.paths[:1])) == 1
159 | 
160 |     # Including paths, which act like lists of steps.
161 |     path = gfa.paths[0]
162 |     assert len(path[2:]) == len(path) - 2
163 |     assert path[2:][0] == path[2]
164 |     assert len(list(path[2:])) == len(path) - 2
165 | 


--------------------------------------------------------------------------------
/flatgfa/src/flatbed.rs:
--------------------------------------------------------------------------------
  1 | use crate::gfaline::parse_field;
  2 | use crate::memfile::MemchrSplit;
  3 | use crate::pool::{FixedStore, HeapStore, Id, Pool, Span, Store};
  4 | use atoi::FromRadix10;
  5 | use bstr::BStr;
  6 | use zerocopy::{FromBytes, IntoBytes};
  7 | 
  8 | /// A single interval from a BED file.
  9 | #[derive(Debug, FromBytes, IntoBytes, Clone, Copy)]
 10 | #[repr(C, packed)]
 11 | pub struct BEDEntry {
 12 |     pub name: Span<u8>,
 13 |     pub start: u64,
 14 |     pub end: u64,
 15 | }
 16 | 
 17 | /// A flat representation of an entire BED file, i.e., a list of named intervals.
 18 | pub struct FlatBED<'a> {
 19 |     pub name_data: Pool<'a, u8>,
 20 |     pub entries: Pool<'a, BEDEntry>,
 21 | }
 22 | 
 23 | impl FlatBED<'_> {
 24 |     /// Get the number of entries in this BED file.
 25 |     pub fn get_num_entries(&self) -> usize {
 26 |         self.entries.len()
 27 |     }
 28 | 
 29 |     /// Get the name of a specific entry as a byte string.
 30 |     pub fn get_name_of_entry(&self, entry: &BEDEntry) -> &BStr {
 31 |         self.name_data[entry.name].as_ref()
 32 |     }
 33 | 
 34 |     /// Get a list of all BED entries from this file that intersect with `entry`.
 35 |     /// `bed` is the file that `entry` is located in, which need not be `self`.
 36 |     pub fn get_intersects(&self, bed: &FlatBED, entry: &BEDEntry) -> Vec<BEDEntry> {
 37 |         // Look the query's name up once instead of once per candidate entry.
 38 |         let query_name = bed.get_name_of_entry(entry);
 39 |         let (query_start, query_end) = (entry.start, entry.end);
 40 |         self.entries
 41 |             .all()
 42 |             .iter()
 43 |             // Only entries that carry the same name can intersect the query.
 44 |             .filter(|x| self.get_name_of_entry(x) == query_name)
 45 |             // To be compatible with bedtools, entries that partially overlap only
 46 |             // report the overlapping portion, so we construct new entries here
 47 |             // that contain just the overlap.
 48 |             .map(|x| BEDEntry {
 49 |                 name: x.name,
 50 |                 start: u64::max(x.start, query_start),
 51 |                 end: u64::min(x.end, query_end),
 52 |             })
 53 |             // Drop empty intervals, i.e., entries that do not actually overlap.
 54 |             .filter(|x| x.end > x.start)
 55 |             .collect()
 56 |     }
 57 | }
 58 | 
 59 | /// The data storage pools for a `FlatBED`.
 60 | #[derive(Default)]
 61 | pub struct BEDStore<'a, P: StoreFamily<'a>> {
 62 |     pub name_data: P::Store<u8>,
 63 |     pub entries: P::Store<BEDEntry>,
 64 | }
 65 | 
 66 | impl<'a, P: StoreFamily<'a>> BEDStore<'a, P> {
 67 |     pub fn add_entry(&mut self, name: &[u8], start: u64, end: u64) -> Id<BEDEntry> {
 68 |         let name = self.name_data.add_slice(name);
 69 |         self.entries.add(BEDEntry { name, start, end })
 70 |     }
 71 | 
 72 |     pub fn as_ref(&self) -> FlatBED<'_> {
 73 |         FlatBED {
 74 |             name_data: self.name_data.as_ref(),
 75 |             entries: self.entries.as_ref(),
 76 |         }
 77 |     }
 78 | }
 79 | 
 80 | pub trait StoreFamily<'a> {
 81 |     type Store<T: Clone + 'a>: Store<T>;
 82 | }
 83 | 
 84 | #[derive(Default)]
 85 | pub struct HeapFamily;
 86 | impl<'a> StoreFamily<'a> for HeapFamily {
 87 |     type Store<T: Clone + 'a> = HeapStore<T>;
 88 | }
 89 | 
 90 | pub struct FixedFamily;
 91 | impl<'a> StoreFamily<'a> for FixedFamily {
 92 |     type Store<T: Clone + 'a> = FixedStore<'a, T>;
 93 | }
 94 | 
 95 | /// A store for `FlatBED` data backed by fixed-size slices.
 96 | ///
 97 | /// This store contains `SliceVec`s, which act like `Vec`s but are allocated within
 98 | /// a fixed region. This means they have a maximum size, but they can directly map
 99 | /// onto the contents of a file.
100 | pub type FixedBEDStore<'a> = BEDStore<'a, FixedFamily>;
101 | 
102 | /// A mutable, in-memory data store for `FlatBED`.
103 | ///
104 | /// This store contains a bunch of `Vec`s: one per array required to implement a
105 | /// `FlatBED`. It exposes an API for building up a BED data structure, so it is
106 | /// useful for creating new ones from scratch.
107 | pub type HeapBEDStore = BEDStore<'static, HeapFamily>;
108 | 
109 | type ParseResult<T> = Result<T, &'static str>;
110 | type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>;
111 | fn parse_num<T: FromRadix10>(s: &[u8]) -> PartialParseResult<'_, T> {
112 |     match T::from_radix_10(s) {
113 |         (_, 0) => Err("expected number"),
114 |         (num, used) => Ok((num, &s[used..])),
115 |     }
116 | }
117 | 
118 | pub struct BEDParser<'a, P: StoreFamily<'a>> {
119 |     /// The flat representation we're building.
120 |     flat: BEDStore<'a, P>,
121 | }
122 | 
123 | impl<'a, P: StoreFamily<'a>> BEDParser<'a, P> {
124 |     pub fn new(builder: BEDStore<'a, P>) -> Self {
125 |         Self { flat: builder }
126 |     }
127 | 
128 |     /// Parse a BED text file from an in-memory buffer; blank lines are skipped, malformed lines panic.
129 |     pub fn parse_mem(mut self, buf: &[u8]) -> BEDStore<'a, P> {
130 |         for line in MemchrSplit::new(b'\n', buf) {
131 |             if line.is_empty() { continue; } // nothing to parse on a blank line
132 |             let (name_slice, rest) = parse_field(line).expect("BED line: missing name field");
133 |             let (start_num, rest) = parse_num(rest).expect("BED line: invalid start coordinate");
134 |             let rest = rest.get(1..).expect("BED line: missing end coordinate"); // skip separator
135 |             let (end_num, _) = parse_num(rest).expect("BED line: invalid end coordinate");
136 |             self.flat.add_entry(name_slice, start_num, end_num);
137 |         }
138 |         self.flat
139 |     }
140 | }
141 | 
142 | impl BEDParser<'static, HeapFamily> {
143 |     pub fn for_heap() -> Self {
144 |         Self::new(HeapBEDStore::default())
145 |     }
146 | }
147 | 
148 | impl<'a> BEDParser<'a, FixedFamily> {
149 |     pub fn for_slice(store: FixedBEDStore<'a>) -> Self {
150 |         Self::new(store)
151 |     }
152 | }
153 | 


--------------------------------------------------------------------------------
/flatgfa/src/print.rs:
--------------------------------------------------------------------------------
  1 | use crate::flatgfa;
  2 | use std::fmt;
  3 | 
  4 | impl fmt::Display for flatgfa::Orientation {
  5 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
  6 |         f.write_str(match self {
  7 |             flatgfa::Orientation::Forward => "+", // GFA sign for the forward strand
  8 |             flatgfa::Orientation::Backward => "-", // GFA sign for the reverse strand
  9 |         })
 10 |     }
 11 | }
 12 | 
 13 | impl fmt::Display for flatgfa::AlignOpcode {
 14 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 15 |         match self { // emit the CIGAR operation letter for each opcode
 16 |             flatgfa::AlignOpcode::Match => write!(f, "M"),
 17 |             flatgfa::AlignOpcode::Gap => write!(f, "N"),
 18 |             flatgfa::AlignOpcode::Insertion => write!(f, "I"), // CIGAR: I = insertion
 19 |             flatgfa::AlignOpcode::Deletion => write!(f, "D"), // CIGAR: D = deletion
 20 |         }
 21 |     }
 22 | }
 23 | 
 24 | impl fmt::Display for flatgfa::Alignment<'_> {
 25 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 26 |         if self.ops.is_empty() {
 27 |             write!(f, "0M")?;
 28 |         }
 29 |         for op in self.ops {
 30 |             write!(f, "{}{}", op.len(), op.op())?;
 31 |         }
 32 |         Ok(())
 33 |     }
 34 | }
 35 | 
 36 | /// A wrapper for displaying components from FlatGFA.
 37 | pub struct Display<'a, T>(pub &'a flatgfa::FlatGFA<'a>, pub T);
 38 | 
 39 | impl fmt::Display for Display<'_, flatgfa::Handle> {
 40 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 41 |         let seg = self.0.get_handle_seg(self.1);
 42 |         let name = seg.name;
 43 |         write!(f, "{}{}", name, self.1.orient())
 44 |     }
 45 | }
 46 | 
 47 | impl fmt::Display for Display<'_, &flatgfa::Path> {
 48 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 49 |         write!(f, "P\t{}\t", self.0.get_path_name(self.1))?;
 50 |         // Comma-separated steps; a path with no steps prints an empty field instead of panicking.
 51 |         let steps = &self.0.steps[self.1.steps];
 52 |         for (i, step) in steps.iter().enumerate() {
 53 |             let sep = if i == 0 { "" } else { "," };
 54 |             write!(f, "{}{}", sep, Display(self.0, *step))?;
 55 |         }
 56 |         write!(f, "\t")?;
 57 |         let overlaps = &self.0.overlaps[self.1.overlaps];
 58 |         if overlaps.is_empty() {
 59 |             write!(f, "*")?; // `*` stands in for an absent overlap list
 60 |         }
 61 |         for (i, overlap) in overlaps.iter().enumerate() {
 62 |             let sep = if i == 0 { "" } else { "," };
 63 |             write!(f, "{}{}", sep, self.0.get_alignment(*overlap))?;
 64 |         }
 65 |         Ok(())
 66 |     }
 67 | }
 68 | 
 69 | impl fmt::Display for Display<'_, &flatgfa::Link> {
 70 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 71 |         let from = self.1.from;
 72 |         let from_name = self.0.get_handle_seg(from).name;
 73 |         let to = self.1.to;
 74 |         let to_name = self.0.get_handle_seg(to).name;
 75 |         write!(
 76 |             f,
 77 |             "L\t{}\t{}\t{}\t{}\t{}",
 78 |             from_name,
 79 |             from.orient(),
 80 |             to_name,
 81 |             to.orient(),
 82 |             self.0.get_alignment(self.1.overlap)
 83 |         )
 84 |     }
 85 | }
 86 | 
 87 | impl fmt::Display for Display<'_, &flatgfa::Segment> {
 88 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 89 |         let name = self.1.name;
 90 |         write!(f, "S\t{}\t{}", name, self.0.get_seq(self.1))?;
 91 |         if !self.1.optional.is_empty() {
 92 |             write!(f, "\t{}", self.0.get_optional_data(self.1))?;
 93 |         }
 94 |         Ok(())
 95 |     }
 96 | }
 97 | 
 98 | /// Print a graph in the order preserved from an original GFA file.
 99 | fn write_preserved(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result {
100 |     let mut seg_iter = gfa.segs.all().iter();
101 |     let mut path_iter = gfa.paths.all().iter();
102 |     let mut link_iter = gfa.links.all().iter();
103 |     for kind in gfa.get_line_order() {
104 |         match kind {
105 |             flatgfa::LineKind::Header => {
106 |                 let version = gfa.header;
107 |                 assert!(!version.is_empty());
108 |                 writeln!(f, "H\t{}", bstr::BStr::new(version.all()))?;
109 |             }
110 |             flatgfa::LineKind::Segment => {
111 |                 let seg = seg_iter.next().expect("too few segments");
112 |                 writeln!(f, "{}", Display(gfa, seg))?;
113 |             }
114 |             flatgfa::LineKind::Path => {
115 |                 let path = path_iter.next().expect("too few paths");
116 |                 writeln!(f, "{}", Display(gfa, path))?;
117 |             }
118 |             flatgfa::LineKind::Link => {
119 |                 let link = link_iter.next().expect("too few links");
120 |                 writeln!(f, "{}", Display(gfa, link))?;
121 |             }
122 |         }
123 |     }
124 |     Ok(())
125 | }
126 | 
127 | /// Print a graph in a normalized order, ignoring the original GFA line order.
128 | pub fn write_normalized(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129 |     if !gfa.header.is_empty() {
130 |         writeln!(f, "H\t{}", bstr::BStr::new(gfa.header.all()))?;
131 |     }
132 |     for seg in gfa.segs.all().iter() {
133 |         writeln!(f, "{}", Display(gfa, seg))?;
134 |     }
135 |     for path in gfa.paths.all().iter() {
136 |         writeln!(f, "{}", Display(gfa, path))?;
137 |     }
138 |     for link in gfa.links.all().iter() {
139 |         writeln!(f, "{}", Display(gfa, link))?;
140 |     }
141 |     Ok(())
142 | }
143 | 
144 | /// Print our flat representation as in GFA text format.
145 | impl<'a> fmt::Display for &'a flatgfa::FlatGFA<'a> {
146 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
147 |         if self.line_order.is_empty() {
148 |             write_normalized(self, f)
149 |         } else {
150 |             write_preserved(self, f)
151 |         }
152 |     }
153 | }
154 | 


--------------------------------------------------------------------------------
/tests/turnt.toml:
--------------------------------------------------------------------------------
  1 | [envs.chop_oracle]
  2 | binary = true 
  3 | command = "odgi chop -i {filename} -c 3 -o - | odgi view -g -i - | slow_odgi norm --nl"
  4 | output.chop = "-"
  5 | 
  6 | [envs.chop_test]
  7 | binary = true
  8 | command = "slow_odgi chop {filename} -n 3"
  9 | output.chop = "-"
 10 | 
 11 | [envs.crush_oracle]
 12 | binary = true
 13 | command = "odgi crush -i {filename} -o - | odgi view -g -i - | slow_odgi norm"
 14 | output.crush = "-"
 15 | 
 16 | [envs.crush_test]
 17 | binary = true
 18 | command = "slow_odgi crush {filename}"
 19 | output.crush = "-"
 20 | 
 21 | [envs.degree_oracle]
 22 | binary = true
 23 | command = "odgi degree -d --input={filename}"
 24 | output.degree = "-"
 25 | 
 26 | [envs.degree_test]
 27 | binary = true
 28 | command = "slow_odgi degree {filename}"
 29 | output.degree = "-"
 30 | 
 31 | [envs.depth_setup]
 32 | binary = true
 33 | command = "slow_odgi somepaths --drop 50 {filename}"
 34 | output.depthpaths = "-"
 35 | 
 36 | [envs.depth_oracle]
 37 | binary = true
 38 | command = "odgi depth -d -i {filename} -s {base}.depthpaths"
 39 | output.depth = "-"
 40 | 
 41 | [envs.depth_test]
 42 | binary = true
 43 | command = "slow_odgi depth --paths {base}.depthpaths {filename}"
 44 | output.depth = "-"
 45 | 
 46 | [envs.flatten_oracle]
 47 | binary = true
 48 | command = "odgi flatten -i {filename} -f {base}.flatten.fasta -b {base}.flatten.bed; cat {base}.flatten.fasta; cat {base}.flatten.bed"
 49 | output.flatten = "-"
 50 | 
 51 | [envs.flatten_test]
 52 | binary = true
 53 | command = "slow_odgi flatten {filename}"
 54 | output.flatten = "-"
 55 | 
 56 | [envs.flip_oracle]
 57 | binary = true
 58 | command = "odgi flip -i {filename} -o - | odgi view -g -i - | slow_odgi norm"
 59 | output.flip = "-"
 60 | 
 61 | [envs.flip_test]
 62 | binary = true
 63 | command = "slow_odgi flip {filename}"
 64 | output.flip = "-"
 65 | 
 66 | [envs.inject_setup]
 67 | binary = true
 68 | command = "slow_odgi inject_setup < {filename}"
 69 | output.bed = "-"
 70 | 
 71 | [envs.inject_oracle]
 72 | binary = true
 73 | command = "odgi inject -i {filename} -b {base}.bed -o - | odgi view -g -i - | slow_odgi norm --nl"
 74 | output.inj = "-"
 75 | 
 76 | [envs.inject_test]
 77 | binary = true
 78 | command = "slow_odgi inject --bed {base}.bed {filename}"
 79 | output.inj = "-"
 80 | 
 81 | [envs.matrix_oracle]
 82 | binary = true
 83 | command = "odgi matrix -i {filename} | sort"
 84 | output.matrix = "-"
 85 | 
 86 | [envs.matrix_test]
 87 | binary = true
 88 | command = "slow_odgi matrix {filename} | sort"
 89 | output.matrix = "-"
 90 | 
 91 | [envs.norm_oracle]
 92 | binary = true
 93 | command = "odgi view -g -i {filename} | slow_odgi norm"
 94 | output.norm = "-"
 95 | 
 96 | [envs.norm_test]
 97 | binary = true
 98 | command = "slow_odgi norm {filename}"
 99 | output.norm = "-"
100 | 
101 | [envs.overlap_setup]
102 | binary = true
103 | command = "slow_odgi somepaths --drop 50 {filename}"
104 | output.overlappaths = "-"
105 | 
106 | [envs.overlap_oracle]
107 | binary = true
108 | command = "odgi overlap -i {filename} -R {base}.overlappaths"
109 | output.overlap = "-"
110 | 
111 | [envs.overlap_test]
112 | binary = true
113 | command = "slow_odgi overlap --paths {base}.overlappaths {filename}"
114 | output.overlap = "-"
115 | 
116 | [envs.paths_oracle]
117 | binary = true
118 | command = "odgi paths -i {filename} -L"
119 | output.paths = "-"
120 | 
121 | [envs.paths_test]
122 | binary = true
123 | command = "slow_odgi paths {filename}"
124 | output.paths = "-"
125 | 
126 | # Drop some links in the "real" input graphs to produce invalid graphs, in the
127 | # `invalid` subdirectory, that will yield interesting errors when running
128 | # validation.
129 | [envs.validate_setup]
130 | binary = true
131 | command = "slow_odgi validate_setup < {filename}"
132 | output.gfa = "-"
133 | out_dir = "invalid"
134 | 
135 | [envs.validate_oracle]
136 | binary = true
137 | command = "odgi validate -i {filename} 2>&1"
138 | output.validate = "-"
139 | 
140 | # An alternate version for graphs that are supposed to fail validation.
141 | [envs.validate_oracle_err]
142 | binary = true
143 | command = "odgi validate -i {filename} 2>&1"
144 | output.validate = "-"
145 | return_code = 1
146 | 
147 | [envs.validate_test]
148 | binary = true
149 | command = "slow_odgi validate {filename}"
150 | output.validate = "-"
151 | 
152 | [envs.pollen_data_gen_depth_oracle]
153 | binary = true
154 | command = "exine depth -d {filename} -a {filename}"
155 | output.json = "-"
156 | 
157 | [envs.pollen_data_gen_depth_test]
158 | binary = true
159 | command = "pollen_data_gen simple {filename} | jq .depth"
160 | output.json = "-"
161 | 
162 | [envs.flatgfa_mem]
163 | command = "../target/debug/fgfa < {filename}"
164 | output.gfa = "-"
165 | 
166 | [envs.flatgfa_file]
167 | command = "../target/debug/fgfa -o {base}.flatgfa < {filename} ; ../target/debug/fgfa -i {base}.flatgfa"
168 | output.gfa = "-"
169 | 
170 | [envs.flatgfa_file_inplace]
171 | command = "../target/debug/fgfa -m -p 128 -o {base}.inplace.flatgfa -I {filename} ; ../target/debug/fgfa -m -i {base}.inplace.flatgfa"
172 | output.gfa = "-"
173 | 
174 | [envs.odgi_depth]
175 | binary = true
176 | command = "odgi depth -d -i {filename}"
177 | output.depth = "-"
178 | 
179 | [envs.flatgfa_depth]
180 | command = "../target/debug/fgfa -I {filename} depth"
181 | output.depth = "-"
182 | 
183 | [envs.chop_oracle_fgfa]
184 | binary = true
185 | command = "odgi chop -i {filename} -c 3 -o - | odgi view -g -i - | slow_odgi norm"
186 | output.chop = "-"
187 | 
188 | [envs.flatgfa_chop]
189 | command = "../target/debug/fgfa -I {filename} chop -l -c 3 | slow_odgi norm"
190 | output.chop = "-"
191 | 
192 | [envs.odgi_extract]
193 | binary = true
194 | command = "odgi extract -i {filename} -n 3 -c 3 -o - | odgi view -g -i - | slow_odgi norm"
195 | output.extract = "-"
196 | 
197 | [envs.flatgfa_extract]
198 | command = "../target/debug/fgfa -I {filename} extract -n 3 -c 3 | slow_odgi norm"
199 | output.extract = "-"
200 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/chop.rs:
--------------------------------------------------------------------------------
  1 | use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment};
  2 | use crate::pool::{Id, Span, Store};
  3 | use crate::{GFAStore, HeapFamily};
  4 | /// Chop segments of `gfa` longer than `max_size` into pieces of at most `max_size`, renaming segments from 1 and rewriting paths (and links, if `incl_links`). Panics if `max_size` is 0.
  5 | pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgfa::HeapGFAStore {
  6 |     let mut flat = flatgfa::HeapGFAStore::default();
  7 |     assert!(max_size > 0, "chop: max_size must be positive"); // a zero size would never advance the chopping loop
  8 |     // when segment S is chopped into segments S1 through S2 (exclusive),
  9 |     // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1
 10 |     let mut seg_map: Vec<Span<Segment>> = Vec::new();
 11 |     // The smallest id (>0) which does not already belong to a segment in `flat`
 12 |     let mut max_node_id = 1;
 13 | 
 14 |     fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span<Segment>) {
 15 |         // Link segments spanned by `span` from head to tail
 16 |         let overlap = Span::new_empty();
 17 |         flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link {
 18 |             from: Handle::new(Id::new(idx), Orientation::Forward),
 19 |             to: Handle::new(Id::new(idx + 1), Orientation::Forward),
 20 |             overlap,
 21 |         }));
 22 |     }
 23 | 
 24 |     // Add new, chopped segments
 25 |     for seg in gfa.segs.all().iter() {
 26 |         let len = seg.len();
 27 |         if len <= max_size {
 28 |             // Leave the segment as is
 29 |             let id = flat.segs.add(Segment {
 30 |                 name: max_node_id,
 31 |                 seq: seg.seq,
 32 |                 optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped?
 33 |             });
 34 |             max_node_id += 1;
 35 |             seg_map.push(Span::new(id, flat.segs.next_id()));
 36 |         } else {
 37 |             let seq_end = seg.seq.end;
 38 |             let mut offset = seg.seq.start.index();
 39 |             let segs_start = flat.segs.next_id();
 40 |             // Could also generate end_id by setting it equal to the start_id and
 41 |             // updating it for each segment that is added - only benefits us if we
 42 |             // don't unroll the last iteration of this loop
 43 |             while offset < seq_end.index() - max_size {
 44 |                 // Generate a new segment of length c
 45 |                 flat.segs.add(Segment {
 46 |                     name: max_node_id,
 47 |                     seq: Span::new(Id::new(offset), Id::new(offset + max_size)),
 48 |                     optional: Span::new_empty(),
 49 |                 });
 50 |                 offset += max_size;
 51 |                 max_node_id += 1;
 52 |             }
 53 |             // Generate the last segment
 54 |             flat.segs.add(Segment {
 55 |                 name: max_node_id,
 56 |                 seq: Span::new(Id::new(offset), seq_end),
 57 |                 optional: Span::new_empty(),
 58 |             });
 59 |             max_node_id += 1;
 60 |             let new_seg_span = Span::new(segs_start, flat.segs.next_id());
 61 |             seg_map.push(new_seg_span);
 62 |             if incl_links {
 63 |                 link_forward(&mut flat, &new_seg_span);
 64 |             }
 65 |         }
 66 |     }
 67 | 
 68 |     // For each path, add updated handles. Then add the updated path
 69 |     for path in gfa.paths.all().iter() {
 70 |         let path_start = flat.steps.next_id();
 71 |         let mut path_end = flat.steps.next_id();
 72 |         // Generate the new handles
 73 |         // Tentative to-do: see if it is faster to read Id from segs than to re-generate it?
 74 |         for step in gfa.get_path_steps(path) {
 75 |             let range = {
 76 |                 let span = seg_map[step.segment().index()];
 77 |                 std::ops::Range::from(span)
 78 |             };
 79 |             match step.orient() {
 80 |                 Orientation::Forward => {
 81 |                     // In this builder, Id.index() == seg.name - 1 for all seg
 82 |                     path_end = flat
 83 |                         .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward)))
 84 |                         .end;
 85 |                 }
 86 |                 Orientation::Backward => {
 87 |                     path_end = flat
 88 |                         .add_steps(
 89 |                             range
 90 |                                 .rev()
 91 |                                 .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)),
 92 |                         )
 93 |                         .end;
 94 |                 }
 95 |             }
 96 |         }
 97 | 
 98 |         // Add the updated path
 99 |         flat.paths.add(Path {
100 |             name: path.name,
101 |             steps: Span::new(path_start, path_end),
102 |             overlaps: Span::new_empty(),
103 |         });
104 |     }
105 | 
106 |     // If the 'l' flag is specified, compute the links in the new graph
107 |     if incl_links {
108 |         // For each link in the old graph, from handle A -> B:
109 |         //      Add a link from
110 |         //          (A.forward ? (A.end, forward) : (A.begin, backwards))
111 |         //          -> (B.forward ? (B.begin, forward) : (B.end ? backwards))
112 | 
113 |         for link in gfa.links.all().iter() {
114 |             let new_from = {
115 |                 let old_from = link.from;
116 |                 let chopped_segs = seg_map[old_from.segment().index()];
117 |                 let seg_id = match old_from.orient() {
118 |                     Orientation::Forward => chopped_segs.end - 1,
119 |                     Orientation::Backward => chopped_segs.start,
120 |                 };
121 |                 seg_id.handle(old_from.orient())
122 |             };
123 |             let new_to = {
124 |                 let old_to = link.to;
125 |                 let chopped_segs = seg_map[old_to.segment().index()];
126 |                 let seg_id = match old_to.orient() {
127 |                     Orientation::Forward => chopped_segs.start,
128 |                     Orientation::Backward => chopped_segs.end - 1,
129 |                 };
130 |                 seg_id.handle(old_to.orient())
131 |             };
132 |             flat.add_link(new_from, new_to, vec![]);
133 |         }
134 |     }
135 | 
136 |     flat
137 | }
138 | 


--------------------------------------------------------------------------------
/pollen_py/README.md:
--------------------------------------------------------------------------------
  1 | # Proof-of-Concept Pollen Hardware Generator
  2 | 
  3 | This directory contains a proof-of-concept hardware accelerator generator for a simple GFA query. This section contains some guides for trying out this generator.
  4 | 
  5 | ### The Docker Image
  6 | 
  7 | Running the hardware generator is easy if you use our [Docker image][package]:
  8 | 
  9 |     docker run -it --rm ghcr.io/cucapra/pollen:latest
 10 | 
 11 | If you prefer to install locally, we point you to the somewhat more involved instructions [below](#installing-locally).
 12 | 
 13 | ### Generating an Accelerator: Quick
 14 | 
 15 | If you want to compute the [depth][] of all the nodes in the graph, the following command will generate and run a node depth accelerator:
 16 | ```
 17 | exine depth -a -r <filename.og>
 18 | ```
 19 | 
 20 | This will automatically generate a node depth accelerator whose dimensions match the input data, compute the node depth, and remove the accelerator once the computation is done.
 21 | 
 22 | To save the files generated from the previous command in `<path>`, use the `--tmp-dir` flag:
 23 | ```
 24 | exine depth -a -r <filename.og> --tmpdir <path>
 25 | ```
 26 | The node depth accelerator will be saved at `<path>/<filename.futil>` and the input data will be saved at `<path>/<filename.data>`.
 27 | 
 28 | ### Generating an Accelerator: Full Walkthrough
 29 | 
 30 | Take [depth][] as an example. To generate and run a node depth accelerator for the graph `k.og`, first navigate to the root directory of this repository. Then run
 31 | ```
 32 | make fetch
 33 | make test/k.og
 34 | exine depth -o depth.futil
 35 | exine depth -d test/k.og -o depth.data
 36 | exine depth -r depth.data --accelerator depth.futil
 37 | ```
 38 | 
 39 | What just happened? Below, we walk through the five commands we issued above, pointing out the other options that we could have used.
 40 | 
 41 | First, `make fetch` downloads some [GFA][] data files into the `./test` directory.
 42 | 
 43 | Second, `make test/*.og` builds the odgi graph files from those GFA files.
 44 | 
 45 | Third, we generate the hardware accelerator and write it to a file named `depth.futil`. The commands to generate a node depth hardware accelerator in [Calyx][] include:
 46 | 
 47 | 1. `exine depth -o depth.futil`
 48 | 2. `exine depth -a <filename.og> -o depth.futil`
 49 | 3. `exine depth -n=MAX_NODES -e=MAX_STEPS -p=MAX_PATHS -o depth.futil`
 50 | 
 51 | The commands use the hardware parameters as follows:
 52 | 1. Uses default hardware parameters.
 53 | 2. Automatically infers the hardware parameters from a `.og` file.
 54 | 3. Takes the hardware parameters as input.
 55 | 
 56 | Parameters that are specified manually take precedence over those that are inferred automatically, and it is legal to specify just a subset of parameters. For example, `exine depth -a test/k.og -n=1` will infer `MAX_STEPS` and `MAX_PATHS` from `test/k.og`, but the resulting accelerator will only handle one node.
 57 | 
 58 | Fourth, we need to generate some input from our odgi file. This is what we will feed to the hardware accelerator. The following variations all accomplish this:
 59 | 
 60 | 1. `exine depth -d <filename.og> -o depth.data`
 61 | 2. `exine depth -d <filename.og> -a <filename2.og> -o depth.data`
 62 | 3. `exine depth -d <filename.og> -n=MAX_NODES -e=MAX_STEPS -p=MAX_PATHS -o depth.data`
 63 | 4. `exine depth -d <filename.og> -a -o depth.data`
 64 | 
The flags work as before, except that if no argument is passed to the `-a` flag, the dimensions are inferred from the input file. **The dimensions of the input must be the same as those of the hardware accelerator.**
 66 | 
 67 | Fifth, we run our hardware accelerator. The following code simulates the Calyx code for the hardware accelerator and outputs the node depth table:
 68 | 
 69 | ```
 70 | exine depth -r depth.data -x depth.futil
 71 | ```
 72 | 
 73 | ### Installing Locally
 74 | 
You will need [Flit][] version 3.7.1 and [Turnt][] version 1.11.0.
 76 | We will guide you through the installation of our major dependencies, [Calyx][] and [odgi][], and then show you how to install Pollen itself.
 77 | 
 78 | #### Calyx
 79 | 
 80 | Below we show you how to build Calyx from source and set it up for our use.
 81 | If you are curious, this tracks the "[installing from source][calyx-install-src]" and "[installing the command-line driver][calyx-install-fud]" sections of the Calyx documentation.
 82 | 
 83 | 1. `git clone https://github.com/cucapra/calyx.git`
 84 | 2. `cd calyx`
 85 | 3. `cargo build`
4. `flit -f fud/pyproject.toml install -s --deps production`
5. `fud config --create global.root $(pwd)`
6. `cargo build -p interp`
7. `fud config stages.calyx.exec $(pwd)/target/debug/calyx`
8. `fud config stages.interpreter.exec $(pwd)/target/debug/interp`
9. `flit -f calyx-py/pyproject.toml install -s`
10. `fud check`
 93 | 
 94 | You will be warned that `synth-verilog` and `vivado-hls` were not installed correctly; this is fine for our purposes.
 95 | 
 96 | #### Odgi
 97 | 
 98 | We recommend that you build odgi from source, as described [here][odgi-from-source].
 99 | To check that this worked, run `odgi` from the command line.
100 | 
101 | Some parts of Pollen presently use odgi's Python bindings.
102 | You will need to edit your PYTHONPATH, as explained [here][odgi-pythonpath], to enable this.
103 | To verify that this worked, open up a Python shell and try `import odgi`.
104 | If it succeeds quietly, great!
105 | If it segfaults, try the preload step explained [here][odgi-preload].
106 | 
107 | #### Pollen
108 | 
109 | Clone this repository:
110 | 
111 |     git clone https://github.com/cucapra/pollen.git
112 | 
113 | And then install the Python tools using [uv][]:
114 | 
115 |     $ uv sync
116 |     $ source .venv/bin/activate
117 | 
118 | [calyx]: https://calyxir.org
119 | [odgi]: https://odgi.readthedocs.io/en/latest/
120 | [flit]: https://flit.pypa.io/en/stable/
121 | [turnt]: https://github.com/cucapra/turnt
122 | [calyx-install-src]: https://docs.calyxir.org/#installing-from-source-to-use-and-extend-calyx
123 | [calyx-install-fud]: https://docs.calyxir.org/#installing-the-command-line-driver
124 | [package]: https://github.com/cucapra/pollen/pkgs/container/pollen
125 | [odgi-from-source]: https://odgi.readthedocs.io/en/latest/rst/installation.html#building-from-source
126 | [odgi-pythonpath]: https://odgi.readthedocs.io/en/latest/rst/binding/usage.html
127 | [odgi-preload]: https://odgi.readthedocs.io/en/latest/rst/binding/usage.html#optimise
128 | [depth]: https://pangenome.github.io/odgi.github.io/rst/commands/odgi_depth.html
129 | [gfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-reference-gfa-rgfa-format
130 | [uv]: https://github.com/astral-sh/uv
131 | 


--------------------------------------------------------------------------------
/pollen_data_gen/pollen_data_gen/simple.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from typing import Dict, Union, Optional, Any, List, Sequence, TextIO
  3 | from io import TextIOWrapper
  4 | from json import JSONEncoder
  5 | import mygfa
  6 | from . import depth
  7 | 
  8 | 
# Return type annotation used by GenericSimpleEncoder.default below.
SimpleType = Optional[
    Union[str, Dict[str, Sequence[object]], List[int], List[Union[int, str]]]
]
# A SimpleType is a type that can be serialized by the JSON encoder below.
# It's a bit of a hack, but it works.

# Maps each sequence character to the number that represents it in the JSON.
char_to_number = {"A": 1, "T": 2, "G": 3, "C": 4, "N": 5}
# The inverse lookup: number back to sequence character.
number_to_char = {v: k for k, v in char_to_number.items()}
 17 | 
 18 | 
 19 | def strand_to_number_list(strand: mygfa.Strand) -> List[int]:
 20 |     """Converts a strand to a list of numbers following the mapping above.
 21 |     For instance, "AGGA" is converted to [1,3,3,1].
 22 |     """
 23 |     return [char_to_number[c] for c in str(strand)]
 24 | 
 25 | 
 26 | def number_list_to_strand(numbers: List[str]) -> mygfa.Strand:
 27 |     """Converts a list of numbers to a strand following the mapping above.
 28 |     For instance, [1,3,3,1] is converted to "AGGA"."""
 29 |     return mygfa.Strand("".join([number_to_char[int(number)] for number in numbers]))
 30 | 
 31 | 
 32 | def path_seq_to_number_list(path: str) -> List[int]:
 33 |     """Converts a path's segment sequence into a list of numbers.
 34 |     Every + becomes 0 and - becomes 1.
 35 |     For instance, "1+,2-,14+" is converted to [1,0,2,1,14,0].
 36 |     The 1 at the 4th cell will not be confused for a node called "1" because
 37 |     it is at an even index.
 38 | 
 39 |     In the future, once we know more about our consumer, it is very likely that
 40 |     we will want to unzip this list into two lists:
 41 |      - one of node names
 42 |      - one of directions of traversal.
 43 |     This is because the direction of traversal can be stored in 1 bit each,
 44 |     whereas the node names will take log_2{max_steps} bits each.
 45 |     """
 46 |     ans = []
 47 |     for chunk in path.split(","):
 48 |         num, orient = chunk[:-1], chunk[-1]
 49 |         ans.append(int(num))
 50 |         if orient == "+":
 51 |             ans.append(0)
 52 |         else:
 53 |             ans.append(1)
 54 | 
 55 |     return ans
 56 | 
 57 | 
 58 | def number_list_to_path_seq(numbers: List[int]) -> str:
 59 |     """The inverse of the above function."""
 60 |     ans = []
 61 |     for i, number in enumerate(numbers):
 62 |         if i % 2:
 63 |             if number == 0:
 64 |                 ans.append("+,")
 65 |             elif number == 1:
 66 |                 ans.append("-,")
 67 |         else:
 68 |             ans.append(str(number))
 69 | 
 70 |     # Need to drop the last comma.
 71 |     return "".join(ans)[:-1]
 72 | 
 73 | 
 74 | def align_to_str(align: mygfa.Alignment) -> str:
 75 |     """Placeholder until we have reason to do anything cleverer."""
 76 |     return str(align)
 77 | 
 78 | 
 79 | def str_to_align(align_str: str) -> mygfa.Alignment:
 80 |     """Placeholder until we have reason to do anything cleverer."""
 81 |     return mygfa.Alignment.parse(align_str)
 82 | 
 83 | 
 84 | def link_to_number_list(link: mygfa.Link) -> List[Union[int, str]]:
 85 |     """Converts a Link object to a list of four numbers and a string.
 86 |     As before, every + becomes 0 and - becomes 1."""
 87 |     return [
 88 |         int(link.from_.name),
 89 |         0 if link.from_.ori else 1,
 90 |         int(link.to_.name),
 91 |         0 if link.to_.ori else 1,
 92 |         align_to_str(link.overlap),
 93 |     ]
 94 | 
 95 | 
 96 | def number_list_to_link(link_json: List[Union[int, str]]) -> mygfa.Link:
 97 |     """The inverse of the above function."""
 98 |     return mygfa.Link(
 99 |         mygfa.Handle(str(link_json[0]), link_json[1] == 0),
100 |         mygfa.Handle(str(link_json[2]), link_json[3] == 0),
101 |         str_to_align(str(link_json[4])),
102 |     )
103 | 
104 | 
class GenericSimpleEncoder(JSONEncoder):
    """JSON encoder that simplifies the components of a mygfa graph."""

    def default(self, o: Any) -> SimpleType:
        """Return the simplified form of `o`, chosen by its mygfa type.

        Paths become a dict of numeric segments plus the raw overlaps field,
        links and segments become number lists, headers are returned as-is,
        and any other object yields None.
        """
        if isinstance(o, mygfa.Path):
            # str(o) is a tab-separated line: cell 0 is just 'P' and cell 1 is
            # the path's name, so only cells 2 and 3 are kept. The overlaps
            # are passed through untouched for now.
            fields = str(o).split("\t")
            segments = path_seq_to_number_list(fields[2])
            return {"segments": segments, "overlaps": fields[3]}

        if isinstance(o, mygfa.Link):
            return link_to_number_list(o)

        if isinstance(o, mygfa.Header):
            # NOTE(review): this hands the header back unchanged; confirm that
            # mygfa.Header is something the JSON encoder can serialize directly.
            return o

        if isinstance(o, mygfa.Segment):
            return strand_to_number_list(o.seq)

        return None
122 | 
123 | 
def dump(
    graph: mygfa.Graph,
    json_file: Union[TextIO, TextIOWrapper],
    max_n: Optional[int],
    max_e: Optional[int],
    max_p: Optional[int],
    subset_paths: Optional[List[str]] = None,
) -> None:
    """Write the graph to `json_file` as JSON with two top-level sections.

    "basic" holds the graph itself (headers, one `seg_to_seq_<name>` entry per
    segment, links, and one `path_details_<name>` entry per path), encoded
    with `GenericSimpleEncoder`. "depth" holds the precomputed node-depth data
    from `depth.depth_json`, to which `max_n`, `max_e`, `max_p` and
    `subset_paths` are forwarded unchanged.
    """

    # Assemble the flat dictionary describing the graph, in the same key order
    # as the sections are listed above.
    flat_graph = {"headers": graph.headers}
    flat_graph.update(
        {f"seg_to_seq_{name}": seg for name, seg in graph.segments.items()}
    )
    flat_graph["links"] = graph.links
    flat_graph.update(
        {f"path_details_{name}": path for name, path in graph.paths.items()}
    )

    basic_encoding = GenericSimpleEncoder().encode(flat_graph)
    depth_encoding = depth.depth_json(graph, max_n, max_e, max_p, subset_paths)

    # Both encodings are JSON strings; decode them so they nest as objects
    # rather than as quoted strings in the combined document.
    combined = {
        "basic": json.loads(basic_encoding),
        "depth": json.loads(depth_encoding),
    }
    json.dump(combined, json_file, indent=2, sort_keys=True)
154 | 
155 | 
def parse(file: TextIO) -> mygfa.Graph:
    """Reads a JSON file and returns a mygfa.Graph object.

    Only the "basic" section of the document is used. Segments are recovered
    from the `seg_to_seq_<name>` keys and paths from the `path_details_<name>`
    keys; headers and links come from the "headers" and "links" entries.

    The name is taken as everything after the key prefix, so segment and path
    names that themselves contain underscores are recovered in full.
    """
    seg_prefix = "seg_to_seq_"
    path_prefix = "path_details_"

    graph = json.load(file)["basic"]
    graph_gfa = mygfa.Graph(
        [mygfa.Header.parse(h) for h in graph["headers"]],
        {
            k[len(seg_prefix) :]: mygfa.Segment(
                k[len(seg_prefix) :], number_list_to_strand(v)
            )
            for k, v in graph.items()
            if k.startswith(seg_prefix)
        },
        [number_list_to_link(link) for link in graph["links"]],
        {
            k[len(path_prefix) :]: mygfa.Path.parse_inner(
                k[len(path_prefix) :],
                number_list_to_path_seq(v["segments"]),
                v["overlaps"],
            )
            for k, v in graph.items()
            if k.startswith(path_prefix)
        },
    )
    # graph_gfa.emit(sys.stdout)  # Good for debugging.
    return graph_gfa
177 | 
178 | 
def roundtrip_test(graph: mygfa.Graph) -> None:
    """Check that dumping the graph and parsing it back yields an equal graph.

    The JSON is written to `roundtrip_test.json` in the current directory and
    then read back from the same file; an AssertionError is raised when the
    reloaded graph differs from the input.
    """
    scratch_path = "roundtrip_test.json"
    with open(scratch_path, "w", encoding="utf-8") as sink:
        dump(graph, sink, None, None, None)
    with open(scratch_path, "r", encoding="utf-8") as source:
        reloaded = parse(source)
    assert reloaded == graph
185 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/depth/processing-elements/parse_data.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This file converts an odgi graph to numerical JSON data that can be used by the prototype calyx hardware simulator. This means it only generates data for the first node in a graph.
  3 | """
  4 | 
  5 | import sys
  6 | import argparse
  7 | import json
  8 | import odgi
  9 | 
 10 | # Defaults for the maximum possible number of nodes, steps per node, and paths to consider
 11 | MAX_STEPS = 15
 12 | MAX_PATHS = 15
 13 | 
 14 | 
 15 | def parse_steps_on_nodes(
 16 |     graph, path_name_to_id, max_steps=MAX_STEPS, max_paths=MAX_PATHS
 17 | ):
 18 |     """
 19 |     Generate input data containing the path ids for each step on the min node in the graph, e.g..
 20 |     {path_ids:
 21 |         "data": [0, 1, 1, 2],
 22 |             "format": {
 23 |                 "numeric_type": "bitnum",
 24 |                 "is_signed": False,
 25 |                 "width": 2
 26 |             }
 27 |     }
 28 |     """
 29 | 
 30 |     data = {}
 31 |     node_id = graph.min_node_id()
 32 |     node_h = graph.get_handle(node_id)
 33 | 
 34 |     """
 35 |     Get a list of path ids for each step on node_h.
 36 |     """
 37 | 
 38 |     # Check that the number of steps on the node does not exceed max_steps
 39 |     if graph.get_step_count(node_h) > max_steps:
 40 |         raise Exception(
 41 |             f"The number of paths in the graph exceeds the maximum number of paths the hardware can process. {graph.get_step_count(node_h)} > {max_steps}. Hint: try setting the maximum number of steps manually using the -e flag."
 42 |         )
 43 | 
 44 |     path_ids = []
 45 | 
 46 |     def parse_step(step_h):
 47 |         path_h = graph.get_path(step_h)
 48 |         path_id = path_name_to_id[graph.get_path_name(path_h)]
 49 |         path_ids.append(path_id + 1)
 50 | 
 51 |     graph.for_each_step_on_handle(node_h, parse_step)
 52 | 
 53 |     # Pad path_ids with 0s
 54 |     path_ids = path_ids + [0] * (max_steps - len(path_ids))
 55 | 
 56 |     # 'path_ids{id}' is the list of path ids for each step crossing node {id}
 57 |     width = max_paths.bit_length()
 58 |     data[f"path_ids"] = {
 59 |         "data": path_ids,
 60 |         "format": {"numeric_type": "bitnum", "is_signed": False, "width": width},
 61 |     }
 62 | 
 63 |     return data
 64 | 
 65 | 
 66 | def parse_paths_file(filename, path_to_id, max_paths=MAX_PATHS):
 67 |     """
 68 |     Return paths_to_consider, a list of length max_paths, where
 69 |     paths_to_consider[i] is 1 if i is a path id and we include path i in our
 70 |     calculations of node depth
 71 |     """
 72 | 
 73 |     if filename is None:  # Return the default value
 74 |         paths_to_consider = [1] * (max_paths + 1)
 75 |         paths_to_consider[0] = 0
 76 |         return paths_to_consider
 77 | 
 78 |     with open(filename, "r") as paths_file:
 79 |         text = paths_file.read()
 80 |         paths = text.splitlines()
 81 | 
 82 |     paths_to_consider = [0] * (max_paths + 1)
 83 | 
 84 |     for path_name in paths:
 85 |         path_id = path_name_to_id[path_name]
 86 |         paths_to_consider[path_id] = 1
 87 | 
 88 |     return paths_to_consider
 89 | 
 90 | 
 91 | def get_maxes(filename):
 92 |     graph = odgi.graph()
 93 |     graph.load(filename)
 94 | 
 95 |     max_steps = 0
 96 |     max_paths = graph.get_path_count()
 97 | 
 98 |     def update_max_steps(node_h):
 99 |         nonlocal max_steps
100 |         num_steps = graph.get_step_count(node_h)
101 |         if num_steps > max_steps:
102 |             max_steps = num_steps
103 | 
104 |     graph.for_each_handle(update_max_steps)
105 | 
106 |     return max_steps, max_paths
107 | 
108 | 
if __name__ == "__main__":
    # Command-line entry point: load an odgi graph, build the JSON input data
    # for the hardware simulator, and write it to a file or to stdout.

    # Parse commandline arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "filename",
        help="A .og file representing a pangenome whose node depth we want to calculate",
    )
    parser.add_argument(
        "-a",
        "--auto-size",
        action="store_true",
        help="Automically infer hardware dimensions from a given odgi graph",
    )
    parser.add_argument(
        "-s",
        "--subset-paths",
        help="Specify a file containing a subset of all paths in the graph. See the odgi documentation for more details.",
    )
    parser.add_argument(
        "-e",
        "--max-steps",
        type=int,
        default=MAX_STEPS,
        help="Specify the maximum number of steps per node that the hardware can support.",
    )
    parser.add_argument(
        "-p",
        "--max-paths",
        type=int,
        default=MAX_PATHS,
        help="Specify the maximum number of paths that the hardware can support.",
    )
    parser.add_argument(
        "-o",
        "--out",
        help="Specify the output file. If not specified, will dump to stdout.",
    )
    args = parser.parse_args()

    graph = odgi.graph()
    graph.load(args.filename)

    # Either infer the hardware dimensions from the graph or take them from
    # the command line (which falls back to the module defaults).
    if args.auto_size:
        max_steps, max_paths = get_maxes(args.filename)
    else:
        max_steps, max_paths = args.max_steps, args.max_paths

    # Check that the number of paths on the graph does not exceed max_paths
    if graph.get_path_count() > max_paths:
        raise Exception(
            f"The number of paths in the graph exceeds the maximum number of paths the hardware can process. {graph.get_path_count()} > {args.max_paths}. Hint: try setting the maximum number of paths manually using the -p flag"
        )

    # Assign a path_id to each path; the path_ids are not accessible using the
    # default python bindings for odgi

    # Obtain a list of path names; a path's index is its id
    paths = []
    graph.for_each_path_handle(lambda h: paths.append(graph.get_path_name(h)))

    # Path name -> path id
    path_name_to_id = {path: count for count, path in enumerate(paths)}

    paths_to_consider = parse_paths_file(args.subset_paths, path_name_to_id, max_paths)

    data = parse_steps_on_nodes(graph, path_name_to_id, max_steps, max_paths)

    # One bit per possible path id, marking the paths to include.
    data["paths_to_consider"] = {
        "data": paths_to_consider,
        "format": {"numeric_type": "bitnum", "is_signed": False, "width": 1},
    }

    # Zero-initialized, one bit per possible path id.
    data["paths_on_node"] = {
        "data": [0] * (max_paths + 1),
        "format": {"numeric_type": "bitnum", "is_signed": False, "width": 1},
    }

    # Zero-initialized single cell, wide enough to hold max_steps.
    data["depth_output"] = {
        "data": [0],
        "format": {
            "numeric_type": "bitnum",
            "is_signed": False,
            "width": max_steps.bit_length(),
        },
    }

    # Zero-initialized single cell, wide enough to hold max_paths.
    data["uniq_output"] = {
        "data": [0],
        "format": {
            "numeric_type": "bitnum",
            "is_signed": False,
            "width": max_paths.bit_length(),
        },
    }

    if args.out:
        with open(args.out, "w") as out_file:
            json.dump(data, out_file, indent=2, sort_keys=True)
    else:
        json.dump(data, sys.stdout, indent=2, sort_keys=True)
209 | 


--------------------------------------------------------------------------------
/bench/graphs.toml:
--------------------------------------------------------------------------------
 1 | # From: https://github.com/AndreaGuarracino/1000G-ONT-F100-PGGB/blob/master/data/1000G-ONT-F100-PGGB.gfa.urls.tsv
 2 | [1000gont]
 3 | chr1 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr1.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 4 | chr2 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr2.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 5 | chr3 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr3.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 6 | chr4 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr4.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 7 | chr5 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr5.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 8 | chr6 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr6.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
 9 | chr7 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr7.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
10 | chr8 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr8.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
11 | chr9 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr9.30kbp.fa.gz.445f03b.e34d4cd.b691e61.smooth.final.gfa.zst"
12 | chr10 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr10.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
13 | chr11 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr11.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
14 | chr12 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr12.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
15 | chr13 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr13.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
16 | chr14 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr14.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
17 | chr15 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr15.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
18 | chr16 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr16.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
19 | chr17 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr17.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
20 | chr18 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr18.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
21 | chr19 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr19.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
22 | chr20 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr20.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
23 | chr21 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr21.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
24 | chr22 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr22.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst"
25 | chrX = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrX.30kbp.fa.gz.a8a102b.eb0f3d3.a58faa8.smooth.final.gfa.zst"
26 | chrY = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrY.30kbp.fa.gz.a8a102b.eb0f3d3.0713820.smooth.final.gfa.zst"
27 | 
28 | # From: https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=pangenomes/freeze/freeze1/pggb/chroms/
29 | [hprc]
30 | chrY = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrY.hprc-v1.0-pggb.gfa.gz"
31 | chr1 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr1.hprc-v1.0-pggb.gfa.gz"
32 | chr10 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr10.hprc-v1.0-pggb.gfa.gz"
33 | chr11 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr11.hprc-v1.0-pggb.gfa.gz"
34 | chr12 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr12.hprc-v1.0-pggb.gfa.gz"
35 | chr13 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr13.hprc-v1.0-pggb.gfa.gz"
36 | chr14 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr14.hprc-v1.0-pggb.gfa.gz"
37 | chr15 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr15.hprc-v1.0-pggb.gfa.gz"
38 | chr16 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr16.hprc-v1.0-pggb.gfa.gz"
39 | chr17 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr17.hprc-v1.0-pggb.gfa.gz"
40 | chr18 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr18.hprc-v1.0-pggb.gfa.gz"
41 | chr19 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr19.hprc-v1.0-pggb.gfa.gz"
42 | chr2 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr2.hprc-v1.0-pggb.gfa.gz"
43 | chr20 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr20.hprc-v1.0-pggb.gfa.gz"
44 | chr21 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr21.hprc-v1.0-pggb.gfa.gz"
45 | chr22 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr22.hprc-v1.0-pggb.gfa.gz"
46 | chr3 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr3.hprc-v1.0-pggb.gfa.gz"
47 | chr4 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr4.hprc-v1.0-pggb.gfa.gz"
48 | chr5 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr5.hprc-v1.0-pggb.gfa.gz"
49 | chr6 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr6.hprc-v1.0-pggb.gfa.gz"
50 | chr7 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr7.hprc-v1.0-pggb.gfa.gz"
51 | chr8 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr8.hprc-v1.0-pggb.gfa.gz"
52 | chr9 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr9.hprc-v1.0-pggb.gfa.gz"
53 | chrM = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrM.hprc-v1.0-pggb.gfa.gz"
54 | chrX = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz"
55 | 
56 | # Small tests from odgi:
57 | # https://github.com/pangenome/odgi/tree/master/test
58 | [test]
59 | k = "https://raw.githubusercontent.com/pangenome/odgi/master/test/k.gfa"
60 | lpa = "https://raw.githubusercontent.com/pangenome/odgi/master/test/LPA.gfa"
61 | chr6c4 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/chr6.C4.gfa"
62 | drb1 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/DRB1-3123.gfa"
63 | 


--------------------------------------------------------------------------------
/slow_odgi/slow_odgi/__main__.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | import io
  4 | from typing import Dict, Tuple, List
  5 | from collections.abc import Callable
  6 | import mygfa
  7 | 
  8 | from . import (
  9 |     chop,
 10 |     crush,
 11 |     degree,
 12 |     depth,
 13 |     flatten,
 14 |     flip,
 15 |     inject,
 16 |     matrix,
 17 |     overlap,
 18 |     paths,
 19 |     proofs,
 20 |     validate,
 21 |     norm,
 22 |     inject_setup,
 23 |     somepaths,
 24 |     validate_setup,
 25 | )
 26 | 
 27 | 
 28 | def parse_args() -> Tuple[argparse.ArgumentParser, argparse.Namespace]:
 29 |     """Build the slow-odgi CLI parser, parse sys.argv, and return (parser, args)."""
 30 |     parser = argparse.ArgumentParser()
 31 | 
 32 |     subparsers = parser.add_subparsers(
 33 |         title="slow-odgi commands", metavar="COMMAND", dest="command"
 34 |     )
 35 | 
 36 |     chop_parser = subparsers.add_parser(
 37 |         "chop",
 38 |         help="Shortens segments' sequences to a given maximum length.",
 39 |     )
 40 |     chop_parser.add_argument(
 41 |         "-n",
 42 |         nargs="?",
 43 |         const="d",  # NOTE(review): a bare "-n" stores "d", which dispatch's int(args.n) rejects.
 44 |         help="The max segment size desired after chopping.",
 45 |         required=True,
 46 |     )
 47 | 
 48 |     subparsers.add_parser(
 49 |         "crush",
 50 |         help="Replaces consecutive instances of `N` with a single `N`.",
 51 |     )
 52 | 
 53 |     subparsers.add_parser(
 54 |         "degree", help="Generates a table summarizing each segment's degree."
 55 |     )
 56 | 
 57 |     depth_parser = subparsers.add_parser(
 58 |         "depth", help="Generates a table summarizing each segment's depth."
 59 |     )
 60 |     depth_parser.add_argument(
 61 |         "--paths",
 62 |         help="A file describing the paths you wish to query.",
 63 |         required=False,
 64 |     )
 65 | 
 66 |     subparsers.add_parser(
 67 |         "flatten",
 68 |         help="Converts the graph into FASTA + BED representation.",
 69 |     )
 70 | 
 71 |     subparsers.add_parser(
 72 |         "flip",
 73 |         help="Flips any paths that step more backward than forward.",
 74 |     )
 75 | 
 76 |     inject_parser = subparsers.add_parser(
 77 |         "inject", help="Adds new paths, as specified, to the graph."
 78 |     )
 79 |     inject_parser.add_argument(
 80 |         "--bed",
 81 |         nargs="?",
 82 |         help="A BED file describing the paths you wish to insert.",
 83 |         required=True,
 84 |     )
 85 | 
 86 |     subparsers.add_parser("matrix", help="Represents the graph as a matrix.")
 87 | 
 88 |     overlap_parser = subparsers.add_parser(
 89 |         "overlap",
 90 |         help="Queries the graph about which paths overlap with which other paths.",
 91 |     )
 92 |     overlap_parser.add_argument(
 93 |         "--paths",
 94 |         nargs="?",
 95 |         help="A file describing the paths you wish to query.",
 96 |         required=True,
 97 |     )
 98 | 
 99 |     subparsers.add_parser("paths", help="Lists the paths in the graph.")
100 | 
101 |     somepaths_parser = subparsers.add_parser(
102 |         "somepaths",
103 |         help="Lists the paths in the graph, with the option of dropping some.",
104 |     )
105 |     somepaths_parser.add_argument(
106 |         "--drop",
107 |         type=int,
108 |         default=0,
109 |         help="Randomly drop a percentage of the paths.",
110 |         metavar="PCT",
111 |     )
112 | 
113 |     subparsers.add_parser(
114 |         "validate",
115 |         help="Checks whether the links of the graph support its paths.",
116 |     )
117 | 
118 |     norm_parser = subparsers.add_parser(
119 |         "norm",
120 |         help="Print a graph unmodified, normalizing its representation.",
121 |     )
122 |     norm_parser.add_argument(
123 |         "--nl",
124 |         action="store_true",
125 |         help="Don't include links.",
126 |     )
127 | 
128 |     # "Hidden" commands for testing only
129 |     subparsers.add_parser("inject_setup")
130 |     subparsers.add_parser("validate_setup")
131 | 
132 |     # Add the graph argument to all subparsers.
133 |     # Doing it this way means that the graph argument is sought _after_ the
134 |     # command name.
135 |     for subparser in subparsers.choices.values():
136 |         subparser.add_argument(
137 |             "graph", nargs="?", help="Input GFA file", metavar="GRAPH"
138 |         )
139 | 
140 |     args = parser.parse_args()
141 | 
142 |     return parser, args
143 | 
144 | 
145 | def parse_bedfile(filename: str) -> List[mygfa.Bed]:
146 |     """Parse BED files that describe which paths to insert."""
147 |     bedfile = open(filename, "r", encoding="utf-8")
148 |     return [mygfa.Bed.parse(line) for line in (mygfa.nonblanks(bedfile))]
149 | 
150 | 
151 | def parse_paths(filename: str) -> List[str]:
152 |     """Parse path names from a file."""
153 |     return list(mygfa.nonblanks(open(filename, "r", encoding="utf-8")))
154 | 
155 | 
156 | def dispatch(args: argparse.Namespace) -> None:
157 |     """Parse the graph from filename,
158 |     parse any additional files if needed,
159 |     then dispatch to the appropriate slow-odgi command.
160 |     If the command makes a new graph, emit it to stdout."""
161 | 
162 |     # Functions that produce a new graph.
163 |     transformer_funcs: Dict[str, Callable[[mygfa.Graph], mygfa.Graph]] = {
164 |         "chop": lambda g: chop.chop(g, int(args.n)),
165 |         "crush": crush.crush,
166 |         "flip": flip.flip,
167 |         "inject": lambda g: inject.inject(g, parse_bedfile(args.bed)),
168 |         "norm": norm.norm,
169 |         "validate_setup": validate_setup.drop_some_links,
170 |     }
171 | 
172 |     # Other functions, which typically print their own output.
173 |     other_funcs: Dict[str, Callable[[mygfa.Graph], object]] = {
174 |         "degree": degree.degree,
175 |         "depth": lambda g: depth.depth(
176 |             g, parse_paths(args.paths) if args.paths else None
177 |         ),
178 |         "flatten": lambda g: flatten.flatten(g, f"{args.graph[:-4]}.og"),
179 |         "matrix": matrix.matrix,
180 |         "overlap": lambda g: overlap.overlap(g, parse_paths(args.paths)),
181 |         "paths": paths.paths,
182 |         "somepaths": lambda g: somepaths.somepaths(g, args.drop),
183 |         "validate": validate.validate,
184 |         "inject_setup": inject_setup.print_bed,
185 |     }
186 | 
187 |     show_no_links = ["chop", "inject"]
188 |     constructive_changes = ["chop", "inject"]
189 |     # These commands only add to the graph, so we'll assert "logically_le".
190 | 
191 |     # Parse the input graph, which comes from either a filename argument or
192 |     # stdin (if the filename is unspecified).
193 |     if args.graph:
194 |         in_file = open(args.graph, "r", encoding="utf-8")
195 |     else:
196 |         in_file = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
197 |     graph = mygfa.Graph.parse(in_file)
198 | 
199 |     # Run the appropriate command on the input graph.
200 |     if args.command in transformer_funcs:
201 |         out_graph = transformer_funcs[args.command](graph)
202 |         out_graph.emit(
203 |             sys.stdout, args.command not in show_no_links and not vars(args).get("nl")
204 |         )
205 |         if args.command in constructive_changes:
206 |             assert proofs.logically_le(graph, out_graph)
207 |     elif args.command in other_funcs:
208 |         other_funcs[args.command](graph)
209 |     else:
210 |         assert False
211 | 
212 | 
213 | def main() -> None:
214 |     """Parse command line arguments and run the appropriate subcommand."""
215 |     parser, args = parse_args()  # Only `args` is used; the parser is discarded.
216 |     dispatch(args)
217 | 
218 | 
219 | if __name__ == "__main__":
220 |     main()
221 | 


--------------------------------------------------------------------------------
/pollen_py/pollen/depth/main.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Combines the command-line interfaces of calyx_depth.py and parse_data.py. Run ./main.py -h for more info.
  3 | """
  4 | 
  5 | import argparse
  6 | import json
  7 | import os.path
  8 | import subprocess
  9 | import tempfile
 10 | import warnings
 11 | 
 12 | import pollen.depth.calyx_depth as depth
 13 | import pollen.depth.parse_data as parse_data
 14 | from pollen.argparse_custom import store_const_and_arg
 15 | 
 16 | 
 17 | def config_parser(parser):  # Register the depth command's options on `parser`.
 18 |     depth.config_parser(parser)  # Start from calyx_depth's own options.
 19 | 
 20 |     parser.add_argument(
 21 |         "-a",
 22 |         "--auto-size",
 23 |         nargs="?",
 24 |         const="d",  # Sentinel stored when -a is given without a file; run_accel checks for it.
 25 |         help="Provide an odgi file that will be used to calculate the hardware dimensions. If the flag is set with no argument, the argument of --parse-data or --run is used instead. Specified hardware dimensions take precedence.",
 26 |     )
 27 | 
 28 |     parser.set_defaults(action="gen")
 29 |     parser.add_argument(
 30 |         "-g",
 31 |         "--gen",
 32 |         dest="action",
 33 |         action="store_const",
 34 |         const="gen",
 35 |         help="Generate an accelerator. Should not be used with --run or --parse-data.",
 36 |     )
 37 |     parser.add_argument(
 38 |         "-r",
 39 |         "--run",
 40 |         dest="filename",
 41 |         dest2="action",
 42 |         action=store_const_and_arg,
 43 |         const="run",
 44 |         default="gen",  # NOTE(review): argparse applies default to dest ("filename") - confirm intent.
 45 |         help="Run node depth on the given .og or .data file. Outputs the node depth table. Should not be used with --gen or --parse-data.",
 46 |     )
 47 |     parser.add_argument(
 48 |         "-d",
 49 |         "--parse-data",
 50 |         dest="filename",
 51 |         dest2="action",
 52 |         action=store_const_and_arg,
 53 |         const="parse",
 54 |         default="gen",
 55 |         help="Parse the .og file to accelerator input. Should not be used with --gen or --run.",
 56 |     )
 57 | 
 58 |     parser.add_argument(
 59 |         "-s",
 60 |         "--subset-paths",
 61 |         help="Should only be used if the --run or --parse-data flag is set. Specifies a\
 62 |  subset of paths whose node depth to compute.",
 63 |     )
 64 | 
 65 |     parser.add_argument(
 66 |         "-x",
 67 |         "--accelerator",
 68 |         help="Specify a node depth accelerator to run. Should only be set if the --run flag is set.",
 69 |     )
 70 |     parser.add_argument(
 71 |         "--pr",
 72 |         action="store_true",
 73 |         help="Print profiling info. Passes the -pr flag to fud if --run is set.",
 74 |     )
 75 | 
 76 |     parser.add_argument(
 77 |         "--tmp-dir",
 78 |         help="Specify a directory to store temporary files in. The files will not be deleted at the end of execution.",
 79 |     )
 80 | 
 81 | 
 82 | def run_accel(args, tmp_dir_name):
 83 |     """
 84 |     Run the node depth accelerator
 85 |     """
 86 | 
 87 |     # Data parser
 88 |     parser = argparse.ArgumentParser()
 89 |     parse_data.config_parser(parser)
 90 | 
 91 |     # Parse the data file if necessary
 92 |     out_file = args.out
 93 |     basename = os.path.basename(args.filename)
 94 |     base, ext = os.path.splitext(basename)
 95 | 
 96 |     if ext == ".data":  # Data file was provided
 97 |         if args.auto_size == "d":
 98 |             warnings.warn("Cannot infer dimensions from .data file.", SyntaxWarning)
 99 |         data_file = args.filename
100 |     else:
101 |         # parse_data_file(args, tmp_dir_name)
102 |         data_file = f"{tmp_dir_name}/{base}.data"
103 |         new_args = [args.filename, "--out", data_file]
104 |         parser.parse_args(new_args, namespace=args)
105 |         # print("here1")
106 |         parse_data.run(args)
107 |         # print("here2")
108 | 
109 |     # Generate the accelerator if necessary
110 |     if args.accelerator:
111 |         futil_file = args.accelerator
112 |     else:
113 |         futil_file = f"{tmp_dir_name}/{base}.futil"
114 |         new_args = [args.filename, "--out", futil_file]
115 |         if args.auto_size == "d":
116 |             new_args.extend(["-a", args.filename])
117 |         parser.parse_args(new_args, namespace=args)
118 |         depth.run(args)
119 | 
120 |     # Compute the node depth
121 |     cmd = [
122 |         "fud",
123 |         "e",
124 |         futil_file,
125 |         "--to",
126 |         "interpreter-out",
127 |         "-s",
128 |         "verilog.data",
129 |         data_file,
130 |     ]
131 |     if args.pr:
132 |         cmd.append("-pr")
133 |         calyx_out = subprocess.run(cmd, capture_output=True, text=True)
134 |         output = calyx_out.stdout
135 |     else:
136 |         calyx_out = subprocess.run(cmd, capture_output=True, text=True)
137 |         try:
138 |             # Convert calyx output to a node depth table
139 |             calyx_out = json.loads(calyx_out.stdout)
140 |             output = parse_data.from_calyx(calyx_out, True)  # ndt
141 |         except:
142 |             output = calyx_out.stderr
143 | 
144 |     # Output the ndt
145 |     if out_file:
146 |         with open(out_file, "w") as out_file:
147 |             out_file.write(output)
148 |     else:
149 |         print(output)
150 | 
151 | 
152 | def parse_data_file(args, tmp_dir_name):
153 |     # Parser for parsing input to data file parser
154 |     parser = argparse.ArgumentParser()
155 |     parse_data.config_parser(parser)
156 | 
157 |     filename = args.filename
158 |     basename = os.path.basename(filename)
159 |     base, ext = os.path.splitext(basename)
160 | 
161 |     if ext == ".gfa":  # Build an odgi file
162 |         data_file = f"{tmp_dir_name}/{base}.data"
163 |         og_file = f"{tmp_dir_name}/{base}.og"
164 |         cmd = ["odgi", "build", "--gfa", filename, "--out", og_file]
165 |         subprocess.run(cmd)
166 | 
167 |         new_args = [og_file, "--out", data_file]
168 |     elif ext == ".og":  # Construct the pollen data file
169 |         data_file = f"{tmp_dir_name}/{base}.data"
170 |         new_args = [filename, "--out", data_file]
171 |     else:
172 |         raise Exception(f"file extension {ext} not recognized")
173 |     parser.parse_args(new_args, namespace=args)
174 |     parse_data.run(new_args)
175 | 
176 | 
177 | def run(args):
178 |     if args.action == "gen":  # Generate an accelerator
179 |         if args.subset_paths or args.accelerator or args.pr:
180 |             warnings.warn(
181 |                 "--subset-paths, --accelerator, and --pr will be ignored if action is gen.",
182 |                 SyntaxWarning,
183 |             )
184 |         depth.run(args)
185 | 
186 |     elif args.action == "parse":  # Generate a data file
187 |         if args.accelerator or args.pr:
188 |             warnings.warn(
189 |                 "--accelerator and --pr will be ignored if action is not 'run'.",
190 |                 SyntaxWarning,
191 |             )
192 | 
193 |         parser = argparse.ArgumentParser()
194 |         parse_data.config_parser(parser)
195 |         parser.parse_args(
196 |             [args.filename], namespace=args
197 |         )  # Set defaults for all arguments; does not change existing arguments
198 |         parse_data.run(args)
199 | 
200 |     elif args.action == "run":  # Run the accelerator
201 |         if args.tmp_dir:
202 |             with open(args.tmp_dir, "w") as tmp_dir_name:
203 |                 run_accel(args, tmp_dir_name)
204 |         else:
205 |             with tempfile.TemporaryDirectory() as tmp_dir_name:
206 |                 run_accel(args, tmp_dir_name)
207 | 
208 | 
209 | def main():  # CLI entry point: build the parser, parse sys.argv, then call run().
210 |     parser = argparse.ArgumentParser(conflict_handler="resolve")  # Later options override clashing earlier ones.
211 | 
212 |     config_parser(parser)
213 | 
214 |     args = parser.parse_args()
215 |     run(args)
216 | 
217 | 
218 | if __name__ == "__main__":
219 |     main()
220 | 


--------------------------------------------------------------------------------
/flatgfa/src/parse.rs:
--------------------------------------------------------------------------------
  1 | use crate::flatgfa::{self, LineKind};
  2 | use crate::gfaline;
  3 | use crate::memfile::MemchrSplit;
  4 | use crate::namemap::NameMap;
  5 | use std::io::BufRead;
  6 | 
  7 | /// Builds a flat GFA store from GFA text, mapping segment names to IDs along the way.
  8 | pub struct Parser<'a, P: flatgfa::StoreFamily<'a>> {
  9 |     /// The flat representation we're building.
 10 |     flat: flatgfa::GFAStore<'a, P>,
 11 | 
 12 |     /// All segment IDs, indexed by their names, which we need to refer to segments in paths.
 13 |     seg_ids: NameMap,
 14 | }
 15 | 
 16 | impl<'a, P: flatgfa::StoreFamily<'a>> Parser<'a, P> {
 17 |     /// Create a parser that adds everything it reads to `builder`.
 18 |     pub fn new(builder: flatgfa::GFAStore<'a, P>) -> Self {
 19 |         Self {
 20 |             flat: builder,
 21 |             seg_ids: NameMap::default(),
 22 |         }
 23 |     }
 24 | 
 25 |     /// Parse a GFA text file from an I/O stream. Blank lines are skipped.
 26 |     ///
 27 |     /// # Panics
 28 |     ///
 29 |     /// Panics if reading from `stream` fails or a line cannot be parsed.
 30 |     pub fn parse_stream<R: BufRead>(mut self, stream: R) -> flatgfa::GFAStore<'a, P> {
 31 |         // We can parse segments immediately, but we need to defer links and paths until we have all
 32 |         // the segment names that they might refer to.
 33 |         let mut deferred_links = Vec::new();
 34 |         let mut deferred_paths = Vec::new();
 35 | 
 36 |         // Parse or defer each line.
 37 |         for line in stream.split(b'\n') {
 38 |             let line = line.unwrap();
 39 | 
 40 |             // Skip blank lines: indexing `line[0]` below would panic on them.
 41 |             if line.is_empty() {
 42 |                 continue;
 43 |             }
 44 | 
 45 |             // Avoid parsing paths entirely for now; just preserve the entire line for later.
 46 |             if line[0] == b'P' {
 47 |                 self.flat.record_line(LineKind::Path);
 48 |                 deferred_paths.push(line);
 49 |                 continue;
 50 |             }
 51 | 
 52 |             // Parse other kinds of lines.
 53 |             let gfa_line = gfaline::parse_line(line.as_ref()).unwrap();
 54 |             self.record_line(&gfa_line);
 55 | 
 56 |             match gfa_line {
 57 |                 gfaline::Line::Header(data) => {
 58 |                     self.flat.add_header(data);
 59 |                 }
 60 |                 gfaline::Line::Segment(seg) => {
 61 |                     self.add_seg(seg);
 62 |                 }
 63 |                 gfaline::Line::Link(link) => {
 64 |                     deferred_links.push(link);
 65 |                 }
 66 |                 gfaline::Line::Path(_) => {
 67 |                     unreachable!("paths handled separately")
 68 |                 }
 69 |             }
 70 |         }
 71 | 
 72 |         // "Unwind" the deferred links and paths.
 73 |         for link in deferred_links {
 74 |             self.add_link(link);
 75 |         }
 76 |         for line in deferred_paths {
 77 |             if let gfaline::Line::Path(path) = gfaline::parse_line(&line).unwrap() {
 78 |                 self.add_path(path);
 79 |             } else {
 80 |                 unreachable!("unexpected deferred line")
 81 |             }
 82 |         }
 83 | 
 84 |         self.flat
 85 |     }
 86 | 
 87 |     /// Parse a GFA text file from an in-memory buffer. Blank lines are skipped.
 88 |     ///
 89 |     /// # Panics
 90 |     ///
 91 |     /// Panics if a line cannot be parsed.
 92 |     pub fn parse_mem(mut self, buf: &[u8]) -> flatgfa::GFAStore<'a, P> {
 93 |         let mut deferred_lines = Vec::new();
 94 | 
 95 |         for line in MemchrSplit::new(b'\n', buf) {
 96 |             // Skip blank lines: indexing `line[0]` below would panic on them.
 97 |             if line.is_empty() {
 98 |                 continue;
 99 |             }
100 | 
101 |             // When parsing from memory, it's easy to entirely defer parsing of any line: we just keep
102 |             // pointers to them. So we defer both paths and links.
103 |             if line[0] == b'P' || line[0] == b'L' {
104 |                 self.flat.record_line(if line[0] == b'P' {
105 |                     LineKind::Path
106 |                 } else {
107 |                     LineKind::Link
108 |                 });
109 |                 deferred_lines.push(line);
110 |                 continue;
111 |             }
112 | 
113 |             // Actually parse other lines.
114 |             let gfa_line = gfaline::parse_line(line).unwrap();
115 |             self.record_line(&gfa_line);
116 |             match gfa_line {
117 |                 gfaline::Line::Header(data) => {
118 |                     self.flat.add_header(data);
119 |                 }
120 |                 gfaline::Line::Segment(seg) => {
121 |                     self.add_seg(seg);
122 |                 }
123 |                 gfaline::Line::Link(_) | gfaline::Line::Path(_) => {
124 |                     unreachable!("paths and links handled separately")
125 |                 }
126 |             }
127 |         }
128 | 
129 |         // "Unwind" the deferred lines.
130 |         for line in deferred_lines {
131 |             let gfa_line = gfaline::parse_line(line).unwrap();
132 |             match gfa_line {
133 |                 gfaline::Line::Link(link) => {
134 |                     self.add_link(link);
135 |                 }
136 |                 gfaline::Line::Path(path) => {
137 |                     self.add_path(path);
138 |                 }
139 |                 gfaline::Line::Header(_) | gfaline::Line::Segment(_) => {
140 |                     unreachable!("unexpected deferred line")
141 |                 }
142 |             }
143 |         }
144 | 
145 |         self.flat
146 |     }
147 | 
148 |     /// Record a marker that captures the original GFA line ordering.
149 |     fn record_line(&mut self, line: &gfaline::Line) {
150 |         match line {
151 |             gfaline::Line::Header(_) => self.flat.record_line(LineKind::Header),
152 |             gfaline::Line::Segment(_) => self.flat.record_line(LineKind::Segment),
153 |             gfaline::Line::Link(_) => self.flat.record_line(LineKind::Link),
154 |             gfaline::Line::Path(_) => self.flat.record_line(LineKind::Path),
155 |         }
156 |     }
157 | 
158 |     /// Add a segment to the store and remember its ID under its name.
159 |     fn add_seg(&mut self, seg: gfaline::Segment) {
160 |         let seg_id = self.flat.add_seg(seg.name, seg.seq, seg.data);
161 |         self.seg_ids.insert(seg.name, seg_id);
162 |     }
163 | 
164 |     /// Add a link, looking up both endpoint segments by name.
165 |     fn add_link(&mut self, link: gfaline::Link) {
166 |         let from = self.seg_ids.get(link.from_seg).handle(link.from_orient);
167 |         let to = self.seg_ids.get(link.to_seg).handle(link.to_orient);
168 |         self.flat.add_link(from, to, link.overlap);
169 |     }
170 | 
171 |     /// Add a path: look up each step's segment by name, then store the steps and the path.
172 |     fn add_path(&mut self, path: gfaline::Path) {
173 |         // Parse the steps.
174 |         let mut step_parser = gfaline::StepsParser::new(path.steps);
175 |         let steps = self.flat.add_steps(
176 |             (&mut step_parser).map(|(name, dir)| self.seg_ids.get(name).handle(dir.into())),
177 |         );
178 |         assert!(step_parser.rest().is_empty());
179 | 
180 |         self.flat
181 |             .add_path(path.name, steps, path.overlaps.into_iter());
182 |     }
183 | }
184 | 
185 | impl Parser<'static, flatgfa::HeapFamily> {
186 |     /// Create a parser that builds into a default `HeapGFAStore`.
187 |     pub fn for_heap() -> Self {
188 |         Self::new(flatgfa::HeapGFAStore::default())
189 |     }
190 | }
191 | 
192 | impl<'a> Parser<'a, flatgfa::FixedFamily> {
193 |     /// Create a parser that builds into the given fixed store.
194 |     pub fn for_slice(store: flatgfa::FixedGFAStore<'a>) -> Self {
195 |         Self::new(store)
196 |     }
197 | }
173 | 
174 | /// Scan a GFA text file to count the number of each type of line and measure some sizes
175 | /// that are useful in estimating the final size of the FlatGFA file.
176 | ///
177 | /// Blank lines are ignored.
178 | ///
179 | /// # Panics
180 | ///
181 | /// Panics on a non-blank line that does not start with `H`, `S`, `L`, or `P`.
182 | pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc {
183 |     let mut segs = 0;
184 |     let mut links = 0;
185 |     let mut paths = 0;
186 |     let mut header_bytes = 0;
187 |     let mut seg_bytes = 0;
188 |     let mut path_bytes = 0;
189 | 
190 |     let mut rest = buf;
191 |     while !rest.is_empty() {
192 |         let marker = rest[0];
193 |         // Offset of this line's newline, or a value past the end for a final unterminated line.
194 |         let next = memchr::memchr(b'\n', rest).unwrap_or(rest.len() + 1);
195 | 
196 |         match marker {
197 |             b'H' => {
198 |                 header_bytes += next;
199 |             }
200 |             b'S' => {
201 |                 segs += 1;
202 |                 seg_bytes += next;
203 |             }
204 |             b'L' => {
205 |                 links += 1;
206 |             }
207 |             b'P' => {
208 |                 paths += 1;
209 |                 path_bytes += next;
210 |             }
211 |             // A blank line: the newline is the first byte, so there is nothing to count.
212 |             b'\n' => {}
213 |             _ => {
214 |                 panic!("unknown line type")
215 |             }
216 |         }
217 | 
218 |         if next >= rest.len() {
219 |             break;
220 |         }
221 |         rest = &rest[next + 1..];
222 |     }
223 | 
224 |     crate::file::Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes)
225 | }
217 | 


--------------------------------------------------------------------------------
/flatgfa/src/cli/main.rs:
--------------------------------------------------------------------------------
  1 | use argh::FromArgs;
  2 | use flatgfa::flatgfa::FlatGFA;
  3 | use flatgfa::parse::Parser;
  4 | use flatgfa::pool::Store;
  5 | use flatgfa::{cli::cmds, file, memfile, parse};
  6 | 
  7 | #[derive(FromArgs)] // argh turns the doc comments below into the --help text.
  8 | /// Convert between GFA text and FlatGFA binary formats.
  9 | struct PolBin {
 10 |     /// read from a binary FlatGFA file
 11 |     #[argh(option, short = 'i')]
 12 |     input: Option<String>,
 13 | 
 14 |     /// read from a text GFA file
 15 |     #[argh(option, short = 'I')]
 16 |     input_gfa: Option<String>,
 17 | 
 18 |     /// write to a binary FlatGFA file
 19 |     #[argh(option, short = 'o')]
 20 |     output: Option<String>,
 21 | 
 22 |     /// mutate the input file in place
 23 |     #[argh(switch, short = 'm')]
 24 |     mutate: bool,
 25 | 
 26 |     /// preallocation size factor
 27 |     #[argh(option, short = 'p', default = "32")]
 28 |     prealloc_factor: usize, // Only read by the --mutate conversion path when its input is stdin.
 29 | 
 30 |     #[argh(subcommand)]
 31 |     command: Option<Command>,
 32 | }
 33 | 
 34 | #[derive(FromArgs, PartialEq, Debug)]
 35 | #[argh(subcommand)]
 36 | enum Command {
 37 |     Toc(cmds::Toc),
 38 |     Paths(cmds::Paths),
 39 |     Stats(cmds::Stats),
 40 |     Position(cmds::Position),
 41 |     Extract(cmds::Extract),
 42 |     Depth(cmds::Depth),
 43 |     Chop(cmds::Chop),
 44 |     GafLookup(cmds::GAFLookup),
 45 |     Bench(cmds::Bench),
 46 |     BedIntersect(cmds::BEDIntersect), // Handled in main() before any graph is loaded.
 47 |     SeqExport(cmds::SeqExport), // Also handled in main() before any graph is loaded.
 48 |     SeqImport(cmds::SeqImport), // Also handled in main() before any graph is loaded.
 49 | }
 50 | 
 51 | fn main() -> Result<(), &'static str> {
 52 |     let args: PolBin = argh::from_env();
 53 | 
 54 |     // A special case for converting from GFA text to an in-place FlatGFA binary.
 55 |     if args.mutate {
 56 |         if let (None, None, Some(out_name)) = (&args.command, &args.input, &args.output) {
 57 |             prealloc_translate(args.input_gfa.as_deref(), out_name, args.prealloc_factor);
 58 |             return Ok(());
 59 |         }
 60 |     }
 61 | 
 62 |     // Another special case for parsing BED files,
 63 |     // since we do not parse a GFA file for that.
 64 |     if let Some(Command::BedIntersect(sub_args)) = args.command {
 65 |         cmds::bed_intersect(sub_args);
 66 |         return Ok(());
 67 |     }
 68 | 
 69 |     // Yet more special cases for sequence compression/decompression, which only
 70 |     // deal with raw sequence data and not GFA files.
 71 |     if let Some(Command::SeqExport(sub_args)) = args.command {
 72 |         cmds::seq_export(sub_args);
 73 |         return Ok(());
 74 |     }
 75 | 
 76 |     if let Some(Command::SeqImport(sub_args)) = args.command {
 77 |         cmds::seq_import(sub_args);
 78 |         return Ok(());
 79 |     }
 80 | 
 81 |     // Load the input: a binary FlatGFA file (-i), else GFA text from a file (-I) or stdin.
 82 |     let mmap;
 83 |     let mut mmap_mut;
 84 |     let store;
 85 |     let slice_store;
 86 |     let gfa = match args.input {
 87 |         Some(name) => {
 88 |             if args.mutate {
 89 |                 mmap_mut = memfile::map_file_mut(&name);
 90 |                 slice_store = file::view_store(&mut mmap_mut);
 91 |                 slice_store.as_ref()
 92 |             } else {
 93 |                 mmap = memfile::map_file(&name);
 94 |                 file::view(&mmap)
 95 |             }
 96 |         }
 97 |         None => {
 98 |             // Parse from stdin or a file.
 99 |             store = match args.input_gfa {
100 |                 Some(name) => {
101 |                     let file = memfile::map_file(&name);
102 |                     Parser::for_heap().parse_mem(file.as_ref())
103 |                 }
104 |                 None => {
105 |                     let stdin = std::io::stdin();
106 |                     Parser::for_heap().parse_stream(stdin.lock())
107 |                 }
108 |             };
109 |             store.as_ref()
110 |         }
111 |     };
112 | 
113 |     match args.command {
114 |         Some(Command::Toc(sub_args)) => {
115 |             cmds::toc(&gfa, sub_args);
116 |         }
117 |         Some(Command::Paths(_)) => {
118 |             cmds::paths(&gfa);
119 |         }
120 |         Some(Command::Stats(sub_args)) => {
121 |             cmds::stats(&gfa, sub_args);
122 |         }
123 |         Some(Command::Position(sub_args)) => {
124 |             cmds::position(&gfa, sub_args)?;
125 |         }
126 |         Some(Command::Extract(sub_args)) => {
127 |             let store = cmds::extract(&gfa, sub_args)?;
128 |             dump(&store.as_ref(), &args.output);
129 |         }
130 |         Some(Command::Depth(_)) => {
131 |             cmds::depth(&gfa);
132 |         }
133 |         Some(Command::Chop(sub_args)) => {
134 |             let store = cmds::chop(&gfa, sub_args)?;
135 |             // TODO: Ideally, find a way to encapsulate the logic of chop in `cmd.rs`, instead of
136 |             // defining here which values from our input `gfa` are needed by our final `flat` gfa.
137 |             // Here we reference values in two different Stores to create this FlatGFA, and
138 |             // have not yet found a good Rust-safe way to do this.
139 |             let flat = flatgfa::FlatGFA {
140 |                 header: gfa.header,
141 |                 seq_data: gfa.seq_data,
142 |                 name_data: gfa.name_data,
143 |                 segs: store.segs.as_ref(),
144 |                 paths: store.paths.as_ref(),
145 |                 links: store.links.as_ref(),
146 |                 steps: store.steps.as_ref(),
147 |                 overlaps: store.overlaps.as_ref(),
148 |                 alignment: store.alignment.as_ref(),
149 |                 optional_data: store.optional_data.as_ref(),
150 |                 line_order: store.line_order.as_ref(),
151 |             };
152 |             dump(&flat, &args.output);
153 |         }
154 |         Some(Command::GafLookup(sub_args)) => {
155 |             cmds::gaf_lookup(&gfa, sub_args);
156 |         }
157 |         Some(Command::Bench(sub_args)) => {
158 |             cmds::bench(sub_args);
159 |         }
160 |         Some(Command::BedIntersect(_sub_args)) => {
161 |             panic!("Unreachable code"); // This command returned early, before the graph was loaded.
162 |         }
163 |         Some(Command::SeqExport(_sub_args)) => {
164 |             panic!("Unreachable code");
165 |         }
166 |         Some(Command::SeqImport(_sub_args)) => {
167 |             panic!("Unreachable code");
168 |         }
169 |         None => {
170 |             // Just emit the GFA or FlatGFA file.
171 |             dump(&gfa, &args.output);
172 |         }
173 |     }
174 | 
175 |     Ok(())
176 | }
177 | 
178 | /// Write a FlatGFA either as GFA text to stdout or, if `output` names a file, as a
179 | /// binary FlatGFA file with that name.
180 | fn dump(gfa: &FlatGFA, output: &Option<String>) {
181 |     match output {
182 |         Some(name) => {
183 |             let mut mmap = memfile::map_new_file(name, file::size(gfa) as u64);
184 |             file::dump(gfa, &mut mmap);
185 |             mmap.flush().unwrap();
186 |         }
187 |         None => {
188 |             print!("{gfa}");
189 |         }
190 |     }
191 | }
192 | 
193 | /// A special-case fast-path transformation from GFA text (the file `in_name`, or stdin) to a
194 | /// *preallocated* FlatGFA, sized by an estimate from the input file or a guess for stdin.
195 | fn prealloc_translate(in_name: Option<&str>, out_name: &str, prealloc_factor: usize) {
196 |     let file; // Declared out here so the mapping outlives `input_buf`, which borrows from it.
197 |     let (input_buf, empty_toc) = match in_name {
198 |         // If we have an input GFA file, we can estimate its sizes for the TOC.
199 |         Some(name) => {
200 |             file = memfile::map_file(name);
201 |             let toc = parse::estimate_toc(file.as_ref());
202 |             (Some(file.as_ref()), toc)
203 |         }
204 | 
205 |         // Otherwise, we need to guess.
206 |         None => (None, file::Toc::guess(prealloc_factor)),
207 |     };
208 | 
209 |     // Create a file with an empty table of contents.
210 |     let mut mmap = memfile::map_new_file(out_name, empty_toc.size() as u64);
211 |     let (toc, store) = file::init(&mut mmap, empty_toc);
212 | 
213 |     // Parse the input into the file, then overwrite the TOC from the filled store.
214 |     match input_buf {
215 |         Some(buf) => {
216 |             let store = Parser::for_slice(store).parse_mem(buf);
217 |             *toc = file::Toc::for_fixed_store(&store)
218 |         }
219 |         None => {
220 |             let stdin = std::io::stdin();
221 |             let store = Parser::for_slice(store).parse_stream(stdin.lock());
222 |             *toc = file::Toc::for_fixed_store(&store)
223 |         }
224 |     };
225 | 
226 |     mmap.flush().unwrap();
227 | }
228 | 


--------------------------------------------------------------------------------
/flatgfa/src/ops/extract.rs:
--------------------------------------------------------------------------------
  1 | use crate::flatgfa::{self, Handle, Segment};
  2 | use crate::pool::{self, Id, Span, Store};
  3 | use std::collections::HashMap;
  4 | 
/// A helper to construct a new graph that includes part of an old graph.
pub struct SubgraphBuilder<'a> {
    /// The source graph that segments, links, and paths are copied from.
    pub old: &'a flatgfa::FlatGFA<'a>,
    /// Storage for the subgraph being built.
    pub store: flatgfa::HeapGFAStore,
    /// Maps the id of each included segment in `old` to its id in `store`.
    pub seg_map: HashMap<Id<Segment>, Id<Segment>>,
}
 11 | 
/// Where a subpath begins, recorded while scanning a path of the old graph.
pub struct SubpathStart {
    /// The id of the first step in the subpath (an id in the new graph's step storage).
    step: Id<Handle>,
    /// The bp position at the start of the subpath, measured along the original path.
    pos: usize,
}
 16 | 
 17 | impl<'a> SubgraphBuilder<'a> {
 18 |     pub fn new(old: &'a flatgfa::FlatGFA) -> Self {
 19 |         Self {
 20 |             old,
 21 |             store: flatgfa::HeapGFAStore::default(),
 22 |             seg_map: HashMap::new(),
 23 |         }
 24 |     }
 25 | 
 26 |     /// Include the old graph's header
 27 |     pub fn add_header(&mut self) {
 28 |         // pub fn add_header(&mut self, version: &[u8]) {
 29 |         //     assert!(self.header.as_ref().is_empty());
 30 |         //     self.header.add_slice(version);
 31 |         // }
 32 |         assert!(self.store.header.as_ref().is_empty());
 33 |         self.store.header.add_slice(self.old.header.all());
 34 |     }
 35 | 
 36 |     /// Add a segment from the source graph to this subgraph.
 37 |     fn include_seg(&mut self, seg_id: Id<Segment>) {
 38 |         let seg = &self.old.segs[seg_id];
 39 |         let new_seg_id = self.store.add_seg(
 40 |             seg.name,
 41 |             self.old.get_seq(seg),
 42 |             self.old.get_optional_data(seg),
 43 |         );
 44 |         self.seg_map.insert(seg_id, new_seg_id);
 45 |     }
 46 | 
 47 |     /// Add a link from the source graph to the subgraph.
 48 |     fn include_link(&mut self, link: &flatgfa::Link) {
 49 |         let from = self.tr_handle(link.from);
 50 |         let to = self.tr_handle(link.to);
 51 |         let overlap = self.old.get_alignment(link.overlap);
 52 |         self.store.add_link(from, to, overlap.ops.into());
 53 |     }
 54 | 
 55 |     /// Add a single subpath from the given path to the subgraph.
 56 |     fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) {
 57 |         let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id?
 58 |         let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos);
 59 |         self.store
 60 |             .add_path(name.as_bytes(), steps, std::iter::empty());
 61 |     }
 62 | 
 63 |     /// Identify all the subpaths in a path from the original graph that cross through
 64 |     /// segments in this subgraph and merge them if possible.
 65 |     fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) {
 66 |         // these are subpaths which *aren't* already included in the new graph
 67 |         let mut cur_subpath_start: Option<usize> = Some(0);
 68 |         let mut subpath_length = 0;
 69 |         let mut ignore_path = true;
 70 | 
 71 |         for (idx, step) in self.old.steps[path.steps].iter().enumerate() {
 72 |             let in_neighb = self.seg_map.contains_key(&step.segment());
 73 | 
 74 |             if let (Some(start), true) = (&cur_subpath_start, in_neighb) {
 75 |                 // We just entered the subgraph. End the current subpath.
 76 |                 if !ignore_path && subpath_length <= max_distance_subpaths {
 77 |                     // TODO: type safety
 78 |                     let subpath_span = Span::new(
 79 |                         path.steps.start + *start as u32,
 80 |                         path.steps.start + idx as u32,
 81 |                     );
 82 |                     for step in &self.old.steps[subpath_span] {
 83 |                         if !self.seg_map.contains_key(&step.segment()) {
 84 |                             self.include_seg(step.segment());
 85 |                         }
 86 |                     }
 87 |                 }
 88 |                 cur_subpath_start = None;
 89 |                 ignore_path = false;
 90 |             } else if let (None, false) = (&cur_subpath_start, in_neighb) {
 91 |                 // We've exited the current subgraph, start a new subpath
 92 |                 cur_subpath_start = Some(idx);
 93 |             }
 94 | 
 95 |             // Track the current bp position in the path.
 96 |             subpath_length += self.old.get_handle_seg(*step).len();
 97 |         }
 98 |     }
 99 | 
100 |     /// Identify all the subpaths in a path from the original graph that cross through
101 |     /// segments in this subgraph and add them.
102 |     fn find_subpaths(&mut self, path: &flatgfa::Path) {
103 |         let mut cur_subpath_start: Option<SubpathStart> = None;
104 |         let mut path_pos = 0;
105 | 
106 |         for step in &self.old.steps[path.steps] {
107 |             let in_neighb = self.seg_map.contains_key(&step.segment());
108 | 
109 |             if let (Some(start), false) = (&cur_subpath_start, in_neighb) {
110 |                 // End the current subpath.
111 |                 self.include_subpath(path, start, path_pos);
112 |                 cur_subpath_start = None;
113 |             } else if let (None, true) = (&cur_subpath_start, in_neighb) {
114 |                 // Start a new subpath.
115 |                 cur_subpath_start = Some(SubpathStart {
116 |                     step: self.store.steps.next_id(),
117 |                     pos: path_pos,
118 |                 });
119 |             }
120 | 
121 |             // Add the (translated) step to the new graph.
122 |             if in_neighb {
123 |                 self.store.add_step(self.tr_handle(*step));
124 |             }
125 | 
126 |             // Track the current bp position in the path.
127 |             path_pos += self.old.get_handle_seg(*step).len();
128 |         }
129 | 
130 |         // Did we reach the end of the path while still in the neighborhood?
131 |         if let Some(start) = cur_subpath_start {
132 |             self.include_subpath(path, &start, path_pos);
133 |         }
134 |     }
135 | 
136 |     /// Translate a handle from the source graph to this subgraph.
137 |     fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle {
138 |         // TODO: is this just generating the handle or should we add it to the new graph?
139 |         self.seg_map[&old_handle.segment()].handle(old_handle.orient())
140 |     }
141 | 
142 |     /// Check whether a segment from the old graph is in the subgraph.
143 |     fn contains(&self, old_seg_id: Id<Segment>) -> bool {
144 |         self.seg_map.contains_key(&old_seg_id)
145 |     }
146 | 
147 |     /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away
148 |     /// from the given segment in the original graph.
149 |     ///
150 |     /// Include any links between the segments in the neighborhood and subpaths crossing
151 |     /// through the neighborhood.
152 |     pub fn extract(
153 |         &mut self,
154 |         origin: Id<Segment>,
155 |         dist: usize,
156 |         max_distance_subpaths: usize,
157 |         num_iterations: usize,
158 |     ) {
159 |         self.include_seg(origin);
160 | 
161 |         // Find the set of all segments that are c links away.
162 |         let mut frontier: Vec<Id<Segment>> = Vec::new();
163 |         let mut next_frontier: Vec<Id<Segment>> = Vec::new();
164 |         frontier.push(origin);
165 |         for _ in 0..dist {
166 |             while let Some(seg_id) = frontier.pop() {
167 |                 for link in self.old.links.all().iter() {
168 |                     if let Some(other_seg) = link.incident_seg(seg_id) {
169 |                         // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map
170 |                         if !self.seg_map.contains_key(&other_seg) {
171 |                             self.include_seg(other_seg);
172 |                             next_frontier.push(other_seg);
173 |                         }
174 |                     }
175 |                 }
176 |             }
177 |             (frontier, next_frontier) = (next_frontier, frontier);
178 |         }
179 | 
180 |         // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times
181 |         for _ in 0..num_iterations {
182 |             for path in self.old.paths.all().iter() {
183 |                 self.merge_subpaths(path, max_distance_subpaths);
184 |             }
185 |         }
186 | 
187 |         // Include all links within the subgraph.
188 |         for link in self.old.links.all().iter() {
189 |             if self.contains(link.from.segment()) && self.contains(link.to.segment()) {
190 |                 self.include_link(link);
191 |             }
192 |         }
193 | 
194 |         // Find subpaths within the subgraph.
195 |         for path in self.old.paths.all().iter() {
196 |             self.find_subpaths(path);
197 |         }
198 |     }
199 | }
200 | 


--------------------------------------------------------------------------------
/flatgfa/src/gfaline.rs:
--------------------------------------------------------------------------------
  1 | use crate::flatgfa::{AlignOp, Orientation};
  2 | use atoi::FromRadix10;
  3 | 
/// The result of a parsing step; failures carry a static description of what was expected.
type ParseResult<T> = Result<T, &'static str>;
/// The result of parsing one complete GFA line.
type LineResult<'a> = ParseResult<Line<'a>>;
/// The result of parsing a prefix of the input: the parsed value together with the
/// bytes that remain after it.
type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>;
  7 | 
/// A parsed GFA file line.
pub enum Line<'a> {
    /// An `H` line; holds everything after the `H` marker and its tab, unparsed.
    Header(&'a [u8]),
    /// An `S` line.
    Segment(Segment<'a>),
    /// An `L` line.
    Link(Link),
    /// A `P` line.
    Path(Path<'a>),
}
 15 | 
/// The contents of a segment (`S`) line.
pub struct Segment<'a> {
    /// The segment's name, which must be a decimal number.
    pub name: usize,
    /// The sequence field.
    pub seq: &'a [u8],
    /// Everything after the sequence field's tab, unparsed (empty if there is nothing).
    pub data: &'a [u8],
}
 21 | 
/// The contents of a link (`L`) line.
pub struct Link {
    /// Numeric name of the segment the link leaves from.
    pub from_seg: usize,
    /// Orientation of the `from` segment.
    pub from_orient: Orientation,
    /// Numeric name of the segment the link arrives at.
    pub to_seg: usize,
    /// Orientation of the `to` segment.
    pub to_orient: Orientation,
    /// The overlap between the two segments, as parsed CIGAR operations.
    pub overlap: Vec<AlignOp>,
}
 29 | 
/// The contents of a path (`P`) line.
pub struct Path<'a> {
    /// The path's name.
    pub name: &'a [u8],
    /// The step list field, still as raw text (like `1+,2-,3+`).
    pub steps: &'a [u8],
    /// One parsed CIGAR string per overlap; empty when the field is `*`.
    pub overlaps: Vec<Vec<AlignOp>>,
}
 35 | 
 36 | /// Parse a single line of a GFA file.
 37 | pub fn parse_line(line: &[u8]) -> LineResult<'_> {
 38 |     if line.len() < 2 || line[1] != b'\t' {
 39 |         return Err("expected marker and tab");
 40 |     }
 41 |     let rest = &line[2..];
 42 |     match line[0] {
 43 |         b'H' => parse_header(rest),
 44 |         b'S' => parse_seg(rest),
 45 |         b'L' => parse_link(rest),
 46 |         b'P' => parse_path(rest),
 47 |         _ => Err("unhandled line kind"),
 48 |     }
 49 | }
 50 | 
/// Parse a header line, which looks like `H <data>`.
///
/// `line` is the text after the `H` marker and its tab; it is kept as-is, so this
/// never fails.
fn parse_header(line: &[u8]) -> LineResult<'_> {
    Ok(Line::Header(line))
}
 55 | 
 56 | /// Parse a segment line, which looks like `S <name> <seq> <data>`.
 57 | fn parse_seg(line: &[u8]) -> LineResult<'_> {
 58 |     let (name, rest) = parse_num(line)?;
 59 |     let rest = parse_byte(rest, b'\t')?;
 60 |     let (seq, data) = parse_field(rest)?;
 61 |     Ok(Line::Segment(Segment { name, seq, data }))
 62 | }
 63 | 
 64 | /// Parse a link line, which looks like `L <from> <+-> <to> <+-> <CIGAR>`.
 65 | fn parse_link(line: &[u8]) -> LineResult<'_> {
 66 |     let (from_seg, rest) = parse_num(line)?;
 67 |     let rest = parse_byte(rest, b'\t')?;
 68 |     let (from_orient, rest) = parse_orient(rest)?;
 69 |     let rest = parse_byte(rest, b'\t')?;
 70 |     let (to_seg, rest) = parse_num(rest)?;
 71 |     let rest = parse_byte(rest, b'\t')?;
 72 |     let (to_orient, rest) = parse_orient(rest)?;
 73 |     let rest = parse_byte(rest, b'\t')?;
 74 |     let (overlap, rest) = parse_align(rest)?;
 75 |     if !rest.is_empty() {
 76 |         return Err("expected end of line");
 77 |     }
 78 |     Ok(Line::Link(Link {
 79 |         from_seg,
 80 |         from_orient,
 81 |         to_seg,
 82 |         to_orient,
 83 |         overlap,
 84 |     }))
 85 | }
 86 | 
 87 | /// Parse a path line, which looks like `P <name> <steps> <*|CIGARs>`.
 88 | fn parse_path(line: &[u8]) -> LineResult<'_> {
 89 |     let (name, rest) = parse_field(line)?;
 90 |     let (steps, rest) = parse_field(rest)?;
 91 |     let (overlaps, rest) = parse_maybe_overlap_list(rest)?;
 92 |     if !rest.is_empty() {
 93 |         return Err("expected end of line");
 94 |     }
 95 |     Ok(Line::Path(Path {
 96 |         name,
 97 |         steps,
 98 |         overlaps,
 99 |     }))
100 | }
101 | 
102 | /// Parse a *possible* overlap list, which may be `*` (empty).
103 | pub fn parse_maybe_overlap_list(s: &[u8]) -> PartialParseResult<'_, Vec<Vec<AlignOp>>> {
104 |     if s == b"*" {
105 |         Ok((vec![], &s[1..]))
106 |     } else {
107 |         parse_overlap_list(s)
108 |     }
109 | }
110 | 
111 | /// Parse a comma-separated list of CIGAR strings.
112 | ///
113 | /// TODO: This could be optimized to avoid accumulating into a vector.
114 | fn parse_overlap_list(s: &[u8]) -> PartialParseResult<'_, Vec<Vec<AlignOp>>> {
115 |     let mut rest = s;
116 |     let mut overlaps = vec![];
117 |     while !rest.is_empty() {
118 |         let overlap;
119 |         (overlap, rest) = parse_align(rest)?;
120 |         overlaps.push(overlap);
121 |         if !rest.is_empty() {
122 |             rest = parse_byte(rest, b',')?;
123 |         }
124 |     }
125 |     Ok((overlaps, rest))
126 | }
127 | 
128 | /// Consume a chunk of a string up to a given marker byte.
129 | fn parse_until(line: &[u8], marker: u8) -> PartialParseResult<'_, &[u8]> {
130 |     let end = memchr::memchr(marker, line).unwrap_or(line.len());
131 |     let rest = if end == line.len() {
132 |         &[]
133 |     } else {
134 |         &line[end + 1..]
135 |     };
136 |     Ok((&line[..end], rest))
137 | }
138 | 
/// Consume a string from the line, until a tab (or the end of the line).
///
/// The tab itself is consumed too: it is part of neither the field nor the remainder.
pub fn parse_field(line: &[u8]) -> PartialParseResult<'_, &[u8]> {
    parse_until(line, b'\t')
}
143 | 
144 | /// Consume a specific byte.
145 | fn parse_byte(s: &[u8], byte: u8) -> ParseResult<&[u8]> {
146 |     if s.is_empty() || s[0] != byte {
147 |         return Err("expected byte");
148 |     }
149 |     Ok(&s[1..])
150 | }
151 | 
152 | /// Parse a single integer.
153 | fn parse_num<T: FromRadix10>(s: &[u8]) -> PartialParseResult<'_, T> {
154 |     match T::from_radix_10(s) {
155 |         (_, 0) => Err("expected number"),
156 |         (num, used) => Ok((num, &s[used..])),
157 |     }
158 | }
159 | 
160 | /// Parse a segment orientation (+ or -).
161 | fn parse_orient(line: &[u8]) -> PartialParseResult<'_, Orientation> {
162 |     if line.is_empty() {
163 |         return Err("expected orientation");
164 |     }
165 |     let orient = match line[0] {
166 |         b'+' => Orientation::Forward,
167 |         b'-' => Orientation::Backward,
168 |         _ => return Err("expected orient"),
169 |     };
170 |     Ok((orient, &line[1..]))
171 | }
172 | 
173 | /// Parse a single CIGAR alignment operation (like `4D`).
174 | fn parse_align_op(s: &[u8]) -> PartialParseResult<'_, AlignOp> {
175 |     let (len, rest) = parse_num::<u32>(s)?;
176 |     let op = match rest[0] {
177 |         b'M' => crate::flatgfa::AlignOpcode::Match,
178 |         b'N' => crate::flatgfa::AlignOpcode::Gap,
179 |         b'D' => crate::flatgfa::AlignOpcode::Deletion,
180 |         b'I' => crate::flatgfa::AlignOpcode::Insertion,
181 |         _ => return Err("expected align op"),
182 |     };
183 |     Ok((AlignOp::new(op, len), &rest[1..]))
184 | }
185 | 
186 | /// Parse a complete CIGAR alignment string (like `3M2I`).
187 | ///
188 | /// TODO This could be optimized to avoid collecting into a vector.
189 | fn parse_align(s: &[u8]) -> PartialParseResult<'_, Vec<AlignOp>> {
190 |     let mut rest = s;
191 |     let mut align = vec![];
192 |     while !rest.is_empty() && rest[0].is_ascii_digit() {
193 |         let op;
194 |         (op, rest) = parse_align_op(rest)?;
195 |         align.push(op);
196 |     }
197 |     Ok((align, rest))
198 | }
199 | 
/// Parse GFA paths' segment lists. These look like `1+,2-,3+`.
///
/// This is an iterator over the steps, each yielded as a segment number and a flag
/// that is `true` for `+` and `false` for `-`.
pub struct StepsParser<'a> {
    /// The text being parsed.
    str: &'a [u8],
    /// Offset into `str` of the next byte to consume.
    index: usize,
    /// What the parser expects to see next.
    state: StepsParseState,
    /// The segment number accumulated from the digits seen so far in the current step.
    seg: usize,
}
207 | 
/// The parser state: we're either looking for a segment name (or a +/- terminator),
/// or we're expecting a comma (or end of string).
enum StepsParseState {
    /// Reading the digits of a segment name, up to its `+`/`-` terminator.
    Seg,
    /// A step was just completed; the only valid next byte is a comma.
    Comma,
}
214 | 
215 | impl<'a> StepsParser<'a> {
216 |     pub fn new(str: &'a [u8]) -> Self {
217 |         StepsParser {
218 |             str,
219 |             index: 0,
220 |             state: StepsParseState::Seg,
221 |             seg: 0,
222 |         }
223 |     }
224 | 
225 |     pub fn rest(&self) -> &[u8] {
226 |         &self.str[self.index..]
227 |     }
228 | }
229 | 
230 | impl Iterator for StepsParser<'_> {
231 |     type Item = (usize, bool);
232 |     fn next(&mut self) -> Option<(usize, bool)> {
233 |         while self.index < self.str.len() {
234 |             // Consume one byte.
235 |             let byte = self.str[self.index];
236 |             self.index += 1;
237 | 
238 |             match self.state {
239 |                 StepsParseState::Seg => {
240 |                     if byte == b'+' || byte == b'-' {
241 |                         self.state = StepsParseState::Comma;
242 |                         return Some((self.seg, byte == b'+'));
243 |                     } else if byte.is_ascii_digit() {
244 |                         self.seg *= 10;
245 |                         self.seg += (byte - b'0') as usize;
246 |                     } else {
247 |                         return None;
248 |                     }
249 |                 }
250 |                 StepsParseState::Comma => {
251 |                     if byte == b',' {
252 |                         self.state = StepsParseState::Seg;
253 |                         self.seg = 0;
254 |                     } else {
255 |                         return None;
256 |                     }
257 |                 }
258 |             }
259 |         }
260 | 
261 |         None
262 |     }
263 | }
264 | 
/// The step parser yields every well-formed step, stops at the first byte that cannot
/// continue the list, and consumes that byte.
#[test]
fn test_parse_steps() {
    let mut parser = StepsParser::new(b"1+,23-,4+ suffix");
    let path: Vec<(usize, bool)> = parser.by_ref().collect();
    let expected = vec![(1, true), (23, false), (4, true)];
    assert_eq!(path, expected);
    assert_eq!(parser.rest(), b"suffix");
}
273 | 


--------------------------------------------------------------------------------