├── flatgfa ├── tests │ ├── test.t │ └── turnt.toml ├── src │ ├── cli │ │ ├── mod.rs │ │ └── main.rs │ ├── ops │ │ ├── mod.rs │ │ ├── bench.rs │ │ ├── position.rs │ │ ├── depth.rs │ │ ├── chop.rs │ │ └── extract.rs │ ├── lib.rs │ ├── namemap.rs │ ├── memfile.rs │ ├── flatbed.rs │ ├── print.rs │ ├── parse.rs │ └── gfaline.rs ├── Cargo.toml └── README.md ├── tests ├── subset-paths │ ├── ex1.paths │ ├── ex2.paths │ ├── ex1.txt │ └── ex2.txt ├── depth │ ├── subset-paths │ │ ├── ex1.txt │ │ ├── ex2.txt │ │ ├── ex1.paths │ │ ├── ex2.paths │ │ └── turnt.toml │ ├── basic │ │ ├── ex1.gfa │ │ └── ex2.gfa │ └── turnt.toml ├── basic │ ├── ex1.gfa │ └── ex2.gfa ├── handmade │ ├── no-test-flip4.gfa │ ├── flip1.gfa │ ├── flip2.gfa │ ├── flip3.gfa │ └── crush1.gfa ├── .gitignore └── turnt.toml ├── bench ├── .gitignore ├── Makefile ├── sizes.py ├── bar.vl.json ├── config.toml ├── summary.py └── graphs.toml ├── mygfa ├── .gitignore ├── mygfa │ ├── __init__.py │ ├── __main__.py │ └── preprocess.py ├── pyproject.toml ├── docs │ ├── conf.py │ ├── Makefile │ └── index.rst ├── example.py └── README.md ├── .gitattributes ├── flatgfa-py ├── .gitignore ├── test │ ├── tiny.gaf │ ├── tiny.gfa │ ├── test_gaf.py │ └── test_flatgfa.py ├── docs │ ├── conf.py │ ├── Makefile │ └── index.rst ├── Cargo.toml ├── examples │ ├── depth.py │ └── gaf.py ├── pyproject.toml ├── README.md └── flatgfa.pyi ├── pyrightconfig.json ├── pollen_icon.png ├── pollen_icon_transparent.png ├── slow_odgi ├── slow_odgi │ ├── __init__.py │ ├── paths.py │ ├── norm.py │ ├── validate_setup.py │ ├── matrix.py │ ├── degree.py │ ├── somepaths.py │ ├── depth.py │ ├── crush.py │ ├── inject_setup.py │ ├── validate.py │ ├── proofs.py │ ├── overlap.py │ ├── flatten.py │ ├── chop.py │ ├── flip.py │ ├── inject.py │ └── __main__.py ├── pyproject.toml └── Makefile ├── pollen_py ├── pollen │ ├── __init__.py │ ├── main.py │ ├── argparse_custom.py │ └── depth │ │ ├── python_depth.py │ │ ├── processing-elements │ │ └── parse_data.py │ │ └── 
main.py ├── pyproject.toml └── README.md ├── pollen_data_gen ├── pollen_data_gen │ ├── __init__.py │ ├── __main__.py │ ├── depth.py │ └── simple.py └── pyproject.toml ├── Cargo.toml ├── .github ├── odgi.sh ├── tap-matcher.json └── workflows │ ├── code-quality.yml │ ├── docs.yml │ ├── build.yml │ └── flatgfa-py.yml ├── .zed └── settings.json ├── .gitignore ├── pyproject.toml ├── process.py ├── LICENSE ├── Makefile ├── Dockerfile └── README.md /flatgfa/tests/test.t: -------------------------------------------------------------------------------- 1 | ACTGG 2 | -------------------------------------------------------------------------------- /tests/subset-paths/ex1.paths: -------------------------------------------------------------------------------- 1 | path1 -------------------------------------------------------------------------------- /tests/subset-paths/ex2.paths: -------------------------------------------------------------------------------- 1 | path0 -------------------------------------------------------------------------------- /bench/.gitignore: -------------------------------------------------------------------------------- 1 | graphs/ 2 | results/ 3 | -------------------------------------------------------------------------------- /flatgfa/src/cli/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod cmds; 2 | -------------------------------------------------------------------------------- /mygfa/.gitignore: -------------------------------------------------------------------------------- 1 | docs/_build/ 2 | dist/ 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.gfa -text 2 | *.gaf -text 3 | -------------------------------------------------------------------------------- /tests/subset-paths/ex1.txt: 
-------------------------------------------------------------------------------- 1 | ARGS: basic/ex1.og -------------------------------------------------------------------------------- /tests/subset-paths/ex2.txt: -------------------------------------------------------------------------------- 1 | ARGS: basic/ex2.og -------------------------------------------------------------------------------- /flatgfa-py/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | docs/_build/ 3 | -------------------------------------------------------------------------------- /tests/depth/subset-paths/ex1.txt: -------------------------------------------------------------------------------- 1 | ../../subset-paths/ex1.txt -------------------------------------------------------------------------------- /tests/depth/subset-paths/ex2.txt: -------------------------------------------------------------------------------- 1 | ../../subset-paths/ex2.txt -------------------------------------------------------------------------------- /tests/depth/subset-paths/ex1.paths: -------------------------------------------------------------------------------- 1 | ../../subset-paths/ex1.paths -------------------------------------------------------------------------------- /tests/depth/subset-paths/ex2.paths: -------------------------------------------------------------------------------- 1 | ../../subset-paths/ex2.paths -------------------------------------------------------------------------------- /pyrightconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "venvPath": ".", 3 | "venv": ".venv" 4 | } 5 | -------------------------------------------------------------------------------- /pollen_icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cucapra/pollen/HEAD/pollen_icon.png 
-------------------------------------------------------------------------------- /pollen_icon_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cucapra/pollen/HEAD/pollen_icon_transparent.png -------------------------------------------------------------------------------- /tests/basic/ex1.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 A 3 | L 1 + 2 + 0M 4 | S 2 C 5 | L 2 + 2 + 0M 6 | P path1 1+,2+,2+ * 7 | -------------------------------------------------------------------------------- /tests/handmade/no-test-flip4.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 A 3 | S 2 TTT 4 | S 3 G 5 | P x 1+,2+,3+ * 6 | L 1 + 2 + 0M -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/__init__.py: -------------------------------------------------------------------------------- 1 | """A reference implementation of the odgi library.""" 2 | 3 | __version__ = "0.1" 4 | -------------------------------------------------------------------------------- /tests/depth/basic/ex1.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 A 3 | L 1 + 2 + 0M 4 | S 2 C 5 | L 2 + 2 + 0M 6 | P path1 1+,2+,2+ * 7 | -------------------------------------------------------------------------------- /bench/Makefile: -------------------------------------------------------------------------------- 1 | %.svg: %.csv bar.vl.json 2 | jq '.data.url |= "$<"' bar.vl.json | npx -p vega -p vega-lite vl2svg > $@ 3 | -------------------------------------------------------------------------------- /pollen_py/pollen/__init__.py: -------------------------------------------------------------------------------- 1 | """A collection of pangenome graph query accelerator tools""" 2 | 3 | __version__ = "1" 4 | 
-------------------------------------------------------------------------------- /flatgfa-py/test/tiny.gaf: -------------------------------------------------------------------------------- 1 | foo 12 0 12 + >1>2<4 38 5 17 12 12 0 cg:Z:150M 2 | bar 20 0 20 + >1>2>3 30 7 27 20 20 0 cg:Z:150M 3 | -------------------------------------------------------------------------------- /tests/handmade/flip1.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 A 3 | S 2 TTT 4 | S 3 G 5 | P x 1+,2-,3+ * 6 | L 1 + 2 + 0M 7 | L 2 + 3 + 0M 8 | -------------------------------------------------------------------------------- /flatgfa/src/ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod bench; 2 | pub mod chop; 3 | pub mod depth; 4 | pub mod extract; 5 | pub mod gaf; 6 | pub mod position; 7 | -------------------------------------------------------------------------------- /pollen_data_gen/pollen_data_gen/__init__.py: -------------------------------------------------------------------------------- 1 | """Converts GFA graphs into calyx-friendly .data files.""" 2 | 3 | __version__ = "0.1" 4 | -------------------------------------------------------------------------------- /tests/handmade/flip2.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 A 3 | S 2 TTT 4 | S 3 G 5 | P x 1+,2-,3+ * 6 | P y 1+,2-,3+ * 7 | L 1 + 2 + 0M 8 | L 2 + 3 + 0M 9 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = ["flatgfa", "flatgfa-py"] 4 | 5 | [profile.profiling] 6 | inherits = "release" 7 | debug = true 8 | -------------------------------------------------------------------------------- /mygfa/mygfa/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Simple GFA parsing, printing, and pre-processing in Python.""" 2 | 3 | from .gfa import * # noqa 4 | 5 | __version__ = "0.1" 6 | -------------------------------------------------------------------------------- /flatgfa/tests/turnt.toml: -------------------------------------------------------------------------------- 1 | command = "../../target/debug/fgfa seq-export {filename} packed.seq ; cargo run seq-import packed.seq ; rm packed.seq" 2 | output.t = "-" 3 | -------------------------------------------------------------------------------- /.github/odgi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | path=`realpath -s --relative-to=$GITHUB_WORKSPACE ${PWD}` 3 | exec docker run -i --rm -v $GITHUB_WORKSPACE:/work --workdir /work/$path odgi odgi $@ 4 | -------------------------------------------------------------------------------- /tests/depth/turnt.toml: -------------------------------------------------------------------------------- 1 | [envs.baseline] 2 | binary = true 3 | command = "odgi depth -i {filename} -d" 4 | 5 | [envs.calyx] 6 | binary = true 7 | command = "exine depth -a -r {filename}" -------------------------------------------------------------------------------- /tests/depth/subset-paths/turnt.toml: -------------------------------------------------------------------------------- 1 | [envs.baseline] 2 | command = "odgi depth -i ../{args} -d -s {base}.paths" 3 | 4 | [envs.calyx-depth] 5 | command = "exine depth -a -r ../{args} -s {base}.paths" -------------------------------------------------------------------------------- /flatgfa-py/test/tiny.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAG 3 | S 2 AAATTTTCTGGAGTTCTAT 4 | S 3 TTG 5 | S 4 CCAACTCTCTG 6 | P one 1+,2+,4- * 7 | P two 1+,2+,3+,4- * 8 | L 1 + 2 + 0M 9 | L 2 + 4 - 0M 10 | L 2 + 3 + 0M 
11 | L 3 + 4 - 0M 12 | -------------------------------------------------------------------------------- /tests/basic/ex2.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 AA 3 | L 1 + 2 + 0M 4 | S 2 AC 5 | L 2 + 3 + 0M 6 | L 2 + 4 + 0M 7 | S 3 AG 8 | L 3 + 2 + 0M 9 | L 3 + 5 + 0M 10 | S 4 AT 11 | L 4 + 5 + 0M 12 | S 5 CG 13 | P path0 1+,2+,3+,2+,4+,5+ * 14 | P path1 1+,2+,3+,5+ * 15 | -------------------------------------------------------------------------------- /tests/depth/basic/ex2.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 AA 3 | L 1 + 2 + 0M 4 | S 2 AC 5 | L 2 + 3 + 0M 6 | L 2 + 4 + 0M 7 | S 3 AG 8 | L 3 + 2 + 0M 9 | L 3 + 5 + 0M 10 | S 4 AT 11 | L 4 + 5 + 0M 12 | S 5 CG 13 | P path0 1+,2+,3+,2+,4+,5+ * 14 | P path1 1+,2+,3+,5+ * 15 | -------------------------------------------------------------------------------- /flatgfa/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod cli; 2 | pub mod file; 3 | pub mod flatbed; 4 | pub mod flatgfa; 5 | pub mod gfaline; 6 | pub mod memfile; 7 | pub mod namemap; 8 | pub mod ops; 9 | pub mod packedseq; 10 | pub mod parse; 11 | pub mod pool; 12 | pub mod print; 13 | 14 | pub use flatgfa::*; 15 | -------------------------------------------------------------------------------- /.zed/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "language_overrides": { 3 | "Python": { 4 | "formatter": { 5 | "external": { 6 | "command": "bash", 7 | "arguments": ["-c", "ruff format --stdin-filename {buffer_path}"] 8 | } 9 | } 10 | } 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /mygfa/mygfa/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from .gfa import Graph 3 | 4 | 5 | if __name__ == 
"__main__": 6 | mygraph = Graph.parse(sys.stdin) 7 | if len(sys.argv) > 1 and sys.argv[1] == "--nl": 8 | mygraph.emit(sys.stdout, False) 9 | else: 10 | mygraph.emit(sys.stdout) 11 | -------------------------------------------------------------------------------- /flatgfa-py/docs/conf.py: -------------------------------------------------------------------------------- 1 | project = "flatgfa" 2 | copyright = "2024, Capra Lab" 3 | author = "Capra Lab" 4 | 5 | extensions = ["sphinx.ext.autodoc"] 6 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 7 | 8 | html_theme = "alabaster" 9 | 10 | autodoc_member_order = "bysource" 11 | autodoc_typehints_format = "short" 12 | -------------------------------------------------------------------------------- /flatgfa-py/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "flatgfa-py" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [lib] 7 | name = "flatgfa" 8 | crate-type = ["cdylib"] 9 | 10 | [dependencies] 11 | pyo3 = { version = "0.25.0", features = ["abi3-py38", "multiple-pymethods"] } 12 | flatgfa = { path = "../flatgfa" } 13 | memmap = "0.7.0" 14 | -------------------------------------------------------------------------------- /flatgfa-py/examples/depth.py: -------------------------------------------------------------------------------- 1 | import flatgfa 2 | from collections import Counter 3 | 4 | graph = flatgfa.parse("../tests/k.gfa") 5 | depths = Counter() 6 | for path in graph.paths: 7 | for step in path: 8 | depths[step.segment.id] += 1 9 | 10 | print("#node.id\tdepth") 11 | for seg in graph.segments: 12 | print("{}\t{}".format(seg.name, depths[seg.id])) 13 | -------------------------------------------------------------------------------- /pollen_py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 
5 | [project] 6 | name = "pollen" 7 | authors = [ 8 | { name = "Susan Garry", email = "shg64@cs.cornell.edu" } 9 | ] 10 | readme = "README.md" 11 | dynamic = ["version", "description"] 12 | 13 | [project.scripts] 14 | exine = "pollen.main:main" 15 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/paths.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import mygfa 3 | 4 | 5 | def paths(graph: mygfa.Graph) -> mygfa.Graph: 6 | """Print the names of the paths found in `graph`.""" 7 | pathnames = graph.paths.keys() 8 | print("\n".join(pathnames)) 9 | return graph 10 | 11 | 12 | if __name__ == "__main__": 13 | paths(mygfa.Graph.parse(open(sys.argv[1], "r", encoding="utf-8"))) 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | .DS_Store 4 | **/*.chop 5 | **/*.crush 6 | **/*.degree 7 | **/*.depth 8 | **/*.emit 9 | **/*.extract 10 | **/*.flatten 11 | **/*.flip 12 | **/*.matrix 13 | **/*.overlap 14 | **/*.validate 15 | **/*.og 16 | **/*.out 17 | **/*.flatgfa 18 | og_to_gfa.py 19 | compute_maxes.py 20 | 21 | target/ 22 | pollen/*.rlib 23 | 24 | slow_odgi/dist/ 25 | 26 | .vscode/ 27 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/norm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import mygfa 3 | 4 | 5 | def norm(graph: mygfa.Graph) -> mygfa.Graph: 6 | """Gives the graph's entries a stable order: 7 | headers, then segments, then paths, and then links. 
8 | """ 9 | return graph 10 | 11 | 12 | if __name__ == "__main__": 13 | newgraph = norm(mygfa.Graph.parse(sys.stdin)) 14 | newgraph.emit(sys.stdout, "--nl" not in sys.argv[1:]) 15 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | *.gfa 2 | *.og 3 | *.bed 4 | *.out 5 | *.chop 6 | *.crush 7 | *.degree 8 | *.depth 9 | *.depthpaths 10 | *.flatten 11 | *.flip 12 | *.inj 13 | *.matrix 14 | *.norm 15 | *.overlap 16 | *.overlappaths 17 | *.paths 18 | *.validate 19 | *.flatten.fasta 20 | 21 | temp.* 22 | 23 | basic/*.og 24 | basic/*.out 25 | 26 | subset-paths/*.out 27 | 28 | depth/*.out 29 | depth/basic/*.out 30 | depth/subset-paths/*.out 31 | -------------------------------------------------------------------------------- /flatgfa/src/ops/bench.rs: -------------------------------------------------------------------------------- 1 | use crate::memfile; 2 | use rayon::iter::ParallelIterator; 3 | 4 | // Count the lines in a file, like `wc -l`. 
5 | pub fn line_count(filename: &str, parallel: bool) -> usize { 6 | let buf = memfile::map_file(filename); 7 | let split = memfile::MemchrSplit::new(b'\n', &buf); 8 | if parallel { 9 | ParallelIterator::count(split) 10 | } else { 11 | Iterator::count(split) 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /mygfa/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "mygfa" 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}] 8 | classifiers = ["License :: OSI Approved :: MIT License"] 9 | dynamic = ["version", "description"] 10 | readme = "README.md" 11 | 12 | [project.urls] 13 | Home = "https://github.com/cucapra/pollen/tree/main/mygfa" 14 | -------------------------------------------------------------------------------- /mygfa/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Make mygfa module available to autodoc. 
2 | import sys 3 | import os 4 | 5 | sys.path.insert(0, os.path.abspath("..")) 6 | 7 | project = "mygfa" 8 | copyright = "2024, Capra Lab" 9 | author = "Capra Lab" 10 | 11 | extensions = ["sphinx.ext.autodoc"] 12 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 13 | 14 | html_theme = "alabaster" 15 | 16 | autodoc_member_order = "bysource" 17 | autodoc_typehints_format = "short" 18 | -------------------------------------------------------------------------------- /flatgfa/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "flatgfa" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [[bin]] 7 | name = "fgfa" 8 | path = "src/cli/main.rs" 9 | 10 | [dependencies] 11 | argh = "0.1.13" 12 | atoi = "2.0.0" 13 | bit-vec = "0.8.0" 14 | bstr = "1.12.0" 15 | memchr = "2.7.4" 16 | memmap = "0.7.0" 17 | num_enum = "0.7.3" 18 | rayon = "1.10.0" 19 | tinyvec = "1.9.0" 20 | zerocopy = { version = "0.8.25", features = ["derive"] } 21 | 22 | [dev-dependencies] 23 | rand = "0.8" 24 | -------------------------------------------------------------------------------- /slow_odgi/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "slow_odgi" 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}] 8 | readme = "README.md" 9 | dynamic = ["version", "description"] 10 | dependencies = ["mygfa"] 11 | 12 | [project.urls] 13 | Home = "https://github.com/cucapra/pollen/tree/main/slow_odgi" 14 | 15 | [project.scripts] 16 | slow_odgi = "slow_odgi.__main__:main" -------------------------------------------------------------------------------- /pollen_data_gen/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = 
"flit_core.buildapi" 4 | 5 | [project] 6 | name = "pollen_data_gen" 7 | authors = [{name = "Anshuman Mohan", email = "amohan@cs.cornell.edu"}] 8 | dynamic = ["version", "description"] 9 | dependencies = ["mygfa"] 10 | 11 | [project.urls] 12 | Home = "https://github.com/cucapra/pollen/tree/main/pollen_data_gen" 13 | 14 | [project.scripts] 15 | pollen_data_gen = "pollen_data_gen.__main__:main" -------------------------------------------------------------------------------- /flatgfa-py/examples/gaf.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import flatgfa 3 | 4 | TEST_DIR = pathlib.Path(__file__).parent 5 | TEST_GFA = TEST_DIR / "../test/tiny.gfa" 6 | TEST_GAF = TEST_DIR / "../test/tiny.gaf" 7 | graph = flatgfa.parse(str(TEST_GFA)) 8 | gaf = str(TEST_GAF) 9 | gaf_parser = graph.all_reads(gaf) 10 | for lines in gaf_parser: 11 | print(lines.name) 12 | print(lines.sequence()) 13 | print(lines.segment_ranges()) 14 | for element in lines: 15 | print(element.handle) 16 | print(element.range) 17 | -------------------------------------------------------------------------------- /.github/tap-matcher.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "turnt-msg", 5 | "pattern": [{ 6 | "regexp": "^not ok \\d+ - ([^#\\s]+)\\s+#\\s+(.*)", 7 | "message": 1, 8 | "file": 2 9 | }] 10 | }, 11 | { 12 | "owner": "turnt-nomsg", 13 | "pattern": [{ 14 | "regexp": "^not ok \\d+ - ([^#\\s]+)", 15 | "message": 0, 16 | "file": 1 17 | }] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /mygfa/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import mygfa 3 | 4 | 5 | def print_depth(graph: mygfa.Graph) -> None: 6 | # Count the number of times that any path passes through a segment. 
7 | seg_depths = {name: 0 for name in graph.segments} 8 | for path in graph.paths.values(): 9 | for step in path.segments: 10 | seg_depths[step.name] += 1 11 | 12 | # Print the counts. 13 | print("seg\tdepth") 14 | for name, depth in seg_depths.items(): 15 | print(f"{name}\t{depth}") 16 | 17 | 18 | if __name__ == "__main__": 19 | print_depth(mygfa.Graph.parse(sys.stdin)) 20 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pollen_workspace" 3 | version = "0.0.0" 4 | requires-python = ">=3.8" 5 | dependencies = [ 6 | "mygfa", 7 | "slow_odgi", 8 | "pollen", 9 | "pollen_data_gen", 10 | "turnt>=1.11.0", 11 | ] 12 | 13 | [tool.uv.sources] 14 | mygfa = { workspace = true } 15 | slow_odgi = { workspace = true } 16 | pollen = { workspace = true } 17 | pollen_data_gen = { workspace = true } 18 | flatgfa = { workspace = true } 19 | 20 | [tool.uv.workspace] 21 | members = [ 22 | "mygfa", 23 | "slow_odgi", 24 | "pollen_py", 25 | "pollen_data_gen", 26 | "flatgfa-py", 27 | ] 28 | -------------------------------------------------------------------------------- /flatgfa/src/ops/position.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa; 2 | 3 | pub fn position( 4 | gfa: &flatgfa::FlatGFA, 5 | path: &flatgfa::Path, 6 | offset: usize, 7 | ) -> Option<(flatgfa::Handle, usize)> { 8 | // Traverse the path until we reach the position. 9 | let mut cur_pos = 0; 10 | for step in &gfa.steps[path.steps] { 11 | let seg = gfa.get_handle_seg(*step); 12 | let end_pos = cur_pos + seg.len(); 13 | if offset < end_pos { 14 | // Found it! 
15 | return Some((*step, offset - cur_pos)); 16 | } 17 | cur_pos = end_pos; 18 | } 19 | 20 | None 21 | } 22 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/validate_setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import mygfa 4 | 5 | 6 | def drop_some_links(graph: mygfa.Graph) -> mygfa.Graph: 7 | """Given a graph, (pseudo)randomly drop 90% of the Links of the graph. 8 | This serves as a starting point from which to test `validate`. 9 | """ 10 | random.seed(4) 11 | links = list(sorted(graph.links)) 12 | links[:] = random.sample(links, int(0.1 * len(links))) 13 | return mygfa.Graph(graph.headers, graph.segments, links, graph.paths) 14 | 15 | 16 | if __name__ == "__main__": 17 | newgraph = drop_some_links(mygfa.Graph.parse(sys.stdin)) 18 | newgraph.emit(sys.stdout) 19 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/matrix.py: -------------------------------------------------------------------------------- 1 | import mygfa 2 | import mygfa.preprocess 3 | 4 | 5 | def matrix(graph: mygfa.Graph) -> mygfa.Graph: 6 | """Print the graph in sparse matrix format.""" 7 | 8 | # Just keeping up with the odgi header format... 
9 | topseg = max([int(i) for i in graph.segments.keys()]) 10 | print(" ".join(str(i) for i in [topseg, topseg, 2 * len(graph.links)])) 11 | 12 | _, outs = mygfa.preprocess.adjlist(graph) 13 | for seg, neighbors in outs.items(): 14 | for neighbor in neighbors: 15 | print(" ".join([seg.name, neighbor.name, "1"])) 16 | print(" ".join([neighbor.name, seg.name, "1"])) 17 | return graph 18 | -------------------------------------------------------------------------------- /tests/handmade/flip3.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CAAATAAG 3 | S 2 A 4 | S 3 G 5 | S 4 T 6 | S 5 C 7 | S 6 TTG 8 | S 7 A 9 | S 8 G 10 | S 9 AAATTTTCTGGAGTTCTAT 11 | S 10 A 12 | S 11 T 13 | S 12 ATAT 14 | S 13 A 15 | S 14 T 16 | S 15 CCAACTCTCTG 17 | P x 1+,3+,5+,6+,8+,9-,11+,12+,14+,15- 8M,1M,1M,3M,1M,19M,1M,4M,1M,11M 18 | L 1 + 2 + 0M 19 | L 1 + 3 + 0M 20 | L 10 + 12 + 0M 21 | L 11 + 12 + 0M 22 | L 12 + 13 + 0M 23 | L 12 + 14 + 0M 24 | L 13 + 15 + 0M 25 | L 14 + 15 + 0M 26 | L 2 + 4 + 0M 27 | L 2 + 5 + 0M 28 | L 3 + 4 + 0M 29 | L 3 + 5 + 0M 30 | L 4 + 6 + 0M 31 | L 5 + 6 + 0M 32 | L 6 + 7 + 0M 33 | L 6 + 8 + 0M 34 | L 7 + 9 + 0M 35 | L 8 + 9 + 0M 36 | L 9 + 10 + 0M 37 | L 9 + 11 + 0M 38 | -------------------------------------------------------------------------------- /tests/handmade/crush1.gfa: -------------------------------------------------------------------------------- 1 | H VN:Z:1.0 2 | S 1 CNNATNNG 3 | S 2 A 4 | S 3 G 5 | S 4 N 6 | S 5 N 7 | S 6 NNG 8 | S 7 A 9 | S 8 G 10 | S 9 NNANNNNCTGGAGNNCTAT 11 | S 10 A 12 | S 11 T 13 | S 12 NNNN 14 | S 13 A 15 | S 14 T 16 | S 15 CCNNCTCTCTG 17 | P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * 18 | P y 1+,2+,4+,6+,7+,9+,10+,12+,13+,15+ * 19 | L 1 + 2 + 0M 20 | L 1 + 3 + 0M 21 | L 2 + 4 + 0M 22 | L 2 + 5 + 0M 23 | L 3 + 4 + 0M 24 | L 3 + 5 + 0M 25 | L 4 + 6 + 0M 26 | L 5 + 6 + 0M 27 | L 6 + 7 + 0M 28 | L 6 + 8 + 0M 29 | L 7 + 9 + 0M 30 | L 8 + 9 + 0M 31 | L 9 + 10 + 0M 32 | L 9 + 11 + 
0M 33 | L 10 + 12 + 0M 34 | L 11 + 12 + 0M 35 | L 12 + 13 + 0M 36 | L 12 + 14 + 0M 37 | L 13 + 15 + 0M 38 | L 14 + 15 + 0M 39 | -------------------------------------------------------------------------------- /mygfa/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /flatgfa-py/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/degree.py: -------------------------------------------------------------------------------- 1 | import mygfa 2 | import mygfa.preprocess 3 | 4 | 5 | def degree(graph: mygfa.Graph) -> mygfa.Graph: 6 | """The degree of a node is just the cardinality of adjlist for that node.""" 7 | print("\t".join(["#node.id", "node.degree"])) 8 | ins, outs = mygfa.preprocess.adjlist(graph) 9 | for seg in graph.segments.values(): 10 | segname = seg.name 11 | out_degree = len(outs[mygfa.Handle(segname, True)]) + len( 12 | outs[mygfa.Handle(segname, False)] 13 | ) 14 | in_degree = len(ins[mygfa.Handle(segname, True)]) + len( 15 | ins[mygfa.Handle(segname, False)] 16 | ) 17 | print("\t".join([segname, str(in_degree + out_degree)])) 18 | return graph 19 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/somepaths.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import mygfa 4 | 5 | 6 | def somepaths(graph: mygfa.Graph, droprate: int = 0) -> mygfa.Graph: 7 | """Print the names of the paths found in `graph`. 8 | The droprate represents the percentage of paths to drop. 
9 | """ 10 | pathnames = list(graph.paths.keys()) 11 | if droprate > 0: 12 | random.seed(4) 13 | pathnames[:] = random.sample( 14 | pathnames, int((100 - droprate) / 100 * len(pathnames)) 15 | ) 16 | for name in pathnames: 17 | print(name) 18 | return graph 19 | 20 | 21 | if __name__ == "__main__": 22 | somepaths( 23 | mygfa.Graph.parse(open(sys.argv[1], "r", encoding="utf-8")), int(sys.argv[2]) 24 | ) 25 | -------------------------------------------------------------------------------- /flatgfa-py/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "flatgfa" 3 | version = "0.2.0" 4 | description = "efficient processing of pangenomes in Graphical Fragment Assembly (GFA) format" 5 | readme = "README.md" 6 | requires-python = ">=3.8" 7 | license = "MIT" 8 | keywords = ["genomics", "pangenomics", "gfa"] 9 | classifiers = [ 10 | "Topic :: Scientific/Engineering :: Bio-Informatics", 11 | "License :: OSI Approved :: MIT License", 12 | ] 13 | 14 | [project.urls] 15 | repository = "https://github.com/cucapra/pollen" 16 | documentation = "https://cucapra.github.io/pollen/flatgfa/" 17 | 18 | [dependency-groups] 19 | dev = ["pytest>=8.3.0"] 20 | 21 | [build-system] 22 | requires = ["maturin>=1.0,<2.0"] 23 | build-backend = "maturin" 24 | 25 | [tool.maturin] 26 | features = ["pyo3/extension-module"] 27 | -------------------------------------------------------------------------------- /flatgfa-py/test/test_gaf.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import flatgfa 3 | 4 | TEST_DIR = pathlib.Path(__file__).parent 5 | TEST_GFA = TEST_DIR / "tiny.gfa" 6 | TEST_GAF = TEST_DIR / "tiny.gaf" 7 | 8 | 9 | def test_gaf_seqs(): 10 | gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes()) 11 | gaf = gfa.all_reads(str(TEST_GAF)) 12 | seqs = ["".join(e.sequence() for e in line) for line in gaf] 13 | assert seqs == [ 14 | "AAGAAATTTTCT", 15 | 
"GAAATTTTCTGGAGTTCTAT", 16 | ] 17 | 18 | 19 | def test_gaf_ranges(): 20 | gfa = flatgfa.parse_bytes(TEST_GFA.read_bytes()) 21 | gaf = gfa.all_reads(str(TEST_GAF)) 22 | ranges = [[e.range for e in line] for line in gaf] 23 | assert ranges == [ 24 | [(5, 8), (0, 9), (1, 0)], 25 | [(7, 8), (0, 18), (0, 0)], 26 | ] 27 | -------------------------------------------------------------------------------- /pollen_py/pollen/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import exit 3 | 4 | import pollen.depth.main as depth 5 | 6 | 7 | def main(): 8 | # Parse commandline input 9 | parser = argparse.ArgumentParser() 10 | 11 | subparsers = parser.add_subparsers() 12 | 13 | depth_parser = subparsers.add_parser( 14 | "depth", help="Compute node depth", conflict_handler="resolve" 15 | ) 16 | depth.config_parser(depth_parser) 17 | depth_parser.set_defaults(command="depth") 18 | 19 | args = parser.parse_args() 20 | 21 | if "command" not in args: 22 | parser.print_help() 23 | exit(-1) 24 | 25 | if args.command == "depth": 26 | depth.run(args) 27 | 28 | else: 29 | raise Exception("Command not recognized") 30 | 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /mygfa/README.md: -------------------------------------------------------------------------------- 1 | mygfa 2 | ===== 3 | 4 | This is a simple Python library for parsing, manipulating, and emitting pangenomic graphs in the [GFA][] format. 5 | It prioritizes simplicity and clarity over performance and functionality. 
6 | 7 | As demonstrated in [`example.py`](./example.py), this is what it looks like to compute the node depth for a GFA file: 8 | 9 | import mygfa 10 | import sys 11 | graph = mygfa.Graph.parse(sys.stdin) 12 | seg_depths = {name: 0 for name in graph.segments} 13 | for path in graph.paths.values(): 14 | for step in path.segments: 15 | seg_depths[step.name] += 1 16 | 17 | Type `pip install mygfa` to get started. 18 | Then check out the [API documentation][docs]. 19 | 20 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 21 | [docs]: http://cucapra.github.io/pollen/mygfa/ 22 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/depth.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | import mygfa 3 | import mygfa.preprocess 4 | 5 | 6 | def depth(graph: mygfa.Graph, inputpaths: Optional[List[str]]) -> mygfa.Graph: 7 | """The depth of a node is the cardinality of node_step for that node.""" 8 | print("\t".join(["#node.id", "depth", "depth.uniq"])) 9 | for seg, crossings in mygfa.preprocess.node_steps(graph).items(): 10 | # Each crossing is a (path name, index on path, direction) tuple. 11 | # We only want to count crossings that are on input paths. 12 | crossings = [c for c in crossings if inputpaths is None or c[0] in inputpaths] 13 | # For depth.uniq, we need to know how many unique path-names there are. 
# Maps a power of ten to the SI prefix used when printing a byte count.
SIZE_NAMES = {
    0: "",
    3: "k",
    6: "M",
    9: "G",
    12: "T",
}


def fmt_size(count):
    """Format a byte count as a short human-readable string, e.g. `3MB`.

    The largest unit that does not exceed `count` is chosen, and the scaled
    value is printed with no decimal places. Counts smaller than one byte
    (i.e., zero) are printed as plain bytes rather than returning `None`.
    """
    # Try units from largest to smallest, independent of dict ordering.
    for scale in sorted(SIZE_NAMES, reverse=True):
        unit = 10**scale
        # `>=` so that an exact multiple (1000 -> "1kB") and a single byte
        # (1 -> "1B") land in the right bucket.
        if count >= unit:
            return "{:.0f}{}B".format(count / unit, SIZE_NAMES[scale])
    # Nothing matched: the count is below the smallest unit (e.g., 0).
    return "{:.0f}B".format(count)
def print_bed(graph: mygfa.Graph) -> None:
    """Creates a reasonable query for `inject`.
    Each entry of the output is a BED where:
    `name` is the name of an existing path.
    `lo`/`hi` are the start/end points that we should walk over; lo <= hi.
    `new` is the name of the path we wish to create.

    Between zero and five entries are printed per path. The random seed is
    fixed, so the output is reproducible for a given graph.
    """
    random.seed(4)
    # `pathseq` is computed from the whole graph and indexed by path name,
    # so compute it once rather than once per path.
    pathseqs = mygfa.preprocess.pathseq(graph)
    for path in graph.paths.values():
        length = len(pathseqs[path.name])
        count = random.randint(0, 5)
        if length == 0:
            # An empty path has no valid [lo, hi) interval to sample.
            continue
        for i in range(count):
            low = random.randint(0, length - 1)
            high = random.randint(low + 1, length)
            bed = mygfa.Bed(path.name, low, high, f"{path.name}_{i}")
            print(bed)
"xOffset": { "field": "cmd" }, 15 | "y": { "field": "mean", "type": "quantitative", 16 | "title": "running time (seconds)" }, 17 | "color": { "field": "cmd", "title": null } 18 | } 19 | }, 20 | { 21 | "mark": {"type": "errorbar", "ticks": {"color": "black"}}, 22 | "encoding": { 23 | "x": { "field": "graph", "type": "nominal" }, 24 | "xOffset": { "field": "cmd" }, 25 | "y": { "field": "mean", "type": "quantitative", 26 | "title": "running time (seconds)" }, 27 | "yError": { "field": "stddev" } 28 | } 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/validate.py: -------------------------------------------------------------------------------- 1 | import mygfa 2 | import mygfa.preprocess 3 | 4 | 5 | def validate(graph: mygfa.Graph) -> mygfa.Graph: 6 | """Does the underlying set of Links support the paths that the graph has?""" 7 | _, outs = mygfa.preprocess.adjlist(graph) 8 | 9 | for path in graph.paths.values(): 10 | length = len(path.segments) 11 | if length < 2: 12 | continue # Success: done with this path. 13 | for i in range(length - 1): 14 | seg_from = path.segments[i] 15 | seg_to = path.segments[i + 1] 16 | if ( 17 | seg_to not in outs[seg_from] 18 | and seg_from.rev() not in outs[seg_to.rev()] 19 | ): 20 | print( 21 | f"[odgi::validate] error: the path {path.name} " 22 | "does not respect the graph topology: the link " 23 | f"{seg_from},{seg_to} is missing." 24 | ) 25 | return graph 26 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/proofs.py: -------------------------------------------------------------------------------- 1 | import mygfa 2 | import mygfa.preprocess 3 | 4 | 5 | def paths_logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool: 6 | """Are the paths in g1 logically "less than or equal to" those in g2? 
7 | That is, for all paths p in g1, does the sequence charted by 8 | p in g1 match the sequence charted by p in g2? 9 | """ 10 | pathseqs_g1 = mygfa.preprocess.pathseq(g1) 11 | pathseqs_g2 = mygfa.preprocess.pathseq(g2) 12 | for p in g1.paths.keys(): 13 | if p not in g2.paths.keys() or pathseqs_g1[p] != pathseqs_g2[p]: 14 | return False 15 | return True 16 | 17 | 18 | def logically_le(g1: mygfa.Graph, g2: mygfa.Graph) -> bool: 19 | """Is `g1` logically "less than or equal to" `g2`? 20 | That is, can a user of `g1` use `g2` without a hitch? 21 | Note that `g2` is allowed to have more stuff than `g1`. 22 | 23 | Will add more line items to this as we think of them! 24 | """ 25 | return paths_logically_le(g1, g2) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Cornell University 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
def overlap(graph: mygfa.Graph, inputpaths: List[str]) -> mygfa.Graph:
    """Which paths touch these input paths?

    For every input path, print one tab-separated row per other path that
    shares at least one segment with it: the input path's name, `0`, the
    length of the input path's sequence, and the name of the touched path.
    The header row is printed only if at least one row follows it.

    Every name in `inputpaths` must be a path of `graph`. Returns `graph`
    unchanged.
    """
    header_printed = False
    # Path sequences are only needed once a touching pair is found, and they
    # do not change during the scan: compute them lazily, at most once.
    pathseqs = None
    for ip in inputpaths:
        assert ip in graph.paths
        for path in graph.paths.keys():
            if not touches(ip, path, graph):
                continue
            if not header_printed:
                print("\t".join(["#path", "start", "end", "path.touched"]))
                header_printed = True
            if pathseqs is None:
                pathseqs = mygfa.preprocess.pathseq(graph)
            print("\t".join([ip, "0", str(len(pathseqs[ip])), path]))
    return graph
ifdef SMALL
TEST_FILES := t k note5 overlap q.chop DRB1-3123
endif

# Download a test graph. `-f` makes curl fail on an HTTP error instead of
# saving the error page as if it were a GFA file.
tests/%.gfa:
	curl -fLo ./$@ $(GFA_URL)/$*.gfa

# Convert a GFA file to odgi's native format.
tests/%.og: tests/%.gfa
	odgi build -g $< -o $@

.PHONY: fetch
fetch: $(TEST_FILES:%=tests/%.gfa)

.PHONY: fetch-og
fetch-og: $(TEST_FILES:%=tests/%.og)

# Use $(MAKE) for the recursive invocation so that flags such as -j and -n
# are passed down to the sub-make.
.PHONY: test-slow-odgi
test-slow-odgi: fetch
	$(MAKE) -C slow_odgi test

.PHONY: test-flatgfa
test-flatgfa: fetch
	cd flatgfa && cargo build

	turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa

	-turnt --save -v -e chop_oracle_fgfa tests/*.gfa
	turnt -v -e flatgfa_chop tests/*.gfa

	-turnt --save -v -e odgi_depth tests/*.gfa
	turnt -v -e flatgfa_depth tests/*.gfa

	-turnt --save -v -e odgi_extract tests/*.gfa
	turnt -v -e flatgfa_extract tests/*.gfa

.PHONY: clean
clean:
	-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.extract tests/*.gfa tests/*.og
view -g -i {files[gfa]}' 27 | cmd.gfatools = '{gfatools} view {files[gfa]}' 28 | 29 | [modes.depth] 30 | cmd.flatgfa = '{fgfa} -i {files[flatgfa]} depth' 31 | cmd.odgi = '{odgi} depth -i {files[og]} -d' 32 | cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}' 33 | 34 | [modes.chop] 35 | cmd.flatgfa = '{fgfa} -i {files[flatgfa]} chop -c 3' 36 | cmd.odgi = '{odgi} chop -i {files[og]} -c 3 -o -' 37 | cmd.slow_odgi = '{slow_odgi} chop {files[gfa]} -n 3' -------------------------------------------------------------------------------- /.github/workflows/code-quality.yml: -------------------------------------------------------------------------------- 1 | name: quality 2 | on: 3 | push: 4 | pull_request: 5 | branches: [main] 6 | 7 | jobs: 8 | python: 9 | runs-on: ubuntu-latest 10 | name: Python 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: ruff check 15 | uses: astral-sh/ruff-action@v3 16 | with: 17 | src: >- 18 | mygfa 19 | slow_odgi 20 | pollen_data_gen 21 | flatgfa-py 22 | 23 | - name: ruff format 24 | uses: astral-sh/ruff-action@v3 25 | with: 26 | args: "format --check --diff" 27 | src: >- 28 | mygfa 29 | slow_odgi 30 | pollen_data_gen 31 | flatgfa-py 32 | 33 | - name: Install uv 34 | uses: astral-sh/setup-uv@v5 35 | - name: mypy 36 | run: MYPYPATH=mygfa uv tool run mypy --no-namespace-packages --disallow-untyped-defs mygfa slow_odgi pollen_data_gen 37 | 38 | rust: 39 | runs-on: ubuntu-latest 40 | name: Rust 41 | env: 42 | RUSTFLAGS: "-Dwarnings" 43 | steps: 44 | - uses: actions/checkout@v4 45 | - run: rustup toolchain install stable --no-self-update 46 | - uses: Swatinem/rust-cache@v2 47 | - run: cargo check 48 | - run: cargo clippy 49 | - run: cargo fmt --check 50 | -------------------------------------------------------------------------------- /mygfa/docs/index.rst: -------------------------------------------------------------------------------- 1 | mygfa: A Basic GFA Data Model 2 | ============================= 3 | 4 | This library parses, 
represents, and emits pangenomic variation graphs in the 5 | `GFA`_ format. Basic use looks like this:: 6 | 7 | import mygfa 8 | import sys 9 | graph = mygfa.Graph.parse(sys.stdin) 10 | seg_depths = {name: 0 for name in graph.segments} 11 | for path in graph.paths.values(): 12 | for step in path.segments: 13 | seg_depths[step.name] += 1 14 | 15 | The :class:`mygfa.Graph` class represents an entire GFA file. 16 | You can work down the object hierarchy from there to see everything that the 17 | file contains. 18 | 19 | mygfa is `on PyPI`_, so you can install it with ``pip install mygfa``. 20 | 21 | .. _GFA: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 22 | .. _on PyPI: https://pypi.org/project/mygfa/ 23 | 24 | API Reference 25 | ------------- 26 | 27 | .. automodule:: mygfa 28 | 29 | .. autoclass:: Graph 30 | :members: 31 | 32 | .. autoclass:: Segment 33 | :members: 34 | 35 | .. autoclass:: Link 36 | :members: 37 | 38 | .. autoclass:: Path 39 | :members: 40 | 41 | .. autoclass:: Handle 42 | :members: 43 | 44 | .. autoclass:: Strand 45 | :members: 46 | 47 | .. autoclass:: Alignment 48 | :members: 49 | 50 | .. autoclass:: AlignOp 51 | :members: 52 | 53 | .. toctree:: 54 | :maxdepth: 2 55 | :caption: Contents: 56 | -------------------------------------------------------------------------------- /flatgfa/src/namemap.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa::{FlatGFA, Segment}; 2 | use crate::pool::Id; 3 | use std::collections::HashMap; 4 | 5 | /// A fast way to look up segment IDs by their (integer) names. 6 | #[derive(Default)] 7 | pub struct NameMap { 8 | /// Names at most this are assigned *sequential* IDs, i.e., the ID is just the name 9 | /// minus one. 10 | sequential_max: usize, 11 | 12 | /// Non-sequential names go here. 13 | others: HashMap, 14 | } 15 | 16 | impl NameMap { 17 | pub fn insert(&mut self, name: usize, id: Id) { 18 | // Is this the next sequential name? 
If so, no need to record it in our hash table; 19 | // just bump the number of sequential names we've seen. 20 | if (name - 1) == self.sequential_max && (name - 1) == id.index() { 21 | self.sequential_max += 1; 22 | } else { 23 | self.others.insert(name, id.into()); 24 | } 25 | } 26 | 27 | pub fn get(&self, name: usize) -> Id { 28 | if name <= self.sequential_max { 29 | ((name - 1) as u32).into() 30 | } else { 31 | self.others[&name].into() 32 | } 33 | } 34 | 35 | /// Construct a name map for all the segments in a GFA. 36 | pub fn build(gfa: &FlatGFA) -> Self { 37 | let mut name_map = NameMap::default(); 38 | for (id, seg) in gfa.segs.items() { 39 | name_map.insert(seg.name, id); 40 | } 41 | name_map 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /flatgfa/src/ops/depth.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa; 2 | use bit_vec::BitVec; 3 | 4 | /// Compute the *depth* of each segment in the variation graph. 5 | /// 6 | /// The depth is defined to be the number of times that a path traverses a given 7 | /// segment. We return two values: the ordinary depth and the *unique* depth, 8 | /// which only counts each path that tarverses a given segment once. 9 | /// 10 | /// Both outputs are depth values indexed by segment ID. 11 | pub fn depth(gfa: &flatgfa::FlatGFA) -> (Vec, Vec) { 12 | // Our output vectors: the ordinary and unique depths of each segment. 13 | let mut depths = vec![0; gfa.segs.len()]; 14 | let mut uniq_depths = vec![0; gfa.segs.len()]; 15 | 16 | // This bit vector keeps track of whether the current path has already 17 | // traversed a given segment, and therefore whether we should ignore 18 | // subsequent traversals (for the purpose of counting unique depth). 19 | let mut seen = BitVec::from_elem(gfa.segs.len(), false); 20 | 21 | for path in gfa.paths.all().iter() { 22 | seen.clear(); // All segments are unseen. 
# USAGE
# With this Dockerfile in working directory,
# docker build -t username/imagename .
# (note the period at the end)
# docker run -it --rm username/imagename

# Start with latest Calyx image
FROM ghcr.io/cucapra/calyx:latest

# Go to the root directory
WORKDIR /root

# Install ODGI
# Dependencies (refresh the package lists first so the install does not
# depend on whatever index the base image happened to ship with):
RUN apt-get update && apt-get install -y build-essential cmake python3-distutils python3-dev libjemalloc-dev
# Clone:
RUN git clone --recursive https://github.com/pangenome/odgi.git
# Build:
WORKDIR /root/odgi
RUN cmake -H. -Bbuild && cmake --build build -- -j7
# Return to root directory
WORKDIR /root

# Add ODGI to paths
ENV PATH="/root/odgi/bin:$PATH"
ENV PYTHONPATH=$PYTHONPATH:/root/odgi/lib
ENV LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
ENV FLIT_ROOT_INSTALL=1

# Install Pollen's dependencies:
RUN git clone https://github.com/cucapra/turnt.git
WORKDIR /root/turnt
RUN flit install -s --user
WORKDIR /root

# Good to have:
RUN apt-get install -y emacs vim

# Clone and build Pollen:
RUN git clone https://github.com/cucapra/pollen.git
WORKDIR /root/pollen
RUN make fetch
# The top-level Makefile's target for building the .og files is `fetch-og`.
RUN make fetch-og
WORKDIR /root/pollen/pollen_py
RUN flit install -s --user
WORKDIR /root/pollen/mygfa
RUN flit install -s --user
WORKDIR /root/pollen/slow_odgi
RUN flit install -s --user

# return to the Pollen directory
WORKDIR /root/pollen
def summary():
    """Summarize benchmark timings read as CSV from standard input.

    The CSV must have `graph`, `cmd`, `mean`, and `stddev` columns, with
    times in seconds. For each graph, every command's mean time is printed
    along with its ratio to a baseline command; afterwards, the harmonic
    mean of each command's ratios across all graphs is printed.

    The baseline is the command with the smallest mean time on the first
    graph in the input. Nothing is printed when the input has no rows.
    """
    reader = csv.DictReader(sys.stdin)
    by_graph = defaultdict(dict)
    for row in reader:
        by_graph[row["graph"]][row["cmd"]] = row

    if not by_graph:
        # No data rows: there is no baseline to pick and nothing to report.
        return

    # Guess a suitable baseline by taking the fastest time on the first graph.
    # CSV fields are strings, so convert before comparing: as text, "10.0"
    # would sort before "9.0".
    first_res = next(iter(by_graph.values()))
    min_row = min(first_res.values(), key=lambda r: float(r["mean"]))
    baseline = min_row["cmd"]

    # Show each graph's times.
    ratios = defaultdict(list)
    for graph, cmds in by_graph.items():
        baseline_time = float(cmds[baseline]["mean"])

        print(graph)
        for cmd, row in cmds.items():
            mean = float(row["mean"])
            stddev = float(row["stddev"])
            ratio = mean / baseline_time
            ratios[cmd].append(ratio)

            if mean > 80:
                # Long runs read better as minutes and seconds.
                mins = int(mean / 60)
                secs = int(mean % 60)
                print(f"  {cmd}: {mins}m{secs}s ± {stddev:.1f}", end="")
            else:
                if mean < 0.2:
                    # Very short runs are shown in milliseconds.
                    mean *= 1000
                    stddev *= 1000
                    unit = "ms"
                else:
                    unit = "s"
                print(f"  {cmd}: {mean:.1f} ± {stddev:.1f} {unit}", end="")

            print(f" ({ratio:.1f}× {baseline})")

    # Show the average across graphs.
    print("harmonic mean")
    for cmd, cmd_ratios in ratios.items():
        hmean = harmonic_mean(cmd_ratios)
        print(f"  {cmd}: {hmean:.1f}× {baseline}")
36 | - name: Build flatgfa-py 37 | uses: PyO3/maturin-action@v1 38 | with: 39 | command: build 40 | args: --out dist --manifest-path flatgfa-py/Cargo.toml 41 | - name: Install flatgfa-py 42 | run: pip install dist/flatgfa-*.whl 43 | - name: Build flatgfa-py docs 44 | run: | 45 | cd flatgfa-py/docs 46 | make html 47 | 48 | # Create site. 49 | - name: Assemble site directory 50 | run: | 51 | mkdir -p site 52 | cp -r mygfa/docs/_build/html site/mygfa 53 | cp -r flatgfa-py/docs/_build/html site/flatgfa 54 | - name: Pages artifact 55 | uses: actions/upload-pages-artifact@v3 56 | with: 57 | path: "site" 58 | 59 | deploy: 60 | environment: 61 | name: github-pages 62 | url: ${{ steps.deploy.outputs.page_url }} 63 | runs-on: ubuntu-latest 64 | needs: build 65 | if: ${{github.event_name=='push' && github.ref=='refs/heads/main' && github.repository_owner=='cucapra'}} 66 | steps: 67 | - id: deploy 68 | uses: actions/deploy-pages@v4 69 | -------------------------------------------------------------------------------- /slow_odgi/Makefile: -------------------------------------------------------------------------------- 1 | # We use the small set of tests by default, because larger files make 2 | # slow_odgi go *really* slow. 3 | TESTS := t k note5 overlap q.chop DRB1-3123 4 | 5 | GFA := $(TESTS:%=../tests/%.gfa) 6 | OG := $(TESTS:%=../tests/%.og) 7 | 8 | %.og: %.gfa 9 | odgi build -g $^ -o $@ 10 | 11 | # Sets up all the odgi-oracles and then tests slow_odgi against them. 12 | test: setup oracles slow-odgi 13 | 14 | # Produce some input files that are necessary for the slow_odgi tests. 15 | setup: $(OG) 16 | -turnt -j --save --env depth_setup --env inject_setup \ 17 | --env overlap_setup --env validate_setup $(GFA) 18 | 19 | # Produce the oracle output (from "real" odgi) for each test input. Run this 20 | # once, noisily, to obtain the expected outputs. Then run `slow-odgi` to 21 | # compare against these expected outputs. 22 | # In reality, this depends on the setup stage above. 
Run this by itself ONLY 23 | # if you know that the setup stages don't need to be run afresh. 24 | ORACLES := chop_oracle crush_oracle degree_oracle depth_oracle \ 25 | flip_oracle flatten_oracle inject_oracle matrix_oracle overlap_oracle \ 26 | paths_oracle validate_oracle 27 | oracles: $(OG) 28 | -turnt -j --save $(ORACLES:%=--env %) $(OG) 29 | -turnt -j --save --env validate_oracle_err ../tests/invalid/*.gfa 30 | -turnt -j --save --env crush_oracle ../tests/handmade/crush*.gfa 31 | -turnt -j --save --env flip_oracle ../tests/handmade/flip*.gfa 32 | 33 | # Test slow_odgi against the output files generated by the `oracles` 34 | # target above. Be sure to rerun that before this if the inputs or odgi 35 | # behavior change. 36 | TEST_ENVS := chop_test crush_test degree_test depth_test flip_test \ 37 | flatten_test inject_test matrix_test overlap_test paths_test validate_test 38 | slow-odgi: 39 | -turnt -j $(TEST_ENVS:%=--env %) $(GFA) 40 | -turnt -j --env validate_test ../tests/invalid/*.gfa 41 | -turnt -j --env crush_test ../tests/handmade/crush*.gfa 42 | -turnt -j --env flip_test ../tests/handmade/flip*.gfa 43 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/flatten.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import mygfa 3 | 4 | 5 | def get_fasta_legend(graph: mygfa.Graph) -> Tuple[str, mygfa.LegendType]: 6 | """The main deliverable is the FASTA: 7 | Simply traverse the segments in order and glue their seqs together. 8 | However, it pays to do some bookkeeping now. 9 | legend[segname] stores the [start, end) of the spot in the FASTA that 10 | segname's seq is featured. 
def insert_newlines(string: str, every: int = 80) -> str:
    """Break `string` into rows of at most `every` characters, joined by
    newlines. odgi wraps its output for this algorithm, so we mirror it.
    """
    rows = []
    for offset in range(0, len(string), every):
        rows.append(string[offset : offset + every])
    return "\n".join(rows)
def chop_segs(
    graph: mygfa.Graph, choplength: int
) -> Tuple[Dict[str, mygfa.Segment], mygfa.LegendType]:
    """Chop all the sequences of the graph into length `choplength` or lower.

    Returns a pair:
    - the new segments, keyed by their freshly generated names ("1", "2", ...
      in the order they are created), and
    - a legend mapping each original segment name to the half-open range
      (first, one-past-last) of new segment numbers that replaced it.
    """
    # If a segment is chopped, its sequence will be spread out over a series
    # of contiguous new segments.

    # While not important for segment-chopping itself, it will serve us well to
    # maintain a dict that bookkeeps this chopping.

    # For example, if
    #   S 3 = ATGGCCC
    # gets chopped into
    #   S 7 = AT
    #   S 8 = GG
    #   S 9 = CC
    #   S 10 = C
    # then legend[3] = (7,11).

    # Later, if 3+ occurs in a path, we will replace it with 7+,8+,9+,10+.
    # If 3- occurs in a path, we will replace it with 10-,9-,8-,7-.
    legend: mygfa.LegendType = {}
    new_segs: Dict[str, mygfa.Segment] = {}
    seg_count = 1  # To generate names for the new segments.

    for segment in graph.segments.values():
        seg_count_start = seg_count
        for chopped_seq in segment.seq.chop(choplength):  # From seqs to segs.
            seg_name = str(seg_count)
            # Insert directly into the result. The previous
            # `new_segs = new_segs | chopped_segs` re-copied every segment
            # accumulated so far once per input segment.
            new_segs[seg_name] = mygfa.Segment(seg_name, chopped_seq)
            seg_count += 1
        legend[segment.name] = (seg_count_start, seg_count)

    return new_segs, legend
23 | 24 | class Path: 25 | id: int 26 | name: bytes 27 | 28 | def __iter__(self) -> Iterator[Handle]: ... 29 | @overload 30 | def __getitem__(self, idx: int) -> Handle: ... 31 | @overload 32 | def __getitem__(self, slice: slice) -> StepList: ... 33 | 34 | class Link: 35 | id: int 36 | from_: Handle 37 | to: Handle 38 | 39 | class SegmentList: 40 | @overload 41 | def __getitem__(self, idx: int) -> Segment: ... 42 | @overload 43 | def __getitem__(self, slice: slice) -> SegmentList: ... 44 | def __iter__(self) -> Iterator[Segment]: ... 45 | def __len__(self) -> int: ... 46 | def find(self, name: int) -> Optional[Segment]: ... 47 | 48 | class PathList: 49 | @overload 50 | def __getitem__(self, idx: int) -> Path: ... 51 | @overload 52 | def __getitem__(self, slice: slice) -> PathList: ... 53 | def __iter__(self) -> Iterator[Path]: ... 54 | def __len__(self) -> int: ... 55 | def find(self, name: bytes) -> Optional[Path]: ... 56 | 57 | class LinkList: 58 | @overload 59 | def __getitem__(self, idx: int) -> Link: ... 60 | @overload 61 | def __getitem__(self, slice: slice) -> LinkList: ... 62 | def __iter__(self) -> Iterator[Link]: ... 63 | def __len__(self) -> int: ... 64 | 65 | class ChunkEvent: 66 | handle: Handle 67 | range: tuple[int, int] 68 | def sequence(self) -> str: ... 69 | 70 | class GAFLine: 71 | name: str 72 | chunks: list[ChunkEvent] 73 | def segment_ranges(self) -> str: ... 74 | def sequence(self) -> str: ... 75 | def __iter__(self) -> Iterator[ChunkEvent]: ... 76 | 77 | class GAFParser: 78 | def __iter__(self) -> Iterator[GAFLine]: ... 79 | 80 | class FlatGFA: 81 | segments: SegmentList 82 | paths: PathList 83 | links: LinkList 84 | 85 | def write_flatgfa(self, filename: str) -> None: ... 86 | def write_gfa(self, filename: str) -> None: ... 87 | def all_reads(self, gaf: str) -> GAFParser: ... 88 | def print_gaf_lookup(self, gaf: str) -> None: ... 89 | 90 | def parse(filename: str) -> FlatGFA: ... 91 | def load(filename: str) -> FlatGFA: ... 
92 | def parse_bytes(gfa: bytes) -> FlatGFA: ... 93 | -------------------------------------------------------------------------------- /flatgfa/README.md: -------------------------------------------------------------------------------- 1 | FlatGFA 2 | ======= 3 | 4 | This is an experimental [odgi][]-like tool for manipulating pangenome graphs in the popular [GFA][] format. It works by converting the GFA to a "flat," pointer-free representation that can be stored directly on disk for zero-copy reads and writes. 5 | 6 | [odgi]: https://odgi.readthedocs.io/en/latest/ 7 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 8 | 9 | Build 10 | ----- 11 | 12 | It's a Rust project, so all you need to do is: 13 | 14 | $ cargo build --release 15 | 16 | Then you might like to do something like this to put a symlink on your `$PATH`: 17 | 18 | $ ln -s `pwd`/target/release/fgfa ~/.local/bin 19 | 20 | Now see what's available: 21 | 22 | $ fgfa --help 23 | 24 | Convert GFA Files 25 | ----------------- 26 | 27 | This tool can run queries directly on GFA text files, but you can amortize that cost by converting to the native FlatGFA format. Try this: 28 | 29 | $ fgfa -I chr22.hprc-v1.0-pggb.gfa -o chr22.flatgfa 30 | 31 | In general, you will want to remember these flags for input and output: 32 | 33 | * `-i` or `-o`: Read or write our native FlatGFA binary format. 34 | * `-I` or `-O`: Read or write the standard GFAv1 text format. Or, just omit the relevant flag to use standard input or standard output. 35 | 36 | So combining `-I` and `-o` as above does the conversion you want. FlatGFA files should be a little smaller than their text counterparts. Now that we have one, we can convert it back to a GFA text file like this: 37 | 38 | $ fgfa -i chr22.flatgfa | less 39 | 40 | Simple Queries 41 | -------------- 42 | 43 | Here are some things we can do with FlatGFA files. 
See some basic statistics about the graph: 44 | 45 | $ fgfa -i chr22.flatgfa stats -S 46 | 47 | Or use `-L` instead to see information about self-loops. This output should match [`odgi stats`][odgi-stats]. 48 | 49 | Get a list of all the path names in the graph---or, in this case, just the first few: 50 | 51 | $ fgfa -i chr22.flatgfa paths | head 52 | 53 | Find the graph position of a given base-pair offset within a certain path, just like [`odgi position -v`][odgi-position]: 54 | 55 | $ fgfa -i chr22.flatgfa position -p chm13#chr22,12345,+ 56 | 57 | Extract a subgraph from a larger graph around a specific segment: 58 | 59 | $ fgfa -i chr22.flatgfa -o chr22.sub.flatgfa extract -n 25 -c 60 | $ fgfa -i chr22.sub.flatgfa stats -S 61 | 62 | Unfortunately, this extraction doesn't quite match [`odgi extract`][odgi-extract] yet (because I haven't quite been able to figure out how it's supposed to work). 63 | 64 | [odgi-stats]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_stats.html 65 | [odgi-position]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_position.html 66 | [odgi-extract]: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_extract.html 67 | -------------------------------------------------------------------------------- /mygfa/mygfa/preprocess.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict 2 | from . 
def node_steps(graph: mygfa.Graph) -> Dict[str, List[Tuple[str, int, bool]]]:
    """Map each segment name to the list of path steps that land on it.

    Every entry is a (path name, index on path, direction) triple. A segment
    that no path touches maps to an empty list.
    """
    visits: Dict[str, List[Tuple[str, int, bool]]] = {
        segname: [] for segname in graph.segments.keys()
    }
    for path in graph.paths.values():
        for position, step in enumerate(path.segments):
            visits[step.name].append((path.name, position, step.ori))
    return visits
def get_maxes(graph: mygfa.Graph) -> Tuple[int, int, int]:
    """Given a graph, returns:
    - the number of nodes (segments),
    - the maximum number of path steps that cross any single node,
    - the number of paths in the graph.

    For a graph with no segments the step maximum is reported as 0 rather
    than raising `ValueError` from `max()` on an empty sequence.
    """
    max_nodes = len(graph.segments)
    # node_steps() has one entry per segment, so this is a per-node maximum.
    max_steps = max((len(steps) for steps in node_steps(graph).values()), default=0)
    max_paths = len(graph.paths)
    return max_nodes, max_steps, max_paths

2 |

3 | 4 |

5 | 6 | Accelerated Pangenome Graph Queries 7 | =================================== 8 | 9 | Pollen is a nascent project to accelerate queries on pangenomic graphs. 10 | We are designing a graph-manipulating DSL that exposes functionality that pangenomicists care about. 11 | Our DSL will support graph queries in the vein of the [odgi][] project. 12 | We will compile programs written in this DSL into fast query code. 13 | Eventually, we aim to generate custom hardware accelerators for these queries via the [Calyx][] compiler. 14 | 15 | There are several things in this repository: 16 | 17 | * [mygfa](./mygfa), a simple Python library for parsing, processing, and emitting [GFA][] files. See [its documentation][mygfa-docs]. 18 | * [slow_odgi](./slow_odgi), a reference implementation of several GFA queries from the [odgi][] tool using `mygfa`. 19 | * [FlatGFA](./flatgfa), an experimental fast binary format for representing and analyzing GFA files. There are also [Python bindings](./flatgfa-py) for this library; check out [their documentation][flatgfa-py-docs]. 20 | * A proof-of-concept Calyx-based [hardware accelerator generator](./pollen_py) for a single GFA query (`odgi depth`) and a data generator for this hardware. 21 | 22 | [calyx]: https://calyxir.org 23 | [odgi]: https://odgi.readthedocs.io/en/latest/ 24 | [gfa]: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 25 | [flatgfa-py-docs]: https://cucapra.github.io/pollen/flatgfa/ 26 | 27 | 28 | `mygfa` and `slow_odgi` 29 | ----------------------- 30 | 31 | The `mygfa` library is an extremely simple Python library for representing (and parsing and emitting) GFA files. It emphasizes clarity over efficiency. Use `pip install mygfa` to get started, and read the [API documentation][mygfa-docs] for details. 32 | 33 | Similarly, `slow_odgi` is a set of GFA analyses based on `mygfa`; it's meant to act as a *reference implementation* of the much faster functionality in [odgi][]. 
Check out [the slow_odgi README](slow_odgi/) for more details. 34 | 35 | To set up both of them from this repository, try using [uv][]: 36 | 37 | $ uv run slow_odgi --help 38 | 39 | Or, alternatively, you can set up and activate the environment manually: 40 | 41 | $ uv sync 42 | $ source .venv/bin/activate 43 | $ slow_odgi --help 44 | 45 | [uv]: https://github.com/astral-sh/uv 46 | [mygfa-docs]: http://cucapra.github.io/pollen/mygfa/ 47 | 48 | 49 | FlatGFA 50 | ------- 51 | 52 | [FlatGFA](./flatgfa) is an efficient representation for GFA files. It is implemented in Rust and available with [Python bindings](./flatgfa-py). The latter is [on PyPI][flatgfa-pypi], so you can get started with: 53 | 54 | $ pip install flatgfa 55 | 56 | Then read the [API documentation][flatgfa-py-docs] to see what's available. Or see [the included examples](./flatgfa-py/examples/) for a synopsis. 57 | 58 | [flatgfa-pypi]: https://pypi.org/project/flatgfa/ 59 | 60 | 61 | Credits 62 | ------- 63 | 64 | This is a project of the [Capra][] lab at Cornell. 65 | The license is [MIT][]. 66 | 67 | [capra]: https://capra.cs.cornell.edu 68 | [mit]: https://choosealicense.com/licenses/mit/ 69 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/flip.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Dict 2 | from collections.abc import Callable 3 | import mygfa 4 | 5 | 6 | def path_is_rev(path: mygfa.Path, graph: mygfa.Graph) -> bool: 7 | """Is this path more reverse-oriented than it is forward-oriented?""" 8 | fwd = 0 9 | rev = 0 10 | for seg in path.segments: 11 | length = len(graph.segments[seg.name].seq) 12 | if seg.ori: 13 | fwd += length 14 | else: 15 | rev += length 16 | return rev > fwd 17 | 18 | 19 | def flip_path(path: mygfa.Path, graph: mygfa.Graph) -> Tuple[mygfa.Path, bool]: 20 | """Flip the given path if it is more reverse- than forward-oriented.
def dedup(mylist: List[mygfa.Link]) -> List[mygfa.Link]:
    """Return the links of `mylist` in first-seen order with duplicates removed.

    A link is dropped when either it or its reverse (`rev()`) has already
    been kept: odgi seems to consider a link's reverse its own duplicate.
    """
    kept: List[mygfa.Link] = []
    for link in mylist:
        if link in kept:
            continue
        if link.rev() in kept:
            continue
        kept.append(link)
    return kept
64 | for i in range(length - 1): 65 | from_ = path.segments[i] 66 | to = path.segments[i + 1] 67 | links.append(mygfa.Link(from_, to, alignment)) 68 | return links 69 | 70 | 71 | def flip(graph: mygfa.Graph) -> mygfa.Graph: 72 | """Flip the paths, and generate new links that make the graph valid.""" 73 | paths_dec = {name: flip_path(p, graph) for name, p in graph.paths.items()} 74 | # paths_dec is "decorated" with info re: 75 | # whether a path has just been flipped. 76 | new_links = gen_links(paths_dec, lambda x: x) 77 | paths = {name: p for name, (p, _) in paths_dec.items()} 78 | # Stripping the decoration off paths_dec gives a reasonable 79 | # Dict[str, Path]. 80 | return mygfa.Graph( 81 | graph.headers, graph.segments, dedup(graph.links + new_links), paths 82 | ) 83 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | test-py: 13 | name: test Python tools 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.12" 20 | 21 | # Set up and use uv. 22 | - uses: actions/cache@v4 23 | id: cache-uv 24 | with: 25 | path: ~/.cache/uv 26 | key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv 27 | - name: uv sync and activate 28 | run: | 29 | curl -LsSf https://astral.sh/uv/install.sh | sh 30 | uv sync 31 | echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV 32 | echo "$PWD/.venv/bin" >> $GITHUB_PATH 33 | 34 | # Set up for tests. 
35 | - name: Problem matcher 36 | run: echo '::add-matcher::.github/tap-matcher.json' 37 | - name: Fetch test data 38 | run: make fetch SMALL=1 39 | 40 | - name: Pull odgi container 41 | run: | 42 | docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 43 | docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi 44 | - name: Install odgi alias 45 | run: | 46 | mkdir -p $HOME/.local/bin 47 | cp .github/odgi.sh $HOME/.local/bin/odgi 48 | chmod a+x $HOME/.local/bin/odgi 49 | 50 | # Test slow_odgi. 51 | - name: Set up for slow_odgi tests 52 | run: make -C slow_odgi setup oracles SMALL=1 53 | - name: Test slow_odgi 54 | run: make -C slow_odgi test SMALL=1 55 | 56 | test-flatgfa: 57 | name: test FlatGFA 58 | runs-on: ubuntu-latest 59 | steps: 60 | - uses: actions/checkout@v4 61 | - run: rustup toolchain install stable --no-self-update 62 | 63 | # Install slow-odgi. 64 | - uses: actions/cache@v4 65 | id: cache-uv 66 | with: 67 | path: ~/.cache/uv 68 | key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv 69 | - name: uv sync and activate 70 | run: | 71 | curl -LsSf https://astral.sh/uv/install.sh | sh 72 | uv sync 73 | echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV 74 | echo "$PWD/.venv/bin" >> $GITHUB_PATH 75 | 76 | # Install odgi 77 | - name: Pull odgi container 78 | run: | 79 | docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 80 | docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi 81 | - name: Install odgi alias 82 | run: | 83 | mkdir -p $HOME/.local/bin 84 | cp .github/odgi.sh $HOME/.local/bin/odgi 85 | chmod a+x $HOME/.local/bin/odgi 86 | 87 | # Install Turnt. 88 | - uses: actions/setup-python@v5 89 | with: 90 | python-version: "3.12" 91 | - name: Install Turnt 92 | run: pip install turnt 93 | - name: Problem matcher 94 | run: echo '::add-matcher::.github/tap-matcher.json' 95 | 96 | # We need the test data. 97 | - name: Fetch test data 98 | run: make fetch SMALL=1 99 | 100 | # Build and test. 
def get_depth_table(graph, subset_paths=None):
    """
    Input: an odgi.graph object, and optionally a collection of path names
    Output: the node depth table, a dictionary that maps from a node's id to
        its (depth, uniq_depth), where depth is the total number of times the
        paths in subset_paths cross the node, and uniq_depth is the number of
        distinct paths in subset_paths which cross the node
    Note: if subset_paths is empty or None, consider all paths when computing
        node depth
    """

    # Build the membership set once, up front. The callback below runs for
    # every step of every node, and testing against the caller's list there
    # would rescan the whole list each time.
    wanted = set(subset_paths) if subset_paths else None

    ndt = dict()  # node depth table map from node.id -> (node.depth, node.uniq_depth)

    # Compute the node depth and unique depth
    def get_node_depth(handle):
        """
        Input: [handle] is an odgi.handle object which represents a node
        Inserts node.depth and node.uniq into ndt for the node associated with
        [handle]
        """

        # Note: a node can have multiple handles, but only one id
        node_id = graph.get_id(handle)

        paths = set()
        depth = 0

        # For a given path step, update the node depth and set of paths which cross the node
        def for_step(step):
            nonlocal depth  # modify the 'depth' variable in the outer scope
            path_h = graph.get_path_handle_of_step(step)
            # The name of the path associated with path_h
            path = graph.get_path_name(path_h)
            if wanted is None or path in wanted:
                paths.add(path)
                depth += 1

        graph.for_each_step_on_handle(handle, for_step)

        ndt[node_id] = (depth, len(paths))

    graph.for_each_handle(get_node_depth)
    return ndt
See the odgi documentation for more details", 81 | ) 82 | args = parser.parse_args() 83 | 84 | graph = odgi.graph() 85 | graph.load(args.filename) 86 | 87 | # Get the set of all paths specified in the file give 88 | subset_paths = parse_paths_file(args.subset_paths) 89 | 90 | # Get the node depths for all nodes in the graph 91 | ndt = get_depth_table(graph, subset_paths) 92 | 93 | # Print the ndt to the standard output 94 | print("#node.id\tdepth\tdepth.uniq") 95 | for id, (depth, uniq) in sorted(ndt.items()): 96 | print(f"{id}\t{depth}\t{uniq}") 97 | -------------------------------------------------------------------------------- /flatgfa/src/memfile.rs: -------------------------------------------------------------------------------- 1 | use memmap::{Mmap, MmapMut}; 2 | use rayon::iter::{ 3 | plumbing::{bridge_unindexed, UnindexedConsumer, UnindexedProducer}, 4 | ParallelIterator, 5 | }; 6 | 7 | pub fn map_file(name: &str) -> Mmap { 8 | let file = std::fs::File::open(name).unwrap(); 9 | unsafe { Mmap::map(&file) }.unwrap() 10 | } 11 | 12 | pub fn map_new_file(name: &str, size: u64) -> MmapMut { 13 | let file = std::fs::OpenOptions::new() 14 | .read(true) 15 | .write(true) 16 | .truncate(true) 17 | .create(true) 18 | .open(name) 19 | .unwrap(); 20 | file.set_len(size).unwrap(); 21 | unsafe { MmapMut::map_mut(&file) }.unwrap() 22 | } 23 | 24 | pub fn map_file_mut(name: &str) -> MmapMut { 25 | let file = std::fs::OpenOptions::new() 26 | .read(true) 27 | .write(true) 28 | .open(name) 29 | .unwrap(); 30 | unsafe { MmapMut::map_mut(&file) }.unwrap() 31 | } 32 | 33 | pub struct MemchrSplit<'a> { 34 | needle: u8, 35 | haystack: &'a [u8], 36 | memchr: memchr::Memchr<'a>, 37 | pub pos: usize, 38 | } 39 | 40 | impl MemchrSplit<'_> { 41 | pub fn new(needle: u8, haystack: &[u8]) -> MemchrSplit<'_> { 42 | MemchrSplit { 43 | needle, 44 | haystack, 45 | memchr: memchr::memchr_iter(needle, haystack), 46 | pos: 0, 47 | } 48 | } 49 | } 50 | 51 | impl<'a> Iterator for MemchrSplit<'a> 
{ 52 | type Item = &'a [u8]; 53 | 54 | fn next(&mut self) -> Option { 55 | if self.pos >= self.haystack.len() { 56 | return None; 57 | } 58 | let start = self.pos; 59 | let end = self.memchr.next()?; 60 | self.pos = end + 1; 61 | Some(&self.haystack[start..end]) 62 | } 63 | } 64 | 65 | impl<'a> UnindexedProducer for MemchrSplit<'a> { 66 | type Item = &'a [u8]; 67 | 68 | fn split(self) -> (Self, Option) { 69 | // Roughly chop the buffer in half. Maybe this should give up if the current 70 | // size is already below a threshold. 71 | let mid = self.pos + (self.haystack.len() - self.pos) / 2; 72 | if mid >= self.haystack.len() || mid == 0 { 73 | return (self, None); 74 | }; 75 | 76 | // Advance the midpoint to a needle boundary. 77 | let mid_nl = memchr::memchr(self.needle, &self.haystack[mid..]); 78 | let right_start = match mid_nl { 79 | Some(mid_nl) => mid + mid_nl + 1, 80 | None => return (self, None), 81 | }; 82 | 83 | // Create two sub-iterators. 84 | let left = Self { 85 | needle: self.needle, 86 | haystack: &self.haystack[..right_start], 87 | memchr: self.memchr, 88 | pos: self.pos, 89 | }; 90 | let right_buf = &self.haystack[right_start..]; 91 | let right = Self { 92 | needle: self.needle, 93 | haystack: right_buf, 94 | memchr: memchr::memchr_iter(self.needle, right_buf), 95 | pos: 0, 96 | }; 97 | (left, Some(right)) 98 | } 99 | 100 | fn fold_with(self, folder: F) -> F 101 | where 102 | F: rayon::iter::plumbing::Folder, 103 | { 104 | folder.consume_iter(self) 105 | } 106 | } 107 | 108 | impl<'a> ParallelIterator for MemchrSplit<'a> { 109 | type Item = &'a [u8]; 110 | 111 | fn drive_unindexed(self, consumer: C) -> C::Result 112 | where 113 | C: UnindexedConsumer, 114 | { 115 | bridge_unindexed(self, consumer) 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /pollen_data_gen/pollen_data_gen/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import 
argparse 3 | import mygfa 4 | from typing import List 5 | 6 | from . import depth, simple 7 | 8 | 9 | def parse_args() -> tuple[argparse.ArgumentParser, argparse.Namespace]: 10 | """Parse command line arguments and run the appropriate subcommand.""" 11 | parser = argparse.ArgumentParser() 12 | 13 | subparsers = parser.add_subparsers( 14 | title="pollen-data-gen commands", metavar="COMMAND", dest="command" 15 | ) 16 | 17 | simple_parser = subparsers.add_parser( 18 | "simple", help="Produces a simple JSON serialization of the graph." 19 | ) 20 | # Optional arguments - argparse automatically infers flags beginning with '-' as optional 21 | simple_parser.add_argument( 22 | "-n", 23 | help="The max number of nodes.", 24 | ) 25 | simple_parser.add_argument( 26 | "-e", 27 | help="The max number of steps per node.", 28 | ) 29 | simple_parser.add_argument( 30 | "-p", 31 | help="The max number of paths.", 32 | ) 33 | simple_parser.add_argument( 34 | "-s", 35 | "--subset-paths", 36 | help="A file where each line is a path of the graph to consider when calculating node depth", 37 | ) 38 | 39 | _ = subparsers.add_parser( 40 | "roundtrip", 41 | help="Checks that we can serialize the deserilize the graph losslessly.", 42 | ) 43 | 44 | depth_parser = subparsers.add_parser( 45 | "depth", help="Produces a `depth`-specific JSON of the graph." 46 | ) 47 | depth_parser.add_argument( 48 | "-n", 49 | help="The max number of nodes.", 50 | ) 51 | depth_parser.add_argument( 52 | "-e", 53 | help="The max number of steps per node.", 54 | ) 55 | depth_parser.add_argument( 56 | "-p", 57 | help="The max number of paths.", 58 | ) 59 | depth_parser.add_argument( 60 | "-s", 61 | "--subset-paths", 62 | help="A file where each line is a path of the graph to consider when calculating node depth", 63 | ) 64 | 65 | # Add the graph argument to all subparsers. 66 | # Doing it this way means that the graph argument is sought _after_ the 67 | # command name. 
def dispatch(args: argparse.Namespace) -> None:
    """Parse the graph from the file named by `args.graph`,
    then dispatch to the pollen_data_gen command named by `args.command`.

    `args` is the namespace produced by the argument parser. The `-n`, `-e`,
    `-p` and `--subset-paths` options are only read for the subcommands that
    use them (`depth` and `simple`).
    """
    # The `roundtrip` subparser defines no --subset-paths option, so the
    # attribute is absent from its namespace; fall back to "no subset".
    subset_paths = parse_subset_paths(getattr(args, "subset_paths", None))
    # The lambdas defer reading args.n / args.e / args.p until the selected
    # command actually runs.
    name_to_func = {
        "depth": lambda g: depth.depth_stdout(g, args.n, args.e, args.p, subset_paths),
        "simple": lambda g: simple.dump(
            g, sys.stdout, args.n, args.e, args.p, subset_paths
        ),
        "roundtrip": simple.roundtrip_test,
    }
    # Close the input file as soon as parsing is done instead of leaking it.
    with open(args.graph, "r", encoding="utf-8") as infile:
        graph = mygfa.Graph.parse(infile)
    name_to_func[args.command](graph)
def where_chop(
    graph: mygfa.Graph, pathname: str, index: int
) -> Optional[Tuple[str, int]]:
    """Locate the segment of path `pathname` that must be cut so that
    nucleotide offset `index` lands on a boundary between segments.

    Returns the pair produced by `handle_pos` for the segment to cut, or
    None when `index` already coincides with a segment boundary (or when
    the path is exhausted without covering `index`).
    """
    offset = 0  # Nucleotides covered by the steps visited so far.
    for step in graph.paths[pathname].segments:
        if offset == index:
            # Already sitting on a seam; nothing to chop.
            return None
        seg_len = len(graph.segments[step.name].seq)
        seg_end = offset + seg_len
        if seg_end > index:
            # `index` falls strictly inside this step's segment.
            return handle_pos(step, seg_len, index - offset)
        offset = seg_end
    return None  # Path ended before passing `index`.
def inject(graph: mygfa.Graph, p2i: List[mygfa.Bed]) -> mygfa.Graph:
    """Add one new path per BED entry in `p2i` and return the resulting graph.

    For every entry whose source path exists, the graph is chopped so that
    both ends of the interval fall on segment seams, and the segments walked
    in between become a new path named `entry.new`.
    """
    for bed in p2i:
        if bed.name not in graph.paths:
            # odgi stays silent when the source path is absent; so do we.
            continue
        graph = chop_if_needed(graph, bed.name, bed.low)
        graph = chop_if_needed(graph, bed.name, bed.high)
        # The path table of the (possibly freshly built) graph is updated in place.
        graph.paths[bed.new] = mygfa.Path(bed.new, track_path(graph, bed), None)
    return graph
sccache: "true" 73 | - name: Upload wheels 74 | uses: actions/upload-artifact@v4 75 | with: 76 | name: wheels-windows-${{ matrix.platform.target }} 77 | path: dist 78 | - name: pytest 79 | if: ${{ !startsWith(matrix.platform.target, 'aarch64') }} 80 | shell: bash 81 | run: | 82 | set -e 83 | pip install ./dist/flatgfa*.whl 84 | pip install pytest 85 | cd flatgfa-py && pytest 86 | 87 | macos: 88 | runs-on: ${{ matrix.platform.runner }} 89 | strategy: 90 | matrix: 91 | platform: 92 | - runner: macos-latest 93 | target: x86_64 94 | - runner: macos-latest 95 | target: aarch64 96 | steps: 97 | - uses: actions/checkout@v4 98 | - uses: actions/setup-python@v5 99 | with: 100 | python-version: "3.10" 101 | - name: Build wheels 102 | uses: PyO3/maturin-action@v1 103 | with: 104 | target: ${{ matrix.platform.target }} 105 | args: --release --out dist --manifest-path flatgfa-py/Cargo.toml 106 | sccache: "true" 107 | - name: Upload wheels 108 | uses: actions/upload-artifact@v4 109 | with: 110 | name: wheels-macos-${{ matrix.platform.target }} 111 | path: dist 112 | - name: pytest 113 | if: ${{ startsWith(matrix.platform.target, 'aarch64') }} 114 | shell: bash 115 | run: | 116 | set -e 117 | pip install ./dist/flatgfa*.whl 118 | pip install pytest 119 | cd flatgfa-py && pytest 120 | 121 | sdist: 122 | runs-on: ubuntu-latest 123 | steps: 124 | - uses: actions/checkout@v4 125 | - name: Build sdist 126 | uses: PyO3/maturin-action@v1 127 | with: 128 | command: sdist 129 | args: --out dist --manifest-path flatgfa-py/Cargo.toml 130 | - name: Upload sdist 131 | uses: actions/upload-artifact@v4 132 | with: 133 | name: wheels-sdist 134 | path: dist 135 | 136 | release: 137 | name: Release 138 | runs-on: ubuntu-latest 139 | environment: release 140 | if: "startsWith(github.ref, 'refs/tags/')" 141 | needs: [linux, windows, macos, sdist] 142 | permissions: 143 | id-token: write 144 | steps: 145 | - uses: actions/download-artifact@v4 146 | - name: Publish to PyPI 147 | uses: 
PyO3/maturin-action@v1 148 | with: 149 | command: upload 150 | args: --non-interactive --skip-existing wheels-*/* 151 | -------------------------------------------------------------------------------- /flatgfa-py/docs/index.rst: -------------------------------------------------------------------------------- 1 | FlatGFA: An Efficient Pangenome Representation 2 | ============================================== 3 | 4 | .. py:module:: flatgfa 5 | 6 | `FlatGFA`_ is an efficient on-disk and in-memory way to represent 7 | pangenomic variation graphs. It can losslessly represent `GFA`_ files. 8 | Here's a quick example:: 9 | 10 | import flatgfa 11 | from collections import Counter 12 | 13 | graph = flatgfa.parse("something.gfa") 14 | depths = Counter() 15 | for path in graph.paths: 16 | for step in path: 17 | depths[step.segment.id] += 1 18 | 19 | print('#node.id\tdepth') 20 | for seg in graph.segments: 21 | print('{}\t{}'.format(seg.name, depths[seg.id])) 22 | 23 | This example computes the `node depth`_ for every segment in a graph. 24 | It starts by parsing a GFA text file, but FlatGFA also has its own efficient 25 | binary representation---you can read and write this format with 26 | :func:`load` and :meth:`FlatGFA.write_flatgfa`. 27 | 28 | The library is on `PyPI`_, so you can get started by typing 29 | ``pip install flatgfa``. 30 | 31 | .. _GFA: https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md 32 | .. _node depth: https://odgi.readthedocs.io/en/latest/rst/commands/odgi_depth.html 33 | .. _FlatGFA: https://github.com/cucapra/pollen/tree/main/flatgfa 34 | .. _PyPI: https://pypi.org/project/flatgfa/ 35 | 36 | API Reference 37 | ------------- 38 | 39 | Loading Data 40 | '''''''''''' 41 | 42 | The FlatGFA library can both read and write files in two formats: the standard 43 | `GFA`_ text format, and its own efficient binary representation (called 44 | "FlatGFA" files). Each of these functions below return a :class:`FlatGFA` 45 | object. 
Parsing GFA text can take some time, but loading a binary FlatGFA file 46 | should be very fast. 47 | 48 | .. autofunction:: parse 49 | 50 | .. autofunction:: parse_bytes 51 | 52 | .. autofunction:: load 53 | 54 | GFA Graphs 55 | '''''''''' 56 | 57 | The :class:`FlatGFA` class provides the entry point to access the data either 58 | loaded from a FlatGFA binary file or parsed from a GFA text file. Most 59 | importantly, you can iterate over the :class:`Segment`, :class:`Path`, and 60 | :class:`Link` objects that it contains. The :class:`FlatGFA` class exposes 61 | :class:`list`-like containers for each of these types:: 62 | 63 | for seg in graph.segments: 64 | print(seg.name) 65 | print(graph.segments[0].sequence()) 66 | 67 | These containers support both iteration (like the ``for`` above) and random 68 | access (like ``graph.segments[0]`` above). 69 | 70 | You can also write graphs out to disk using :meth:`FlatGFA.write_gfa` 71 | (producing a standard GFA text file) and :meth:`FlatGFA.write_flatgfa` (our 72 | binary format). If you just want a GFA string, use ``str(graph)``. 73 | 74 | .. autoclass:: FlatGFA 75 | :members: 76 | 77 | The GFA Data Model 78 | '''''''''''''''''' 79 | 80 | These classes represent the core data model for GFA graphs: 81 | :class:`Segment` for vertices in the graph, 82 | :class:`Path` for walks through the graph, 83 | and :class:`Link` for edges in the graph. 84 | Internally, all of these objects only contain references to the underlying 85 | data stored in a :class:`FlatGFA`, so they are very small, but accessing any 86 | of the associated data (such as the nucleotide sequence for a segment) requires 87 | further lookups. 88 | 89 | The :class:`Handle` class is a segment--orientation pair: both paths and links 90 | traverse these handles. 91 | 92 | To get a GFA text representation of any of these objects, use ``str(obj)``. 
93 | All these objects are equatable (so you can compare them with ``==``) and 94 | hashable (so you can store them in dicts and sets). This reflects equality on 95 | the underlying references to the data store, so two objects are equal if they 96 | refer to the same index in the same :class:`FlatGFA`. 97 | 98 | .. autoclass:: Segment 99 | :members: 100 | 101 | .. autoclass:: Path 102 | :members: 103 | 104 | .. autoclass:: Link 105 | :members: 106 | 107 | .. autoclass:: Handle 108 | :members: 109 | 110 | .. toctree:: 111 | :maxdepth: 2 112 | :caption: Contents: 113 | 114 | Iteration 115 | ''''''''' 116 | 117 | The FlatGFA library exposes special container classes to access the 118 | :class:`Segment`, :class:`Path`, and :class:`Link` objects that make up a GFA 119 | graph. These classes are meant to behave sort of like Python :class:`list` 120 | objects while supporting efficient iteration over FlatGFA's internal 121 | representation. 122 | 123 | All of these container objects support subscripting (like 124 | ``graph.segments[i]`` where ``i`` is an integer index) and iteration. 125 | 126 | .. autoclass:: SegmentList 127 | :members: 128 | 129 | .. autoclass:: PathList 130 | :members: 131 | 132 | .. autoclass:: LinkList 133 | :members: 134 | 135 | .. 
def paths_viewed_from_nodes(
    graph: mygfa.Graph, max_n: int, max_e: int, max_p: int
) -> OutputType:
    """Describe, for every node, the paths that step over it.

    Each key `path_ids<segment>` maps to the 1-based ids of the paths that
    cross that segment, zero-padded to `max_e` entries. Keys for node
    numbers beyond the number of segments, up to `max_n`, hold all zeros.
    """
    # Paths are numbered from 1 in their order within `graph.paths`.
    id_of_path = {name: number for number, name in enumerate(graph.paths, start=1)}
    id_format = format_gen(max_p.bit_length())
    result = {}

    # `node_steps` maps a segment name to its crossings; the first item of
    # each crossing is the name of the crossing path.
    steps_by_node = mygfa.preprocess.node_steps(graph)
    for seg_name, crossings in steps_by_node.items():
        ids = [id_of_path[crossing[0]] for crossing in crossings]
        ids = ids + [0] * (max_e - len(ids))
        result[f"path_ids{seg_name}"] = {"data": ids, "format": id_format}

    # Pad out the node slots that the graph does not use.
    padding = [0] * max_e
    first_unused = len(graph.segments) + 1
    for node_number in range(first_unused, max_n + 1):
        result[f"path_ids{node_number}"] = {"data": padding, "format": id_format}
    return result
class NodeDepthEncoder(JSONEncoder):
    """JSON encoder that serializes a whole graph for node-depth computation.

    The exine command `depth` is the oracle for this encoding.
    """

    def __init__(
        self,
        max_n: int,
        max_e: int,
        max_p: int,
        subset_paths: Optional[List[str]],
        **kwargs: Any,
    ) -> None:
        """Remember the size limits and the optional subset of path names;
        any remaining keyword arguments go to `JSONEncoder`.
        """
        super().__init__(**kwargs)
        self.max_n, self.max_e, self.max_p = max_n, max_e, max_p
        self.subset_paths = subset_paths

    def paths_to_idxs(self, o: mygfa.Graph) -> List[int]:
        """Translate the stored subset of path names into 1-based path ids
        (numbered in the order of `o.paths`); empty when no subset was given.
        """
        if not self.subset_paths:
            return []
        id_of_path = {name: number for number, name in enumerate(o.paths, start=1)}
        return [id_of_path[name] for name in self.subset_paths]

    def default(self, o: Any) -> Dict[str, Dict[str, Collection[object]]]:
        """Build the JSON-ready dict for graph `o`: a zeroed `depth_output`,
        the per-node path data, and a zeroed `uniq_output`.
        """
        depth_slot = {
            "data": [0] * self.max_n,
            "format": format_gen(self.max_e.bit_length()),
        }
        uniq_slot = {
            "data": [0] * self.max_n,
            "format": format_gen(self.max_p.bit_length()),
        }
        chosen_ids = self.paths_to_idxs(o)
        per_node = paths_viewed_from_nodes(o, self.max_n, self.max_e, self.max_p)
        considered = paths_to_consider(chosen_ids, self.max_n, self.max_p)

        # Later entries win on key clashes, mirroring a left-to-right merge.
        return {
            "depth_output": depth_slot,
            **per_node,
            **considered,
            "uniq_output": uniq_slot,
        }
def depth_stdout(
    graph: mygfa.Graph, max_n: int, max_e: int, max_p: int, subset_paths: List[str]
) -> None:
    """Write the `depth`-specific JSON encoding of `graph` to stdout,
    pretty-printed with two-space indentation and sorted keys.
    """
    # Round-trip through `json.loads` so the text can be re-emitted in a
    # normalized, human-readable layout.
    payload = json.loads(depth_json(graph, max_n, max_e, max_p, subset_paths))
    sys.stdout.write(json.dumps(payload, indent=2, sort_keys=True))
def test_paths(gfa):
    """Check list-like access to paths, a path's name, and its GFA text form."""
    # `gfa.paths` similarly acts like a list.
    assert len(gfa.paths) == 2
    assert len(list(gfa.paths)) == 2

    # Individual paths expose their name. The assertion below compares it
    # with a plain `str`.
    # NOTE(review): an earlier comment called the name a bytestring; confirm
    # which type the binding is meant to return.
    path = gfa.paths[0]
    assert path.name == "one"

    # GFA representation.
    assert str(path) == "P one 1+,2+,4- *"
def test_read_write_gfa(gfa, tmp_path):
    """Round-trip the fixture graph through a GFA text file on disk."""
    # Writing a FlatGFA object as GFA text must reproduce the input bytes.
    written = tmp_path / "tiny.gfa"
    gfa_path = str(written)
    gfa.write_gfa(gfa_path)
    assert TEST_GFA.read_bytes() == written.read_bytes()

    # Parsing that file from the filesystem yields an equally sized graph.
    reparsed = flatgfa.parse(gfa_path)
    assert len(reparsed.segments) == len(gfa.segments)
151 | assert len(gfa.segments[1:3]) == 2 152 | assert len(gfa.segments[2:]) == len(gfa.segments) - 2 153 | assert gfa.segments[1:3][0].name == gfa.segments[1].name 154 | 155 | assert len(gfa.paths[1:]) == 1 156 | assert len(gfa.links[2:100]) == 2 157 | 158 | assert len(list(gfa.paths[:1])) == 1 159 | 160 | # Including paths, which act like lists of steps. 161 | path = gfa.paths[0] 162 | assert len(path[2:]) == len(path) - 2 163 | assert path[2:][0] == path[2] 164 | assert len(list(path[2:])) == len(path) - 2 165 | -------------------------------------------------------------------------------- /flatgfa/src/flatbed.rs: -------------------------------------------------------------------------------- 1 | use crate::gfaline::parse_field; 2 | use crate::memfile::MemchrSplit; 3 | use crate::pool::{FixedStore, HeapStore, Id, Pool, Span, Store}; 4 | use atoi::FromRadix10; 5 | use bstr::BStr; 6 | use zerocopy::{FromBytes, IntoBytes}; 7 | 8 | /// A single interval from a BED file. 9 | #[derive(Debug, FromBytes, IntoBytes, Clone, Copy)] 10 | #[repr(C, packed)] 11 | pub struct BEDEntry { 12 | pub name: Span, 13 | pub start: u64, 14 | pub end: u64, 15 | } 16 | 17 | /// A flat representation of an entire BED file, i.e., a list of named intervals. 18 | pub struct FlatBED<'a> { 19 | pub name_data: Pool<'a, u8>, 20 | pub entries: Pool<'a, BEDEntry>, 21 | } 22 | 23 | impl FlatBED<'_> { 24 | /// Get the number of entries in this BED file 25 | pub fn get_num_entries(&self) -> usize { 26 | self.entries.len() 27 | } 28 | 29 | /// Get the name of a specific entry as a string 30 | pub fn get_name_of_entry(&self, entry: &BEDEntry) -> &BStr { 31 | self.name_data[entry.name].as_ref() 32 | } 33 | 34 | /// Get a list of all BED entries from this file that intersect with `entry`. 35 | /// `bed` is the the file that `entry` is located in, which need not be self. 
36 | pub fn get_intersects(&self, bed: &FlatBED, entry: &BEDEntry) -> Vec { 37 | self.entries 38 | .all() 39 | .iter() 40 | // To be compatible with bedtools, entries that partially overlap only 41 | // report the overlapping portion, so we need to construct new entries 42 | // here to only contain the overlap 43 | .map(|x| BEDEntry { 44 | name: x.name, 45 | start: if x.start < entry.start { 46 | entry.start 47 | } else { 48 | x.start 49 | }, 50 | end: if entry.end < x.end { entry.end } else { x.end }, 51 | }) 52 | .filter(|x| { 53 | bed.get_name_of_entry(entry).eq(self.get_name_of_entry(x)) && x.end > x.start 54 | }) 55 | .collect() 56 | } 57 | } 58 | 59 | /// The data storage pools for a `FlatBED`. 60 | #[derive(Default)] 61 | pub struct BEDStore<'a, P: StoreFamily<'a>> { 62 | pub name_data: P::Store, 63 | pub entries: P::Store, 64 | } 65 | 66 | impl<'a, P: StoreFamily<'a>> BEDStore<'a, P> { 67 | pub fn add_entry(&mut self, name: &[u8], start: u64, end: u64) -> Id { 68 | let name = self.name_data.add_slice(name); 69 | self.entries.add(BEDEntry { name, start, end }) 70 | } 71 | 72 | pub fn as_ref(&self) -> FlatBED<'_> { 73 | FlatBED { 74 | name_data: self.name_data.as_ref(), 75 | entries: self.entries.as_ref(), 76 | } 77 | } 78 | } 79 | 80 | pub trait StoreFamily<'a> { 81 | type Store: Store; 82 | } 83 | 84 | #[derive(Default)] 85 | pub struct HeapFamily; 86 | impl<'a> StoreFamily<'a> for HeapFamily { 87 | type Store = HeapStore; 88 | } 89 | 90 | pub struct FixedFamily; 91 | impl<'a> StoreFamily<'a> for FixedFamily { 92 | type Store = FixedStore<'a, T>; 93 | } 94 | 95 | /// A store for `FlatBED` data backed by fixed-size slices. 96 | /// 97 | /// This store contains `SliceVec`s, which act like `Vec`s but are allocated within 98 | /// a fixed region. This means they have a maximum size, but they can directly map 99 | /// onto the contents of a file. 
100 | pub type FixedBEDStore<'a> = BEDStore<'a, FixedFamily>; 101 | 102 | /// A mutable, in-memory data store for `FlatBED`. 103 | /// 104 | /// This store contains a bunch of `Vec`s: one per array required to implement a 105 | /// `FlatBED`. It exposes an API for building up a BED data structure, so it is 106 | /// useful for creating new ones from scratch. 107 | pub type HeapBEDStore = BEDStore<'static, HeapFamily>; 108 | 109 | type ParseResult = Result; 110 | type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>; 111 | fn parse_num(s: &[u8]) -> PartialParseResult<'_, T> { 112 | match T::from_radix_10(s) { 113 | (_, 0) => Err("expected number"), 114 | (num, used) => Ok((num, &s[used..])), 115 | } 116 | } 117 | 118 | pub struct BEDParser<'a, P: StoreFamily<'a>> { 119 | /// The flat representation we're building. 120 | flat: BEDStore<'a, P>, 121 | } 122 | 123 | impl<'a, P: StoreFamily<'a>> BEDParser<'a, P> { 124 | pub fn new(builder: BEDStore<'a, P>) -> Self { 125 | Self { flat: builder } 126 | } 127 | 128 | /// Parse a BED text file from an in-memory buffer. 
/// Print an alignment opcode as its single-letter code.
// NOTE(review): `Insertion` is written as `D` and `Deletion` as `I`, which
// looks reversed relative to the usual CIGAR letters. Confirm against the
// parser and the enum definition before changing either side, since the two
// must agree for GFA text to round-trip.
impl fmt::Display for flatgfa::AlignOpcode {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            flatgfa::AlignOpcode::Match => write!(f, "M"),
            flatgfa::AlignOpcode::Gap => write!(f, "N"),
            flatgfa::AlignOpcode::Insertion => write!(f, "D"),
            flatgfa::AlignOpcode::Deletion => write!(f, "I"),
        }
    }
}
37 | pub struct Display<'a, T>(pub &'a flatgfa::FlatGFA<'a>, pub T); 38 | 39 | impl fmt::Display for Display<'_, flatgfa::Handle> { 40 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 41 | let seg = self.0.get_handle_seg(self.1); 42 | let name = seg.name; 43 | write!(f, "{}{}", name, self.1.orient()) 44 | } 45 | } 46 | 47 | impl fmt::Display for Display<'_, &flatgfa::Path> { 48 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 49 | write!(f, "P\t{}\t", self.0.get_path_name(self.1))?; 50 | let steps = &self.0.steps[self.1.steps]; 51 | write!(f, "{}", Display(self.0, steps[0]))?; 52 | for step in steps[1..].iter() { 53 | write!(f, ",{}", Display(self.0, *step))?; 54 | } 55 | write!(f, "\t")?; 56 | let overlaps = &self.0.overlaps[self.1.overlaps]; 57 | if overlaps.is_empty() { 58 | write!(f, "*")?; 59 | } else { 60 | write!(f, "{}", self.0.get_alignment(overlaps[0]))?; 61 | for overlap in overlaps[1..].iter() { 62 | write!(f, ",{}", self.0.get_alignment(*overlap))?; 63 | } 64 | } 65 | Ok(()) 66 | } 67 | } 68 | 69 | impl fmt::Display for Display<'_, &flatgfa::Link> { 70 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 71 | let from = self.1.from; 72 | let from_name = self.0.get_handle_seg(from).name; 73 | let to = self.1.to; 74 | let to_name = self.0.get_handle_seg(to).name; 75 | write!( 76 | f, 77 | "L\t{}\t{}\t{}\t{}\t{}", 78 | from_name, 79 | from.orient(), 80 | to_name, 81 | to.orient(), 82 | self.0.get_alignment(self.1.overlap) 83 | ) 84 | } 85 | } 86 | 87 | impl fmt::Display for Display<'_, &flatgfa::Segment> { 88 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 89 | let name = self.1.name; 90 | write!(f, "S\t{}\t{}", name, self.0.get_seq(self.1))?; 91 | if !self.1.optional.is_empty() { 92 | write!(f, "\t{}", self.0.get_optional_data(self.1))?; 93 | } 94 | Ok(()) 95 | } 96 | } 97 | 98 | /// Print a graph in the order preserved from an original GFA file. 
99 | fn write_preserved(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { 100 | let mut seg_iter = gfa.segs.all().iter(); 101 | let mut path_iter = gfa.paths.all().iter(); 102 | let mut link_iter = gfa.links.all().iter(); 103 | for kind in gfa.get_line_order() { 104 | match kind { 105 | flatgfa::LineKind::Header => { 106 | let version = gfa.header; 107 | assert!(!version.is_empty()); 108 | writeln!(f, "H\t{}", bstr::BStr::new(version.all()))?; 109 | } 110 | flatgfa::LineKind::Segment => { 111 | let seg = seg_iter.next().expect("too few segments"); 112 | writeln!(f, "{}", Display(gfa, seg))?; 113 | } 114 | flatgfa::LineKind::Path => { 115 | let path = path_iter.next().expect("too few paths"); 116 | writeln!(f, "{}", Display(gfa, path))?; 117 | } 118 | flatgfa::LineKind::Link => { 119 | let link = link_iter.next().expect("too few links"); 120 | writeln!(f, "{}", Display(gfa, link))?; 121 | } 122 | } 123 | } 124 | Ok(()) 125 | } 126 | 127 | /// Print a graph in a normalized order, ignoring the original GFA line order. 128 | pub fn write_normalized(gfa: &flatgfa::FlatGFA, f: &mut fmt::Formatter<'_>) -> fmt::Result { 129 | if !gfa.header.is_empty() { 130 | writeln!(f, "H\t{}", bstr::BStr::new(gfa.header.all()))?; 131 | } 132 | for seg in gfa.segs.all().iter() { 133 | writeln!(f, "{}", Display(gfa, seg))?; 134 | } 135 | for path in gfa.paths.all().iter() { 136 | writeln!(f, "{}", Display(gfa, path))?; 137 | } 138 | for link in gfa.links.all().iter() { 139 | writeln!(f, "{}", Display(gfa, link))?; 140 | } 141 | Ok(()) 142 | } 143 | 144 | /// Print our flat representation as in GFA text format. 
145 | impl<'a> fmt::Display for &'a flatgfa::FlatGFA<'a> { 146 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 147 | if self.line_order.is_empty() { 148 | write_normalized(self, f) 149 | } else { 150 | write_preserved(self, f) 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /tests/turnt.toml: -------------------------------------------------------------------------------- 1 | [envs.chop_oracle] 2 | binary = true 3 | command = "odgi chop -i {filename} -c 3 -o - | odgi view -g -i - | slow_odgi norm --nl" 4 | output.chop = "-" 5 | 6 | [envs.chop_test] 7 | binary = true 8 | command = "slow_odgi chop {filename} -n 3" 9 | output.chop = "-" 10 | 11 | [envs.crush_oracle] 12 | binary = true 13 | command = "odgi crush -i {filename} -o - | odgi view -g -i - | slow_odgi norm" 14 | output.crush = "-" 15 | 16 | [envs.crush_test] 17 | binary = true 18 | command = "slow_odgi crush {filename}" 19 | output.crush = "-" 20 | 21 | [envs.degree_oracle] 22 | binary = true 23 | command = "odgi degree -d --input={filename}" 24 | output.degree = "-" 25 | 26 | [envs.degree_test] 27 | binary = true 28 | command = "slow_odgi degree {filename}" 29 | output.degree = "-" 30 | 31 | [envs.depth_setup] 32 | binary = true 33 | command = "slow_odgi somepaths --drop 50 {filename}" 34 | output.depthpaths = "-" 35 | 36 | [envs.depth_oracle] 37 | binary = true 38 | command = "odgi depth -d -i {filename} -s {base}.depthpaths" 39 | output.depth = "-" 40 | 41 | [envs.depth_test] 42 | binary = true 43 | command = "slow_odgi depth --paths {base}.depthpaths {filename}" 44 | output.depth = "-" 45 | 46 | [envs.flatten_oracle] 47 | binary = true 48 | command = "odgi flatten -i {filename} -f {base}.flatten.fasta -b {base}.flatten.bed; cat {base}.flatten.fasta; cat {base}.flatten.bed" 49 | output.flatten = "-" 50 | 51 | [envs.flatten_test] 52 | binary = true 53 | command = "slow_odgi flatten {filename}" 54 | output.flatten = "-" 55 | 56 | 
[envs.flip_oracle] 57 | binary = true 58 | command = "odgi flip -i {filename} -o - | odgi view -g -i - | slow_odgi norm" 59 | output.flip = "-" 60 | 61 | [envs.flip_test] 62 | binary = true 63 | command = "slow_odgi flip {filename}" 64 | output.flip = "-" 65 | 66 | [envs.inject_setup] 67 | binary = true 68 | command = "slow_odgi inject_setup < {filename}" 69 | output.bed = "-" 70 | 71 | [envs.inject_oracle] 72 | binary = true 73 | command = "odgi inject -i {filename} -b {base}.bed -o - | odgi view -g -i - | slow_odgi norm --nl" 74 | output.inj = "-" 75 | 76 | [envs.inject_test] 77 | binary = true 78 | command = "slow_odgi inject --bed {base}.bed {filename}" 79 | output.inj = "-" 80 | 81 | [envs.matrix_oracle] 82 | binary = true 83 | command = "odgi matrix -i {filename} | sort" 84 | output.matrix = "-" 85 | 86 | [envs.matrix_test] 87 | binary = true 88 | command = "slow_odgi matrix {filename} | sort" 89 | output.matrix = "-" 90 | 91 | [envs.norm_oracle] 92 | binary = true 93 | command = "odgi view -g -i {filename} | slow_odgi norm" 94 | output.norm = "-" 95 | 96 | [envs.norm_test] 97 | binary = true 98 | command = "slow_odgi norm {filename}" 99 | output.norm = "-" 100 | 101 | [envs.overlap_setup] 102 | binary = true 103 | command = "slow_odgi somepaths --drop 50 {filename}" 104 | output.overlappaths = "-" 105 | 106 | [envs.overlap_oracle] 107 | binary = true 108 | command = "odgi overlap -i {filename} -R {base}.overlappaths" 109 | output.overlap = "-" 110 | 111 | [envs.overlap_test] 112 | binary = true 113 | command = "slow_odgi overlap --paths {base}.overlappaths {filename}" 114 | output.overlap = "-" 115 | 116 | [envs.paths_oracle] 117 | binary = true 118 | command = "odgi paths -i {filename} -L" 119 | output.paths = "-" 120 | 121 | [envs.paths_test] 122 | binary = true 123 | command = "slow_odgi paths {filename}" 124 | output.paths = "-" 125 | 126 | # Drop some links in the "real" input graphs to produce invalid graphs, in the 127 | # `invalid` subdirectory, that 
will yield interesting errors when running 128 | # validation. 129 | [envs.validate_setup] 130 | binary = true 131 | command = "slow_odgi validate_setup < {filename}" 132 | output.gfa = "-" 133 | out_dir = "invalid" 134 | 135 | [envs.validate_oracle] 136 | binary = true 137 | command = "odgi validate -i {filename} 2>&1" 138 | output.validate = "-" 139 | 140 | # An alternate version for graphs that are supposed to fail validation. 141 | [envs.validate_oracle_err] 142 | binary = true 143 | command = "odgi validate -i {filename} 2>&1" 144 | output.validate = "-" 145 | return_code = 1 146 | 147 | [envs.validate_test] 148 | binary = true 149 | command = "slow_odgi validate {filename}" 150 | output.validate = "-" 151 | 152 | [envs.pollen_data_gen_depth_oracle] 153 | binary = true 154 | command = "exine depth -d {filename} -a {filename}" 155 | output.json = "-" 156 | 157 | [envs.pollen_data_gen_depth_test] 158 | binary = true 159 | command = "pollen_data_gen simple {filename} | jq .depth" 160 | output.json = "-" 161 | 162 | [envs.flatgfa_mem] 163 | command = "../target/debug/fgfa < {filename}" 164 | output.gfa = "-" 165 | 166 | [envs.flatgfa_file] 167 | command = "../target/debug/fgfa -o {base}.flatgfa < {filename} ; ../target/debug/fgfa -i {base}.flatgfa" 168 | output.gfa = "-" 169 | 170 | [envs.flatgfa_file_inplace] 171 | command = "../target/debug/fgfa -m -p 128 -o {base}.inplace.flatgfa -I {filename} ; ../target/debug/fgfa -m -i {base}.inplace.flatgfa" 172 | output.gfa = "-" 173 | 174 | [envs.odgi_depth] 175 | binary = true 176 | command = "odgi depth -d -i {filename}" 177 | output.depth = "-" 178 | 179 | [envs.flatgfa_depth] 180 | command = "../target/debug/fgfa -I {filename} depth" 181 | output.depth = "-" 182 | 183 | [envs.chop_oracle_fgfa] 184 | binary = true 185 | command = "odgi chop -i {filename} -c 3 -o - | odgi view -g -i - | slow_odgi norm" 186 | output.chop = "-" 187 | 188 | [envs.flatgfa_chop] 189 | command = "../target/debug/fgfa -I {filename} chop -l -c 
3 | slow_odgi norm" 190 | output.chop = "-" 191 | 192 | [envs.odgi_extract] 193 | binary = true 194 | command = "odgi extract -i {filename} -n 3 -c 3 -o - | odgi view -g -i - | slow_odgi norm" 195 | output.extract = "-" 196 | 197 | [envs.flatgfa_extract] 198 | command = "../target/debug/fgfa -I {filename} extract -n 3 -c 3 | slow_odgi norm" 199 | output.extract = "-" 200 | -------------------------------------------------------------------------------- /flatgfa/src/ops/chop.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment}; 2 | use crate::pool::{Id, Span, Store}; 3 | use crate::{GFAStore, HeapFamily}; 4 | 5 | pub fn chop(gfa: &flatgfa::FlatGFA, max_size: usize, incl_links: bool) -> flatgfa::HeapGFAStore { 6 | let mut flat = flatgfa::HeapGFAStore::default(); 7 | 8 | // when segment S is chopped into segments S1 through S2 (exclusive), 9 | // seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1 10 | let mut seg_map: Vec> = Vec::new(); 11 | // The smallest id (>0) which does not already belong to a segment in `flat` 12 | let mut max_node_id = 1; 13 | 14 | fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span) { 15 | // Link segments spanned by `span` from head to tail 16 | let overlap = Span::new_empty(); 17 | flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link { 18 | from: Handle::new(Id::new(idx), Orientation::Forward), 19 | to: Handle::new(Id::new(idx + 1), Orientation::Forward), 20 | overlap, 21 | })); 22 | } 23 | 24 | // Add new, chopped segments 25 | for seg in gfa.segs.all().iter() { 26 | let len = seg.len(); 27 | if len <= max_size { 28 | // Leave the segment as is 29 | let id = flat.segs.add(Segment { 30 | name: max_node_id, 31 | seq: seg.seq, 32 | optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped? 
33 | }); 34 | max_node_id += 1; 35 | seg_map.push(Span::new(id, flat.segs.next_id())); 36 | } else { 37 | let seq_end = seg.seq.end; 38 | let mut offset = seg.seq.start.index(); 39 | let segs_start = flat.segs.next_id(); 40 | // Could also generate end_id by setting it equal to the start_id and 41 | // updating it for each segment that is added - only benefits us if we 42 | // don't unroll the last iteration of this loop 43 | while offset < seq_end.index() - max_size { 44 | // Generate a new segment of length c 45 | flat.segs.add(Segment { 46 | name: max_node_id, 47 | seq: Span::new(Id::new(offset), Id::new(offset + max_size)), 48 | optional: Span::new_empty(), 49 | }); 50 | offset += max_size; 51 | max_node_id += 1; 52 | } 53 | // Generate the last segment 54 | flat.segs.add(Segment { 55 | name: max_node_id, 56 | seq: Span::new(Id::new(offset), seq_end), 57 | optional: Span::new_empty(), 58 | }); 59 | max_node_id += 1; 60 | let new_seg_span = Span::new(segs_start, flat.segs.next_id()); 61 | seg_map.push(new_seg_span); 62 | if incl_links { 63 | link_forward(&mut flat, &new_seg_span); 64 | } 65 | } 66 | } 67 | 68 | // For each path, add updated handles. Then add the updated path 69 | for path in gfa.paths.all().iter() { 70 | let path_start = flat.steps.next_id(); 71 | let mut path_end = flat.steps.next_id(); 72 | // Generate the new handles 73 | // Tentative to-do: see if it is faster to read Id from segs than to re-generate it? 
74 | for step in gfa.get_path_steps(path) { 75 | let range = { 76 | let span = seg_map[step.segment().index()]; 77 | std::ops::Range::from(span) 78 | }; 79 | match step.orient() { 80 | Orientation::Forward => { 81 | // In this builder, Id.index() == seg.name - 1 for all seg 82 | path_end = flat 83 | .add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward))) 84 | .end; 85 | } 86 | Orientation::Backward => { 87 | path_end = flat 88 | .add_steps( 89 | range 90 | .rev() 91 | .map(|idx| Handle::new(Id::new(idx), Orientation::Backward)), 92 | ) 93 | .end; 94 | } 95 | } 96 | } 97 | 98 | // Add the updated path 99 | flat.paths.add(Path { 100 | name: path.name, 101 | steps: Span::new(path_start, path_end), 102 | overlaps: Span::new_empty(), 103 | }); 104 | } 105 | 106 | // If the 'l' flag is specified, compute the links in the new graph 107 | if incl_links { 108 | // For each link in the old graph, from handle A -> B: 109 | // Add a link from 110 | // (A.forward ? (A.end, forward) : (A.begin, backwards)) 111 | // -> (B.forward ? (B.begin, forward) : (B.end ? 
backwards)) 112 | 113 | for link in gfa.links.all().iter() { 114 | let new_from = { 115 | let old_from = link.from; 116 | let chopped_segs = seg_map[old_from.segment().index()]; 117 | let seg_id = match old_from.orient() { 118 | Orientation::Forward => chopped_segs.end - 1, 119 | Orientation::Backward => chopped_segs.start, 120 | }; 121 | seg_id.handle(old_from.orient()) 122 | }; 123 | let new_to = { 124 | let old_to = link.to; 125 | let chopped_segs = seg_map[old_to.segment().index()]; 126 | let seg_id = match old_to.orient() { 127 | Orientation::Forward => chopped_segs.start, 128 | Orientation::Backward => chopped_segs.end - 1, 129 | }; 130 | seg_id.handle(old_to.orient()) 131 | }; 132 | flat.add_link(new_from, new_to, vec![]); 133 | } 134 | } 135 | 136 | flat 137 | } 138 | -------------------------------------------------------------------------------- /pollen_py/README.md: -------------------------------------------------------------------------------- 1 | # Proof-of-Concept Pollen Hardware Generator 2 | 3 | This directory contains a proof-of-concept hardware accelerator generator for a simple GFA query. This section contains some guides for trying out this generator. 4 | 5 | ### The Docker Image 6 | 7 | Running the hardware generator is easy if you use our [Docker image][package]: 8 | 9 | docker run -it --rm ghcr.io/cucapra/pollen:latest 10 | 11 | If you prefer to install locally, we point you to the somewhat more involved instructions [below](#installing-locally). 12 | 13 | ### Generating an Accelerator: Quick 14 | 15 | If you want to compute the [depth][] of all the nodes in the graph, the following command will generate and run a node depth accelerator: 16 | ``` 17 | exine depth -a -r 18 | ``` 19 | 20 | This will automatically generate a node depth accelerator whose dimensions match the input data, compute the node depth, and remove the accelerator once the computation is done. 
21 | 22 | To save the files generated from the previous command in `<path>`, use the `--tmp-dir` flag: 23 | ``` 24 | exine depth -a -r <filename.og> --tmp-dir <path> 25 | ``` 26 | The node depth accelerator will be saved at `<path>/<filename.futil>` and the input data will be saved at `<path>/<filename.data>`. 27 | 28 | ### Generating an Accelerator: Full Walkthrough 29 | 30 | Take [depth][] as an example. To generate and run a node depth accelerator for the graph `k.og`, first navigate to the root directory of this repository. Then run 31 | ``` 32 | make fetch 33 | make test/k.og 34 | exine depth -o depth.futil 35 | exine depth -d test/k.og -o depth.data 36 | exine depth -r depth.data --accelerator depth.futil 37 | ``` 38 | 39 | What just happened? Below, we walk through the five commands we issued above, pointing out the other options that we could have used. 40 | 41 | First, `make fetch` downloads some [GFA][] data files into the `./test` directory. 42 | 43 | Second, `make test/*.og` builds the odgi graph files from those GFA files. 44 | 45 | Third, we generate the hardware accelerator and write it to a file named `depth.futil`. The commands to generate a node depth hardware accelerator in [Calyx][] include: 46 | 47 | 1. `exine depth -o depth.futil` 48 | 2. `exine depth -a <filename.og> -o depth.futil` 49 | 3. `exine depth -n=MAX_NODES -e=MAX_STEPS -p=MAX_PATHS -o depth.futil` 50 | 51 | The commands use the hardware parameters as follows: 52 | 1. Uses default hardware parameters. 53 | 2. Automatically infers the hardware parameters from a `.og` file. 54 | 3. Takes the hardware parameters as input. 55 | 56 | Parameters that are specified manually take precedence over those that are inferred automatically, and it is legal to specify just a subset of parameters. For example, `exine depth -a test/k.og -n=1` will infer `MAX_STEPS` and `MAX_PATHS` from `test/k.og`, but the resulting accelerator will only handle one node. 57 | 58 | Fourth, we need to generate some input from our odgi file. This is what we will feed to the hardware accelerator.
The following variations all accomplish this: 59 | 60 | 1. `exine depth -d <filename.og> -o depth.data` 61 | 2. `exine depth -d <filename.og> -a <filename2.og> -o depth.data` 62 | 3. `exine depth -d <filename.og> -n=MAX_NODES -e=MAX_STEPS -p=MAX_PATHS -o depth.data` 63 | 4. `exine depth -d <filename.og> -a -o depth.data` 64 | 65 | The flags work as before, except that if no argument is passed to the `-a` flag, the dimensions are inferred from the input file. **The dimensions of the input must be the same as those of the hardware accelerator.** 66 | 67 | Fifth, we run our hardware accelerator. The following code simulates the Calyx code for the hardware accelerator and outputs the node depth table: 68 | 69 | ``` 70 | exine depth -r depth.data -x depth.futil 71 | ``` 72 | 73 | ### Installing Locally 74 | 75 | You will need [Flit][] version 3.7.1 and [Turnt][] version 1.11.0. 76 | We will guide you through the installation of our major dependencies, [Calyx][] and [odgi][], and then show you how to install Pollen itself. 77 | 78 | #### Calyx 79 | 80 | Below we show you how to build Calyx from source and set it up for our use. 81 | If you are curious, this tracks the "[installing from source][calyx-install-src]" and "[installing the command-line driver][calyx-install-fud]" sections of the Calyx documentation. 82 | 83 | 1. `git clone https://github.com/cucapra/calyx.git` 84 | 2. `cd calyx` 85 | 3. `cargo build` 86 | 4. `flit -f fud/pyproject.toml install -s --deps production` 87 | 5. `fud config --create global.root $(pwd)` 88 | 6. `cargo build -p interp` 89 | 7. `fud config stages.calyx.exec $(pwd)/target/debug/calyx` 90 | 8. `fud config stages.interpreter.exec $(pwd)/target/debug/interp` 91 | 9. `flit -f calyx-py/pyproject.toml install -s` 92 | 10. `fud check` 93 | 94 | You will be warned that `synth-verilog` and `vivado-hls` were not installed correctly; this is fine for our purposes. 95 | 96 | #### Odgi 97 | 98 | We recommend that you build odgi from source, as described [here][odgi-from-source].
99 | To check that this worked, run `odgi` from the command line. 100 | 101 | Some parts of Pollen presently use odgi's Python bindings. 102 | You will need to edit your PYTHONPATH, as explained [here][odgi-pythonpath], to enable this. 103 | To verify that this worked, open up a Python shell and try `import odgi`. 104 | If it succeeds quietly, great! 105 | If it segfaults, try the preload step explained [here][odgi-preload]. 106 | 107 | #### Pollen 108 | 109 | Clone this repository: 110 | 111 | git clone https://github.com/cucapra/pollen.git 112 | 113 | And then install the Python tools using [uv][]: 114 | 115 | $ uv sync 116 | $ source .venv/bin/activate 117 | 118 | [calyx]: https://calyxir.org 119 | [odgi]: https://odgi.readthedocs.io/en/latest/ 120 | [flit]: https://flit.pypa.io/en/stable/ 121 | [turnt]: https://github.com/cucapra/turnt 122 | [calyx-install-src]: https://docs.calyxir.org/#installing-from-source-to-use-and-extend-calyx 123 | [calyx-install-fud]: https://docs.calyxir.org/#installing-the-command-line-driver 124 | [package]: https://github.com/cucapra/pollen/pkgs/container/pollen 125 | [odgi-from-source]: https://odgi.readthedocs.io/en/latest/rst/installation.html#building-from-source 126 | [odgi-pythonpath]: https://odgi.readthedocs.io/en/latest/rst/binding/usage.html 127 | [odgi-preload]: https://odgi.readthedocs.io/en/latest/rst/binding/usage.html#optimise 128 | [depth]: https://pangenome.github.io/odgi.github.io/rst/commands/odgi_depth.html 129 | [gfa]: https://github.com/lh3/gfatools/blob/master/doc/rGFA.md#the-reference-gfa-rgfa-format 130 | [uv]: https://github.com/astral-sh/uv 131 | -------------------------------------------------------------------------------- /pollen_data_gen/pollen_data_gen/simple.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Dict, Union, Optional, Any, List, Sequence, TextIO 3 | from io import TextIOWrapper 4 | from json import JSONEncoder 5 | 
import mygfa
from . import depth


SimpleType = Optional[
    Union[str, Dict[str, Sequence[object]], List[int], List[Union[int, str]]]
]
# A SimpleType is a type that can be serialized by the JSON encoder below.
# It's a bit of a hack, but it works.

char_to_number = {"A": 1, "T": 2, "G": 3, "C": 4, "N": 5}
number_to_char = {v: k for k, v in char_to_number.items()}

# Key prefixes for the per-segment and per-path entries of the "basic" JSON
# object. `dump` prepends them and `parse` strips them off again.
_SEG_KEY_PREFIX = "seg_to_seq_"
_PATH_KEY_PREFIX = "path_details_"


def strand_to_number_list(strand: mygfa.Strand) -> List[int]:
    """Converts a strand to a list of numbers following the mapping above.
    For instance, "AGGA" is converted to [1,3,3,1].

    Raises KeyError for a character that is not in `char_to_number`.
    """
    return [char_to_number[c] for c in str(strand)]


def number_list_to_strand(numbers: Sequence[Union[int, str]]) -> mygfa.Strand:
    """Converts a list of numbers to a strand following the mapping above.
    For instance, [1,3,3,1] is converted to "AGGA".

    Each element is passed through `int()`, so both ints and numeric strings
    are accepted.
    """
    return mygfa.Strand("".join([number_to_char[int(number)] for number in numbers]))


def path_seq_to_number_list(path: str) -> List[int]:
    """Converts a path's segment sequence into a list of numbers.
    Every + becomes 0 and - becomes 1.
    For instance, "1+,2-,14+" is converted to [1,0,2,1,14,0].
    The 1 at the 4th cell will not be confused for a node called "1" because
    it is at an even index.

    In the future, once we know more about our consumer, it is very likely that
    we will want to unzip this list into two lists:
    - one of node names
    - one of directions of traversal.
    This is because the direction of traversal can be stored in 1 bit each,
    whereas the node names will take log_2{max_steps} bits each.
    """
    ans = []
    for chunk in path.split(","):
        # The last character is the orientation; everything before it is the
        # (numeric) segment name.
        num, orient = chunk[:-1], chunk[-1]
        ans.append(int(num))
        ans.append(0 if orient == "+" else 1)

    return ans


def number_list_to_path_seq(numbers: List[int]) -> str:
    """The inverse of the above function.

    `numbers` alternates segment name and orientation (0 for +, 1 for -).
    Raises ValueError if the list has odd length or an orientation is neither
    0 nor 1, rather than silently producing a corrupted path string.
    """
    if len(numbers) % 2:
        raise ValueError(
            "expected alternating (segment, orientation) values, "
            f"got an odd number of entries: {len(numbers)}"
        )

    chunks = []
    for name, orient in zip(numbers[::2], numbers[1::2]):
        if orient == 0:
            sign = "+"
        elif orient == 1:
            sign = "-"
        else:
            raise ValueError(f"invalid orientation {orient!r} for segment {name!r}")
        chunks.append(f"{name}{sign}")

    return ",".join(chunks)


def align_to_str(align: mygfa.Alignment) -> str:
    """Placeholder until we have reason to do anything cleverer."""
    return str(align)


def str_to_align(align_str: str) -> mygfa.Alignment:
    """Placeholder until we have reason to do anything cleverer."""
    return mygfa.Alignment.parse(align_str)


def link_to_number_list(link: mygfa.Link) -> List[Union[int, str]]:
    """Converts a Link object to a list of four numbers and a string.
    As before, every + becomes 0 and - becomes 1."""
    return [
        int(link.from_.name),
        0 if link.from_.ori else 1,
        int(link.to_.name),
        0 if link.to_.ori else 1,
        align_to_str(link.overlap),
    ]


def number_list_to_link(link_json: List[Union[int, str]]) -> mygfa.Link:
    """The inverse of the above function."""
    return mygfa.Link(
        mygfa.Handle(str(link_json[0]), link_json[1] == 0),
        mygfa.Handle(str(link_json[2]), link_json[3] == 0),
        str_to_align(str(link_json[4])),
    )


class GenericSimpleEncoder(JSONEncoder):
    """A generic JSON encoder for mygfa graphs."""

    def default(self, o: Any) -> SimpleType:
        if isinstance(o, mygfa.Path):
            items = str(o).split("\t")
            # We can drop the 0th cell, which will just be 'P',
            # and the 1st cell, which will just be the path's name.
            # Not doing anything clever with the overlaps yet.
            return {"segments": path_seq_to_number_list(items[2]), "overlaps": items[3]}
        if isinstance(o, mygfa.Link):
            return link_to_number_list(o)
        if isinstance(o, mygfa.Header):
            # NOTE(review): this hands the object back unchanged, which only
            # works if the encoder can already serialize a Header by itself.
            # Confirm against mygfa.Header's definition.
            return o
        if isinstance(o, mygfa.Segment):
            return strand_to_number_list(o.seq)
        # Anything else is encoded as JSON null.
        return None


def dump(
    graph: mygfa.Graph,
    json_file: Union[TextIO, TextIOWrapper],
    max_n: Optional[int],
    max_e: Optional[int],
    max_p: Optional[int],
    subset_paths: Optional[List[str]] = None,
) -> None:
    """Outputs the graph as a JSON, along with precomputed data for the
    calculation of node depth.

    The result has two top-level keys: "basic" (the graph itself, readable
    back with `parse`) and "depth" (the output of `depth.depth_json`).
    """

    basic_encoding = GenericSimpleEncoder().encode(
        {"headers": graph.headers}
        | {f"{_SEG_KEY_PREFIX}{k}": v for k, v in graph.segments.items()}
        | {"links": graph.links}
        | {f"{_PATH_KEY_PREFIX}{k}": v for k, v in graph.paths.items()}
    )

    depth_encoding = depth.depth_json(graph, max_n, max_e, max_p, subset_paths)

    json.dump(
        {
            "basic": json.loads(basic_encoding),
            "depth": json.loads(depth_encoding),
        },
        json_file,
        indent=2,
        sort_keys=True,
    )


def parse(file: TextIO) -> mygfa.Graph:
    """Reads a JSON file and returns a mygfa.Graph object."""
    graph = json.load(file)["basic"]

    # Recover each name by stripping the key prefix. Splitting the key on "_"
    # instead would truncate any name that itself contains an underscore.
    segments = {}
    paths = {}
    for key, value in graph.items():
        if key.startswith(_SEG_KEY_PREFIX):
            name = key.removeprefix(_SEG_KEY_PREFIX)
            segments[name] = mygfa.Segment(name, number_list_to_strand(value))
        elif key.startswith(_PATH_KEY_PREFIX):
            name = key.removeprefix(_PATH_KEY_PREFIX)
            paths[name] = mygfa.Path.parse_inner(
                name, number_list_to_path_seq(value["segments"]), value["overlaps"]
            )

    graph_gfa = mygfa.Graph(
        [mygfa.Header.parse(h) for h in graph["headers"]],
        segments,
        [number_list_to_link(link) for link in graph["links"]],
        paths,
    )
    #
graph_gfa.emit(sys.stdout) # Good for debugging. 176 | return graph_gfa 177 | 178 | 179 | def roundtrip_test(graph: mygfa.Graph) -> None: 180 | """Tests that the graph can be serialized and deserialized.""" 181 | with open("roundtrip_test.json", "w", encoding="utf-8") as file: 182 | dump(graph, file, None, None, None) 183 | with open("roundtrip_test.json", "r", encoding="utf-8") as file2: 184 | assert parse(file2) == graph 185 | -------------------------------------------------------------------------------- /pollen_py/pollen/depth/processing-elements/parse_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file converts an odgi graph to numerical JSON data that can be used by the prototype calyx hardware simulator. This means it only generates data for the first node in a graph. 3 | """ 4 | 5 | import sys 6 | import argparse 7 | import json 8 | import odgi 9 | 10 | # Defaults for the maximum possible number of nodes, steps per node, and paths to consider 11 | MAX_STEPS = 15 12 | MAX_PATHS = 15 13 | 14 | 15 | def parse_steps_on_nodes( 16 | graph, path_name_to_id, max_steps=MAX_STEPS, max_paths=MAX_PATHS 17 | ): 18 | """ 19 | Generate input data containing the path ids for each step on the min node in the graph, e.g.. 20 | {path_ids: 21 | "data": [0, 1, 1, 2], 22 | "format": { 23 | "numeric_type": "bitnum", 24 | "is_signed": False, 25 | "width": 2 26 | } 27 | } 28 | """ 29 | 30 | data = {} 31 | node_id = graph.min_node_id() 32 | node_h = graph.get_handle(node_id) 33 | 34 | """ 35 | Get a list of path ids for each step on node_h. 36 | """ 37 | 38 | # Check that the number of steps on the node does not exceed max_steps 39 | if graph.get_step_count(node_h) > max_steps: 40 | raise Exception( 41 | f"The number of paths in the graph exceeds the maximum number of paths the hardware can process. {graph.get_step_count(node_h)} > {max_steps}. Hint: try setting the maximum number of steps manually using the -e flag." 
42 | ) 43 | 44 | path_ids = [] 45 | 46 | def parse_step(step_h): 47 | path_h = graph.get_path(step_h) 48 | path_id = path_name_to_id[graph.get_path_name(path_h)] 49 | path_ids.append(path_id + 1) 50 | 51 | graph.for_each_step_on_handle(node_h, parse_step) 52 | 53 | # Pad path_ids with 0s 54 | path_ids = path_ids + [0] * (max_steps - len(path_ids)) 55 | 56 | # 'path_ids{id}' is the list of path ids for each step crossing node {id} 57 | width = max_paths.bit_length() 58 | data[f"path_ids"] = { 59 | "data": path_ids, 60 | "format": {"numeric_type": "bitnum", "is_signed": False, "width": width}, 61 | } 62 | 63 | return data 64 | 65 | 66 | def parse_paths_file(filename, path_to_id, max_paths=MAX_PATHS): 67 | """ 68 | Return paths_to_consider, a list of length max_paths, where 69 | paths_to_consider[i] is 1 if i is a path id and we include path i in our 70 | calculations of node depth 71 | """ 72 | 73 | if filename is None: # Return the default value 74 | paths_to_consider = [1] * (max_paths + 1) 75 | paths_to_consider[0] = 0 76 | return paths_to_consider 77 | 78 | with open(filename, "r") as paths_file: 79 | text = paths_file.read() 80 | paths = text.splitlines() 81 | 82 | paths_to_consider = [0] * (max_paths + 1) 83 | 84 | for path_name in paths: 85 | path_id = path_name_to_id[path_name] 86 | paths_to_consider[path_id] = 1 87 | 88 | return paths_to_consider 89 | 90 | 91 | def get_maxes(filename): 92 | graph = odgi.graph() 93 | graph.load(filename) 94 | 95 | max_steps = 0 96 | max_paths = graph.get_path_count() 97 | 98 | def update_max_steps(node_h): 99 | nonlocal max_steps 100 | num_steps = graph.get_step_count(node_h) 101 | if num_steps > max_steps: 102 | max_steps = num_steps 103 | 104 | graph.for_each_handle(update_max_steps) 105 | 106 | return max_steps, max_paths 107 | 108 | 109 | if __name__ == "__main__": 110 | # Parse commandline arguments 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument( 113 | "filename", 114 | help="A .og file representing a 
pangenome whose node depth we want to calculate", 115 | ) 116 | parser.add_argument( 117 | "-a", 118 | "--auto-size", 119 | action="store_true", 120 | help="Automically infer hardware dimensions from a given odgi graph", 121 | ) 122 | parser.add_argument( 123 | "-s", 124 | "--subset-paths", 125 | help="Specify a file containing a subset of all paths in the graph. See the odgi documentation for more details.", 126 | ) 127 | parser.add_argument( 128 | "-e", 129 | "--max-steps", 130 | type=int, 131 | default=MAX_STEPS, 132 | help="Specify the maximum number of steps per node that the hardware can support.", 133 | ) 134 | parser.add_argument( 135 | "-p", 136 | "--max-paths", 137 | type=int, 138 | default=MAX_PATHS, 139 | help="Specify the maximum number of paths that the hardware can support.", 140 | ) 141 | parser.add_argument( 142 | "-o", 143 | "--out", 144 | help="Specify the output file. If not specified, will dump to stdout.", 145 | ) 146 | args = parser.parse_args() 147 | 148 | graph = odgi.graph() 149 | graph.load(args.filename) 150 | 151 | if args.auto_size: 152 | max_steps, max_paths = get_maxes(args.filename) 153 | else: 154 | max_stpes, max_paths = args.max_steps, args.max_paths 155 | 156 | # Check that the number of paths on the graph does not exceed max_paths 157 | if graph.get_path_count() > max_paths: 158 | raise Exception( 159 | f"The number of paths in the graph exceeds the maximum number of paths the hardware can process. {graph.get_path_count()} > {args.max_paths}. 
Hint: try setting the maximum number of paths manually using the -p flag" 160 | ) 161 | 162 | # Assign a path_id to each path; the path_ids are not accessible using the 163 | # default python bindings for odgi 164 | 165 | # Obtain a list of path names; a path's index is its id 166 | paths = [] 167 | graph.for_each_path_handle(lambda h: paths.append(graph.get_path_name(h))) 168 | 169 | # Path name -> path id 170 | path_name_to_id = {path: count for count, path in enumerate(paths)} 171 | 172 | paths_to_consider = parse_paths_file(args.subset_paths, path_name_to_id, max_paths) 173 | 174 | data = parse_steps_on_nodes(graph, path_name_to_id, max_steps, max_paths) 175 | 176 | data["paths_to_consider"] = { 177 | "data": paths_to_consider, 178 | "format": {"numeric_type": "bitnum", "is_signed": False, "width": 1}, 179 | } 180 | 181 | data["paths_on_node"] = { 182 | "data": [0] * (max_paths + 1), 183 | "format": {"numeric_type": "bitnum", "is_signed": False, "width": 1}, 184 | } 185 | 186 | data["depth_output"] = { 187 | "data": [0], 188 | "format": { 189 | "numeric_type": "bitnum", 190 | "is_signed": False, 191 | "width": max_steps.bit_length(), 192 | }, 193 | } 194 | 195 | data["uniq_output"] = { 196 | "data": [0], 197 | "format": { 198 | "numeric_type": "bitnum", 199 | "is_signed": False, 200 | "width": max_paths.bit_length(), 201 | }, 202 | } 203 | 204 | if args.out: 205 | with open(args.out, "w") as out_file: 206 | json.dump(data, out_file, indent=2, sort_keys=True) 207 | else: 208 | json.dump(data, sys.stdout, indent=2, sort_keys=True) 209 | -------------------------------------------------------------------------------- /bench/graphs.toml: -------------------------------------------------------------------------------- 1 | # From: https://github.com/AndreaGuarracino/1000G-ONT-F100-PGGB/blob/master/data/1000G-ONT-F100-PGGB.gfa.urls.tsv 2 | [1000gont] 3 | chr1 = 
"https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr1.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 4 | chr2 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr2.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 5 | chr3 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr3.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 6 | chr4 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr4.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 7 | chr5 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr5.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 8 | chr6 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr6.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 9 | chr7 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr7.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 10 | chr8 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr8.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 11 | chr9 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr9.30kbp.fa.gz.445f03b.e34d4cd.b691e61.smooth.final.gfa.zst" 12 | chr10 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr10.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 13 | chr11 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr11.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 14 | chr12 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr12.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 15 | chr13 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr13.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 16 | chr14 = 
"https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr14.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 17 | chr15 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr15.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 18 | chr16 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr16.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 19 | chr17 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr17.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 20 | chr18 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr18.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 21 | chr19 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr19.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 22 | chr20 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr20.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 23 | chr21 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr21.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 24 | chr22 = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chr22.30kbp.fa.gz.a8a102b.eb0f3d3.b691e61.smooth.final.gfa.zst" 25 | chrX = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrX.30kbp.fa.gz.a8a102b.eb0f3d3.a58faa8.smooth.final.gfa.zst" 26 | chrY = "https://garrisonlab.s3.amazonaws.com/1000G-ONT-F100-PGGB/1000G-ONT.100x2%2B4.chrY.30kbp.fa.gz.a8a102b.eb0f3d3.0713820.smooth.final.gfa.zst" 27 | 28 | # From: https://s3-us-west-2.amazonaws.com/human-pangenomics/index.html?prefix=pangenomes/freeze/freeze1/pggb/chroms/ 29 | [hprc] 30 | chrY = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrY.hprc-v1.0-pggb.gfa.gz" 31 | chr1 = 
"https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr1.hprc-v1.0-pggb.gfa.gz" 32 | chr10 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr10.hprc-v1.0-pggb.gfa.gz" 33 | chr11 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr11.hprc-v1.0-pggb.gfa.gz" 34 | chr12 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr12.hprc-v1.0-pggb.gfa.gz" 35 | chr13 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr13.hprc-v1.0-pggb.gfa.gz" 36 | chr14 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr14.hprc-v1.0-pggb.gfa.gz" 37 | chr15 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr15.hprc-v1.0-pggb.gfa.gz" 38 | chr16 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr16.hprc-v1.0-pggb.gfa.gz" 39 | chr17 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr17.hprc-v1.0-pggb.gfa.gz" 40 | chr18 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr18.hprc-v1.0-pggb.gfa.gz" 41 | chr19 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr19.hprc-v1.0-pggb.gfa.gz" 42 | chr2 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr2.hprc-v1.0-pggb.gfa.gz" 43 | chr20 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr20.hprc-v1.0-pggb.gfa.gz" 44 | chr21 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr21.hprc-v1.0-pggb.gfa.gz" 45 | chr22 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr22.hprc-v1.0-pggb.gfa.gz" 46 | 
chr3 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr3.hprc-v1.0-pggb.gfa.gz" 47 | chr4 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr4.hprc-v1.0-pggb.gfa.gz" 48 | chr5 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr5.hprc-v1.0-pggb.gfa.gz" 49 | chr6 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr6.hprc-v1.0-pggb.gfa.gz" 50 | chr7 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr7.hprc-v1.0-pggb.gfa.gz" 51 | chr8 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr8.hprc-v1.0-pggb.gfa.gz" 52 | chr9 = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chr9.hprc-v1.0-pggb.gfa.gz" 53 | chrM = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrM.hprc-v1.0-pggb.gfa.gz" 54 | chrX = "https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/freeze1/pggb/chroms/chrX.hprc-v1.0-pggb.gfa.gz" 55 | 56 | # Small tests from odgi: 57 | # https://github.com/pangenome/odgi/tree/master/test 58 | [test] 59 | k = "https://raw.githubusercontent.com/pangenome/odgi/master/test/k.gfa" 60 | lpa = "https://raw.githubusercontent.com/pangenome/odgi/master/test/LPA.gfa" 61 | chr6c4 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/chr6.C4.gfa" 62 | drb1 = "https://raw.githubusercontent.com/pangenome/odgi/master/test/DRB1-3123.gfa" 63 | -------------------------------------------------------------------------------- /slow_odgi/slow_odgi/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import io 4 | from typing import Dict, Tuple, List 5 | from collections.abc import Callable 6 | import mygfa 7 | 8 | from . 
import ( 9 | chop, 10 | crush, 11 | degree, 12 | depth, 13 | flatten, 14 | flip, 15 | inject, 16 | matrix, 17 | overlap, 18 | paths, 19 | proofs, 20 | validate, 21 | norm, 22 | inject_setup, 23 | somepaths, 24 | validate_setup, 25 | ) 26 | 27 | 28 | def parse_args() -> Tuple[argparse.ArgumentParser, argparse.Namespace]: 29 | """Parse command line arguments and run the appropriate subcommand.""" 30 | parser = argparse.ArgumentParser() 31 | 32 | subparsers = parser.add_subparsers( 33 | title="slow-odgi commands", metavar="COMMAND", dest="command" 34 | ) 35 | 36 | chop_parser = subparsers.add_parser( 37 | "chop", 38 | help="Shortens segments' sequences to a given maximum length.", 39 | ) 40 | chop_parser.add_argument( 41 | "-n", 42 | nargs="?", 43 | const="d", 44 | help="The max segment size desired after chopping.", 45 | required=True, 46 | ) 47 | 48 | subparsers.add_parser( 49 | "crush", 50 | help="Replaces consecutive instances of `N` with a single `N`.", 51 | ) 52 | 53 | subparsers.add_parser( 54 | "degree", help="Generates a table summarizing each segment's degree." 55 | ) 56 | 57 | depth_parser = subparsers.add_parser( 58 | "depth", help="Generates a table summarizing each segment's depth." 59 | ) 60 | depth_parser.add_argument( 61 | "--paths", 62 | help="A file describing the paths you wish to query.", 63 | required=False, 64 | ) 65 | 66 | subparsers.add_parser( 67 | "flatten", 68 | help="Converts the graph into FASTA + BED representation.", 69 | ) 70 | 71 | subparsers.add_parser( 72 | "flip", 73 | help="Flips any paths that step more backward than forward.", 74 | ) 75 | 76 | inject_parser = subparsers.add_parser( 77 | "inject", help="Adds new paths, as specified, to the graph." 
78 | ) 79 | inject_parser.add_argument( 80 | "--bed", 81 | nargs="?", 82 | help="A BED file describing the paths you wish to insert.", 83 | required=True, 84 | ) 85 | 86 | subparsers.add_parser("matrix", help="Represents the graph as a matrix.") 87 | 88 | overlap_parser = subparsers.add_parser( 89 | "overlap", 90 | help="Queries the graph about which paths overlap with which other paths.", 91 | ) 92 | overlap_parser.add_argument( 93 | "--paths", 94 | nargs="?", 95 | help="A file describing the paths you wish to query.", 96 | required=True, 97 | ) 98 | 99 | subparsers.add_parser("paths", help="Lists the paths in the graph.") 100 | 101 | somepaths_parser = subparsers.add_parser( 102 | "somepaths", 103 | help="Lists the paths in the graph, with the option of dropping some.", 104 | ) 105 | somepaths_parser.add_argument( 106 | "--drop", 107 | type=int, 108 | default=0, 109 | help="Randomly drop a percentage of the paths.", 110 | metavar="PCT", 111 | ) 112 | 113 | subparsers.add_parser( 114 | "validate", 115 | help="Checks whether the links of the graph support its paths.", 116 | ) 117 | 118 | norm_parser = subparsers.add_parser( 119 | "norm", 120 | help="Print a graph unmodified, normalizing its representation.", 121 | ) 122 | norm_parser.add_argument( 123 | "--nl", 124 | action="store_true", 125 | help="Don't include links.", 126 | ) 127 | 128 | # "Hidden" commands for testing only 129 | subparsers.add_parser("inject_setup") 130 | subparsers.add_parser("validate_setup") 131 | 132 | # Add the graph argument to all subparsers. 133 | # Doing it this way means that the graph argument is sought _after_ the 134 | # command name. 
def dispatch(args: argparse.Namespace) -> None:
    """Parse the input graph and run the slow-odgi command named by `args.command`.

    The graph is read from `args.graph` when a filename was given and from
    stdin otherwise. Commands that build a new graph have that graph emitted
    to stdout; the remaining commands are simply called on the parsed graph.

    Raises:
        AssertionError: if `args.command` is not a known command, or if a
            command that should only add to the graph fails the
            `proofs.logically_le` check.
    """

    # Commands that return a new graph, which we then emit.
    transformer_funcs: Dict[str, Callable[[mygfa.Graph], mygfa.Graph]] = {
        "chop": lambda g: chop.chop(g, int(args.n)),
        "crush": crush.crush,
        "flip": flip.flip,
        "inject": lambda g: inject.inject(g, parse_bedfile(args.bed)),
        "norm": norm.norm,
        "validate_setup": validate_setup.drop_some_links,
    }

    # Commands whose return value is ignored here.
    other_funcs: Dict[str, Callable[[mygfa.Graph], object]] = {
        "degree": degree.degree,
        "depth": lambda g: depth.depth(
            g, parse_paths(args.paths) if args.paths else None
        ),
        # NOTE(review): slicing `args.graph` requires a filename; `flatten`
        # fails with a TypeError when the graph is read from stdin.
        "flatten": lambda g: flatten.flatten(g, f"{args.graph[:-4]}.og"),
        "matrix": matrix.matrix,
        "overlap": lambda g: overlap.overlap(g, parse_paths(args.paths)),
        "paths": paths.paths,
        "somepaths": lambda g: somepaths.somepaths(g, args.drop),
        "validate": validate.validate,
        "inject_setup": inject_setup.print_bed,
    }

    # Commands whose output is emitted without links.
    show_no_links = ["chop", "inject"]
    # These commands only add to the graph, so we'll assert "logically_le".
    constructive_changes = ["chop", "inject"]

    # Parse the input graph, which comes from either a filename argument or
    # stdin (if the filename is unspecified). A named file is closed as soon
    # as parsing finishes instead of being left open for the rest of the run.
    # Assumes `mygfa.Graph.parse` consumes the stream before returning --
    # TODO confirm.
    if args.graph:
        with open(args.graph, "r", encoding="utf-8") as in_file:
            graph = mygfa.Graph.parse(in_file)
    else:
        graph = mygfa.Graph.parse(
            io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
        )

    # Run the appropriate command on the input graph.
    if args.command in transformer_funcs:
        out_graph = transformer_funcs[args.command](graph)
        out_graph.emit(
            sys.stdout, args.command not in show_no_links and not vars(args).get("nl")
        )
        if args.command in constructive_changes:
            assert proofs.logically_le(graph, out_graph)
    elif args.command in other_funcs:
        other_funcs[args.command](graph)
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(f"unknown command: {args.command!r}")
Should not be used with --run or --parse-data.", 36 | ) 37 | parser.add_argument( 38 | "-r", 39 | "--run", 40 | dest="filename", 41 | dest2="action", 42 | action=store_const_and_arg, 43 | const="run", 44 | default="gen", 45 | help="Run node depth on the given .og or .data file. Outputs the node depth table. Should not be used with --gen or --parse-data.", 46 | ) 47 | parser.add_argument( 48 | "-d", 49 | "--parse-data", 50 | dest="filename", 51 | dest2="action", 52 | action=store_const_and_arg, 53 | const="parse", 54 | default="gen", 55 | help="Parse the .og file to accelerator input. Should not be used with --gen or --run.", 56 | ) 57 | 58 | parser.add_argument( 59 | "-s", 60 | "--subset-paths", 61 | help="Should only be used if the --run or --parse-data flag is set. Specifies a\ 62 | subset of paths whose node depth to compute.", 63 | ) 64 | 65 | parser.add_argument( 66 | "-x", 67 | "--accelerator", 68 | help="Specify a node depth accelerator to run. Should only be set if the --run flag is set.", 69 | ) 70 | parser.add_argument( 71 | "--pr", 72 | action="store_true", 73 | help="Print profiling info. Passes the -pr flag to fud if --run is set.", 74 | ) 75 | 76 | parser.add_argument( 77 | "--tmp-dir", 78 | help="Specify a directory to store temporary files in. 
The files will not be deleted at the end of execution.", 79 | ) 80 | 81 | 82 | def run_accel(args, tmp_dir_name): 83 | """ 84 | Run the node depth accelerator 85 | """ 86 | 87 | # Data parser 88 | parser = argparse.ArgumentParser() 89 | parse_data.config_parser(parser) 90 | 91 | # Parse the data file if necessary 92 | out_file = args.out 93 | basename = os.path.basename(args.filename) 94 | base, ext = os.path.splitext(basename) 95 | 96 | if ext == ".data": # Data file was provided 97 | if args.auto_size == "d": 98 | warnings.warn("Cannot infer dimensions from .data file.", SyntaxWarning) 99 | data_file = args.filename 100 | else: 101 | # parse_data_file(args, tmp_dir_name) 102 | data_file = f"{tmp_dir_name}/{base}.data" 103 | new_args = [args.filename, "--out", data_file] 104 | parser.parse_args(new_args, namespace=args) 105 | # print("here1") 106 | parse_data.run(args) 107 | # print("here2") 108 | 109 | # Generate the accelerator if necessary 110 | if args.accelerator: 111 | futil_file = args.accelerator 112 | else: 113 | futil_file = f"{tmp_dir_name}/{base}.futil" 114 | new_args = [args.filename, "--out", futil_file] 115 | if args.auto_size == "d": 116 | new_args.extend(["-a", args.filename]) 117 | parser.parse_args(new_args, namespace=args) 118 | depth.run(args) 119 | 120 | # Compute the node depth 121 | cmd = [ 122 | "fud", 123 | "e", 124 | futil_file, 125 | "--to", 126 | "interpreter-out", 127 | "-s", 128 | "verilog.data", 129 | data_file, 130 | ] 131 | if args.pr: 132 | cmd.append("-pr") 133 | calyx_out = subprocess.run(cmd, capture_output=True, text=True) 134 | output = calyx_out.stdout 135 | else: 136 | calyx_out = subprocess.run(cmd, capture_output=True, text=True) 137 | try: 138 | # Convert calyx output to a node depth table 139 | calyx_out = json.loads(calyx_out.stdout) 140 | output = parse_data.from_calyx(calyx_out, True) # ndt 141 | except: 142 | output = calyx_out.stderr 143 | 144 | # Output the ndt 145 | if out_file: 146 | with open(out_file, "w") as 
def run(args):
    """
    Carry out the action selected on the command line.

    `args.action` picks one of three modes:
      * "gen": generate an accelerator with `depth.run`.
      * "parse": fill in the data parser's defaults on `args`, then build a
        data file with `parse_data.run`.
      * "run": execute the accelerator with `run_accel`, keeping intermediate
        files in `args.tmp_dir` when it is set and in a throwaway temporary
        directory otherwise.
    Any other action value does nothing.
    """
    if args.action == "gen":  # Generate an accelerator
        if args.subset_paths or args.accelerator or args.pr:
            warnings.warn(
                "--subset-paths, --accelerator, and --pr will be ignored if action is gen.",
                SyntaxWarning,
            )
        depth.run(args)

    elif args.action == "parse":  # Generate a data file
        if args.accelerator or args.pr:
            warnings.warn(
                "--accelerator and --pr will be ignored if action is not 'run'.",
                SyntaxWarning,
            )

        parser = argparse.ArgumentParser()
        parse_data.config_parser(parser)
        parser.parse_args(
            [args.filename], namespace=args
        )  # Set defaults for all arguments; does not change existing arguments
        parse_data.run(args)

    elif args.action == "run":  # Run the accelerator
        if args.tmp_dir:
            # --tmp-dir names a directory, not a file: make sure it exists and
            # hand its path to run_accel, which builds file names inside it.
            # Nothing is cleaned up afterwards, so the files stay available.
            os.makedirs(args.tmp_dir, exist_ok=True)
            run_accel(args, args.tmp_dir)
        else:
            with tempfile.TemporaryDirectory() as tmp_dir_name:
                run_accel(args, tmp_dir_name)
42 | let gfa_line = gfaline::parse_line(line.as_ref()).unwrap(); 43 | self.record_line(&gfa_line); 44 | 45 | match gfa_line { 46 | gfaline::Line::Header(data) => { 47 | self.flat.add_header(data); 48 | } 49 | gfaline::Line::Segment(seg) => { 50 | self.add_seg(seg); 51 | } 52 | gfaline::Line::Link(link) => { 53 | deferred_links.push(link); 54 | } 55 | gfaline::Line::Path(_) => { 56 | unreachable!("paths handled separately") 57 | } 58 | } 59 | } 60 | 61 | // "Unwind" the deferred links and paths. 62 | for link in deferred_links { 63 | self.add_link(link); 64 | } 65 | for line in deferred_paths { 66 | if let gfaline::Line::Path(path) = gfaline::parse_line(&line).unwrap() { 67 | self.add_path(path); 68 | } else { 69 | unreachable!("unexpected deferred line") 70 | } 71 | } 72 | 73 | self.flat 74 | } 75 | 76 | /// Parse a GFA text file from an in-memory buffer. 77 | pub fn parse_mem(mut self, buf: &[u8]) -> flatgfa::GFAStore<'a, P> { 78 | let mut deferred_lines = Vec::new(); 79 | 80 | for line in MemchrSplit::new(b'\n', buf) { 81 | // When parsing from memory, it's easy to entirely defer parsing of any line: we just keep 82 | // pointers to them. So we defer both paths and links. 83 | if line[0] == b'P' || line[0] == b'L' { 84 | self.flat.record_line(if line[0] == b'P' { 85 | LineKind::Path 86 | } else { 87 | LineKind::Link 88 | }); 89 | deferred_lines.push(line); 90 | continue; 91 | } 92 | 93 | // Actually parse other lines. 94 | let gfa_line = gfaline::parse_line(line).unwrap(); 95 | self.record_line(&gfa_line); 96 | match gfa_line { 97 | gfaline::Line::Header(data) => { 98 | self.flat.add_header(data); 99 | } 100 | gfaline::Line::Segment(seg) => { 101 | self.add_seg(seg); 102 | } 103 | gfaline::Line::Link(_) | gfaline::Line::Path(_) => { 104 | unreachable!("paths and links handled separately") 105 | } 106 | } 107 | } 108 | 109 | // "Unwind" the deferred lines. 
110 | for line in deferred_lines { 111 | let gfa_line = gfaline::parse_line(line).unwrap(); 112 | match gfa_line { 113 | gfaline::Line::Link(link) => { 114 | self.add_link(link); 115 | } 116 | gfaline::Line::Path(path) => { 117 | self.add_path(path); 118 | } 119 | gfaline::Line::Header(_) | gfaline::Line::Segment(_) => { 120 | unreachable!("unexpected deferred line") 121 | } 122 | } 123 | } 124 | 125 | self.flat 126 | } 127 | 128 | /// Record a marker that captures the original GFA line ordering. 129 | fn record_line(&mut self, line: &gfaline::Line) { 130 | match line { 131 | gfaline::Line::Header(_) => self.flat.record_line(LineKind::Header), 132 | gfaline::Line::Segment(_) => self.flat.record_line(LineKind::Segment), 133 | gfaline::Line::Link(_) => self.flat.record_line(LineKind::Link), 134 | gfaline::Line::Path(_) => self.flat.record_line(LineKind::Path), 135 | } 136 | } 137 | 138 | fn add_seg(&mut self, seg: gfaline::Segment) { 139 | let seg_id = self.flat.add_seg(seg.name, seg.seq, seg.data); 140 | self.seg_ids.insert(seg.name, seg_id); 141 | } 142 | 143 | fn add_link(&mut self, link: gfaline::Link) { 144 | let from = self.seg_ids.get(link.from_seg).handle(link.from_orient); 145 | let to = self.seg_ids.get(link.to_seg).handle(link.to_orient); 146 | self.flat.add_link(from, to, link.overlap); 147 | } 148 | 149 | fn add_path(&mut self, path: gfaline::Path) { 150 | // Parse the steps. 
151 | let mut step_parser = gfaline::StepsParser::new(path.steps); 152 | let steps = self.flat.add_steps( 153 | (&mut step_parser).map(|(name, dir)| self.seg_ids.get(name).handle(dir.into())), 154 | ); 155 | assert!(step_parser.rest().is_empty()); 156 | 157 | self.flat 158 | .add_path(path.name, steps, path.overlaps.into_iter()); 159 | } 160 | } 161 | 162 | impl Parser<'static, flatgfa::HeapFamily> { 163 | pub fn for_heap() -> Self { 164 | Self::new(flatgfa::HeapGFAStore::default()) 165 | } 166 | } 167 | 168 | impl<'a> Parser<'a, flatgfa::FixedFamily> { 169 | pub fn for_slice(store: flatgfa::FixedGFAStore<'a>) -> Self { 170 | Self::new(store) 171 | } 172 | } 173 | 174 | /// Scan a GFA text file to count the number of each type of line and measure some sizes 175 | /// that are useful in estimating the final size of the FlatGFA file. 176 | pub fn estimate_toc(buf: &[u8]) -> crate::file::Toc { 177 | let mut segs = 0; 178 | let mut links = 0; 179 | let mut paths = 0; 180 | let mut header_bytes = 0; 181 | let mut seg_bytes = 0; 182 | let mut path_bytes = 0; 183 | 184 | let mut rest = buf; 185 | while !rest.is_empty() { 186 | let marker = rest[0]; 187 | let next = memchr::memchr(b'\n', rest).unwrap_or(rest.len() + 1); 188 | 189 | match marker { 190 | b'H' => { 191 | header_bytes += next; 192 | } 193 | b'S' => { 194 | segs += 1; 195 | seg_bytes += next; 196 | } 197 | b'L' => { 198 | links += 1; 199 | } 200 | b'P' => { 201 | paths += 1; 202 | path_bytes += next; 203 | } 204 | _ => { 205 | panic!("unknown line type") 206 | } 207 | } 208 | 209 | if next >= rest.len() { 210 | break; 211 | } 212 | rest = &rest[next + 1..]; 213 | } 214 | 215 | crate::file::Toc::estimate(segs, links, paths, header_bytes, seg_bytes, path_bytes) 216 | } 217 | -------------------------------------------------------------------------------- /flatgfa/src/cli/main.rs: -------------------------------------------------------------------------------- 1 | use argh::FromArgs; 2 | use 
flatgfa::flatgfa::FlatGFA; 3 | use flatgfa::parse::Parser; 4 | use flatgfa::pool::Store; 5 | use flatgfa::{cli::cmds, file, memfile, parse}; 6 | 7 | #[derive(FromArgs)] 8 | /// Convert between GFA text and FlatGFA binary formats. 9 | struct PolBin { 10 | /// read from a binary FlatGFA file 11 | #[argh(option, short = 'i')] 12 | input: Option, 13 | 14 | /// read from a text GFA file 15 | #[argh(option, short = 'I')] 16 | input_gfa: Option, 17 | 18 | /// write to a binary FlatGFA file 19 | #[argh(option, short = 'o')] 20 | output: Option, 21 | 22 | /// mutate the input file in place 23 | #[argh(switch, short = 'm')] 24 | mutate: bool, 25 | 26 | /// preallocation size factor 27 | #[argh(option, short = 'p', default = "32")] 28 | prealloc_factor: usize, 29 | 30 | #[argh(subcommand)] 31 | command: Option, 32 | } 33 | 34 | #[derive(FromArgs, PartialEq, Debug)] 35 | #[argh(subcommand)] 36 | enum Command { 37 | Toc(cmds::Toc), 38 | Paths(cmds::Paths), 39 | Stats(cmds::Stats), 40 | Position(cmds::Position), 41 | Extract(cmds::Extract), 42 | Depth(cmds::Depth), 43 | Chop(cmds::Chop), 44 | GafLookup(cmds::GAFLookup), 45 | Bench(cmds::Bench), 46 | BedIntersect(cmds::BEDIntersect), 47 | SeqExport(cmds::SeqExport), 48 | SeqImport(cmds::SeqImport), 49 | } 50 | 51 | fn main() -> Result<(), &'static str> { 52 | let args: PolBin = argh::from_env(); 53 | 54 | // A special case for converting from GFA text to an in-place FlatGFA binary. 55 | if args.mutate { 56 | if let (None, None, Some(out_name)) = (&args.command, &args.input, &args.output) { 57 | prealloc_translate(args.input_gfa.as_deref(), out_name, args.prealloc_factor); 58 | return Ok(()); 59 | } 60 | } 61 | 62 | // Another special case for parsing BED files, 63 | // since we do not parse a GFA file for that. 
64 | if let Some(Command::BedIntersect(sub_args)) = args.command { 65 | cmds::bed_intersect(sub_args); 66 | return Ok(()); 67 | } 68 | 69 | // Yet more special cases for sequence compression/decompression, which only 70 | // deal with raw sequence data and not GFA files. 71 | if let Some(Command::SeqExport(sub_args)) = args.command { 72 | cmds::seq_export(sub_args); 73 | return Ok(()); 74 | } 75 | 76 | if let Some(Command::SeqImport(sub_args)) = args.command { 77 | cmds::seq_import(sub_args); 78 | return Ok(()); 79 | } 80 | 81 | // Load the input from a file (binary) or stdin (text). 82 | let mmap; 83 | let mut mmap_mut; 84 | let store; 85 | let slice_store; 86 | let gfa = match args.input { 87 | Some(name) => { 88 | if args.mutate { 89 | mmap_mut = memfile::map_file_mut(&name); 90 | slice_store = file::view_store(&mut mmap_mut); 91 | slice_store.as_ref() 92 | } else { 93 | mmap = memfile::map_file(&name); 94 | file::view(&mmap) 95 | } 96 | } 97 | None => { 98 | // Parse from stdin or a file. 
99 | store = match args.input_gfa { 100 | Some(name) => { 101 | let file = memfile::map_file(&name); 102 | Parser::for_heap().parse_mem(file.as_ref()) 103 | } 104 | None => { 105 | let stdin = std::io::stdin(); 106 | Parser::for_heap().parse_stream(stdin.lock()) 107 | } 108 | }; 109 | store.as_ref() 110 | } 111 | }; 112 | 113 | match args.command { 114 | Some(Command::Toc(sub_args)) => { 115 | cmds::toc(&gfa, sub_args); 116 | } 117 | Some(Command::Paths(_)) => { 118 | cmds::paths(&gfa); 119 | } 120 | Some(Command::Stats(sub_args)) => { 121 | cmds::stats(&gfa, sub_args); 122 | } 123 | Some(Command::Position(sub_args)) => { 124 | cmds::position(&gfa, sub_args)?; 125 | } 126 | Some(Command::Extract(sub_args)) => { 127 | let store = cmds::extract(&gfa, sub_args)?; 128 | dump(&store.as_ref(), &args.output); 129 | } 130 | Some(Command::Depth(_)) => { 131 | cmds::depth(&gfa); 132 | } 133 | Some(Command::Chop(sub_args)) => { 134 | let store = cmds::chop(&gfa, sub_args)?; 135 | // TODO: Ideally, find a way to encapsulate the logic of chop in `cmd.rs`, instead of 136 | // defining here which values from out input `gfa` are needed by our final `flat` gfa. 
137 | // Here we are reference values in two different Stores to create this Flatgfa, and 138 | // have not yet found a good rust-safe way to do this 139 | let flat = flatgfa::FlatGFA { 140 | header: gfa.header, 141 | seq_data: gfa.seq_data, 142 | name_data: gfa.name_data, 143 | segs: store.segs.as_ref(), 144 | paths: store.paths.as_ref(), 145 | links: store.links.as_ref(), 146 | steps: store.steps.as_ref(), 147 | overlaps: store.overlaps.as_ref(), 148 | alignment: store.alignment.as_ref(), 149 | optional_data: store.optional_data.as_ref(), 150 | line_order: store.line_order.as_ref(), 151 | }; 152 | dump(&flat, &args.output); 153 | } 154 | Some(Command::GafLookup(sub_args)) => { 155 | cmds::gaf_lookup(&gfa, sub_args); 156 | } 157 | Some(Command::Bench(sub_args)) => { 158 | cmds::bench(sub_args); 159 | } 160 | Some(Command::BedIntersect(_sub_args)) => { 161 | panic!("Unreachable code"); 162 | } 163 | Some(Command::SeqExport(_sub_args)) => { 164 | panic!("Unreachable code"); 165 | } 166 | Some(Command::SeqImport(_sub_args)) => { 167 | panic!("Unreachable code"); 168 | } 169 | None => { 170 | // Just emit the GFA or FlatGFA file. 171 | dump(&gfa, &args.output); 172 | } 173 | } 174 | 175 | Ok(()) 176 | } 177 | 178 | /// Write a FlatGFA either to a GFA text file to stdout or a binary FlatGFA file given 179 | /// with a name. 180 | fn dump(gfa: &FlatGFA, output: &Option) { 181 | match output { 182 | Some(name) => { 183 | let mut mmap = memfile::map_new_file(name, file::size(gfa) as u64); 184 | file::dump(gfa, &mut mmap); 185 | mmap.flush().unwrap(); 186 | } 187 | None => { 188 | print!("{gfa}"); 189 | } 190 | } 191 | } 192 | 193 | /// A special-case fast-path transformation from a GFA text file to a *preallocated* 194 | /// FlatGFA, with sizes based on estimates of the input counts. 
195 | fn prealloc_translate(in_name: Option<&str>, out_name: &str, prealloc_factor: usize) { 196 | let file; 197 | let (input_buf, empty_toc) = match in_name { 198 | // If we have an input GFA file, we can estimate its sizes for the TOC. 199 | Some(name) => { 200 | file = memfile::map_file(name); 201 | let toc = parse::estimate_toc(file.as_ref()); 202 | (Some(file.as_ref()), toc) 203 | } 204 | 205 | // Otherwise, we need to guess. 206 | None => (None, file::Toc::guess(prealloc_factor)), 207 | }; 208 | 209 | // Create a file with an empty table of contents. 210 | let mut mmap = memfile::map_new_file(out_name, empty_toc.size() as u64); 211 | let (toc, store) = file::init(&mut mmap, empty_toc); 212 | 213 | // Parse the input into the file. 214 | match input_buf { 215 | Some(buf) => { 216 | let store = Parser::for_slice(store).parse_mem(buf); 217 | *toc = file::Toc::for_fixed_store(&store) 218 | } 219 | None => { 220 | let stdin = std::io::stdin(); 221 | let store = Parser::for_slice(store).parse_stream(stdin.lock()); 222 | *toc = file::Toc::for_fixed_store(&store) 223 | } 224 | }; 225 | 226 | mmap.flush().unwrap(); 227 | } 228 | -------------------------------------------------------------------------------- /flatgfa/src/ops/extract.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa::{self, Handle, Segment}; 2 | use crate::pool::{self, Id, Span, Store}; 3 | use std::collections::HashMap; 4 | 5 | /// A helper to construct a new graph that includes part of an old graph. 6 | pub struct SubgraphBuilder<'a> { 7 | pub old: &'a flatgfa::FlatGFA<'a>, 8 | pub store: flatgfa::HeapGFAStore, 9 | pub seg_map: HashMap, Id>, 10 | } 11 | 12 | pub struct SubpathStart { 13 | step: Id, // The id of the first step in the subpath. 14 | pos: usize, // The bp position at the start of the subpath. 
15 | } 16 | 17 | impl<'a> SubgraphBuilder<'a> { 18 | pub fn new(old: &'a flatgfa::FlatGFA) -> Self { 19 | Self { 20 | old, 21 | store: flatgfa::HeapGFAStore::default(), 22 | seg_map: HashMap::new(), 23 | } 24 | } 25 | 26 | /// Include the old graph's header 27 | pub fn add_header(&mut self) { 28 | // pub fn add_header(&mut self, version: &[u8]) { 29 | // assert!(self.header.as_ref().is_empty()); 30 | // self.header.add_slice(version); 31 | // } 32 | assert!(self.store.header.as_ref().is_empty()); 33 | self.store.header.add_slice(self.old.header.all()); 34 | } 35 | 36 | /// Add a segment from the source graph to this subgraph. 37 | fn include_seg(&mut self, seg_id: Id) { 38 | let seg = &self.old.segs[seg_id]; 39 | let new_seg_id = self.store.add_seg( 40 | seg.name, 41 | self.old.get_seq(seg), 42 | self.old.get_optional_data(seg), 43 | ); 44 | self.seg_map.insert(seg_id, new_seg_id); 45 | } 46 | 47 | /// Add a link from the source graph to the subgraph. 48 | fn include_link(&mut self, link: &flatgfa::Link) { 49 | let from = self.tr_handle(link.from); 50 | let to = self.tr_handle(link.to); 51 | let overlap = self.old.get_alignment(link.overlap); 52 | self.store.add_link(from, to, overlap.ops.into()); 53 | } 54 | 55 | /// Add a single subpath from the given path to the subgraph. 56 | fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) { 57 | let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id? 58 | let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos); 59 | self.store 60 | .add_path(name.as_bytes(), steps, std::iter::empty()); 61 | } 62 | 63 | /// Identify all the subpaths in a path from the original graph that cross through 64 | /// segments in this subgraph and merge them if possible. 
65 | fn merge_subpaths(&mut self, path: &flatgfa::Path, max_distance_subpaths: usize) { 66 | // these are subpaths which *aren't* already included in the new graph 67 | let mut cur_subpath_start: Option = Some(0); 68 | let mut subpath_length = 0; 69 | let mut ignore_path = true; 70 | 71 | for (idx, step) in self.old.steps[path.steps].iter().enumerate() { 72 | let in_neighb = self.seg_map.contains_key(&step.segment()); 73 | 74 | if let (Some(start), true) = (&cur_subpath_start, in_neighb) { 75 | // We just entered the subgraph. End the current subpath. 76 | if !ignore_path && subpath_length <= max_distance_subpaths { 77 | // TODO: type safety 78 | let subpath_span = Span::new( 79 | path.steps.start + *start as u32, 80 | path.steps.start + idx as u32, 81 | ); 82 | for step in &self.old.steps[subpath_span] { 83 | if !self.seg_map.contains_key(&step.segment()) { 84 | self.include_seg(step.segment()); 85 | } 86 | } 87 | } 88 | cur_subpath_start = None; 89 | ignore_path = false; 90 | } else if let (None, false) = (&cur_subpath_start, in_neighb) { 91 | // We've exited the current subgraph, start a new subpath 92 | cur_subpath_start = Some(idx); 93 | } 94 | 95 | // Track the current bp position in the path. 96 | subpath_length += self.old.get_handle_seg(*step).len(); 97 | } 98 | } 99 | 100 | /// Identify all the subpaths in a path from the original graph that cross through 101 | /// segments in this subgraph and add them. 102 | fn find_subpaths(&mut self, path: &flatgfa::Path) { 103 | let mut cur_subpath_start: Option = None; 104 | let mut path_pos = 0; 105 | 106 | for step in &self.old.steps[path.steps] { 107 | let in_neighb = self.seg_map.contains_key(&step.segment()); 108 | 109 | if let (Some(start), false) = (&cur_subpath_start, in_neighb) { 110 | // End the current subpath. 111 | self.include_subpath(path, start, path_pos); 112 | cur_subpath_start = None; 113 | } else if let (None, true) = (&cur_subpath_start, in_neighb) { 114 | // Start a new subpath. 
115 | cur_subpath_start = Some(SubpathStart { 116 | step: self.store.steps.next_id(), 117 | pos: path_pos, 118 | }); 119 | } 120 | 121 | // Add the (translated) step to the new graph. 122 | if in_neighb { 123 | self.store.add_step(self.tr_handle(*step)); 124 | } 125 | 126 | // Track the current bp position in the path. 127 | path_pos += self.old.get_handle_seg(*step).len(); 128 | } 129 | 130 | // Did we reach the end of the path while still in the neighborhood? 131 | if let Some(start) = cur_subpath_start { 132 | self.include_subpath(path, &start, path_pos); 133 | } 134 | } 135 | 136 | /// Translate a handle from the source graph to this subgraph. 137 | fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle { 138 | // TODO: is this just generating the handle or should we add it to the new graph? 139 | self.seg_map[&old_handle.segment()].handle(old_handle.orient()) 140 | } 141 | 142 | /// Check whether a segment from the old graph is in the subgraph. 143 | fn contains(&self, old_seg_id: Id) -> bool { 144 | self.seg_map.contains_key(&old_seg_id) 145 | } 146 | 147 | /// Extract a subgraph consisting of a neighborhood of segments up to `dist` links away 148 | /// from the given segment in the original graph. 149 | /// 150 | /// Include any links between the segments in the neighborhood and subpaths crossing 151 | /// through the neighborhood. 152 | pub fn extract( 153 | &mut self, 154 | origin: Id, 155 | dist: usize, 156 | max_distance_subpaths: usize, 157 | num_iterations: usize, 158 | ) { 159 | self.include_seg(origin); 160 | 161 | // Find the set of all segments that are c links away. 
162 | let mut frontier: Vec> = Vec::new(); 163 | let mut next_frontier: Vec> = Vec::new(); 164 | frontier.push(origin); 165 | for _ in 0..dist { 166 | while let Some(seg_id) = frontier.pop() { 167 | for link in self.old.links.all().iter() { 168 | if let Some(other_seg) = link.incident_seg(seg_id) { 169 | // Add other_seg to the frontier set if it is not already in the frontier set or the seg_map 170 | if !self.seg_map.contains_key(&other_seg) { 171 | self.include_seg(other_seg); 172 | next_frontier.push(other_seg); 173 | } 174 | } 175 | } 176 | } 177 | (frontier, next_frontier) = (next_frontier, frontier); 178 | } 179 | 180 | // Merge subpaths within max_distance_subpaths bp of each other, num_iterations times 181 | for _ in 0..num_iterations { 182 | for path in self.old.paths.all().iter() { 183 | self.merge_subpaths(path, max_distance_subpaths); 184 | } 185 | } 186 | 187 | // Include all links within the subgraph. 188 | for link in self.old.links.all().iter() { 189 | if self.contains(link.from.segment()) && self.contains(link.to.segment()) { 190 | self.include_link(link); 191 | } 192 | } 193 | 194 | // Find subpaths within the subgraph. 195 | for path in self.old.paths.all().iter() { 196 | self.find_subpaths(path); 197 | } 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /flatgfa/src/gfaline.rs: -------------------------------------------------------------------------------- 1 | use crate::flatgfa::{AlignOp, Orientation}; 2 | use atoi::FromRadix10; 3 | 4 | type ParseResult = Result; 5 | type LineResult<'a> = ParseResult>; 6 | type PartialParseResult<'a, T> = ParseResult<(T, &'a [u8])>; 7 | 8 | /// A parsed GFA file line. 
9 | pub enum Line<'a> { 10 | Header(&'a [u8]), 11 | Segment(Segment<'a>), 12 | Link(Link), 13 | Path(Path<'a>), 14 | } 15 | 16 | pub struct Segment<'a> { 17 | pub name: usize, 18 | pub seq: &'a [u8], 19 | pub data: &'a [u8], 20 | } 21 | 22 | pub struct Link { 23 | pub from_seg: usize, 24 | pub from_orient: Orientation, 25 | pub to_seg: usize, 26 | pub to_orient: Orientation, 27 | pub overlap: Vec, 28 | } 29 | 30 | pub struct Path<'a> { 31 | pub name: &'a [u8], 32 | pub steps: &'a [u8], 33 | pub overlaps: Vec>, 34 | } 35 | 36 | /// Parse a single line of a GFA file. 37 | pub fn parse_line(line: &[u8]) -> LineResult<'_> { 38 | if line.len() < 2 || line[1] != b'\t' { 39 | return Err("expected marker and tab"); 40 | } 41 | let rest = &line[2..]; 42 | match line[0] { 43 | b'H' => parse_header(rest), 44 | b'S' => parse_seg(rest), 45 | b'L' => parse_link(rest), 46 | b'P' => parse_path(rest), 47 | _ => Err("unhandled line kind"), 48 | } 49 | } 50 | 51 | /// Parse a header line, which looks like `H `. 52 | fn parse_header(line: &[u8]) -> LineResult<'_> { 53 | Ok(Line::Header(line)) 54 | } 55 | 56 | /// Parse a segment line, which looks like `S `. 57 | fn parse_seg(line: &[u8]) -> LineResult<'_> { 58 | let (name, rest) = parse_num(line)?; 59 | let rest = parse_byte(rest, b'\t')?; 60 | let (seq, data) = parse_field(rest)?; 61 | Ok(Line::Segment(Segment { name, seq, data })) 62 | } 63 | 64 | /// Parse a link line, which looks like `L <+-> <+-> `. 
65 | fn parse_link(line: &[u8]) -> LineResult<'_> { 66 | let (from_seg, rest) = parse_num(line)?; 67 | let rest = parse_byte(rest, b'\t')?; 68 | let (from_orient, rest) = parse_orient(rest)?; 69 | let rest = parse_byte(rest, b'\t')?; 70 | let (to_seg, rest) = parse_num(rest)?; 71 | let rest = parse_byte(rest, b'\t')?; 72 | let (to_orient, rest) = parse_orient(rest)?; 73 | let rest = parse_byte(rest, b'\t')?; 74 | let (overlap, rest) = parse_align(rest)?; 75 | if !rest.is_empty() { 76 | return Err("expected end of line"); 77 | } 78 | Ok(Line::Link(Link { 79 | from_seg, 80 | from_orient, 81 | to_seg, 82 | to_orient, 83 | overlap, 84 | })) 85 | } 86 | 87 | /// Parse a path line, which looks like `P <*|CIGARs>`. 88 | fn parse_path(line: &[u8]) -> LineResult<'_> { 89 | let (name, rest) = parse_field(line)?; 90 | let (steps, rest) = parse_field(rest)?; 91 | let (overlaps, rest) = parse_maybe_overlap_list(rest)?; 92 | if !rest.is_empty() { 93 | return Err("expected end of line"); 94 | } 95 | Ok(Line::Path(Path { 96 | name, 97 | steps, 98 | overlaps, 99 | })) 100 | } 101 | 102 | /// Parse a *possible* overlap list, which may be `*` (empty). 103 | pub fn parse_maybe_overlap_list(s: &[u8]) -> PartialParseResult<'_, Vec>> { 104 | if s == b"*" { 105 | Ok((vec![], &s[1..])) 106 | } else { 107 | parse_overlap_list(s) 108 | } 109 | } 110 | 111 | /// Parse a comma-separated list of CIGAR strings. 112 | /// 113 | /// TODO: This could be optimized to avoid accumulating into a vector. 114 | fn parse_overlap_list(s: &[u8]) -> PartialParseResult<'_, Vec>> { 115 | let mut rest = s; 116 | let mut overlaps = vec![]; 117 | while !rest.is_empty() { 118 | let overlap; 119 | (overlap, rest) = parse_align(rest)?; 120 | overlaps.push(overlap); 121 | if !rest.is_empty() { 122 | rest = parse_byte(rest, b',')?; 123 | } 124 | } 125 | Ok((overlaps, rest)) 126 | } 127 | 128 | /// Consume a chunk of a string up to a given marker byte. 
129 | fn parse_until(line: &[u8], marker: u8) -> PartialParseResult<'_, &[u8]> { 130 | let end = memchr::memchr(marker, line).unwrap_or(line.len()); 131 | let rest = if end == line.len() { 132 | &[] 133 | } else { 134 | &line[end + 1..] 135 | }; 136 | Ok((&line[..end], rest)) 137 | } 138 | 139 | /// Consume a string from the line, until a tab (or the end of the line). 140 | pub fn parse_field(line: &[u8]) -> PartialParseResult<'_, &[u8]> { 141 | parse_until(line, b'\t') 142 | } 143 | 144 | /// Consume a specific byte. 145 | fn parse_byte(s: &[u8], byte: u8) -> ParseResult<&[u8]> { 146 | if s.is_empty() || s[0] != byte { 147 | return Err("expected byte"); 148 | } 149 | Ok(&s[1..]) 150 | } 151 | 152 | /// Parse a single integer. 153 | fn parse_num(s: &[u8]) -> PartialParseResult<'_, T> { 154 | match T::from_radix_10(s) { 155 | (_, 0) => Err("expected number"), 156 | (num, used) => Ok((num, &s[used..])), 157 | } 158 | } 159 | 160 | /// Parse a segment orientation (+ or -). 161 | fn parse_orient(line: &[u8]) -> PartialParseResult<'_, Orientation> { 162 | if line.is_empty() { 163 | return Err("expected orientation"); 164 | } 165 | let orient = match line[0] { 166 | b'+' => Orientation::Forward, 167 | b'-' => Orientation::Backward, 168 | _ => return Err("expected orient"), 169 | }; 170 | Ok((orient, &line[1..])) 171 | } 172 | 173 | /// Parse a single CIGAR alignment operation (like `4D`). 174 | fn parse_align_op(s: &[u8]) -> PartialParseResult<'_, AlignOp> { 175 | let (len, rest) = parse_num::(s)?; 176 | let op = match rest[0] { 177 | b'M' => crate::flatgfa::AlignOpcode::Match, 178 | b'N' => crate::flatgfa::AlignOpcode::Gap, 179 | b'D' => crate::flatgfa::AlignOpcode::Deletion, 180 | b'I' => crate::flatgfa::AlignOpcode::Insertion, 181 | _ => return Err("expected align op"), 182 | }; 183 | Ok((AlignOp::new(op, len), &rest[1..])) 184 | } 185 | 186 | /// Parse a complete CIGAR alignment string (like `3M2I`). 
187 | /// 188 | /// TODO This could be optimized to avoid collecting into a vector. 189 | fn parse_align(s: &[u8]) -> PartialParseResult<'_, Vec> { 190 | let mut rest = s; 191 | let mut align = vec![]; 192 | while !rest.is_empty() && rest[0].is_ascii_digit() { 193 | let op; 194 | (op, rest) = parse_align_op(rest)?; 195 | align.push(op); 196 | } 197 | Ok((align, rest)) 198 | } 199 | 200 | /// Parse GFA paths' segment lists. These look like `1+,2-,3+`. 201 | pub struct StepsParser<'a> { 202 | str: &'a [u8], 203 | index: usize, 204 | state: StepsParseState, 205 | seg: usize, 206 | } 207 | 208 | /// The parser state: we're either looking for a segment name (or a +/- terminator), 209 | /// or we're expecting a comma (or end of string). 210 | enum StepsParseState { 211 | Seg, 212 | Comma, 213 | } 214 | 215 | impl<'a> StepsParser<'a> { 216 | pub fn new(str: &'a [u8]) -> Self { 217 | StepsParser { 218 | str, 219 | index: 0, 220 | state: StepsParseState::Seg, 221 | seg: 0, 222 | } 223 | } 224 | 225 | pub fn rest(&self) -> &[u8] { 226 | &self.str[self.index..] 227 | } 228 | } 229 | 230 | impl Iterator for StepsParser<'_> { 231 | type Item = (usize, bool); 232 | fn next(&mut self) -> Option<(usize, bool)> { 233 | while self.index < self.str.len() { 234 | // Consume one byte. 
235 | let byte = self.str[self.index]; 236 | self.index += 1; 237 | 238 | match self.state { 239 | StepsParseState::Seg => { 240 | if byte == b'+' || byte == b'-' { 241 | self.state = StepsParseState::Comma; 242 | return Some((self.seg, byte == b'+')); 243 | } else if byte.is_ascii_digit() { 244 | self.seg *= 10; 245 | self.seg += (byte - b'0') as usize; 246 | } else { 247 | return None; 248 | } 249 | } 250 | StepsParseState::Comma => { 251 | if byte == b',' { 252 | self.state = StepsParseState::Seg; 253 | self.seg = 0; 254 | } else { 255 | return None; 256 | } 257 | } 258 | } 259 | } 260 | 261 | None 262 | } 263 | } 264 | 265 | #[test] 266 | fn test_parse_steps() { 267 | let s = b"1+,23-,4+ suffix"; 268 | let mut parser = StepsParser::new(s); 269 | let path: Vec<_> = (&mut parser).collect(); 270 | assert_eq!(path, vec![(1, true), (23, false), (4, true)]); 271 | assert_eq!(parser.rest(), b"suffix"); 272 | } 273 | --------------------------------------------------------------------------------