├── .github ├── ISSUE_TEMPLATE │ ├── bugreport.yml │ └── featurerequest.yml ├── dependabot.yml └── workflows │ └── lint_and_test.yml ├── .gitignore ├── CITATION.cff ├── Cargo.toml ├── LICENSE ├── README.md ├── benches ├── criterion.rs └── iai.rs ├── examples └── query.rs ├── rustfmt.toml ├── src ├── containers │ ├── adj_list.rs │ ├── bitmap.rs │ ├── control_info.rs │ ├── mod.rs │ ├── rdf.rs │ ├── sequence.rs │ └── vbyte.rs ├── dict_sect_pfc.rs ├── four_sect_dict.rs ├── hdt.rs ├── hdt_graph.rs ├── hdt_graph │ └── term.rs ├── header.rs ├── lib.rs ├── triples.rs └── triples │ ├── object_iter.rs │ ├── predicate_iter.rs │ ├── predicate_object_iter.rs │ └── subject_iter.rs └── tests └── resources ├── snikmeta.hdt └── yago_header.hdt /.github/ISSUE_TEMPLATE/bugreport.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report. 3 | labels: ["bug"] 4 | assignees: 5 | - KonradHoeffner 6 | body: 7 | - type: textarea 8 | id: what-happened 9 | attributes: 10 | label: What happened? 11 | validations: 12 | required: true 13 | - type: input 14 | id: version 15 | attributes: 16 | label: library version 17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/featurerequest.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest a new feature. 3 | labels: ["enhancement"] 4 | assignees: 5 | - KonradHoeffner 6 | body: 7 | - type: textarea 8 | id: new-feature 9 | attributes: 10 | label: What new feature do you want? 
11 | validations: 12 | required: true 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 2 | version: 2 3 | updates: 4 | - package-ecosystem: "github-actions" 5 | directory: "/.github/workflows" 6 | assignees: ["KonradHoeffner"] 7 | schedule: 8 | interval: "daily" 9 | - package-ecosystem: "cargo" 10 | directory: "/" 11 | assignees: ["KonradHoeffner"] 12 | schedule: 13 | interval: "daily" 14 | -------------------------------------------------------------------------------- /.github/workflows/lint_and_test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Lint and Test 3 | 4 | on: 5 | push: 6 | paths-ignore: 7 | - '.gitignore' 8 | - 'CITATION.cff' 9 | - 'LICENSE' 10 | - 'README.md' 11 | pull_request: 12 | paths-ignore: 13 | - '.gitignore' 14 | - 'CITATION.cff' 15 | - 'LICENSE' 16 | - 'README.md' 17 | 18 | jobs: 19 | build: 20 | strategy: 21 | matrix: 22 | include: 23 | - os: ubuntu-latest 24 | # - os: macos-latest 25 | # - os: windows-latest 26 | runs-on: ${{ matrix.os }} 27 | steps: 28 | - uses: actions/checkout@v4 29 | - name: Install Rust 30 | uses: dtolnay/rust-toolchain@stable 31 | with: 32 | components: rustfmt,clippy 33 | - name: fmt 34 | run: cargo fmt -- --check 35 | - name: check feature combinations 36 | run: | 37 | cargo check --no-default-features 38 | cargo check --no-default-features --features sophia 39 | cargo check --no-default-features --features cache 40 | cargo check --no-default-features --features "sophia cache" 41 | - name: clippy 42 | run: cargo clippy --no-deps --all-features 43 | - name: build 44 | run: cargo build 45 | - name: test 46 | run: cargo test --all-features 47 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | /data 4 | *.index.v1-rust-cache -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | --- 2 | cff-version: 1.2.0 3 | title: "hdt-rs: A Rust library 4 | for the Header Dictionary Triples binary RDF compression format" 5 | message: If you use this software, please cite our article in the 6 | Journal of Open Source Software. 7 | type: software 8 | authors: 9 | - given-names: Konrad 10 | family-names: Höffner 11 | email: konrad.hoeffner@uni-leipzig.de 12 | affiliation: >- 13 | Institute for Medical Informatics, Statistics 14 | and Epidemiology (IMISE), Leipzig, Germany 15 | orcid: 'https://orcid.org/0000-0001-7358-3217' 16 | - given-names: Tim 17 | family-names: Baccaert 18 | affiliation: >- 19 | Independent Researcher 20 | Belgium 21 | repository-code: 'https://github.com/konradhoeffner/hdt' 22 | url: 'https://crates.io/crates/hdt' 23 | keywords: 24 | - RDF 25 | - HDT 26 | - Rust 27 | license: MIT 28 | preferred-citation: 29 | type: article 30 | authors: 31 | - family-names: Höffner 32 | given-names: Konrad 33 | orcid: "https://orcid.org/0000-0001-7358-3217" 34 | - family-names: "Baccaert" 35 | given-names: "Tim" 36 | date-published: 2023-04-29 37 | doi: 10.21105/joss.05114 38 | issn: 2475-9066 39 | issue: 84 40 | journal: Journal of Open Source Software 41 | publisher: 42 | name: Open Journals 43 | start: 5114 44 | title: "hdt-rs: A Rust library 45 | for the Header Dictionary Triples binary RDF compression format" 46 | url: "https://joss.theoj.org/papers/10.21105/joss.05114" 47 | volume: 8 48 | -------------------------------------------------------------------------------- /Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "hdt" 3 | version = "0.3.0" 4 | repository = "https://github.com/konradhoeffner/hdt" 5 | authors = ["Tim Baccaert ", "Konrad Höffner"] 6 | license = "MIT" 7 | description = "Library for the Header Dictionary Triples (HDT) RDF compression format." 8 | keywords = ["rdf", "hdt", "compression", "file-format"] 9 | categories = ["compression", "filesystem", "parsing", "web-programming"] 10 | edition = "2024" 11 | rust-version = "1.85" 12 | 13 | [package.metadata."docs.rs"] 14 | all-features = true 15 | 16 | [dependencies] 17 | bytesize = "2" 18 | crc = "3" 19 | iref = "3" 20 | langtag = "0.4" 21 | ntriple = "0.1" 22 | sophia = { version = "0.9", optional = true } 23 | sucds = "0.8" 24 | thiserror = "2" 25 | log = "0.4" 26 | mownstr = "0.3" 27 | bincode = { version = "2", optional = true, default-features = false, features = ["std", "serde"] } 28 | serde = { version = "1", optional = true, features = ["derive"] } 29 | 30 | [features] 31 | default = ["sophia"] 32 | sophia = ["dep:sophia"] 33 | cache = ["dep:serde", "dep:bincode"] 34 | 35 | [[bench]] 36 | name = "criterion" 37 | harness = false 38 | 39 | [[bench]] 40 | name = "iai" 41 | harness = false 42 | 43 | [lib] 44 | bench = false 45 | 46 | [profile.test] 47 | opt-level = 1 48 | 49 | [dev-dependencies] 50 | pretty_assertions = "1" 51 | env_logger = { version = "0.11", default-features = false, features = ["auto-color"] } 52 | criterion = { version = "0.6", default-features = false, features = ["cargo_bench_support", "html_reports"] } 53 | #iai = "0.1" 54 | iai = { git = "https://github.com/sigaloid/iai", rev = "d56a597" } # until https://github.com/bheisler/iai/pull/35 is merged 55 | color-eyre = "0.6" 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 
2022 Tim Baccaert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HDT 2 | 3 | [![Latest Version](https://img.shields.io/crates/v/hdt.svg)](https://crates.io/crates/hdt) 4 | [![Lint and Test](https://github.com/konradhoeffner/hdt/actions/workflows/lint_and_test.yml/badge.svg)](https://github.com/konradhoeffner/hdt/actions/workflows/lint_and_test.yml) 5 | [![Documentation](https://docs.rs/hdt/badge.svg)](https://docs.rs/hdt/) 6 | [![Benchmarks](https://img.shields.io/badge/Benchmarks--x.svg?style=social)](https://github.com/KonradHoeffner/hdt_benchmark/blob/master/benchmark_results.ipynb) 7 | [![HDT Rust @ LD Party Video](https://img.shields.io/badge/video-8A2BE2)](https://www.youtube.com/watch?v=R-S0o_UwPMk) 8 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.05114/status.svg)](https://doi.org/10.21105/joss.05114) 9 | 10 | A Rust library for the [Header Dictionary Triples](https://www.rdfhdt.org/) compressed RDF format, including: 11 | 12 | * loading the HDT default format as created by [hdt-cpp](https://github.com/rdfhdt/hdt-cpp) 13 | * efficient querying by triple patterns 14 | * serializing into other formats like RDF Turtle and N-Triples using the [Sophia](https://crates.io/crates/sophia) adapter 15 | 16 | However it cannot: 17 | 18 | * load other RDF formats 19 | * load other HDT variants 20 | 21 | For this functionality and acknowledgement of all the original authors, please look at the reference implementations in C++ and Java by the [https://github.com/rdfhdt](https://github.com/rdfhdt) organisation. 22 | 23 | It also cannot: 24 | 25 | * swap data to disk 26 | * modify the RDF graph in memory 27 | * run SPARQL queries 28 | 29 | If you need any of those features, consider using a SPARQL endpoint instead. 
30 | 31 | ## Examples 32 | 33 | ```rust 34 | use hdt::Hdt; 35 | 36 | let file = std::fs::File::open("example.hdt").expect("error opening file"); 37 | let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT"); 38 | // query 39 | let majors = hdt.triples_with_pattern(Some("http://dbpedia.org/resource/Leipzig"), Some("http://dbpedia.org/ontology/major"),None); 40 | println!("{:?}", majors.collect::>()); 41 | ``` 42 | 43 | You can also use the Sophia adapter to load HDT files and reduce memory consumption of an existing application based on Sophia, which is re-exported as `hdt::sophia`: 44 | 45 | ```rust 46 | use hdt::{Hdt,HdtGraph}; 47 | use hdt::sophia::api::graph::Graph; 48 | use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any}; 49 | 50 | let file = std::fs::File::open("dbpedia.hdt").expect("error opening file"); 51 | let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT"); 52 | let graph = HdtGraph::new(hdt); 53 | let s = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/resource/Leipzig".into())); 54 | let p = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/ontology/major".into())); 55 | let majors = graph.triples_matching(Some(s),Some(p),Any); 56 | ``` 57 | 58 | If you don't want to pull in the Sophia dependency, you can exclude the adapter: 59 | 60 | ```toml 61 | [dependencies] 62 | hdt = { version = "...", default-features = false } 63 | ``` 64 | 65 | There is also a runnable example [in the examples folder](https://github.com/KonradHoeffner/hdt/tree/main/examples), which you can run with `cargo run --example query`. 66 | 67 | Users can also choose to use the experimental `cache` feature. If enabled, the library will utilize a custom cached TriplesBitmap file if it exists or create one if it does not exist. 
68 | 69 | ```rust 70 | let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap(); 71 | ``` 72 | 73 | The `cache` feature is experimental and may change or be removed in future releases. 74 | 75 | ## API Documentation 76 | 77 | See [docs.rs/latest/hdt](https://docs.rs/hdt) or generate for yourself with `cargo doc --no-deps` without disabling default features. 78 | 79 | ## Performance 80 | The performance of a query depends on the size of the graph, the type of triple pattern and the size of the result set. 81 | When using large HDT files, make sure to enable the release profile, such as through `cargo build --release`, as this can be much faster than using the dev profile. 82 | 83 | ### Profiling 84 | If you want to optimize the code, you can use a profiler. 85 | The provided test data is very small in order to keep the size of the crate down; locally modifying the tests to use a large HDT file returns more meaningful results. 86 | 87 | #### Example with perf and Firefox Profiler 88 | 89 | $ cargo test --release 90 | [...] 91 | Running unittests src/lib.rs (target/release/deps/hdt-2b2f139dafe69681) 92 | [...] 93 | $ perf record --call-graph=dwarf target/release/deps/hdt-2b2f139dafe69681 hdt::tests::triples 94 | $ perf script > /tmp/test.perf 95 | 96 | Then go to <https://profiler.firefox.com/> and open `/tmp/test.perf`. 
97 | 98 | ## Criterion benchmark 99 | 100 | cargo bench --bench criterion 101 | 102 | * requires [persondata\_en.hdt](https://github.com/KonradHoeffner/hdt/releases/download/benchmarkdata/persondata_en.hdt.bz2) placed in `tests/resources` 103 | 104 | ## iai benchmark 105 | 106 | cargo bench --bench iai 107 | 108 | * requires [persondata\_en\_10k.hdt](https://github.com/KonradHoeffner/hdt/releases/download/benchmarkdata/persondata_en_10k.hdt.bz2) placed in `tests/resources` 109 | * requires [Valgrind](https://valgrind.org/) to be installed 110 | 111 | ## Comparative benchmark suite 112 | 113 | [The separate benchmark suite](https://github.com/KonradHoeffner/hdt_benchmark/blob/master/benchmark_results.ipynb) compares the performance of this and some other RDF libraries. 114 | 115 | ## Community Guidelines 116 | 117 | ### Issues and Support 118 | If you have a problem with the software, want to report a bug or have a feature request, please use the [issue tracker](https://github.com/KonradHoeffner/hdt/issues). 119 | If you have a different type of request, feel free to send an email to [Konrad](mailto:konrad.hoeffner@uni-leipzig.de). 120 | 121 | ### Citation 122 | 123 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.05114/status.svg)](https://doi.org/10.21105/joss.05114) 124 | 125 | If you use this library in your research, please cite our paper in the Journal of Open Source Software. 126 | We also provide a [CITATION.cff](./CITATION.cff) file. 
127 | 128 | #### BibTeX entry 129 | 130 | ```bibtex 131 | @article{hdtrs, 132 | doi = {10.21105/joss.05114}, 133 | year = {2023}, 134 | publisher = {The Open Journal}, 135 | volume = {8}, 136 | number = {84}, 137 | pages = {5114}, 138 | author = {Konrad Höffner and Tim Baccaert}, 139 | title = {hdt-rs: {A} {R}ust library for the {H}eader {D}ictionary {T}riples binary {RDF} compression format}, 140 | journal = {Journal of Open Source Software} 141 | } 142 | ``` 143 | 144 | #### Citation string 145 | 146 | Höffner et al., (2023). hdt-rs: A Rust library for the Header Dictionary Triples binary RDF compression format. Journal of Open Source Software, 8(84), 5114, https://doi.org/10.21105/joss.05114 147 | 148 | ### Contribute 149 | We are happy to receive pull requests. 150 | Please use `cargo fmt` before committing, make sure that `cargo test` succeeds and that the code compiles on the stable and nightly toolchain both with and without the "sophia" feature active. 151 | `cargo clippy` should not report any warnings. 152 | -------------------------------------------------------------------------------- /benches/criterion.rs: -------------------------------------------------------------------------------- 1 | use criterion::{Criterion, criterion_group, criterion_main}; 2 | use hdt::Hdt; 3 | use hdt::HdtGraph; 4 | use hdt::IdKind; 5 | use hdt::triples::*; 6 | use sophia::api::graph::Graph; 7 | use sophia::api::term::IriRef; 8 | use sophia::api::term::SimpleTerm; 9 | use sophia::api::term::matcher::Any; 10 | use std::fs::File; 11 | 12 | const VINCENT: &str = "http://dbpedia.org/resource/Vincent_Descombes_Sevoie"; 13 | const TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; 14 | const PERSON: &str = "http://dbpedia.org/ontology/Person"; 15 | 16 | fn load() -> HdtGraph { 17 | let filename = "tests/resources/persondata_en.hdt"; 18 | let file = File::open(filename) 19 | .expect(&format!("Error opening file {filename}, did you forget to download it? 
See README.md.")); 20 | //let file = File::open("tests/resources/lscomplete2015.hdt").expect("error opening file"); 21 | //let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file"); 22 | let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap(); 23 | HdtGraph::new(hdt) 24 | } 25 | 26 | fn query(c: &mut Criterion) { 27 | let graph = load(); 28 | let triples = &graph.hdt.triples; 29 | let twp = |s, p, o| graph.hdt.triples_with_pattern(s, p, o); 30 | 31 | let vincent_id = graph.hdt.dict.string_to_id(VINCENT, &IdKind::Subject); 32 | let type_id = graph.hdt.dict.string_to_id(TYPE, &IdKind::Predicate); 33 | let person_id = graph.hdt.dict.string_to_id(PERSON, &IdKind::Object); 34 | let vincent_term = SimpleTerm::Iri(IriRef::new_unchecked(VINCENT.into())); 35 | let type_term = SimpleTerm::Iri(IriRef::new_unchecked(TYPE.into())); 36 | let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into())); 37 | 38 | // count to prevent optimizing away function call 39 | let mut group = c.benchmark_group("??? (all)"); 40 | group.sample_size(10); 41 | group.bench_function("0.1 all triple IDs", |b| b.iter(|| graph.hdt.triples.into_iter().count())); 42 | group.bench_function("0.2 all str triples", |b| b.iter(|| graph.hdt.triples().count())); 43 | group.bench_function("0.3 all Sophia triples", |b| b.iter(|| graph.triples().count())); 44 | group.finish(); 45 | let mut group = c.benchmark_group("S??"); 46 | //let mut group = c.benchmark_group("query"); 47 | group.bench_function("1.1 (vincent, ?, ?) triple IDs", |b| { 48 | b.iter(|| SubjectIter::with_pattern(triples, &TripleId::new(vincent_id, 0, 0)).count()) 49 | }); 50 | group.bench_function("1.2 (vincent, ?, ?) str triples", |b| b.iter(|| twp(Some(VINCENT), None, None).count())); 51 | group.bench_function("1.3 (vincent, ?, ?) 
Sophia triples", |b| { 52 | b.iter(|| graph.triples_matching(Some(&vincent_term), Any, Any).count()) 53 | }); 54 | group.finish(); 55 | 56 | let mut group = c.benchmark_group(format!("?P? {} triples", PredicateIter::new(triples, type_id).count())); 57 | group.sample_size(10); 58 | group.bench_function("2.1 (?, type, ?) triple IDs", |b| { 59 | b.iter(|| PredicateIter::new(triples, type_id).count()) 60 | }); 61 | group.bench_function("2.2 (?, type, ?) str triples", |b| b.iter(|| twp(None, Some(TYPE), None).count())); 62 | group.bench_function("2.3 (?, type, ?) Sophia triples", |b| { 63 | b.iter(|| graph.triples_matching(Any, Some(&type_term), Any).count()) 64 | }); 65 | group.finish(); 66 | let mut group = c.benchmark_group(format!("??O {} triples", ObjectIter::new(triples, person_id).count())); 67 | group.bench_function("3.1 (?, ?, person) triple IDs", |b| { 68 | b.iter(|| ObjectIter::new(triples, person_id).count()) 69 | }); 70 | group.bench_function("3.2 (?, ?, person) str triples", |b| b.iter(|| twp(None, None, Some(PERSON)).count())); 71 | group.bench_function("3.3 (?, ?, person) Sophia triples", |b| { 72 | b.iter(|| graph.triples_matching(Any, Any, Some(&person_term)).count()) 73 | }); 74 | group.finish(); 75 | let mut group = c 76 | .benchmark_group(format!("?PO {} triples", PredicateObjectIter::new(triples, type_id, person_id).count())); 77 | group.sample_size(10); 78 | group.bench_function("4.1 (?, type, person) triple IDs", |b| { 79 | b.iter(|| PredicateObjectIter::new(triples, type_id, person_id).count()) 80 | }); 81 | group.bench_function("4.2 (?, type, person) str subjects", |b| { 82 | b.iter(|| graph.hdt.subjects_with_po(TYPE, PERSON).count()) 83 | }); 84 | group.bench_function("4.3 (?, type, person) str triples", |b| { 85 | b.iter(|| twp(None, Some(TYPE), Some(PERSON)).count()) 86 | }); 87 | group.bench_function("4.4 (?, type, person) Sophia triples", |b| { 88 | b.iter(|| graph.triples_matching(Any, Some(&type_term), Some(&person_term)).count()) 89 | 
}); 90 | group.finish(); 91 | } 92 | 93 | criterion_group!(criterion, query); 94 | criterion_main!(criterion); 95 | -------------------------------------------------------------------------------- /benches/iai.rs: -------------------------------------------------------------------------------- 1 | use hdt::Hdt; 2 | use hdt::HdtGraph; 3 | use sophia::api::graph::Graph; 4 | use sophia::api::term::IriRef; 5 | use sophia::api::term::SimpleTerm; 6 | use sophia::api::term::matcher::Any; 7 | use std::fs::File; 8 | 9 | const TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; 10 | const PERSON: &str = "http://dbpedia.org/ontology/Person"; 11 | 12 | fn load() -> HdtGraph { 13 | let file = File::open("tests/resources/persondata_en_10k.hdt").expect("error opening file"); 14 | let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap(); 15 | HdtGraph::new(hdt) 16 | } 17 | 18 | // iai currently does not allow excluding loading time so that has to be subtracted 19 | 20 | fn query_all() { 21 | let hdt = load().hdt; 22 | hdt.triples_with_pattern(None, None, None).count(); 23 | } 24 | 25 | fn query_all_sophia() { 26 | let graph = load(); 27 | graph.triples_matching(Any, Any, Any).count(); 28 | } 29 | 30 | fn query_po() { 31 | let hdt = load().hdt; 32 | hdt.triples_with_pattern(None, Some(TYPE), Some(PERSON)).count(); 33 | } 34 | 35 | fn query_po_sophia() { 36 | let graph = load(); 37 | let type_term = SimpleTerm::Iri(IriRef::new_unchecked(TYPE.into())); 38 | let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into())); 39 | graph.triples_matching(Any, Some(&type_term), Some(&person_term)).count(); 40 | } 41 | 42 | fn query_o() { 43 | let hdt = load().hdt; 44 | hdt.triples_with_pattern(None, None, Some(PERSON)).count(); 45 | } 46 | 47 | fn query_o_sophia() { 48 | let graph = load(); 49 | let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into())); 50 | graph.triples_matching(Any, Any, Some(&person_term)).count(); 51 | } 52 | 53 | iai::main!(load, 
query_all, query_all_sophia, query_po, query_po_sophia, query_o, query_o_sophia); 54 | -------------------------------------------------------------------------------- /examples/query.rs: -------------------------------------------------------------------------------- 1 | use hdt::Hdt; 2 | 3 | fn main() -> Result<(), Box> { 4 | env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init(); 5 | let path = std::path::Path::new("tests/resources/snikmeta.hdt"); 6 | let file = std::fs::File::open(path)?; 7 | let meta_top = "http://www.snik.eu/ontology/meta/Top"; 8 | let rdfs_label = "http://www.w3.org/2000/01/rdf-schema#label"; 9 | #[allow(unused_mut)] 10 | let mut hdts = vec![Hdt::new(std::io::BufReader::new(file))?]; 11 | #[cfg(feature = "cache")] 12 | hdts.push(Hdt::new_from_path(path)?); 13 | for hdt in hdts { 14 | // SP? pattern 15 | let labels = hdt.triples_with_pattern(Some(meta_top), Some(rdfs_label), None); 16 | println!("{:?}", labels.collect::>()); 17 | } 18 | Ok(()) 19 | } 20 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 115 2 | short_array_element_width_threshold = 115 3 | use_field_init_shorthand = true 4 | use_small_heuristics = "Max" 5 | use_try_shorthand = true 6 | fn_params_layout = "Compressed" 7 | single_line_if_else_max_width = 115 8 | 9 | # *** only available in unstable rust *** 10 | #where_single_line = true 11 | #fn_single_line = true 12 | #group_imports = "One" 13 | #condense_wildcard_suffixes = true 14 | # *************************************** 15 | -------------------------------------------------------------------------------- /src/containers/adj_list.rs: -------------------------------------------------------------------------------- 1 | //! Adjacency list containing an integer sequence and a bitmap with rank and select support. 
2 | use crate::containers::Bitmap; 3 | use crate::containers::Sequence; 4 | use crate::triples::Id; 5 | use std::cmp::Ordering; 6 | 7 | /// Adjacency list including a compact integer sequence and a bitmap for efficient access of that sequence using rank and select queries. 8 | #[derive(Debug)] 9 | #[cfg_attr(feature = "cache", derive(serde::Deserialize, serde::Serialize))] 10 | pub struct AdjList { 11 | /// Compact integer sequence. 12 | pub sequence: Sequence, 13 | /// Helper structure for rank and select queries. 14 | pub bitmap: Bitmap, 15 | } 16 | 17 | impl AdjList { 18 | /// Adjacency list with the given sequence and bitmap. 19 | pub const fn new(sequence: Sequence, bitmap: Bitmap) -> Self { 20 | AdjList { sequence, bitmap } 21 | } 22 | 23 | /// Combined size in bytes of the sequence and the bitmap on the heap. 24 | pub fn size_in_bytes(&self) -> usize { 25 | self.sequence.size_in_bytes() + self.bitmap.size_in_bytes() 26 | } 27 | 28 | /// Whether the given position represents the last child of the parent node. 29 | pub fn at_last_sibling(&self, word_index: usize) -> bool { 30 | self.bitmap.at_last_sibling(word_index) 31 | } 32 | 33 | /// Get the ID at the given position. 34 | pub fn get_id(&self, word_index: usize) -> Id { 35 | self.sequence.get(word_index) as Id 36 | } 37 | 38 | /// Number of entries in both the integer sequence and the bitmap. 39 | pub const fn len(&self) -> usize { 40 | self.sequence.entries 41 | } 42 | 43 | /// Whether the list is empty, i.e. the integer sequence has no entries. 44 | pub const fn is_empty(&self) -> bool { 45 | self.sequence.entries == 0 46 | } 47 | 48 | /// Find the first position for the given ID, counting from 1. 49 | pub fn find(&self, x: Id) -> usize { 50 | if x == 0 { 51 | return 0; 52 | } 53 | // hdt counts from 1 54 | // rsdict has nonzero value for 0, is that correct? adjust for that. 55 | self.bitmap.select1(x - 1).unwrap() as usize + 1 56 | } 57 | 58 | /// Return the position of element within the given bounds. 
59 | /// # Arguments 60 | /// 61 | /// * `element` - a value that may or may not exist in the specified range of the list 62 | /// * `begin` - first index of the search range 63 | /// * `end` - end (exclusive) of the search range 64 | fn bin_search(&self, element: usize, begin: usize, end: usize) -> Option { 65 | let mut low = begin; 66 | let mut high = end; 67 | while low < high { 68 | let mid = usize::midpoint(low, high); 69 | match self.sequence.get(mid).cmp(&element) { 70 | Ordering::Less => low = mid + 1, 71 | Ordering::Greater => high = mid, 72 | Ordering::Equal => return Some(mid), 73 | } 74 | } 75 | None 76 | } 77 | 78 | /// Find position of element y in the list x. 79 | // See . 80 | pub fn search(&self, x: usize, y: usize) -> Option { 81 | self.bin_search(y, self.find(x), self.last(x) + 1) 82 | } 83 | 84 | /// Find the last position for the given ID, counting from 1. 85 | pub fn last(&self, x: Id) -> usize { 86 | self.find(x + 1) - 1 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/containers/bitmap.rs: -------------------------------------------------------------------------------- 1 | //! Bitmap with rank and select support read from an HDT file. 2 | use crate::containers::vbyte::read_vbyte; 3 | use bytesize::ByteSize; 4 | #[cfg(feature = "cache")] 5 | use serde::ser::SerializeStruct; 6 | use std::fmt; 7 | use std::io::BufRead; 8 | use std::mem::size_of; 9 | use sucds::Serializable; 10 | use sucds::bit_vectors::{Access, BitVector, Rank, Rank9Sel, Select}; 11 | 12 | /// Compact bitmap representation with rank and select support. 13 | #[derive(Clone)] 14 | pub struct Bitmap { 15 | /// should be private but is needed by containers/bitmap.rs, use methods provided by Bitmap 16 | pub dict: Rank9Sel, 17 | } 18 | 19 | /// The error type for the bitmap read function. 
20 | #[derive(thiserror::Error, Debug)] 21 | pub enum BitmapReadError { 22 | #[error("IO error")] 23 | Io(#[from] std::io::Error), 24 | #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")] 25 | InvalidCrc8Checksum(u8, u8), 26 | #[error("Invalid CRC32C checksum {0}, expected {1}")] 27 | InvalidCrc32Checksum(u32, u32), 28 | #[error("Failed to turn raw bytes into u64")] 29 | TryFromSliceError(#[from] std::array::TryFromSliceError), 30 | #[error("Read unsupported bitmap type {0} != 1")] 31 | UnsupportedBitmapType(u8), 32 | } 33 | 34 | #[cfg(feature = "cache")] 35 | impl serde::Serialize for Bitmap { 36 | fn serialize(&self, serializer: S) -> Result 37 | where 38 | S: serde::ser::Serializer, 39 | { 40 | let mut state: ::SerializeStruct = 41 | serializer.serialize_struct("Bitmap", 1)?; 42 | 43 | //bitmap_y 44 | let mut dict_buffer = Vec::new(); 45 | self.dict.serialize_into(&mut dict_buffer).map_err(serde::ser::Error::custom)?; 46 | state.serialize_field("dict", &dict_buffer)?; 47 | 48 | state.end() 49 | } 50 | } 51 | 52 | #[cfg(feature = "cache")] 53 | impl<'de> serde::Deserialize<'de> for Bitmap { 54 | fn deserialize(deserializer: D) -> Result 55 | where 56 | D: serde::de::Deserializer<'de>, 57 | { 58 | #[derive(serde::Deserialize)] 59 | struct BitmapData { 60 | dict: Vec, 61 | } 62 | 63 | let data = BitmapData::deserialize(deserializer)?; 64 | 65 | // Deserialize `sucds` structures 66 | let mut bitmap_reader = std::io::BufReader::new(&data.dict[..]); 67 | let rank9sel = Rank9Sel::deserialize_from(&mut bitmap_reader).map_err(serde::de::Error::custom)?; 68 | 69 | let bitmap = Bitmap { dict: rank9sel }; 70 | Ok(bitmap) 71 | } 72 | } 73 | 74 | impl fmt::Debug for Bitmap { 75 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 76 | write!(f, "{}", ByteSize(self.size_in_bytes() as u64)) 77 | } 78 | } 79 | 80 | impl Bitmap { 81 | /// Construct a bitmap from an existing bitmap in form of a vector, which doesn't have rank and select support. 
82 | pub fn new(data: Vec) -> Self { 83 | let mut v = BitVector::new(); 84 | for d in data { 85 | let _ = v.push_bits(d as usize, 64); 86 | } 87 | let dict = Rank9Sel::new(v).select1_hints(); 88 | Bitmap { dict } 89 | } 90 | 91 | /// Size in bytes on the heap. 92 | pub fn size_in_bytes(&self) -> usize { 93 | self.dict.size_in_bytes() 94 | } 95 | 96 | /// Number of bits in the bitmap. 97 | pub const fn len(&self) -> usize { 98 | self.dict.len() 99 | } 100 | 101 | /// Returns the position of the k-1-th one bit or None if there aren't that many. 102 | pub fn select1(&self, k: usize) -> Option { 103 | self.dict.select1(k) 104 | } 105 | 106 | /// Returns the number of one bits from the 0-th bit to the k-1-th bit. Panics if self.len() < k. 107 | pub fn rank(&self, k: usize) -> usize { 108 | self.dict.rank1(k).unwrap_or_else(|| panic!("Out of bounds position: {} >= {}", k, self.dict.len())) 109 | } 110 | 111 | /// Whether the node at the given position is the last child of its parent. Panics if the position is out of bounds. 112 | pub fn at_last_sibling(&self, word_index: usize) -> bool { 113 | self.dict.access(word_index).expect("word index out of bounds") 114 | } 115 | 116 | /// Read bitmap from a suitable point within HDT file data and verify checksums. 
117 | pub fn read(reader: &mut R) -> Result { 118 | use BitmapReadError::*; 119 | let mut history: Vec = Vec::with_capacity(5); 120 | 121 | // read the type 122 | let mut bitmap_type = [0u8]; 123 | reader.read_exact(&mut bitmap_type)?; 124 | history.extend_from_slice(&bitmap_type); 125 | if bitmap_type[0] != 1 { 126 | return Err(UnsupportedBitmapType(bitmap_type[0])); 127 | } 128 | 129 | // read the number of bits 130 | let (num_bits, bytes_read) = read_vbyte(reader)?; 131 | history.extend_from_slice(&bytes_read); 132 | 133 | // read section CRC8 134 | let mut crc_code = [0_u8]; 135 | reader.read_exact(&mut crc_code)?; 136 | let crc_code = crc_code[0]; 137 | 138 | // validate section CRC8 139 | let crc8 = crc::Crc::::new(&crc::CRC_8_SMBUS); 140 | let mut digest = crc8.digest(); 141 | digest.update(&history); 142 | let crc_calculated = digest.finalize(); 143 | if crc_calculated != crc_code { 144 | return Err(InvalidCrc8Checksum(crc_calculated, crc_code)); 145 | } 146 | 147 | // read all but the last word, last word is byte aligned; saturating_sub keeps an empty bitmap (num_bits == 0) from underflowing 148 | let full_byte_amount = (num_bits.saturating_sub(1) >> 6) * 8; 149 | let mut full_words = vec![0_u8; full_byte_amount]; 150 | // div_ceil is unstable 151 | let mut data: Vec = Vec::with_capacity(full_byte_amount / 8 + usize::from(full_byte_amount % 8 != 0)); 152 | reader.read_exact(&mut full_words)?; 153 | 154 | for word in full_words.chunks_exact(size_of::()) { 155 | data.push(u64::from_le_bytes(<[u8; 8]>::try_from(word)?)); 156 | } 157 | 158 | // initiate computation of CRC32 159 | let crc32 = crc::Crc::::new(&crc::CRC_32_ISCSI); 160 | let mut digest = crc32.digest(); 161 | digest.update(&full_words); 162 | 163 | let mut bits_read = 0; 164 | let mut last_value: u64 = 0; 165 | let last_word_bits = if num_bits == 0 { 0 } else { ((num_bits - 1) % 64) + 1 }; 166 | 167 | while bits_read < last_word_bits { 168 | let mut buffer = [0u8]; 169 | reader.read_exact(&mut buffer)?; 170 | digest.update(&buffer); 171 | last_value |= (buffer[0] as u64) << 
bits_read; 172 | bits_read += 8; 173 | } 174 | data.push(last_value); 175 | 176 | // read entry body CRC32 177 | let mut crc_code = [0_u8; 4]; 178 | reader.read_exact(&mut crc_code)?; 179 | let crc_code = u32::from_le_bytes(crc_code); 180 | 181 | // validate entry body CRC32 182 | let crc_calculated = digest.finalize(); 183 | if crc_calculated != crc_code { 184 | return Err(InvalidCrc32Checksum(crc_calculated, crc_code)); 185 | } 186 | 187 | Ok(Self::new(data)) 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/containers/control_info.rs: -------------------------------------------------------------------------------- 1 | use io::ErrorKind::UnexpectedEof; 2 | use std::collections::HashMap; 3 | use std::io::BufRead; 4 | use std::io::{self, Write}; 5 | use std::str; 6 | 7 | pub const TERMINATOR: [u8; 1] = [0]; 8 | const HDT_HEADER: &[u8] = b"$HDT"; 9 | 10 | /// Type of Control Information. 11 | #[allow(missing_docs)] 12 | #[repr(u8)] 13 | #[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] 14 | pub enum ControlType { 15 | #[default] 16 | Unknown = 0, 17 | Global = 1, 18 | Header = 2, 19 | Dictionary = 3, 20 | Triples = 4, 21 | Index = 5, 22 | } 23 | 24 | impl TryFrom for ControlType { 25 | type Error = ControlInfoReadErrorKind; 26 | 27 | fn try_from(original: u8) -> Result { 28 | match original { 29 | 0 => Ok(ControlType::Unknown), 30 | 1 => Ok(ControlType::Global), 31 | 2 => Ok(ControlType::Header), 32 | 3 => Ok(ControlType::Dictionary), 33 | 4 => Ok(ControlType::Triples), 34 | 5 => Ok(ControlType::Index), 35 | _ => Err(ControlInfoReadErrorKind::InvalidControlType(original)), 36 | } 37 | } 38 | } 39 | 40 | /// : "preamble that describes a chunk of information". 41 | #[derive(Debug, Clone, PartialEq, Default)] 42 | pub struct ControlInfo { 43 | /// Type of control information. 44 | pub control_type: ControlType, 45 | /// "URI identifier of the implementation of the following section." 
46 | pub format: String, 47 | /// Key-value entries, ASCII only. 48 | pub properties: HashMap, 49 | } 50 | 51 | /// The error type for the `read` method. 52 | #[derive(thiserror::Error, Debug)] 53 | #[error("failed to read HDT control info")] 54 | pub struct ControlInfoReadError(#[from] ControlInfoReadErrorKind); 55 | 56 | /// The kind of the ControlInfoReadError error. 57 | #[derive(thiserror::Error, Debug)] 58 | pub enum ControlInfoReadErrorKind { 59 | #[error("IO error")] 60 | Io(#[from] std::io::Error), 61 | #[error("chunk {0:?} does not equal the HDT cookie '$HDT'")] 62 | HdtCookie([u8; 4]), 63 | #[error("invalid separator while reading format")] 64 | InvalidSeparator, 65 | #[error("invalid CRC16-ANSI checksum")] 66 | InvalidChecksum, 67 | #[error("invalid UTF8")] 68 | Utf8(#[from] std::string::FromUtf8Error), 69 | #[error("invalid control type '{0}'")] 70 | InvalidControlType(u8), 71 | } 72 | 73 | impl ControlInfo { 74 | /// Read and verify control information. 75 | pub fn read(reader: &mut R) -> Result { 76 | Ok(Self::read_kind(reader)?) 77 | } 78 | 79 | // Helper function returning a ControlInfoReadErrorKind that is wrapped by Self::read. 80 | fn read_kind(reader: &mut R) -> Result { 81 | use ControlInfoReadErrorKind::*; 82 | //use std::io::Error; 83 | 84 | // Keep track of what we are reading for computing the CRC afterwards. 85 | let crc = crc::Crc::::new(&crc::CRC_16_ARC); 86 | let mut digest = crc.digest(); 87 | 88 | // 1. Read the HDT Cookie 89 | let mut hdt_cookie: [u8; 4] = [0; 4]; 90 | reader.read_exact(&mut hdt_cookie)?; 91 | if &hdt_cookie != b"$HDT" { 92 | return Err(HdtCookie(hdt_cookie)); 93 | } 94 | digest.update(&hdt_cookie); 95 | 96 | // 2. Read the Control Type 97 | let mut control_type: [u8; 1] = [0; 1]; 98 | reader.read_exact(&mut control_type)?; 99 | digest.update(&control_type); 100 | let control_type = ControlType::try_from(control_type[0])?; 101 | 102 | // 3. 
Read the Format 103 | let mut format = Vec::new(); 104 | reader.read_until(0x00, &mut format)?; 105 | digest.update(&format); 106 | if format.pop() != Some(0x00) { 107 | return Err(InvalidSeparator); 108 | } 109 | let format = String::from_utf8(format)?; 110 | 111 | // 4. Read the Properties 112 | let mut prop_str = Vec::new(); 113 | reader.read_until(0x00, &mut prop_str)?; 114 | digest.update(&prop_str); 115 | if prop_str.pop() != Some(0x00) { 116 | return Err(std::io::Error::new(UnexpectedEof, "reading the properties").into()); 117 | } 118 | let prop_str = String::from_utf8(prop_str)?; 119 | let mut properties = HashMap::new(); 120 | for item in prop_str.split(';') { 121 | if let Some(index) = item.find('=') { 122 | let (key, val) = item.split_at(index); 123 | properties.insert(String::from(key), String::from(&val[1..])); 124 | } 125 | } 126 | 127 | // 5. Read the CRC 128 | let mut crc_code = [0_u8; 2]; 129 | reader.read_exact(&mut crc_code)?; 130 | let crc_code: u16 = u16::from_le_bytes(crc_code); 131 | 132 | // 6. 
Check the CRC 133 | if digest.finalize() != crc_code { 134 | return Err(InvalidChecksum); 135 | } 136 | 137 | Ok(ControlInfo { control_type, format, properties }) 138 | } 139 | 140 | /// Save a ControlInfo object to file using crc 141 | pub fn save(&self, dest_writer: &mut impl Write) -> Result<(), Box> { 142 | let crc = crc::Crc::::new(&crc::CRC_16_ARC); 143 | let mut hasher = crc.digest(); 144 | dest_writer.write_all(HDT_HEADER)?; 145 | hasher.update(HDT_HEADER); 146 | 147 | // write type 148 | let type_: [u8; 1] = [self.control_type as u8]; 149 | dest_writer.write_all(&type_)?; 150 | hasher.update(&type_); 151 | 152 | // write format 153 | let format = self.format.as_bytes(); 154 | dest_writer.write_all(format)?; 155 | hasher.update(format); 156 | dest_writer.write_all(&TERMINATOR)?; 157 | hasher.update(&TERMINATOR); 158 | 159 | // write properties 160 | let mut properties_string = String::new(); 161 | for (key, value) in &self.properties { 162 | properties_string.push_str(key); 163 | properties_string.push('='); 164 | properties_string.push_str(value); 165 | properties_string.push(';'); 166 | } 167 | dest_writer.write_all(properties_string.as_bytes())?; 168 | hasher.update(properties_string.as_bytes()); 169 | dest_writer.write_all(&TERMINATOR)?; 170 | hasher.update(&TERMINATOR); 171 | 172 | let checksum = hasher.finalize(); 173 | dest_writer.write_all(&checksum.to_le_bytes())?; 174 | 175 | Ok(()) 176 | } 177 | 178 | /// Get property value for the given key, if available. 
179 | pub fn get(&self, key: &str) -> Option { 180 | self.properties.get(key).cloned() 181 | } 182 | } 183 | 184 | #[cfg(test)] 185 | mod tests { 186 | use super::*; 187 | use crate::tests::init; 188 | use std::io::BufReader; 189 | 190 | #[test] 191 | fn read_info() -> color_eyre::Result<()> { 192 | init(); 193 | let info = b"$HDT\x01\x00\x00\x76\x35"; 194 | let mut reader = BufReader::new(&info[..]); 195 | 196 | let info = ControlInfo::read(&mut reader)?; 197 | assert_eq!(info.control_type, ControlType::Global); 198 | assert_eq!(info.format, ""); 199 | assert!(info.properties.is_empty()); 200 | Ok(()) 201 | } 202 | 203 | #[test] 204 | fn write_info() -> color_eyre::Result<()> { 205 | init(); 206 | let control_type = ControlType::Global; 207 | let format = "".to_owned(); 208 | let mut properties = HashMap::::new(); 209 | properties.insert("Software".to_owned(), "hdt_rs".to_owned()); 210 | let info = ControlInfo { control_type, format, properties }; 211 | 212 | let mut buffer = Vec::new(); 213 | info.save(&mut buffer); 214 | 215 | let expected = b"$HDT\x01\x00Software=hdt_rs;\x00\x52\x22"; 216 | assert_eq!(buffer, expected); 217 | 218 | let mut reader = BufReader::new(&expected[..]); 219 | let info2 = ControlInfo::read(&mut reader)?; 220 | assert_eq!(info, info2); 221 | Ok(()) 222 | } 223 | } 224 | -------------------------------------------------------------------------------- /src/containers/mod.rs: -------------------------------------------------------------------------------- 1 | /// In-memory RDF representation. 2 | pub mod rdf; 3 | 4 | /// Variable length numbers. 
5 | pub mod vbyte; 6 | 7 | // byte containers 8 | mod adj_list; 9 | mod bitmap; 10 | mod sequence; 11 | 12 | // control info section reader 13 | mod control_info; 14 | 15 | pub use adj_list::AdjList; 16 | pub use bitmap::{Bitmap, BitmapReadError}; 17 | pub use control_info::{ControlInfo, ControlInfoReadError, ControlType}; 18 | pub use sequence::{Sequence, SequenceReadError}; 19 | -------------------------------------------------------------------------------- /src/containers/rdf.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | /// Represents an RDF triple. 4 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] 5 | pub struct Triple { 6 | /// Named IRI or blank node. 7 | pub subject: Id, 8 | /// IRI 9 | pub predicate: String, 10 | /// Named IRI, blank node or literal. 11 | pub object: Term, 12 | } 13 | 14 | impl Triple { 15 | /// Triple with the given subject, predicate and object. 16 | pub const fn new(subject: Id, predicate: String, object: Term) -> Self { 17 | Triple { subject, predicate, object } 18 | } 19 | } 20 | 21 | impl fmt::Debug for Triple { 22 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 23 | write!(f, "{:?} {:?} {:?} .", self.subject, self.predicate, self.object) 24 | } 25 | } 26 | 27 | /// RDF identifiers can either be Internationalized Resource Identifiers (IRIs) or blank node 28 | /// identifiers. The latter are random identifiers which should be unique to the graph they are 29 | /// contained in. 30 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] 31 | pub enum Id { 32 | /// IRI 33 | Named(String), 34 | /// Blank node 35 | Blank(String), 36 | } 37 | 38 | // There's a custom debug implementation to hide the enum variant tag when printing, 39 | // it saves some screen space that's not needed. 
40 | impl fmt::Debug for Id { 41 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 42 | match self { 43 | Id::Named(iri) => write!(f, "\"{iri}\""), 44 | Id::Blank(id) => write!(f, "\"{id}\""), 45 | } 46 | } 47 | } 48 | 49 | /// RDF Terms are either identifiers or literals. 50 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] 51 | pub enum Term { 52 | /// Named IRI or blank node. 53 | Id(Id), 54 | /// Literal value. 55 | Literal(Literal), 56 | } 57 | 58 | // There's a custom debug implementation to hide the enum variant tag when printing, 59 | // it saves some screen space that's not needed. 60 | impl fmt::Debug for Term { 61 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 62 | match self { 63 | Term::Id(id) => id.fmt(f), 64 | Term::Literal(lit) => lit.fmt(f), 65 | } 66 | } 67 | } 68 | 69 | /// RDF Literals always have a lexical 'form' as per 70 | /// [RDF 1.1 Concepts And Abstract Syntax](https://www.w3.org/TR/rdf11-concepts/#dfn-literal). 71 | /// 72 | /// They can optionally contain a datatype describing how the literal form maps to a literal value 73 | /// (The default type is: [xs:string](http://www.w3.org/2001/XMLSchema#string), but we do not store 74 | /// this). 75 | /// If the datatype is [rdf:langString](http://www.w3.org/1999/02/22-rdf-syntax-ns#langString), 76 | /// we can optionally supply a language tag ([BCP47](https://tools.ietf.org/html/bcp47)) such as 77 | /// `"nl"` or `"fr"`. 
78 | /// 79 | /// # Examples 80 | /// ``` 81 | /// // string 82 | /// use hdt::containers::rdf::Literal; 83 | /// let literal = Literal::new(String::from("hello")); 84 | /// assert_eq!("\"hello\"", format!("{:?}", literal)); 85 | /// ``` 86 | /// ``` 87 | /// // typed literal 88 | /// use hdt::containers::rdf::Literal; 89 | /// let type_iri = String::from("http://www.w3.org/2001/XMLSchema#integer"); 90 | /// let typed_literal = Literal::new_typed(String::from("42"), type_iri); 91 | /// assert_eq!("\"42\"^^http://www.w3.org/2001/XMLSchema#integer", format!("{:?}", typed_literal)); 92 | /// ``` 93 | /// ``` 94 | /// // language tagged string 95 | /// use hdt::containers::rdf::Literal; 96 | /// let lang_tag = String::from("nl"); 97 | /// let lang_string = Literal::new_lang(String::from("hallo wereld"), lang_tag); 98 | /// assert_eq!("\"hallo wereld\"@nl", format!("{:?}", lang_string)); 99 | /// ``` 100 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)] 101 | pub struct Literal { 102 | form: String, 103 | datatype: Option, 104 | lang: Option, 105 | } 106 | 107 | // There's a custom debug implementation to hide structure tags when printing, 108 | // it saves some screen space that's not needed. 109 | impl fmt::Debug for Literal { 110 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 111 | if let Some(lang) = &self.lang { 112 | write!(f, "\"{}\"@{lang}", self.form) 113 | } else if let Some(dtype) = &self.datatype { 114 | write!(f, "\"{}\"^^{dtype}", self.form) 115 | } else { 116 | write!(f, "\"{}\"", self.form) 117 | } 118 | } 119 | } 120 | 121 | impl Literal { 122 | /// Create a new literal with type [xs:string](http://www.w3.org/2001/XMLSchema#string) (which 123 | /// we do not store since it is the default type). 124 | pub const fn new(form: String) -> Self { 125 | Literal { form, datatype: None, lang: None } 126 | } 127 | 128 | /// Create a new literal with a given form and datatype. 
129 | pub const fn new_typed(form: String, datatype: String) -> Self { 130 | Literal { form, datatype: Some(datatype), lang: None } 131 | } 132 | 133 | /// Create a new literal with a given form and langauge. Automatically sets the type to 134 | /// [xs:langString](http://www.w3.org/2001/XMLSchema#langString) 135 | pub fn new_lang(form: String, lang: String) -> Self { 136 | let datatype = String::from("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString"); 137 | 138 | Literal { form, datatype: Some(datatype), lang: Some(lang) } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/containers/sequence.rs: -------------------------------------------------------------------------------- 1 | use super::vbyte::encode_vbyte; 2 | use crate::containers::vbyte::read_vbyte; 3 | use bytesize::ByteSize; 4 | #[cfg(feature = "cache")] 5 | use serde::{self, Deserialize, Serialize}; 6 | use std::fs::File; 7 | use std::io::{BufRead, BufWriter, Write}; 8 | use std::mem::size_of; 9 | use std::thread; 10 | use std::{error, fmt}; 11 | 12 | const USIZE_BITS: usize = usize::BITS as usize; 13 | 14 | /// Integer sequence with a given number of bits, which means numbers may be represented along byte boundaries. 15 | //#[derive(Clone)] 16 | #[cfg_attr(feature = "cache", derive(Deserialize, Serialize))] 17 | pub struct Sequence { 18 | /// Number of integers in the sequence. 19 | pub entries: usize, 20 | /// Number of bits that each integer uses. 21 | pub bits_per_entry: usize, 22 | /// Data in blocks. 23 | pub data: Vec, 24 | /// whether CRC check was successful 25 | #[cfg_attr(feature = "cache", serde(skip))] 26 | pub crc_handle: Option>, 27 | } 28 | 29 | /// The error type for the sequence read function. 
30 | #[derive(thiserror::Error, Debug)] 31 | pub enum SequenceReadError { 32 | #[error("IO error")] 33 | Io(#[from] std::io::Error), 34 | #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")] 35 | InvalidCrc8Checksum(u8, u8), 36 | #[error("Failed to turn raw bytes into usize")] 37 | TryFromSliceError(#[from] std::array::TryFromSliceError), 38 | #[error("invalid LogArray type {0} != 1")] 39 | InvalidLogArrayType(u8), 40 | #[error("entry size of {0} bit too large (>64 bit)")] 41 | EntrySizeTooLarge(usize), 42 | } 43 | 44 | impl fmt::Debug for Sequence { 45 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 46 | write!( 47 | f, 48 | "{} with {} entries, {} bits per entry", 49 | ByteSize(self.size_in_bytes() as u64), 50 | self.entries, 51 | self.bits_per_entry 52 | ) 53 | } 54 | } 55 | 56 | pub struct SequenceIter<'a> { 57 | sequence: &'a Sequence, 58 | i: usize, 59 | } 60 | 61 | impl Iterator for SequenceIter<'_> { 62 | type Item = usize; 63 | fn next(&mut self) -> Option { 64 | if self.i >= self.sequence.entries { 65 | return None; 66 | } 67 | let e = self.sequence.get(self.i); 68 | self.i += 1; 69 | Some(e) 70 | } 71 | } 72 | 73 | impl<'a> IntoIterator for &'a Sequence { 74 | type Item = usize; 75 | type IntoIter = SequenceIter<'a>; 76 | 77 | fn into_iter(self) -> Self::IntoIter { 78 | SequenceIter { sequence: self, i: 0 } 79 | } 80 | } 81 | 82 | impl Sequence { 83 | /// Get the integer at the given index, counting from 0. 
84 | pub fn get(&self, index: usize) -> usize { 85 | let scaled_index = index * self.bits_per_entry; 86 | let block_index = scaled_index / USIZE_BITS; 87 | let bit_index = scaled_index % USIZE_BITS; 88 | 89 | let mut result; 90 | 91 | let result_shift = USIZE_BITS - self.bits_per_entry; 92 | if bit_index + self.bits_per_entry <= USIZE_BITS { 93 | let block_shift = USIZE_BITS - bit_index - self.bits_per_entry; 94 | result = (self.data[block_index] << block_shift) >> result_shift; 95 | } else { 96 | let block_shift = (USIZE_BITS << 1) - bit_index - self.bits_per_entry; 97 | result = self.data[block_index] >> bit_index; 98 | result |= (self.data[block_index + 1] << block_shift) >> result_shift; 99 | } 100 | result 101 | } 102 | 103 | /// Size in bytes on the heap. 104 | pub fn size_in_bytes(&self) -> usize { 105 | (self.data.len() * USIZE_BITS) >> 3 106 | } 107 | 108 | /// Read sequence including metadata from HDT data. 109 | pub fn read(reader: &mut R) -> Result { 110 | use SequenceReadError::*; 111 | // read entry metadata 112 | // keep track of history for CRC8 113 | let mut history: Vec = Vec::new(); 114 | 115 | // read and validate type 116 | let mut buffer = [0_u8]; 117 | reader.read_exact(&mut buffer)?; 118 | history.extend_from_slice(&buffer); 119 | if buffer[0] != 1 { 120 | return Err(InvalidLogArrayType(buffer[0])); 121 | } 122 | 123 | // read number of bits per entry 124 | let mut buffer = [0_u8]; 125 | reader.read_exact(&mut buffer)?; 126 | history.extend_from_slice(&buffer); 127 | let bits_per_entry = buffer[0] as usize; 128 | if bits_per_entry > USIZE_BITS { 129 | return Err(EntrySizeTooLarge(bits_per_entry)); 130 | } 131 | 132 | // read number of entries 133 | let (entries, bytes_read) = read_vbyte(reader)?; 134 | history.extend_from_slice(&bytes_read); 135 | 136 | // read entry metadata CRC8 137 | let mut crc_code = [0_u8]; 138 | reader.read_exact(&mut crc_code)?; 139 | let crc_code = crc_code[0]; 140 | 141 | // validate entry metadata CRC8 142 | let 
crc8 = crc::Crc::::new(&crc::CRC_8_SMBUS); 143 | let mut digest = crc8.digest(); 144 | digest.update(&history); 145 | 146 | let crc_calculated = digest.finalize(); 147 | if crc_calculated != crc_code { 148 | return Err(InvalidCrc8Checksum(crc_calculated, crc_code)); 149 | } 150 | 151 | // read body data 152 | // read all but the last entry, since the last one is byte aligned 153 | let total_bits = bits_per_entry * entries; 154 | let full_byte_amount = (total_bits.div_ceil(USIZE_BITS).saturating_sub(1)) * size_of::(); 155 | let mut full_words = vec![0_u8; full_byte_amount]; 156 | reader.read_exact(&mut full_words)?; 157 | let mut data: Vec = Vec::with_capacity(full_byte_amount / 8 + 2); 158 | // read entry body 159 | 160 | // turn the raw bytes into usize values 161 | for word in full_words.chunks_exact(size_of::()) { 162 | data.push(usize::from_le_bytes(<[u8; size_of::()]>::try_from(word)?)); 163 | } 164 | 165 | // keep track of history for CRC32 166 | let mut history = full_words; 167 | 168 | // read the last few bits, byte aligned 169 | let mut bits_read = 0; 170 | let mut last_value: usize = 0; 171 | let last_entry_bits = if total_bits == 0 { 0 } else { ((total_bits - 1) % USIZE_BITS) + 1 }; 172 | 173 | while bits_read < last_entry_bits { 174 | let mut buffer = [0u8]; 175 | reader.read_exact(&mut buffer)?; 176 | history.extend_from_slice(&buffer); 177 | last_value |= (buffer[0] as usize) << bits_read; 178 | bits_read += size_of::(); 179 | } 180 | data.push(last_value); 181 | // read entry body CRC32 182 | let mut crc_code = [0_u8; 4]; 183 | reader.read_exact(&mut crc_code)?; 184 | let crc_handle = Some(thread::spawn(move || { 185 | let crc_code = u32::from_le_bytes(crc_code); 186 | 187 | // validate entry body CRC32 188 | let crc32 = crc::Crc::::new(&crc::CRC_32_ISCSI); 189 | let mut digest = crc32.digest(); 190 | digest.update(&history); 191 | digest.finalize() == crc_code 192 | })); 193 | 194 | Ok(Sequence { entries, bits_per_entry, data, crc_handle }) 195 | } 
196 | 197 | pub fn save(&self, dest_writer: &mut BufWriter) -> Result<(), Box> { 198 | let crc = crc::Crc::::new(&crc::CRC_8_SMBUS); 199 | let mut hasher = crc.digest(); 200 | // libhdt/src/sequence/LogSequence2.cpp::save() 201 | // Write offsets using variable-length encoding 202 | let seq_type: [u8; 1] = [1]; 203 | let _ = dest_writer.write(&seq_type)?; 204 | hasher.update(&seq_type); 205 | // Write numbits 206 | let bits_per_entry: [u8; 1] = [self.bits_per_entry.try_into().unwrap()]; 207 | let _ = dest_writer.write(&bits_per_entry)?; 208 | hasher.update(&bits_per_entry); 209 | // Write numentries 210 | let buf = &encode_vbyte(self.entries); 211 | let _ = dest_writer.write(buf)?; 212 | hasher.update(buf); 213 | let checksum = hasher.finalize(); 214 | let _ = dest_writer.write(&checksum.to_le_bytes())?; 215 | 216 | // Write data 217 | let crc = crc::Crc::::new(&crc::CRC_32_ISCSI); 218 | let mut hasher = crc.digest(); 219 | let offset_data = self.pack_bits(); 220 | let _ = dest_writer.write(&offset_data)?; 221 | hasher.update(&offset_data); 222 | let checksum = hasher.finalize(); 223 | let _ = dest_writer.write(&checksum.to_le_bytes())?; 224 | 225 | Ok(()) 226 | } 227 | 228 | fn pack_bits(&self) -> Vec { 229 | let mut output = Vec::new(); 230 | let mut current_byte = 0u8; 231 | let mut bit_offset = 0; 232 | 233 | for &value in &self.data { 234 | let mut val = value & ((1 << self.bits_per_entry) - 1); // mask to get only relevant bits 235 | let mut bits_left = self.bits_per_entry; 236 | 237 | while bits_left > 0 { 238 | let available = 8 - bit_offset; 239 | let to_write = bits_left.min(available); 240 | 241 | // Shift bits to align with current byte offset 242 | current_byte |= ((val & ((1 << to_write) - 1)) as u8) << bit_offset; 243 | 244 | bit_offset += to_write; 245 | val >>= to_write; 246 | bits_left -= to_write; 247 | 248 | if bit_offset == 8 { 249 | output.push(current_byte); 250 | current_byte = 0; 251 | bit_offset = 0; 252 | } 253 | } 254 | } 255 | 256 | // 
Push final byte if there's remaining bits 257 | if bit_offset > 0 { 258 | output.push(current_byte); 259 | } 260 | 261 | output 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /src/containers/vbyte.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::BufRead; 3 | 4 | const MAX_VBYTE_BYTES: usize = usize::BITS as usize / 7 + 1; 5 | 6 | /// little endian 7 | pub fn read_vbyte(reader: &mut R) -> io::Result<(usize, Vec)> { 8 | use io::Error; 9 | use io::ErrorKind::InvalidData; 10 | 11 | let mut n: u128 = 0; 12 | let mut shift = 0; 13 | let mut buffer = [0u8]; 14 | let mut bytes_read = Vec::new(); 15 | reader.read_exact(&mut buffer)?; 16 | bytes_read.extend_from_slice(&buffer); 17 | 18 | while (buffer[0] & 0x80) == 0 { 19 | if bytes_read.len() >= MAX_VBYTE_BYTES { 20 | return Err(Error::new(InvalidData, "Tried to read a VByte that does not fit into a usize")); 21 | } 22 | 23 | n |= ((buffer[0] & 127) as u128) << shift; 24 | reader.read_exact(&mut buffer)?; 25 | bytes_read.extend_from_slice(&buffer); 26 | // IMPORTANT: The original implementation has an off-by-one error here, hence we 27 | // have to copy the same off-by-one error in order to read the file format. 28 | // The correct implementation is supposed to shift by 8! Look at the commented out 29 | // tests at the bottom of the file for proof. 
30 | shift += 7; 31 | } 32 | 33 | n |= ((buffer[0] & 127) as u128) << shift; 34 | 35 | if let Ok(valid) = usize::try_from(n) { 36 | Ok((valid, bytes_read)) 37 | } else { 38 | Err(Error::new(InvalidData, "Tried to read a VByte that does not fit into a usize")) 39 | } 40 | } 41 | 42 | /// decode vbyte with offset 43 | pub const fn decode_vbyte_delta(data: &[u8], offset: usize) -> (usize, usize) { 44 | let mut n: usize = 0; 45 | let mut shift: usize = 0; 46 | let mut byte_amount = 0; 47 | 48 | while (data[offset + byte_amount] & 0x80) == 0 { 49 | n |= ((data[offset + byte_amount] & 127) as usize) << shift; 50 | byte_amount += 1; 51 | shift += 7; 52 | } 53 | 54 | n |= ((data[offset + byte_amount] & 127) as usize) << shift; 55 | byte_amount += 1; 56 | 57 | (n, byte_amount) 58 | } 59 | 60 | /// little endian 61 | pub fn encode_vbyte(n: usize) -> Vec { 62 | let mut bytes = Vec::new(); 63 | let mut n = n; 64 | 65 | while n > 127 { 66 | bytes.push((n & 127) as u8); 67 | // IMPORTANT: The original implementation has an off-by-one error here, hence we 68 | // have to copy the same off-by-one error in order to read the file format. 69 | // The correct implementation is supposed to shift by 8! Look at the commented out 70 | // tests at the bottom of the file for proof. 
71 | n >>= 7; 72 | } 73 | 74 | bytes.push((n | 0x80) as u8); 75 | bytes 76 | } 77 | 78 | #[cfg(test)] 79 | mod tests { 80 | use super::*; 81 | use crate::tests::init; 82 | use std::io::BufReader; 83 | 84 | #[test] 85 | fn test_encode_decode() { 86 | init(); 87 | let buffer = encode_vbyte(824); 88 | let mut reader = BufReader::new(&buffer[..]); 89 | if let Ok((number, bytes_read)) = read_vbyte(&mut reader) { 90 | assert_eq!(number, 824); 91 | assert_eq!(bytes_read, buffer); 92 | } else { 93 | panic!("Failed to read vbyte"); 94 | } 95 | } 96 | 97 | #[test] 98 | fn test_max_value() { 99 | init(); 100 | let buffer = encode_vbyte(usize::MAX); 101 | let mut reader = BufReader::new(&buffer[..]); 102 | if let Ok((number, bytes_read)) = read_vbyte(&mut reader) { 103 | assert_eq!(number, usize::MAX); 104 | assert_eq!(bytes_read, buffer); 105 | } else { 106 | panic!("Failed to read vbyte"); 107 | } 108 | } 109 | 110 | #[test] 111 | #[should_panic(expected = "Tried to read a VByte that does not fit into a usize")] 112 | fn test_decode_too_large() { 113 | init(); 114 | let mut buffer = encode_vbyte(usize::MAX); 115 | buffer[MAX_VBYTE_BYTES - 1] &= 0x7F; 116 | buffer.push(0x7F); 117 | let mut reader = BufReader::new(&buffer[..]); 118 | read_vbyte(&mut reader).unwrap(); 119 | } 120 | 121 | // These tests show the off-by-one bug in the current implementation, but 122 | // we need to keep the bug in order to read the current version of .hdt files. 
123 | // 124 | // #[test] 125 | // fn test_encode() { 126 | // assert_eq!(encode_vbyte(824), vec![0x38_u8, 0x83_u8]) 127 | // } 128 | // 129 | // #[test] 130 | // fn test_decode() { 131 | // // this represents 824 132 | // // 0011 1000 1000 0011 133 | // // 0x38 0x83 134 | // let buffer = b"\x38\x83"; 135 | // let mut reader = BufReader::new(&buffer[..]); 136 | // if let Ok((number, bytes_read)) = read_vbyte(&mut reader) { 137 | // assert_eq!(number, 824); 138 | // assert_eq!(bytes_read, vec![0x38_u8, 0x83_u8]); 139 | // } else { 140 | // panic!("Failed to read vbyte"); 141 | // } 142 | // } 143 | } 144 | -------------------------------------------------------------------------------- /src/dict_sect_pfc.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] // temporariy while we figure out what should be public in the end 2 | /// Dictionary section with plain front coding. 3 | /// See . 4 | use crate::containers::vbyte::{decode_vbyte_delta, encode_vbyte, read_vbyte}; 5 | use crate::containers::{Sequence, SequenceReadError}; 6 | use crate::triples::Id; 7 | use bytesize::ByteSize; 8 | use log::error; 9 | use std::cmp::{Ordering, min}; 10 | use std::error; 11 | use std::fmt; 12 | use std::fs::File; 13 | use std::io::{BufRead, BufWriter, Write}; 14 | use std::str; 15 | use std::sync::Arc; 16 | use std::thread::{JoinHandle, spawn}; 17 | use thiserror::Error; 18 | 19 | /// Dictionary section with plain front coding. 20 | //#[derive(Clone)] 21 | pub struct DictSectPFC { 22 | /// total number of strings stored 23 | pub num_strings: usize, 24 | /// the last block may have less than "block_size" strings 25 | pub block_size: usize, 26 | /// stores the starting position of each block 27 | pub sequence: Sequence, 28 | /// the substrings 29 | pub packed_data: Arc<[u8]>, 30 | } 31 | 32 | /// The error type for the DictSectPFC read function. 
33 | #[derive(thiserror::Error, Debug)] 34 | pub enum DictSectReadError { 35 | #[error("IO error")] 36 | Io(#[from] std::io::Error), 37 | #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")] 38 | InvalidCrc8Checksum(u8, u8), 39 | #[error("Implementation only supports plain front coded dictionary sections")] 40 | DictSectNotPfc, 41 | #[error("sequence read error")] 42 | SequenceReadError(#[from] SequenceReadError), 43 | } 44 | 45 | impl fmt::Debug for DictSectPFC { 46 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 47 | write!( 48 | f, 49 | "total size {}, {} strings, sequence {:?}, packed data {}", 50 | ByteSize(self.size_in_bytes() as u64), 51 | self.num_strings, 52 | self.sequence, 53 | ByteSize(self.packed_data.len() as u64) 54 | ) 55 | } 56 | } 57 | 58 | #[derive(Error, Debug)] 59 | pub enum ExtractError { 60 | #[error("index out of bounds: id {id} > dictionary section len {len}")] 61 | IdOutOfBounds { id: Id, len: usize }, 62 | #[error("read invalid UTF-8 sequence in {data:?}, recovered: '{recovered}'")] 63 | InvalidUtf8 { source: std::str::Utf8Error, data: Vec, recovered: String }, 64 | } 65 | 66 | impl DictSectPFC { 67 | /// size in bytes of the dictionary section 68 | pub fn size_in_bytes(&self) -> usize { 69 | self.sequence.size_in_bytes() + self.packed_data.len() 70 | } 71 | 72 | /* 73 | // TODO: fix this 74 | fn decode(string: String) -> String { 75 | let mut split: Vec = string.rsplit('"').map(String::from).collect(); 76 | 77 | if split.len() > 2 { 78 | split = split.into_iter().skip(1).collect(); 79 | split[0] = format!("\"{}\"", split[0]); 80 | split.into_iter().collect() 81 | } else { 82 | split[0].clone() 83 | } 84 | } 85 | */ 86 | 87 | fn index_str(&self, index: usize) -> &str { 88 | let position: usize = self.sequence.get(index); 89 | let length = self.strlen(position); 90 | str::from_utf8(&self.packed_data[position..position + length]).unwrap() 91 | } 92 | 93 | /// translated from Java 94 | /// 
https://github.com/rdfhdt/hdt-java/blob/master/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java 95 | /// 0 means not found 96 | pub fn string_to_id(&self, element: &str) -> Id { 97 | if self.num_strings == 0 { 98 | // shared dictionary may be empty 99 | return 0; 100 | } 101 | // binary search 102 | let mut low: usize = 0; 103 | let mut high = self.sequence.entries.saturating_sub(2); // should be -1 but only works with -2, investigate 104 | let max = high; 105 | let mut mid = high; 106 | while low <= high { 107 | mid = usize::midpoint(low, high); 108 | 109 | let cmp: Ordering = if mid > max { 110 | mid = max; 111 | break; 112 | } else { 113 | let text = self.index_str(mid); 114 | element.cmp(text) 115 | //println!("mid: {} text: {} cmp: {:?}", mid, text, cmp); 116 | }; 117 | match cmp { 118 | Ordering::Less => { 119 | if mid == 0 { 120 | return 0; 121 | } 122 | high = mid - 1; 123 | } 124 | Ordering::Greater => low = mid + 1, 125 | Ordering::Equal => return ((mid * self.block_size) + 1) as Id, 126 | } 127 | } 128 | if high < mid { 129 | mid = high; 130 | } 131 | let idblock = self.locate_in_block(mid, element); 132 | if idblock == 0 { 133 | return 0; 134 | } 135 | ((mid * self.block_size) + idblock + 1) as Id 136 | } 137 | 138 | fn longest_common_prefix(a: &[u8], b: &[u8]) -> usize { 139 | let len = min(a.len(), b.len()); 140 | let mut delta = 0; 141 | while delta < len && a[delta] == b[delta] { 142 | delta += 1; 143 | } 144 | delta 145 | } 146 | 147 | fn locate_in_block(&self, block: usize, element: &str) -> usize { 148 | if block >= self.sequence.entries { 149 | return 0; 150 | } 151 | let element = element.as_bytes(); 152 | let mut pos = self.sequence.get(block); 153 | let mut id_in_block = 0; 154 | let mut cshared = 0; 155 | 156 | // Read the first string in the block 157 | let slen = self.strlen(pos); 158 | let mut temp_string: Vec = self.packed_data[pos..pos + slen].to_vec(); 159 | pos += slen + 1; 160 | id_in_block 
+= 1; 161 | 162 | while (id_in_block < self.block_size) && (pos < self.packed_data.len()) { 163 | // Decode prefix 164 | let (delta, vbyte_bytes) = decode_vbyte_delta(&self.packed_data, pos); 165 | pos += vbyte_bytes; 166 | 167 | //Copy suffix 168 | let slen = self.strlen(pos); 169 | temp_string.truncate(delta); 170 | temp_string.extend_from_slice(&self.packed_data[pos..pos + slen]); 171 | if delta >= cshared { 172 | // Current delta value means that this string has a larger long common prefix than the previous one 173 | cshared += Self::longest_common_prefix(&temp_string[cshared..], &element[cshared..]); 174 | 175 | if (cshared == element.len()) && (temp_string.len() == element.len()) { 176 | break; 177 | } 178 | } else { 179 | // We have less common characters than before, this string is bigger that what we are looking for. 180 | // i.e. Not found. 181 | id_in_block = 0; 182 | break; 183 | } 184 | pos += slen + 1; 185 | id_in_block += 1; 186 | } 187 | 188 | if pos >= self.packed_data.len() || id_in_block == self.block_size { 189 | id_in_block = 0; 190 | } 191 | id_in_block 192 | } 193 | 194 | /// extract the string with the given ID from the dictionary 195 | pub fn extract(&self, id: Id) -> Result { 196 | if id as usize > self.num_strings { 197 | return Err(ExtractError::IdOutOfBounds { id, len: self.num_strings }); 198 | } 199 | let block_index = id.saturating_sub(1) as usize / self.block_size; 200 | let string_index = id.saturating_sub(1) as usize % self.block_size; 201 | let mut position = self.sequence.get(block_index); 202 | let mut slen = self.strlen(position); 203 | let mut string: Vec = self.packed_data[position..position + slen].to_vec(); 204 | //println!("block_index={} string_index={}, string={}", block_index, string_index, str::from_utf8(&string).unwrap()); 205 | // loop takes around nearly half the time of the function 206 | for _ in 0..string_index { 207 | position += slen + 1; 208 | let (delta, vbyte_bytes) = decode_vbyte_delta(&self.packed_data, 
position); 209 | position += vbyte_bytes; 210 | slen = self.strlen(position); 211 | string.truncate(delta); 212 | string.extend_from_slice(&self.packed_data[position..position + slen]); 213 | } 214 | // tried simdutf8::basic::from_utf8 but that didn't speed up extract that much 215 | match str::from_utf8(&string) { 216 | Ok(string) => Ok(String::from(string)), 217 | Err(e) => Err(ExtractError::InvalidUtf8 { 218 | source: e, 219 | data: string.clone(), 220 | recovered: String::from_utf8_lossy(&string).into_owned(), 221 | }), 222 | } 223 | } 224 | 225 | fn strlen(&self, offset: usize) -> usize { 226 | let length = self.packed_data.len(); 227 | let mut position = offset; 228 | 229 | while position < length && self.packed_data[position] != 0 { 230 | position += 1; 231 | } 232 | 233 | position - offset 234 | } 235 | 236 | /// deprecated: we should be able to remove this as it is public now 237 | pub const fn num_strings(&self) -> usize { 238 | self.num_strings 239 | } 240 | 241 | /// Returns an unverified dictionary section together with a handle to verify the checksum. 242 | pub fn read(reader: &mut R) -> Result<(Self, JoinHandle), DictSectReadError> { 243 | use DictSectReadError::*; 244 | let mut preamble = [0_u8]; 245 | reader.read_exact(&mut preamble)?; 246 | if preamble[0] != 2 { 247 | return Err(DictSectNotPfc); 248 | } 249 | 250 | // read section meta data 251 | let crc = crc::Crc::::new(&crc::CRC_8_SMBUS); 252 | let mut digest = crc.digest(); 253 | // The CRC includes the type of the block, inaccuracy in the spec, careful. 254 | digest.update(&[0x02]); 255 | // This was determined based on https://git.io/JthMG because the spec on this 256 | // https://www.rdfhdt.org/hdt-binary-format was inaccurate, it's 3 vbytes, not 2. 
257 | let (num_strings, bytes_read) = read_vbyte(reader)?; 258 | digest.update(&bytes_read); 259 | let (packed_length, bytes_read) = read_vbyte(reader)?; 260 | digest.update(&bytes_read); 261 | let (block_size, bytes_read) = read_vbyte(reader)?; 262 | digest.update(&bytes_read); 263 | 264 | // read section CRC8 265 | let mut crc_code = [0_u8]; 266 | reader.read_exact(&mut crc_code)?; 267 | let crc_code = crc_code[0]; 268 | 269 | let crc_calculated = digest.finalize(); 270 | if crc_calculated != crc_code { 271 | return Err(InvalidCrc8Checksum(crc_calculated, crc_code)); 272 | } 273 | 274 | // read sequence log array 275 | let sequence = Sequence::read(reader)?; 276 | 277 | // read packed data 278 | let mut packed_data = vec![0u8; packed_length]; 279 | reader.read_exact(&mut packed_data)?; 280 | let packed_data = Arc::<[u8]>::from(packed_data); 281 | 282 | // read packed data CRC32 283 | let mut crc_code = [0_u8; 4]; 284 | reader.read_exact(&mut crc_code)?; 285 | let cloned_data = Arc::clone(&packed_data); 286 | let crc_handle = spawn(move || { 287 | let crc = crc::Crc::::new(&crc::CRC_32_ISCSI); 288 | let mut digest = crc.digest(); 289 | digest.update(&cloned_data[..]); 290 | digest.finalize() == u32::from_le_bytes(crc_code) 291 | }); 292 | 293 | Ok((DictSectPFC { num_strings, block_size, sequence, packed_data }, crc_handle)) 294 | } 295 | 296 | /// counterpoint to the read method 297 | // TODO: use Write trait and add test 298 | pub fn save(&self, dest_writer: &mut BufWriter) -> Result<(), Box> { 299 | let crc = crc::Crc::::new(&crc::CRC_8_SMBUS); 300 | let mut hasher = crc.digest(); 301 | // libhdt/src/libdcs/CSD_PFC.cpp::save() 302 | // save type 303 | let seq_type: [u8; 1] = [2]; 304 | let _ = dest_writer.write(&seq_type)?; 305 | hasher.update(&seq_type); 306 | 307 | // // Save sizes 308 | let mut buf: Vec = vec![]; 309 | buf.extend_from_slice(&encode_vbyte(self.num_strings)); 310 | buf.extend_from_slice(&encode_vbyte(self.packed_data.len())); 311 | 
buf.extend_from_slice(&encode_vbyte(self.block_size)); 312 | let _ = dest_writer.write(&buf)?; 313 | hasher.update(&buf); 314 | let checksum = hasher.finalize(); 315 | let _ = dest_writer.write(&checksum.to_le_bytes())?; 316 | 317 | self.sequence.save(dest_writer)?; 318 | 319 | // Write packed data 320 | let crc = crc::Crc::::new(&crc::CRC_32_ISCSI); 321 | let mut hasher = crc.digest(); 322 | let _ = dest_writer.write(&self.packed_data)?; 323 | hasher.update(&self.packed_data); 324 | // println!("{}", String::from_utf8_lossy(&self.compressed_terms)); 325 | let checksum = hasher.finalize(); 326 | let _ = dest_writer.write(&checksum.to_le_bytes())?; 327 | 328 | Ok(()) 329 | } 330 | } 331 | 332 | #[cfg(test)] 333 | mod tests { 334 | use super::*; 335 | use crate::ControlInfo; 336 | use crate::header::Header; 337 | use crate::tests::init; 338 | use pretty_assertions::assert_eq; 339 | use std::fs::File; 340 | use std::io::BufReader; 341 | /* unused 342 | #[test] 343 | fn test_decode() { 344 | let s = String::from("^^\"123\""); 345 | let d = DictSectPFC::decode(s); 346 | assert_eq!(d, "\"123\"^^"); 347 | } 348 | */ 349 | #[test] 350 | fn read_section_read() -> color_eyre::Result<()> { 351 | init(); 352 | let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file"); 353 | let mut reader = BufReader::new(file); 354 | ControlInfo::read(&mut reader)?; 355 | Header::read(&mut reader)?; 356 | 357 | // read dictionary control information 358 | let dict_ci = ControlInfo::read(&mut reader)?; 359 | assert!( 360 | dict_ci.format == "", 361 | "invalid dictionary type: {:?}", 362 | dict_ci.format 363 | ); 364 | 365 | let (shared, _) = DictSectPFC::read(&mut reader)?; 366 | // the file contains IRIs that are used both as subject and object 23128 367 | assert_eq!(shared.num_strings, 43); 368 | assert_eq!(shared.packed_data.len(), 614); 369 | assert_eq!(shared.block_size, 16); 370 | for term in ["http://www.snik.eu/ontology/meta/Top", 
"http://www.snik.eu/ontology/meta/Function", "_:b1"] { 371 | let id = shared.string_to_id(term); 372 | let back = shared.extract(id)?; 373 | assert_eq!(term, back, "term does not translate back to itself {} -> {} -> {}", term, id, back); 374 | } 375 | let sequence = shared.sequence; 376 | let data_size = (sequence.bits_per_entry * sequence.entries).div_ceil(64); 377 | assert_eq!(sequence.data.len(), data_size); 378 | 379 | let (subjects, _) = DictSectPFC::read(&mut reader)?; 380 | assert_eq!(subjects.num_strings, 6); 381 | for term in [ 382 | "http://www.snik.eu/ontology/meta", "http://www.snik.eu/ontology/meta/feature", 383 | "http://www.snik.eu/ontology/meta/homonym", "http://www.snik.eu/ontology/meta/master", 384 | "http://www.snik.eu/ontology/meta/typicalFeature", 385 | ] { 386 | let id = subjects.string_to_id(term); 387 | let back = subjects.extract(id)?; 388 | assert_eq!(term, back, "term does not translate back to itself {} -> {} -> {}", term, id, back); 389 | } 390 | let sequence = subjects.sequence; 391 | let data_size = (sequence.bits_per_entry * sequence.entries).div_ceil(64); 392 | assert_eq!(sequence.data.len(), data_size); 393 | Ok(()) 394 | } 395 | } 396 | -------------------------------------------------------------------------------- /src/four_sect_dict.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] // temporariy while we figure out what should be public in the end 2 | use crate::ControlInfo; 3 | use crate::DictSectPFC; 4 | /// Four section dictionary. 5 | use crate::dict_sect_pfc::{DictSectReadError, ExtractError}; 6 | use crate::triples::Id; 7 | //use eyre::{Result, WrapErr, eyre}; 8 | use std::io; 9 | use std::io::{BufRead, Error, ErrorKind}; 10 | use std::thread::JoinHandle; 11 | use thiserror::Error; 12 | 13 | /// Position in an RDF triple. 14 | #[derive(Debug, Clone)] 15 | pub enum IdKind { 16 | /// IRI or blank node in the first position of a triple. 
17 | Subject, 18 | /// IRI in the second position of a triple. 19 | Predicate, 20 | /// IRI, blank node or literal in the third position of a triple. 21 | Object, 22 | } 23 | 24 | /// Four section dictionary with plain front coding. 25 | /// Dictionary with shared, subject, predicate and object sections. 26 | /// Types specified as . 27 | /// See . 28 | #[derive(Debug)] 29 | pub struct FourSectDict { 30 | /// The shared section contains URIs that occur both in subject and object position. Its IDs start at one. 31 | pub shared: DictSectPFC, 32 | /// URIs that only occur as subjects. Their IDs start at the last ID of the shared section + 1. 33 | pub subjects: DictSectPFC, 34 | /// The predicate section has its own separate numbering starting from 1. 35 | pub predicates: DictSectPFC, 36 | /// URIs and literals that only occur as objects . Their IDs start at the last ID of the shared section + 1. 37 | pub objects: DictSectPFC, 38 | } 39 | 40 | /// Designates one of the four sections. 41 | #[derive(Debug)] 42 | pub enum SectKind { 43 | /// section for terms that appear as both subject and object 44 | Shared, 45 | /// section for terms that only appear as subjects 46 | Subject, 47 | /// section for terms that only appear as predicates 48 | Predicate, 49 | /// sections for terms that only appear as objects 50 | Object, 51 | } 52 | 53 | /// Wraps an extraction error with additional information on which dictionary section it occurred in. 
#[derive(Error, Debug)]
#[error("four sect dict error id_to_string({id},IdKind::{id_kind:?}) in the {sect_kind:?} section, caused by {e}")]
pub struct DictError {
    #[source]
    e: ExtractError,
    id: Id,
    id_kind: &'static IdKind,
    sect_kind: SectKind,
}

/// Wraps an error from reading a dictionary section with the kind of section that failed.
#[derive(Error, Debug)]
#[error("four sect dict section error in the {sect_kind:?} section, caused by {e}")]
pub struct DictSectError {
    #[source]
    e: DictSectReadError,
    sect_kind: SectKind,
}

/// The error type for reading a whole four section dictionary.
#[derive(Error, Debug)]
#[error("error reading four section dictionary")]
pub enum DictReadError {
    ControlInfo(#[from] crate::containers::ControlInfoReadError),
    DictSect(#[from] DictSectError),
    Other(String),
}

impl FourSectDict {
    /// Get the string value of a given ID of a given type.
    /// String representation of URIs, literals and blank nodes is defined in <https://www.rdfhdt.org/hdt-binary-format>.
    ///
    /// Subject and object IDs up to the size of the shared section are resolved in the shared section,
    /// larger ones in the subject or object section after subtracting the size of the shared section.
    /// Predicate IDs are always resolved in the predicate section.
    pub fn id_to_string(&self, id: Id, id_kind: &'static IdKind) -> Result<String, DictError> {
        use SectKind::*;
        let shared_size = self.shared.num_strings() as Id;
        // ID relative to the start of the subject or object section
        let d = id.saturating_sub(shared_size);
        let (section, local_id, sect_kind) = match id_kind {
            IdKind::Predicate => (&self.predicates, id, Predicate),
            IdKind::Subject if id > shared_size => (&self.subjects, d, Subject),
            IdKind::Object if id > shared_size => (&self.objects, d, Object),
            IdKind::Subject | IdKind::Object => (&self.shared, id, Shared),
        };
        section.extract(local_id).map_err(|e| DictError { e, id, id_kind, sect_kind })
    }

    /// Get the ID of a string in the given triple position, 0 means not found.
    /// String representation of URIs, literals and blank nodes is defined in <https://www.rdfhdt.org/hdt-binary-format>.
pub fn string_to_id(&self, s: &str, id_kind: &IdKind) -> Id {
        let shared_size = self.shared.num_strings();
        // Subjects and objects are searched in the shared section first.
        // IDs of their own section are offset by the size of the shared section.
        let shared_or = |section: &DictSectPFC| -> Id {
            match self.shared.string_to_id(s) {
                0 => match section.string_to_id(s) {
                    0 => 0,
                    id => id + shared_size as Id,
                },
                id => id,
            }
        };
        match id_kind {
            IdKind::Subject => shared_or(&self.subjects),
            IdKind::Predicate => self.predicates.string_to_id(s),
            IdKind::Object => shared_or(&self.objects),
        }
    }

    /// read the whole dictionary section including control information
    ///
    /// The returned dictionary is not yet validated, the CRC checks of the four sections are still
    /// running and need to be joined with `UnvalidatedFourSectDict::validate`.
    pub fn read<R: BufRead>(reader: &mut R) -> Result<UnvalidatedFourSectDict, DictReadError> {
        use SectKind::*;
        let dict_ci = ControlInfo::read(reader)?;
        if dict_ci.format != "<http://purl.org/HDT/hdt#dictionaryFour>" {
            return Err(DictReadError::Other("Implementation only supports four section dictionaries".to_owned()));
        }
        // the sections are stored in the order shared, subjects, predicates, objects
        let (shared, shared_crc) =
            DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Shared })?;
        let (subjects, subjects_crc) =
            DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Subject })?;
        let (predicates, predicates_crc) =
            DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Predicate })?;
        let (objects, objects_crc) =
            DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Object })?;

        Ok(UnvalidatedFourSectDict {
            four_sect_dict: FourSectDict { shared, subjects, predicates, objects },
            crc_handles: [shared_crc, subjects_crc, predicates_crc, objects_crc],
        })
    }
    /*
    pub fn translate_all_ids(&self, triple_ids: &[TripleId]) -> Vec<(String, String, String)> {
        triple_ids
            .into_par_iter()
            .map(|id: &TripleId| {
                let subject = self.id_to_string(id.subject_id, IdKind::Subject).unwrap();
                let predicate = self.id_to_string(id.predicate_id, IdKind::Predicate).unwrap();
                let object = self.id_to_string(id.object_id, IdKind::Object).unwrap();
                (subject, predicate, object)
            })
            .collect()
    }
    */
    /// size in bytes of the in memory four section dictionary
    pub fn size_in_bytes(&self) -> usize {
        self.shared.size_in_bytes()
            + self.subjects.size_in_bytes()
            + self.predicates.size_in_bytes()
            + self.objects.size_in_bytes()
    }
}

/// A wrapper that prevents using a FourSectDict before its checksums have been validated
pub struct UnvalidatedFourSectDict {
    four_sect_dict: FourSectDict,
    // one handle per section in the order shared, subjects, predicates, objects
    crc_handles: [JoinHandle<bool>; 4],
}

impl UnvalidatedFourSectDict {
    /// Validates the checksums of all dictionary sections in parallel.
    /// Dict validation takes around 1200 ms on a single thread with an 1.5 GB HDT file on an i9-12900k.
    /// This function must NOT be called more than once.
    // TODO can this be simplified?
pub fn validate(self) -> io::Result<FourSectDict> {
        let names = ["shared", "subject", "predicate", "object"];
        for (name, handle) in names.iter().zip(self.crc_handles) {
            // a panicked checksum thread is reported as an error instead of panicking here as well
            let valid = handle
                .join()
                .map_err(|_| Error::other(format!("CRC thread of the {name} dictionary section panicked.")))?;
            if !valid {
                return Err(Error::new(
                    ErrorKind::InvalidData,
                    format!("CRC Error in {name} dictionary section."),
                ));
            }
        }
        Ok(self.four_sect_dict)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::header::Header;
    use crate::tests::init;
    use pretty_assertions::assert_eq;
    use std::fs::File;
    use std::io::BufReader;

    #[test]
    fn read_dict() -> color_eyre::Result<()> {
        init();
        let file = File::open("tests/resources/snikmeta.hdt")?;
        let mut reader = BufReader::new(file);
        ControlInfo::read(&mut reader)?;
        Header::read(&mut reader)?;

        let dict = FourSectDict::read(&mut reader)?.validate()?;
        assert_eq!(dict.shared.num_strings(), 43, "wrong number of strings in the shared section");
        assert_eq!(dict.subjects.num_strings(), 6, "wrong number of strings in the subject section");
        assert_eq!(dict.predicates.num_strings(), 23, "wrong number of strings in the predicates section");
        assert_eq!(dict.objects.num_strings(), 133, "wrong number of strings in the objects section");
        assert_eq!(dict.string_to_id("_:b1", &IdKind::Subject), 1);
        assert_eq!("http://www.snik.eu/ontology/meta/uses", dict.id_to_string(43, &IdKind::Subject)?);
        assert_eq!("http://www.snik.eu/ontology/meta/Chapter", dict.id_to_string(3, &IdKind::Subject)?);
        assert_eq!("http://www.snik.eu/ontology/meta/DataSetType", dict.id_to_string(5, &IdKind::Subject)?);
        // NOTE(review): the exclusive ranges below skip the last ID of each section — confirm whether that is intended
        for id in 1..dict.shared.num_strings() {
            let s = dict.id_to_string(id, &IdKind::Subject)?;
            let back = dict.string_to_id(&s, &IdKind::Subject);
            assert_eq!(id, back, "shared id {} -> subject {} -> id {}", id, s, back);

            let s = dict.id_to_string(id, &IdKind::Object)?;
            let back = dict.string_to_id(&s, &IdKind::Object);
            assert_eq!(id, back, "shared id {} -> object {} -> id {}", id, s, back);
        }
        for (sect, kind, name, offset) in [
            (&dict.subjects, &IdKind::Subject, "subject", dict.shared.num_strings()),
            (&dict.objects, &IdKind::Object, "object", dict.shared.num_strings()),
            (&dict.predicates, &IdKind::Predicate, "predicate", 0),
        ] {
            for id in offset + 1..offset + sect.num_strings() {
                let s = dict.id_to_string(id, kind)?;
                let back = dict.string_to_id(&s, kind);
                assert_eq!(id, back, "{} id {} -> {} {} -> id {}", name, id, name, s, back);
            }
        }
        Ok(())
    }
}

--------------------------------------------------------------------------------
/src/hdt.rs:
--------------------------------------------------------------------------------
use crate::FourSectDict;
use crate::containers::{ControlInfo, ControlInfoReadError};
use crate::four_sect_dict::{DictError, DictReadError, IdKind};
use crate::header::{Header, HeaderReadError};
use crate::triples::{
    ObjectIter, PredicateIter, PredicateObjectIter, SubjectIter, TripleId, TriplesBitmap, TriplesReadError,
};
use bytesize::ByteSize;
use log::{debug, error};
#[cfg(feature = "cache")]
use std::fs::File;
#[cfg(feature = "cache")]
use std::io::{Seek, SeekFrom, Write};
use std::iter;
use std::sync::Arc;

/// Result type of this module with [`Error`] as the error type.
pub type Result<T> = core::result::Result<T, Error>;

/// In-memory representation of an RDF graph loaded from an HDT file.
/// Allows queries by triple patterns.
#[derive(Debug)]
pub struct Hdt {
    //global_ci: ControlInfo,
    //header: Header,
    /// in-memory representation of dictionary
    pub dict: FourSectDict,
    /// in-memory representation of triples
    pub triples: TriplesBitmap,
}

/// Subject, predicate and object of a triple as shared strings.
type StringTriple = (Arc<str>, Arc<str>, Arc<str>);

/// The error type for the `translate_id` method.
#[derive(thiserror::Error, Debug)]
#[error("cannot translate triple ID {t:?} to string triple: {e}")]
pub struct TranslateError {
    #[source]
    e: DictError,
    t: TripleId,
}

/// The error type for the `new` method.
#[derive(thiserror::Error, Debug)]
#[error("failed to read HDT")]
pub enum Error {
    ControlInfo(#[from] ControlInfoReadError),
    Header(#[from] HeaderReadError),
    /// Failed to read HDT dictionary
    FourSectDict(#[from] DictReadError),
    Triples(#[from] TriplesReadError),
    DictionaryValidationErrorTodo(#[from] std::io::Error),
}

impl Hdt {
    /// Deprecated alias of [`Hdt::read`].
    #[deprecated(since = "0.4.0", note = "please use `read` instead")]
    pub fn new<R: std::io::BufRead>(reader: R) -> Result<Self> {
        Self::read(reader)
    }

    /// Creates an immutable HDT instance containing the dictionary and triples from the given reader.
    /// The reader must point to the beginning of the data of an HDT file as produced by hdt-cpp.
    /// FourSectionDictionary with DictionarySectionPlainFrontCoding and SPO order is the only supported implementation.
    /// The format is specified at <https://www.rdfhdt.org/hdt-binary-format/>, however there are some deviations.
    /// The initial HDT specification at <http://www.w3.org/Submission/2011/03/> is outdated and not supported.
65 | /// # Example 66 | /// ``` 67 | /// let file = std::fs::File::open("tests/resources/snikmeta.hdt").expect("error opening file"); 68 | /// let hdt = hdt::Hdt::new(std::io::BufReader::new(file)).unwrap(); 69 | /// ``` 70 | pub fn read(mut reader: R) -> Result { 71 | ControlInfo::read(&mut reader)?; 72 | Header::read(&mut reader)?; 73 | let unvalidated_dict = FourSectDict::read(&mut reader)?; 74 | let triples = TriplesBitmap::read_sect(&mut reader)?; 75 | let dict = unvalidated_dict.validate()?; 76 | let hdt = Hdt { dict, triples }; 77 | debug!("HDT size in memory {}, details:", ByteSize(hdt.size_in_bytes() as u64)); 78 | debug!("{hdt:#?}"); 79 | Ok(hdt) 80 | } 81 | 82 | /// Creates an immutable HDT instance containing the dictionary and triples from the Path. 83 | /// Will utilize a custom cached TriplesBitmap file if exists or create one if it does not exist. 84 | /// The file path must point to the beginning of the data of an HDT file as produced by hdt-cpp. 85 | /// FourSectionDictionary with DictionarySectionPlainFrontCoding and SPO order is the only supported implementation. 86 | /// The format is specified at , however there are some deviations. 87 | /// The initial HDT specification at is outdated and not supported. 
88 | /// # Example 89 | /// ``` 90 | /// let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap(); 91 | /// ``` 92 | #[cfg(feature = "cache")] 93 | pub fn new_from_path(f: &std::path::Path) -> Result { 94 | use log::warn; 95 | 96 | let source = File::open(f)?; 97 | let mut reader = std::io::BufReader::new(source); 98 | ControlInfo::read(&mut reader)?; 99 | Header::read(&mut reader)?; 100 | let unvalidated_dict = FourSectDict::read(&mut reader)?; 101 | let mut abs_path = std::fs::canonicalize(f)?; 102 | let _ = abs_path.pop(); 103 | let index_file_name = format!("{}.index.v1-rust-cache", f.file_name().unwrap().to_str().unwrap()); 104 | let index_file_path = abs_path.join(index_file_name); 105 | let triples = if index_file_path.exists() { 106 | let pos = reader.stream_position()?; 107 | match Self::load_with_cache(&mut reader, &index_file_path) { 108 | Ok(triples) => triples, 109 | Err(e) => { 110 | warn!("error loading cache, overwriting: {e}"); 111 | reader.seek(SeekFrom::Start(pos))?; 112 | Self::load_without_cache(&mut reader, &index_file_path)? 113 | } 114 | } 115 | } else { 116 | Self::load_without_cache(&mut reader, &index_file_path)? 
117 | }; 118 | 119 | let dict = unvalidated_dict.validate()?; 120 | let hdt = Hdt { dict, triples }; 121 | debug!("HDT size in memory {}, details:", ByteSize(hdt.size_in_bytes() as u64)); 122 | debug!("{hdt:#?}"); 123 | Ok(hdt) 124 | } 125 | 126 | #[cfg(feature = "cache")] 127 | fn load_without_cache( 128 | mut reader: R, index_file_path: &std::path::PathBuf, 129 | ) -> Result { 130 | use log::warn; 131 | 132 | debug!("no cache detected, generating index"); 133 | let triples = TriplesBitmap::read_sect(&mut reader)?; 134 | debug!("index generated, saving cache to {}", index_file_path.display()); 135 | if let Err(e) = Self::write_cache(index_file_path, &triples) { 136 | warn!("error trying to save cache to file: {e}"); 137 | } 138 | Ok(triples) 139 | } 140 | 141 | #[cfg(feature = "cache")] 142 | fn load_with_cache( 143 | mut reader: R, index_file_path: &std::path::PathBuf, 144 | ) -> core::result::Result> { 145 | // load cached index 146 | debug!("hdt file cache detected, loading from {}", index_file_path.display()); 147 | let index_source = File::open(index_file_path)?; 148 | let mut index_reader = std::io::BufReader::new(index_source); 149 | let triples_ci = ControlInfo::read(&mut reader)?; 150 | Ok(TriplesBitmap::load_cache(&mut index_reader, &triples_ci)?) 151 | } 152 | 153 | #[cfg(feature = "cache")] 154 | fn write_cache( 155 | index_file_path: &std::path::PathBuf, triples: &TriplesBitmap, 156 | ) -> core::result::Result<(), Box> { 157 | let new_index_file = File::create(index_file_path)?; 158 | let mut writer = std::io::BufWriter::new(new_index_file); 159 | bincode::serde::encode_into_std_write(&triples, &mut writer, bincode::config::standard())?; 160 | writer.flush()?; 161 | Ok(()) 162 | } 163 | 164 | /// Recursive size in bytes on the heap. 165 | pub fn size_in_bytes(&self) -> usize { 166 | self.dict.size_in_bytes() + self.triples.size_in_bytes() 167 | } 168 | 169 | /// An iterator visiting *all* triples as strings in order. 
170 | /// Using this method with a filter can be inefficient for large graphs, 171 | /// because the strings are stored in compressed form and must be decompressed and allocated. 172 | /// Whenever possible, use [`Hdt::triples_with_pattern`] instead. 173 | /// # Example 174 | /// ``` 175 | /// fn print_first_triple(hdt: hdt::Hdt) { 176 | /// println!("{:?}", hdt.triples().next().expect("no triple in the graph")); 177 | /// } 178 | /// ``` 179 | pub fn triples(&self) -> impl Iterator + '_ { 180 | let mut triple_cache = TripleCache::new(self); 181 | self.triples.into_iter().map(move |ids| triple_cache.translate(ids).unwrap()) 182 | } 183 | 184 | /// Get all subjects with the given property and object (?PO pattern). 185 | /// Use this over `triples_with_pattern(None,Some(p),Some(o))` if you don't need whole triples. 186 | /// # Example 187 | /// Who was born in Leipzig? 188 | /// ``` 189 | /// fn query(dbpedia: hdt::Hdt) { 190 | /// for person in dbpedia.subjects_with_po( 191 | /// "http://dbpedia.org/ontology/birthPlace", "http://dbpedia.org/resource/Leipzig") { 192 | /// println!("{person:?}"); 193 | /// } 194 | /// } 195 | /// ``` 196 | pub fn subjects_with_po(&self, p: &str, o: &str) -> Box + '_> { 197 | let pid = self.dict.string_to_id(p, &IdKind::Predicate); 198 | let oid = self.dict.string_to_id(o, &IdKind::Object); 199 | // predicate or object not in dictionary, iterator would interpret 0 as variable 200 | if pid == 0 || oid == 0 { 201 | return Box::new(iter::empty()); 202 | } 203 | // needed for extending the lifetime of the parameters into the iterator for error messages 204 | let p_owned = p.to_owned(); 205 | let o_owned = o.to_owned(); 206 | Box::new( 207 | PredicateObjectIter::new(&self.triples, pid, oid) 208 | .map(move |sid| self.dict.id_to_string(sid, &IdKind::Subject)) 209 | .filter_map(move |r| { 210 | r.map_err(|e| error!("Error on triple with property {p_owned} and object {o_owned}: {e}")).ok() 211 | }), 212 | ) 213 | } 214 | 215 | /// Get all 
triples that fit the given triple patterns, where `None` stands for a variable. 216 | /// For example, `triples_with_pattern(Some(s), Some(p), None)` answers an SP? pattern. 217 | /// # Example 218 | /// What is the capital of the United States of America? 219 | /// ``` 220 | /// fn query(dbpedia: hdt::Hdt) { 221 | /// println!("{:?}", dbpedia.triples_with_pattern( 222 | /// Some("http://dbpedia.org/resource/United_States"), Some("http://dbpedia.org/ontology/capital"), None) 223 | /// .next().expect("no capital found").2); 224 | /// } 225 | /// ``` 226 | pub fn triples_with_pattern<'a>( 227 | &'a self, sp: Option<&'a str>, pp: Option<&'a str>, op: Option<&'a str>, 228 | ) -> Box + 'a> { 229 | let xso: Option<(Arc, usize)> = 230 | sp.map(|s| (Arc::from(s), self.dict.string_to_id(s, &IdKind::Subject))); 231 | let xpo: Option<(Arc, usize)> = 232 | pp.map(|p| (Arc::from(p), self.dict.string_to_id(p, &IdKind::Predicate))); 233 | let xoo: Option<(Arc, usize)> = 234 | op.map(|o| (Arc::from(o), self.dict.string_to_id(o, &IdKind::Object))); 235 | if [&xso, &xpo, &xoo].into_iter().flatten().any(|x| x.1 == 0) { 236 | // at least one term does not exist in the graph 237 | return Box::new(iter::empty()); 238 | } 239 | // TODO: improve error handling 240 | let mut cache = TripleCache::new(self); 241 | match (xso, xpo, xoo) { 242 | (Some(s), Some(p), Some(o)) => { 243 | if SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, p.1, o.1)).next().is_some() { 244 | Box::new(iter::once((s.0, p.0, o.0))) 245 | } else { 246 | Box::new(iter::empty()) 247 | } 248 | } 249 | (Some(s), Some(p), None) => { 250 | Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, p.1, 0)).map(move |t| { 251 | ( 252 | s.0.clone(), 253 | p.0.clone(), 254 | Arc::from(self.dict.id_to_string(t.object_id, &IdKind::Object).unwrap()), 255 | ) 256 | })) 257 | } 258 | (Some(s), None, Some(o)) => { 259 | Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, 0, o.1)).map(move |t| 
{ 260 | ( 261 | s.0.clone(), 262 | Arc::from(self.dict.id_to_string(t.predicate_id, &IdKind::Predicate).unwrap()), 263 | o.0.clone(), 264 | ) 265 | })) 266 | } 267 | (Some(s), None, None) => { 268 | Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, 0, 0)).map(move |t| { 269 | ( 270 | s.0.clone(), 271 | cache.get_p_string(t.predicate_id).unwrap(), 272 | cache.get_o_string(t.object_id).unwrap(), 273 | ) 274 | })) 275 | } 276 | (None, Some(p), Some(o)) => { 277 | Box::new(PredicateObjectIter::new(&self.triples, p.1, o.1).map(move |sid| { 278 | (Arc::from(self.dict.id_to_string(sid, &IdKind::Subject).unwrap()), p.0.clone(), o.0.clone()) 279 | })) 280 | } 281 | (None, Some(p), None) => Box::new(PredicateIter::new(&self.triples, p.1).map(move |t| { 282 | (cache.get_s_string(t.subject_id).unwrap(), p.0.clone(), cache.get_o_string(t.object_id).unwrap()) 283 | })), 284 | (None, None, Some(o)) => Box::new(ObjectIter::new(&self.triples, o.1).map(move |t| { 285 | ( 286 | cache.get_s_string(t.subject_id).unwrap(), 287 | cache.get_p_string(t.predicate_id).unwrap(), 288 | o.0.clone(), 289 | ) 290 | })), 291 | (None, None, None) => Box::new(self.triples()), 292 | } 293 | } 294 | } 295 | 296 | /// A TripleCache stores the `Arc` of the last returned triple 297 | #[derive(Clone, Debug)] 298 | pub struct TripleCache<'a> { 299 | hdt: &'a super::Hdt, 300 | idx: [usize; 3], 301 | arc: [Option>; 3], 302 | } 303 | 304 | impl<'a> TripleCache<'a> { 305 | /// Build a new [`TripleCache`] for the given [`Hdt`] 306 | pub const fn new(hdt: &'a super::Hdt) -> Self { 307 | TripleCache { hdt, idx: [0; 3], arc: [None, None, None] } 308 | } 309 | 310 | /// Get the string representation of the subject `sid`. 311 | pub fn get_s_string(&mut self, sid: usize) -> core::result::Result, DictError> { 312 | self.get_x_string(sid, 0, &IdKind::Subject) 313 | } 314 | 315 | /// Get the string representation of the predicate `pid`. 
316 | pub fn get_p_string(&mut self, pid: usize) -> core::result::Result, DictError> { 317 | self.get_x_string(pid, 1, &IdKind::Predicate) 318 | } 319 | 320 | /// Get the string representation of the object `oid`. 321 | pub fn get_o_string(&mut self, oid: usize) -> core::result::Result, DictError> { 322 | self.get_x_string(oid, 2, &IdKind::Object) 323 | } 324 | 325 | /// Translate a triple of indexes into a triple of strings. 326 | pub fn translate(&mut self, t: TripleId) -> core::result::Result { 327 | Ok(( 328 | self.get_s_string(t.subject_id).map_err(|e| TranslateError { e, t })?, 329 | self.get_p_string(t.predicate_id).map_err(|e| TranslateError { e, t })?, 330 | self.get_o_string(t.object_id).map_err(|e| TranslateError { e, t })?, 331 | )) 332 | } 333 | 334 | fn get_x_string( 335 | &mut self, i: usize, pos: usize, kind: &'static IdKind, 336 | ) -> core::result::Result, DictError> { 337 | debug_assert!(i != 0); 338 | if self.idx[pos] == i { 339 | Ok(self.arc[pos].as_ref().unwrap().clone()) 340 | } else { 341 | let ret: Arc = self.hdt.dict.id_to_string(i, kind)?.into(); 342 | self.arc[pos] = Some(ret.clone()); 343 | self.idx[pos] = i; 344 | Ok(ret) 345 | } 346 | } 347 | } 348 | 349 | #[cfg(test)] 350 | mod tests { 351 | use super::*; 352 | use crate::tests::init; 353 | use pretty_assertions::{assert_eq, assert_ne}; 354 | use std::fs::File; 355 | 356 | #[test] 357 | fn triples() -> color_eyre::Result<()> { 358 | init(); 359 | let filename = "tests/resources/snikmeta.hdt"; 360 | let file = File::open(filename)?; 361 | let hdt = Hdt::new(std::io::BufReader::new(file))?; 362 | let triples = hdt.triples(); 363 | let v: Vec = triples.collect(); 364 | assert_eq!(v.len(), 328); 365 | assert_eq!(v, hdt.triples_with_pattern(None, None, None).collect::>(), "all triples not equal ???"); 366 | assert_ne!(0, hdt.dict.string_to_id("http://www.snik.eu/ontology/meta", &IdKind::Subject)); 367 | for uri in ["http://www.snik.eu/ontology/meta/Top", 
"http://www.snik.eu/ontology/meta", "doesnotexist"] { 368 | let filtered: Vec<_> = v.clone().into_iter().filter(|triple| triple.0.as_ref() == uri).collect(); 369 | let with_s: Vec<_> = hdt.triples_with_pattern(Some(uri), None, None).collect(); 370 | assert_eq!(filtered, with_s, "different results between triples() and triples_with_s() for {}", uri); 371 | } 372 | let s = "http://www.snik.eu/ontology/meta/Top"; 373 | let p = "http://www.w3.org/2000/01/rdf-schema#label"; 374 | let o = "\"top class\"@en"; 375 | let triple_vec = vec![(Arc::from(s), Arc::from(p), Arc::from(o))]; 376 | // triple patterns with 2-3 terms 377 | assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), Some(o)).collect::>(), "SPO"); 378 | assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), None).collect::>(), "SP?"); 379 | assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), None, Some(o)).collect::>(), "S?O"); 380 | assert_eq!(triple_vec, hdt.triples_with_pattern(None, Some(p), Some(o)).collect::>(), "?PO"); 381 | let et = "http://www.snik.eu/ontology/meta/EntityType"; 382 | let meta = "http://www.snik.eu/ontology/meta"; 383 | let subjects = ["ApplicationComponent", "Method", "RepresentationType", "SoftwareProduct"] 384 | .map(|s| meta.to_owned() + "/" + s) 385 | .to_vec(); 386 | assert_eq!( 387 | subjects, 388 | hdt.subjects_with_po("http://www.w3.org/2000/01/rdf-schema#subClassOf", et).collect::>() 389 | ); 390 | assert_eq!( 391 | 12, 392 | hdt.triples_with_pattern(None, Some("http://www.w3.org/2000/01/rdf-schema#subClassOf"), None).count() 393 | ); 394 | assert_eq!(20, hdt.triples_with_pattern(None, None, Some(et)).count()); 395 | let snikeu = "http://www.snik.eu"; 396 | let triple_vec = [ 397 | "http://purl.org/dc/terms/publisher", "http://purl.org/dc/terms/source", 398 | "http://xmlns.com/foaf/0.1/homepage", 399 | ] 400 | .into_iter() 401 | .map(|p| (Arc::from(meta), Arc::from(p), Arc::from(snikeu))) 402 | .collect::>(); 403 | assert_eq!( 404 | triple_vec, 405 
| hdt.triples_with_pattern(Some(meta), None, Some(snikeu)).collect::>(), 406 | "S?O multiple" 407 | ); 408 | let s = "http://www.snik.eu/ontology/meta/хобби-N-0"; 409 | let o = "\"ХОББИ\"@ru"; 410 | let triple_vec = vec![(Arc::from(s), Arc::from(p), Arc::from(o))]; 411 | assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), None).collect::>(),); 412 | Ok(()) 413 | } 414 | } 415 | -------------------------------------------------------------------------------- /src/hdt_graph.rs: -------------------------------------------------------------------------------- 1 | // //! *This module is available only if HDT is built with the `"sophia"` feature.* 2 | #[cfg(feature = "sophia")] 3 | use crate::four_sect_dict::IdKind; 4 | use crate::hdt::Hdt; 5 | use crate::triples::{Id, ObjectIter, PredicateIter, PredicateObjectIter, SubjectIter, TripleId}; 6 | use log::debug; 7 | use sophia::api::graph::Graph; 8 | use sophia::api::term::{BnodeId, IriRef, LanguageTag, Term, matcher::TermMatcher}; 9 | use std::convert::Infallible; 10 | use std::io::{self, Error, ErrorKind}; 11 | use std::iter; 12 | use std::sync::Arc; 13 | 14 | mod term; 15 | pub use term::HdtTerm; 16 | 17 | /// Adapter to use HDT as a Sophia graph. 18 | pub struct HdtGraph { 19 | /// Wrapped HDT instance 20 | pub hdt: Hdt, 21 | } 22 | 23 | /// HdtGraph does not support all of the Sophia TermMatcher functionality. 24 | enum HdtMatcher { 25 | Constant((HdtTerm, Id)), 26 | Other, 27 | } 28 | 29 | impl HdtGraph { 30 | /// Wrapper around Hdt. 31 | pub const fn new(hdt: Hdt) -> Self { 32 | HdtGraph { hdt } 33 | } 34 | /// Size in bytes on the heap. 
35 | pub fn size_in_bytes(&self) -> usize { 36 | self.hdt.size_in_bytes() 37 | } 38 | 39 | fn id_term(&self, id: Id, kind: &'static IdKind) -> HdtTerm { 40 | auto_term(&self.hdt.dict.id_to_string(id, kind).unwrap()).unwrap() 41 | // TODO: optimize by excluding cases depending on the id kind 42 | //IriRef::new_unchecked(MownStr::from(s)).into_term() 43 | } 44 | 45 | /// Transforms a Sophia TermMatcher to a constant HdtTerm and Id if possible. 46 | /// Returns none if it matches a constant term that cannot be found. 47 | fn unpack_matcher(&self, tm: &T, kind: &IdKind) -> Option { 48 | match tm.constant() { 49 | Some(t) => match HdtTerm::try_from(t.borrow_term()) { 50 | Some(t) => { 51 | let id = self.hdt.dict.string_to_id(&term_string(&t), kind); 52 | if id == 0 { 53 | return None; 54 | } 55 | Some(HdtMatcher::Constant((t, id))) 56 | } 57 | None => None, 58 | }, 59 | None => Some(HdtMatcher::Other), 60 | } 61 | } 62 | } 63 | 64 | /// Create the correct Sophia term for a given resource string. 65 | /// Slow, use the appropriate method if you know which type (Literal, URI, or blank node) the string has. 
66 | fn auto_term(s: &str) -> io::Result { 67 | match s.chars().next() { 68 | None => Err(Error::new(ErrorKind::InvalidData, "empty input")), 69 | Some('"') => match s.rfind('"') { 70 | None => Err(Error::new( 71 | ErrorKind::InvalidData, 72 | format!("missing right quotation mark in literal string {s}"), 73 | )), 74 | Some(index) => { 75 | let lex = Arc::from(&s[1..index]); 76 | let rest = &s[index + 1..]; 77 | // literal with no language tag and no datatype 78 | if rest.is_empty() { 79 | return Ok(HdtTerm::LiteralDatatype(lex, term::XSD_STRING.clone())); 80 | } 81 | // either language tag or datatype 82 | if let Some(tag_index) = rest.find('@') { 83 | let tag = LanguageTag::new_unchecked(Arc::from(&rest[tag_index + 1..])); 84 | return Ok(HdtTerm::LiteralLanguage(lex, tag)); 85 | } 86 | // datatype 87 | let mut dt_split = rest.split("^^"); 88 | dt_split.next(); // empty 89 | match dt_split.next() { 90 | Some(dt) => { 91 | let unquoted = &dt[1..dt.len() - 1]; 92 | let dt = IriRef::new_unchecked(Arc::from(unquoted)); 93 | Ok(HdtTerm::LiteralDatatype(lex, dt)) 94 | } 95 | None => Err(Error::new(ErrorKind::InvalidData, format!("empty datatype in {s}"))), 96 | } 97 | } 98 | }, 99 | Some('_') => Ok(HdtTerm::BlankNode(BnodeId::new_unchecked(Arc::from(&s[2..])))), 100 | _ => Ok(HdtTerm::Iri(IriRef::new_unchecked(Arc::from(s)))), 101 | } 102 | } 103 | 104 | // Convert a SimpleTerm into the HDT String format. 
105 | // Sophia doesn't include the _: prefix for blank node strings but HDT expects it 106 | // not needed for property terms, as they can't be blank nodes 107 | fn term_string(t: &HdtTerm) -> String { 108 | match t { 109 | HdtTerm::BlankNode(b) => "_:".to_owned() + b.as_str(), 110 | HdtTerm::Iri(i) => i.as_str().to_owned(), 111 | HdtTerm::LiteralLanguage(l, lang) => { 112 | format!("\"{l}\"@{}", lang.as_str()) 113 | } 114 | HdtTerm::LiteralDatatype(l, dt) => { 115 | let xsd_string: &str = "http://www.w3.org/2001/XMLSchema#string"; 116 | let dts = dt.as_str(); 117 | if dts == xsd_string { format!("\"{l}\"") } else { format!("\"{l}\"^^<{dts}>") } 118 | } 119 | } 120 | } 121 | 122 | impl Graph for HdtGraph { 123 | type Triple<'a> = [HdtTerm; 3]; 124 | type Error = Infallible; // infallible for now, figure out what to put here later 125 | 126 | /// # Example 127 | /// ``` 128 | /// use hdt::sophia::api::graph::Graph; 129 | /// fn print_first_triple(graph: hdt::HdtGraph) { 130 | /// println!("{:?}", graph.triples().next().expect("no triple in the graph")); 131 | /// } 132 | /// ``` 133 | fn triples(&self) -> impl Iterator, Self::Error>> { 134 | debug!("Iterating through ALL triples in the HDT Graph. This can be inefficient for large graphs."); 135 | self.hdt.triples().map(move |(s, p, o)| { 136 | Ok([auto_term(&s).unwrap(), HdtTerm::Iri(IriRef::new_unchecked(p)), auto_term(&o).unwrap()]) 137 | }) 138 | } 139 | 140 | /// Only supports constant and "any" matchers. 141 | /// Non-constant matchers are supposed to be "any" matchers. 142 | /// # Example 143 | /// Who was born in Leipzig? 
144 | /// ``` 145 | /// use hdt::{Hdt,HdtGraph}; 146 | /// use hdt::sophia::api::graph::Graph; 147 | /// use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any}; 148 | /// 149 | /// fn query(dbpedia: hdt::HdtGraph) { 150 | /// let birth_place = SimpleTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/birthPlace".into())); 151 | /// let leipzig = SimpleTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/resource/Leipzig".into())); 152 | /// let persons = dbpedia.triples_matching(Any, Some(birth_place), Some(leipzig)); 153 | /// } 154 | /// ``` 155 | fn triples_matching<'s, S, P, O>( 156 | &'s self, sm: S, pm: P, om: O, 157 | ) -> impl Iterator, Self::Error>> + 's 158 | where 159 | S: TermMatcher + 's, 160 | P: TermMatcher + 's, 161 | O: TermMatcher + 's, 162 | { 163 | use HdtMatcher::{Constant, Other}; 164 | let xso = match self.unpack_matcher(&sm, &IdKind::Subject) { 165 | None => return Box::new(iter::empty()) as Box>, 166 | Some(x) => x, 167 | }; 168 | let xpo = match self.unpack_matcher(&pm, &IdKind::Predicate) { 169 | None => return Box::new(iter::empty()), 170 | Some(x) => x, 171 | }; 172 | let xoo = match self.unpack_matcher(&om, &IdKind::Object) { 173 | None => return Box::new(iter::empty()), 174 | Some(x) => x, 175 | }; 176 | // TODO: improve error handling 177 | match (xso, xpo, xoo) { 178 | //if SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, p.1, o.1)).next().is_some() { // always true 179 | (Constant(s), Constant(p), Constant(o)) => Box::new(iter::once(Ok([s.0, p.0, o.0]))), 180 | (Constant(s), Constant(p), Other) => Box::new( 181 | SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, p.1, 0)) 182 | .map(|tid| { 183 | auto_term(&self.hdt.dict.id_to_string(tid.object_id, &IdKind::Object).unwrap()).unwrap() 184 | }) 185 | .filter(move |term| om.matches(term)) 186 | .map(move |term| Ok([s.0.clone(), p.0.clone(), term])), 187 | ), 188 | (Constant(s), Other, Constant(o)) => Box::new( 189 | 
SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, 0, o.1)) 190 | .map(|t| self.id_term(t.predicate_id, &IdKind::Predicate)) 191 | .filter(move |term| pm.matches(term)) 192 | .map(move |term| Ok([s.0.clone(), term, o.0.clone()])), 193 | ), 194 | (Constant(s), Other, Other) => Box::new( 195 | SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, 0, 0)) 196 | .map(move |t| { 197 | [ 198 | self.id_term(t.predicate_id, &IdKind::Predicate), 199 | self.id_term(t.object_id, &IdKind::Object), 200 | ] 201 | }) 202 | .filter(move |[pt, ot]| pm.matches(pt) && om.matches(ot)) 203 | .map(move |[pt, ot]| Ok([s.0.clone(), pt, ot])), 204 | ), 205 | (Other, Constant(p), Constant(o)) => Box::new( 206 | PredicateObjectIter::new(&self.hdt.triples, p.1, o.1) 207 | .map(|sid| self.id_term(sid, &IdKind::Subject)) 208 | .filter(move |term| sm.matches(term)) 209 | .map(move |term| Ok([term, p.0.clone(), o.0.clone()])), 210 | ), 211 | (Other, Constant(p), Other) => Box::new( 212 | PredicateIter::new(&self.hdt.triples, p.1) 213 | .map(move |t| { 214 | [self.id_term(t.subject_id, &IdKind::Subject), self.id_term(t.object_id, &IdKind::Object)] 215 | }) 216 | .filter(move |[st, ot]| sm.matches(st) && om.matches(ot)) 217 | .map(move |[st, ot]| Ok([st, p.0.clone(), ot])), 218 | ), 219 | (Other, Other, Constant(o)) => Box::new(ObjectIter::new(&self.hdt.triples, o.1).map(move |t| { 220 | Ok([ 221 | auto_term(&Arc::from(self.hdt.dict.id_to_string(t.subject_id, &IdKind::Subject).unwrap())) 222 | .unwrap(), 223 | self.id_term(t.predicate_id, &IdKind::Predicate), 224 | o.0.clone(), 225 | ]) 226 | })), 227 | (Other, Other, Other) => Box::new( 228 | self.hdt 229 | .triples() 230 | .map(move |(s, p, o)| { 231 | [auto_term(&s).unwrap(), HdtTerm::Iri(IriRef::new_unchecked(p)), auto_term(&o).unwrap()] 232 | }) 233 | .filter(move |[st, pt, ot]| sm.matches(st) && pm.matches(pt) && om.matches(ot)) 234 | .map(Result::Ok), 235 | ), 236 | } 237 | } 238 | } 239 | 240 | #[cfg(test)] 241 | mod 
tests { 242 | use super::*; 243 | use crate::tests::init; 244 | use sophia::api::prelude::Triple; 245 | use sophia::api::term::matcher::Any; 246 | use std::fs::File; 247 | 248 | #[test] 249 | fn test_graph() -> color_eyre::Result<()> { 250 | init(); 251 | let file = File::open("tests/resources/snikmeta.hdt")?; 252 | let hdt = Hdt::read(std::io::BufReader::new(file))?; 253 | let graph = HdtGraph::new(hdt); 254 | let triples: Vec> = graph.triples().collect(); 255 | assert_eq!(triples.len(), 328); 256 | let meta_top = "http://www.snik.eu/ontology/meta/Top"; 257 | assert!( 258 | graph 259 | .triples_matching( 260 | Some(HdtTerm::Iri(IriRef::new_unchecked(Arc::from("http://www.snik.eu/ontology/meta")))), 261 | Any, 262 | Any 263 | ) 264 | .next() 265 | .is_some() 266 | ); 267 | for uri in [meta_top, "http://www.snik.eu/ontology/meta", "doesnotexist"] { 268 | let term = HdtTerm::Iri(IriRef::new_unchecked(Arc::from(uri))); 269 | let filtered: Vec<_> = triples 270 | .iter() 271 | .map(|triple| triple.as_ref().unwrap()) 272 | .filter(|triple| triple.s().iri().is_some() && triple.s().iri().unwrap().to_string() == uri) 273 | .collect(); 274 | let with_s: Vec<_> = graph.triples_matching(Some(term), Any, Any).map(Result::unwrap).collect(); 275 | // Sophia strings can't be compared directly, use the Debug trait for string comparison that is more brittle and less elegant 276 | // could break in the future e.g. 
because of ordering 277 | let filtered_string = format!("{filtered:?}"); 278 | let with_s_string = format!("{with_s:?}"); 279 | assert_eq!( 280 | filtered_string, with_s_string, 281 | "different results between triples() and triples_with_s() for {uri}" 282 | ); 283 | } 284 | let s = HdtTerm::Iri(IriRef::new_unchecked(meta_top.into())); 285 | let label = HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/2000/01/rdf-schema#label".into())); 286 | let o = HdtTerm::LiteralLanguage("top class".into(), LanguageTag::new_unchecked("en".into())); 287 | assert!(graph.triples_matching(Any, Any, [o.borrow_term()]).next().is_some()); 288 | 289 | let tvec = vec![[s.clone(), label.clone(), o.clone()]]; 290 | assert_eq!( 291 | tvec, 292 | graph 293 | .triples_matching([s.borrow_term()], [label.borrow_term()], Any) 294 | .map(Result::unwrap) 295 | .collect::>() 296 | ); 297 | assert_eq!( 298 | tvec, 299 | graph 300 | .triples_matching([s.borrow_term()], Any, [o.borrow_term()]) 301 | .map(Result::unwrap) 302 | .collect::>() 303 | ); 304 | assert_eq!( 305 | tvec, 306 | graph 307 | .triples_matching(Any, [label.borrow_term()], [o.borrow_term()]) 308 | .map(Result::unwrap) 309 | .collect::>() 310 | ); 311 | assert_eq!(1, graph.triples_matching(Any, Any, ["22.10"]).count()); 312 | let date = HdtTerm::LiteralDatatype( 313 | "2022-10-20".into(), 314 | IriRef::new_unchecked("http://www.w3.org/2001/XMLSchema#date".into()), 315 | ); 316 | assert_eq!(1, graph.triples_matching(Any, Any, Some(&date)).count()); 317 | // *** matchers other than constant and Any ******************************************** 318 | let meta = HdtTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/meta".into())); 319 | let modified = HdtTerm::Iri(IriRef::new_unchecked("http://purl.org/dc/terms/modified".into())); 320 | // SPO 321 | assert_eq!(2, graph.triples_matching([&meta, &s], [&label, &modified], [&date, &o]).count()); 322 | // SP? 
323 | assert_eq!(3, graph.triples_matching([&meta, &s], [&label, &modified], Any).count()); 324 | // S?O 325 | assert_eq!(2, graph.triples_matching([&meta, &s], Any, [&date, &o]).count()); 326 | // S?? 327 | assert_eq!( 328 | graph.triples_matching([&meta, &s], Any, Any).count(), 329 | graph.triples_matching([&meta], Any, Any).count() + graph.triples_matching([&s], Any, Any).count(), 330 | ); 331 | // ?P? 332 | assert_eq!(2, graph.triples_matching(Any, Any, [&date, &o]).count()); 333 | // ?PO 334 | assert_eq!(2, graph.triples_matching(Any, [&label, &modified], [&date, &o]).count()); 335 | // ?P? 336 | assert_eq!( 337 | graph.triples_matching(Any, [&label, &modified], Any).count(), 338 | graph.triples_matching(Any, [&label], Any).count() 339 | + graph.triples_matching(Any, [&modified], Any).count() 340 | ); 341 | // test for errors involving blank nodes 342 | let blank = HdtTerm::BlankNode(BnodeId::new_unchecked("b1".into())); 343 | // blank node as input 344 | assert_eq!(3, graph.triples_matching(Some(&blank), Any, Any).count()); 345 | assert_eq!(1, graph.triples_matching(Any, Any, Some(&blank)).count()); 346 | // blank node as output 347 | let rdftype = 348 | HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/1999/02/22-rdf-syntax-ns#type".into())); 349 | let owlrestriction = 350 | HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/2002/07/owl#Restriction".into())); 351 | assert_eq!(1, graph.triples_matching(Any, Some(rdftype), Some(owlrestriction)).count()); 352 | // not in the original SNIK meta but added to cover more cases 353 | let s = HdtTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/meta/хобби-N-0".into())); 354 | let o = HdtTerm::LiteralLanguage("ХОББИ".into(), LanguageTag::new_unchecked("ru".into())); 355 | let tvec = vec![[s.clone(), label.clone(), o.clone()]]; 356 | assert_eq!( 357 | tvec, 358 | graph 359 | .triples_matching([s.borrow_term()], [label.borrow_term()], Any) 360 | .map(Result::unwrap) 361 | .collect::>() 362 | ); 363 | 
Ok(()) 364 | } 365 | } 366 | -------------------------------------------------------------------------------- /src/hdt_graph/term.rs: -------------------------------------------------------------------------------- 1 | //! I define [`HdtTerm`], an implementation of [`sophia::api::term::Term`]. 2 | use sophia::api::MownStr; 3 | use sophia::api::ns::{rdf, xsd}; 4 | use sophia::api::term::{BnodeId, LanguageTag, Term, TermKind}; 5 | use sophia::iri::IriRef; 6 | use std::sync::{Arc, LazyLock}; 7 | 8 | pub static XSD_STRING: LazyLock>> = 9 | LazyLock::new(|| xsd::string.iri().unwrap().map_unchecked(|m| Arc::from(m.as_ref()))); 10 | 11 | /// An implementation of [`sophia::api::term::Term`] for [`HdtGraph`](super::HdtGraph). 12 | #[derive(Clone, Debug)] 13 | pub enum HdtTerm { 14 | /// This HdtTerm is an IRI 15 | Iri(IriRef>), 16 | /// This HdtTerm is a blank node 17 | BlankNode(BnodeId>), 18 | /// This HdtTerm is a literal with a "standard" datatype 19 | LiteralDatatype(Arc, IriRef>), 20 | /// This HdtTerm is a language string literal 21 | LiteralLanguage(Arc, LanguageTag>), 22 | } 23 | 24 | impl HdtTerm { 25 | /// Convert t into an HdtTerm if it is a supported kind of term. 
26 | #[allow(clippy::needless_pass_by_value)] 27 | pub fn try_from(t: T) -> Option { 28 | match t.kind() { 29 | TermKind::Iri => t.iri().map(|iri| HdtTerm::Iri(iri.map_unchecked(mown2arc))), 30 | TermKind::BlankNode => t.bnode_id().map(|bnid| HdtTerm::BlankNode(bnid.map_unchecked(mown2arc))), 31 | TermKind::Literal => Some({ 32 | let lex = mown2arc(t.lexical_form().unwrap()); 33 | if let Some(tag) = t.language_tag() { 34 | let tag = tag.map_unchecked(mown2arc); 35 | HdtTerm::LiteralLanguage(lex, tag) 36 | } else { 37 | let dt = t.datatype().unwrap().map_unchecked(mown2arc); 38 | HdtTerm::LiteralDatatype(lex, dt) 39 | } 40 | }), 41 | _ => None, 42 | } 43 | } 44 | } 45 | 46 | impl Term for HdtTerm { 47 | type BorrowTerm<'x> 48 | = &'x Self 49 | where 50 | Self: 'x; 51 | 52 | fn kind(&self) -> TermKind { 53 | match self { 54 | HdtTerm::Iri(_) => TermKind::Iri, 55 | HdtTerm::BlankNode(_) => TermKind::BlankNode, 56 | HdtTerm::LiteralDatatype(..) | HdtTerm::LiteralLanguage(..) => TermKind::Literal, 57 | } 58 | } 59 | 60 | fn borrow_term(&self) -> Self::BorrowTerm<'_> { 61 | self 62 | } 63 | 64 | fn iri(&self) -> Option> { 65 | match self { 66 | HdtTerm::Iri(iri) => Some(iri.as_ref().map_unchecked(MownStr::from_ref)), 67 | _ => None, 68 | } 69 | } 70 | 71 | fn bnode_id(&self) -> Option> { 72 | match self { 73 | HdtTerm::BlankNode(bnid) => Some(bnid.as_ref().map_unchecked(MownStr::from_ref)), 74 | _ => None, 75 | } 76 | } 77 | 78 | fn lexical_form(&self) -> Option { 79 | match self { 80 | HdtTerm::LiteralDatatype(lex, _) | HdtTerm::LiteralLanguage(lex, _) => Some(lex.as_ref().into()), 81 | _ => None, 82 | } 83 | } 84 | 85 | fn datatype(&self) -> Option> { 86 | match self { 87 | HdtTerm::LiteralDatatype(_, datatype) => Some(datatype.as_ref().map_unchecked(MownStr::from_ref)), 88 | HdtTerm::LiteralLanguage(..) 
=> rdf::langString.iri(), 89 | _ => None, 90 | } 91 | } 92 | 93 | fn language_tag(&self) -> Option> { 94 | match self { 95 | HdtTerm::LiteralLanguage(_, tag) => Some(tag.as_ref().map_unchecked(MownStr::from_ref)), 96 | _ => None, 97 | } 98 | } 99 | } 100 | 101 | impl PartialEq for HdtTerm { 102 | fn eq(&self, other: &Self) -> bool { 103 | Term::eq(self, other) 104 | } 105 | } 106 | 107 | impl Eq for HdtTerm {} 108 | 109 | fn mown2arc(m: MownStr) -> Arc { 110 | Box::::from(m).into() 111 | } 112 | -------------------------------------------------------------------------------- /src/header.rs: -------------------------------------------------------------------------------- 1 | use crate::containers::ControlInfo; 2 | use crate::containers::rdf::{Id, Literal, Term, Triple}; 3 | use ntriple::parser::triple_line; 4 | use std::collections::BTreeSet; 5 | use std::io::BufRead; 6 | use std::str; 7 | 8 | /// Metadata about the dataset, see . 9 | #[derive(Debug, Clone)] 10 | pub struct Header { 11 | /// Header data format. Only "ntriples" is supported. 12 | pub format: String, 13 | /// The number of bytes of the header data. 14 | pub length: usize, 15 | /// Triples describing the dataset. 16 | pub body: BTreeSet, 17 | } 18 | 19 | /// The error type for the `read` method. 20 | #[derive(thiserror::Error, Debug)] 21 | #[error("failed to read HDT header")] 22 | pub enum HeaderReadError { 23 | #[error("{0}")] 24 | Other(String), 25 | Io(#[from] std::io::Error), 26 | ControlInfoError(#[from] crate::containers::ControlInfoReadError), 27 | #[error("invalid header format {0}, only 'ntriples' is supported")] 28 | InvalidHeaderFormat(String), 29 | } 30 | 31 | impl Header { 32 | /// Reader needs to be positioned directly after the global control information. 
33 | pub fn read(reader: &mut R) -> Result { 34 | use HeaderReadError::*; 35 | let header_ci = ControlInfo::read(reader)?; 36 | if header_ci.format != "ntriples" { 37 | return Err(InvalidHeaderFormat(header_ci.format)); 38 | } 39 | 40 | //let ls = header_ci.get("length").ok_or_else(|| "missing header length".to_owned().into())?; 41 | let ls = header_ci.get("length").unwrap(); 42 | let length = ls.parse::().unwrap(); 43 | //ls.parse::().map_err(|_| format!("invalid header length '{ls}'").into())?; 44 | 45 | let mut body_buffer: Vec = vec![0; length]; 46 | reader.read_exact(&mut body_buffer)?; 47 | let mut body = BTreeSet::new(); 48 | 49 | for line_slice in body_buffer.split(|b| b == &b'\n') { 50 | let line = str::from_utf8(line_slice).map_err(|_| Other("Header is not UTF-8".to_owned()))?; 51 | if let Ok(Some(triple)) = triple_line(line) { 52 | let subject = match triple.subject { 53 | ntriple::Subject::IriRef(iri) => Id::Named(iri), 54 | ntriple::Subject::BNode(id) => Id::Blank(id), 55 | }; 56 | 57 | let ntriple::Predicate::IriRef(predicate) = triple.predicate; 58 | 59 | let object = match triple.object { 60 | ntriple::Object::IriRef(iri) => Term::Id(Id::Named(iri)), 61 | ntriple::Object::BNode(id) => Term::Id(Id::Blank(id)), 62 | ntriple::Object::Lit(lit) => Term::Literal(match lit.data_type { 63 | ntriple::TypeLang::Lang(lan) => Literal::new_lang(lit.data, lan), 64 | ntriple::TypeLang::Type(data_type) => { 65 | if data_type == "http://www.w3.org/2001/XMLSchema#string" { 66 | Literal::new(lit.data) 67 | } else { 68 | Literal::new_typed(lit.data, data_type) 69 | } 70 | } 71 | }), 72 | }; 73 | 74 | body.insert(Triple::new(subject, predicate, object)); 75 | } 76 | } 77 | Ok(Header { format: header_ci.format, length, body }) 78 | } 79 | } 80 | 81 | #[cfg(test)] 82 | mod tests { 83 | use super::*; 84 | use crate::tests::init; 85 | use std::fs::File; 86 | use std::io::BufReader; 87 | 88 | #[test] 89 | fn read_header() -> color_eyre::Result<()> { 90 | init(); 91 | let 
file = File::open("tests/resources/yago_header.hdt")?; 92 | let mut reader = BufReader::new(file); 93 | ControlInfo::read(&mut reader)?; 94 | 95 | let header = Header::read(&mut reader)?; 96 | assert_eq!(header.format, "ntriples"); 97 | assert_eq!(header.length, 1891); 98 | assert_eq!(header.body.len(), 22); 99 | Ok(()) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! [![github]](https://github.com/konradhoeffner/hdt) [![crates-io]](https://crates.io/crates/hdt) [![docs-rs]](crate) 2 | //! 3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github 4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust 5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs 6 | //! 7 | //!
8 | //! 9 | //! HDT is a loading and triple pattern querying library for the [Header Dictionary Triples](https://www.rdfhdt.org/) compressed binary RDF format. 10 | //! 11 | //! Currently this library only supports loading and querying existing HDT files as created by [hdt-cpp](https://github.com/rdfhdt/hdt-cpp). 12 | //! For reference implementations of HDT in C++ and Java, which support conversion and serialization from and into HDT with different format options, 13 | //! and acknowledgement of all the original authors, please look at the [rdfhdt](https://github.com/rdfhdt) organisation. 14 | //! 15 | //! # Example of loading and querying an HDT file 16 | //! 17 | //! ```no_run 18 | //! use hdt::Hdt; 19 | //! // Load an hdt file 20 | //! let file = std::fs::File::open("example.hdt").expect("error opening file"); 21 | //! let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT"); 22 | //! // query 23 | //! let majors = hdt.triples_with_pattern(Some("http://dbpedia.org/resource/Leipzig"), Some("http://dbpedia.org/ontology/major"),None); 24 | //! println!("{:?}", majors.collect::<Vec<_>>()); 25 | //! ``` 26 | //! 27 | #![cfg_attr( 28 | feature = "cache", 29 | doc = r#" 30 | # Experimental Features 31 | The **cache** feature is experimental and may change or be removed in future releases.
32 | 33 | Creating and/or loading a HDT file leveraging a custom cache: 34 | 35 | ```no_run 36 | let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap(); 37 | ``` 38 | "# 39 | )] 40 | #![cfg_attr( 41 | feature = "sophia", 42 | doc = r#" 43 | # Additional Optional Features 44 | 45 | Using the **sophia** adapter: 46 | 47 | ``` 48 | use hdt::{Hdt,HdtGraph}; 49 | use hdt::sophia::api::graph::Graph; 50 | use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any}; 51 | 52 | fn query(hdt: Hdt) 53 | { 54 | let graph = HdtGraph::new(hdt); 55 | let s = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/resource/Leipzig".into())); 56 | let p = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/ontology/major".into())); 57 | let majors = graph.triples_matching(Some(s),Some(p),Any); 58 | } 59 | ``` 60 | "# 61 | )] 62 | // # Optional features 63 | // 64 | // The following features are available. 65 | // 66 | // - **`sophia`** *(enabled by default)* — Implements the Graph trait from the [Sophia](https://crates.io/crates/sophia) RDF toolkit. 67 | // This allows you to drastically reduce the RAM usage of an existing application based on Sophia that loads a large knowledge base but requires an input file in the HDT format. 
68 | #![warn(missing_docs)] 69 | #![warn(clippy::pedantic)] 70 | #![warn(clippy::cargo)] 71 | #![warn(clippy::str_to_string)] 72 | #![warn(clippy::print_stdout)] 73 | #![warn(clippy::print_stderr)] 74 | #![warn(clippy::missing_const_for_fn)] 75 | #![allow(clippy::unnecessary_cast)] 76 | #![allow(clippy::must_use_candidate)] 77 | #![allow(clippy::missing_errors_doc)] 78 | #![allow(clippy::missing_panics_doc)] 79 | #![allow(clippy::cast_lossless)] 80 | #![allow(clippy::cast_possible_truncation)] 81 | #![allow(clippy::wildcard_imports)] 82 | #![allow(clippy::module_name_repetitions)] 83 | #![allow(clippy::similar_names)] 84 | #![allow(clippy::doc_markdown)] 85 | #![allow(clippy::if_not_else)] 86 | #![allow(clippy::into_iter_without_iter)] 87 | #![allow(clippy::len_without_is_empty)] 88 | // multiple versions of syn crate in transitive dependencies 89 | #![allow(clippy::multiple_crate_versions)] 90 | /// Types for storing and reading data. 91 | pub mod containers; 92 | /// Types for representing dictionaries. 93 | pub mod dict_sect_pfc; 94 | /// Types for representing a four section dictionary 95 | pub mod four_sect_dict; 96 | /// Types for representing triple sections. 97 | pub mod hdt; 98 | #[cfg(feature = "sophia")] 99 | pub use sophia; 100 | #[cfg(feature = "sophia")] 101 | /// Adapter for the Sophia library. 102 | pub mod hdt_graph; 103 | /// Types for representing the header. 104 | pub mod header; 105 | /// Types for representing and querying triples. 
106 | pub mod triples; 107 | 108 | pub use crate::hdt::Hdt; 109 | use containers::{ControlInfo, ControlInfoReadError}; 110 | use dict_sect_pfc::DictSectPFC; 111 | use four_sect_dict::FourSectDict; 112 | pub use four_sect_dict::IdKind; 113 | #[cfg(feature = "sophia")] 114 | pub use hdt_graph::HdtGraph; 115 | 116 | #[cfg(test)] 117 | mod tests { 118 | use std::sync::Once; 119 | 120 | static INIT: Once = Once::new(); 121 | 122 | pub fn init() { 123 | INIT.call_once(|| { 124 | color_eyre::install().unwrap(); 125 | env_logger::init(); 126 | }); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/triples.rs: -------------------------------------------------------------------------------- 1 | use crate::containers::{AdjList, Bitmap, BitmapReadError, Sequence, SequenceReadError}; 2 | use crate::{ControlInfo, ControlInfoReadError}; 3 | use bytesize::ByteSize; 4 | use log::{debug, error}; 5 | use std::cmp::Ordering; 6 | use std::fmt; 7 | use std::io::BufRead; 8 | use sucds::{ 9 | Serializable, 10 | bit_vectors::{BitVector, Rank9Sel}, 11 | char_sequences::WaveletMatrix, 12 | int_vectors::CompactVector, 13 | }; 14 | 15 | mod subject_iter; 16 | pub use subject_iter::SubjectIter; 17 | mod predicate_iter; 18 | pub use predicate_iter::PredicateIter; 19 | mod predicate_object_iter; 20 | pub use predicate_object_iter::PredicateObjectIter; 21 | mod object_iter; 22 | pub use object_iter::ObjectIter; 23 | #[cfg(feature = "cache")] 24 | use serde::ser::SerializeStruct; 25 | 26 | /// Order of the triple sections. 27 | /// Only SPO is tested, others probably don't work correctly. 
28 | #[allow(missing_docs)] 29 | #[repr(u8)] 30 | #[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord)] 31 | #[cfg_attr(feature = "cache", derive(serde::Deserialize, serde::Serialize))] 32 | pub enum Order { 33 | #[default] 34 | Unknown = 0, 35 | SPO = 1, 36 | SOP = 2, 37 | PSO = 3, 38 | POS = 4, 39 | OSP = 5, 40 | OPS = 6, 41 | } 42 | 43 | impl TryFrom for Order { 44 | type Error = TriplesReadError; 45 | 46 | fn try_from(original: u32) -> Result { 47 | match original { 48 | 0 => Ok(Order::Unknown), 49 | 1 => Ok(Order::SPO), 50 | 2 => Ok(Order::SOP), 51 | 3 => Ok(Order::PSO), 52 | 4 => Ok(Order::POS), 53 | 5 => Ok(Order::OSP), 54 | 6 => Ok(Order::OPS), 55 | n => Err(TriplesReadError::UnrecognizedTriplesOrder(n)), 56 | } 57 | } 58 | } 59 | 60 | /// Inverse index from object id to positions in the object adjacency list. 61 | /// Used for logarithmic (?) time access instead of linear time sequential search. 62 | pub struct OpIndex { 63 | /// Compact integer vector of object positions. 64 | pub sequence: CompactVector, 65 | /// Bitmap with a one bit for every new object to allow finding the starting point for a given object id. 
66 | pub bitmap: Bitmap, 67 | } 68 | 69 | #[cfg(feature = "cache")] 70 | impl serde::Serialize for OpIndex { 71 | fn serialize(&self, serializer: S) -> Result 72 | where 73 | S: serde::ser::Serializer, 74 | { 75 | let mut state: ::SerializeStruct = 76 | serializer.serialize_struct("OpIndex", 2)?; 77 | 78 | // Serialize sequence using `sucds` 79 | let mut seq_buffer = Vec::new(); 80 | self.sequence.serialize_into(&mut seq_buffer).map_err(serde::ser::Error::custom)?; 81 | state.serialize_field("sequence", &seq_buffer)?; 82 | 83 | state.serialize_field("bitmap", &self.bitmap)?; 84 | 85 | state.end() 86 | } 87 | } 88 | 89 | #[cfg(feature = "cache")] 90 | impl<'de> serde::Deserialize<'de> for OpIndex { 91 | fn deserialize(deserializer: D) -> Result 92 | where 93 | D: serde::de::Deserializer<'de>, 94 | { 95 | #[derive(serde::Deserialize)] 96 | struct OpIndexData { 97 | sequence: Vec, 98 | bitmap: Bitmap, 99 | } 100 | 101 | let data = OpIndexData::deserialize(deserializer)?; 102 | 103 | // Deserialize `sucds` structures 104 | let mut seq_reader = std::io::BufReader::new(&data.sequence[..]); 105 | 106 | let v = CompactVector::deserialize_from(&mut seq_reader).map_err(serde::de::Error::custom)?; 107 | let index = OpIndex { sequence: v, bitmap: data.bitmap }; // Replace with proper reconstruction 108 | 109 | Ok(index) 110 | } 111 | } 112 | 113 | impl fmt::Debug for OpIndex { 114 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 115 | writeln!(f, "total size {} {{", ByteSize(self.size_in_bytes() as u64))?; 116 | writeln!( 117 | f, 118 | " sequence: {} with {} bits,", 119 | ByteSize(self.sequence.len() as u64 * self.sequence.width() as u64 / 8), 120 | self.sequence.width() 121 | )?; 122 | write!(f, " bitmap: {:#?}\n}}", self.bitmap) 123 | } 124 | } 125 | 126 | impl OpIndex { 127 | /// Size in bytes on the heap. 
128 | pub fn size_in_bytes(&self) -> usize { 129 | self.sequence.len() * self.sequence.width() / 8 + self.bitmap.size_in_bytes() 130 | } 131 | /// Find the first position in the OP index of the given object ID. 132 | pub fn find(&self, o: Id) -> usize { 133 | self.bitmap.select1(o - 1).unwrap() as usize 134 | } 135 | /// Find the last position in the object index of the given object ID. 136 | pub fn last(&self, o: Id) -> usize { 137 | match self.bitmap.select1(o) { 138 | Some(index) => index as usize - 1, 139 | None => self.bitmap.len() - 1, 140 | } 141 | } 142 | } 143 | 144 | /// `BitmapTriples` variant of the triples section. 145 | //#[derive(Clone)] 146 | pub struct TriplesBitmap { 147 | order: Order, 148 | /// bitmap to find positions in the wavelet matrix 149 | pub bitmap_y: Bitmap, 150 | /// adjacency list storing the object IDs 151 | pub adjlist_z: AdjList, 152 | /// Index for object-based access. Points to the predicate layer. 153 | pub op_index: OpIndex, 154 | /// wavelet matrix for predicate-based access 155 | pub wavelet_y: WaveletMatrix, 156 | } 157 | 158 | /// The error type for the triples bitmap read function. 
159 | #[derive(thiserror::Error, Debug)] 160 | pub enum TriplesReadError { 161 | #[error("failed to read control info")] 162 | ControlInfoReadError(#[from] ControlInfoReadError), 163 | #[error("bitmap read error")] 164 | BitmapReadError(#[from] BitmapReadError), 165 | #[error("sequence read error")] 166 | SequenceReadError(#[from] SequenceReadError), 167 | #[error("unspecified triples order")] 168 | UnspecifiedTriplesOrder, 169 | #[error("unknown triples order")] 170 | UnknownTriplesOrder, 171 | #[error("unrecognized triples order {0}")] 172 | UnrecognizedTriplesOrder(u32), 173 | #[error("unknown triples format {0}")] 174 | UnknownTriplesFormat(String), 175 | #[error("triple lists are not supported yet")] 176 | TriplesList, 177 | #[error("({0},{1},{2}) none of the components of a triple may be 0.")] 178 | TripleComponentZero(usize, usize, usize), 179 | #[error("unspecified external library error")] 180 | ExternalError(#[from] Box), 181 | #[error("cache decode error")] 182 | #[cfg(feature = "cache")] 183 | DecodeError(#[from] bincode::error::DecodeError), 184 | } 185 | 186 | impl fmt::Debug for TriplesBitmap { 187 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 188 | writeln!(f, "total size {}", ByteSize(self.size_in_bytes() as u64))?; 189 | writeln!(f, "adjlist_z {:#?}", self.adjlist_z)?; 190 | writeln!(f, "op_index {:#?}", self.op_index)?; 191 | write!(f, "wavelet_y {}", ByteSize(self.wavelet_y.size_in_bytes() as u64)) 192 | } 193 | } 194 | 195 | #[cfg(feature = "cache")] 196 | impl serde::Serialize for TriplesBitmap { 197 | fn serialize(&self, serializer: S) -> Result 198 | where 199 | S: serde::ser::Serializer, 200 | { 201 | let mut state: ::SerializeStruct = 202 | serializer.serialize_struct("TriplesBitmap", 5)?; 203 | 204 | // Extract the number of triples 205 | state.serialize_field("order", &self.order)?; 206 | 207 | //bitmap_y 208 | state.serialize_field("bitmap_y", &self.bitmap_y)?; 209 | 210 | // adjlist_z 211 | 
state.serialize_field("adjlist_z", &self.adjlist_z)?; 212 | 213 | // op_index 214 | state.serialize_field("op_index", &self.op_index)?; 215 | 216 | // wavelet_y 217 | let mut wavelet_y_buffer = Vec::new(); 218 | self.wavelet_y.serialize_into(&mut wavelet_y_buffer).map_err(serde::ser::Error::custom)?; 219 | state.serialize_field("wavelet_y", &wavelet_y_buffer)?; 220 | 221 | state.end() 222 | } 223 | } 224 | 225 | #[cfg(feature = "cache")] 226 | impl<'de> serde::Deserialize<'de> for TriplesBitmap { 227 | fn deserialize(deserializer: D) -> Result 228 | where 229 | D: serde::de::Deserializer<'de>, 230 | { 231 | #[derive(serde::Deserialize)] 232 | struct TriplesBitmapData { 233 | order: Order, 234 | pub bitmap_y: Bitmap, 235 | pub adjlist_z: AdjList, 236 | pub op_index: OpIndex, 237 | pub wavelet_y: Vec, 238 | } 239 | 240 | let data = TriplesBitmapData::deserialize(deserializer)?; 241 | 242 | // Deserialize `sucds` structures 243 | let mut bitmap_reader = std::io::BufReader::new(&data.wavelet_y[..]); 244 | let wavelet_y = 245 | WaveletMatrix::::deserialize_from(&mut bitmap_reader).map_err(serde::de::Error::custom)?; 246 | 247 | let bitmap = TriplesBitmap { 248 | order: data.order, 249 | bitmap_y: data.bitmap_y, 250 | adjlist_z: data.adjlist_z, 251 | op_index: data.op_index, 252 | wavelet_y, 253 | }; 254 | 255 | Ok(bitmap) 256 | } 257 | } 258 | 259 | impl TriplesBitmap { 260 | /// read the whole triple section including control information 261 | pub fn read_sect(reader: &mut R) -> Result { 262 | use TriplesReadError::*; 263 | let triples_ci = ControlInfo::read(reader)?; 264 | 265 | match &triples_ci.format[..] 
{ 266 | "" => TriplesBitmap::read(reader, &triples_ci), 267 | "" => Err(TriplesList), 268 | f => Err(UnknownTriplesFormat(f.to_owned())), 269 | } 270 | } 271 | 272 | /// load the cached HDT index file, only supports TriplesBitmap 273 | #[cfg(feature = "cache")] 274 | pub fn load_cache(reader: &mut R, info: &ControlInfo) -> Result { 275 | use TriplesReadError::*; 276 | match &info.format[..] { 277 | "" => TriplesBitmap::load(reader), 278 | "" => Err(TriplesList), 279 | f => Err(UnknownTriplesFormat(f.to_owned())), 280 | } 281 | } 282 | 283 | /// load the entire cached TriplesBitmap object 284 | #[cfg(feature = "cache")] 285 | pub fn load(reader: &mut R) -> Result { 286 | let triples: TriplesBitmap = bincode::serde::decode_from_std_read(reader, bincode::config::standard())?; 287 | Ok(triples) 288 | } 289 | 290 | /// Size in bytes on the heap. 291 | pub fn size_in_bytes(&self) -> usize { 292 | self.adjlist_z.size_in_bytes() + self.op_index.size_in_bytes() + self.wavelet_y.size_in_bytes() 293 | } 294 | 295 | /// Position in the wavelet index of the first predicate for the given subject ID. 296 | pub fn find_y(&self, subject_id: Id) -> usize { 297 | if subject_id == 0 { 298 | return 0; 299 | } 300 | self.bitmap_y.select1(subject_id - 1).unwrap() as usize + 1 301 | } 302 | 303 | /// Position in the wavelet index of the last predicate for the given subject ID. 304 | pub fn last_y(&self, subject_id: usize) -> usize { 305 | self.find_y(subject_id + 1) - 1 306 | } 307 | 308 | /// Binary search in the wavelet matrix. 
309 | fn bin_search_y(&self, element: usize, begin: usize, end: usize) -> Option { 310 | let mut low = begin; 311 | let mut high = end; 312 | 313 | while low < high { 314 | let mid = usize::midpoint(low, high); 315 | match self.wavelet_y.access(mid).unwrap().cmp(&element) { 316 | Ordering::Less => low = mid + 1, 317 | Ordering::Greater => high = mid, 318 | Ordering::Equal => return Some(mid), 319 | } 320 | } 321 | None 322 | } 323 | 324 | /// Search the wavelet matrix for the position of a given subject, predicate pair. 325 | pub fn search_y(&self, subject_id: usize, property_id: usize) -> Option { 326 | self.bin_search_y(property_id, self.find_y(subject_id), self.last_y(subject_id) + 1) 327 | } 328 | 329 | fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix { 330 | debug!("Building wavelet matrix..."); 331 | let mut builder = 332 | CompactVector::new(sequence.bits_per_entry).expect("Failed to create wavelet matrix builder"); 333 | // possible refactor of Sequence to use sucds CompactVector, then builder can be removed 334 | for x in &sequence { 335 | builder.push_int(x).unwrap(); 336 | } 337 | assert!(sequence.crc_handle.take().unwrap().join().unwrap(), "Wavelet source CRC check failed."); 338 | drop(sequence); 339 | let wavelet = WaveletMatrix::new(builder).expect("Error building the wavelet matrix. Aborting."); 340 | debug!("Built wavelet matrix with length {}", wavelet.len()); 341 | wavelet 342 | } 343 | 344 | /* 345 | /// Get the predicate ID for the given z index position. 
346 | fn get_p(bitmap_z: Bitmap, wavelet_y: WaveletMatrix, pos_z: usize) -> Id { 347 | let pos_y = bitmap_z.dict.rank(pos_z, true); 348 | wavelet_y.get(pos_y as usize) as Id 349 | } 350 | */ 351 | 352 | fn read(reader: &mut R, triples_ci: &ControlInfo) -> Result { 353 | use TriplesReadError::*; 354 | // read order 355 | //let order: Order = Order::try_from(triples_ci.get("order").unwrap().parse::()); 356 | let order: Order; 357 | if let Some(n) = triples_ci.get("order").and_then(|v| v.parse::().ok()) { 358 | order = Order::try_from(n)?; 359 | } else { 360 | return Err(UnspecifiedTriplesOrder); 361 | } 362 | 363 | // read bitmaps 364 | // TODO: note level in the error 365 | let bitmap_y = Bitmap::read(reader)?; //.wrap_err("Failed to read Y level bitmap")?; 366 | let bitmap_z = Bitmap::read(reader)?; //.wrap_err("Failed to read Z level bitmap")?; 367 | 368 | // read sequences 369 | let sequence_y = Sequence::read(reader)?; 370 | let wavelet_thread = std::thread::spawn(|| Self::build_wavelet(sequence_y)); 371 | let mut sequence_z = Sequence::read(reader)?; 372 | 373 | // construct adjacency lists 374 | // construct object-based index to traverse from the leaves and support ??O and ?PO queries 375 | debug!("Building OPS index..."); 376 | let entries = sequence_z.entries; 377 | // if it takes too long to calculate, can also pass in as parameter 378 | let max_object = sequence_z.into_iter().max().unwrap().to_owned(); 379 | // limited to < 2^32 objects 380 | let mut indicess = vec![Vec::::with_capacity(4); max_object]; 381 | 382 | // Count the indexes of appearance of each object 383 | // In https://github.com/rdfhdt/hdt-cpp/blob/develop/libhdt/src/triples/BitmapTriples.cpp 384 | // they count the number of appearances in a sequence instead, which saves memory 385 | // temporarily but they need to loop over it an additional time. 
386 | for pos_z in 0..entries { 387 | let object = sequence_z.get(pos_z); 388 | if object == 0 { 389 | error!("ERROR: There is a zero value in the Z level."); 390 | continue; 391 | } 392 | let pos_y = bitmap_z.rank(pos_z.to_owned()); 393 | indicess[object - 1].push(pos_y as u32); // hdt index counts from 1 but we count from 0 for simplicity 394 | } 395 | // reduce memory consumption of index by using adjacency list 396 | let mut bitmap_index_bitvector = BitVector::new(); 397 | let mut cv = CompactVector::with_capacity(entries, sucds::utils::needed_bits(entries)) 398 | .map_err(|e| e.into_boxed_dyn_error())?; 399 | let wavelet_y = wavelet_thread.join().unwrap(); 400 | /* 401 | let get_p = |pos_z: u32| { 402 | let pos_y = bitmap_z.dict.rank(pos_z.to_owned() as u64, true); 403 | wavelet_y.access(pos_y as usize).unwrap() as Id 404 | }; 405 | */ 406 | for mut indices in indicess { 407 | let mut first = true; 408 | // sort by predicate 409 | indices.sort_by_cached_key(|pos_y| wavelet_y.access(*pos_y as usize).unwrap()); 410 | for index in indices { 411 | bitmap_index_bitvector.push_bit(first); 412 | first = false; 413 | cv.push_int(index as usize).unwrap(); 414 | } 415 | } 416 | let bitmap_index = Bitmap { dict: Rank9Sel::new(bitmap_index_bitvector) }; 417 | let op_index = OpIndex { sequence: cv, bitmap: bitmap_index }; 418 | debug!("built OPS index"); 419 | assert!(sequence_z.crc_handle.take().unwrap().join().unwrap(), "sequence_z CRC check failed."); 420 | let adjlist_z = AdjList::new(sequence_z, bitmap_z); 421 | Ok(TriplesBitmap { order, bitmap_y, adjlist_z, op_index, wavelet_y }) 422 | } 423 | 424 | /// Transform the given IDs of the layers in triple section order to a triple ID. 425 | /// Warning: At the moment only SPO is properly supported anyways, in which case this is equivalent to `TripleId::new(x,y,z)`. 426 | /// Other orders may lead to undefined behaviour. 
427 | pub fn coord_to_triple(&self, x: Id, y: Id, z: Id) -> Result { 428 | use TriplesReadError::*; 429 | if x == 0 || y == 0 || z == 0 { 430 | return Err(TripleComponentZero(x, y, z)); 431 | } 432 | match self.order { 433 | Order::SPO => Ok(TripleId::new(x, y, z)), 434 | Order::SOP => Ok(TripleId::new(x, z, y)), 435 | Order::PSO => Ok(TripleId::new(y, x, z)), 436 | Order::POS => Ok(TripleId::new(y, z, x)), 437 | Order::OSP => Ok(TripleId::new(z, x, y)), 438 | Order::OPS => Ok(TripleId::new(z, y, x)), 439 | Order::Unknown => Err(UnknownTriplesOrder), 440 | } 441 | } 442 | } 443 | 444 | impl<'a> IntoIterator for &'a TriplesBitmap { 445 | type Item = TripleId; 446 | type IntoIter = SubjectIter<'a>; 447 | 448 | fn into_iter(self) -> Self::IntoIter { 449 | SubjectIter::new(self) 450 | } 451 | } 452 | 453 | /// Subject, predicate or object ID, starting at 1. 454 | /// 455 | /// Subjects and predicate share IDs, starting at 1, for common values. 456 | /// A value of 0 indicates either not found (as a return value) or all of them (in a triple pattern). 457 | /// In the official documentation, u32 is used, however here, usize is used. 458 | /// While u32 caps out at 4 billion, more is not supported by the format anyways so this can probably be changed to u32. 459 | pub type Id = usize; 460 | 461 | /// Type for a triple encoded as numeric IDs for subject, predicate and object, respectively. 462 | /// See . 463 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] 464 | pub struct TripleId { 465 | /// Index starting at 1 in the combined shared and subject section. 466 | pub subject_id: Id, 467 | /// Index starting at 1 in the predicate section. 468 | pub predicate_id: Id, 469 | /// Index starting at 1 in the combined shared and object section. 470 | pub object_id: Id, 471 | } 472 | 473 | impl TripleId { 474 | /// Create a new triple ID. 
475 | pub const fn new(subject_id: Id, predicate_id: Id, object_id: Id) -> Self { 476 | TripleId { subject_id, predicate_id, object_id } 477 | } 478 | } 479 | 480 | #[cfg(test)] 481 | mod tests { 482 | use super::*; 483 | use crate::header::Header; 484 | use crate::tests::init; 485 | use crate::{FourSectDict, IdKind}; 486 | use pretty_assertions::assert_eq; 487 | use std::fs::File; 488 | use std::io::BufReader; 489 | 490 | /// Iterator over all triples with a given ID in the specified position (subject, predicate or object). 491 | fn triples_with_id<'a>( 492 | t: &'a TriplesBitmap, id: usize, k: &IdKind, 493 | ) -> Box + 'a> { 494 | match k { 495 | IdKind::Subject => Box::new(SubjectIter::with_s(t, id)), 496 | IdKind::Predicate => Box::new(PredicateIter::new(t, id)), 497 | IdKind::Object => Box::new(ObjectIter::new(t, id)), 498 | } 499 | } 500 | 501 | #[test] 502 | fn read_triples() -> color_eyre::Result<()> { 503 | init(); 504 | let file = File::open("tests/resources/snikmeta.hdt")?; 505 | let mut reader = BufReader::new(file); 506 | ControlInfo::read(&mut reader)?; 507 | Header::read(&mut reader)?; 508 | let _dict = FourSectDict::read(&mut reader)?; 509 | let triples = TriplesBitmap::read_sect(&mut reader)?; 510 | let v: Vec = triples.into_iter().collect::>(); 511 | assert_eq!(v.len(), 328); 512 | assert_eq!(v[0].subject_id, 1); 513 | assert_eq!(v[2].subject_id, 1); 514 | assert_eq!(v[3].subject_id, 2); 515 | let num_subjects = 48; 516 | let num_predicates = 23; 517 | let num_objects = 175; 518 | let mut filtered: Vec; 519 | let kinds = [IdKind::Subject, IdKind::Predicate, IdKind::Object]; 520 | let lens = [num_subjects, num_predicates, num_objects]; 521 | let funs = [|t: TripleId| t.subject_id, |t: TripleId| t.predicate_id, |t: TripleId| t.object_id]; 522 | for j in 0..kinds.len() { 523 | for i in 1..=lens[j] { 524 | filtered = v.iter().filter(|tid| funs[j](**tid) == i).copied().collect(); 525 | filtered.sort_unstable(); 526 | let mut triples_with_id = 
triples_with_id(&triples, i, &kinds[j]).collect::>(); 527 | triples_with_id.sort_unstable(); 528 | assert_eq!(filtered, triples_with_id, "triples_with({},{:?})", i, kinds[j]); 529 | } 530 | } 531 | 532 | // SubjectIter 533 | assert_eq!(0, SubjectIter::empty(&triples).count()); 534 | // SPO 535 | assert_eq!( 536 | vec![TripleId::new(14, 14, 154)], 537 | SubjectIter::with_pattern(&triples, &TripleId::new(14, 14, 154)).collect::>() 538 | ); 539 | // SP 540 | assert_eq!( 541 | vec![TripleId::new(14, 14, 154)], 542 | SubjectIter::with_pattern(&triples, &TripleId::new(14, 14, 0)).collect::>() 543 | ); 544 | // S?? 545 | for i in 1..num_subjects { 546 | assert_eq!( 547 | SubjectIter::with_s(&triples, i).collect::>(), 548 | SubjectIter::with_pattern(&triples, &TripleId::new(i, 0, 0)).collect::>() 549 | ); 550 | } 551 | // ??? (all triples) 552 | assert_eq!(v, SubjectIter::with_pattern(&triples, &TripleId::new(0, 0, 0)).collect::>()); 553 | // SP? where S and P are in the graph, but not together 554 | assert_eq!(0, SubjectIter::with_pattern(&triples, &TripleId::new(12, 14, 154)).count()); 555 | Ok(()) 556 | } 557 | } 558 | -------------------------------------------------------------------------------- /src/triples/object_iter.rs: -------------------------------------------------------------------------------- 1 | use crate::triples::Id; 2 | use crate::triples::TripleId; 3 | use crate::triples::TriplesBitmap; 4 | use sucds::int_vectors::Access; 5 | 6 | // see "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012 7 | // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36 8 | // actually only an object iterator when SPO order is used 9 | // TODO test with other orders and fix if broken 10 | 11 | /// Iterator over all triples with a given object ID, answering an (?S,?P,O) query. 
12 | pub struct ObjectIter<'a> { 13 | triples: &'a TriplesBitmap, 14 | o: Id, 15 | pos_index: usize, 16 | max_index: usize, 17 | } 18 | 19 | impl<'a> ObjectIter<'a> { 20 | /// Create a new iterator over all triples with the given object ID. 21 | /// Panics if the object does not exist. 22 | pub fn new(triples: &'a TriplesBitmap, o: Id) -> Self { 23 | assert!(o != 0, "object 0 does not exist, cant iterate"); 24 | let pos_index = triples.op_index.find(o); 25 | let max_index = triples.op_index.last(o); 26 | //println!("ObjectIter o={} pos_index={} max_index={}", o, pos_index, max_index); 27 | ObjectIter { triples, o, pos_index, max_index } 28 | } 29 | } 30 | 31 | impl Iterator for ObjectIter<'_> { 32 | type Item = TripleId; 33 | fn next(&mut self) -> Option { 34 | if self.pos_index > self.max_index { 35 | return None; 36 | } 37 | let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap(); 38 | let y = self.triples.wavelet_y.access(pos_y).unwrap() as Id; 39 | let x = self.triples.bitmap_y.rank(pos_y) as Id + 1; 40 | self.pos_index += 1; 41 | Some(TripleId::new(x, y, self.o)) 42 | //Some(self.triples.coord_to_triple(x, y, self.o).unwrap()) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/triples/predicate_iter.rs: -------------------------------------------------------------------------------- 1 | use crate::triples::Id; 2 | use crate::triples::TripleId; 3 | use crate::triples::TriplesBitmap; 4 | 5 | /// Iterator over all triples with a given property ID, answering an (?S,P,?O) query. 6 | pub struct PredicateIter<'a> { 7 | triples: &'a TriplesBitmap, 8 | s: Id, 9 | p: Id, 10 | i: usize, 11 | os: usize, 12 | pos_z: usize, 13 | occs: usize, 14 | } 15 | 16 | impl<'a> PredicateIter<'a> { 17 | /// Create a new iterator over all triples with the given property ID. 18 | /// Panics if the object does not exist. 
19 | pub fn new(triples: &'a TriplesBitmap, p: Id) -> Self { 20 | assert!(p != 0, "object 0 does not exist, cant iterate"); 21 | let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p as usize).unwrap(); 22 | //println!("the predicate {} is used by {} subjects in the index", p, occs); 23 | PredicateIter { triples, p, i: 0, pos_z: 0, os: 0, s: 0, occs } 24 | } 25 | } 26 | 27 | impl Iterator for PredicateIter<'_> { 28 | type Item = TripleId; 29 | fn next(&mut self) -> Option { 30 | if self.i >= self.occs { 31 | return None; 32 | } 33 | if self.os == 0 { 34 | // Algorithm 1 findSubj from Martinez et al. 2012 ****** 35 | let pos_y = self.triples.wavelet_y.select(self.i, self.p as usize).unwrap(); 36 | self.s = self.triples.bitmap_y.rank(pos_y) as Id + 1; 37 | // ***************************************************** 38 | // SP can have multiple O 39 | self.pos_z = self.triples.adjlist_z.find(pos_y as Id); 40 | let pos_z_end = self.triples.adjlist_z.last(pos_y as Id); 41 | //println!("**** found predicate {} between {} and {} (inclusive)", self.p, self.pos_z, pos_z_end); 42 | self.os = pos_z_end - self.pos_z; 43 | } else { 44 | self.os -= 1; 45 | self.pos_z += 1; 46 | } 47 | 48 | let o = self.triples.adjlist_z.sequence.get(self.pos_z) as Id; 49 | if self.os == 0 { 50 | self.i += 1; 51 | } 52 | Some(self.triples.coord_to_triple(self.s, self.p, o).unwrap()) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/triples/predicate_object_iter.rs: -------------------------------------------------------------------------------- 1 | use crate::triples::Id; 2 | use crate::triples::TriplesBitmap; 3 | use std::cmp::Ordering; 4 | use sucds::int_vectors::Access; 5 | 6 | // see filterPredSubj in "Exchange and Consumption of Huge RDF Data" by Martinez et al. 
2012 7 | // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36 8 | 9 | /// Iterator over all subject IDs with a given predicate and object ID, answering an (?S,P,O) query. 10 | pub struct PredicateObjectIter<'a> { 11 | triples: &'a TriplesBitmap, 12 | pos_index: usize, 13 | max_index: usize, 14 | } 15 | 16 | impl<'a> PredicateObjectIter<'a> { 17 | /// Create a new iterator over all triples with the given predicate and object ID. 18 | /// Panics if the predicate or object ID is 0. 19 | pub fn new(triples: &'a TriplesBitmap, p: Id, o: Id) -> Self { 20 | assert_ne!(0, p, "predicate 0 does not exist, cant iterate"); 21 | assert_ne!(0, o, "object 0 does not exist, cant iterate"); 22 | let mut low = triples.op_index.find(o); 23 | let mut high = triples.op_index.last(o); 24 | let get_y = |pos_index| { 25 | let pos_y = triples.op_index.sequence.access(pos_index).unwrap(); 26 | triples.wavelet_y.access(pos_y).unwrap() as Id 27 | }; 28 | // Binary search with a twist: 29 | // Each value may occur multiple times, so we search for the left and right borders. 
30 | while low <= high { 31 | let mut mid = usize::midpoint(low, high); 32 | match get_y(mid).cmp(&p) { 33 | Ordering::Less => low = mid + 1, 34 | Ordering::Greater => high = mid, 35 | Ordering::Equal => { 36 | let mut left_high = mid; 37 | while low < left_high { 38 | mid = usize::midpoint(low, left_high); 39 | match get_y(mid).cmp(&p) { 40 | Ordering::Less => low = mid + 1, 41 | Ordering::Greater => { 42 | high = mid; 43 | left_high = mid; 44 | } 45 | Ordering::Equal => left_high = mid, 46 | } 47 | } 48 | // right border 49 | let mut right_low = low; 50 | while right_low < high { 51 | mid = (right_low + high).div_ceil(2); 52 | match get_y(mid).cmp(&p) { 53 | Ordering::Greater => high = mid - 1, 54 | _ => right_low = mid, 55 | } 56 | } 57 | return PredicateObjectIter { triples, pos_index: low, max_index: high }; 58 | } 59 | } 60 | if (high == 0 && low == 0) || (high == low && high == mid) { 61 | break; 62 | } 63 | } 64 | // not found 65 | PredicateObjectIter { triples, pos_index: 999, max_index: 0 } 66 | } 67 | } 68 | 69 | impl Iterator for PredicateObjectIter<'_> { 70 | type Item = Id; 71 | fn next(&mut self) -> Option { 72 | if self.pos_index > self.max_index { 73 | return None; 74 | } 75 | let pos_y = self.triples.op_index.sequence.access(self.pos_index).unwrap(); 76 | //let y = self.triples.wavelet_y.get(pos_y as usize) as Id; 77 | //println!(" op p {y}"); 78 | let s = self.triples.bitmap_y.rank(pos_y) as Id + 1; 79 | self.pos_index += 1; 80 | Some(s) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/triples/subject_iter.rs: -------------------------------------------------------------------------------- 1 | use super::{Id, TripleId, TriplesBitmap}; 2 | 3 | /// Iterator over triples fitting an SPO, SP? S?? or ??? triple pattern. 
4 | //#[derive(Debug)] 5 | pub struct SubjectIter<'a> { 6 | // triples data 7 | triples: &'a TriplesBitmap, 8 | // x-coordinate identifier 9 | x: Id, 10 | // current position 11 | pos_y: usize, 12 | pos_z: usize, 13 | max_y: usize, 14 | max_z: usize, 15 | search_z: usize, // for S?O 16 | } 17 | 18 | impl<'a> SubjectIter<'a> { 19 | /// Create an iterator over all triples. 20 | pub fn new(triples: &'a TriplesBitmap) -> Self { 21 | SubjectIter { 22 | triples, 23 | x: 1, // was 0 in the old code but it should start at 1 24 | pos_y: 0, 25 | pos_z: 0, 26 | max_y: triples.wavelet_y.len(), // exclusive 27 | max_z: triples.adjlist_z.len(), // exclusive 28 | search_z: 0, 29 | } 30 | } 31 | 32 | /// Use when no results are found. 33 | pub const fn empty(triples: &'a TriplesBitmap) -> Self { 34 | SubjectIter { triples, x: 1, pos_y: 0, pos_z: 0, max_y: 0, max_z: 0, search_z: 0 } 35 | } 36 | 37 | /// Convenience method for the S?? triple pattern. 38 | /// See . 39 | pub fn with_s(triples: &'a TriplesBitmap, subject_id: Id) -> Self { 40 | let min_y = triples.find_y(subject_id - 1); 41 | let min_z = triples.adjlist_z.find(min_y as Id); 42 | let max_y = triples.find_y(subject_id); 43 | let max_z = triples.adjlist_z.find(max_y as Id); 44 | SubjectIter { triples, x: subject_id, pos_y: min_y, pos_z: min_z, max_y, max_z, search_z: 0 } 45 | } 46 | 47 | /// Iterate over triples fitting the given SPO, SP? S??, S?O or ??? triple pattern. 48 | /// Variable positions are signified with a 0 value. 49 | /// Undefined result if any other triple pattern is used. 50 | /// # Examples 51 | /// ```text 52 | /// // S?? pattern, all triples with subject ID 1 53 | /// SubjectIter::with_pattern(triples, TripleId::new(1, 0, 0); 54 | /// // SP? 
pattern, all triples with subject ID 1 and predicate ID 2 55 | /// SubjectIter::with_pattern(triples, TripleId::new(1, 2, 0); 56 | /// // match a specific triple, not useful in practice except as an ASK query 57 | /// SubjectIter::with_pattern(triples, TripleId::new(1, 2, 3); 58 | /// ``` 59 | // Translated from . 60 | pub fn with_pattern(triples: &'a TriplesBitmap, pat: &TripleId) -> Self { 61 | let (pat_x, pat_y, pat_z) = (pat.subject_id, pat.predicate_id, pat.object_id); 62 | let (min_y, max_y, min_z, max_z); 63 | let mut x = 1; 64 | let mut search_z = 0; 65 | // only SPO order is supported currently 66 | if pat_x != 0 { 67 | // S X X 68 | if pat_y != 0 { 69 | // S P X 70 | match triples.search_y(pat_x - 1, pat_y) { 71 | Some(y) => min_y = y, 72 | None => return SubjectIter::empty(triples), 73 | } 74 | max_y = min_y + 1; 75 | if pat_z != 0 { 76 | // S P O 77 | // simply with try block when they come to stable Rust 78 | match triples.adjlist_z.search(min_y, pat_z) { 79 | Some(pos_z) => min_z = pos_z, 80 | None => return SubjectIter::empty(triples), 81 | } 82 | max_z = min_z + 1; 83 | } else { 84 | // S P ? 85 | min_z = triples.adjlist_z.find(min_y); 86 | max_z = triples.adjlist_z.last(min_y) + 1; 87 | } 88 | } else { 89 | // S ? X 90 | min_y = triples.find_y(pat_x - 1); 91 | min_z = triples.adjlist_z.find(min_y); 92 | max_y = triples.last_y(pat_x - 1) + 1; 93 | max_z = triples.adjlist_z.find(max_y); 94 | search_z = pat_z; 95 | } 96 | x = pat_x; 97 | } else { 98 | // ? X X 99 | // assume ? ? 
?, other triple patterns are not supported by this iterator 100 | min_y = 0; 101 | min_z = 0; 102 | max_y = triples.wavelet_y.len(); 103 | max_z = triples.adjlist_z.len(); 104 | } 105 | SubjectIter { triples, x, pos_y: min_y, pos_z: min_z, max_y, max_z, search_z } 106 | } 107 | } 108 | 109 | impl Iterator for SubjectIter<'_> { 110 | type Item = TripleId; 111 | 112 | fn next(&mut self) -> Option { 113 | if self.pos_y >= self.max_y { 114 | return None; 115 | } 116 | 117 | let y = self.triples.wavelet_y.access(self.pos_y).unwrap() as Id; 118 | 119 | if self.search_z > 0 { 120 | self.pos_y += 1; 121 | match self.triples.adjlist_z.search(self.pos_y - 1, self.search_z) { 122 | Some(_) => { 123 | return Some(self.triples.coord_to_triple(self.x, y, self.search_z).unwrap()); 124 | } 125 | None => { 126 | return self.next(); 127 | } 128 | } 129 | } 130 | 131 | if self.pos_z >= self.max_z { 132 | return None; 133 | } 134 | let z = self.triples.adjlist_z.get_id(self.pos_z); 135 | let triple_id = self.triples.coord_to_triple(self.x, y, z).unwrap(); 136 | 137 | // theoretically the second condition should only be true if the first is as well but in practise it wasn't, which screwed up the subject identifiers 138 | // fixed by moving the second condition inside the first one but there may be another reason for the bug occuring in the first place 139 | if self.triples.adjlist_z.at_last_sibling(self.pos_z) { 140 | if self.triples.bitmap_y.at_last_sibling(self.pos_y) { 141 | self.x += 1; 142 | } 143 | self.pos_y += 1; 144 | } 145 | self.pos_z += 1; 146 | Some(triple_id) 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /tests/resources/snikmeta.hdt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KonradHoeffner/hdt/3fee2114fb39d6ae8690da023a7b02970c4d3bdb/tests/resources/snikmeta.hdt -------------------------------------------------------------------------------- 
/tests/resources/yago_header.hdt: -------------------------------------------------------------------------------- 1 | $HDTv5$HDTntripleslength=1891;ѫ . 2 | . 3 | "158991568" . 4 | "104" . 5 | "67813972" . 6 | "22354760" . 7 | _:statistics . 8 | _:publicationInformation . 9 | _:format . 10 | _:format _:dictionary . 11 | _:format _:triples . 12 | _:dictionary . 13 | _:dictionary "2748838" . 14 | _:dictionary "1" . 15 | _:dictionary "1402843002" . 16 | _:dictionary "16" . 17 | _:triples . 18 | _:triples "158991568" . 19 | _:triples "SPO" . 20 | _:statistics "26345372323" . 21 | _:statistics "2080973301" . 22 | _:publicationInformation "2013-05-09T10:45:06+0100" . 23 | --------------------------------------------------------------------------------