├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bugreport.yml
    │   └── featurerequest.yml
    ├── dependabot.yml
    └── workflows
    │   └── lint_and_test.yml
├── .gitignore
├── CITATION.cff
├── Cargo.toml
├── LICENSE
├── README.md
├── benches
    ├── criterion.rs
    └── iai.rs
├── examples
    └── query.rs
├── rustfmt.toml
├── src
    ├── containers
    │   ├── adj_list.rs
    │   ├── bitmap.rs
    │   ├── control_info.rs
    │   ├── mod.rs
    │   ├── rdf.rs
    │   ├── sequence.rs
    │   └── vbyte.rs
    ├── dict_sect_pfc.rs
    ├── four_sect_dict.rs
    ├── hdt.rs
    ├── hdt_graph.rs
    ├── hdt_graph
    │   └── term.rs
    ├── header.rs
    ├── lib.rs
    ├── triples.rs
    └── triples
    │   ├── object_iter.rs
    │   ├── predicate_iter.rs
    │   ├── predicate_object_iter.rs
    │   └── subject_iter.rs
└── tests
    └── resources
        ├── snikmeta.hdt
        └── yago_header.hdt


/.github/ISSUE_TEMPLATE/bugreport.yml:
--------------------------------------------------------------------------------
 1 | name: Bug Report
 2 | description: File a bug report.
 3 | labels: ["bug"]
 4 | assignees:
 5 |   - KonradHoeffner
 6 | body:
 7 |   - type: textarea
 8 |     id: what-happened
 9 |     attributes:
10 |       label: What happened?
11 |     validations:
12 |       required: true
13 |   - type: input
14 |     id: version
15 |     attributes:
16 |       label: library version
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/featurerequest.yml:
--------------------------------------------------------------------------------
 1 | name: Feature Request
 2 | description: Suggest a new feature.
 3 | labels: ["enhancement"]
 4 | assignees:
 5 |   - KonradHoeffner
 6 | body:
 7 |   - type: textarea
 8 |     id: new-feature
 9 |     attributes:
10 |       label: What new feature do you want?
11 |     validations:
12 |       required: true
13 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 2 | version: 2
 3 | updates:
 4 |   - package-ecosystem: "github-actions"
 5 |     directory: "/.github/workflows"
 6 |     assignees: ["KonradHoeffner"]
 7 |     schedule:
 8 |       interval: "daily"
 9 |   - package-ecosystem: "cargo"
10 |     directory: "/"
11 |     assignees: ["KonradHoeffner"]
12 |     schedule:
13 |       interval: "daily"
14 | 


--------------------------------------------------------------------------------
/.github/workflows/lint_and_test.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Lint and Test
 3 | 
 4 | on:
 5 |   push:
 6 |     paths-ignore:
 7 |       - '.gitignore'
 8 |       - 'CITATION.cff'
 9 |       - 'LICENSE'
10 |       - 'README.md'
11 |   pull_request:
12 |     paths-ignore:
13 |       - '.gitignore'
14 |       - 'CITATION.cff'
15 |       - 'LICENSE'
16 |       - 'README.md'
17 | 
18 | jobs:
19 |   build:
20 |     strategy:
21 |       matrix:
22 |         include:
23 |           - os: ubuntu-latest
24 |           # - os: macos-latest
25 |           # - os: windows-latest
26 |     runs-on: ${{ matrix.os }}
27 |     steps:
28 |       - uses: actions/checkout@v4
29 |       - name: Install Rust
30 |         uses: dtolnay/rust-toolchain@stable
31 |         with:
32 |           components: rustfmt,clippy
33 |       - name: fmt
34 |         run: cargo fmt -- --check
35 |       - name: check feature combinations
36 |         run: |
37 |               cargo check --no-default-features
38 |               cargo check --no-default-features --features sophia
39 |               cargo check --no-default-features --features cache
40 |               cargo check --no-default-features --features "sophia cache"
41 |       - name: clippy
42 |         run: cargo clippy --no-deps --all-features
43 |       - name: build
44 |         run: cargo build
45 |       - name: test
46 |         run: cargo test --all-features
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | Cargo.lock
3 | /data
4 | *.index.v1-rust-cache


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | ---
 2 | cff-version: 1.2.0
 3 | title: "hdt-rs: A Rust library
 4 |   for the Header Dictionary Triples binary RDF compression format"
 5 | message: If you use this software, please cite our article in the
 6 |   Journal of Open Source Software.
 7 | type: software
 8 | authors:
 9 |   - given-names: Konrad
10 |     family-names: Höffner
11 |     email: konrad.hoeffner@uni-leipzig.de
12 |     affiliation: >-
13 |       Institute for Medical Informatics, Statistics
14 |       and Epidemiology (IMISE), Leipzig, Germany
15 |     orcid: 'https://orcid.org/0000-0001-7358-3217'
16 |   - given-names: Tim
17 |     family-names: Baccaert
18 |     affiliation: >-
19 |       Independent Researcher
20 |       Belgium
21 | repository-code: 'https://github.com/konradhoeffner/hdt'
22 | url: 'https://crates.io/crates/hdt'
23 | keywords:
24 |   - RDF
25 |   - HDT
26 |   - Rust
27 | license: MIT
28 | preferred-citation:
29 |   type: article
30 |   authors:
31 |     - family-names: Höffner
32 |       given-names: Konrad
33 |       orcid: "https://orcid.org/0000-0001-7358-3217"
34 |     - family-names: "Baccaert"
35 |       given-names: "Tim"
36 |   date-published: 2023-04-29
37 |   doi: 10.21105/joss.05114
38 |   issn: 2475-9066
39 |   issue: 84
40 |   journal: Journal of Open Source Software
41 |   publisher:
42 |     name: Open Journals
43 |   start: 5114
44 |   title: "hdt-rs: A Rust library
45 |     for the Header Dictionary Triples binary RDF compression format"
46 |   url: "https://joss.theoj.org/papers/10.21105/joss.05114"
47 |   volume: 8
48 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "hdt"
 3 | version = "0.3.0"
 4 | repository = "https://github.com/konradhoeffner/hdt"
 5 | authors = ["Tim Baccaert <tbaccaer@vub.be>", "Konrad Höffner"]
 6 | license = "MIT"
 7 | description = "Library for the Header Dictionary Triples (HDT) RDF compression format."
 8 | keywords = ["rdf", "hdt", "compression", "file-format"]
 9 | categories = ["compression", "filesystem", "parsing", "web-programming"]
10 | edition = "2024"
11 | rust-version = "1.85"
12 | 
13 | [package.metadata."docs.rs"]
14 | all-features = true
15 | 
16 | [dependencies]
17 | bytesize = "2"
18 | crc = "3"
19 | iref = "3"
20 | langtag = "0.4"
21 | ntriple = "0.1"
22 | sophia = { version = "0.9", optional = true }
23 | sucds = "0.8"
24 | thiserror = "2"
25 | log = "0.4"
26 | mownstr = "0.3"
27 | bincode = { version = "2", optional = true, default-features = false, features = ["std", "serde"] }
28 | serde = { version = "1", optional = true, features = ["derive"] }
29 | 
30 | [features]
31 | default = ["sophia"]
32 | sophia = ["dep:sophia"]
33 | cache = ["dep:serde", "dep:bincode"]
34 | 
35 | [[bench]]
36 | name = "criterion"
37 | harness = false
38 | 
39 | [[bench]]
40 | name = "iai"
41 | harness = false
42 | 
43 | [lib]
44 | bench = false
45 | 
46 | [profile.test]
47 | opt-level = 1
48 | 
49 | [dev-dependencies]
50 | pretty_assertions = "1"
51 | env_logger = { version = "0.11", default-features = false, features = ["auto-color"] }
52 | criterion = { version = "0.6", default-features = false, features = ["cargo_bench_support", "html_reports"] }
53 | #iai = "0.1"
54 | iai = { git = "https://github.com/sigaloid/iai", rev = "d56a597" } # until https://github.com/bheisler/iai/pull/35 is merged
55 | color-eyre = "0.6"
56 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Tim Baccaert
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # HDT
  2 | 
  3 | [![Latest Version](https://img.shields.io/crates/v/hdt.svg)](https://crates.io/crates/hdt)
  4 | [![Lint and Test](https://github.com/konradhoeffner/hdt/actions/workflows/lint_and_test.yml/badge.svg)](https://github.com/konradhoeffner/hdt/actions/workflows/lint_and_test.yml)
  5 | [![Documentation](https://docs.rs/hdt/badge.svg)](https://docs.rs/hdt/)
  6 | [![Benchmarks](https://img.shields.io/badge/Benchmarks--x.svg?style=social)](https://github.com/KonradHoeffner/hdt_benchmark/blob/master/benchmark_results.ipynb)
  7 | [![HDT Rust @ LD Party Video](https://img.shields.io/badge/video-8A2BE2)](https://www.youtube.com/watch?v=R-S0o_UwPMk)
  8 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.05114/status.svg)](https://doi.org/10.21105/joss.05114)
  9 | 
 10 | A Rust library for the [Header Dictionary Triples](https://www.rdfhdt.org/) compressed RDF format, including:
 11 | 
 12 | * loading the HDT default format as created by [hdt-cpp](https://github.com/rdfhdt/hdt-cpp)
 13 | * efficient querying by triple patterns
 14 | * serializing into other formats like RDF Turtle and N-Triples using the [Sophia](https://crates.io/crates/sophia) adapter
 15 | 
 16 | However it cannot:
 17 | 
 18 | * load other RDF formats
 19 | * load other HDT variants
 20 | 
 21 | For this functionality and acknowledgement of all the original authors, please look at the reference implementations in C++ and Java by the [https://github.com/rdfhdt](https://github.com/rdfhdt) organisation.
 22 | 
 23 | It also cannot:
 24 | 
 25 | * swap data to disk
 26 | * modify the RDF graph in memory
 27 | * run SPARQL queries
 28 | 
 29 | If you need any of those features, consider using a SPARQL endpoint instead.
 30 | 
 31 | ## Examples
 32 | 
 33 | ```rust
 34 | use hdt::Hdt;
 35 | 
 36 | let file = std::fs::File::open("example.hdt").expect("error opening file");
 37 | let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT");
 38 | // query
 39 | let majors = hdt.triples_with_pattern(Some("http://dbpedia.org/resource/Leipzig"), Some("http://dbpedia.org/ontology/major"),None);
 40 | println!("{:?}", majors.collect::<Vec<_>>());
 41 | ```
 42 | 
 43 | You can also use the Sophia adapter to load HDT files and reduce memory consumption of an existing application based on Sophia, which is re-exported as `hdt::sophia`:
 44 | 
 45 | ```rust
 46 | use hdt::{Hdt,HdtGraph};
 47 | use hdt::sophia::api::graph::Graph;
 48 | use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any};
 49 | 
 50 | let file = std::fs::File::open("dbpedia.hdt").expect("error opening file");
 51 | let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT");
 52 | let graph = HdtGraph::new(hdt);
 53 | let s = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/resource/Leipzig".into()));
 54 | let p = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/ontology/major".into()));
 55 | let majors = graph.triples_matching(Some(s),Some(p),Any);
 56 | ```
 57 | 
 58 | If you don't want to pull in the Sophia dependency, you can exclude the adapter:
 59 | 
 60 | ```toml
 61 | [dependencies]
 62 | hdt = { version = "...", default-features = false }
 63 | ```
 64 | 
 65 | There is also a runnable example [in the examples folder](https://github.com/KonradHoeffner/hdt/tree/main/examples), which you can run with `cargo run --example query`.
 66 | 
 67 | Users can also choose to use the experimental `cache` feature. If enabled, the library will utilize a custom cached TriplesBitmap file if it exists or create one if it does not exist.
 68 | 
 69 | ```rust
 70 | let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap();
 71 | ```
 72 | 
 73 | The `cache` feature is experimental and may change or be removed in future releases.
 74 | 
 75 | ## API Documentation
 76 | 
 77 | See [docs.rs/latest/hdt](https://docs.rs/hdt) or generate it yourself with `cargo doc --no-deps`, keeping the default features enabled.
 78 | 
 79 | ## Performance
 80 | The performance of a query depends on the size of the graph, the type of triple pattern and the size of the result set.
 81 | When using large HDT files, make sure to enable the release profile, such as through `cargo build --release`, as this can be much faster than using the dev profile.
 82 | 
 83 | ### Profiling
 84 | If you want to optimize the code, you can use a profiler.
 85 | The provided test data is very small in order to keep the size of the crate down; locally modifying the tests to use a large HDT file returns more meaningful results.
 86 | 
 87 | #### Example with perf and Firefox Profiler
 88 | 
 89 |     $ cargo test --release
 90 |     [...]
 91 |     Running unittests src/lib.rs (target/release/deps/hdt-2b2f139dafe69681)
 92 |     [...]
 93 |     $ perf record --call-graph=dwarf target/release/deps/hdt-2b2f139dafe69681 hdt::tests::triples
 94 |     $ perf script > /tmp/test.perf
 95 | 
 96 | Then go to <https://profiler.firefox.com/> and open `/tmp/test.perf`.
 97 | 
 98 | ## Criterion benchmark
 99 | 
100 |     cargo bench --bench criterion
101 | 
102 | * requires [persondata\_en.hdt](https://github.com/KonradHoeffner/hdt/releases/download/benchmarkdata/persondata_en.hdt.bz2) placed in `tests/resources`
103 | 
104 | ## iai benchmark
105 | 
106 |     cargo bench --bench iai
107 | 
108 | * requires [persondata\_en\_10k.hdt](https://github.com/KonradHoeffner/hdt/releases/download/benchmarkdata/persondata_en_10k.hdt.bz2) placed in `tests/resources`
109 | * requires [Valgrind](https://valgrind.org/) to be installed
110 | 
111 | ## Comparative benchmark suite
112 | 
113 | [The separate benchmark suite](https://github.com/KonradHoeffner/hdt_benchmark/blob/master/benchmark_results.ipynb) compares the performance of this and some other RDF libraries.
114 | 
115 | ## Community Guidelines
116 | 
117 | ### Issues and Support
118 | If you have a problem with the software, want to report a bug or have a feature request, please use the [issue tracker](https://github.com/KonradHoeffner/hdt/issues).
119 | If you have a different type of request, feel free to send an email to [Konrad](mailto:konrad.hoeffner@uni-leipzig.de).
120 | 
121 | ### Citation
122 | 
123 | [![DOI](https://joss.theoj.org/papers/10.21105/joss.05114/status.svg)](https://doi.org/10.21105/joss.05114)
124 | 
125 | If you use this library in your research, please cite our paper in the Journal of Open Source Software.
126 | We also provide a [CITATION.cff](./CITATION.cff) file.
127 | 
128 | #### BibTeX entry
129 | 
130 | ```bibtex
131 | @article{hdtrs,
132 |   doi = {10.21105/joss.05114},
133 |   year = {2023},
134 |   publisher = {The Open Journal},
135 |   volume = {8},
136 |   number = {84},
137 |   pages = {5114},
138 |   author = {Konrad Höffner and Tim Baccaert},
139 |   title = {hdt-rs: {A} {R}ust library for the {H}eader {D}ictionary {T}riples binary {RDF} compression format},
140 |   journal = {Journal of Open Source Software}
141 | }
142 | ```
143 | 
144 | #### Citation string
145 | 
146 | Höffner et al., (2023). hdt-rs: A Rust library for the Header Dictionary Triples binary RDF compression format. Journal of Open Source Software, 8(84), 5114, https://doi.org/10.21105/joss.05114
147 | 
148 | ### Contribute
149 | We are happy to receive pull requests.
150 | Please use `cargo fmt` before committing, make sure that `cargo test` succeeds and that the code compiles on the stable and nightly toolchain both with and without the "sophia" feature active.
151 | `cargo clippy` should not report any warnings.
152 | 


--------------------------------------------------------------------------------
/benches/criterion.rs:
--------------------------------------------------------------------------------
 1 | use criterion::{Criterion, criterion_group, criterion_main};
 2 | use hdt::Hdt;
 3 | use hdt::HdtGraph;
 4 | use hdt::IdKind;
 5 | use hdt::triples::*;
 6 | use sophia::api::graph::Graph;
 7 | use sophia::api::term::IriRef;
 8 | use sophia::api::term::SimpleTerm;
 9 | use sophia::api::term::matcher::Any;
10 | use std::fs::File;
11 | 
12 | const VINCENT: &str = "http://dbpedia.org/resource/Vincent_Descombes_Sevoie";
13 | const TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
14 | const PERSON: &str = "http://dbpedia.org/ontology/Person";
15 | 
16 | fn load() -> HdtGraph {
17 |     let filename = "tests/resources/persondata_en.hdt";
18 |     let file = File::open(filename)
19 |         .expect(&format!("Error opening file {filename}, did you forget to download it? See README.md."));
20 |     //let file = File::open("tests/resources/lscomplete2015.hdt").expect("error opening file");
21 |     //let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
22 |     let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap();
23 |     HdtGraph::new(hdt)
24 | }
25 | 
26 | fn query(c: &mut Criterion) {
27 |     let graph = load();
28 |     let triples = &graph.hdt.triples;
29 |     let twp = |s, p, o| graph.hdt.triples_with_pattern(s, p, o);
30 | 
31 |     let vincent_id = graph.hdt.dict.string_to_id(VINCENT, &IdKind::Subject);
32 |     let type_id = graph.hdt.dict.string_to_id(TYPE, &IdKind::Predicate);
33 |     let person_id = graph.hdt.dict.string_to_id(PERSON, &IdKind::Object);
34 |     let vincent_term = SimpleTerm::Iri(IriRef::new_unchecked(VINCENT.into()));
35 |     let type_term = SimpleTerm::Iri(IriRef::new_unchecked(TYPE.into()));
36 |     let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into()));
37 | 
38 |     // count to prevent optimizing away function call
39 |     let mut group = c.benchmark_group("??? (all)");
40 |     group.sample_size(10);
41 |     group.bench_function("0.1 all triple IDs", |b| b.iter(|| graph.hdt.triples.into_iter().count()));
42 |     group.bench_function("0.2 all str triples", |b| b.iter(|| graph.hdt.triples().count()));
43 |     group.bench_function("0.3 all Sophia triples", |b| b.iter(|| graph.triples().count()));
44 |     group.finish();
45 |     let mut group = c.benchmark_group("S??");
46 |     //let mut group = c.benchmark_group("query");
47 |     group.bench_function("1.1 (vincent, ?, ?) triple IDs", |b| {
48 |         b.iter(|| SubjectIter::with_pattern(triples, &TripleId::new(vincent_id, 0, 0)).count())
49 |     });
50 |     group.bench_function("1.2 (vincent, ?, ?) str triples", |b| b.iter(|| twp(Some(VINCENT), None, None).count()));
51 |     group.bench_function("1.3 (vincent, ?, ?) Sophia triples", |b| {
52 |         b.iter(|| graph.triples_matching(Some(&vincent_term), Any, Any).count())
53 |     });
54 |     group.finish();
55 | 
56 |     let mut group = c.benchmark_group(format!("?P? {} triples", PredicateIter::new(triples, type_id).count()));
57 |     group.sample_size(10);
58 |     group.bench_function("2.1 (?, type, ?) triple IDs", |b| {
59 |         b.iter(|| PredicateIter::new(triples, type_id).count())
60 |     });
61 |     group.bench_function("2.2 (?, type, ?) str triples", |b| b.iter(|| twp(None, Some(TYPE), None).count()));
62 |     group.bench_function("2.3 (?, type, ?) Sophia triples", |b| {
63 |         b.iter(|| graph.triples_matching(Any, Some(&type_term), Any).count())
64 |     });
65 |     group.finish();
66 |     let mut group = c.benchmark_group(format!("??O {} triples", ObjectIter::new(triples, person_id).count()));
67 |     group.bench_function("3.1 (?, ?, person) triple IDs", |b| {
68 |         b.iter(|| ObjectIter::new(triples, person_id).count())
69 |     });
70 |     group.bench_function("3.2 (?, ?, person) str triples", |b| b.iter(|| twp(None, None, Some(PERSON)).count()));
71 |     group.bench_function("3.3 (?, ?, person) Sophia triples", |b| {
72 |         b.iter(|| graph.triples_matching(Any, Any, Some(&person_term)).count())
73 |     });
74 |     group.finish();
75 |     let mut group = c
76 |         .benchmark_group(format!("?PO {} triples", PredicateObjectIter::new(triples, type_id, person_id).count()));
77 |     group.sample_size(10);
78 |     group.bench_function("4.1 (?, type, person) triple IDs", |b| {
79 |         b.iter(|| PredicateObjectIter::new(triples, type_id, person_id).count())
80 |     });
81 |     group.bench_function("4.2 (?, type, person) str subjects", |b| {
82 |         b.iter(|| graph.hdt.subjects_with_po(TYPE, PERSON).count())
83 |     });
84 |     group.bench_function("4.3 (?, type, person) str triples", |b| {
85 |         b.iter(|| twp(None, Some(TYPE), Some(PERSON)).count())
86 |     });
87 |     group.bench_function("4.4 (?, type, person) Sophia triples", |b| {
88 |         b.iter(|| graph.triples_matching(Any, Some(&type_term), Some(&person_term)).count())
89 |     });
90 |     group.finish();
91 | }
92 | 
93 | criterion_group!(criterion, query);
94 | criterion_main!(criterion);
95 | 


--------------------------------------------------------------------------------
/benches/iai.rs:
--------------------------------------------------------------------------------
 1 | use hdt::Hdt;
 2 | use hdt::HdtGraph;
 3 | use sophia::api::graph::Graph;
 4 | use sophia::api::term::IriRef;
 5 | use sophia::api::term::SimpleTerm;
 6 | use sophia::api::term::matcher::Any;
 7 | use std::fs::File;
 8 | 
 9 | const TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
10 | const PERSON: &str = "http://dbpedia.org/ontology/Person";
11 | 
12 | fn load() -> HdtGraph {
13 |     let file = File::open("tests/resources/persondata_en_10k.hdt").expect("error opening file");
14 |     let hdt = Hdt::new(std::io::BufReader::new(file)).unwrap();
15 |     HdtGraph::new(hdt)
16 | }
17 | 
18 | // iai currently does not allow excluding loading time so that has to be subtracted
19 | 
20 | fn query_all() {
21 |     let hdt = load().hdt;
22 |     hdt.triples_with_pattern(None, None, None).count();
23 | }
24 | 
25 | fn query_all_sophia() {
26 |     let graph = load();
27 |     graph.triples_matching(Any, Any, Any).count();
28 | }
29 | 
30 | fn query_po() {
31 |     let hdt = load().hdt;
32 |     hdt.triples_with_pattern(None, Some(TYPE), Some(PERSON)).count();
33 | }
34 | 
35 | fn query_po_sophia() {
36 |     let graph = load();
37 |     let type_term = SimpleTerm::Iri(IriRef::new_unchecked(TYPE.into()));
38 |     let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into()));
39 |     graph.triples_matching(Any, Some(&type_term), Some(&person_term)).count();
40 | }
41 | 
42 | fn query_o() {
43 |     let hdt = load().hdt;
44 |     hdt.triples_with_pattern(None, None, Some(PERSON)).count();
45 | }
46 | 
47 | fn query_o_sophia() {
48 |     let graph = load();
49 |     let person_term = SimpleTerm::Iri(IriRef::new_unchecked(PERSON.into()));
50 |     graph.triples_matching(Any, Any, Some(&person_term)).count();
51 | }
52 | 
53 | iai::main!(load, query_all, query_all_sophia, query_po, query_po_sophia, query_o, query_o_sophia);
54 | 


--------------------------------------------------------------------------------
/examples/query.rs:
--------------------------------------------------------------------------------
 1 | use hdt::Hdt;
 2 | 
 3 | fn main() -> Result<(), Box<dyn std::error::Error>> {
 4 |     env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("info")).init();
 5 |     let path = std::path::Path::new("tests/resources/snikmeta.hdt");
 6 |     let file = std::fs::File::open(path)?;
 7 |     let meta_top = "http://www.snik.eu/ontology/meta/Top";
 8 |     let rdfs_label = "http://www.w3.org/2000/01/rdf-schema#label";
 9 |     #[allow(unused_mut)]
10 |     let mut hdts = vec![Hdt::new(std::io::BufReader::new(file))?];
11 |     #[cfg(feature = "cache")]
12 |     hdts.push(Hdt::new_from_path(path)?);
13 |     for hdt in hdts {
14 |         // SP? pattern
15 |         let labels = hdt.triples_with_pattern(Some(meta_top), Some(rdfs_label), None);
16 |         println!("{:?}", labels.collect::<Vec<_>>());
17 |     }
18 |     Ok(())
19 | }
20 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
 1 | max_width = 115
 2 | short_array_element_width_threshold = 115
 3 | use_field_init_shorthand = true
 4 | use_small_heuristics = "Max"
 5 | use_try_shorthand = true
 6 | fn_params_layout = "Compressed"
 7 | single_line_if_else_max_width = 115
 8 | 
 9 | # *** only available in unstable rust ***
10 | #where_single_line = true
11 | #fn_single_line = true
12 | #group_imports = "One"
13 | #condense_wildcard_suffixes = true
14 | # ***************************************
15 | 


--------------------------------------------------------------------------------
/src/containers/adj_list.rs:
--------------------------------------------------------------------------------
 1 | //! Adjacency list containing an integer sequence and a bitmap with rank and select support.
 2 | use crate::containers::Bitmap;
 3 | use crate::containers::Sequence;
 4 | use crate::triples::Id;
 5 | use std::cmp::Ordering;
 6 | 
 7 | /// Adjacency list including a compact integer sequence and a bitmap for efficient access of that sequence using rank and select queries.
 8 | #[derive(Debug)]
 9 | #[cfg_attr(feature = "cache", derive(serde::Deserialize, serde::Serialize))]
10 | pub struct AdjList {
11 |     /// Compact integer sequence.
12 |     pub sequence: Sequence,
13 |     /// Helper structure for rank and select queries.
14 |     pub bitmap: Bitmap,
15 | }
16 | 
17 | impl AdjList {
18 |     /// Adjacency list with the given sequence and bitmap.
19 |     pub const fn new(sequence: Sequence, bitmap: Bitmap) -> Self {
20 |         AdjList { sequence, bitmap }
21 |     }
22 | 
23 |     /// Combined size in bytes of the sequence and the bitmap on the heap.
24 |     pub fn size_in_bytes(&self) -> usize {
25 |         self.sequence.size_in_bytes() + self.bitmap.size_in_bytes()
26 |     }
27 | 
28 |     /// Whether the given position represents the last child of the parent node.
29 |     pub fn at_last_sibling(&self, word_index: usize) -> bool {
30 |         self.bitmap.at_last_sibling(word_index)
31 |     }
32 | 
33 |     /// Get the ID at the given position.
34 |     pub fn get_id(&self, word_index: usize) -> Id {
35 |         self.sequence.get(word_index) as Id
36 |     }
37 | 
38 |     /// Number of entries in both the integer sequence and the bitmap.
39 |     pub const fn len(&self) -> usize {
40 |         self.sequence.entries
41 |     }
42 | 
43 |     /// Whether the list is empty, i.e. the sequence has zero entries.
44 |     pub const fn is_empty(&self) -> bool {
45 |         self.sequence.entries == 0
46 |     }
47 | 
48 |     /// Find the first position for the given ID, counting from 1. Returns 0 for `x == 0`, otherwise one past the position reported by `select1(x - 1)`; panics if that is `None`.
49 |     pub fn find(&self, x: Id) -> usize {
50 |         if x == 0 {
51 |             return 0;
52 |         }
53 |         // hdt counts from 1
54 |         // rsdict had a nonzero value for 0, hence the adjustment; NOTE(review): Bitmap now wraps sucds `Rank9Sel` — confirm this still applies
55 |         self.bitmap.select1(x - 1).unwrap() as usize + 1
56 |     }
57 | 
58 |     /// Binary search for `element` among the sequence positions `begin..end`, which must be sorted ascending; `None` if it is not found.
59 |     /// # Arguments
60 |     ///
61 |     /// * `element` - a value that may or may not exist in the specified range of the list
62 |     /// * `begin` - first index of the search range
63 |     /// * `end` - end (exclusive) of the search range
64 |     fn bin_search(&self, element: usize, begin: usize, end: usize) -> Option<usize> {
65 |         let mut low = begin;
66 |         let mut high = end;
67 |         while low < high {
68 |             let mid = usize::midpoint(low, high);
69 |             match self.sequence.get(mid).cmp(&element) {
70 |                 Ordering::Less => low = mid + 1,
71 |                 Ordering::Greater => high = mid,
72 |                 Ordering::Equal => return Some(mid),
73 |             }
74 |         }
75 |         None
76 |     }
77 | 
78 |     /// Find the position of element `y` in the list `x`, searching positions `find(x)..=last(x)`; `None` if `y` is not there.
79 |     // See <https://github.com/rdfhdt/hdt-cpp/blob/develop/libhdt/src/sequence/AdjacencyList.cpp>.
80 |     pub fn search(&self, x: usize, y: usize) -> Option<usize> {
81 |         self.bin_search(y, self.find(x), self.last(x) + 1)
82 |     }
83 | 
84 |     /// Find the last position for the given ID, counting from 1; computed as `find(x + 1) - 1`.
85 |     pub fn last(&self, x: Id) -> usize {
86 |         self.find(x + 1) - 1
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/src/containers/bitmap.rs:
--------------------------------------------------------------------------------
  1 | //! Bitmap with rank and select support read from an HDT file.
  2 | use crate::containers::vbyte::read_vbyte;
  3 | use bytesize::ByteSize;
  4 | #[cfg(feature = "cache")]
  5 | use serde::ser::SerializeStruct;
  6 | use std::fmt;
  7 | use std::io::BufRead;
  8 | use std::mem::size_of;
  9 | use sucds::Serializable;
 10 | use sucds::bit_vectors::{Access, BitVector, Rank, Rank9Sel, Select};
 11 | 
12 | /// Compact bitmap representation with rank and select support.
13 | #[derive(Clone)]
14 | pub struct Bitmap {
15 |     /// Meant to be private, use the methods provided by Bitmap. NOTE(review): the stated reason for `pub` names containers/bitmap.rs, which is this file — confirm which module needs it.
16 |     pub dict: Rank9Sel,
17 | }
 18 | 
 19 | /// The error type for the bitmap read function.
 20 | #[derive(thiserror::Error, Debug)]
 21 | pub enum BitmapReadError {
 22 |     #[error("IO error")]
 23 |     Io(#[from] std::io::Error),
 24 |     #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")]
 25 |     InvalidCrc8Checksum(u8, u8),
 26 |     #[error("Invalid CRC32C checksum {0}, expected {1}")]
 27 |     InvalidCrc32Checksum(u32, u32),
 28 |     #[error("Failed to turn raw bytes into u64")]
 29 |     TryFromSliceError(#[from] std::array::TryFromSliceError),
 30 |     #[error("Read unsupported bitmap type {0} != 1")]
 31 |     UnsupportedBitmapType(u8),
 32 | }
 33 | 
34 | #[cfg(feature = "cache")]
35 | impl serde::Serialize for Bitmap {
36 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
37 |     where
38 |         S: serde::ser::Serializer,
39 |     {
40 |         let mut state: <S as serde::ser::Serializer>::SerializeStruct =
41 |             serializer.serialize_struct("Bitmap", 1)?;
42 | 
43 |         // write the sucds structure into a byte buffer and store it as the single "dict" field
44 |         let mut dict_buffer = Vec::new();
45 |         self.dict.serialize_into(&mut dict_buffer).map_err(serde::ser::Error::custom)?;
46 |         state.serialize_field("dict", &dict_buffer)?;
47 | 
48 |         state.end()
49 |     }
50 | }
 51 | 
 52 | #[cfg(feature = "cache")]
 53 | impl<'de> serde::Deserialize<'de> for Bitmap {
 54 |     /// Restores a bitmap from a struct with the single field "dict", which holds the bytes
 55 |     /// of a serialized `Rank9Sel`.
 56 |     fn deserialize<D: serde::de::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
 57 |         // serialized layout of a bitmap
 58 |         #[derive(serde::Deserialize)]
 59 |         struct BitmapData {
 60 |             dict: Vec<u8>,
 61 |         }
 62 | 
 63 |         let BitmapData { dict: raw } = BitmapData::deserialize(deserializer)?;
 64 | 
 65 |         // decode the `sucds` structure from the raw bytes
 66 |         let mut raw_reader = std::io::BufReader::new(raw.as_slice());
 67 |         match Rank9Sel::deserialize_from(&mut raw_reader) {
 68 |             Ok(dict) => Ok(Bitmap { dict }),
 69 |             Err(e) => Err(serde::de::Error::custom(e)),
 70 |         }
 71 |     }
 72 | }
 73 | 
 74 | impl fmt::Debug for Bitmap {
 75 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 76 |         write!(f, "{}", ByteSize(self.size_in_bytes() as u64)) // prints only the size, not the bits
 77 |     }
 78 | }
 79 | 
 80 | impl Bitmap {
 81 |     /// Construct a bitmap from an existing bitmap in form of a vector, which doesn't have rank and select support.
 82 |     /// Every element of `data` is appended with a length of 64 bits.
 83 |     pub fn new(data: Vec<u64>) -> Self {
 84 |         let mut v = BitVector::new();
 85 |         for d in data {
 86 |             // the result of push_bits is discarded
 87 |             let _ = v.push_bits(d as usize, 64);
 88 |         }
 89 |         let dict = Rank9Sel::new(v).select1_hints();
 90 |         Bitmap { dict }
 91 |     }
 92 | 
 93 |     /// Size in bytes as reported by the underlying `Rank9Sel`.
 94 |     pub fn size_in_bytes(&self) -> usize {
 95 |         self.dict.size_in_bytes()
 96 |     }
 97 | 
 98 |     /// Number of bits in the bitmap
 99 |     pub const fn len(&self) -> usize {
100 |         self.dict.len()
101 |     }
102 | 
103 |     /// Returns the position of the k-1-th one bit or None if there aren't that many.
104 |     pub fn select1(&self, k: usize) -> Option<usize> {
105 |         self.dict.select1(k)
106 |     }
107 | 
108 |     /// Returns the number of one bits from the 0-th bit to the k-1-th bit.
109 |     ///
110 |     /// # Panics
111 |     /// Panics if `self.len() < k`.
112 |     pub fn rank(&self, k: usize) -> usize {
113 |         self.dict.rank1(k).unwrap_or_else(|| panic!("Out of bounds position: {} > {}", k, self.dict.len()))
114 |     }
115 | 
116 |     /// Whether the node at the given position is the last child of its parent.
117 |     ///
118 |     /// # Panics
119 |     /// Panics if `word_index` is out of bounds.
120 |     pub fn at_last_sibling(&self, word_index: usize) -> bool {
121 |         self.dict.access(word_index).expect("word index out of bounds")
122 |     }
123 | 
124 |     /// Read bitmap from a suitable point within HDT file data and verify checksums.
125 |     ///
126 |     /// Expected layout: a type byte (must be 1), the number of bits as vbyte, a CRC8 over those
127 |     /// bytes, the bits in little-endian 64 bit words of which the last one is stored byte
128 |     /// aligned, and a little-endian CRC32 (`CRC_32_ISCSI`) over the bit data.
129 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<Self, BitmapReadError> {
130 |         use BitmapReadError::*;
131 |         let mut history: Vec<u8> = Vec::with_capacity(5);
132 | 
133 |         // read the type
134 |         let mut bitmap_type = [0u8];
135 |         reader.read_exact(&mut bitmap_type)?;
136 |         history.extend_from_slice(&bitmap_type);
137 |         if bitmap_type[0] != 1 {
138 |             return Err(UnsupportedBitmapType(bitmap_type[0]));
139 |         }
140 | 
141 |         // read the number of bits
142 |         let (num_bits, bytes_read) = read_vbyte(reader)?;
143 |         history.extend_from_slice(&bytes_read);
144 | 
145 |         // read section CRC8
146 |         let mut crc_code = [0_u8];
147 |         reader.read_exact(&mut crc_code)?;
148 |         let crc_code = crc_code[0];
149 | 
150 |         // validate section CRC8
151 |         let crc8 = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
152 |         let mut digest = crc8.digest();
153 |         digest.update(&history);
154 |         let crc_calculated = digest.finalize();
155 |         if crc_calculated != crc_code {
156 |             return Err(InvalidCrc8Checksum(crc_calculated, crc_code));
157 |         }
158 | 
159 |         // read all but the last word, last word is byte aligned
160 |         // saturating_sub keeps an empty bitmap (num_bits == 0) from underflowing
161 |         let full_byte_amount = (num_bits.saturating_sub(1) >> 6) * size_of::<u64>();
162 |         let mut full_words = vec![0_u8; full_byte_amount];
163 |         // one slot per full word plus one for the last word that is pushed below
164 |         let mut data: Vec<u64> = Vec::with_capacity(full_byte_amount / size_of::<u64>() + 1);
165 |         reader.read_exact(&mut full_words)?;
166 | 
167 |         for word in full_words.chunks_exact(size_of::<u64>()) {
168 |             data.push(u64::from_le_bytes(<[u8; 8]>::try_from(word)?));
169 |         }
170 | 
171 |         // initiate computation of CRC32
172 |         let crc32 = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
173 |         let mut digest = crc32.digest();
174 |         digest.update(&full_words);
175 | 
176 |         // read the last word byte by byte, least significant byte first
177 |         let mut bits_read = 0;
178 |         let mut last_value: u64 = 0;
179 |         let last_word_bits = if num_bits == 0 { 0 } else { ((num_bits - 1) % 64) + 1 };
180 | 
181 |         while bits_read < last_word_bits {
182 |             let mut buffer = [0u8];
183 |             reader.read_exact(&mut buffer)?;
184 |             digest.update(&buffer);
185 |             last_value |= (buffer[0] as u64) << bits_read;
186 |             bits_read += 8;
187 |         }
188 |         data.push(last_value);
189 | 
190 |         // read entry body CRC32
191 |         let mut crc_code = [0_u8; 4];
192 |         reader.read_exact(&mut crc_code)?;
193 |         let crc_code = u32::from_le_bytes(crc_code);
194 | 
195 |         // validate entry body CRC32
196 |         let crc_calculated = digest.finalize();
197 |         if crc_calculated != crc_code {
198 |             return Err(InvalidCrc32Checksum(crc_calculated, crc_code));
199 |         }
200 | 
201 |         Ok(Self::new(data))
202 |     }
203 | }
190 | 


--------------------------------------------------------------------------------
/src/containers/control_info.rs:
--------------------------------------------------------------------------------
  1 | use io::ErrorKind::UnexpectedEof;
  2 | use std::collections::HashMap;
  3 | use std::io::BufRead;
  4 | use std::io::{self, Write};
  5 | use std::str;
  6 | 
  7 | pub const TERMINATOR: [u8; 1] = [0]; // zero byte written after the format and after the properties
  8 | const HDT_HEADER: &[u8] = b"$HDT"; // cookie written at the start of a control info
  9 | 
 10 | /// Type of control information, represented as a single byte.
 11 | #[allow(missing_docs)]
 12 | #[repr(u8)]
 13 | #[derive(Debug, PartialEq, Eq, Clone, Copy, Default)]
 14 | pub enum ControlType {
 15 |     #[default]
 16 |     Unknown = 0,
 17 |     Global = 1,
 18 |     Header = 2,
 19 |     Dictionary = 3,
 20 |     Triples = 4,
 21 |     Index = 5,
 22 | }
 23 | 
 24 | impl TryFrom<u8> for ControlType {
 25 |     type Error = ControlInfoReadErrorKind;
 26 | 
 27 |     /// Maps the raw bytes 0 to 5 onto the matching variant, any other value is rejected
 28 |     /// with `InvalidControlType`.
 29 |     fn try_from(original: u8) -> Result<Self, Self::Error> {
 30 |         // variants in the order of their discriminants, so that the byte doubles as index
 31 |         const BY_DISCRIMINANT: [ControlType; 6] = [
 32 |             ControlType::Unknown,
 33 |             ControlType::Global,
 34 |             ControlType::Header,
 35 |             ControlType::Dictionary,
 36 |             ControlType::Triples,
 37 |             ControlType::Index,
 38 |         ];
 39 |         match BY_DISCRIMINANT.get(usize::from(original)) {
 40 |             Some(&control_type) => Ok(control_type),
 41 |             None => Err(ControlInfoReadErrorKind::InvalidControlType(original)),
 42 |         }
 43 |     }
 44 | }
 39 | 
 40 | /// Control information, see <https://www.rdfhdt.org/hdt-binary-format/>: "preamble that describes a chunk of information".
 41 | #[derive(Debug, Clone, PartialEq, Default)]
 42 | pub struct ControlInfo {
 43 |     /// Type of control information.
 44 |     pub control_type: ControlType,
 45 |     /// "URI identifier of the implementation of the following section."
 46 |     pub format: String,
 47 |     /// Key-value entries, ASCII only. Stored as `key=value;` pairs in the binary format.
 48 |     pub properties: HashMap<String, String>,
 49 | }
 50 | 
 51 | /// The error type for `ControlInfo::read`, wraps the more specific `ControlInfoReadErrorKind`.
 52 | #[derive(thiserror::Error, Debug)]
 53 | #[error("failed to read HDT control info")]
 54 | pub struct ControlInfoReadError(#[from] ControlInfoReadErrorKind);
 55 | 
 56 | /// The kind of a `ControlInfoReadError`.
 57 | #[derive(thiserror::Error, Debug)]
 58 | pub enum ControlInfoReadErrorKind {
 59 |     #[error("IO error")]
 60 |     Io(#[from] std::io::Error),
 61 |     #[error("chunk {0:?} does not equal the HDT cookie '$HDT'")]
 62 |     HdtCookie([u8; 4]), // the four bytes read instead of the cookie
 63 |     #[error("invalid separator while reading format")]
 64 |     InvalidSeparator, // the format was not terminated by a zero byte
 65 |     #[error("invalid CRC16-ANSI checksum")]
 66 |     InvalidChecksum,
 67 |     #[error("invalid UTF8")]
 68 |     Utf8(#[from] std::string::FromUtf8Error),
 69 |     #[error("invalid control type '{0}'")]
 70 |     InvalidControlType(u8), // the byte that does not correspond to any `ControlType`
 71 | }
 72 | 
 73 | impl ControlInfo {
 74 |     /// Read and verify control information.
 75 |     ///
 76 |     /// # Errors
 77 |     /// Fails on IO errors, malformed data or a CRC16 checksum mismatch.
 78 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<Self, ControlInfoReadError> {
 79 |         Ok(Self::read_kind(reader)?)
 80 |     }
 81 | 
 82 |     // Helper function returning a ControlInfoReadErrorKind that is wrapped by Self::read.
 83 |     fn read_kind<R: BufRead>(reader: &mut R) -> Result<Self, ControlInfoReadErrorKind> {
 84 |         use ControlInfoReadErrorKind::*;
 85 | 
 86 |         // Keep track of what we are reading for computing the CRC afterwards.
 87 |         let crc = crc::Crc::<u16>::new(&crc::CRC_16_ARC);
 88 |         let mut digest = crc.digest();
 89 | 
 90 |         // 1. Read the HDT Cookie
 91 |         let mut hdt_cookie: [u8; 4] = [0; 4];
 92 |         reader.read_exact(&mut hdt_cookie)?;
 93 |         if hdt_cookie.as_slice() != HDT_HEADER {
 94 |             return Err(HdtCookie(hdt_cookie));
 95 |         }
 96 |         digest.update(&hdt_cookie);
 97 | 
 98 |         // 2. Read the Control Type
 99 |         let mut control_type: [u8; 1] = [0; 1];
100 |         reader.read_exact(&mut control_type)?;
101 |         digest.update(&control_type);
102 |         let control_type = ControlType::try_from(control_type[0])?;
103 | 
104 |         // 3. Read the Format, which is terminated by a zero byte
105 |         let mut format = Vec::new();
106 |         reader.read_until(0x00, &mut format)?;
107 |         digest.update(&format);
108 |         if format.pop() != Some(0x00) {
109 |             return Err(InvalidSeparator);
110 |         }
111 |         let format = String::from_utf8(format)?;
112 | 
113 |         // 4. Read the Properties, "key=value;" pairs terminated by a zero byte
114 |         let mut prop_str = Vec::new();
115 |         reader.read_until(0x00, &mut prop_str)?;
116 |         digest.update(&prop_str);
117 |         if prop_str.pop() != Some(0x00) {
118 |             return Err(std::io::Error::new(UnexpectedEof, "reading the properties").into());
119 |         }
120 |         let prop_str = String::from_utf8(prop_str)?;
121 |         let mut properties = HashMap::new();
122 |         for item in prop_str.split(';') {
123 |             // items without '=' such as the empty one after the last ';' are skipped
124 |             if let Some((key, val)) = item.split_once('=') {
125 |                 properties.insert(String::from(key), String::from(val));
126 |             }
127 |         }
128 | 
129 |         // 5. Read the CRC
130 |         let mut crc_code = [0_u8; 2];
131 |         reader.read_exact(&mut crc_code)?;
132 |         let crc_code: u16 = u16::from_le_bytes(crc_code);
133 | 
134 |         // 6. Check the CRC
135 |         if digest.finalize() != crc_code {
136 |             return Err(InvalidChecksum);
137 |         }
138 | 
139 |         Ok(ControlInfo { control_type, format, properties })
140 |     }
141 | 
142 |     /// Write the control information followed by its CRC16 checksum to the given writer.
143 |     /// Properties are written sorted by key, so that equal objects always produce equal bytes.
144 |     ///
145 |     /// # Errors
146 |     /// Fails if writing to `dest_writer` fails.
147 |     pub fn save(&self, dest_writer: &mut impl Write) -> Result<(), Box<dyn std::error::Error>> {
148 |         let crc = crc::Crc::<u16>::new(&crc::CRC_16_ARC);
149 |         let mut hasher = crc.digest();
150 |         dest_writer.write_all(HDT_HEADER)?;
151 |         hasher.update(HDT_HEADER);
152 | 
153 |         // write type
154 |         let type_: [u8; 1] = [self.control_type as u8];
155 |         dest_writer.write_all(&type_)?;
156 |         hasher.update(&type_);
157 | 
158 |         // write format
159 |         let format = self.format.as_bytes();
160 |         dest_writer.write_all(format)?;
161 |         hasher.update(format);
162 |         dest_writer.write_all(&TERMINATOR)?;
163 |         hasher.update(&TERMINATOR);
164 | 
165 |         // write properties; the iteration order of a HashMap is arbitrary, hence sort first
166 |         let mut entries: Vec<(&String, &String)> = self.properties.iter().collect();
167 |         entries.sort_unstable();
168 |         let mut properties_string = String::new();
169 |         for (key, value) in entries {
170 |             properties_string.push_str(key);
171 |             properties_string.push('=');
172 |             properties_string.push_str(value);
173 |             properties_string.push(';');
174 |         }
175 |         dest_writer.write_all(properties_string.as_bytes())?;
176 |         hasher.update(properties_string.as_bytes());
177 |         dest_writer.write_all(&TERMINATOR)?;
178 |         hasher.update(&TERMINATOR);
179 | 
180 |         let checksum = hasher.finalize();
181 |         dest_writer.write_all(&checksum.to_le_bytes())?;
182 | 
183 |         Ok(())
184 |     }
185 | 
186 |     /// Get property value for the given key, if available.
187 |     pub fn get(&self, key: &str) -> Option<String> {
188 |         self.properties.get(key).cloned()
189 |     }
190 | }
183 | 
184 | #[cfg(test)]
185 | mod tests {
186 |     use super::*;
187 |     use crate::tests::init;
188 |     use std::io::BufReader;
189 | 
190 |     #[test] // reads a minimal control info without properties
191 |     fn read_info() -> color_eyre::Result<()> {
192 |         init();
193 |         let info = b"$HDT\x01<http://purl.org/HDT/hdt#HDTv1>\x00\x00\x76\x35";
194 |         let mut reader = BufReader::new(&info[..]);
195 | 
196 |         let info = ControlInfo::read(&mut reader)?;
197 |         assert_eq!(info.control_type, ControlType::Global);
198 |         assert_eq!(info.format, "<http://purl.org/HDT/hdt#HDTv1>");
199 |         assert!(info.properties.is_empty());
200 |         Ok(())
201 |     }
202 | 
203 |     #[test] // writes a control info with one property and reads the expected bytes back
204 |     fn write_info() -> color_eyre::Result<()> {
205 |         init();
206 |         let control_type = ControlType::Global;
207 |         let format = "<http://purl.org/HDT/hdt#HDTv1>".to_owned();
208 |         let mut properties = HashMap::<String, String>::new();
209 |         properties.insert("Software".to_owned(), "hdt_rs".to_owned());
210 |         let info = ControlInfo { control_type, format, properties };
211 | 
212 |         let mut buffer = Vec::new();
213 |         info.save(&mut buffer).expect("saving the control info failed"); // do not silently ignore a failed save
214 | 
215 |         let expected = b"$HDT\x01<http://purl.org/HDT/hdt#HDTv1>\x00Software=hdt_rs;\x00\x52\x22";
216 |         assert_eq!(buffer, expected);
217 | 
218 |         let mut reader = BufReader::new(&expected[..]);
219 |         let info2 = ControlInfo::read(&mut reader)?;
220 |         assert_eq!(info, info2);
221 |         Ok(())
222 |     }
223 | }
224 | 


--------------------------------------------------------------------------------
/src/containers/mod.rs:
--------------------------------------------------------------------------------
 1 | /// In-memory RDF representation.
 2 | pub mod rdf;
 3 | 
 4 | /// Variable length numbers.
 5 | pub mod vbyte;
 6 | 
 7 | // byte containers
 8 | mod adj_list;
 9 | mod bitmap;
10 | mod sequence;
11 | 
12 | // control info section reader
13 | mod control_info;
14 | 
15 | pub use adj_list::AdjList;
16 | pub use bitmap::{Bitmap, BitmapReadError};
17 | pub use control_info::{ControlInfo, ControlInfoReadError, ControlType};
18 | pub use sequence::{Sequence, SequenceReadError};
19 | 


--------------------------------------------------------------------------------
/src/containers/rdf.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | 
  3 | /// Represents an RDF triple consisting of subject, predicate and object.
  4 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
  5 | pub struct Triple {
  6 |     /// Named IRI or blank node.
  7 |     pub subject: Id,
  8 |     /// IRI, stored as a plain string.
  9 |     pub predicate: String,
 10 |     /// Named IRI, blank node or literal.
 11 |     pub object: Term,
 12 | }
 13 | 
 14 | impl Triple {
 15 |     /// Builds a triple out of its three components.
 16 |     pub const fn new(subject: Id, predicate: String, object: Term) -> Self {
 17 |         Self { subject, predicate, object }
 18 |     }
 19 | }
 20 | 
 21 | impl fmt::Debug for Triple {
 22 |     /// Prints the debug representations of subject, predicate and object, followed by " .".
 23 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 24 |         let Triple { subject, predicate, object } = self;
 25 |         write!(f, "{:?} {:?} {:?} .", subject, predicate, object)
 26 |     }
 27 | }
 26 | 
 27 | /// RDF identifiers can either be Internationalized Resource Identifiers (IRIs) or blank node
 28 | /// identifiers. The latter are random identifiers which should be unique to the graph they are
 29 | /// contained in. Both kinds are stored as plain strings.
 30 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
 31 | pub enum Id {
 32 |     /// IRI
 33 |     Named(String),
 34 |     /// Blank node identifier
 35 |     Blank(String),
 36 | }
 37 | 
 38 | // Custom debug implementation that hides the enum variant tag when printing, which saves
 39 | // screen space. Both variants are printed the same way: the string in double quotes.
 40 | impl fmt::Debug for Id {
 41 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 42 |         match self {
 43 |             Id::Named(iri) => write!(f, "\"{iri}\""),
 44 |             Id::Blank(id) => write!(f, "\"{id}\""),
 45 |         }
 46 |     }
 47 | }
 48 | 
 49 | /// RDF terms are either identifiers (IRIs or blank nodes) or literals.
 50 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
 51 | pub enum Term {
 52 |     /// Named IRI or blank node.
 53 |     Id(Id),
 54 |     /// Literal value.
 55 |     Literal(Literal),
 56 | }
 57 | 
 58 | // Custom debug implementation that omits the enum variant tag to save screen space:
 59 | // a term is printed exactly like the identifier or literal it wraps.
 60 | impl fmt::Debug for Term {
 61 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 62 |         let inner: &dyn fmt::Debug = match self {
 63 |             Term::Id(id) => id,
 64 |             Term::Literal(lit) => lit,
 65 |         };
 66 |         inner.fmt(f)
 67 |     }
 68 | }
 68 | 
 69 | /// RDF Literals always have a lexical 'form' as per
 70 | /// [RDF 1.1 Concepts And Abstract Syntax](https://www.w3.org/TR/rdf11-concepts/#dfn-literal).
 71 | ///
 72 | /// They can optionally contain a datatype describing how the literal form maps to a literal value
 73 | /// (the default type is [xs:string](http://www.w3.org/2001/XMLSchema#string), but we do not store
 74 | /// this).
 75 | /// If the datatype is [rdf:langString](http://www.w3.org/1999/02/22-rdf-syntax-ns#langString),
 76 | /// we can optionally supply a language tag ([BCP47](https://tools.ietf.org/html/bcp47)) such as
 77 | /// `"nl"` or `"fr"`.
 78 | ///
 79 | /// # Examples
 80 | /// ```
 81 | /// // string
 82 | /// use hdt::containers::rdf::Literal;
 83 | /// let literal = Literal::new(String::from("hello"));
 84 | /// assert_eq!("\"hello\"", format!("{:?}", literal));
 85 | /// ```
 86 | /// ```
 87 | /// // typed literal
 88 | /// use hdt::containers::rdf::Literal;
 89 | /// let type_iri = String::from("http://www.w3.org/2001/XMLSchema#integer");
 90 | /// let typed_literal = Literal::new_typed(String::from("42"), type_iri);
 91 | /// assert_eq!("\"42\"^^http://www.w3.org/2001/XMLSchema#integer", format!("{:?}", typed_literal));
 92 | /// ```
 93 | /// ```
 94 | /// // language tagged string
 95 | /// use hdt::containers::rdf::Literal;
 96 | /// let lang_tag = String::from("nl");
 97 | /// let lang_string = Literal::new_lang(String::from("hallo wereld"), lang_tag);
 98 | /// assert_eq!("\"hallo wereld\"@nl", format!("{:?}", lang_string));
 99 | /// ```
100 | #[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
101 | pub struct Literal {
102 |     form: String, // lexical form
103 |     datatype: Option<String>, // datatype IRI, None for the default type
104 |     lang: Option<String>, // language tag, only set by `new_lang`
105 | }
106 | 
107 | // Custom debug implementation that hides the structure tags to save screen space.
108 | // A language tag takes precedence over the datatype, plain literals are just quoted.
109 | impl fmt::Debug for Literal {
110 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
111 |         match (&self.lang, &self.datatype) {
112 |             (Some(lang), _) => write!(f, "\"{}\"@{lang}", self.form),
113 |             (None, Some(dtype)) => write!(f, "\"{}\"^^{dtype}", self.form),
114 |             (None, None) => write!(f, "\"{}\"", self.form),
115 |         }
116 |     }
117 | }
120 | 
121 | impl Literal {
122 |     /// Create a new literal with type [xs:string](http://www.w3.org/2001/XMLSchema#string) (which
123 |     /// we do not store since it is the default type).
124 |     pub const fn new(form: String) -> Self {
125 |         Literal { form, datatype: None, lang: None }
126 |     }
127 | 
128 |     /// Create a new literal with a given form and datatype.
129 |     pub const fn new_typed(form: String, datatype: String) -> Self {
130 |         Literal { form, datatype: Some(datatype), lang: None }
131 |     }
132 | 
133 |     /// Create a new literal with a given form and language. Automatically sets the type to
134 |     /// [rdf:langString](http://www.w3.org/1999/02/22-rdf-syntax-ns#langString)
135 |     pub fn new_lang(form: String, lang: String) -> Self {
136 |         let datatype = String::from("http://www.w3.org/1999/02/22-rdf-syntax-ns#langString");
137 | 
138 |         Literal { form, datatype: Some(datatype), lang: Some(lang) }
139 |     }
140 | }
141 | 


--------------------------------------------------------------------------------
/src/containers/sequence.rs:
--------------------------------------------------------------------------------
  1 | use super::vbyte::encode_vbyte;
  2 | use crate::containers::vbyte::read_vbyte;
  3 | use bytesize::ByteSize;
  4 | #[cfg(feature = "cache")]
  5 | use serde::{self, Deserialize, Serialize};
  6 | use std::fs::File;
  7 | use std::io::{BufRead, BufWriter, Write};
  8 | use std::mem::size_of;
  9 | use std::thread;
 10 | use std::{error, fmt};
 11 | 
 12 | const USIZE_BITS: usize = usize::BITS as usize; // number of bits in one block of `Sequence::data`
 13 | 
 14 | /// Integer sequence with a given number of bits per entry, which means numbers may be stored across byte boundaries.
 15 | //#[derive(Clone)]
 16 | #[cfg_attr(feature = "cache", derive(Deserialize, Serialize))]
 17 | pub struct Sequence {
 18 |     /// Number of integers in the sequence.
 19 |     pub entries: usize,
 20 |     /// Number of bits that each integer uses.
 21 |     pub bits_per_entry: usize,
 22 |     /// Data in blocks of `usize`; `read` and `get` treat the blocks as one continuous sequence of bits.
 23 |     pub data: Vec<usize>,
 24 |     /// Handle of the thread spawned by `read` to validate the CRC32 of the data, yields true if the checksum matches.
 25 |     #[cfg_attr(feature = "cache", serde(skip))]
 26 |     pub crc_handle: Option<thread::JoinHandle<bool>>,
 27 | }
 28 | 
 29 | /// The error type for `Sequence::read`.
 30 | #[derive(thiserror::Error, Debug)]
 31 | pub enum SequenceReadError {
 32 |     #[error("IO error")]
 33 |     Io(#[from] std::io::Error),
 34 |     #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")]
 35 |     InvalidCrc8Checksum(u8, u8), // (calculated checksum, checksum read from the data)
 36 |     #[error("Failed to turn raw bytes into usize")]
 37 |     TryFromSliceError(#[from] std::array::TryFromSliceError),
 38 |     #[error("invalid LogArray type {0} != 1")]
 39 |     InvalidLogArrayType(u8), // the type byte that was read
 40 |     #[error("entry size of {0} bit too large (>64 bit)")]
 41 |     EntrySizeTooLarge(usize), // the number of bits per entry that was read
 42 | }
 43 | 
 44 | impl fmt::Debug for Sequence {
 45 |     /// Prints a summary (size, number of entries, bits per entry) instead of the data itself.
 46 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 47 |         let heap_size = ByteSize(self.size_in_bytes() as u64);
 48 |         write!(f, "{} with {} entries, {} bits per entry", heap_size, self.entries, self.bits_per_entry)
 49 |     }
 50 | }
 55 | 
 56 | pub struct SequenceIter<'a> { // iterates over the entries of a borrowed Sequence in index order
 57 |     sequence: &'a Sequence,
 58 |     i: usize, // index of the entry returned by the next call to `next`
 59 | }
 60 | 
 61 | impl Iterator for SequenceIter<'_> {
 62 |     type Item = usize;
 63 | 
 64 |     /// Yields the entry at the current index and advances, or None once all entries were returned.
 65 |     fn next(&mut self) -> Option<Self::Item> {
 66 |         let has_more = self.i < self.sequence.entries;
 67 |         has_more.then(|| {
 68 |             let current = self.sequence.get(self.i);
 69 |             self.i += 1;
 70 |             current
 71 |         })
 72 |     }
 73 | }
 72 | 
 73 | impl<'a> IntoIterator for &'a Sequence {
 74 |     type Item = usize;
 75 |     type IntoIter = SequenceIter<'a>;
 76 | 
 77 |     fn into_iter(self) -> Self::IntoIter {
 78 |         SequenceIter { sequence: self, i: 0 } // start at the first entry
 79 |     }
 80 | }
 81 | 
 82 | impl Sequence {
 83 |     /// Get the integer at the given index, counting from 0.
 84 |     ///
 85 |     /// # Panics
 86 |     /// Panics if the bits of the requested entry lie outside of `data`.
 87 |     pub fn get(&self, index: usize) -> usize {
 88 |         // Entries without bits can only be 0. Returning early also keeps all shifts below
 89 |         // smaller than USIZE_BITS, shifting by the full width would overflow.
 90 |         if self.bits_per_entry == 0 {
 91 |             return 0;
 92 |         }
 93 | 
 94 |         let scaled_index = index * self.bits_per_entry;
 95 |         let block_index = scaled_index / USIZE_BITS;
 96 |         let bit_index = scaled_index % USIZE_BITS;
 97 | 
 98 |         let result_shift = USIZE_BITS - self.bits_per_entry;
 99 |         if bit_index + self.bits_per_entry <= USIZE_BITS {
100 |             // the entry lies within one block: shift left to drop the higher bits, then right to drop the lower ones
101 |             let block_shift = USIZE_BITS - bit_index - self.bits_per_entry;
102 |             (self.data[block_index] << block_shift) >> result_shift
103 |         } else {
104 |             // the entry spans two blocks: low bits from the end of this block, high bits from the start of the next
105 |             let block_shift = (USIZE_BITS << 1) - bit_index - self.bits_per_entry;
106 |             let low = self.data[block_index] >> bit_index;
107 |             low | ((self.data[block_index + 1] << block_shift) >> result_shift)
108 |         }
109 |     }
110 | 
111 |     /// Size in bytes on the heap.
112 |     pub fn size_in_bytes(&self) -> usize {
113 |         (self.data.len() * USIZE_BITS) >> 3
114 |     }
115 | 
116 |     /// Read sequence including metadata from HDT data.
117 |     ///
118 |     /// The metadata (type, bits per entry, number of entries) is verified against its CRC8 right away.
119 |     /// The CRC32 of the body is checked on a separate thread, whose handle is stored in `crc_handle`.
120 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<Self, SequenceReadError> {
121 |         use SequenceReadError::*;
122 |         // read entry metadata
123 |         // keep track of history for CRC8
124 |         let mut history: Vec<u8> = Vec::new();
125 | 
126 |         // read and validate type
127 |         let mut buffer = [0_u8];
128 |         reader.read_exact(&mut buffer)?;
129 |         history.extend_from_slice(&buffer);
130 |         if buffer[0] != 1 {
131 |             return Err(InvalidLogArrayType(buffer[0]));
132 |         }
133 | 
134 |         // read number of bits per entry
135 |         let mut buffer = [0_u8];
136 |         reader.read_exact(&mut buffer)?;
137 |         history.extend_from_slice(&buffer);
138 |         let bits_per_entry = buffer[0] as usize;
139 |         if bits_per_entry > USIZE_BITS {
140 |             return Err(EntrySizeTooLarge(bits_per_entry));
141 |         }
142 | 
143 |         // read number of entries
144 |         let (entries, bytes_read) = read_vbyte(reader)?;
145 |         history.extend_from_slice(&bytes_read);
146 | 
147 |         // read entry metadata CRC8
148 |         let mut crc_code = [0_u8];
149 |         reader.read_exact(&mut crc_code)?;
150 |         let crc_code = crc_code[0];
151 | 
152 |         // validate entry metadata CRC8
153 |         let crc8 = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
154 |         let mut digest = crc8.digest();
155 |         digest.update(&history);
156 | 
157 |         let crc_calculated = digest.finalize();
158 |         if crc_calculated != crc_code {
159 |             return Err(InvalidCrc8Checksum(crc_calculated, crc_code));
160 |         }
161 | 
162 |         // read body data
163 |         // reject sizes that overflow instead of wrapping around (or panicking in debug builds)
164 |         let total_bits = bits_per_entry.checked_mul(entries).ok_or_else(|| {
165 |             std::io::Error::new(std::io::ErrorKind::InvalidData, "total number of bits does not fit into a usize")
166 |         })?;
167 |         // read all but the last block, since the last one is byte aligned
168 |         let full_byte_amount = (total_bits.div_ceil(USIZE_BITS).saturating_sub(1)) * size_of::<usize>();
169 |         let mut full_words = vec![0_u8; full_byte_amount];
170 |         reader.read_exact(&mut full_words)?;
171 |         // one slot per full block plus one for the last block that is pushed below
172 |         let mut data: Vec<usize> = Vec::with_capacity(full_byte_amount / size_of::<usize>() + 1);
173 | 
174 |         // turn the raw bytes into usize values
175 |         for word in full_words.chunks_exact(size_of::<usize>()) {
176 |             data.push(usize::from_le_bytes(<[u8; size_of::<usize>()]>::try_from(word)?));
177 |         }
178 | 
179 |         // keep track of history for CRC32
180 |         let mut history = full_words;
181 | 
182 |         // read the last few bits, byte aligned
183 |         let mut bits_read = 0;
184 |         let mut last_value: usize = 0;
185 |         let last_entry_bits = if total_bits == 0 { 0 } else { ((total_bits - 1) % USIZE_BITS) + 1 };
186 | 
187 |         while bits_read < last_entry_bits {
188 |             let mut buffer = [0u8];
189 |             reader.read_exact(&mut buffer)?;
190 |             history.extend_from_slice(&buffer);
191 |             last_value |= (buffer[0] as usize) << bits_read;
192 |             // every byte contributes 8 bits, independent of the size of usize
193 |             bits_read += 8;
194 |         }
195 |         data.push(last_value);
196 |         // read entry body CRC32
197 |         let mut crc_code = [0_u8; 4];
198 |         reader.read_exact(&mut crc_code)?;
199 |         // validate the checksum in the background, the result is available by joining the handle
200 |         let crc_handle = Some(thread::spawn(move || {
201 |             let crc_code = u32::from_le_bytes(crc_code);
202 | 
203 |             // validate entry body CRC32
204 |             let crc32 = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
205 |             let mut digest = crc32.digest();
206 |             digest.update(&history);
207 |             digest.finalize() == crc_code
208 |         }));
209 | 
210 |         Ok(Sequence { entries, bits_per_entry, data, crc_handle })
211 |     }
212 | 
213 |     /// Write the metadata (type, bits per entry, number of entries) followed by its CRC8,
214 |     /// then the output of `pack_bits` followed by its CRC32.
215 |     ///
216 |     /// # Errors
217 |     /// Fails if writing fails or if `bits_per_entry` does not fit into a byte.
218 |     pub fn save(&self, dest_writer: &mut BufWriter<File>) -> Result<(), Box<dyn error::Error>> {
219 |         let crc = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
220 |         let mut hasher = crc.digest();
221 |         // libhdt/src/sequence/LogSequence2.cpp::save()
222 |         // Write the type. write_all is used throughout because a plain write may stop after a part of the buffer.
223 |         let seq_type: [u8; 1] = [1];
224 |         dest_writer.write_all(&seq_type)?;
225 |         hasher.update(&seq_type);
226 |         // Write numbits
227 |         let bits_per_entry: [u8; 1] = [u8::try_from(self.bits_per_entry)?];
228 |         dest_writer.write_all(&bits_per_entry)?;
229 |         hasher.update(&bits_per_entry);
230 |         // Write numentries
231 |         let buf = &encode_vbyte(self.entries);
232 |         dest_writer.write_all(buf)?;
233 |         hasher.update(buf);
234 |         let checksum = hasher.finalize();
235 |         dest_writer.write_all(&checksum.to_le_bytes())?;
236 | 
237 |         // Write data
238 |         let crc = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
239 |         let mut hasher = crc.digest();
240 |         let offset_data = self.pack_bits();
241 |         dest_writer.write_all(&offset_data)?;
242 |         hasher.update(&offset_data);
243 |         let checksum = hasher.finalize();
244 |         dest_writer.write_all(&checksum.to_le_bytes())?;
245 | 
246 |         Ok(())
247 |     }
248 | 
249 |     /// Packs the lowest `bits_per_entry` bits of every element of `data` into bytes, least significant bits first.
250 |     // NOTE(review): this treats every element of `data` as a single entry, while `read` and `get` treat `data` as packed blocks — confirm what callers of `save` store in `data`.
251 |     fn pack_bits(&self) -> Vec<u8> {
252 |         // mask for the relevant bits, a shift by the full width of usize would overflow
253 |         let mask = if self.bits_per_entry >= USIZE_BITS { usize::MAX } else { (1 << self.bits_per_entry) - 1 };
254 |         let mut output = Vec::new();
255 |         let mut current_byte = 0u8;
256 |         let mut bit_offset = 0;
257 | 
258 |         for &value in &self.data {
259 |             let mut val = value & mask;
260 |             let mut bits_left = self.bits_per_entry;
261 | 
262 |             while bits_left > 0 {
263 |                 let available = 8 - bit_offset;
264 |                 let to_write = bits_left.min(available);
265 | 
266 |                 // Shift bits to align with current byte offset
267 |                 current_byte |= ((val & ((1 << to_write) - 1)) as u8) << bit_offset;
268 | 
269 |                 bit_offset += to_write;
270 |                 val >>= to_write;
271 |                 bits_left -= to_write;
272 | 
273 |                 if bit_offset == 8 {
274 |                     output.push(current_byte);
275 |                     current_byte = 0;
276 |                     bit_offset = 0;
277 |                 }
278 |             }
279 |         }
280 | 
281 |         // Push final byte if there's remaining bits
282 |         if bit_offset > 0 {
283 |             output.push(current_byte);
284 |         }
285 | 
286 |         output
287 |     }
288 | }
264 | 


--------------------------------------------------------------------------------
/src/containers/vbyte.rs:
--------------------------------------------------------------------------------
  1 | use std::io;
  2 | use std::io::BufRead;
  3 | 
  4 | const MAX_VBYTE_BYTES: usize = usize::BITS as usize / 7 + 1;
  5 | 
  6 | /// little endian
  7 | pub fn read_vbyte<R: BufRead>(reader: &mut R) -> io::Result<(usize, Vec<u8>)> {
  8 |     use io::Error;
  9 |     use io::ErrorKind::InvalidData;
 10 | 
 11 |     let mut n: u128 = 0;
 12 |     let mut shift = 0;
 13 |     let mut buffer = [0u8];
 14 |     let mut bytes_read = Vec::new();
 15 |     reader.read_exact(&mut buffer)?;
 16 |     bytes_read.extend_from_slice(&buffer);
 17 | 
 18 |     while (buffer[0] & 0x80) == 0 {
 19 |         if bytes_read.len() >= MAX_VBYTE_BYTES {
 20 |             return Err(Error::new(InvalidData, "Tried to read a VByte that does not fit into a usize"));
 21 |         }
 22 | 
 23 |         n |= ((buffer[0] & 127) as u128) << shift;
 24 |         reader.read_exact(&mut buffer)?;
 25 |         bytes_read.extend_from_slice(&buffer);
 26 |         // IMPORTANT: The original implementation has an off-by-one error here, hence we
 27 |         // have to copy the same off-by-one error in order to read the file format.
 28 |         // The correct implementation is supposed to shift by 8! Look at the commented out
 29 |         // tests at the bottom of the file for proof.
 30 |         shift += 7;
 31 |     }
 32 | 
 33 |     n |= ((buffer[0] & 127) as u128) << shift;
 34 | 
 35 |     if let Ok(valid) = usize::try_from(n) {
 36 |         Ok((valid, bytes_read))
 37 |     } else {
 38 |         Err(Error::new(InvalidData, "Tried to read a VByte that does not fit into a usize"))
 39 |     }
 40 | }
 41 | 
 42 | /// decode vbyte with offset
 43 | pub const fn decode_vbyte_delta(data: &[u8], offset: usize) -> (usize, usize) {
 44 |     let mut n: usize = 0;
 45 |     let mut shift: usize = 0;
 46 |     let mut byte_amount = 0;
 47 | 
 48 |     while (data[offset + byte_amount] & 0x80) == 0 {
 49 |         n |= ((data[offset + byte_amount] & 127) as usize) << shift;
 50 |         byte_amount += 1;
 51 |         shift += 7;
 52 |     }
 53 | 
 54 |     n |= ((data[offset + byte_amount] & 127) as usize) << shift;
 55 |     byte_amount += 1;
 56 | 
 57 |     (n, byte_amount)
 58 | }
 59 | 
 60 | /// little endian
 61 | pub fn encode_vbyte(n: usize) -> Vec<u8> {
 62 |     let mut bytes = Vec::new();
 63 |     let mut n = n;
 64 | 
 65 |     while n > 127 {
 66 |         bytes.push((n & 127) as u8);
 67 |         // IMPORTANT: The original implementation has an off-by-one error here, hence we
 68 |         // have to copy the same off-by-one error in order to read the file format.
 69 |         // The correct implementation is supposed to shift by 8! Look at the commented out
 70 |         // tests at the bottom of the file for proof.
 71 |         n >>= 7;
 72 |     }
 73 | 
 74 |     bytes.push((n | 0x80) as u8);
 75 |     bytes
 76 | }
 77 | 
 78 | #[cfg(test)]
 79 | mod tests {
 80 |     use super::*;
 81 |     use crate::tests::init;
 82 |     use std::io::BufReader;
 83 | 
 84 |     #[test]
 85 |     fn test_encode_decode() {
 86 |         init();
 87 |         let buffer = encode_vbyte(824);
 88 |         let mut reader = BufReader::new(&buffer[..]);
 89 |         if let Ok((number, bytes_read)) = read_vbyte(&mut reader) {
 90 |             assert_eq!(number, 824);
 91 |             assert_eq!(bytes_read, buffer);
 92 |         } else {
 93 |             panic!("Failed to read vbyte");
 94 |         }
 95 |     }
 96 | 
 97 |     #[test]
 98 |     fn test_max_value() {
 99 |         init();
100 |         let buffer = encode_vbyte(usize::MAX);
101 |         let mut reader = BufReader::new(&buffer[..]);
102 |         if let Ok((number, bytes_read)) = read_vbyte(&mut reader) {
103 |             assert_eq!(number, usize::MAX);
104 |             assert_eq!(bytes_read, buffer);
105 |         } else {
106 |             panic!("Failed to read vbyte");
107 |         }
108 |     }
109 | 
110 |     #[test]
111 |     #[should_panic(expected = "Tried to read a VByte that does not fit into a usize")]
112 |     fn test_decode_too_large() {
113 |         init();
114 |         let mut buffer = encode_vbyte(usize::MAX);
115 |         buffer[MAX_VBYTE_BYTES - 1] &= 0x7F;
116 |         buffer.push(0x7F);
117 |         let mut reader = BufReader::new(&buffer[..]);
118 |         read_vbyte(&mut reader).unwrap();
119 |     }
120 | 
121 |     // These tests show the off-by-one bug in the current implementation, but
122 |     // we need to keep the bug in order to read the current version of .hdt files.
123 |     //
124 |     // #[test]
125 |     // fn test_encode() {
126 |     //     assert_eq!(encode_vbyte(824), vec![0x38_u8, 0x83_u8])
127 |     // }
128 |     //
129 |     // #[test]
130 |     // fn test_decode() {
131 |     //     // this represents 824
132 |     //     // 0011 1000 1000 0011
133 |     //     // 0x38      0x83
134 |     //     let buffer = b"\x38\x83";
135 |     //     let mut reader = BufReader::new(&buffer[..]);
136 |     //     if let Ok((number, bytes_read)) = read_vbyte(&mut reader) {
137 |     //         assert_eq!(number, 824);
138 |     //         assert_eq!(bytes_read, vec![0x38_u8, 0x83_u8]);
139 |     //     } else {
140 |     //         panic!("Failed to read vbyte");
141 |     //     }
142 |     // }
143 | }
144 | 


--------------------------------------------------------------------------------
/src/dict_sect_pfc.rs:
--------------------------------------------------------------------------------
  1 | #![allow(missing_docs)] // temporarily while we figure out what should be public in the end
  2 | /// Dictionary section with plain front coding.
  3 | /// See <https://www.rdfhdt.org/hdt-binary-format/#DictionarySectionPlainFrontCoding>.
  4 | use crate::containers::vbyte::{decode_vbyte_delta, encode_vbyte, read_vbyte};
  5 | use crate::containers::{Sequence, SequenceReadError};
  6 | use crate::triples::Id;
  7 | use bytesize::ByteSize;
  8 | use log::error;
  9 | use std::cmp::{Ordering, min};
 10 | use std::error;
 11 | use std::fmt;
 12 | use std::fs::File;
 13 | use std::io::{BufRead, BufWriter, Write};
 14 | use std::str;
 15 | use std::sync::Arc;
 16 | use std::thread::{JoinHandle, spawn};
 17 | use thiserror::Error;
 18 | 
/// Dictionary section with plain front coding.
///
/// The strings are grouped into blocks of `block_size` strings. The first string of each block
/// is stored in full; every following string is stored as a VByte holding the number of bytes
/// it shares with its predecessor, followed by the remaining suffix. All strings are terminated
/// by a zero byte.
//#[derive(Clone)]
pub struct DictSectPFC {
    /// total number of strings stored
    pub num_strings: usize,
    /// number of strings per block; the last block may have less than "block_size" strings
    pub block_size: usize,
    /// stores the starting position of each block inside `packed_data`
    pub sequence: Sequence,
    /// the front coded, zero terminated strings of all blocks
    pub packed_data: Arc<[u8]>,
}
 31 | 
/// The error type for the DictSectPFC read function.
#[derive(thiserror::Error, Debug)]
pub enum DictSectReadError {
    /// the underlying reader failed
    #[error("IO error")]
    Io(#[from] std::io::Error),
    /// the CRC8 over the section meta data does not match;
    /// the first value is the calculated checksum, the second one is the checksum stored in the file
    #[error("Invalid CRC8-CCIT checksum {0}, expected {1}")]
    InvalidCrc8Checksum(u8, u8),
    /// the section type byte is not the one of a plain front coded section
    #[error("Implementation only supports plain front coded dictionary sections")]
    DictSectNotPfc,
    /// the sequence with the block offsets could not be read
    #[error("sequence read error")]
    SequenceReadError(#[from] SequenceReadError),
}
 44 | 
 45 | impl fmt::Debug for DictSectPFC {
 46 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 47 |         write!(
 48 |             f,
 49 |             "total size {}, {} strings, sequence {:?}, packed data {}",
 50 |             ByteSize(self.size_in_bytes() as u64),
 51 |             self.num_strings,
 52 |             self.sequence,
 53 |             ByteSize(self.packed_data.len() as u64)
 54 |         )
 55 |     }
 56 | }
 57 | 
/// The error type for extracting a string from a dictionary section.
#[derive(Error, Debug)]
pub enum ExtractError {
    /// the requested ID is larger than the number of strings in the section
    #[error("index out of bounds: id {id} > dictionary section len {len}")]
    IdOutOfBounds { id: Id, len: usize },
    /// the stored bytes are not valid UTF-8; `data` holds the raw bytes and
    /// `recovered` their lossy conversion
    #[error("read invalid UTF-8 sequence in {data:?}, recovered: '{recovered}'")]
    InvalidUtf8 { source: std::str::Utf8Error, data: Vec<u8>, recovered: String },
}
 65 | 
 66 | impl DictSectPFC {
 67 |     /// size in bytes of the dictionary section
 68 |     pub fn size_in_bytes(&self) -> usize {
 69 |         self.sequence.size_in_bytes() + self.packed_data.len()
 70 |     }
 71 | 
 72 |     /*
 73 |     // TODO: fix this
 74 |     fn decode(string: String) -> String {
 75 |         let mut split: Vec<String> = string.rsplit('"').map(String::from).collect();
 76 | 
 77 |         if split.len() > 2 {
 78 |             split = split.into_iter().skip(1).collect();
 79 |             split[0] = format!("\"{}\"", split[0]);
 80 |             split.into_iter().collect()
 81 |         } else {
 82 |             split[0].clone()
 83 |         }
 84 |     }
 85 |     */
 86 | 
 87 |     fn index_str(&self, index: usize) -> &str {
 88 |         let position: usize = self.sequence.get(index);
 89 |         let length = self.strlen(position);
 90 |         str::from_utf8(&self.packed_data[position..position + length]).unwrap()
 91 |     }
 92 | 
 93 |     /// translated from Java
 94 |     /// https://github.com/rdfhdt/hdt-java/blob/master/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java
 95 |     /// 0 means not found
 96 |     pub fn string_to_id(&self, element: &str) -> Id {
 97 |         if self.num_strings == 0 {
 98 |             // shared dictionary may be empty
 99 |             return 0;
100 |         }
101 |         // binary search
102 |         let mut low: usize = 0;
103 |         let mut high = self.sequence.entries.saturating_sub(2); // should be -1 but only works with -2, investigate
104 |         let max = high;
105 |         let mut mid = high;
106 |         while low <= high {
107 |             mid = usize::midpoint(low, high);
108 | 
109 |             let cmp: Ordering = if mid > max {
110 |                 mid = max;
111 |                 break;
112 |             } else {
113 |                 let text = self.index_str(mid);
114 |                 element.cmp(text)
115 |                 //println!("mid: {} text: {} cmp: {:?}", mid, text, cmp);
116 |             };
117 |             match cmp {
118 |                 Ordering::Less => {
119 |                     if mid == 0 {
120 |                         return 0;
121 |                     }
122 |                     high = mid - 1;
123 |                 }
124 |                 Ordering::Greater => low = mid + 1,
125 |                 Ordering::Equal => return ((mid * self.block_size) + 1) as Id,
126 |             }
127 |         }
128 |         if high < mid {
129 |             mid = high;
130 |         }
131 |         let idblock = self.locate_in_block(mid, element);
132 |         if idblock == 0 {
133 |             return 0;
134 |         }
135 |         ((mid * self.block_size) + idblock + 1) as Id
136 |     }
137 | 
138 |     fn longest_common_prefix(a: &[u8], b: &[u8]) -> usize {
139 |         let len = min(a.len(), b.len());
140 |         let mut delta = 0;
141 |         while delta < len && a[delta] == b[delta] {
142 |             delta += 1;
143 |         }
144 |         delta
145 |     }
146 | 
147 |     fn locate_in_block(&self, block: usize, element: &str) -> usize {
148 |         if block >= self.sequence.entries {
149 |             return 0;
150 |         }
151 |         let element = element.as_bytes();
152 |         let mut pos = self.sequence.get(block);
153 |         let mut id_in_block = 0;
154 |         let mut cshared = 0;
155 | 
156 |         // Read the first string in the block
157 |         let slen = self.strlen(pos);
158 |         let mut temp_string: Vec<u8> = self.packed_data[pos..pos + slen].to_vec();
159 |         pos += slen + 1;
160 |         id_in_block += 1;
161 | 
162 |         while (id_in_block < self.block_size) && (pos < self.packed_data.len()) {
163 |             // Decode prefix
164 |             let (delta, vbyte_bytes) = decode_vbyte_delta(&self.packed_data, pos);
165 |             pos += vbyte_bytes;
166 | 
167 |             //Copy suffix
168 |             let slen = self.strlen(pos);
169 |             temp_string.truncate(delta);
170 |             temp_string.extend_from_slice(&self.packed_data[pos..pos + slen]);
171 |             if delta >= cshared {
172 |                 // Current delta value means that this string has a larger long common prefix than the previous one
173 |                 cshared += Self::longest_common_prefix(&temp_string[cshared..], &element[cshared..]);
174 | 
175 |                 if (cshared == element.len()) && (temp_string.len() == element.len()) {
176 |                     break;
177 |                 }
178 |             } else {
179 |                 // We have less common characters than before, this string is bigger that what we are looking for.
180 |                 // i.e. Not found.
181 |                 id_in_block = 0;
182 |                 break;
183 |             }
184 |             pos += slen + 1;
185 |             id_in_block += 1;
186 |         }
187 | 
188 |         if pos >= self.packed_data.len() || id_in_block == self.block_size {
189 |             id_in_block = 0;
190 |         }
191 |         id_in_block
192 |     }
193 | 
194 |     /// extract the string with the given ID from the dictionary
195 |     pub fn extract(&self, id: Id) -> Result<String, ExtractError> {
196 |         if id as usize > self.num_strings {
197 |             return Err(ExtractError::IdOutOfBounds { id, len: self.num_strings });
198 |         }
199 |         let block_index = id.saturating_sub(1) as usize / self.block_size;
200 |         let string_index = id.saturating_sub(1) as usize % self.block_size;
201 |         let mut position = self.sequence.get(block_index);
202 |         let mut slen = self.strlen(position);
203 |         let mut string: Vec<u8> = self.packed_data[position..position + slen].to_vec();
204 |         //println!("block_index={} string_index={}, string={}", block_index, string_index, str::from_utf8(&string).unwrap());
205 |         // loop takes around nearly half the time of the function
206 |         for _ in 0..string_index {
207 |             position += slen + 1;
208 |             let (delta, vbyte_bytes) = decode_vbyte_delta(&self.packed_data, position);
209 |             position += vbyte_bytes;
210 |             slen = self.strlen(position);
211 |             string.truncate(delta);
212 |             string.extend_from_slice(&self.packed_data[position..position + slen]);
213 |         }
214 |         // tried simdutf8::basic::from_utf8 but that didn't speed up extract that much
215 |         match str::from_utf8(&string) {
216 |             Ok(string) => Ok(String::from(string)),
217 |             Err(e) => Err(ExtractError::InvalidUtf8 {
218 |                 source: e,
219 |                 data: string.clone(),
220 |                 recovered: String::from_utf8_lossy(&string).into_owned(),
221 |             }),
222 |         }
223 |     }
224 | 
225 |     fn strlen(&self, offset: usize) -> usize {
226 |         let length = self.packed_data.len();
227 |         let mut position = offset;
228 | 
229 |         while position < length && self.packed_data[position] != 0 {
230 |             position += 1;
231 |         }
232 | 
233 |         position - offset
234 |     }
235 | 
    /// Total number of strings stored in this section.
    ///
    /// deprecated: we should be able to remove this as the `num_strings` field is public now
    pub const fn num_strings(&self) -> usize {
        self.num_strings
    }
240 | 
241 |     /// Returns an unverified dictionary section together with a handle to verify the checksum.
242 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<(Self, JoinHandle<bool>), DictSectReadError> {
243 |         use DictSectReadError::*;
244 |         let mut preamble = [0_u8];
245 |         reader.read_exact(&mut preamble)?;
246 |         if preamble[0] != 2 {
247 |             return Err(DictSectNotPfc);
248 |         }
249 | 
250 |         // read section meta data
251 |         let crc = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
252 |         let mut digest = crc.digest();
253 |         // The CRC includes the type of the block, inaccuracy in the spec, careful.
254 |         digest.update(&[0x02]);
255 |         // This was determined based on https://git.io/JthMG because the spec on this
256 |         // https://www.rdfhdt.org/hdt-binary-format was inaccurate, it's 3 vbytes, not 2.
257 |         let (num_strings, bytes_read) = read_vbyte(reader)?;
258 |         digest.update(&bytes_read);
259 |         let (packed_length, bytes_read) = read_vbyte(reader)?;
260 |         digest.update(&bytes_read);
261 |         let (block_size, bytes_read) = read_vbyte(reader)?;
262 |         digest.update(&bytes_read);
263 | 
264 |         // read section CRC8
265 |         let mut crc_code = [0_u8];
266 |         reader.read_exact(&mut crc_code)?;
267 |         let crc_code = crc_code[0];
268 | 
269 |         let crc_calculated = digest.finalize();
270 |         if crc_calculated != crc_code {
271 |             return Err(InvalidCrc8Checksum(crc_calculated, crc_code));
272 |         }
273 | 
274 |         // read sequence log array
275 |         let sequence = Sequence::read(reader)?;
276 | 
277 |         // read packed data
278 |         let mut packed_data = vec![0u8; packed_length];
279 |         reader.read_exact(&mut packed_data)?;
280 |         let packed_data = Arc::<[u8]>::from(packed_data);
281 | 
282 |         // read packed data CRC32
283 |         let mut crc_code = [0_u8; 4];
284 |         reader.read_exact(&mut crc_code)?;
285 |         let cloned_data = Arc::clone(&packed_data);
286 |         let crc_handle = spawn(move || {
287 |             let crc = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
288 |             let mut digest = crc.digest();
289 |             digest.update(&cloned_data[..]);
290 |             digest.finalize() == u32::from_le_bytes(crc_code)
291 |         });
292 | 
293 |         Ok((DictSectPFC { num_strings, block_size, sequence, packed_data }, crc_handle))
294 |     }
295 | 
296 |     /// counterpoint to the read method
297 |     // TODO: use Write trait and add test
298 |     pub fn save(&self, dest_writer: &mut BufWriter<File>) -> Result<(), Box<dyn error::Error>> {
299 |         let crc = crc::Crc::<u8>::new(&crc::CRC_8_SMBUS);
300 |         let mut hasher = crc.digest();
301 |         // libhdt/src/libdcs/CSD_PFC.cpp::save()
302 |         // save type
303 |         let seq_type: [u8; 1] = [2];
304 |         let _ = dest_writer.write(&seq_type)?;
305 |         hasher.update(&seq_type);
306 | 
307 |         // // Save sizes
308 |         let mut buf: Vec<u8> = vec![];
309 |         buf.extend_from_slice(&encode_vbyte(self.num_strings));
310 |         buf.extend_from_slice(&encode_vbyte(self.packed_data.len()));
311 |         buf.extend_from_slice(&encode_vbyte(self.block_size));
312 |         let _ = dest_writer.write(&buf)?;
313 |         hasher.update(&buf);
314 |         let checksum = hasher.finalize();
315 |         let _ = dest_writer.write(&checksum.to_le_bytes())?;
316 | 
317 |         self.sequence.save(dest_writer)?;
318 | 
319 |         // Write packed data
320 |         let crc = crc::Crc::<u32>::new(&crc::CRC_32_ISCSI);
321 |         let mut hasher = crc.digest();
322 |         let _ = dest_writer.write(&self.packed_data)?;
323 |         hasher.update(&self.packed_data);
324 |         // println!("{}", String::from_utf8_lossy(&self.compressed_terms));
325 |         let checksum = hasher.finalize();
326 |         let _ = dest_writer.write(&checksum.to_le_bytes())?;
327 | 
328 |         Ok(())
329 |     }
330 | }
331 | 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ControlInfo;
    use crate::header::Header;
    use crate::tests::init;
    use pretty_assertions::assert_eq;
    use std::fs::File;
    use std::io::BufReader;
    /* unused
    #[test]
    fn test_decode() {
        let s = String::from("^^<http://www.w3.org/2001/XMLSchema#integer>\"123\"");
        let d = DictSectPFC::decode(s);
        assert_eq!(d, "\"123\"^^<http://www.w3.org/2001/XMLSchema#integer>");
    }
    */

    /// Translates every term to its ID and back and expects to end up with the same term.
    fn assert_roundtrip(section: &DictSectPFC, terms: &[&str]) -> color_eyre::Result<()> {
        for &term in terms {
            let id = section.string_to_id(term);
            let back = section.extract(id)?;
            assert_eq!(term, back, "term does not translate back to itself {} -> {} -> {}", term, id, back);
        }
        Ok(())
    }

    /// The sequence data has to consist of exactly as many 64 bit words as the entries require.
    fn assert_word_count(sequence: &Sequence) {
        let data_size = (sequence.bits_per_entry * sequence.entries).div_ceil(64);
        assert_eq!(sequence.data.len(), data_size);
    }

    #[test]
    fn read_section_read() -> color_eyre::Result<()> {
        init();
        let file = File::open("tests/resources/snikmeta.hdt").expect("error opening file");
        let mut reader = BufReader::new(file);
        // skip the global control information and the header
        ControlInfo::read(&mut reader)?;
        Header::read(&mut reader)?;

        // read dictionary control information
        let dict_ci = ControlInfo::read(&mut reader)?;
        assert!(
            dict_ci.format == "<http://purl.org/HDT/hdt#dictionaryFour>",
            "invalid dictionary type: {:?}",
            dict_ci.format
        );

        // first section: terms that are used both as subject and as object
        let (shared, _) = DictSectPFC::read(&mut reader)?;
        assert_eq!(shared.num_strings, 43);
        assert_eq!(shared.packed_data.len(), 614);
        assert_eq!(shared.block_size, 16);
        assert_roundtrip(
            &shared,
            &["http://www.snik.eu/ontology/meta/Top", "http://www.snik.eu/ontology/meta/Function", "_:b1"],
        )?;
        assert_word_count(&shared.sequence);

        // second section: terms that are only used as subject
        let (subjects, _) = DictSectPFC::read(&mut reader)?;
        assert_eq!(subjects.num_strings, 6);
        assert_roundtrip(
            &subjects,
            &[
                "http://www.snik.eu/ontology/meta", "http://www.snik.eu/ontology/meta/feature",
                "http://www.snik.eu/ontology/meta/homonym", "http://www.snik.eu/ontology/meta/master",
                "http://www.snik.eu/ontology/meta/typicalFeature",
            ],
        )?;
        assert_word_count(&subjects.sequence);
        Ok(())
    }
}
396 | 


--------------------------------------------------------------------------------
/src/four_sect_dict.rs:
--------------------------------------------------------------------------------
  1 | #![allow(missing_docs)] // temporarily while we figure out what should be public in the end
  2 | use crate::ControlInfo;
  3 | use crate::DictSectPFC;
  4 | /// Four section dictionary.
  5 | use crate::dict_sect_pfc::{DictSectReadError, ExtractError};
  6 | use crate::triples::Id;
  7 | //use eyre::{Result, WrapErr, eyre};
  8 | use std::io;
  9 | use std::io::{BufRead, Error, ErrorKind};
 10 | use std::thread::JoinHandle;
 11 | use thiserror::Error;
 12 | 
/// Position in an RDF triple.
/// IDs are only meaningful together with their position, which determines the dictionary
/// sections they are looked up in.
#[derive(Debug, Clone)]
pub enum IdKind {
    /// IRI or blank node in the first position of a triple.
    Subject,
    /// IRI in the second position of a triple.
    Predicate,
    /// IRI, blank node or literal in the third position of a triple.
    Object,
}
 23 | 
/// Four section dictionary with plain front coding.
/// Dictionary with shared, subject, predicate and object sections.
/// Types specified as <http://purl.org/HDT/hdt#dictionaryFour>.
/// See <https://www.rdfhdt.org/hdt-internals/#dictionary>.
#[derive(Debug)]
pub struct FourSectDict {
    /// The shared section contains URIs that occur both in subject and object position. Its IDs start at one.
    pub shared: DictSectPFC,
    /// URIs that only occur as subjects. Their IDs start at the last ID of the shared section + 1.
    pub subjects: DictSectPFC,
    /// The predicate section has its own separate numbering starting from 1.
    pub predicates: DictSectPFC,
    /// URIs and literals that only occur as objects. Their IDs start at the last ID of the shared section + 1.
    pub objects: DictSectPFC,
}
 39 | 
/// Designates one of the four sections, used to report where an error occurred.
#[derive(Debug)]
pub enum SectKind {
    /// section for terms that appear as both subject and object
    Shared,
    /// section for terms that only appear as subjects
    Subject,
    /// section for terms that only appear as predicates
    Predicate,
    /// section for terms that only appear as objects
    Object,
}
 52 | 
/// Wraps an extraction error with additional information on which dictionary section it occurred in.
#[derive(Error, Debug)]
#[error("four sect dict error id_to_string({id},IdKind::{id_kind:?}) in the {sect_kind:?} section, caused by {e}")]
pub struct DictError {
    /// the underlying error of the dictionary section
    #[source]
    e: ExtractError,
    /// the ID as passed to `id_to_string`, i.e. before it was made relative to a section
    id: Id,
    /// the triple position the ID was requested for
    id_kind: &'static IdKind,
    /// the section in which the lookup failed
    sect_kind: SectKind,
}
 63 | 
/// Wraps a dictionary section read error with the section it occurred in.
#[derive(Error, Debug)]
#[error("four sect dict section error in the {sect_kind:?} section, caused by {e}")]
pub struct DictSectError {
    /// the underlying read error of the dictionary section
    #[source]
    e: DictSectReadError,
    /// the section that could not be read
    sect_kind: SectKind,
}
 71 | 
/// The error type for reading a four section dictionary.
/// All variants share the same display message, the details are carried by the variant payload.
#[derive(Error, Debug)]
#[error("error reading four section dictionary")]
pub enum DictReadError {
    /// the dictionary control information could not be read
    ControlInfo(#[from] crate::containers::ControlInfoReadError),
    /// one of the four dictionary sections could not be read
    DictSect(#[from] DictSectError),
    /// any other failure, such as an unsupported dictionary format
    Other(String),
}
 79 | 
 80 | impl FourSectDict {
 81 |     /// Get the string value of a given ID of a given type.
 82 |     /// String representation of URIs, literals and blank nodes is defined in <https://www.w3.org/Submission/2011/SUBM-HDT-20110330/#dictionaryEncoding>>..
 83 |     pub fn id_to_string(&self, id: Id, id_kind: &'static IdKind) -> Result<String, DictError> {
 84 |         use SectKind::*;
 85 |         let shared_size = self.shared.num_strings() as Id;
 86 |         let d = id.saturating_sub(shared_size);
 87 |         match id_kind {
 88 |             IdKind::Subject => {
 89 |                 if id <= shared_size {
 90 |                     self.shared.extract(id).map_err(|e| DictError { e, id, id_kind, sect_kind: Shared })
 91 |                 } else {
 92 |                     self.subjects.extract(d).map_err(|e| DictError { e, id, id_kind, sect_kind: Subject })
 93 |                 }
 94 |             }
 95 |             IdKind::Predicate => {
 96 |                 self.predicates.extract(id).map_err(|e| DictError { e, id, id_kind, sect_kind: Predicate })
 97 |             }
 98 |             IdKind::Object => {
 99 |                 if id <= shared_size {
100 |                     self.shared.extract(id).map_err(|e| DictError { e, id, id_kind, sect_kind: Shared })
101 |                 } else {
102 |                     self.objects.extract(d).map_err(|e| DictError { e, id, id_kind, sect_kind: Object })
103 |                 }
104 |             }
105 |         }
106 |     }
107 | 
108 |     /// Get the string value of an ID.
109 |     /// String representation of URIs, literals and blank nodes is defined in <https://www.w3.org/Submission/2011/SUBM-HDT-20110330/#dictionaryEncoding>>..
110 |     pub fn string_to_id(&self, s: &str, id_kind: &IdKind) -> Id {
111 |         let shared_size = self.shared.num_strings();
112 |         match id_kind {
113 |             IdKind::Subject => {
114 |                 let mut id = self.shared.string_to_id(s);
115 |                 if id == 0 {
116 |                     id = self.subjects.string_to_id(s);
117 |                     if id > 0 {
118 |                         id += shared_size as Id;
119 |                     }
120 |                 }
121 |                 id
122 |             }
123 |             IdKind::Predicate => self.predicates.string_to_id(s),
124 |             IdKind::Object => {
125 |                 let mut id = self.shared.string_to_id(s);
126 |                 if id == 0 {
127 |                     id = self.objects.string_to_id(s);
128 |                     if id > 0 {
129 |                         id += shared_size as Id;
130 |                     }
131 |                 }
132 |                 id
133 |             }
134 |         }
135 |     }
136 | 
137 |     /// read the whole dictionary section including control information
138 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<UnvalidatedFourSectDict, DictReadError> {
139 |         use SectKind::*;
140 |         let dict_ci = ControlInfo::read(reader)?;
141 |         if dict_ci.format != "<http://purl.org/HDT/hdt#dictionaryFour>" {
142 |             return Err(DictReadError::Other("Implementation only supports four section dictionaries".to_owned()));
143 |         }
144 |         let (shared, shared_crc) =
145 |             DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Shared })?;
146 |         let (subjects, subjects_crc) =
147 |             DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Subject })?;
148 |         let (predicates, predicates_crc) =
149 |             DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Predicate })?;
150 |         let (objects, objects_crc) =
151 |             DictSectPFC::read(reader).map_err(|e| DictSectError { e, sect_kind: Object })?;
152 | 
153 |         Ok(UnvalidatedFourSectDict {
154 |             four_sect_dict: FourSectDict { shared, subjects, predicates, objects },
155 |             crc_handles: [shared_crc, subjects_crc, predicates_crc, objects_crc],
156 |         })
157 |     }
158 |     /*
159 |     pub fn translate_all_ids(&self, triple_ids: &[TripleId]) -> Vec<(String, String, String)> {
160 |         triple_ids
161 |             .into_par_iter()
162 |             .map(|id: &TripleId| {
163 |                 let subject = self.id_to_string(id.subject_id, IdKind::Subject).unwrap();
164 |                 let predicate = self.id_to_string(id.predicate_id, IdKind::Predicate).unwrap();
165 |                 let object = self.id_to_string(id.object_id, IdKind::Object).unwrap();
166 |                 (subject, predicate, object)
167 |             })
168 |             .collect()
169 |     }
170 |     */
171 |     /// size in bytes of the in memory four section dictionary
172 |     pub fn size_in_bytes(&self) -> usize {
173 |         self.shared.size_in_bytes()
174 |             + self.subjects.size_in_bytes()
175 |             + self.predicates.size_in_bytes()
176 |             + self.objects.size_in_bytes()
177 |     }
178 | }
179 | 
/// A wrapper that prevents using a FourSectDict before its checksums have been validated.
pub struct UnvalidatedFourSectDict {
    /// the dictionary, only handed out by `validate`
    four_sect_dict: FourSectDict,
    /// one checksum verification thread per section in the order shared, subjects, predicates, objects
    crc_handles: [JoinHandle<bool>; 4],
}
185 | 
186 | impl UnvalidatedFourSectDict {
187 |     /// Validates the checksums of all dictionary sections in parallel.
188 |     /// Dict validation takes around 1200 ms on a single thread with an 1.5 GB HDT file on an i9-12900k.
189 |     /// This function must NOT be called more than once.
190 |     // TODO can this be simplified?
191 |     pub fn validate(self) -> io::Result<FourSectDict> {
192 |         let names = ["shared", "subject", "predicate", "object"];
193 |         for (name, handle) in names.iter().zip(self.crc_handles) {
194 |             if !handle.join().unwrap() {
195 |                 return Err(Error::new(
196 |                     ErrorKind::InvalidData,
197 |                     format!("CRC Error in {name} dictionary section."),
198 |                 ));
199 |             }
200 |         }
201 |         Ok(self.four_sect_dict)
202 |     }
203 | }
204 | 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::header::Header;
    use crate::tests::init;
    use pretty_assertions::assert_eq;
    use std::fs::File;
    use std::io::BufReader;

    /// Reads the dictionary of the test file and checks section sizes, a few known strings
    /// and the ID -> string -> ID round trip for every ID of every section.
    #[test]
    fn read_dict() -> color_eyre::Result<()> {
        init();
        let file = File::open("tests/resources/snikmeta.hdt")?;
        let mut reader = BufReader::new(file);
        // the dictionary follows the global control information and the header
        ControlInfo::read(&mut reader)?;
        Header::read(&mut reader)?;

        let dict = FourSectDict::read(&mut reader)?.validate()?;
        assert_eq!(dict.shared.num_strings(), 43, "wrong number of strings in the shared section");
        assert_eq!(dict.subjects.num_strings(), 6, "wrong number of strings in the subject section");
        assert_eq!(dict.predicates.num_strings(), 23, "wrong number of strings in the predicates section");
        assert_eq!(dict.objects.num_strings(), 133, "wrong number of strings in the objects section");
        assert_eq!(dict.string_to_id("_:b1", &IdKind::Subject), 1);
        assert_eq!("http://www.snik.eu/ontology/meta/uses", dict.id_to_string(43, &IdKind::Subject)?);
        assert_eq!("http://www.snik.eu/ontology/meta/Chapter", dict.id_to_string(3, &IdKind::Subject)?);
        assert_eq!("http://www.snik.eu/ontology/meta/DataSetType", dict.id_to_string(5, &IdKind::Subject)?);
        // IDs start at 1 and the last ID equals the number of strings, so the range has to be inclusive
        for id in 1..=dict.shared.num_strings() {
            let s = dict.id_to_string(id, &IdKind::Subject)?;
            let back = dict.string_to_id(&s, &IdKind::Subject);
            assert_eq!(id, back, "shared id {} -> subject {} -> id {}", id, s, back);

            let s = dict.id_to_string(id, &IdKind::Object)?;
            let back = dict.string_to_id(&s, &IdKind::Object);
            assert_eq!(id, back, "shared id {} -> object {} -> id {}", id, s, back);
        }
        // subject and object IDs continue after the shared IDs, predicate IDs start at 1
        for (sect, kind, name, offset) in [
            (&dict.subjects, &IdKind::Subject, "subject", dict.shared.num_strings()),
            (&dict.objects, &IdKind::Object, "object", dict.shared.num_strings()),
            (&dict.predicates, &IdKind::Predicate, "predicate", 0),
        ] {
            // inclusive upper bound so that the last string of each section is covered as well
            for id in offset + 1..=offset + sect.num_strings() {
                let s = dict.id_to_string(id, kind)?;
                let back = dict.string_to_id(&s, kind);
                assert_eq!(id, back, "{} id {} -> {} {} -> id {}", name, id, name, s, back);
            }
        }
        Ok(())
    }
}
254 | 


--------------------------------------------------------------------------------
/src/hdt.rs:
--------------------------------------------------------------------------------
  1 | use crate::FourSectDict;
  2 | use crate::containers::{ControlInfo, ControlInfoReadError};
  3 | use crate::four_sect_dict::{DictError, DictReadError, IdKind};
  4 | use crate::header::{Header, HeaderReadError};
  5 | use crate::triples::{
  6 |     ObjectIter, PredicateIter, PredicateObjectIter, SubjectIter, TripleId, TriplesBitmap, TriplesReadError,
  7 | };
  8 | use bytesize::ByteSize;
  9 | use log::{debug, error};
 10 | #[cfg(feature = "cache")]
 11 | use std::fs::File;
 12 | #[cfg(feature = "cache")]
 13 | use std::io::{Seek, SeekFrom, Write};
 14 | use std::iter;
 15 | use std::sync::Arc;
 16 | 
/// Result type whose error is this module's `Error` enum.
pub type Result<T> = core::result::Result<T, Error>;
 18 | 
/// In-memory representation of an RDF graph loaded from an HDT file.
/// Allows queries by triple patterns.
/// The global control information and the header are read but not kept.
#[derive(Debug)]
pub struct Hdt {
    //global_ci: ControlInfo,
    //header: Header,
    /// in-memory representation of dictionary
    pub dict: FourSectDict,
    /// in-memory representation of triples
    pub triples: TriplesBitmap,
}
 30 | 
/// Subject, predicate and object of a triple, in that order, as reference-counted strings.
type StringTriple = (Arc<str>, Arc<str>, Arc<str>);
 32 | 
/// The error type for the `TripleCache::translate` method.
#[derive(thiserror::Error, Debug)]
#[error("cannot translate triple ID {t:?} to string triple: {e}")]
pub struct TranslateError {
    // the dictionary error that made the translation fail
    #[source]
    e: DictError,
    // the triple ID that could not be translated
    t: TripleId,
}
 41 | 
/// The error type for the `new`, `read` and `new_from_path` methods.
/// All variants share the same display message, the cause is available as the error source.
#[derive(thiserror::Error, Debug)]
#[error("failed to read HDT")]
pub enum Error {
    /// Failed to read the global control information
    ControlInfo(#[from] ControlInfoReadError),
    /// Failed to read the HDT header
    Header(#[from] HeaderReadError),
    /// Failed to read HDT dictionary
    FourSectDict(#[from] DictReadError),
    /// Failed to read the triples section
    Triples(#[from] TriplesReadError),
    /// Any `std::io::Error`: returned by the dictionary validation and, in `new_from_path`,
    /// also by file system operations such as opening the file
    DictionaryValidationErrorTodo(#[from] std::io::Error),
}
 53 | 
 54 | impl Hdt {
 55 |     #[deprecated(since = "0.4.0", note = "please use `read` instead")]
 56 |     pub fn new<R: std::io::BufRead>(reader: R) -> Result<Self> {
 57 |         Self::read(reader)
 58 |     }
 59 | 
 60 |     /// Creates an immutable HDT instance containing the dictionary and triples from the given reader.
 61 |     /// The reader must point to the beginning of the data of an HDT file as produced by hdt-cpp.
 62 |     /// FourSectionDictionary with DictionarySectionPlainFrontCoding and SPO order is the only supported implementation.
 63 |     /// The format is specified at <https://www.rdfhdt.org/hdt-binary-format/>, however there are some deviations.
 64 |     /// The initial HDT specification at <http://www.w3.org/Submission/2011/03/> is outdated and not supported.
 65 |     /// # Example
 66 |     /// ```
 67 |     /// let file = std::fs::File::open("tests/resources/snikmeta.hdt").expect("error opening file");
 68 |     /// let hdt = hdt::Hdt::new(std::io::BufReader::new(file)).unwrap();
 69 |     /// ```
 70 |     pub fn read<R: std::io::BufRead>(mut reader: R) -> Result<Self> {
 71 |         ControlInfo::read(&mut reader)?;
 72 |         Header::read(&mut reader)?;
 73 |         let unvalidated_dict = FourSectDict::read(&mut reader)?;
 74 |         let triples = TriplesBitmap::read_sect(&mut reader)?;
 75 |         let dict = unvalidated_dict.validate()?;
 76 |         let hdt = Hdt { dict, triples };
 77 |         debug!("HDT size in memory {}, details:", ByteSize(hdt.size_in_bytes() as u64));
 78 |         debug!("{hdt:#?}");
 79 |         Ok(hdt)
 80 |     }
 81 | 
 82 |     /// Creates an immutable HDT instance containing the dictionary and triples from the Path.
 83 |     /// Will utilize a custom cached TriplesBitmap file if exists or create one if it does not exist.
 84 |     /// The file path must point to the beginning of the data of an HDT file as produced by hdt-cpp.
 85 |     /// FourSectionDictionary with DictionarySectionPlainFrontCoding and SPO order is the only supported implementation.
 86 |     /// The format is specified at <https://www.rdfhdt.org/hdt-binary-format/>, however there are some deviations.
 87 |     /// The initial HDT specification at <http://www.w3.org/Submission/2011/03/> is outdated and not supported.
 88 |     /// # Example
 89 |     /// ```
 90 |     /// let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap();
 91 |     /// ```
 92 |     #[cfg(feature = "cache")]
 93 |     pub fn new_from_path(f: &std::path::Path) -> Result<Self> {
 94 |         use log::warn;
 95 | 
 96 |         let source = File::open(f)?;
 97 |         let mut reader = std::io::BufReader::new(source);
 98 |         ControlInfo::read(&mut reader)?;
 99 |         Header::read(&mut reader)?;
100 |         let unvalidated_dict = FourSectDict::read(&mut reader)?;
101 |         let mut abs_path = std::fs::canonicalize(f)?;
102 |         let _ = abs_path.pop();
103 |         let index_file_name = format!("{}.index.v1-rust-cache", f.file_name().unwrap().to_str().unwrap());
104 |         let index_file_path = abs_path.join(index_file_name);
105 |         let triples = if index_file_path.exists() {
106 |             let pos = reader.stream_position()?;
107 |             match Self::load_with_cache(&mut reader, &index_file_path) {
108 |                 Ok(triples) => triples,
109 |                 Err(e) => {
110 |                     warn!("error loading cache, overwriting: {e}");
111 |                     reader.seek(SeekFrom::Start(pos))?;
112 |                     Self::load_without_cache(&mut reader, &index_file_path)?
113 |                 }
114 |             }
115 |         } else {
116 |             Self::load_without_cache(&mut reader, &index_file_path)?
117 |         };
118 | 
119 |         let dict = unvalidated_dict.validate()?;
120 |         let hdt = Hdt { dict, triples };
121 |         debug!("HDT size in memory {}, details:", ByteSize(hdt.size_in_bytes() as u64));
122 |         debug!("{hdt:#?}");
123 |         Ok(hdt)
124 |     }
125 | 
126 |     #[cfg(feature = "cache")]
127 |     fn load_without_cache<R: std::io::BufRead>(
128 |         mut reader: R, index_file_path: &std::path::PathBuf,
129 |     ) -> Result<TriplesBitmap> {
130 |         use log::warn;
131 | 
132 |         debug!("no cache detected, generating index");
133 |         let triples = TriplesBitmap::read_sect(&mut reader)?;
134 |         debug!("index generated, saving cache to {}", index_file_path.display());
135 |         if let Err(e) = Self::write_cache(index_file_path, &triples) {
136 |             warn!("error trying to save cache to file: {e}");
137 |         }
138 |         Ok(triples)
139 |     }
140 | 
141 |     #[cfg(feature = "cache")]
142 |     fn load_with_cache<R: std::io::BufRead>(
143 |         mut reader: R, index_file_path: &std::path::PathBuf,
144 |     ) -> core::result::Result<TriplesBitmap, Box<dyn std::error::Error>> {
145 |         // load cached index
146 |         debug!("hdt file cache detected, loading from {}", index_file_path.display());
147 |         let index_source = File::open(index_file_path)?;
148 |         let mut index_reader = std::io::BufReader::new(index_source);
149 |         let triples_ci = ControlInfo::read(&mut reader)?;
150 |         Ok(TriplesBitmap::load_cache(&mut index_reader, &triples_ci)?)
151 |     }
152 | 
153 |     #[cfg(feature = "cache")]
154 |     fn write_cache(
155 |         index_file_path: &std::path::PathBuf, triples: &TriplesBitmap,
156 |     ) -> core::result::Result<(), Box<dyn std::error::Error>> {
157 |         let new_index_file = File::create(index_file_path)?;
158 |         let mut writer = std::io::BufWriter::new(new_index_file);
159 |         bincode::serde::encode_into_std_write(&triples, &mut writer, bincode::config::standard())?;
160 |         writer.flush()?;
161 |         Ok(())
162 |     }
163 | 
164 |     /// Recursive size in bytes on the heap.
165 |     pub fn size_in_bytes(&self) -> usize {
166 |         self.dict.size_in_bytes() + self.triples.size_in_bytes()
167 |     }
168 | 
169 |     /// An iterator visiting *all* triples as strings in order.
170 |     /// Using this method with a filter can be inefficient for large graphs,
171 |     /// because the strings are stored in compressed form and must be decompressed and allocated.
172 |     /// Whenever possible, use [`Hdt::triples_with_pattern`] instead.
173 |     /// # Example
174 |     /// ```
175 |     /// fn print_first_triple(hdt: hdt::Hdt) {
176 |     ///     println!("{:?}", hdt.triples().next().expect("no triple in the graph"));
177 |     /// }
178 |     /// ```
179 |     pub fn triples(&self) -> impl Iterator<Item = StringTriple> + '_ {
180 |         let mut triple_cache = TripleCache::new(self);
181 |         self.triples.into_iter().map(move |ids| triple_cache.translate(ids).unwrap())
182 |     }
183 | 
184 |     /// Get all subjects with the given property and object (?PO pattern).
185 |     /// Use this over `triples_with_pattern(None,Some(p),Some(o))` if you don't need whole triples.
186 |     /// # Example
187 |     /// Who was born in Leipzig?
188 |     /// ```
189 |     /// fn query(dbpedia: hdt::Hdt) {
190 |     ///     for person in dbpedia.subjects_with_po(
191 |     ///       "http://dbpedia.org/ontology/birthPlace", "http://dbpedia.org/resource/Leipzig") {
192 |     ///       println!("{person:?}");
193 |     ///     }
194 |     /// }
195 |     /// ```
196 |     pub fn subjects_with_po(&self, p: &str, o: &str) -> Box<dyn Iterator<Item = String> + '_> {
197 |         let pid = self.dict.string_to_id(p, &IdKind::Predicate);
198 |         let oid = self.dict.string_to_id(o, &IdKind::Object);
199 |         // predicate or object not in dictionary, iterator would interpret 0 as variable
200 |         if pid == 0 || oid == 0 {
201 |             return Box::new(iter::empty());
202 |         }
203 |         // needed for extending the lifetime of the parameters into the iterator for error messages
204 |         let p_owned = p.to_owned();
205 |         let o_owned = o.to_owned();
206 |         Box::new(
207 |             PredicateObjectIter::new(&self.triples, pid, oid)
208 |                 .map(move |sid| self.dict.id_to_string(sid, &IdKind::Subject))
209 |                 .filter_map(move |r| {
210 |                     r.map_err(|e| error!("Error on triple with property {p_owned} and object {o_owned}: {e}")).ok()
211 |                 }),
212 |         )
213 |     }
214 | 
215 |     /// Get all triples that fit the given triple patterns, where `None` stands for a variable.
216 |     /// For example, `triples_with_pattern(Some(s), Some(p), None)` answers an SP? pattern.
217 |     /// # Example
218 |     /// What is the capital of the United States of America?
219 |     /// ```
220 |     /// fn query(dbpedia: hdt::Hdt) {
221 |     ///   println!("{:?}", dbpedia.triples_with_pattern(
222 |     ///     Some("http://dbpedia.org/resource/United_States"), Some("http://dbpedia.org/ontology/capital"), None)
223 |     ///     .next().expect("no capital found").2);
224 |     /// }
225 |     /// ```
226 |     pub fn triples_with_pattern<'a>(
227 |         &'a self, sp: Option<&'a str>, pp: Option<&'a str>, op: Option<&'a str>,
228 |     ) -> Box<dyn Iterator<Item = StringTriple> + 'a> {
229 |         let xso: Option<(Arc<str>, usize)> =
230 |             sp.map(|s| (Arc::from(s), self.dict.string_to_id(s, &IdKind::Subject)));
231 |         let xpo: Option<(Arc<str>, usize)> =
232 |             pp.map(|p| (Arc::from(p), self.dict.string_to_id(p, &IdKind::Predicate)));
233 |         let xoo: Option<(Arc<str>, usize)> =
234 |             op.map(|o| (Arc::from(o), self.dict.string_to_id(o, &IdKind::Object)));
235 |         if [&xso, &xpo, &xoo].into_iter().flatten().any(|x| x.1 == 0) {
236 |             // at least one term does not exist in the graph
237 |             return Box::new(iter::empty());
238 |         }
239 |         // TODO: improve error handling
240 |         let mut cache = TripleCache::new(self);
241 |         match (xso, xpo, xoo) {
242 |             (Some(s), Some(p), Some(o)) => {
243 |                 if SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, p.1, o.1)).next().is_some() {
244 |                     Box::new(iter::once((s.0, p.0, o.0)))
245 |                 } else {
246 |                     Box::new(iter::empty())
247 |                 }
248 |             }
249 |             (Some(s), Some(p), None) => {
250 |                 Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, p.1, 0)).map(move |t| {
251 |                     (
252 |                         s.0.clone(),
253 |                         p.0.clone(),
254 |                         Arc::from(self.dict.id_to_string(t.object_id, &IdKind::Object).unwrap()),
255 |                     )
256 |                 }))
257 |             }
258 |             (Some(s), None, Some(o)) => {
259 |                 Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, 0, o.1)).map(move |t| {
260 |                     (
261 |                         s.0.clone(),
262 |                         Arc::from(self.dict.id_to_string(t.predicate_id, &IdKind::Predicate).unwrap()),
263 |                         o.0.clone(),
264 |                     )
265 |                 }))
266 |             }
267 |             (Some(s), None, None) => {
268 |                 Box::new(SubjectIter::with_pattern(&self.triples, &TripleId::new(s.1, 0, 0)).map(move |t| {
269 |                     (
270 |                         s.0.clone(),
271 |                         cache.get_p_string(t.predicate_id).unwrap(),
272 |                         cache.get_o_string(t.object_id).unwrap(),
273 |                     )
274 |                 }))
275 |             }
276 |             (None, Some(p), Some(o)) => {
277 |                 Box::new(PredicateObjectIter::new(&self.triples, p.1, o.1).map(move |sid| {
278 |                     (Arc::from(self.dict.id_to_string(sid, &IdKind::Subject).unwrap()), p.0.clone(), o.0.clone())
279 |                 }))
280 |             }
281 |             (None, Some(p), None) => Box::new(PredicateIter::new(&self.triples, p.1).map(move |t| {
282 |                 (cache.get_s_string(t.subject_id).unwrap(), p.0.clone(), cache.get_o_string(t.object_id).unwrap())
283 |             })),
284 |             (None, None, Some(o)) => Box::new(ObjectIter::new(&self.triples, o.1).map(move |t| {
285 |                 (
286 |                     cache.get_s_string(t.subject_id).unwrap(),
287 |                     cache.get_p_string(t.predicate_id).unwrap(),
288 |                     o.0.clone(),
289 |                 )
290 |             })),
291 |             (None, None, None) => Box::new(self.triples()),
292 |         }
293 |     }
294 | }
295 | 
/// A TripleCache stores the `Arc<str>` of the last returned triple
/// so that a subject, predicate or object that repeats in the next triple is not looked up again.
#[derive(Clone, Debug)]
pub struct TripleCache<'a> {
    // the HDT whose dictionary translates IDs to strings
    hdt: &'a super::Hdt,
    // IDs of the cached subject, predicate and object in that order, initially 0
    idx: [usize; 3],
    // the strings belonging to the IDs in `idx`, initially `None`
    arc: [Option<Arc<str>>; 3],
}
303 | 
304 | impl<'a> TripleCache<'a> {
305 |     /// Build a new [`TripleCache`] for the given [`Hdt`]
306 |     pub const fn new(hdt: &'a super::Hdt) -> Self {
307 |         TripleCache { hdt, idx: [0; 3], arc: [None, None, None] }
308 |     }
309 | 
310 |     /// Get the string representation of the subject `sid`.
311 |     pub fn get_s_string(&mut self, sid: usize) -> core::result::Result<Arc<str>, DictError> {
312 |         self.get_x_string(sid, 0, &IdKind::Subject)
313 |     }
314 | 
315 |     /// Get the string representation of the predicate `pid`.
316 |     pub fn get_p_string(&mut self, pid: usize) -> core::result::Result<Arc<str>, DictError> {
317 |         self.get_x_string(pid, 1, &IdKind::Predicate)
318 |     }
319 | 
320 |     /// Get the string representation of the object `oid`.
321 |     pub fn get_o_string(&mut self, oid: usize) -> core::result::Result<Arc<str>, DictError> {
322 |         self.get_x_string(oid, 2, &IdKind::Object)
323 |     }
324 | 
325 |     /// Translate a triple of indexes into a triple of strings.
326 |     pub fn translate(&mut self, t: TripleId) -> core::result::Result<StringTriple, TranslateError> {
327 |         Ok((
328 |             self.get_s_string(t.subject_id).map_err(|e| TranslateError { e, t })?,
329 |             self.get_p_string(t.predicate_id).map_err(|e| TranslateError { e, t })?,
330 |             self.get_o_string(t.object_id).map_err(|e| TranslateError { e, t })?,
331 |         ))
332 |     }
333 | 
334 |     fn get_x_string(
335 |         &mut self, i: usize, pos: usize, kind: &'static IdKind,
336 |     ) -> core::result::Result<Arc<str>, DictError> {
337 |         debug_assert!(i != 0);
338 |         if self.idx[pos] == i {
339 |             Ok(self.arc[pos].as_ref().unwrap().clone())
340 |         } else {
341 |             let ret: Arc<str> = self.hdt.dict.id_to_string(i, kind)?.into();
342 |             self.arc[pos] = Some(ret.clone());
343 |             self.idx[pos] = i;
344 |             Ok(ret)
345 |         }
346 |     }
347 | }
348 | 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::init;
    use pretty_assertions::{assert_eq, assert_ne};
    use std::fs::File;

    /// Checks `triples`, `triples_with_pattern` and `subjects_with_po` against known content of the test file.
    #[test]
    fn triples() -> color_eyre::Result<()> {
        init();
        let filename = "tests/resources/snikmeta.hdt";
        let file = File::open(filename)?;
        // `Hdt::new` is deprecated, `read` is its replacement
        let hdt = Hdt::read(std::io::BufReader::new(file))?;
        let triples = hdt.triples();
        let v: Vec<StringTriple> = triples.collect();
        assert_eq!(v.len(), 328);
        assert_eq!(v, hdt.triples_with_pattern(None, None, None).collect::<Vec<_>>(), "all triples not equal ???");
        assert_ne!(0, hdt.dict.string_to_id("http://www.snik.eu/ontology/meta", &IdKind::Subject));
        // the S?? pattern must agree with filtering all triples by subject, also for an unknown subject
        for uri in ["http://www.snik.eu/ontology/meta/Top", "http://www.snik.eu/ontology/meta", "doesnotexist"] {
            let filtered: Vec<_> = v.clone().into_iter().filter(|triple| triple.0.as_ref() == uri).collect();
            let with_s: Vec<_> = hdt.triples_with_pattern(Some(uri), None, None).collect();
            assert_eq!(
                filtered, with_s,
                "different results between triples() and triples_with_pattern(Some(s), None, None) for {}",
                uri
            );
        }
        let s = "http://www.snik.eu/ontology/meta/Top";
        let p = "http://www.w3.org/2000/01/rdf-schema#label";
        let o = "\"top class\"@en";
        let triple_vec = vec![(Arc::from(s), Arc::from(p), Arc::from(o))];
        // triple patterns with 2-3 terms
        assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), Some(o)).collect::<Vec<_>>(), "SPO");
        assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), None).collect::<Vec<_>>(), "SP?");
        assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), None, Some(o)).collect::<Vec<_>>(), "S?O");
        assert_eq!(triple_vec, hdt.triples_with_pattern(None, Some(p), Some(o)).collect::<Vec<_>>(), "?PO");
        let et = "http://www.snik.eu/ontology/meta/EntityType";
        let meta = "http://www.snik.eu/ontology/meta";
        let subjects = ["ApplicationComponent", "Method", "RepresentationType", "SoftwareProduct"]
            .map(|s| meta.to_owned() + "/" + s)
            .to_vec();
        assert_eq!(
            subjects,
            hdt.subjects_with_po("http://www.w3.org/2000/01/rdf-schema#subClassOf", et).collect::<Vec<_>>()
        );
        assert_eq!(
            12,
            hdt.triples_with_pattern(None, Some("http://www.w3.org/2000/01/rdf-schema#subClassOf"), None).count()
        );
        assert_eq!(20, hdt.triples_with_pattern(None, None, Some(et)).count());
        let snikeu = "http://www.snik.eu";
        let triple_vec = [
            "http://purl.org/dc/terms/publisher", "http://purl.org/dc/terms/source",
            "http://xmlns.com/foaf/0.1/homepage",
        ]
        .into_iter()
        .map(|p| (Arc::from(meta), Arc::from(p), Arc::from(snikeu)))
        .collect::<Vec<_>>();
        assert_eq!(
            triple_vec,
            hdt.triples_with_pattern(Some(meta), None, Some(snikeu)).collect::<Vec<_>>(),
            "S?O multiple"
        );
        // terms with non-ASCII characters
        let s = "http://www.snik.eu/ontology/meta/хобби-N-0";
        let o = "\"ХОББИ\"@ru";
        let triple_vec = vec![(Arc::from(s), Arc::from(p), Arc::from(o))];
        assert_eq!(triple_vec, hdt.triples_with_pattern(Some(s), Some(p), None).collect::<Vec<_>>());
        Ok(())
    }
}
415 | 


--------------------------------------------------------------------------------
/src/hdt_graph.rs:
--------------------------------------------------------------------------------
  1 | // //! *This module is available only if HDT is built with the `"sophia"` feature.*
  2 | #[cfg(feature = "sophia")]
  3 | use crate::four_sect_dict::IdKind;
  4 | use crate::hdt::Hdt;
  5 | use crate::triples::{Id, ObjectIter, PredicateIter, PredicateObjectIter, SubjectIter, TripleId};
  6 | use log::debug;
  7 | use sophia::api::graph::Graph;
  8 | use sophia::api::term::{BnodeId, IriRef, LanguageTag, Term, matcher::TermMatcher};
  9 | use std::convert::Infallible;
 10 | use std::io::{self, Error, ErrorKind};
 11 | use std::iter;
 12 | use std::sync::Arc;
 13 | 
 14 | mod term;
 15 | pub use term::HdtTerm;
 16 | 
/// Adapter to use HDT as a Sophia graph.
/// Owns the wrapped `Hdt`, which stays accessible through the public field.
pub struct HdtGraph {
    /// Wrapped HDT instance
    pub hdt: Hdt,
}
 22 | 
/// HdtGraph does not support all of the Sophia TermMatcher functionality.
/// Matchers are therefore reduced to the two cases below.
enum HdtMatcher {
    /// a matcher for a single constant term, stored together with the dictionary ID of that term
    Constant((HdtTerm, Id)),
    /// a matcher without a constant term
    Other,
}
 28 | 
 29 | impl HdtGraph {
 30 |     /// Wrapper around Hdt.
 31 |     pub const fn new(hdt: Hdt) -> Self {
 32 |         HdtGraph { hdt }
 33 |     }
 34 |     /// Size in bytes on the heap.
 35 |     pub fn size_in_bytes(&self) -> usize {
 36 |         self.hdt.size_in_bytes()
 37 |     }
 38 | 
 39 |     fn id_term(&self, id: Id, kind: &'static IdKind) -> HdtTerm {
 40 |         auto_term(&self.hdt.dict.id_to_string(id, kind).unwrap()).unwrap()
 41 |         // TODO: optimize by excluding cases depending on the id kind
 42 |         //IriRef::new_unchecked(MownStr::from(s)).into_term()
 43 |     }
 44 | 
 45 |     /// Transforms a Sophia TermMatcher to a constant HdtTerm and Id if possible.
 46 |     /// Returns none if it matches a constant term that cannot be found.
 47 |     fn unpack_matcher<T: TermMatcher>(&self, tm: &T, kind: &IdKind) -> Option<HdtMatcher> {
 48 |         match tm.constant() {
 49 |             Some(t) => match HdtTerm::try_from(t.borrow_term()) {
 50 |                 Some(t) => {
 51 |                     let id = self.hdt.dict.string_to_id(&term_string(&t), kind);
 52 |                     if id == 0 {
 53 |                         return None;
 54 |                     }
 55 |                     Some(HdtMatcher::Constant((t, id)))
 56 |                 }
 57 |                 None => None,
 58 |             },
 59 |             None => Some(HdtMatcher::Other),
 60 |         }
 61 |     }
 62 | }
 63 | 
 64 | /// Create the correct Sophia term for a given resource string.
 65 | /// Slow, use the appropriate method if you know which type (Literal, URI, or blank node) the string has.
 66 | fn auto_term(s: &str) -> io::Result<HdtTerm> {
 67 |     match s.chars().next() {
 68 |         None => Err(Error::new(ErrorKind::InvalidData, "empty input")),
 69 |         Some('"') => match s.rfind('"') {
 70 |             None => Err(Error::new(
 71 |                 ErrorKind::InvalidData,
 72 |                 format!("missing right quotation mark in literal string {s}"),
 73 |             )),
 74 |             Some(index) => {
 75 |                 let lex = Arc::from(&s[1..index]);
 76 |                 let rest = &s[index + 1..];
 77 |                 // literal with no language tag and no datatype
 78 |                 if rest.is_empty() {
 79 |                     return Ok(HdtTerm::LiteralDatatype(lex, term::XSD_STRING.clone()));
 80 |                 }
 81 |                 // either language tag or datatype
 82 |                 if let Some(tag_index) = rest.find('@') {
 83 |                     let tag = LanguageTag::new_unchecked(Arc::from(&rest[tag_index + 1..]));
 84 |                     return Ok(HdtTerm::LiteralLanguage(lex, tag));
 85 |                 }
 86 |                 // datatype
 87 |                 let mut dt_split = rest.split("^^");
 88 |                 dt_split.next(); // empty
 89 |                 match dt_split.next() {
 90 |                     Some(dt) => {
 91 |                         let unquoted = &dt[1..dt.len() - 1];
 92 |                         let dt = IriRef::new_unchecked(Arc::from(unquoted));
 93 |                         Ok(HdtTerm::LiteralDatatype(lex, dt))
 94 |                     }
 95 |                     None => Err(Error::new(ErrorKind::InvalidData, format!("empty datatype in {s}"))),
 96 |                 }
 97 |             }
 98 |         },
 99 |         Some('_') => Ok(HdtTerm::BlankNode(BnodeId::new_unchecked(Arc::from(&s[2..])))),
100 |         _ => Ok(HdtTerm::Iri(IriRef::new_unchecked(Arc::from(s)))),
101 |     }
102 | }
103 | 
104 | // Convert a SimpleTerm into the HDT String format.
105 | // Sophia doesn't include the _: prefix for blank node strings but HDT expects it
106 | // not needed for property terms, as they can't be blank nodes
107 | fn term_string(t: &HdtTerm) -> String {
108 |     match t {
109 |         HdtTerm::BlankNode(b) => "_:".to_owned() + b.as_str(),
110 |         HdtTerm::Iri(i) => i.as_str().to_owned(),
111 |         HdtTerm::LiteralLanguage(l, lang) => {
112 |             format!("\"{l}\"@{}", lang.as_str())
113 |         }
114 |         HdtTerm::LiteralDatatype(l, dt) => {
115 |             let xsd_string: &str = "http://www.w3.org/2001/XMLSchema#string";
116 |             let dts = dt.as_str();
117 |             if dts == xsd_string { format!("\"{l}\"") } else { format!("\"{l}\"^^<{dts}>") }
118 |         }
119 |     }
120 | }
121 | 
122 | impl Graph for HdtGraph {
123 |     type Triple<'a> = [HdtTerm; 3];
124 |     type Error = Infallible; // infallible for now, figure out what to put here later
125 | 
126 |     /// # Example
127 |     /// ```
128 |     /// use hdt::sophia::api::graph::Graph;
129 |     /// fn print_first_triple(graph: hdt::HdtGraph) {
130 |     ///     println!("{:?}", graph.triples().next().expect("no triple in the graph"));
131 |     /// }
132 |     /// ```
133 |     fn triples(&self) -> impl Iterator<Item = Result<Self::Triple<'_>, Self::Error>> {
134 |         debug!("Iterating through ALL triples in the HDT Graph. This can be inefficient for large graphs.");
135 |         self.hdt.triples().map(move |(s, p, o)| {
136 |             Ok([auto_term(&s).unwrap(), HdtTerm::Iri(IriRef::new_unchecked(p)), auto_term(&o).unwrap()])
137 |         })
138 |     }
139 | 
140 |     /// Only supports constant and "any" matchers.
141 |     /// Non-constant matchers are supposed to be "any" matchers.
142 |     /// # Example
143 |     /// Who was born in Leipzig?
144 |     /// ```
145 |     /// use hdt::{Hdt,HdtGraph};
146 |     /// use hdt::sophia::api::graph::Graph;
147 |     /// use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any};
148 |     ///
149 |     /// fn query(dbpedia: hdt::HdtGraph) {
150 |     ///     let birth_place = SimpleTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/birthPlace".into()));
151 |     ///     let leipzig = SimpleTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/resource/Leipzig".into()));
152 |     ///     let persons = dbpedia.triples_matching(Any, Some(birth_place), Some(leipzig));
153 |     /// }
154 |     /// ```
155 |     fn triples_matching<'s, S, P, O>(
156 |         &'s self, sm: S, pm: P, om: O,
157 |     ) -> impl Iterator<Item = Result<Self::Triple<'s>, Self::Error>> + 's
158 |     where
159 |         S: TermMatcher + 's,
160 |         P: TermMatcher + 's,
161 |         O: TermMatcher + 's,
162 |     {
163 |         use HdtMatcher::{Constant, Other};
164 |         let xso = match self.unpack_matcher(&sm, &IdKind::Subject) {
165 |             None => return Box::new(iter::empty()) as Box<dyn Iterator<Item = _>>,
166 |             Some(x) => x,
167 |         };
168 |         let xpo = match self.unpack_matcher(&pm, &IdKind::Predicate) {
169 |             None => return Box::new(iter::empty()),
170 |             Some(x) => x,
171 |         };
172 |         let xoo = match self.unpack_matcher(&om, &IdKind::Object) {
173 |             None => return Box::new(iter::empty()),
174 |             Some(x) => x,
175 |         };
176 |         // TODO: improve error handling
177 |         match (xso, xpo, xoo) {
178 |             //if SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, p.1, o.1)).next().is_some() { // always true
179 |             (Constant(s), Constant(p), Constant(o)) => Box::new(iter::once(Ok([s.0, p.0, o.0]))),
180 |             (Constant(s), Constant(p), Other) => Box::new(
181 |                 SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, p.1, 0))
182 |                     .map(|tid| {
183 |                         auto_term(&self.hdt.dict.id_to_string(tid.object_id, &IdKind::Object).unwrap()).unwrap()
184 |                     })
185 |                     .filter(move |term| om.matches(term))
186 |                     .map(move |term| Ok([s.0.clone(), p.0.clone(), term])),
187 |             ),
188 |             (Constant(s), Other, Constant(o)) => Box::new(
189 |                 SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, 0, o.1))
190 |                     .map(|t| self.id_term(t.predicate_id, &IdKind::Predicate))
191 |                     .filter(move |term| pm.matches(term))
192 |                     .map(move |term| Ok([s.0.clone(), term, o.0.clone()])),
193 |             ),
194 |             (Constant(s), Other, Other) => Box::new(
195 |                 SubjectIter::with_pattern(&self.hdt.triples, &TripleId::new(s.1, 0, 0))
196 |                     .map(move |t| {
197 |                         [
198 |                             self.id_term(t.predicate_id, &IdKind::Predicate),
199 |                             self.id_term(t.object_id, &IdKind::Object),
200 |                         ]
201 |                     })
202 |                     .filter(move |[pt, ot]| pm.matches(pt) && om.matches(ot))
203 |                     .map(move |[pt, ot]| Ok([s.0.clone(), pt, ot])),
204 |             ),
205 |             (Other, Constant(p), Constant(o)) => Box::new(
206 |                 PredicateObjectIter::new(&self.hdt.triples, p.1, o.1)
207 |                     .map(|sid| self.id_term(sid, &IdKind::Subject))
208 |                     .filter(move |term| sm.matches(term))
209 |                     .map(move |term| Ok([term, p.0.clone(), o.0.clone()])),
210 |             ),
211 |             (Other, Constant(p), Other) => Box::new(
212 |                 PredicateIter::new(&self.hdt.triples, p.1)
213 |                     .map(move |t| {
214 |                         [self.id_term(t.subject_id, &IdKind::Subject), self.id_term(t.object_id, &IdKind::Object)]
215 |                     })
216 |                     .filter(move |[st, ot]| sm.matches(st) && om.matches(ot))
217 |                     .map(move |[st, ot]| Ok([st, p.0.clone(), ot])),
218 |             ),
219 |             (Other, Other, Constant(o)) => Box::new(ObjectIter::new(&self.hdt.triples, o.1).map(move |t| {
220 |                 Ok([
221 |                     auto_term(&Arc::from(self.hdt.dict.id_to_string(t.subject_id, &IdKind::Subject).unwrap()))
222 |                         .unwrap(),
223 |                     self.id_term(t.predicate_id, &IdKind::Predicate),
224 |                     o.0.clone(),
225 |                 ])
226 |             })),
227 |             (Other, Other, Other) => Box::new(
228 |                 self.hdt
229 |                     .triples()
230 |                     .map(move |(s, p, o)| {
231 |                         [auto_term(&s).unwrap(), HdtTerm::Iri(IriRef::new_unchecked(p)), auto_term(&o).unwrap()]
232 |                     })
233 |                     .filter(move |[st, pt, ot]| sm.matches(st) && pm.matches(pt) && om.matches(ot))
234 |                     .map(Result::Ok),
235 |             ),
236 |         }
237 |     }
238 | }
239 | 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::init;
    use sophia::api::prelude::Triple;
    use sophia::api::term::matcher::Any;
    use std::fs::File;

    /// Queries the SNIK meta test graph with every combination of constant, `Any` and
    /// multi-term matchers and compares the results with known triples and counts.
    #[test]
    fn test_graph() -> color_eyre::Result<()> {
        init();
        let file = File::open("tests/resources/snikmeta.hdt")?;
        let hdt = Hdt::read(std::io::BufReader::new(file))?;
        let graph = HdtGraph::new(hdt);
        let triples: Vec<Result<[HdtTerm; 3], Infallible>> = graph.triples().collect();
        assert_eq!(triples.len(), 328);
        let meta_top = "http://www.snik.eu/ontology/meta/Top";
        assert!(
            graph
                .triples_matching(
                    Some(HdtTerm::Iri(IriRef::new_unchecked(Arc::from("http://www.snik.eu/ontology/meta")))),
                    Any,
                    Any
                )
                .next()
                .is_some()
        );
        // a subject query has to agree with manually filtering the full triple list by subject
        for uri in [meta_top, "http://www.snik.eu/ontology/meta", "doesnotexist"] {
            let term = HdtTerm::Iri(IriRef::new_unchecked(Arc::from(uri)));
            let filtered: Vec<_> = triples
                .iter()
                .map(|triple| triple.as_ref().unwrap())
                .filter(|triple| triple.s().iri().is_some_and(|iri| iri.to_string() == uri))
                .collect();
            let with_s: Vec<_> = graph.triples_matching(Some(term), Any, Any).map(Result::unwrap).collect();
            // Sophia strings can't be compared directly, use the Debug trait for string comparison that is more brittle and less elegant
            // could break in the future e.g. because of ordering
            let filtered_string = format!("{filtered:?}");
            let with_s_string = format!("{with_s:?}");
            assert_eq!(
                filtered_string, with_s_string,
                "different results between triples() and triples_matching() for {uri}"
            );
        }
        let s = HdtTerm::Iri(IriRef::new_unchecked(meta_top.into()));
        let label = HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/2000/01/rdf-schema#label".into()));
        let o = HdtTerm::LiteralLanguage("top class".into(), LanguageTag::new_unchecked("en".into()));
        assert!(graph.triples_matching(Any, Any, [o.borrow_term()]).next().is_some());

        // *** constant matchers: each pattern has to return exactly the one known triple ***********
        let tvec = vec![[s.clone(), label.clone(), o.clone()]];
        assert_eq!(
            tvec,
            graph
                .triples_matching([s.borrow_term()], [label.borrow_term()], Any)
                .map(Result::unwrap)
                .collect::<Vec<_>>()
        );
        assert_eq!(
            tvec,
            graph
                .triples_matching([s.borrow_term()], Any, [o.borrow_term()])
                .map(Result::unwrap)
                .collect::<Vec<_>>()
        );
        assert_eq!(
            tvec,
            graph
                .triples_matching(Any, [label.borrow_term()], [o.borrow_term()])
                .map(Result::unwrap)
                .collect::<Vec<_>>()
        );
        assert_eq!(1, graph.triples_matching(Any, Any, ["22.10"]).count());
        let date = HdtTerm::LiteralDatatype(
            "2022-10-20".into(),
            IriRef::new_unchecked("http://www.w3.org/2001/XMLSchema#date".into()),
        );
        assert_eq!(1, graph.triples_matching(Any, Any, Some(&date)).count());
        // *** matchers other than constant and Any ********************************************
        let meta = HdtTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/meta".into()));
        let modified = HdtTerm::Iri(IriRef::new_unchecked("http://purl.org/dc/terms/modified".into()));
        // SPO
        assert_eq!(2, graph.triples_matching([&meta, &s], [&label, &modified], [&date, &o]).count());
        // SP?
        assert_eq!(3, graph.triples_matching([&meta, &s], [&label, &modified], Any).count());
        // S?O
        assert_eq!(2, graph.triples_matching([&meta, &s], Any, [&date, &o]).count());
        // S??
        assert_eq!(
            graph.triples_matching([&meta, &s], Any, Any).count(),
            graph.triples_matching([&meta], Any, Any).count() + graph.triples_matching([&s], Any, Any).count(),
        );
        // ??O
        assert_eq!(2, graph.triples_matching(Any, Any, [&date, &o]).count());
        // ?PO
        assert_eq!(2, graph.triples_matching(Any, [&label, &modified], [&date, &o]).count());
        // ?P?
        assert_eq!(
            graph.triples_matching(Any, [&label, &modified], Any).count(),
            graph.triples_matching(Any, [&label], Any).count()
                + graph.triples_matching(Any, [&modified], Any).count()
        );
        // test for errors involving blank nodes
        let blank = HdtTerm::BlankNode(BnodeId::new_unchecked("b1".into()));
        // blank node as input
        assert_eq!(3, graph.triples_matching(Some(&blank), Any, Any).count());
        assert_eq!(1, graph.triples_matching(Any, Any, Some(&blank)).count());
        // blank node as output
        let rdftype =
            HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/1999/02/22-rdf-syntax-ns#type".into()));
        let owlrestriction =
            HdtTerm::Iri(IriRef::new_unchecked("http://www.w3.org/2002/07/owl#Restriction".into()));
        assert_eq!(1, graph.triples_matching(Any, Some(rdftype), Some(owlrestriction)).count());
        // not in the original SNIK meta but added to cover more cases
        let s = HdtTerm::Iri(IriRef::new_unchecked("http://www.snik.eu/ontology/meta/хобби-N-0".into()));
        let o = HdtTerm::LiteralLanguage("ХОББИ".into(), LanguageTag::new_unchecked("ru".into()));
        let tvec = vec![[s.clone(), label.clone(), o.clone()]];
        assert_eq!(
            tvec,
            graph
                .triples_matching([s.borrow_term()], [label.borrow_term()], Any)
                .map(Result::unwrap)
                .collect::<Vec<_>>()
        );
        Ok(())
    }
}
366 | 


--------------------------------------------------------------------------------
/src/hdt_graph/term.rs:
--------------------------------------------------------------------------------
  1 | //! I define [`HdtTerm`], an implementation of [`sophia::api::term::Term`].
  2 | use sophia::api::MownStr;
  3 | use sophia::api::ns::{rdf, xsd};
  4 | use sophia::api::term::{BnodeId, LanguageTag, Term, TermKind};
  5 | use sophia::iri::IriRef;
  6 | use std::sync::{Arc, LazyLock};
  7 | 
  8 | pub static XSD_STRING: LazyLock<IriRef<Arc<str>>> =
  9 |     LazyLock::new(|| xsd::string.iri().unwrap().map_unchecked(|m| Arc::from(m.as_ref())));
 10 | 
/// An implementation of [`sophia::api::term::Term`] for [`HdtGraph`](super::HdtGraph).
///
/// All text is held as `Arc<str>`, so cloning a term clones reference-counted pointers
/// instead of copying the string data.
#[derive(Clone, Debug)]
pub enum HdtTerm {
    /// This HdtTerm is an IRI
    Iri(IriRef<Arc<str>>),
    /// This HdtTerm is a blank node
    BlankNode(BnodeId<Arc<str>>),
    /// This HdtTerm is a literal with a "standard" datatype:
    /// the lexical form followed by the datatype IRI
    LiteralDatatype(Arc<str>, IriRef<Arc<str>>),
    /// This HdtTerm is a language string literal:
    /// the lexical form followed by the language tag
    LiteralLanguage(Arc<str>, LanguageTag<Arc<str>>),
}
 23 | 
 24 | impl HdtTerm {
 25 |     /// Convert t into an HdtTerm if it is a supported kind of term.
 26 |     #[allow(clippy::needless_pass_by_value)]
 27 |     pub fn try_from<T: Term>(t: T) -> Option<HdtTerm> {
 28 |         match t.kind() {
 29 |             TermKind::Iri => t.iri().map(|iri| HdtTerm::Iri(iri.map_unchecked(mown2arc))),
 30 |             TermKind::BlankNode => t.bnode_id().map(|bnid| HdtTerm::BlankNode(bnid.map_unchecked(mown2arc))),
 31 |             TermKind::Literal => Some({
 32 |                 let lex = mown2arc(t.lexical_form().unwrap());
 33 |                 if let Some(tag) = t.language_tag() {
 34 |                     let tag = tag.map_unchecked(mown2arc);
 35 |                     HdtTerm::LiteralLanguage(lex, tag)
 36 |                 } else {
 37 |                     let dt = t.datatype().unwrap().map_unchecked(mown2arc);
 38 |                     HdtTerm::LiteralDatatype(lex, dt)
 39 |                 }
 40 |             }),
 41 |             _ => None,
 42 |         }
 43 |     }
 44 | }
 45 | 
 46 | impl Term for HdtTerm {
 47 |     type BorrowTerm<'x>
 48 |         = &'x Self
 49 |     where
 50 |         Self: 'x;
 51 | 
 52 |     fn kind(&self) -> TermKind {
 53 |         match self {
 54 |             HdtTerm::Iri(_) => TermKind::Iri,
 55 |             HdtTerm::BlankNode(_) => TermKind::BlankNode,
 56 |             HdtTerm::LiteralDatatype(..) | HdtTerm::LiteralLanguage(..) => TermKind::Literal,
 57 |         }
 58 |     }
 59 | 
 60 |     fn borrow_term(&self) -> Self::BorrowTerm<'_> {
 61 |         self
 62 |     }
 63 | 
 64 |     fn iri(&self) -> Option<sophia::api::term::IriRef<mownstr::MownStr>> {
 65 |         match self {
 66 |             HdtTerm::Iri(iri) => Some(iri.as_ref().map_unchecked(MownStr::from_ref)),
 67 |             _ => None,
 68 |         }
 69 |     }
 70 | 
 71 |     fn bnode_id(&self) -> Option<BnodeId<mownstr::MownStr>> {
 72 |         match self {
 73 |             HdtTerm::BlankNode(bnid) => Some(bnid.as_ref().map_unchecked(MownStr::from_ref)),
 74 |             _ => None,
 75 |         }
 76 |     }
 77 | 
 78 |     fn lexical_form(&self) -> Option<mownstr::MownStr> {
 79 |         match self {
 80 |             HdtTerm::LiteralDatatype(lex, _) | HdtTerm::LiteralLanguage(lex, _) => Some(lex.as_ref().into()),
 81 |             _ => None,
 82 |         }
 83 |     }
 84 | 
 85 |     fn datatype(&self) -> Option<sophia::api::term::IriRef<mownstr::MownStr>> {
 86 |         match self {
 87 |             HdtTerm::LiteralDatatype(_, datatype) => Some(datatype.as_ref().map_unchecked(MownStr::from_ref)),
 88 |             HdtTerm::LiteralLanguage(..) => rdf::langString.iri(),
 89 |             _ => None,
 90 |         }
 91 |     }
 92 | 
 93 |     fn language_tag(&self) -> Option<LanguageTag<mownstr::MownStr>> {
 94 |         match self {
 95 |             HdtTerm::LiteralLanguage(_, tag) => Some(tag.as_ref().map_unchecked(MownStr::from_ref)),
 96 |             _ => None,
 97 |         }
 98 |     }
 99 | }
100 | 
101 | impl PartialEq for HdtTerm {
102 |     fn eq(&self, other: &Self) -> bool {
103 |         Term::eq(self, other)
104 |     }
105 | }
106 | 
107 | impl Eq for HdtTerm {}
108 | 
109 | fn mown2arc(m: MownStr) -> Arc<str> {
110 |     Box::<str>::from(m).into()
111 | }
112 | 


--------------------------------------------------------------------------------
/src/header.rs:
--------------------------------------------------------------------------------
  1 | use crate::containers::ControlInfo;
  2 | use crate::containers::rdf::{Id, Literal, Term, Triple};
  3 | use ntriple::parser::triple_line;
  4 | use std::collections::BTreeSet;
  5 | use std::io::BufRead;
  6 | use std::str;
  7 | 
/// Metadata about the dataset, see <https://www.rdfhdt.org/hdt-binary-format/#header>.
#[derive(Debug, Clone)]
pub struct Header {
    /// Header data format as given in the header's control information.
    /// Only "ntriples" is supported, `Header::read` rejects everything else.
    pub format: String,
    /// The number of bytes of the header data,
    /// taken from the `length` property of the header's control information.
    pub length: usize,
    /// Triples describing the dataset, parsed from the lines of the header data.
    /// Lines that cannot be parsed as a triple are not included.
    pub body: BTreeSet<Triple>,
}
 18 | 
/// The error type for the `read` method.
// The enum-level message below is the fallback for variants without their own `#[error]` attribute.
#[derive(thiserror::Error, Debug)]
#[error("failed to read HDT header")]
pub enum HeaderReadError {
    /// Any other problem, described by the contained message.
    #[error("{0}")]
    Other(String),
    /// An I/O error of the underlying reader.
    Io(#[from] std::io::Error),
    /// The control information of the header could not be read.
    ControlInfoError(#[from] crate::containers::ControlInfoReadError),
    /// The control information names a header format other than "ntriples".
    #[error("invalid header format {0}, only 'ntriples' is supported")]
    InvalidHeaderFormat(String),
}
 30 | 
 31 | impl Header {
 32 |     /// Reader needs to be positioned directly after the global control information.
 33 |     pub fn read<R: BufRead>(reader: &mut R) -> Result<Self, HeaderReadError> {
 34 |         use HeaderReadError::*;
 35 |         let header_ci = ControlInfo::read(reader)?;
 36 |         if header_ci.format != "ntriples" {
 37 |             return Err(InvalidHeaderFormat(header_ci.format));
 38 |         }
 39 | 
 40 |         //let ls = header_ci.get("length").ok_or_else(|| "missing header length".to_owned().into())?;
 41 |         let ls = header_ci.get("length").unwrap();
 42 |         let length = ls.parse::<usize>().unwrap();
 43 |         //ls.parse::<usize>().map_err(|_| format!("invalid header length '{ls}'").into())?;
 44 | 
 45 |         let mut body_buffer: Vec<u8> = vec![0; length];
 46 |         reader.read_exact(&mut body_buffer)?;
 47 |         let mut body = BTreeSet::new();
 48 | 
 49 |         for line_slice in body_buffer.split(|b| b == &b'\n') {
 50 |             let line = str::from_utf8(line_slice).map_err(|_| Other("Header is not UTF-8".to_owned()))?;
 51 |             if let Ok(Some(triple)) = triple_line(line) {
 52 |                 let subject = match triple.subject {
 53 |                     ntriple::Subject::IriRef(iri) => Id::Named(iri),
 54 |                     ntriple::Subject::BNode(id) => Id::Blank(id),
 55 |                 };
 56 | 
 57 |                 let ntriple::Predicate::IriRef(predicate) = triple.predicate;
 58 | 
 59 |                 let object = match triple.object {
 60 |                     ntriple::Object::IriRef(iri) => Term::Id(Id::Named(iri)),
 61 |                     ntriple::Object::BNode(id) => Term::Id(Id::Blank(id)),
 62 |                     ntriple::Object::Lit(lit) => Term::Literal(match lit.data_type {
 63 |                         ntriple::TypeLang::Lang(lan) => Literal::new_lang(lit.data, lan),
 64 |                         ntriple::TypeLang::Type(data_type) => {
 65 |                             if data_type == "http://www.w3.org/2001/XMLSchema#string" {
 66 |                                 Literal::new(lit.data)
 67 |                             } else {
 68 |                                 Literal::new_typed(lit.data, data_type)
 69 |                             }
 70 |                         }
 71 |                     }),
 72 |                 };
 73 | 
 74 |                 body.insert(Triple::new(subject, predicate, object));
 75 |             }
 76 |         }
 77 |         Ok(Header { format: header_ci.format, length, body })
 78 |     }
 79 | }
 80 | 
 81 | #[cfg(test)]
 82 | mod tests {
 83 |     use super::*;
 84 |     use crate::tests::init;
 85 |     use std::fs::File;
 86 |     use std::io::BufReader;
 87 | 
 88 |     #[test]
 89 |     fn read_header() -> color_eyre::Result<()> {
 90 |         init();
 91 |         let file = File::open("tests/resources/yago_header.hdt")?;
 92 |         let mut reader = BufReader::new(file);
 93 |         ControlInfo::read(&mut reader)?;
 94 | 
 95 |         let header = Header::read(&mut reader)?;
 96 |         assert_eq!(header.format, "ntriples");
 97 |         assert_eq!(header.length, 1891);
 98 |         assert_eq!(header.body.len(), 22);
 99 |         Ok(())
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! [![github]](https://github.com/konradhoeffner/hdt)&ensp;[![crates-io]](https://crates.io/crates/hdt)&ensp;[![docs-rs]](crate)
  2 | //!
  3 | //! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
  4 | //! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
  5 | //! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
  6 | //!
  7 | //! <br>
  8 | //!
  9 | //! HDT is a loading and triple pattern querying library for the [Header Dictionary Triples](https://www.rdfhdt.org/) compressed binary RDF format.
 10 | //!
 11 | //! Currently this library only supports loading and querying existing HDT files as created by [hdt-cpp](https://github.com/rdfhdt/hdt-cpp).
 12 | //! For reference implementations of HDT in C++ and Java, which support conversion and serialization from and into HDT with different format options,
 13 | //! and acknowledgement of all the original authors, please look at the <https://github.com/rdfhdt> organisation.
 14 | //!
 15 | //! # Example of loading and querying an HDT file
 16 | //!
 17 | //! ```no_run
 18 | //! use hdt::Hdt;
 19 | //! // Load an hdt file
 20 | //! let file = std::fs::File::open("example.hdt").expect("error opening file");
 21 | //! let hdt = Hdt::new(std::io::BufReader::new(file)).expect("error loading HDT");
 22 | //! // query
 23 | //! let majors = hdt.triples_with_pattern(Some("http://dbpedia.org/resource/Leipzig"), Some("http://dbpedia.org/ontology/major"),None);
 24 | //! println!("{:?}", majors.collect::<Vec<_>>());
 25 | //! ```
 26 | //!
 27 | #![cfg_attr(
 28 |     feature = "cache",
 29 |     doc = r#"
 30 | # Experimental Features
 31 | The **cache** feature is experimental and may change or be removed in future releases.
 32 |  
 33 | Creating and/or loading a HDT file leveraging a custom cache:
 34 | 
 35 | ```no_run
 36 | let hdt = hdt::Hdt::new_from_path(std::path::Path::new("tests/resources/snikmeta.hdt")).unwrap();
 37 | ``` 
 38 | "#
 39 | )]
 40 | #![cfg_attr(
 41 |     feature = "sophia",
 42 |     doc = r#"
 43 | # Additional Optional Features
 44 | 
 45 | Using the **sophia** adapter:
 46 | 
 47 | ```
 48 | use hdt::{Hdt,HdtGraph};
 49 | use hdt::sophia::api::graph::Graph;
 50 | use hdt::sophia::api::term::{IriRef, SimpleTerm, matcher::Any};
 51 | 
 52 | fn query(hdt: Hdt)
 53 | {
 54 |   let graph = HdtGraph::new(hdt);
 55 |   let s = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/resource/Leipzig".into()));
 56 |   let p = SimpleTerm::Iri(IriRef::new_unchecked("http://dbpedia.org/ontology/major".into()));
 57 |   let majors = graph.triples_matching(Some(s),Some(p),Any);
 58 | }
 59 | ```
 60 | "#
 61 | )]
 62 | // # Optional features
 63 | //
 64 | // The following features are available.
 65 | //
 66 | // - **`sophia`** *(enabled by default)* — Implements the Graph trait from the [Sophia](https://crates.io/crates/sophia) RDF toolkit.
 67 | // This allows you to drastically reduce the RAM usage of an existing application based on Sophia that loads a large knowledge base but requires an input file in the HDT format.
 68 | #![warn(missing_docs)]
 69 | #![warn(clippy::pedantic)]
 70 | #![warn(clippy::cargo)]
 71 | #![warn(clippy::str_to_string)]
 72 | #![warn(clippy::print_stdout)]
 73 | #![warn(clippy::print_stderr)]
 74 | #![warn(clippy::missing_const_for_fn)]
 75 | #![allow(clippy::unnecessary_cast)]
 76 | #![allow(clippy::must_use_candidate)]
 77 | #![allow(clippy::missing_errors_doc)]
 78 | #![allow(clippy::missing_panics_doc)]
 79 | #![allow(clippy::cast_lossless)]
 80 | #![allow(clippy::cast_possible_truncation)]
 81 | #![allow(clippy::wildcard_imports)]
 82 | #![allow(clippy::module_name_repetitions)]
 83 | #![allow(clippy::similar_names)]
 84 | #![allow(clippy::doc_markdown)]
 85 | #![allow(clippy::if_not_else)]
 86 | #![allow(clippy::into_iter_without_iter)]
 87 | #![allow(clippy::len_without_is_empty)]
 88 | // multiple versions of syn crate in transitive dependencies
 89 | #![allow(clippy::multiple_crate_versions)]
 90 | /// Types for storing and reading data.
 91 | pub mod containers;
 92 | /// Types for representing dictionaries.
 93 | pub mod dict_sect_pfc;
 94 | /// Types for representing a four section dictionary
 95 | pub mod four_sect_dict;
 96 | /// Types for representing triple sections.
 97 | pub mod hdt;
 98 | #[cfg(feature = "sophia")]
 99 | pub use sophia;
100 | #[cfg(feature = "sophia")]
101 | /// Adapter for the Sophia library.
102 | pub mod hdt_graph;
103 | /// Types for representing the header.
104 | pub mod header;
105 | /// Types for representing and querying triples.
106 | pub mod triples;
107 | 
108 | pub use crate::hdt::Hdt;
109 | use containers::{ControlInfo, ControlInfoReadError};
110 | use dict_sect_pfc::DictSectPFC;
111 | use four_sect_dict::FourSectDict;
112 | pub use four_sect_dict::IdKind;
113 | #[cfg(feature = "sophia")]
114 | pub use hdt_graph::HdtGraph;
115 | 
#[cfg(test)]
mod tests {
    use std::sync::Once;

    static INIT: Once = Once::new();

    /// Installs the error report handler and the logger.
    fn setup() {
        color_eyre::install().unwrap();
        env_logger::init();
    }

    /// Test setup that may be called by every test, the actual setup is run only once.
    pub fn init() {
        INIT.call_once(setup);
    }
}
129 | 


--------------------------------------------------------------------------------
/src/triples.rs:
--------------------------------------------------------------------------------
  1 | use crate::containers::{AdjList, Bitmap, BitmapReadError, Sequence, SequenceReadError};
  2 | use crate::{ControlInfo, ControlInfoReadError};
  3 | use bytesize::ByteSize;
  4 | use log::{debug, error};
  5 | use std::cmp::Ordering;
  6 | use std::fmt;
  7 | use std::io::BufRead;
  8 | use sucds::{
  9 |     Serializable,
 10 |     bit_vectors::{BitVector, Rank9Sel},
 11 |     char_sequences::WaveletMatrix,
 12 |     int_vectors::CompactVector,
 13 | };
 14 | 
 15 | mod subject_iter;
 16 | pub use subject_iter::SubjectIter;
 17 | mod predicate_iter;
 18 | pub use predicate_iter::PredicateIter;
 19 | mod predicate_object_iter;
 20 | pub use predicate_object_iter::PredicateObjectIter;
 21 | mod object_iter;
 22 | pub use object_iter::ObjectIter;
 23 | #[cfg(feature = "cache")]
 24 | use serde::ser::SerializeStruct;
 25 | 
/// Order of the triple sections.
/// Only SPO is tested, others probably don't work correctly.
///
/// The explicit discriminants are the numeric codes that `TryFrom<u32>` maps back to the variants.
#[allow(missing_docs)]
#[repr(u8)]
#[derive(Debug, Default, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(feature = "cache", derive(serde::Deserialize, serde::Serialize))]
pub enum Order {
    // the default value of an `Order`
    #[default]
    Unknown = 0,
    SPO = 1,
    SOP = 2,
    PSO = 3,
    POS = 4,
    OSP = 5,
    OPS = 6,
}
 42 | 
 43 | impl TryFrom<u32> for Order {
 44 |     type Error = TriplesReadError;
 45 | 
 46 |     fn try_from(original: u32) -> Result<Self, TriplesReadError> {
 47 |         match original {
 48 |             0 => Ok(Order::Unknown),
 49 |             1 => Ok(Order::SPO),
 50 |             2 => Ok(Order::SOP),
 51 |             3 => Ok(Order::PSO),
 52 |             4 => Ok(Order::POS),
 53 |             5 => Ok(Order::OSP),
 54 |             6 => Ok(Order::OPS),
 55 |             n => Err(TriplesReadError::UnrecognizedTriplesOrder(n)),
 56 |         }
 57 |     }
 58 | }
 59 | 
/// Inverse index from object id to positions in the object adjacency list.
/// Used for logarithmic (?) time access instead of linear time sequential search.
///
/// The range of entries belonging to one object ID is found with `find` and `last`.
pub struct OpIndex {
    /// Compact integer vector of object positions.
    pub sequence: CompactVector,
    /// Bitmap with a one bit for every new object to allow finding the starting point for a given object id.
    pub bitmap: Bitmap,
}
 68 | 
 69 | #[cfg(feature = "cache")]
 70 | impl serde::Serialize for OpIndex {
 71 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
 72 |     where
 73 |         S: serde::ser::Serializer,
 74 |     {
 75 |         let mut state: <S as serde::ser::Serializer>::SerializeStruct =
 76 |             serializer.serialize_struct("OpIndex", 2)?;
 77 | 
 78 |         // Serialize sequence using `sucds`
 79 |         let mut seq_buffer = Vec::new();
 80 |         self.sequence.serialize_into(&mut seq_buffer).map_err(serde::ser::Error::custom)?;
 81 |         state.serialize_field("sequence", &seq_buffer)?;
 82 | 
 83 |         state.serialize_field("bitmap", &self.bitmap)?;
 84 | 
 85 |         state.end()
 86 |     }
 87 | }
 88 | 
 89 | #[cfg(feature = "cache")]
 90 | impl<'de> serde::Deserialize<'de> for OpIndex {
 91 |     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
 92 |     where
 93 |         D: serde::de::Deserializer<'de>,
 94 |     {
 95 |         #[derive(serde::Deserialize)]
 96 |         struct OpIndexData {
 97 |             sequence: Vec<u8>,
 98 |             bitmap: Bitmap,
 99 |         }
100 | 
101 |         let data = OpIndexData::deserialize(deserializer)?;
102 | 
103 |         // Deserialize `sucds` structures
104 |         let mut seq_reader = std::io::BufReader::new(&data.sequence[..]);
105 | 
106 |         let v = CompactVector::deserialize_from(&mut seq_reader).map_err(serde::de::Error::custom)?;
107 |         let index = OpIndex { sequence: v, bitmap: data.bitmap }; // Replace with proper reconstruction
108 | 
109 |         Ok(index)
110 |     }
111 | }
112 | 
113 | impl fmt::Debug for OpIndex {
114 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
115 |         writeln!(f, "total size {} {{", ByteSize(self.size_in_bytes() as u64))?;
116 |         writeln!(
117 |             f,
118 |             "    sequence: {} with {} bits,",
119 |             ByteSize(self.sequence.len() as u64 * self.sequence.width() as u64 / 8),
120 |             self.sequence.width()
121 |         )?;
122 |         write!(f, "    bitmap: {:#?}\n}}", self.bitmap)
123 |     }
124 | }
125 | 
126 | impl OpIndex {
127 |     /// Size in bytes on the heap.
128 |     pub fn size_in_bytes(&self) -> usize {
129 |         self.sequence.len() * self.sequence.width() / 8 + self.bitmap.size_in_bytes()
130 |     }
131 |     /// Find the first position in the OP index of the given object ID.
132 |     pub fn find(&self, o: Id) -> usize {
133 |         self.bitmap.select1(o - 1).unwrap() as usize
134 |     }
135 |     /// Find the last position in the object index of the given object ID.
136 |     pub fn last(&self, o: Id) -> usize {
137 |         match self.bitmap.select1(o) {
138 |             Some(index) => index as usize - 1,
139 |             None => self.bitmap.len() - 1,
140 |         }
141 |     }
142 | }
143 | 
/// `BitmapTriples` variant of the triples section.
//#[derive(Clone)]
pub struct TriplesBitmap {
    // presumably the component order in which the triples are stored — TODO confirm
    order: Order,
    /// bitmap to find positions in the wavelet matrix
    pub bitmap_y: Bitmap,
    /// adjacency list storing the object IDs
    pub adjlist_z: AdjList,
    /// Index for object-based access. Points to the predicate layer.
    pub op_index: OpIndex,
    /// wavelet matrix for predicate-based access
    pub wavelet_y: WaveletMatrix<Rank9Sel>,
}
157 | 
/// The error type for the triples bitmap read function.
#[derive(thiserror::Error, Debug)]
pub enum TriplesReadError {
    /// The control information could not be read.
    #[error("failed to read control info")]
    ControlInfoReadError(#[from] ControlInfoReadError),
    /// A bitmap could not be read.
    #[error("bitmap read error")]
    BitmapReadError(#[from] BitmapReadError),
    /// A sequence could not be read.
    #[error("sequence read error")]
    SequenceReadError(#[from] SequenceReadError),
    /// The triples order is not specified.
    #[error("unspecified triples order")]
    UnspecifiedTriplesOrder,
    /// The triples order is unknown.
    #[error("unknown triples order")]
    UnknownTriplesOrder,
    /// The contained number is not the code of any `Order` variant.
    #[error("unrecognized triples order {0}")]
    UnrecognizedTriplesOrder(u32),
    /// The contained triples format is unknown.
    #[error("unknown triples format {0}")]
    UnknownTriplesFormat(String),
    /// Triple lists are not supported yet.
    #[error("triple lists are not supported yet")]
    TriplesList,
    /// At least one component of the contained triple is 0.
    #[error("({0},{1},{2}) none of the components of a triple may be 0.")]
    TripleComponentZero(usize, usize, usize),
    /// An error of an external library that is not described any further.
    #[error("unspecified external library error")]
    ExternalError(#[from] Box<dyn std::error::Error + Send + Sync + 'static>),
    /// The cache could not be decoded, only available with the "cache" feature.
    #[error("cache decode error")]
    #[cfg(feature = "cache")]
    DecodeError(#[from] bincode::error::DecodeError),
}
185 | 
186 | impl fmt::Debug for TriplesBitmap {
187 |     /// Writes the total size, the adjacency list, the OP index and the size of the wavelet matrix.
188 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
189 |         let total = ByteSize(self.size_in_bytes() as u64);
190 |         let wavelet = ByteSize(self.wavelet_y.size_in_bytes() as u64);
191 |         writeln!(f, "total size {}", total)?;
192 |         writeln!(f, "adjlist_z {:#?}", self.adjlist_z)?;
193 |         writeln!(f, "op_index {:#?}", self.op_index)?;
194 |         write!(f, "wavelet_y {}", wavelet)
195 |     }
196 | }
194 | 
195 | #[cfg(feature = "cache")]
196 | impl serde::Serialize for TriplesBitmap {
197 |     /// Serializes the five members as a struct named "TriplesBitmap".
198 |     /// The wavelet matrix is written with its own `serialize_into` and stored as a byte vector.
199 |     fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
200 |         let mut fields = serializer.serialize_struct("TriplesBitmap", 5)?;
201 |         // members that are handed to the serializer directly
202 |         fields.serialize_field("order", &self.order)?;
203 |         fields.serialize_field("bitmap_y", &self.bitmap_y)?;
204 |         fields.serialize_field("adjlist_z", &self.adjlist_z)?;
205 |         fields.serialize_field("op_index", &self.op_index)?;
206 |         // the wavelet matrix goes through an intermediate byte buffer
207 |         let mut wavelet_bytes = Vec::new();
208 |         self.wavelet_y.serialize_into(&mut wavelet_bytes).map_err(serde::ser::Error::custom)?;
209 |         fields.serialize_field("wavelet_y", &wavelet_bytes)?;
210 |         fields.end()
211 |     }
212 | }
224 | 
225 | #[cfg(feature = "cache")]
226 | impl<'de> serde::Deserialize<'de> for TriplesBitmap {
227 |     /// Reads the five serialized members and rebuilds the wavelet matrix from its byte vector.
228 |     fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
229 |         // serialized layout: like `TriplesBitmap` but with the wavelet matrix as raw bytes
230 |         #[derive(serde::Deserialize)]
231 |         struct TriplesBitmapData {
232 |             order: Order,
233 |             bitmap_y: Bitmap,
234 |             adjlist_z: AdjList,
235 |             op_index: OpIndex,
236 |             wavelet_y: Vec<u8>,
237 |         }
238 | 
239 |         let TriplesBitmapData { order, bitmap_y, adjlist_z, op_index, wavelet_y: wavelet_bytes } =
240 |             TriplesBitmapData::deserialize(deserializer)?;
241 | 
242 |         // Deserialize `sucds` structures
243 |         let mut wavelet_reader = std::io::BufReader::new(&wavelet_bytes[..]);
244 |         let wavelet_y =
245 |             WaveletMatrix::<Rank9Sel>::deserialize_from(&mut wavelet_reader).map_err(serde::de::Error::custom)?;
246 | 
247 |         Ok(TriplesBitmap { order, bitmap_y, adjlist_z, op_index, wavelet_y })
248 |     }
249 | }
258 | 
259 | impl TriplesBitmap {
260 |     /// Read the whole triples section including its control information.
261 |     /// Only the bitmap triples format is implemented, all other formats result in an error.
262 |     pub fn read_sect<R: BufRead>(reader: &mut R) -> Result<Self, TriplesReadError> {
263 |         let triples_ci = ControlInfo::read(reader)?;
264 |         let format = &triples_ci.format[..];
265 |         if format == "<http://purl.org/HDT/hdt#triplesBitmap>" {
266 |             TriplesBitmap::read(reader, &triples_ci)
267 |         } else if format == "<http://purl.org/HDT/hdt#triplesList>" {
268 |             Err(TriplesReadError::TriplesList)
269 |         } else {
270 |             Err(TriplesReadError::UnknownTriplesFormat(format.to_owned()))
271 |         }
272 |     }
271 | 
272 |     /// Load the cached HDT index file, only supports `TriplesBitmap`.
273 |     /// The format named in `info` decides whether the cache is decoded or an error is returned.
274 |     #[cfg(feature = "cache")]
275 |     pub fn load_cache<R: BufRead>(reader: &mut R, info: &ControlInfo) -> Result<Self, TriplesReadError> {
276 |         let format = &info.format[..];
277 |         if format == "<http://purl.org/HDT/hdt#triplesBitmap>" {
278 |             TriplesBitmap::load(reader)
279 |         } else if format == "<http://purl.org/HDT/hdt#triplesList>" {
280 |             Err(TriplesReadError::TriplesList)
281 |         } else {
282 |             Err(TriplesReadError::UnknownTriplesFormat(format.to_owned()))
283 |         }
284 |     }
282 | 
283 |     /// Decode an entire cached `TriplesBitmap` from `reader` using the standard bincode configuration.
284 |     #[cfg(feature = "cache")]
285 |     pub fn load<R: BufRead>(reader: &mut R) -> Result<Self, TriplesReadError> {
286 |         let config = bincode::config::standard();
287 |         bincode::serde::decode_from_std_read(reader, config).map_err(TriplesReadError::from)
288 |     }
289 | 
290 |     /// Size in bytes of `adjlist_z`, `op_index` and `wavelet_y` combined (`bitmap_y` is not counted).
291 |     pub fn size_in_bytes(&self) -> usize {
292 |         [self.adjlist_z.size_in_bytes(), self.op_index.size_in_bytes(), self.wavelet_y.size_in_bytes()].iter().sum()
293 |     }
294 | 
295 |     /// Position in the wavelet index of the first predicate of the subject with zero-based index `subject_id` (its ID minus 1).
296 |     pub fn find_y(&self, subject_id: Id) -> usize {
297 |         match subject_id {
298 |             0 => 0,
299 |             n => self.bitmap_y.select1(n - 1).unwrap() as usize + 1,
300 |         }
301 |     }
302 | 
303 |     /// Position in the wavelet index of the last predicate of the subject with zero-based index `subject_id`.
304 |     pub fn last_y(&self, subject_id: usize) -> usize {
305 |         let next_first = self.find_y(subject_id + 1);
306 |         next_first - 1
307 |     }
307 | 
308 |     /// Binary search for `element` among the wavelet matrix positions `begin..end` (`end` is exclusive).
309 |     fn bin_search_y(&self, element: usize, begin: usize, end: usize) -> Option<usize> {
310 |         let (mut lo, mut hi) = (begin, end);
311 |         while lo < hi {
312 |             let probe = usize::midpoint(lo, hi);
313 |             let value = self.wavelet_y.access(probe).unwrap();
314 |             if value == element {
315 |                 return Some(probe);
316 |             }
317 |             if value < element {
318 |                 lo = probe + 1;
319 |             } else {
320 |                 hi = probe;
321 |             }
322 |         }
323 |         None
324 |     }
323 | 
324 |     /// Position in the wavelet matrix of `property_id` among the predicates of the subject with zero-based index `subject_id`.
325 |     pub fn search_y(&self, subject_id: usize, property_id: usize) -> Option<usize> {
326 |         let first = self.find_y(subject_id);
327 |         let end = self.last_y(subject_id) + 1; // exclusive
328 |         self.bin_search_y(property_id, first, end)
329 |     }
328 | 
329 |     /// Builds the wavelet matrix from the entries of `sequence`.
330 |     /// Panics if the CRC check of the sequence or the construction of the matrix fails.
331 |     fn build_wavelet(mut sequence: Sequence) -> WaveletMatrix<Rank9Sel> {
332 |         debug!("Building wavelet matrix...");
333 |         let mut entries =
334 |             CompactVector::new(sequence.bits_per_entry).expect("Failed to create wavelet matrix builder");
335 |         // possible refactor of Sequence to use sucds CompactVector, then this copy can be removed
336 |         (&sequence).into_iter().for_each(|entry| entries.push_int(entry).unwrap());
337 |         let crc_ok = sequence.crc_handle.take().unwrap().join().unwrap();
338 |         assert!(crc_ok, "Wavelet source CRC check failed.");
339 |         // the source is released before the matrix is built
340 |         drop(sequence);
341 |         let wavelet = WaveletMatrix::new(entries).expect("Error building the wavelet matrix. Aborting.");
342 |         debug!("Built wavelet matrix with length {}", wavelet.len());
343 |         wavelet
344 |     }
343 | 
344 |     /*
345 |         /// Get the predicate ID for the given z index position.
346 |     fn get_p(bitmap_z:  Bitmap, wavelet_y: WaveletMatrix, pos_z: usize) -> Id {
347 |                 let pos_y = bitmap_z.dict.rank(pos_z, true);
348 |                 wavelet_y.get(pos_y as usize) as Id
349 |     }
350 |     */
351 | 
352 |     /// Reads the bitmap triples section that follows the already parsed control information `triples_ci`.
353 |     ///
354 |     /// Reads the Y and Z bitmaps and sequences from `reader`, builds the wavelet matrix of the Y layer
355 |     /// in a background thread and meanwhile constructs the object-based index from the Z layer.
356 |     /// An empty Z sequence results in an empty index.
357 |     ///
358 |     /// # Errors
359 |     /// `UnspecifiedTriplesOrder` if the "order" property is missing or not a `u32`, otherwise the
360 |     /// errors of the bitmap and sequence readers.
361 |     ///
362 |     /// # Panics
363 |     /// If the CRC check of one of the sequences fails.
364 |     fn read<R: BufRead>(reader: &mut R, triples_ci: &ControlInfo) -> Result<Self, TriplesReadError> {
365 |         // read order
366 |         let Some(order_code) = triples_ci.get("order").and_then(|v| v.parse::<u32>().ok()) else {
367 |             return Err(TriplesReadError::UnspecifiedTriplesOrder);
368 |         };
369 |         let order = Order::try_from(order_code)?;
370 | 
371 |         // read bitmaps
372 |         // TODO: note level in the error
373 |         let bitmap_y = Bitmap::read(reader)?;
374 |         let bitmap_z = Bitmap::read(reader)?;
375 | 
376 |         // read sequences, the wavelet matrix is built from the Y sequence while the Z sequence is processed
377 |         let sequence_y = Sequence::read(reader)?;
378 |         let wavelet_thread = std::thread::spawn(|| Self::build_wavelet(sequence_y));
379 |         let mut sequence_z = Sequence::read(reader)?;
380 | 
381 |         // construct adjacency lists
382 |         // construct object-based index to traverse from the leaves and support ??O and ?PO queries
383 |         debug!("Building OPS index...");
384 |         let entries = sequence_z.entries;
385 |         // if it takes too long to calculate, can also pass in as parameter
386 |         // an empty Z level has no maximum, which is treated as zero objects instead of a panic
387 |         let max_object = sequence_z.into_iter().max().unwrap_or(0);
388 |         // Y positions are stored as u32, larger positions would be truncated by the cast below
389 |         let mut indicess = vec![Vec::<u32>::with_capacity(4); max_object];
390 | 
391 |         // Count the indexes of appearance of each object
392 |         // In https://github.com/rdfhdt/hdt-cpp/blob/develop/libhdt/src/triples/BitmapTriples.cpp
393 |         // they count the number of appearances in a sequence instead, which saves memory
394 |         // temporarily but they need to loop over it an additional time.
395 |         for pos_z in 0..entries {
396 |             let object = sequence_z.get(pos_z);
397 |             if object == 0 {
398 |                 error!("ERROR: There is a zero value in the Z level.");
399 |                 continue;
400 |             }
401 |             let pos_y = bitmap_z.rank(pos_z);
402 |             indicess[object - 1].push(pos_y as u32); // hdt index counts from 1 but we count from 0 for simplicity
403 |         }
404 |         // reduce memory consumption of index by using adjacency list
405 |         let mut bitmap_index_bitvector = BitVector::new();
406 |         let mut cv = CompactVector::with_capacity(entries, sucds::utils::needed_bits(entries))
407 |             .map_err(|e| e.into_boxed_dyn_error())?;
408 |         let wavelet_y = wavelet_thread.join().unwrap();
409 |         for mut indices in indicess {
410 |             // sort by predicate
411 |             indices.sort_by_cached_key(|pos_y| wavelet_y.access(*pos_y as usize).unwrap());
412 |             // the first entry of each object is marked with a 1 bit, all others with a 0 bit
413 |             for (i, index) in indices.into_iter().enumerate() {
414 |                 bitmap_index_bitvector.push_bit(i == 0);
415 |                 cv.push_int(index as usize).unwrap();
416 |             }
417 |         }
418 |         let bitmap_index = Bitmap { dict: Rank9Sel::new(bitmap_index_bitvector) };
419 |         let op_index = OpIndex { sequence: cv, bitmap: bitmap_index };
420 |         debug!("built OPS index");
421 |         assert!(sequence_z.crc_handle.take().unwrap().join().unwrap(), "sequence_z CRC check failed.");
422 |         let adjlist_z = AdjList::new(sequence_z, bitmap_z);
423 |         Ok(TriplesBitmap { order, bitmap_y, adjlist_z, op_index, wavelet_y })
424 |     }
423 | 
424 |     /// Transform the given IDs of the layers in triple section order to a triple ID.
425 |     ///
426 |     /// `x`, `y` and `z` are the IDs of the first, second and third layer, for example with order POS
427 |     /// they are the predicate, object and subject ID. With SPO this is equivalent to `TripleId::new(x,y,z)`.
428 |     /// Warning: At the moment only SPO is properly supported by the iterators.
429 |     ///
430 |     /// # Errors
431 |     /// `TripleComponentZero` if any of the IDs is 0, `UnknownTriplesOrder` if the order is `Order::Unknown`.
432 |     pub fn coord_to_triple(&self, x: Id, y: Id, z: Id) -> Result<TripleId, TriplesReadError> {
433 |         use TriplesReadError::*;
434 |         if x == 0 || y == 0 || z == 0 {
435 |             return Err(TripleComponentZero(x, y, z));
436 |         }
437 |         // pick subject, predicate and object from the layer each of them is stored in
438 |         let (s, p, o) = match self.order {
439 |             Order::SPO => (x, y, z),
440 |             Order::SOP => (x, z, y),
441 |             Order::PSO => (y, x, z),
442 |             // x = P, y = O, z = S
443 |             Order::POS => (z, x, y),
444 |             // x = O, y = S, z = P
445 |             Order::OSP => (y, z, x),
446 |             Order::OPS => (z, y, x),
447 |             Order::Unknown => return Err(UnknownTriplesOrder),
448 |         };
449 |         Ok(TripleId::new(s, p, o))
450 |     }
442 | }
443 | 
444 | /// Iterating over a reference to the triples yields all of them through a `SubjectIter`.
445 | impl<'a> IntoIterator for &'a TriplesBitmap {
446 |     type Item = TripleId;
447 |     type IntoIter = SubjectIter<'a>;
448 | 
449 |     fn into_iter(self) -> SubjectIter<'a> {
450 |         SubjectIter::new(self)
451 |     }
452 | }
452 | 
453 | /// Subject, predicate or object ID, starting at 1.
454 | ///
455 | /// Subjects and objects share IDs, starting at 1, for common values.
456 | /// A value of 0 indicates either not found (as a return value) or all of them (in a triple pattern).
457 | /// In the official documentation, u32 is used, however here, usize is used.
458 | /// While u32 caps out at 4 billion, more is not supported by the format anyways so this can probably be changed to u32.
459 | pub type Id = usize;
460 | 
461 | /// Type for a triple encoded as numeric IDs for subject, predicate and object, respectively.
462 | /// See <https://www.rdfhdt.org/hdt-binary-format/#triples>. In a triple pattern, a component of 0 stands for a variable.
463 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
464 | pub struct TripleId {
465 |     /// Index starting at 1 in the combined shared and subject section.
466 |     pub subject_id: Id,
467 |     /// Index starting at 1 in the predicate section.
468 |     pub predicate_id: Id,
469 |     /// Index starting at 1 in the combined shared and object section.
470 |     pub object_id: Id,
471 | }
472 | 
473 | impl TripleId {
474 |     /// Create a triple ID from its subject, predicate and object ID.
475 |     pub const fn new(subject_id: Id, predicate_id: Id, object_id: Id) -> Self {
476 |         Self { subject_id, predicate_id, object_id }
477 |     }
478 | }
479 | 
480 | #[cfg(test)]
481 | mod tests {
482 |     use super::*;
483 |     use crate::header::Header;
484 |     use crate::tests::init;
485 |     use crate::{FourSectDict, IdKind};
486 |     use pretty_assertions::assert_eq;
487 |     use std::fs::File;
488 |     use std::io::BufReader;
489 | 
490 |     /// Boxed iterator over all triples that have `id` in the position given by `k` (subject, predicate or object).
491 |     fn triples_with_id<'a>(
492 |         t: &'a TriplesBitmap, id: usize, k: &IdKind,
493 |     ) -> Box<dyn Iterator<Item = TripleId> + 'a> {
494 |         let iter: Box<dyn Iterator<Item = TripleId> + 'a> = match k {
495 |             IdKind::Subject => Box::new(SubjectIter::with_s(t, id)),
496 |             IdKind::Predicate => Box::new(PredicateIter::new(t, id)),
497 |             IdKind::Object => Box::new(ObjectIter::new(t, id)),
498 |         };
499 |         iter
500 |     }
500 | 
501 |     /// Reads the triples section of snikmeta.hdt and compares the subject, predicate and object
502 |     /// iterators as well as several `SubjectIter` patterns with a filter over all triples.
503 |     #[test]
504 |     fn read_triples() -> color_eyre::Result<()> {
505 |         init();
506 |         let file = File::open("tests/resources/snikmeta.hdt")?;
507 |         let mut reader = BufReader::new(file);
508 |         // the triples section comes after the global control information, the header and the dictionary
509 |         ControlInfo::read(&mut reader)?;
510 |         Header::read(&mut reader)?;
511 |         let _dict = FourSectDict::read(&mut reader)?;
512 |         let triples = TriplesBitmap::read_sect(&mut reader)?;
513 |         let v: Vec<TripleId> = triples.into_iter().collect::<Vec<TripleId>>();
514 |         assert_eq!(v.len(), 328);
515 |         assert_eq!(v[0].subject_id, 1);
516 |         assert_eq!(v[2].subject_id, 1);
517 |         assert_eq!(v[3].subject_id, 2);
518 |         let num_subjects = 48;
519 |         let num_predicates = 23;
520 |         let num_objects = 175;
521 |         let mut filtered: Vec<TripleId>;
522 |         let kinds = [IdKind::Subject, IdKind::Predicate, IdKind::Object];
523 |         let lens = [num_subjects, num_predicates, num_objects];
524 |         let funs = [|t: TripleId| t.subject_id, |t: TripleId| t.predicate_id, |t: TripleId| t.object_id];
525 |         // every ID of every kind has to yield the same triples as filtering all triples
526 |         for j in 0..kinds.len() {
527 |             for i in 1..=lens[j] {
528 |                 filtered = v.iter().filter(|tid| funs[j](**tid) == i).copied().collect();
529 |                 filtered.sort_unstable();
530 |                 let mut triples_with_id = triples_with_id(&triples, i, &kinds[j]).collect::<Vec<TripleId>>();
531 |                 triples_with_id.sort_unstable();
532 |                 assert_eq!(filtered, triples_with_id, "triples_with({},{:?})", i, kinds[j]);
533 |             }
534 |         }
535 | 
536 |         // SubjectIter
537 |         assert_eq!(0, SubjectIter::empty(&triples).count());
538 |         // SPO
539 |         assert_eq!(
540 |             vec![TripleId::new(14, 14, 154)],
541 |             SubjectIter::with_pattern(&triples, &TripleId::new(14, 14, 154)).collect::<Vec<_>>()
542 |         );
543 |         // SP
544 |         assert_eq!(
545 |             vec![TripleId::new(14, 14, 154)],
546 |             SubjectIter::with_pattern(&triples, &TripleId::new(14, 14, 0)).collect::<Vec<_>>()
547 |         );
548 |         // S??, including the last subject
549 |         for i in 1..=num_subjects {
550 |             assert_eq!(
551 |                 SubjectIter::with_s(&triples, i).collect::<Vec<_>>(),
552 |                 SubjectIter::with_pattern(&triples, &TripleId::new(i, 0, 0)).collect::<Vec<_>>()
553 |             );
554 |         }
555 |         // ??? (all triples)
556 |         assert_eq!(v, SubjectIter::with_pattern(&triples, &TripleId::new(0, 0, 0)).collect::<Vec<_>>());
557 |         // SPO where S and P are in the graph, but not together
558 |         assert_eq!(0, SubjectIter::with_pattern(&triples, &TripleId::new(12, 14, 154)).count());
559 |         Ok(())
560 |     }
557 | }
558 | 


--------------------------------------------------------------------------------
/src/triples/object_iter.rs:
--------------------------------------------------------------------------------
 1 | use crate::triples::Id;
 2 | use crate::triples::TripleId;
 3 | use crate::triples::TriplesBitmap;
 4 | use sucds::int_vectors::Access;
 5 | 
 6 | // see "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012
 7 | // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36
 8 | // actually only an object iterator when SPO order is used
 9 | // TODO test with other orders and fix if broken
10 | 
11 | /// Iterator over all triples with a given object ID, answering an (?S,?P,O) query.
12 | pub struct ObjectIter<'a> {
13 |     triples: &'a TriplesBitmap,
14 |     o: Id, // object ID shared by all yielded triples
15 |     pos_index: usize, // current position in the OP index
16 |     max_index: usize, // last position in the OP index that belongs to `o` (inclusive)
17 | }
18 | 
19 | impl<'a> ObjectIter<'a> {
20 |     /// Create a new iterator over all triples with the given object ID.
21 |     /// Panics if the object ID is 0 or if the object cannot be found in the OP index.
22 |     pub fn new(triples: &'a TriplesBitmap, o: Id) -> Self {
23 |         assert!(o != 0, "object 0 does not exist, cant iterate");
24 |         let index = &triples.op_index;
25 |         // inclusive range of the OP index entries of the object
26 |         Self { triples, o, pos_index: index.find(o), max_index: index.last(o) }
27 |     }
28 | }
30 | 
31 | impl Iterator for ObjectIter<'_> {
32 |     type Item = TripleId;
33 |     /// Yields the triple of the next OP index entry of the object.
34 |     fn next(&mut self) -> Option<TripleId> {
35 |         if self.pos_index > self.max_index {
36 |             return None;
37 |         }
38 |         let triples = self.triples;
39 |         // the OP index entry points into the predicate layer
40 |         let pos_y = triples.op_index.sequence.access(self.pos_index).unwrap();
41 |         let predicate = triples.wavelet_y.access(pos_y).unwrap() as Id;
42 |         let subject = triples.bitmap_y.rank(pos_y) as Id + 1;
43 |         self.pos_index += 1;
44 |         // built directly and not with coord_to_triple, so this assumes SPO order
45 |         Some(TripleId::new(subject, predicate, self.o))
46 |     }
47 | }
45 | 


--------------------------------------------------------------------------------
/src/triples/predicate_iter.rs:
--------------------------------------------------------------------------------
 1 | use crate::triples::Id;
 2 | use crate::triples::TripleId;
 3 | use crate::triples::TriplesBitmap;
 4 | 
 5 | /// Iterator over all triples with a given property ID, answering an (?S,P,?O) query.
 6 | pub struct PredicateIter<'a> {
 7 |     triples: &'a TriplesBitmap,
 8 |     s: Id, // subject ID of the current occurrence of `p`
 9 |     p: Id, // the predicate ID that is iterated over
10 |     i: usize, // number of the current occurrence of `p` in the wavelet matrix, starting at 0
11 |     os: usize, // objects still left for the current occurrence after the current one
12 |     pos_z: usize, // position of the current object in the Z layer
13 |     occs: usize, // total number of occurrences of `p` in the wavelet matrix
14 | }
15 | 
16 | impl<'a> PredicateIter<'a> {
17 |     /// Create a new iterator over all triples with the given property ID.
18 |     /// Panics if the predicate ID is 0.
19 |     pub fn new(triples: &'a TriplesBitmap, p: Id) -> Self {
20 |         assert!(p != 0, "predicate 0 does not exist, cant iterate");
21 |         // number of occurrences of the predicate in the whole predicate layer
22 |         let occs = triples.wavelet_y.rank(triples.wavelet_y.len(), p).unwrap();
23 |         PredicateIter { triples, p, i: 0, pos_z: 0, os: 0, s: 0, occs }
24 |     }
25 | }
26 | 
27 | impl Iterator for PredicateIter<'_> {
28 |     type Item = TripleId;
29 |     /// Yields the triples of one occurrence of the predicate after the other, object by object.
30 |     fn next(&mut self) -> Option<TripleId> {
31 |         if self.i >= self.occs {
32 |             return None;
33 |         }
34 |         let triples = self.triples;
35 |         if self.os > 0 {
36 |             // continue with the next object of the current subject-predicate pair
37 |             self.os -= 1;
38 |             self.pos_z += 1;
39 |         } else {
40 |             // Algorithm 1 findSubj from Martinez et al. 2012
41 |             let pos_y = triples.wavelet_y.select(self.i, self.p as usize).unwrap();
42 |             self.s = triples.bitmap_y.rank(pos_y) as Id + 1;
43 |             // SP can have multiple O, remember how many follow the first one
44 |             let adjlist = &triples.adjlist_z;
45 |             self.pos_z = adjlist.find(pos_y as Id);
46 |             self.os = adjlist.last(pos_y as Id) - self.pos_z;
47 |         }
48 | 
49 |         let o = triples.adjlist_z.sequence.get(self.pos_z) as Id;
50 |         if self.os == 0 {
51 |             // that was the last object of this occurrence, continue with the next occurrence
52 |             self.i += 1;
53 |         }
54 |         Some(triples.coord_to_triple(self.s, self.p, o).unwrap())
55 |     }
56 | }
55 | 


--------------------------------------------------------------------------------
/src/triples/predicate_object_iter.rs:
--------------------------------------------------------------------------------
 1 | use crate::triples::Id;
 2 | use crate::triples::TriplesBitmap;
 3 | use std::cmp::Ordering;
 4 | use sucds::int_vectors::Access;
 5 | 
 6 | // see filterPredSubj in "Exchange and Consumption of Huge RDF Data" by Martinez et al. 2012
 7 | // https://link.springer.com/chapter/10.1007/978-3-642-30284-8_36
 8 | 
 9 | /// Iterator over all subject IDs with a given predicate and object ID, answering an (?S,P,O) query.
10 | pub struct PredicateObjectIter<'a> {
11 |     triples: &'a TriplesBitmap,
12 |     pos_index: usize, // current position in the OP index
13 |     max_index: usize, // last matching position in the OP index (inclusive)
14 | }
15 | 
16 | impl<'a> PredicateObjectIter<'a> {
17 |     /// Create a new iterator over the subject IDs of all triples with the given predicate and object ID.
18 |     /// Panics if the predicate or object ID is 0.
19 |     pub fn new(triples: &'a TriplesBitmap, p: Id, o: Id) -> Self {
20 |         assert_ne!(0, p, "predicate 0 does not exist, cant iterate");
21 |         assert_ne!(0, o, "object 0 does not exist, cant iterate");
22 |         let mut low = triples.op_index.find(o); // first OP index position of the object
23 |         let mut high = triples.op_index.last(o); // last OP index position of the object (inclusive)
24 |         let get_y = |pos_index| { // predicate ID that belongs to a position of the OP index
25 |             let pos_y = triples.op_index.sequence.access(pos_index).unwrap();
26 |             triples.wavelet_y.access(pos_y).unwrap() as Id
27 |         };
28 |         // Binary search with a twist over the OP index entries of the object, which are sorted by predicate:
29 |         // Each value may occur multiple times, so we search for the left and right borders.
30 |         while low <= high {
31 |             let mut mid = usize::midpoint(low, high);
32 |             match get_y(mid).cmp(&p) {
33 |                 Ordering::Less => low = mid + 1,
34 |                 Ordering::Greater => high = mid, // high stays inclusive; presumably not mid - 1 to avoid underflow at 0
35 |                 Ordering::Equal => {
36 |                     let mut left_high = mid; // left border: smallest position with predicate p
37 |                     while low < left_high {
38 |                         mid = usize::midpoint(low, left_high);
39 |                         match get_y(mid).cmp(&p) {
40 |                             Ordering::Less => low = mid + 1,
41 |                             Ordering::Greater => {
42 |                                 high = mid;
43 |                                 left_high = mid;
44 |                             }
45 |                             Ordering::Equal => left_high = mid,
46 |                         }
47 |                     }
48 |                     // right border: largest position with predicate p, the midpoint is rounded up so that right_low = mid makes progress
49 |                     let mut right_low = low;
50 |                     while right_low < high {
51 |                         mid = (right_low + high).div_ceil(2);
52 |                         match get_y(mid).cmp(&p) {
53 |                             Ordering::Greater => high = mid - 1,
54 |                             _ => right_low = mid,
55 |                         }
56 |                     }
57 |                     return PredicateObjectIter { triples, pos_index: low, max_index: high };
58 |                 }
59 |             }
60 |             if (high == 0 && low == 0) || (high == low && high == mid) { // the range cannot shrink further, stop instead of looping forever
61 |                 break;
62 |             }
63 |         }
64 |         // not found, pos_index > max_index makes next() return None right away
65 |         PredicateObjectIter { triples, pos_index: 999, max_index: 0 }
66 |     }
67 | }
68 | 
69 | impl Iterator for PredicateObjectIter<'_> {
70 |     type Item = Id;
71 |     /// Yields the subject ID that belongs to the next matching position of the OP index.
72 |     fn next(&mut self) -> Option<Id> {
73 |         if self.pos_index > self.max_index {
74 |             return None;
75 |         }
76 |         let triples = self.triples;
77 |         // the OP index entry is a position in the predicate layer, from which the subject ID is derived
78 |         let pos_y = triples.op_index.sequence.access(self.pos_index).unwrap();
79 |         let subject = triples.bitmap_y.rank(pos_y) as Id + 1;
80 |         self.pos_index += 1;
81 |         Some(subject)
82 |     }
83 | }
83 | 


--------------------------------------------------------------------------------
/src/triples/subject_iter.rs:
--------------------------------------------------------------------------------
  1 | use super::{Id, TripleId, TriplesBitmap};
  2 | 
  3 | /// Iterator over triples fitting an SPO, SP?, S??, S?O or ??? triple pattern.
  4 | //#[derive(Debug)]
  5 | pub struct SubjectIter<'a> {
  6 |     // triples data
  7 |     triples: &'a TriplesBitmap,
  8 |     // x-coordinate identifier, passed as first coordinate to coord_to_triple
  9 |     x: Id,
 10 |     // current positions in the Y and Z layer
 11 |     pos_y: usize,
 12 |     pos_z: usize,
 13 |     max_y: usize, // exclusive upper bound of pos_y
 14 |     max_z: usize, // exclusive upper bound of pos_z
 15 |     search_z: usize, // object ID to look for in S?O patterns, 0 otherwise
 16 | }
 17 | 
 18 | impl<'a> SubjectIter<'a> {
 19 |     /// Create an iterator over all triples.
 20 |     pub fn new(triples: &'a TriplesBitmap) -> Self {
 21 |         // both upper bounds are exclusive
 22 |         let max_y = triples.wavelet_y.len();
 23 |         let max_z = triples.adjlist_z.len();
 24 |         // x was 0 in the old code but it should start at 1
 25 |         SubjectIter { triples, x: 1, pos_y: 0, pos_z: 0, max_y, max_z, search_z: 0 }
 26 |     }
 31 | 
 32 |     /// Iterator that yields nothing, use when no results are found.
 33 |     pub const fn empty(triples: &'a TriplesBitmap) -> Self {
 34 |         // with max_y == 0 the first call of next() already returns None
 35 |         Self { triples, x: 1, search_z: 0, pos_y: 0, max_y: 0, pos_z: 0, max_z: 0 }
 36 |     }
 36 | 
 37 |     /// Convenience method for the S?? triple pattern, `subject_id` has to be at least 1.
 38 |     /// See <https://github.com/rdfhdt/hdt-cpp/blob/develop/libhdt/src/triples/BitmapTriplesIterators.cpp>.
 39 |     pub fn with_s(triples: &'a TriplesBitmap, subject_id: Id) -> Self {
 40 |         let adjlist = &triples.adjlist_z;
 41 |         // from the first predicate of the subject (inclusive) to the first one of the next subject (exclusive)
 42 |         let pos_y = triples.find_y(subject_id - 1);
 43 |         let pos_z = adjlist.find(pos_y as Id);
 44 |         let max_y = triples.find_y(subject_id);
 45 |         let max_z = adjlist.find(max_y as Id);
 46 |         Self { triples, x: subject_id, pos_y, pos_z, max_y, max_z, search_z: 0 }
 47 |     }
 46 | 
 47 |     /// Iterate over triples fitting the given SPO, SP?, S??, S?O or ??? triple pattern.
 48 |     /// Variable positions are signified with a 0 value.
 49 |     /// Undefined result if any other triple pattern is used.
 50 |     /// # Examples
 51 |     /// ```text
 52 |     /// // S?? pattern, all triples with subject ID 1
 53 |     /// SubjectIter::with_pattern(triples, &TripleId::new(1, 0, 0));
 54 |     /// // SP? pattern, all triples with subject ID 1 and predicate ID 2
 55 |     /// SubjectIter::with_pattern(triples, &TripleId::new(1, 2, 0));
 56 |     /// // match a specific triple, not useful in practice except as an ASK query
 57 |     /// SubjectIter::with_pattern(triples, &TripleId::new(1, 2, 3));
 58 |     /// ```
 59 |     // Translated from <https://github.com/rdfhdt/hdt-cpp/blob/develop/libhdt/src/triples/BitmapTriplesIterators.cpp>.
 60 |     pub fn with_pattern(triples: &'a TriplesBitmap, pat: &TripleId) -> Self {
 61 |         let TripleId { subject_id: pat_x, predicate_id: pat_y, object_id: pat_z } = *pat;
 62 |         // only SPO order is supported currently
 63 |         if pat_x == 0 {
 64 |             // ? X X
 65 |             // assume ? ? ?, other triple patterns are not supported by this iterator
 66 |             return Self::new(triples);
 67 |         }
 68 |         let adjlist = &triples.adjlist_z;
 69 |         let subject_index = pat_x - 1;
 70 |         if pat_y == 0 {
 71 |             // S ? X: all predicates of the subject, next() filters by the object unless it is 0
 72 |             let pos_y = triples.find_y(subject_index);
 73 |             let pos_z = adjlist.find(pos_y);
 74 |             let max_y = triples.last_y(subject_index) + 1;
 75 |             let max_z = adjlist.find(max_y);
 76 |             return SubjectIter { triples, x: pat_x, pos_y, pos_z, max_y, max_z, search_z: pat_z };
 77 |         }
 78 |         // S P X: exactly one position in the predicate layer
 79 |         let Some(pos_y) = triples.search_y(subject_index, pat_y) else {
 80 |             return Self::empty(triples);
 81 |         };
 82 |         let (pos_z, max_z) = if pat_z == 0 {
 83 |             // S P ?
 84 |             (adjlist.find(pos_y), adjlist.last(pos_y) + 1)
 85 |         } else {
 86 |             // S P O
 87 |             match adjlist.search(pos_y, pat_z) {
 88 |                 Some(found) => (found, found + 1),
 89 |                 None => return Self::empty(triples),
 90 |             }
 91 |         };
 92 |         SubjectIter { triples, x: pat_x, pos_y, pos_z, max_y: pos_y + 1, max_z, search_z: 0 }
 93 |     }
107 | }
108 | 
109 | impl Iterator for SubjectIter<'_> {
110 |     type Item = TripleId;
111 | 
112 |     /// Yields the next triple of the pattern; for S?O patterns (`search_z > 0`) at most one per predicate.
113 |     fn next(&mut self) -> Option<Self::Item> {
114 |         let triples = self.triples;
115 |         if self.search_z > 0 {
116 |             // S?O: look for the object below each remaining predicate of the subject
117 |             while self.pos_y < self.max_y {
118 |                 let pos_y = self.pos_y;
119 |                 let y = triples.wavelet_y.access(pos_y).unwrap() as Id;
120 |                 self.pos_y += 1;
121 |                 if triples.adjlist_z.search(pos_y, self.search_z).is_some() {
122 |                     return Some(triples.coord_to_triple(self.x, y, self.search_z).unwrap());
123 |                 }
124 |             }
125 |             return None;
126 |         }
127 | 
128 |         if self.pos_y >= self.max_y {
129 |             return None;
130 |         }
131 |         let y = triples.wavelet_y.access(self.pos_y).unwrap() as Id;
132 |         if self.pos_z >= self.max_z {
133 |             return None;
134 |         }
135 |         let z = triples.adjlist_z.get_id(self.pos_z);
136 |         let triple_id = triples.coord_to_triple(self.x, y, z).unwrap();
137 | 
138 |         // theoretically the second condition should only be true if the first is as well but in practise it wasn't, which screwed up the subject identifiers
139 |         // fixed by moving the second condition inside the first one but there may be another reason for the bug occuring in the first place
140 |         let last_object_of_predicate = triples.adjlist_z.at_last_sibling(self.pos_z);
141 |         if last_object_of_predicate {
142 |             let last_predicate_of_subject = triples.bitmap_y.at_last_sibling(self.pos_y);
143 |             if last_predicate_of_subject {
144 |                 self.x += 1;
145 |             }
146 |             self.pos_y += 1;
147 |         }
148 |         self.pos_z += 1;
149 |         Some(triple_id)
150 |     }
151 | }
149 | 


--------------------------------------------------------------------------------
/tests/resources/snikmeta.hdt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KonradHoeffner/hdt/3fee2114fb39d6ae8690da023a7b02970c4d3bdb/tests/resources/snikmeta.hdt


--------------------------------------------------------------------------------
/tests/resources/yago_header.hdt:
--------------------------------------------------------------------------------
 1 | $HDT<http://purl.org/HDT/hdt#HDTv1>  v5$HDTntriples length=1891; ѫ<http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://purl.org/HDT/hdt#Dataset> .
 2 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://rdfs.org/ns/void#Dataset> .
 3 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://rdfs.org/ns/void#triples> "158991568" .
 4 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://rdfs.org/ns/void#properties> "104" .
 5 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://rdfs.org/ns/void#distinctSubjects> "67813972" .
 6 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://rdfs.org/ns/void#distinctObjects> "22354760" .
 7 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://purl.org/HDT/hdt#statisticalInformation> _:statistics .
 8 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://purl.org/HDT/hdt#publicationInformation> _:publicationInformation .
 9 | <http://www.mpi-inf.mpg.de/yago-naga/yago/> <http://purl.org/HDT/hdt#formatInformation> _:format .
10 | _:format <http://purl.org/HDT/hdt#dictionary> _:dictionary .
11 | _:format <http://purl.org/HDT/hdt#triples> _:triples .
12 | _:dictionary <http://purl.org/dc/terms/format> <http://purl.org/HDT/hdt#dictionaryFour> .
13 | _:dictionary <http://purl.org/HDT/hdt#dictionarynumSharedSubjectObject> "2748838" .
14 | _:dictionary <http://purl.org/HDT/hdt#dictionarymapping> "1" .
15 | _:dictionary <http://purl.org/HDT/hdt#dictionarysizeStrings> "1402843002" .
16 | _:dictionary <http://purl.org/HDT/hdt#dictionaryblockSize> "16" .
17 | _:triples <http://purl.org/dc/terms/format> <http://purl.org/HDT/hdt#triplesBitmap> .
18 | _:triples <http://purl.org/HDT/hdt#triplesnumTriples> "158991568" .
19 | _:triples <http://purl.org/HDT/hdt#triplesOrder> "SPO" .
20 | _:statistics <http://purl.org/HDT/hdt#originalSize> "26345372323" .
21 | _:statistics <http://purl.org/HDT/hdt#hdtSize> "2080973301" .
22 | _:publicationInformation <http://purl.org/dc/terms/issued> "2013-05-09T10:45:06+0100" .
23 | 


--------------------------------------------------------------------------------