├── .codecov.yml ├── .github ├── FUNDING.yml ├── script │ └── build_libzim.cmd └── workflows │ ├── ci.yml │ └── package.yml ├── .gitignore ├── .readthedocs.yaml ├── AUTHORS ├── COPYING ├── ChangeLog ├── README.md ├── debian ├── changelog ├── control ├── copyright ├── libzim-dev.install ├── libzim9.install ├── rules └── source │ └── format ├── docs ├── .gitignore ├── 6to7.rst ├── conf.py ├── index.rst ├── meson.build ├── requirements.txt └── usage.rst ├── examples ├── createZimExample.cpp └── meson.build ├── include ├── meson.build └── zim │ ├── archive.h │ ├── blob.h │ ├── entry.h │ ├── error.h │ ├── item.h │ ├── meson.build │ ├── search.h │ ├── search_iterator.h │ ├── suggestion.h │ ├── suggestion_iterator.h │ ├── tools.h │ ├── uuid.h │ ├── version.h │ ├── writer │ ├── contentProvider.h │ ├── creator.h │ └── item.h │ └── zim.h ├── meson.build ├── meson_options.txt ├── scripts ├── download_test_data.py ├── libzim-compile-resources └── meson.build ├── src ├── _dirent.h ├── archive.cpp ├── blob.cpp ├── buffer.cpp ├── buffer.h ├── buffer_reader.cpp ├── buffer_reader.h ├── bufferstreamer.h ├── cluster.cpp ├── cluster.h ├── compression.cpp ├── compression.h ├── concurrent_cache.h ├── config.h.in ├── constants.h ├── debug.h ├── decoderstreamreader.h ├── dirent.cpp ├── dirent_accessor.cpp ├── dirent_accessor.h ├── dirent_lookup.h ├── direntreader.h ├── endian_tools.h ├── entry.cpp ├── file_compound.cpp ├── file_compound.h ├── file_part.h ├── file_reader.cpp ├── file_reader.h ├── fileheader.cpp ├── fileheader.h ├── fileimpl.cpp ├── fileimpl.h ├── fs.h ├── fs_unix.cpp ├── fs_unix.h ├── fs_windows.cpp ├── fs_windows.h ├── istreamreader.cpp ├── istreamreader.h ├── item.cpp ├── lock.h ├── log.cpp ├── log.h ├── lrucache.h ├── md5.c ├── md5.h ├── meson.build ├── namedthread.cpp ├── namedthread.h ├── narrowdown.h ├── rawstreamreader.h ├── reader.h ├── search.cpp ├── search_internal.h ├── search_iterator.cpp ├── suggestion.cpp ├── suggestion_internal.h ├── suggestion_iterator.cpp 
├── tools.cpp ├── tools.h ├── uuid.cpp ├── version.cpp ├── writer │ ├── _dirent.h │ ├── cluster.cpp │ ├── cluster.h │ ├── clusterWorker.cpp │ ├── clusterWorker.h │ ├── contentProvider.cpp │ ├── counterHandler.cpp │ ├── counterHandler.h │ ├── creator.cpp │ ├── creatordata.h │ ├── defaultIndexData.h │ ├── dirent.cpp │ ├── direntPool.h │ ├── handler.h │ ├── item.cpp │ ├── queue.h │ ├── tinyString.h │ ├── titleListingHandler.cpp │ ├── titleListingHandler.h │ ├── workers.cpp │ ├── workers.h │ ├── xapianHandler.cpp │ ├── xapianHandler.h │ ├── xapianIndexer.cpp │ ├── xapianIndexer.h │ ├── xapianWorker.cpp │ └── xapianWorker.h ├── xapian │ ├── htmlparse.cc │ ├── htmlparse.h │ ├── myhtmlparse.cc │ ├── myhtmlparse.h │ └── namedentities.h └── zim_types.h ├── static ├── meson.build ├── resources_list.txt └── stopwords │ ├── af │ ├── ar │ ├── bg │ ├── bn │ ├── br │ ├── ca │ ├── cs │ ├── da │ ├── de │ ├── el │ ├── en │ ├── eo │ ├── es │ ├── et │ ├── eu │ ├── fa │ ├── fi │ ├── fr │ ├── ga │ ├── gl │ ├── gu │ ├── ha │ ├── he │ ├── hi │ ├── hr │ ├── hu │ ├── hy │ ├── id │ ├── it │ ├── ja │ ├── ko │ ├── ku │ ├── la │ ├── lt │ ├── lv │ ├── mr │ ├── ms │ ├── nl │ ├── no │ ├── pl │ ├── pt │ ├── ro │ ├── ru │ ├── sk │ ├── sl │ ├── so │ ├── st │ ├── sv │ ├── sw │ ├── th │ ├── tl │ ├── tr │ ├── uk │ ├── ur │ ├── vi │ ├── yo │ ├── zh │ └── zu ├── subprojects ├── gtest.wrap ├── liblzma.wrap └── zstd.wrap └── test ├── archive.cpp ├── bufferstreamer.cpp ├── cluster.cpp ├── compression.cpp ├── concurrentcache.cpp ├── counterParsing.cpp ├── creator.cpp ├── decoderstreamreader.cpp ├── defaultIndexdata.cpp ├── dirent.cpp ├── dirent_lookup.cpp ├── error_in_creator.cpp ├── find.cpp ├── header.cpp ├── indexing_criteria.cpp ├── istreamreader.cpp ├── iterator.cpp ├── log.cpp ├── lrucache.cpp ├── meson.build ├── parseLongPath.cpp ├── random.cpp ├── rawstreamreader.cpp ├── reader.cpp ├── search.cpp ├── search_iterator.cpp ├── suggestion.cpp ├── suggestion_iterator.cpp ├── tinyString.cpp ├── tools.cpp 
├── tools.h ├── tooltesting.cpp └── uuid.cpp /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: yes 4 | 5 | coverage: 6 | status: 7 | project: 8 | default: 9 | threshold: 1% 10 | patch: 11 | default: 12 | target: 90% 13 | threshold: 0% 14 | 15 | ignore: 16 | - "test" 17 | - "examples" 18 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # https://kiwix.org/support-us/ 13 | -------------------------------------------------------------------------------- /.github/script/build_libzim.cmd: -------------------------------------------------------------------------------- 1 | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" 2 | 3 | set CC=cl.exe 4 | set CXX=cl.exe 5 | 6 | meson.exe setup build . 
--force-fallback-for liblzma -Ddefault_library=static -Dwith_xapian=false -Dzstd:bin_programs=false -Dzstd:bin_tests=false -Dzstd:bin_contrib=false -Dliblzma:default_library=static 7 | 8 | cd build 9 | 10 | ninja.exe 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *#* 3 | autom4te.cache 4 | build 5 | compile 6 | config.h 7 | configure 8 | depcomp 9 | .deps 10 | .dirstamp 11 | INSTALL 12 | install-sh 13 | *.kate-swp 14 | *.la 15 | .libs 16 | libtool 17 | *.lo 18 | ltmain.sh 19 | *.m4 20 | Makefile 21 | Makefile.in 22 | missing 23 | *.o 24 | stamp-h1 25 | .svn 26 | .*.swp 27 | *.zim 28 | examples/createZimExample 29 | src/tools/zimdump 30 | src/tools/zimsearch 31 | libzim.pc 32 | test-driver 33 | test/zimlib-test* 34 | test/test-suite.log 35 | .clangd 36 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Set the version of Python and other tools you might need 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | 13 | # Build documentation in the docs/ directory with Sphinx 14 | sphinx: 15 | configuration: docs/conf.py 16 | 17 | # We recommend specifying your dependencies to enable reproducible builds: 18 | # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of Libzim's significant contributors. 
2 | # 3 | # This does not necessarily list everyone who has contributed code, 4 | # especially since many employees of one corporation may be contributing. 5 | # To see the full list of contributors, see the revision history in 6 | # source control. 7 | 8 | C. Scott Ananian https://github.com/cscott 9 | Dmitry Atamanov https://github.com/data-man 10 | Emmanuel Engelhart https://github.com/kelson42 11 | Kunal Mehta https://github.com/legoktm 12 | Maneeshpm https://github.com/maneeshpm 13 | Matthieu Gautier https://github.com/mgautierfr 14 | MiguelRocha https://github.com/miguelrocha 15 | Renaud Gaudin https://github.com/rgaudin 16 | Tommi Mäkitalo https://github.com/maekitalo 17 | Veloman Yunkan https://github.com/veloman-yunkan 18 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | libzim (8.0.1) stable; urgency=medium 2 | 3 | * Update to libzim version 8.0.1 4 | 5 | -- Matthieu Gautier Wed, 07 Sep 2022 14:38:00 -0200 6 | 7 | libzim (0.0.0) unstable; urgency=medium 8 | 9 | * Initial release. 
10 | 11 | -- Kunal Mehta Tue, 02 Jun 2020 01:49:48 -0700 12 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: libzim 2 | Section: libs 3 | Priority: optional 4 | Build-Depends: debhelper-compat (= 13), 5 | liblzma-dev, 6 | libicu-dev, 7 | libxapian-dev, 8 | libzstd-dev, 9 | uuid-dev, 10 | libgtest-dev, 11 | meson, 12 | ninja-build, 13 | pkgconf 14 | Maintainer: Kiwix team 15 | Homepage: https://www.openzim.org/wiki/Libzim 16 | Standards-Version: 4.6.2 17 | Rules-Requires-Root: no 18 | 19 | Package: libzim9 20 | Architecture: any 21 | Multi-Arch: same 22 | Depends: ${misc:Depends}, 23 | ${shlibs:Depends} 24 | Pre-Depends: ${misc:Pre-Depends} 25 | Conflicts: libzim0, libzim0v5, libzim2, libzim4, libzim5, libzim6, libzim7, libzim8 26 | Replaces: libzim0, libzim0v5, libzim2, libzim4, libzim5, libzim6, libzim7, libzim8 27 | Description: library implementation of ZIM specifications 28 | ZIM (Zeno IMproved) is an open file format for storing the contents of 29 | wiki for offline usage. This file format is primarily focused on 30 | providing the contents of Wikipedia and Wikimedia projects for offline 31 | use. 32 | . 33 | libzim is the standard implementation of ZIM specification, which 34 | implements the read and write method for ZIM files. 35 | . 36 | ZIM is a file format created with focus on extracting and encoding data 37 | from MediaWiki for offline use. 38 | . 39 | Features of libzim are: 40 | * Native, coded in C++ 41 | * Extremely fast 42 | * Minimal footprint 43 | * Minimal dependencies 44 | * Portable on most OS (Windows, Linux, iOS, MacOS, Android, ...) 
45 | 46 | Package: libzim-dev 47 | Section: libdevel 48 | Architecture: any 49 | Depends: ${misc:Depends}, 50 | libzim9 (= ${binary:Version}), 51 | liblzma-dev, 52 | libxapian-dev, 53 | libicu-dev, 54 | libzstd-dev 55 | Description: library implementation of ZIM specifications (development) 56 | ZIM (Zeno IMproved) is an open file format for storing the contents of 57 | wiki for offline usage. This file format is primarily focused on 58 | providing the contents of Wikipedia and Wikimedia projects for offline 59 | use. 60 | . 61 | libzim is the standard implementation of ZIM specification, which 62 | implements the read and write method for ZIM files. 63 | . 64 | ZIM is a file format created with focus on extracting and encoding data 65 | from MediaWiki for offline use. 66 | . 67 | This package contains development files. 68 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | See COPYING in the repository root. 
2 | -------------------------------------------------------------------------------- /debian/libzim-dev.install: -------------------------------------------------------------------------------- 1 | usr/include/* 2 | usr/lib/*/libzim.so 3 | usr/lib/*/pkgconfig/* -------------------------------------------------------------------------------- /debian/libzim9.install: -------------------------------------------------------------------------------- 1 | usr/lib/*/*.so.* -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | export DEB_BUILD_MAINT_OPTIONS = hardening=+all 3 | 4 | # Skip some extremely memory-intensive tests 5 | export SKIP_BIG_MEMORY_TEST=1 6 | %: 7 | dh $@ --buildsystem=meson 8 | 9 | # Skip tests that require zim-testing-data for now 10 | override_dh_auto_configure: 11 | dh_auto_configure -- -Dtest_data_dir=none 12 | 13 | # Increase test timeout 14 | override_dh_auto_test: 15 | dh_auto_test -- -t 3 16 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (native) 2 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | api 2 | xml 3 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'libzim' 21 | copyright = '2020, libzim-team' 22 | author = 'libzim-team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | on_rtd = os.environ.get('READTHEDOCS', None) == 'True' 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 32 | extensions = [ 33 | 'breathe', 34 | 'exhale' 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 44 | 45 | 46 | if not on_rtd: 47 | html_theme = 'sphinx_rtd_theme' 48 | 49 | # Add any paths that contain custom static files (such as style sheets) here, 50 | # relative to this directory. They are copied after the builtin static files, 51 | # so a file named "default.css" will overwrite the builtin "default.css". 
52 | html_static_path = ['_static'] 53 | 54 | breathe_projects = { 55 | "libzim": "./xml" 56 | } 57 | breathe_default_project = 'libzim' 58 | 59 | exhale_args = { 60 | "containmentFolder": "./api", 61 | "rootFileName": "ref_api.rst", 62 | "rootFileTitle": "Reference API", 63 | "doxygenStripFromPath": "..", 64 | "treeViewIsBootstrap": True, 65 | "createTreeView" : True, 66 | "exhaleExecutesDoxygen": True, 67 | "exhaleDoxygenStdin": "INPUT = ../include" 68 | } 69 | 70 | primary_domain = 'cpp' 71 | 72 | highlight_language = 'cpp' 73 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. libzim documentation master file, created by 2 | sphinx-quickstart on Fri Jul 24 15:40:50 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to libzim's documentation! 7 | ================================== 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | usage 14 | 6to7 15 | api/ref_api 16 | -------------------------------------------------------------------------------- /docs/meson.build: -------------------------------------------------------------------------------- 1 | 2 | sphinx = find_program('sphinx-build', native:true) 3 | 4 | sphinx_target = run_target('doc', 5 | command: [sphinx, '-bhtml', 6 | meson.current_source_dir(), 7 | meson.current_build_dir()]) 8 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | breathe 2 | exhale 3 | sphinx 4 | -------------------------------------------------------------------------------- /examples/createZimExample.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 Tommi Maekitalo 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | class TestItem : public zim::writer::Item 29 | { 30 | std::string _id; 31 | std::string _data; 32 | 33 | public: 34 | TestItem() { } 35 | explicit TestItem(const std::string& id); 36 | virtual ~TestItem() = default; 37 | 38 | virtual std::string getPath() const; 39 | virtual std::string getTitle() const; 40 | virtual std::string getMimeType() const; 41 | 42 | virtual std::unique_ptr getContentProvider() const; 43 | }; 44 | 45 | TestItem::TestItem(const std::string& id) 46 | : _id(id) 47 | { 48 | std::ostringstream data; 49 | data << "this is item " << id << std::endl; 50 | _data = data.str(); 51 | } 52 | 53 | std::string TestItem::getPath() const 54 | { 55 | return std::string("A/") + _id; 56 | } 57 | 58 | std::string TestItem::getTitle() const 59 | { 60 | return _id; 61 | } 62 | 63 | std::string TestItem::getMimeType() const 64 | { 65 | return "text/plain"; 66 | } 67 | 68 | std::unique_ptr TestItem::getContentProvider() const 69 | { 70 | return std::unique_ptr(new zim::writer::StringProvider(_data)); 71 | } 72 | 73 | int main(int argc, char* argv[]) 74 | { 75 | unsigned max = 16; 76 | try { 77 | zim::writer::Creator c; 78 | c.configVerbose(false).configCompression(zim::Compression::Zstd); 79 | c.startZimCreation("foo.zim"); 80 | for (unsigned n = 0; n < max; ++n) 81 | { 82 | std::ostringstream id; 83 | id << (n + 1); 84 | auto article = std::make_shared(id.str()); 85 | c.addItem(article); 86 | } 87 | c.setMainPath("A/0"); 88 | c.finishZimCreation(); 89 | } 90 | catch (const std::exception& e) 91 | { 92 | std::cerr << e.what() << std::endl; 93 | } 94 | } 95 | 96 | 
-------------------------------------------------------------------------------- /examples/meson.build: -------------------------------------------------------------------------------- 1 | 2 | executable('createZimExample', 'createZimExample.cpp', 3 | link_with: libzim, 4 | include_directories: include_directory, 5 | dependencies: [thread_dep, xapian_dep, icu_dep, lzma_dep, zstd_dep, win_deps]) 6 | -------------------------------------------------------------------------------- /include/meson.build: -------------------------------------------------------------------------------- 1 | subdir('zim') 2 | 3 | include_directory = include_directories('.') 4 | -------------------------------------------------------------------------------- /include/zim/blob.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Matthieu Gautier 3 | * Copyright (C) 2009 Tommi Maekitalo 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_BLOB_H 22 | #define ZIM_BLOB_H 23 | 24 | #include "zim.h" 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace zim 32 | { 33 | /** 34 | * A blob is a pointer to data, potentially stored in an `Archive`. 35 | * 36 | * All `Blob`'s methods are threadsafe. 37 | */ 38 | class LIBZIM_API Blob 39 | { 40 | public: // types 41 | using DataPtr = std::shared_ptr; 42 | 43 | public: // functions 44 | /** 45 | * Construct an empty `Blob` 46 | */ 47 | Blob(); 48 | 49 | /** 50 | * Construct `Blob` pointing to `data`. 51 | * 52 | * The created blob only points to the data and doesn't own it. 53 | * User must care that data is not freed before using the blob. 54 | */ 55 | Blob(const char* data, size_type size); 56 | 57 | /** 58 | * Construct `Blob` pointing to `data`. 59 | * 60 | * The created blob shares the ownership on data. 
61 | */ 62 | Blob(const DataPtr& buffer, size_type size); 63 | 64 | operator std::string() const { return std::string(_data.get(), _size); } 65 | const char* data() const { return _data.get(); } 66 | const char* end() const { return _data.get() + _size; } 67 | size_type size() const { return _size; } 68 | 69 | private: 70 | DataPtr _data; 71 | size_type _size; 72 | }; 73 | 74 | inline std::ostream& operator<< (std::ostream& out, const Blob& blob) 75 | { 76 | if (blob.data()) 77 | out.write(blob.data(), blob.size()); 78 | return out; 79 | } 80 | 81 | inline bool operator== (const Blob& b1, const Blob& b2) 82 | { 83 | return b1.size() == b2.size() 84 | && std::equal(b1.data(), b1.data() + b1.size(), b2.data()); 85 | } 86 | } 87 | 88 | #endif // ZIM_BLOB_H 89 | -------------------------------------------------------------------------------- /include/zim/entry.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_ENTRY_H 21 | #define ZIM_ENTRY_H 22 | 23 | #include "zim.h" 24 | 25 | #include 26 | #include 27 | 28 | namespace zim 29 | { 30 | class Item; 31 | class Dirent; 32 | class FileImpl; 33 | 34 | /** 35 | * An entry in an `Archive`. 36 | * 37 | * All `Entry`'s methods are threadsafe. 38 | */ 39 | class LIBZIM_API Entry 40 | { 41 | public: 42 | explicit Entry(std::shared_ptr file_, entry_index_type idx_); 43 | 44 | bool isRedirect() const; 45 | std::string getTitle() const; 46 | std::string getPath() const; 47 | 48 | /** Get the item associated to the entry. 49 | * 50 | * An item is associated only if the entry is not a redirect. 51 | * For convenience, if follow is true, return the item associated to the targeted entry. 52 | * 53 | * @param follow True if the redirection is resolved before getting the item. (false by default) 54 | * @return The Item associated to the entry. 55 | * @exception InvalidType if the entry is a redirection and follow is false. 56 | */ 57 | Item getItem(bool follow=false) const; 58 | 59 | /** Get the item associated to the target entry. 60 | * 61 | * If there is a chain of redirection, the whole chain is resolved 62 | * and the item associated to the last entry is returned. 63 | * 64 | * @return the Item associated with the targeted entry. 65 | * @exception InvalidType if the entry is not a redirection. 66 | */ 67 | Item getRedirect() const; 68 | 69 | /** Get the Entry targeted by the entry. 70 | * 71 | * @return The entry directly targeted by this redirect entry. 72 | * @exception InvalidEntry if the entry is not a redirection. 73 | */ 74 | Entry getRedirectEntry() const; 75 | 76 | /** Get the index of the Entry targeted by the entry. 
77 | * 78 | * @return The index of the entry directly targeted by this redirect 79 | * entry. 80 | * @exception InvalidEntry if the entry is not a redirection. 81 | */ 82 | entry_index_type getRedirectEntryIndex() const; 83 | 84 | entry_index_type getIndex() const { return m_idx; } 85 | 86 | protected: // so that Item can be implemented as a wrapper over Entry 87 | std::shared_ptr m_file; 88 | entry_index_type m_idx; 89 | std::shared_ptr m_dirent; 90 | }; 91 | 92 | } 93 | 94 | #endif // ZIM_ENTRY_H 95 | 96 | -------------------------------------------------------------------------------- /include/zim/item.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Veloman Yunkan 3 | * Copyright (C) 2020 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_ITEM_H 22 | #define ZIM_ITEM_H 23 | 24 | #include "zim.h" 25 | #include "blob.h" 26 | #include "entry.h" 27 | #include 28 | 29 | namespace zim 30 | { 31 | /** 32 | * An `Item` in an `Archive` 33 | * 34 | * There is no public constructor - the only way to obtain an `Item` 35 | * is via `Entry::getItem()` or `Entry::getRedirect()`. 
36 | * 37 | * All `Item`'s methods are threadsafe. 38 | */ 39 | class LIBZIM_API Item : private Entry 40 | { 41 | public: // functions 42 | std::string getTitle() const { return Entry::getTitle(); } 43 | std::string getPath() const { return Entry::getPath(); } 44 | std::string getMimetype() const; 45 | 46 | /** Get the data associated to the item 47 | * 48 | * Get the data of the item, starting at offset. 49 | * 50 | * @param offset The number of bytes to skip at the beginning of the data. 51 | * @return A blob corresponding to the data. 52 | */ 53 | Blob getData(offset_type offset=0) const; 54 | 55 | /** Get the data associated to the item 56 | * 57 | * Get the `size` bytes of data of the item, starting at offset. 58 | * 59 | * @param offset The number of bytes to skip at the beginning of the data. 60 | * @param size The number of bytes to read. 61 | * @return A blob corresponding to the data. 62 | */ 63 | Blob getData(offset_type offset, size_type size) const; 64 | 65 | /** The size of the item. 66 | * 67 | * @return The size (in bytes) of the item. 68 | */ 69 | size_type getSize() const; 70 | 71 | /** Direct access information. 72 | * 73 | * Some items are stored raw in the zim file. 74 | * If possible, this function gives information about which file 75 | * and at which offset to read to get the data. 76 | * 77 | * It can be useful as an optimisation when interacting with other systems 78 | * by reopening the file and reading the content bypassing the libzim. 79 | * 80 | * @return A pair of filename/offset specifying where to read the content. 
81 | * If it is not possible to have direct access for this item, 82 | * return a pair of `{"", 0}` 83 | */ 84 | zim::ItemDataDirectAccessInfo getDirectAccessInformation() const; 85 | 86 | entry_index_type getIndex() const { return Entry::getIndex(); } 87 | 88 | #ifdef ZIM_PRIVATE 89 | cluster_index_type getClusterIndex() const; 90 | blob_index_type getBlobIndex() const; 91 | #endif 92 | 93 | private: // functions 94 | explicit Item(const Entry& entry); 95 | friend class Entry; 96 | }; 97 | 98 | } 99 | 100 | #endif // ZIM_ITEM_H 101 | 102 | -------------------------------------------------------------------------------- /include/zim/meson.build: -------------------------------------------------------------------------------- 1 | zim_config = configure_file(output : 'zim_config.h', 2 | configuration : public_conf) 3 | 4 | install_headers( 5 | 'archive.h', 6 | 'blob.h', 7 | 'error.h', 8 | 'item.h', 9 | 'entry.h', 10 | 'uuid.h', 11 | 'zim.h', 12 | 'suggestion.h', 13 | 'suggestion_iterator.h', 14 | 'tools.h', 15 | 'version.h', 16 | zim_config, 17 | subdir:'zim' 18 | ) 19 | 20 | if xapian_dep.found() 21 | install_headers( 22 | 'search.h', 23 | 'search_iterator.h', 24 | subdir:'zim' 25 | ) 26 | endif 27 | 28 | install_headers( 29 | 'writer/item.h', 30 | 'writer/creator.h', 31 | 'writer/contentProvider.h', 32 | subdir:'zim/writer' 33 | ) 34 | 35 | -------------------------------------------------------------------------------- /include/zim/tools.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2022 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_TOOLS_H 21 | #define ZIM_TOOLS_H 22 | 23 | #include "zim.h" 24 | #include 25 | namespace zim { 26 | #if defined(LIBZIM_WITH_XAPIAN) 27 | 28 | /** Helper function to set the icu data directory. 29 | * 30 | * On Android, we compile ICU without data integrated 31 | * in the library. So android application needs to set 32 | * the data directory where ICU can find its data. 33 | */ 34 | LIBZIM_API void setICUDataDirectory(const std::string& path); 35 | 36 | #endif 37 | 38 | /** 39 | * @brief Stringstream Class to use itself as the stream object 40 | * returned by << operator. (std::stringstream returns an std::ostream). 41 | * Allows a one-line stringstream to str conversion, e.g. 
use_str(Formatter() 42 | * << "foo" << variable); 43 | * 44 | */ 45 | class Formatter 46 | { 47 | public: 48 | Formatter() {} 49 | ~Formatter() {} 50 | 51 | template Formatter &operator<<(const Type &value) 52 | { 53 | stream_ << value; 54 | return *this; 55 | } 56 | 57 | /* Operator for function templates like std::endl */ 58 | Formatter &operator<<(std::ostream& (* __pf)(std::ostream&)) 59 | { 60 | stream_ << __pf; 61 | return *this; 62 | } 63 | 64 | /* Operator for working with other ostream like std::cerr */ 65 | friend std::ostream &operator<<(std::ostream &os, const Formatter &obj) 66 | { 67 | os << obj.stream_.str(); 68 | return os; 69 | } 70 | 71 | operator std::string() const { return stream_.str(); } 72 | 73 | private: 74 | /* Disable copy and assignment constructors */ 75 | Formatter(const Formatter &) = delete; 76 | Formatter &operator=(Formatter &) = delete; 77 | 78 | /* Simple composition with std::stringstream */ 79 | std::stringstream stream_; 80 | }; 81 | } 82 | 83 | #endif // ZIM_TOOLS_H 84 | -------------------------------------------------------------------------------- /include/zim/uuid.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Mannesh P M 3 | * Copyright (C) 2018 Matthieu Gautier 4 | * Copyright (C) 2009 Tommi Maekitalo 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation; either version 2 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 13 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 14 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 | * 20 | */ 21 | 22 | #ifndef ZIM_UUID_H 23 | #define ZIM_UUID_H 24 | 25 | #include "zim.h" 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace zim 33 | { 34 | struct LIBZIM_API Uuid 35 | { 36 | Uuid() 37 | { 38 | std::memset(data, 0, 16); 39 | } 40 | 41 | Uuid(const char uuid[16]) 42 | { 43 | std::copy(uuid, uuid+16, data); 44 | } 45 | 46 | static Uuid generate(std::string value = ""); 47 | 48 | bool operator== (const Uuid& other) const 49 | { return std::equal(data, data+16, other.data); } 50 | bool operator!= (const Uuid& other) const 51 | { return !(*this == other); } 52 | unsigned size() const { return 16; } 53 | 54 | explicit operator std::string() const; 55 | 56 | char data[16]; 57 | }; 58 | 59 | LIBZIM_API std::ostream& operator<< (std::ostream& out, const Uuid& uuid); 60 | 61 | } 62 | 63 | #endif // ZIM_UUID_H 64 | -------------------------------------------------------------------------------- /include/zim/version.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Emmanuel Engelhart 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_VERSION_H 21 | #define ZIM_VERSION_H 22 | 23 | #include "zim.h" 24 | #include 25 | #include 26 | 27 | namespace zim 28 | { 29 | typedef std::vector> LibVersions; 30 | LIBZIM_API LibVersions getVersions(); 31 | LIBZIM_API void printVersions(std::ostream& out = std::cout); 32 | } 33 | 34 | #endif // ZIM_VERSION_H 35 | 36 | -------------------------------------------------------------------------------- /meson_options.txt: -------------------------------------------------------------------------------- 1 | option('CLUSTER_CACHE_SIZE', type : 'integer', min: 0, max: 1000000000000, value : 536870912, 2 | description : 'set default cluster cache size in bytes (default:512MB)') 3 | option('DIRENT_CACHE_SIZE', type : 'string', value : '512', 4 | description : 'set dirent cache size to number (default:512)') 5 | option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', 6 | description : 'set dirent lookup cache size to number (default:1024)') 7 | option('LZMA_MEMORY_SIZE', type : 'string', value : '128', 8 | description : 'set lzma uncompress memory in MB (default:128)') 9 | option('USE_MMAP', type: 'boolean', value: true, 10 | description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') 11 | option('USE_BUFFER_HEADER', type: 'boolean', value: true, 12 | description: '''Copy (or use mmap) header index buffers. (default:true) 13 | Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. 
14 | If false, we directly read the index in the file at each article access.''') 15 | option('static-linkage', type : 'boolean', value : false, 16 | description : 'Link statically with the dependencies.') 17 | option('doc', type : 'boolean', value : false, 18 | description : 'Build the documentations.') 19 | option('examples', type : 'boolean', value : true, 20 | description : 'Build the examples.') 21 | option('tests', type : 'boolean', value : true, 22 | description : 'Build the tests.') 23 | option('with_xapian', type : 'boolean', value: true, 24 | description: 'Build libzim with xapian support') 25 | option('with_xapian_fuller', type: 'boolean', value: true, 26 | description: 'Create xapian archive using "FULLER" compaction.\nThis is a workaround for a compilation issue on Windows. This will be removed soon') 27 | option('test_data_dir', type : 'string', value: '', 28 | description: 'Where the test data are. If not set, meson will use a internal directory in the build dir. If you want to download the data in the specified directory you can use `meson download_test_data`. As a special value, you can pass `none` to deactivate test using external test data.') 29 | -------------------------------------------------------------------------------- /scripts/download_test_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | ''' 4 | Copyright 2021 Matthieu Gautier 5 | 6 | This program is free software; you can redistribute it and/or modify 7 | it under the terms of the GNU General Public License as published by 8 | the Free Software Foundation; either version 3 of the License, or any 9 | later version. 10 | 11 | This program is distributed in the hope that it will be useful, but 12 | WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | General Public License for more details. 
15 | 16 | You should have received a copy of the GNU General Public License 17 | along with this program; if not, write to the Free Software 18 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19 | 02110-1301, USA. 20 | ''' 21 | 22 | import argparse 23 | from pathlib import Path 24 | from urllib import request 25 | from urllib.error import * 26 | import tarfile 27 | import sys 28 | 29 | TEST_DATA_VERSION = "0.8.0" 30 | ARCHIVE_URL_TEMPL = "https://github.com/openzim/zim-testing-suite/releases/download/{version}/zim-testing-suite-{version}.tar.gz" 31 | 32 | if __name__ == "__main__": 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--version', '-v', 35 | help="The version to download.", 36 | default=TEST_DATA_VERSION) 37 | parser.add_argument('--remove-top-dir', 38 | help="Remove the top directory when extracting", 39 | action='store_true') 40 | parser.add_argument('outdir', 41 | help='The directory where to install the test data.') 42 | args = parser.parse_args() 43 | 44 | test_data_url = ARCHIVE_URL_TEMPL.format(version=args.version) 45 | 46 | try: 47 | with request.urlopen(test_data_url) as f: 48 | with tarfile.open(fileobj=f, mode="r|*") as archive: 49 | while True: 50 | member = archive.next() 51 | if member is None: 52 | break 53 | if args.remove_top_dir: 54 | member.name = '/'.join(member.name.split('/')[1:]) 55 | archive.extract(member, path=args.outdir) 56 | 57 | except HTTPError as e: 58 | print("Error downloading archive at url : {}".format(test_data_url)) 59 | print(e) 60 | sys.exit(1) 61 | except OSError as e: 62 | print("Error writing the test data on the file system.") 63 | print(e) 64 | sys.exit(1) 65 | -------------------------------------------------------------------------------- /scripts/meson.build: -------------------------------------------------------------------------------- 1 | 2 | res_compiler = find_program('libzim-compile-resources') 3 | test_data_downloader = find_program('download_test_data.py') 4 | 
-------------------------------------------------------------------------------- /src/blob.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Veloman Yunkan 3 | * Copyright (C) 2017-2020 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | 22 | #include "zim/blob.h" 23 | #include "debug.h" 24 | #include "buffer.h" 25 | 26 | namespace zim { 27 | 28 | namespace 29 | { 30 | 31 | struct NoDelete 32 | { 33 | template void operator()(T*) {} 34 | }; 35 | 36 | // This shared_ptr is used as a source object for the std::shared_ptr 37 | // aliasing constructor (with the purpose of avoiding the control block 38 | // allocation) for the case when the referred data must not be deleted. 
39 | static Blob::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); 40 | 41 | } // unnamed namespace 42 | 43 | 44 | Blob::Blob() 45 | : _data(nonOwnedDataPtr), 46 | _size(0) 47 | {} 48 | 49 | Blob::Blob(const char* data, size_type size) 50 | : _data(nonOwnedDataPtr, data), 51 | _size(size) 52 | { 53 | ASSERT(size, <, SIZE_MAX); 54 | ASSERT(data, <, (void*)(SIZE_MAX-size)); 55 | } 56 | 57 | Blob::Blob(const DataPtr& buffer, size_type size) 58 | : _data(buffer), 59 | _size(size) 60 | {} 61 | 62 | 63 | 64 | 65 | } //zim 66 | -------------------------------------------------------------------------------- /src/buffer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Veloman Yunkan 3 | * Copyright (C) 2017-2020 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #include "buffer.h" 22 | #include "debug.h" 23 | 24 | #include 25 | #include 26 | 27 | #ifndef _WIN32 28 | # include 29 | # include 30 | #endif 31 | 32 | namespace zim { 33 | 34 | namespace 35 | { 36 | 37 | struct NoDelete 38 | { 39 | template void operator()(T*) {} 40 | }; 41 | 42 | // This shared_ptr is used as a source object for the std::shared_ptr 43 | // aliasing constructor (with the purpose of avoiding the control block 44 | // allocation) for the case when the referred data must not be deleted. 45 | static Buffer::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); 46 | 47 | } // unnamed namespace 48 | 49 | const Buffer Buffer::sub_buffer(offset_t offset, zsize_t size) const 50 | { 51 | ASSERT(offset.v, <=, m_size.v); 52 | ASSERT(offset.v+size.v, <=, m_size.v); 53 | auto sub_data = DataPtr(m_data, data(offset)); 54 | return Buffer(sub_data, size); 55 | } 56 | 57 | const Buffer Buffer::makeBuffer(const DataPtr& data, zsize_t size) 58 | { 59 | return Buffer(data, size); 60 | } 61 | 62 | const Buffer Buffer::makeBuffer(const char* data, zsize_t size) 63 | { 64 | return Buffer(DataPtr(nonOwnedDataPtr, data), size); 65 | } 66 | 67 | Buffer Buffer::makeBuffer(zsize_t size) 68 | { 69 | if (0 == size.v) { 70 | return Buffer(DataPtr(nonOwnedDataPtr, nullptr), size); 71 | } 72 | return Buffer(DataPtr(new char[size.v], std::default_delete()), size); 73 | } 74 | 75 | Buffer::Buffer(const DataPtr& data, zsize_t size) 76 | : m_size(size), 77 | m_data(data) 78 | { 79 | ASSERT(m_size.v, <, SIZE_MAX); 80 | } 81 | 82 | const char* 83 | Buffer::data(offset_t offset) const { 84 | ASSERT(offset.v, <=, m_size.v); 85 | return m_data.get() + offset.v; 86 | } 87 | 88 | } //zim 89 | 
-------------------------------------------------------------------------------- /src/buffer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Veloman Yunkan 3 | * Copyright (C) 2017-2020 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_BUFFER_H_ 22 | #define ZIM_BUFFER_H_ 23 | 24 | #include 25 | 26 | #include "config.h" 27 | #include "zim_types.h" 28 | #include 29 | 30 | namespace zim { 31 | 32 | class LIBZIM_PRIVATE_API Buffer { 33 | public: // types 34 | typedef std::shared_ptr DataPtr; 35 | 36 | public: // functions 37 | static const Buffer makeBuffer(const char* data, zsize_t size); 38 | static const Buffer makeBuffer(const DataPtr& data, zsize_t size); 39 | static Buffer makeBuffer(zsize_t size); 40 | 41 | const char* data(offset_t offset=offset_t(0)) const; 42 | 43 | char at(offset_t offset) const { 44 | return *(data(offset)); 45 | } 46 | zsize_t size() const { return m_size; } 47 | const Buffer sub_buffer(offset_t offset, zsize_t size) const; 48 | operator Blob() const { return Blob(m_data, m_size.v); } 49 | 50 | private: // functions 51 | Buffer(const DataPtr& data, zsize_t size); 52 | 53 | 
private: // data 54 | zsize_t m_size; 55 | DataPtr m_data; 56 | }; 57 | 58 | } // zim namespace 59 | 60 | #endif //ZIM_BUFFER_H_ 61 | -------------------------------------------------------------------------------- /src/buffer_reader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | #include 22 | #include "buffer_reader.h" 23 | #include "buffer.h" 24 | 25 | #include 26 | #include 27 | 28 | namespace zim { 29 | 30 | const Buffer BufferReader::get_buffer(offset_t offset, zsize_t size) const 31 | { 32 | return source.sub_buffer(offset, size); 33 | } 34 | 35 | std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const 36 | { 37 | auto sub_buff = get_buffer(offset, size); 38 | std::unique_ptr sub_read(new BufferReader(sub_buff)); 39 | return sub_read; 40 | } 41 | 42 | zsize_t BufferReader::size() const 43 | { 44 | return source.size(); 45 | } 46 | 47 | size_t BufferReader::getMemorySize() const 48 | { 49 | return source.size().v; 50 | } 51 | 52 | offset_t BufferReader::offset() const 53 | { 54 | return 
offset_t((offset_type)(static_cast(source.data(offset_t(0))))); 55 | } 56 | 57 | 58 | void BufferReader::readImpl(char* dest, offset_t offset, zsize_t size) const { 59 | memcpy(dest, source.data(offset), size.v); 60 | } 61 | 62 | 63 | char BufferReader::readImpl(offset_t offset) const { 64 | char dest; 65 | dest = *source.data(offset); 66 | return dest; 67 | } 68 | 69 | 70 | } // zim 71 | -------------------------------------------------------------------------------- /src/buffer_reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_BUFFER_READER_H_ 21 | #define ZIM_BUFFER_READER_H_ 22 | 23 | #include "reader.h" 24 | 25 | namespace zim { 26 | 27 | class LIBZIM_PRIVATE_API BufferReader : public Reader { 28 | public: 29 | BufferReader(const Buffer& source) 30 | : source(source) {} 31 | virtual ~BufferReader() {}; 32 | 33 | zsize_t size() const override; 34 | size_t getMemorySize() const override; 35 | offset_t offset() const override; 36 | 37 | const Buffer get_buffer(offset_t offset, zsize_t size) const override; 38 | std::unique_ptr sub_reader(offset_t offset, zsize_t size) const override; 39 | 40 | private: // functions 41 | void readImpl(char* dest, offset_t offset, zsize_t size) const override; 42 | char readImpl(offset_t offset) const override; 43 | 44 | private: // data 45 | const Buffer source; 46 | }; 47 | 48 | }; 49 | 50 | #endif // ZIM_BUFFER_READER_H_ 51 | -------------------------------------------------------------------------------- /src/bufferstreamer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_BUFFERSTREAMER_H 21 | #define ZIM_BUFFERSTREAMER_H 22 | 23 | #include "debug.h" 24 | #include "endian_tools.h" 25 | 26 | #include 27 | 28 | namespace zim 29 | { 30 | 31 | class BufferStreamer 32 | { 33 | public: // functions 34 | BufferStreamer(const Buffer& buffer, zsize_t size) 35 | : m_buffer(buffer), 36 | m_current(buffer.data()), 37 | m_size(size) 38 | {} 39 | 40 | explicit BufferStreamer(const Buffer& buffer) 41 | : BufferStreamer(buffer, buffer.size()) 42 | {} 43 | 44 | // Reads a value of the said type from the stream 45 | // 46 | // For best portability this function should be used with types of known 47 | // bit-width (int32_t, uint16_t, etc) rather than builtin types with 48 | // unknown bit-width (int, unsigned, etc). 
49 | template T read() 50 | { 51 | const size_t N(sizeof(T)); 52 | char buf[N]; 53 | memcpy(buf, m_current, N); 54 | skip(zsize_t(N)); 55 | return fromLittleEndian(buf); // XXX: This handles only integral types 56 | } 57 | 58 | const char* current() const { 59 | return m_current; 60 | } 61 | 62 | zsize_t left() const { 63 | return m_size; 64 | } 65 | 66 | void skip(zsize_t nbBytes) { 67 | m_current += nbBytes.v; 68 | m_size -= nbBytes; 69 | } 70 | 71 | private: // data 72 | const Buffer m_buffer; 73 | const char* m_current; 74 | zsize_t m_size; 75 | }; 76 | 77 | } // namespace zim 78 | 79 | #endif // ZIM_BUFDATASTREAM_H 80 | -------------------------------------------------------------------------------- /src/config.h.in: -------------------------------------------------------------------------------- 1 | #ifndef ZIM_CONFIG_H 2 | #define ZIM_CONFIG_H 3 | 4 | #if defined(_MSC_VER) && defined(LIBZIM_EXPORT_PRIVATE_DLL) 5 | #define LIBZIM_PRIVATE_API __declspec(dllexport) 6 | #else 7 | #define LIBZIM_PRIVATE_API 8 | #endif 9 | 10 | #mesondefine VERSION 11 | 12 | #mesondefine DIRENT_CACHE_SIZE 13 | 14 | #mesondefine DIRENT_LOOKUP_CACHE_SIZE 15 | 16 | #mesondefine CLUSTER_CACHE_SIZE 17 | 18 | #mesondefine LZMA_MEMORY_SIZE 19 | 20 | #mesondefine ENABLE_XAPIAN 21 | 22 | #mesondefine ENABLE_XAPIAN_FULLER 23 | 24 | #mesondefine ENABLE_USE_MMAP 25 | 26 | #mesondefine ENABLE_USE_BUFFER_HEADER 27 | 28 | #mesondefine MMAP_SUPPORT_64 29 | 30 | #mesondefine ENV64BIT 31 | 32 | #mesondefine ENV32BIT 33 | 34 | #endif // ZIM_CONFIG_H 35 | -------------------------------------------------------------------------------- /src/constants.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Maneesh P M 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * 
License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #define ANCHOR_TERM "0posanchor " 21 | 22 | #define DEFAULT_CLUSTER_SIZE 2*1024*1024 23 | -------------------------------------------------------------------------------- /src/debug.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017-2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef DEBUG_H_ 21 | #define DEBUG_H_ 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #if defined (NDEBUG) 29 | # define ASSERT(left, operator, right) (void(0)) 30 | #else 31 | 32 | #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) 33 | #include 34 | #endif 35 | 36 | template 37 | void _on_assert_fail(const char* vara, const char* op, const char* varb, 38 | T a, U b, const char* file, int line) { 39 | zim::Formatter fmt; 40 | std::cerr << (fmt << "\nAssertion failed at " << file << ":" << line << "\n " 41 | << vara << "[" << a << "] " << op << " " << varb << "[" << b 42 | << "]") 43 | << std::endl; 44 | 45 | #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) 46 | void *callstack[64]; 47 | size_t size; 48 | size = backtrace(callstack, 64); 49 | char** strings = backtrace_symbols(callstack, size); 50 | for (size_t i=0; i 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "dirent_accessor.h" 21 | 22 | #include "direntreader.h" 23 | #include "_dirent.h" 24 | 25 | #include 26 | 27 | #include 28 | 29 | using namespace zim; 30 | 31 | DirectDirentAccessor::DirectDirentAccessor( 32 | std::shared_ptr direntReader, 33 | std::unique_ptr pathPtrReader, 34 | entry_index_t direntCount) 35 | : mp_direntReader(direntReader), 36 | mp_pathPtrReader(std::move(pathPtrReader)), 37 | m_direntCount(direntCount), 38 | m_direntCache(DIRENT_CACHE_SIZE), 39 | m_bufferDirentZone(256) 40 | {} 41 | 42 | std::shared_ptr DirectDirentAccessor::getDirent(entry_index_t idx) const 43 | { 44 | { 45 | std::lock_guard l(m_direntCacheLock); 46 | auto v = m_direntCache.get(idx.v); 47 | if (v.hit()) { 48 | return v.value(); 49 | } 50 | } 51 | 52 | auto direntOffset = getOffset(idx); 53 | auto dirent = readDirent(direntOffset); 54 | std::lock_guard l(m_direntCacheLock); 55 | m_direntCache.put(idx.v, dirent); 56 | 57 | return dirent; 58 | } 59 | 60 | offset_t DirectDirentAccessor::getOffset(entry_index_t idx) const 61 | { 62 | if (idx >= m_direntCount) { 63 | throw std::out_of_range("entry index out of range"); 64 | } 65 | offset_t offset(mp_pathPtrReader->read_uint(offset_t(sizeof(offset_type)*idx.v))); 66 | return offset; 67 | } 68 | 69 | std::shared_ptr DirectDirentAccessor::readDirent(offset_t offset) const 70 | { 71 | return mp_direntReader->readDirent(offset); 72 | } 73 | 74 | 75 | IndirectDirentAccessor::IndirectDirentAccessor(std::shared_ptr direntAccessor, std::unique_ptr indexReader, title_index_t direntCount) 76 | : mp_direntAccessor(direntAccessor), 77 | mp_indexReader(std::move(indexReader)), 78 | m_direntCount(direntCount) 79 | {} 80 | 81 | entry_index_t 
IndirectDirentAccessor::getDirectIndex(title_index_t idx) const 82 | { 83 | if (idx >= m_direntCount) { 84 | throw std::out_of_range("entry index out of range"); 85 | } 86 | entry_index_t index(mp_indexReader->read_uint(offset_t(sizeof(entry_index_t)*idx.v))); 87 | return index; 88 | } 89 | 90 | std::shared_ptr IndirectDirentAccessor::getDirent(title_index_t idx) const 91 | { 92 | auto directIndex = getDirectIndex(idx); 93 | return mp_direntAccessor->getDirent(directIndex); 94 | } 95 | -------------------------------------------------------------------------------- /src/dirent_accessor.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_DIRENT_ACCESSOR_H 21 | #define ZIM_DIRENT_ACCESSOR_H 22 | 23 | #include "zim_types.h" 24 | #include "lrucache.h" 25 | #include "config.h" 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace zim 33 | { 34 | 35 | class Dirent; 36 | class Reader; 37 | class DirentReader; 38 | 39 | /** 40 | * DirectDirentAccessor is used to access a dirent from its index. 
41 | * It doesn't provide any "advanced" features like lookup or find. 42 | * 43 | * This is the base class to locate a dirent (offset) and read it. 44 | * 45 | */ 46 | 47 | class LIBZIM_PRIVATE_API DirectDirentAccessor 48 | { 49 | public: // functions 50 | DirectDirentAccessor(std::shared_ptr direntReader, 51 | std::unique_ptr pathPtrReader, 52 | entry_index_t direntCount); 53 | 54 | offset_t getOffset(entry_index_t idx) const; 55 | std::shared_ptr getDirent(entry_index_t idx) const; 56 | entry_index_t getDirentCount() const { return m_direntCount; } 57 | 58 | size_t getMaxCacheSize() const { return m_direntCache.getMaxCost(); } 59 | size_t getCurrentCacheSize() const { return m_direntCache.cost(); } 60 | void setMaxCacheSize(size_t nbDirents) const { m_direntCache.setMaxCost(nbDirents); } 61 | 62 | private: // functions 63 | std::shared_ptr readDirent(offset_t) const; 64 | 65 | private: // data 66 | std::shared_ptr mp_direntReader; 67 | std::unique_ptr mp_pathPtrReader; 68 | entry_index_t m_direntCount; 69 | 70 | mutable lru_cache, UnitCostEstimation> m_direntCache; 71 | mutable std::mutex m_direntCacheLock; 72 | 73 | mutable std::vector m_bufferDirentZone; 74 | mutable std::mutex m_bufferDirentLock; 75 | }; 76 | 77 | class IndirectDirentAccessor 78 | { 79 | public: 80 | IndirectDirentAccessor(std::shared_ptr, std::unique_ptr indexReader, title_index_t direntCount); 81 | 82 | entry_index_t getDirectIndex(title_index_t idx) const; 83 | std::shared_ptr getDirent(title_index_t idx) const; 84 | title_index_t getDirentCount() const { return m_direntCount; } 85 | 86 | private: // data 87 | std::shared_ptr mp_direntAccessor; 88 | std::unique_ptr mp_indexReader; 89 | title_index_t m_direntCount; 90 | }; 91 | 92 | } // namespace zim 93 | 94 | #endif // ZIM_DIRENT_ACCESSOR_H 95 | -------------------------------------------------------------------------------- /src/direntreader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Copyright (C) 2020 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_DIRENTREADER_H 21 | #define ZIM_DIRENTREADER_H 22 | 23 | #include "_dirent.h" 24 | #include "reader.h" 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | namespace zim 31 | { 32 | 33 | // Unlike FileReader and MemoryReader (which read data from a file and memory, 34 | // respectively), DirentReader is a helper class that reads Dirents (rather 35 | // than from a Dirent). 
36 | class LIBZIM_PRIVATE_API DirentReader 37 | { 38 | public: // functions 39 | explicit DirentReader(std::shared_ptr zimReader) 40 | : mp_zimReader(zimReader) 41 | {} 42 | 43 | std::shared_ptr readDirent(offset_t offset); 44 | 45 | private: // functions 46 | bool initDirent(Dirent& dirent, const Buffer& direntData) const; 47 | 48 | private: // data 49 | std::shared_ptr mp_zimReader; 50 | std::vector m_buffer; 51 | std::mutex m_bufferMutex; 52 | }; 53 | 54 | } // namespace zim 55 | 56 | #endif // ZIM_DIRENTREADER_H 57 | -------------------------------------------------------------------------------- /src/endian_tools.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Matthieu Gautier 3 | * Copyright (C) 2006 Tommi Maekitalo 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ENDIAN_H 22 | #define ENDIAN_H 23 | 24 | #include 25 | 26 | #include 27 | 28 | namespace zim 29 | { 30 | // Size-dispatched serializer: the <T, N> specialization writes the N low-order bytes of d to dst, least-significant byte first. 31 | template<typename T, size_t N> 32 | struct ToLittleEndianImpl; 33 | // 2-byte values 34 | template<typename T> 35 | struct ToLittleEndianImpl<T, 2>{ 36 | static void write(const T& d, char* dst) { 37 | uint16_t v = static_cast<uint16_t>(d); 38 | dst[0] = static_cast<char>(v); 39 | dst[1] = static_cast<char>(v>>8); 40 | } 41 | }; 42 | // 4-byte values 43 | template<typename T> 44 | struct ToLittleEndianImpl<T, 4>{ 45 | static void write(const T& d, char* dst) { 46 | uint32_t v = static_cast<uint32_t>(d); 47 | dst[0] = static_cast<char>(v); 48 | dst[1] = static_cast<char>(v>>8); 49 | dst[2] = static_cast<char>(v>>16); 50 | dst[3] = static_cast<char>(v>>24); 51 | } 52 | }; 53 | // 8-byte values 54 | template<typename T> 55 | struct ToLittleEndianImpl<T, 8>{ 56 | static void write(const T& d, char* dst) { 57 | uint64_t v = static_cast<uint64_t>(d); 58 | dst[0] = static_cast<char>(v); 59 | dst[1] = static_cast<char>(v>>8); 60 | dst[2] = static_cast<char>(v>>16); 61 | dst[3] = static_cast<char>(v>>24); 62 | dst[4] = static_cast<char>(v>>32); 63 | dst[5] = static_cast<char>(v>>40); 64 | dst[6] = static_cast<char>(v>>48); 65 | dst[7] = static_cast<char>(v>>56); 66 | } 67 | }; 68 | // Writes d to dst in little-endian byte order; dst must have room for sizeof(T) bytes. 69 | //////////////////////////////////////////////////////////////////////// 70 | template<typename T> 71 | inline void toLittleEndian(T d, char* dst) 72 | { 73 | ToLittleEndianImpl<T, sizeof(T)>::write(d, dst); 74 | } 75 | // Reads sizeof(T) bytes starting at ptr as a little-endian integer of type T. 76 | template<typename T> 77 | inline T fromLittleEndian(const char* ptr) 78 | { 79 | T ret = 0; 80 | for(size_t i=0; i<sizeof(T); i++) { 81 | ret |= (static_cast<T>(static_cast<unsigned char>(ptr[i])) << (i*8)); // via unsigned char so bytes >= 0x80 are not sign-extended 82 | } 83 | return ret; 84 | } 85 | 86 | } 87 | 88 | #endif // ENDIAN_H 89 | 90 | -------------------------------------------------------------------------------- /src/entry.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Renaud Gaudin 3 | * Copyright (C) 2020 Matthieu 
Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include "fileimpl.h" 26 | #include "log.h" 27 | 28 | log_define("zim.entry") 29 | 30 | using namespace zim; 31 | 32 | Entry::Entry(std::shared_ptr file, entry_index_type idx) 33 | : m_file(file), 34 | m_idx(idx), 35 | m_dirent(file->getDirent(entry_index_t(idx))) 36 | {} 37 | 38 | std::string Entry::getTitle() const 39 | { 40 | return m_dirent->getTitle(); 41 | } 42 | 43 | std::string Entry::getPath() const 44 | { 45 | if (m_file->hasNewNamespaceScheme()) { 46 | return m_dirent->getPath(); 47 | } else { 48 | return m_dirent->getLongPath(); 49 | } 50 | } 51 | 52 | bool Entry::isRedirect() const 53 | { 54 | return m_dirent->isRedirect(); 55 | } 56 | 57 | Item Entry::getItem(bool follow) const 58 | { 59 | if (isRedirect()) 60 | { 61 | if (!follow) 62 | throw InvalidType(Formatter() 63 | << "Entry " << getPath() << " is a redirect entry."); 64 | return getRedirect(); 65 | } 66 | 67 | return Item(*this); 68 | } 69 | 70 | Item Entry::getRedirect() const { 71 | auto nextEntry = getRedirectEntry(); 72 | auto watchdog = 50U; 73 | while (nextEntry.isRedirect() && --watchdog) { 74 | nextEntry = 
nextEntry.getRedirectEntry(); 75 | } 76 | return nextEntry.getItem(false); 77 | } 78 | 79 | entry_index_type Entry::getRedirectEntryIndex() const { 80 | if (!isRedirect()) 81 | throw InvalidType(Formatter() 82 | << "Entry " << getPath() << " is not a redirect entry."); 83 | 84 | return m_dirent->getRedirectIndex().v; 85 | } 86 | 87 | Entry Entry::getRedirectEntry() const { 88 | return Entry(m_file, getRedirectEntryIndex()); 89 | } 90 | -------------------------------------------------------------------------------- /src/file_part.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020-2021 Veloman Yunkan 3 | * Copyright (C) 2017-2021 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_FILE_PART_H_ 22 | #define ZIM_FILE_PART_H_ 23 | 24 | #include 25 | #include 26 | 27 | #include 28 | 29 | #include "zim_types.h" 30 | #include "fs.h" 31 | 32 | namespace zim { 33 | 34 | /** A part of file. 35 | * 36 | * `FilePart` references a part(section) of a physical file. 
37 | * Most of the time, `FilePart` will reference the whole file (m_offset==0 and m_size==m_fhandle->getSize()), 38 | * but in some situations it can reference only a part of the file. 39 | * This happens on Android, where the zim file is split into several parts that are stored in a "resource" (zip) archive 40 | * without compression. 41 | */ 42 | class FilePart { 43 | typedef DEFAULTFS FS; 44 | 45 | public: 46 | using FDSharedPtr = std::shared_ptr; 47 | 48 | public: 49 | explicit FilePart(const std::string& filename) : // opens `filename`; the part covers the whole file 50 | m_filename(filename), 51 | m_fhandle(std::make_shared(FS::openFile(filename))), 52 | m_offset(0), 53 | m_size(m_fhandle->getSize()) {} 54 | 55 | #ifndef _WIN32 56 | explicit FilePart(int fd) : // re-opens the file through the path resolved from fd 57 | FilePart(getFilePathFromFD(fd)) {} 58 | 59 | explicit FilePart(FdInput fdInput): // same, but limited to the region given by fdInput.offset / fdInput.size 60 | m_filename(getFilePathFromFD(fdInput.fd)), 61 | m_fhandle(std::make_shared(FS::openFile(m_filename))), 62 | m_offset(fdInput.offset), 63 | m_size(fdInput.size) {} 64 | #endif 65 | 66 | ~FilePart() = default; 67 | const std::string& filename() const { return m_filename; }; 68 | const FS::FD& fhandle() const { return *m_fhandle; }; 69 | const FDSharedPtr& shareable_fhandle() const { return m_fhandle; }; 70 | 71 | zsize_t size() const { return m_size; }; 72 | offset_t offset() const { return m_offset; } 73 | bool fail() const { return !m_size; }; // i.e. !good() 74 | bool good() const { return bool(m_size); }; // true when m_size is non-zero 75 | 76 | private: 77 | const std::string m_filename; 78 | FDSharedPtr m_fhandle; 79 | offset_t m_offset; 80 | zsize_t m_size; // Size of the part (which starts at m_offset) 81 | }; 82 | 83 | }; 84 | 85 | #endif //ZIM_FILE_PART_H_ 86 | -------------------------------------------------------------------------------- /src/fs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the 
GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_FS_H_ 21 | #define ZIM_FS_H_ 22 | 23 | #ifdef _WIN32 24 | # include "fs_windows.h" 25 | #else 26 | # include "fs_unix.h" 27 | #endif 28 | 29 | namespace zim { 30 | 31 | #ifdef _WIN32 32 | using DEFAULTFS = windows::FS; 33 | #else 34 | using DEFAULTFS = unix::FS; 35 | #endif 36 | }; 37 | 38 | #endif //ZIM_FS_H_ 39 | -------------------------------------------------------------------------------- /src/fs_unix.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_FS_UNIX_H_ 21 | #define ZIM_FS_UNIX_H_ 22 | 23 | #include "zim_types.h" 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace zim { 32 | 33 | namespace unix { 34 | 35 | using path_t = const std::string&; 36 | // Move-only owner of a file descriptor; m_fd == -1 means that no descriptor is held. 37 | class FD { 38 | public: 39 | using fd_t = int; 40 | 41 | private: 42 | fd_t m_fd = -1; 43 | 44 | public: 45 | FD() = default; 46 | FD(fd_t fd): 47 | m_fd(fd) {}; 48 | FD(const FD& o) = delete; 49 | FD(FD&& o) : 50 | m_fd(o.m_fd) { o.m_fd = -1; } 51 | FD& operator=(FD&& o) { 52 | // Close the descriptor held so far (it used to be leaked); self-assignment is a no-op. 53 | if (this != &o) { close(); m_fd = o.m_fd; o.m_fd = -1; } 54 | return *this; 55 | } 56 | ~FD() { close(); } 57 | zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; 58 | zsize_t getSize() const; 59 | fd_t getNativeHandle() const 60 | { 61 | return m_fd; 62 | } 63 | fd_t release() // returns the descriptor and resets this object to the empty (-1) state 64 | { 65 | int ret = m_fd; 66 | m_fd = -1; 67 | return ret; 68 | } 69 | bool seek(offset_t offset); 70 | bool close(); 71 | }; 72 | 73 | struct FS { 74 | using FD = zim::unix::FD; 75 | static std::string join(path_t base, path_t name); 76 | static FD openFile(path_t filepath); 77 | static bool makeDirectory(path_t path); 78 | static void rename(path_t old_path, path_t new_path); 79 | static bool remove(path_t path); 80 | static bool removeDir(path_t path); 81 | static bool removeFile(path_t path); 82 | }; 83 | 84 | }; // unix namespace 85 | 86 | std::string getFilePathFromFD(int fd); 87 | 88 | }; // zim namespace 89 | 90 | #endif //ZIM_FS_UNIX_H_ 91 | -------------------------------------------------------------------------------- /src/fs_windows.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Matthieu Gautier 3 | * 4 | * This program is free 
software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_FS_WINDOWS_H_ 21 | #define ZIM_FS_WINDOWS_H_ 22 | 23 | #include "zim_types.h" 24 | #include "config.h" 25 | 26 | #include 27 | 28 | typedef void* HANDLE; 29 | 30 | namespace zim { 31 | 32 | namespace windows { 33 | 34 | using path_t = const std::string&; 35 | 36 | struct ImplFD; 37 | 38 | class LIBZIM_PRIVATE_API FD { 39 | public: 40 | typedef HANDLE fd_t; 41 | private: 42 | std::unique_ptr mp_impl; 43 | 44 | public: 45 | FD(); 46 | FD(fd_t handle); 47 | FD(const FD& o) = delete; 48 | FD(FD&& o); 49 | FD& operator=(FD&& o); 50 | FD& operator=(const FD& o) = delete; 51 | ~FD(); 52 | zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; 53 | zsize_t getSize() const; 54 | int release(); 55 | bool seek(offset_t offset); 56 | bool close(); 57 | }; 58 | 59 | struct LIBZIM_PRIVATE_API FS { 60 | using FD = zim::windows::FD; 61 | static std::string join(path_t base, path_t name); 62 | static std::unique_ptr toWideChar(path_t path); 63 | static FD openFile(path_t filepath); 64 | static bool makeDirectory(path_t path); 65 | static void rename(path_t old_path, path_t new_path); 66 | static bool remove(path_t path); 67 | static bool removeDir(path_t path); 68 | static bool 
removeFile(path_t path); 69 | }; 70 | 71 | }; // windows namespace 72 | 73 | }; // zim namespace 74 | 75 | #endif //ZIM_FS_WINDOWS_H_ 76 | -------------------------------------------------------------------------------- /src/istreamreader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * Copyright (C) 2020 Veloman Yunkan 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #include "istreamreader.h" 22 | #include "buffer_reader.h" 23 | 24 | namespace zim 25 | { 26 | 27 | //////////////////////////////////////////////////////////////////////////////// 28 | // IDataStream 29 | //////////////////////////////////////////////////////////////////////////////// 30 | 31 | std::unique_ptr 32 | IStreamReader::sub_reader(zsize_t size) 33 | { 34 | auto buffer = Buffer::makeBuffer(size); 35 | readImpl(const_cast(buffer.data()), size); 36 | return std::unique_ptr(new BufferReader(buffer)); 37 | } 38 | 39 | } // namespace zim 40 | -------------------------------------------------------------------------------- /src/istreamreader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Copyright (C) 2020 Matthieu Gautier 3 | * Copyright (C) 2020 Veloman Yunkan 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_IDATASTREAM_H 22 | #define ZIM_IDATASTREAM_H 23 | 24 | #include "endian_tools.h" 25 | #include "reader.h" 26 | 27 | #include 28 | 29 | namespace zim 30 | { 31 | 32 | // IStreamReader is a simple interface for sequential iteration over a stream 33 | // of values of built-in/primitive types and/or opaque binary objects (blobs). 34 | // An example usage: 35 | // 36 | // void foo(IStreamReader& s) 37 | // { 38 | // const uint32_t n = s.read(); 39 | // for(uint32_t i=0; i < n; ++i) 40 | // { 41 | // const uint16_t blobSize = s.read(); 42 | // IStreamReader::Blob blob = s.readBlob(blobSize); 43 | // bar(blob, blobSize); 44 | // } 45 | // } 46 | // 47 | class LIBZIM_PRIVATE_API IStreamReader 48 | { 49 | public: // functions 50 | virtual ~IStreamReader() = default; 51 | 52 | // Reads a value of the said type from the stream 53 | // 54 | // For best portability this function should be used with types of known 55 | // bit-width (int32_t, uint16_t, etc) rather than builtin types with 56 | // unknown bit-width (int, unsigned, etc). 
57 | template T read(); 58 | 59 | // Reads a blob of the specified size from the stream 60 | virtual std::unique_ptr sub_reader(zsize_t size); 61 | 62 | // Get the total memory consumption by the reader object 63 | virtual size_t getMemorySize() const = 0; 64 | 65 | private: // virtual methods 66 | // Reads exactly 'nbytes' bytes into the provided buffer 'buf' 67 | // (which must be at least that big). Throws an exception if 68 | // more bytes are requested than can be retrieved. 69 | virtual void readImpl(char* buf, zsize_t nbytes) = 0; 70 | }; 71 | 72 | //////////////////////////////////////////////////////////////////////////////// 73 | // Implementation of IStreamReader 74 | //////////////////////////////////////////////////////////////////////////////// 75 | 76 | // XXX: Assuming that opaque binary data retrieved via 'readImpl()' 77 | // XXX: is encoded in little-endian form. 78 | template 79 | inline T 80 | IStreamReader::read() 81 | { 82 | constexpr size_type N(sizeof(T)); 83 | char buf[N]; 84 | readImpl(buf, zsize_t(N)); 85 | return fromLittleEndian(buf); // XXX: This handles only integral types 86 | } 87 | 88 | } // namespace zim 89 | 90 | #endif // ZIM_IDATASTREAM_H 91 | -------------------------------------------------------------------------------- /src/item.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Veloman Yunkan 3 | * Copyright (C) 2020 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. 
See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #define ZIM_PRIVATE 22 | #include 23 | #include "cluster.h" 24 | #include "zim/zim.h" 25 | #include "fileimpl.h" 26 | #include "log.h" 27 | 28 | #include 29 | 30 | log_define("zim.item") 31 | 32 | using namespace zim; 33 | 34 | Item::Item(const Entry& entry) 35 | : Entry(entry) 36 | { 37 | assert(!entry.isRedirect()); 38 | } 39 | 40 | std::string Item::getMimetype() const 41 | { 42 | return m_file->getMimeType(m_dirent->getMimeType()); 43 | } 44 | 45 | Blob Item::getData(offset_type offset) const 46 | { 47 | return m_file->getBlob(*m_dirent, offset_t(offset)); 48 | } 49 | 50 | Blob Item::getData(offset_type offset, size_type size) const 51 | { 52 | return m_file->getBlob(*m_dirent, offset_t(offset), zsize_t(size)); 53 | } 54 | 55 | size_type Item::getSize() const 56 | { 57 | auto cluster = m_file->getCluster(m_dirent->getClusterNumber()); 58 | return size_type(cluster->getBlobSize(m_dirent->getBlobNumber())); 59 | } 60 | 61 | ItemDataDirectAccessInfo Item::getDirectAccessInformation() const 62 | { 63 | return m_file->getDirectAccessInformation(m_dirent->getClusterNumber(), m_dirent->getBlobNumber()); 64 | } 65 | 66 | cluster_index_type Item::getClusterIndex() const 67 | { 68 | return m_dirent->getClusterNumber().v; 69 | } 70 | 71 | blob_index_type Item::getBlobIndex() const 72 | { 73 | return m_dirent->getBlobNumber().v; 74 | } 75 | -------------------------------------------------------------------------------- /src/lock.h: -------------------------------------------------------------------------------- 1 | #ifndef ZIM_LOCK_H 2 | #define ZIM_LOCK_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | class MultiMutex { 10 | public: 11 | explicit 
MultiMutex() 12 | : m_mutexes() 13 | {} 14 | explicit MultiMutex(const std::vector& mutexes) 15 | : m_mutexes(mutexes) { 16 | // By sorting the mutex, we avoid the simple case when 3 concurrent multi lock: 17 | // - (A, B) 18 | // - (B, C) 19 | // - (C, A) 20 | // As we sort, we will have : 21 | // - (A, B) 22 | // - (B, C) 23 | // - (A, C) 24 | // 25 | // And no deadlock can occurs. 26 | std::sort(m_mutexes.begin(), m_mutexes.end()); 27 | } 28 | 29 | void lock() { 30 | auto lockedCount = 0; 31 | for (auto mutex:m_mutexes) { 32 | try { 33 | mutex->lock(); 34 | lockedCount += 1; 35 | } catch(const std::system_error& e) { 36 | unwindLock(lockedCount); 37 | throw; 38 | } 39 | } 40 | } 41 | 42 | void unlock() { 43 | unwindLock(m_mutexes.size()); 44 | } 45 | 46 | 47 | private: 48 | std::vector m_mutexes; 49 | 50 | void unwindLock(size_t lockedCount) { 51 | while (lockedCount) { 52 | m_mutexes[--lockedCount]->unlock(); 53 | } 54 | } 55 | }; 56 | 57 | #endif // ZIM_LOCK_H 58 | -------------------------------------------------------------------------------- /src/meson.build: -------------------------------------------------------------------------------- 1 | 2 | configure_file(output : 'config.h', 3 | configuration : private_conf, 4 | input : 'config.h.in') 5 | 6 | src_directory = include_directories('.') 7 | 8 | common_sources = [ 9 | # 'config.h', 10 | 'archive.cpp', 11 | 'cluster.cpp', 12 | 'buffer_reader.cpp', 13 | 'dirent.cpp', 14 | 'dirent_accessor.cpp', 15 | 'entry.cpp', 16 | 'fileheader.cpp', 17 | 'fileimpl.cpp', 18 | 'file_compound.cpp', 19 | 'file_reader.cpp', 20 | 'item.cpp', 21 | 'blob.cpp', 22 | 'buffer.cpp', 23 | 'md5.c', 24 | 'uuid.cpp', 25 | 'tools.cpp', 26 | 'compression.cpp', 27 | 'istreamreader.cpp', 28 | 'namedthread.cpp', 29 | 'log.cpp', 30 | 'writer/contentProvider.cpp', 31 | 'writer/creator.cpp', 32 | 'writer/item.cpp', 33 | 'writer/cluster.cpp', 34 | 'writer/dirent.cpp', 35 | 'writer/workers.cpp', 36 | 'writer/clusterWorker.cpp', 37 | 
'writer/titleListingHandler.cpp', 38 | 'writer/counterHandler.cpp', 39 | 'suggestion.cpp', 40 | 'suggestion_iterator.cpp', 41 | 'version.cpp' 42 | ] 43 | 44 | if host_machine.system() == 'windows' 45 | common_sources += 'fs_windows.cpp' 46 | else 47 | common_sources += 'fs_unix.cpp' 48 | endif 49 | 50 | xapian_sources = [ 51 | 'search.cpp', 52 | 'search_iterator.cpp', 53 | 'xapian/htmlparse.cc', 54 | 'xapian/myhtmlparse.cc', 55 | 'writer/xapianIndexer.cpp', 56 | 'writer/xapianWorker.cpp', 57 | 'writer/xapianHandler.cpp' 58 | ] 59 | 60 | sources = common_sources 61 | deps = [thread_dep, lzma_dep, zstd_dep, win_deps] 62 | 63 | if target_machine.system() == 'freebsd' 64 | deps += [execinfo_dep] 65 | endif 66 | 67 | if xapian_dep.found() 68 | sources += xapian_sources 69 | sources += lib_resources 70 | deps += [xapian_dep, icu_dep] 71 | endif 72 | 73 | libzim = library('zim', 74 | sources, 75 | include_directories : inc, 76 | dependencies : deps, 77 | version: meson.project_version(), 78 | install : true) 79 | libzim_dep = declare_dependency(link_with: libzim, 80 | include_directories: include_directory) 81 | -------------------------------------------------------------------------------- /src/namedthread.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2025 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "namedthread.h" 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | namespace zim 29 | { 30 | 31 | namespace 32 | { 33 | 34 | std::mutex mutex_; 35 | size_t threadCounter_ = 0; 36 | std::vector namedThreads_; 37 | std::map threadId2NameMap_; 38 | 39 | } // unnamed namespace 40 | 41 | 42 | std::mutex& NamedThread::getMutex() 43 | { 44 | return mutex_; 45 | } 46 | 47 | NamedThread::NamedThread(const std::string& name) 48 | : name_(name) 49 | { 50 | std::lock_guard lock(mutex_); 51 | namedThreads_.push_back(this); 52 | } 53 | 54 | NamedThread::~NamedThread() 55 | { 56 | join(); 57 | 58 | std::lock_guard lock(mutex_); 59 | const auto it = std::find(namedThreads_.begin(), namedThreads_.end(), this); 60 | namedThreads_.erase(it); 61 | } 62 | 63 | void NamedThread::join() 64 | { 65 | if ( thread_.joinable() ) { 66 | const auto threadId = thread_.get_id(); 67 | thread_.join(); 68 | std::lock_guard lock(mutex_); 69 | threadId2NameMap_.erase(threadId); 70 | } 71 | } 72 | 73 | std::string NamedThread::getCurrentThreadName() 74 | { 75 | std::lock_guard lock(mutex_); 76 | 77 | const auto curThreadId = std::this_thread::get_id(); 78 | const auto it = threadId2NameMap_.find(curThreadId); 79 | if ( it != threadId2NameMap_.end() ) 80 | return it->second; 81 | 82 | for (const auto nt : namedThreads_) { 83 | if ( nt->thread_.get_id() == curThreadId ) { 84 | threadId2NameMap_[curThreadId] = nt->name_; 85 | return nt->name_; 86 | } 87 | } 88 | 89 | std::ostringstream newEntryName; 90 | newEntryName << "thread#" << threadCounter_++; 91 | threadId2NameMap_[curThreadId] = newEntryName.str(); 92 | return newEntryName.str(); 93 | } 94 | 95 | } // namespace zim 96 | 
-------------------------------------------------------------------------------- /src/namedthread.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2025 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_NAMEDTHREAD_H 21 | #define OPENZIM_LIBZIM_NAMEDTHREAD_H 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #include "config.h" 28 | 29 | namespace zim 30 | { 31 | 32 | class LIBZIM_PRIVATE_API NamedThread 33 | { 34 | private: 35 | explicit NamedThread(const std::string& name); 36 | 37 | public: 38 | template 39 | NamedThread(const std::string& name, F&& f) 40 | : NamedThread(name) 41 | { 42 | // Ensure that f starts executing after the assignment to 43 | // the thread_ data member has completed (so that any possible 44 | // calls to NamedThread::getCurrentThreadName() from inside f() 45 | // read the correct value of thread id). 
46 | std::mutex& mutex = getMutex(); 47 | std::lock_guard lock(mutex); 48 | 49 | thread_ = std::thread([f, &mutex]() { mutex.lock(); mutex.unlock(); f(); }); 50 | } 51 | 52 | ~NamedThread(); 53 | 54 | NamedThread(const NamedThread& ) = delete; 55 | void operator=(const NamedThread& ) = delete; 56 | 57 | void join(); 58 | 59 | static std::string getCurrentThreadName(); 60 | 61 | private: // functions 62 | // This is a workaround for a bug in our build system that prevents 63 | // LIBZIM_PRIVATE_API and/or LIBZIM_API classes from having static data 64 | // members 65 | static std::mutex& getMutex(); 66 | 67 | private: // data 68 | const std::string name_; 69 | std::thread thread_; 70 | }; 71 | 72 | } // namespace zim 73 | 74 | #endif // OPENZIM_LIBZIM_NAMEDTHREAD_H 75 | -------------------------------------------------------------------------------- /src/rawstreamreader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * Copyright (C) 2020 Veloman Yunkan 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_RAWSTREAMREADER_H 22 | #define ZIM_RAWSTREAMREADER_H 23 | 24 | #include "istreamreader.h" 25 | #include "reader.h" 26 | 27 | namespace zim 28 | { 29 | 30 | class RawStreamReader : public IStreamReader 31 | { 32 | public: // functions 33 | explicit RawStreamReader(std::shared_ptr reader) 34 | : m_reader(reader), 35 | m_readerPos(0) 36 | {} 37 | 38 | size_t getMemorySize() const override { 39 | return m_reader->getMemorySize(); 40 | } 41 | 42 | void readImpl(char* buf, zsize_t nbytes) override 43 | { 44 | m_reader->read(buf, m_readerPos, zsize_t(nbytes)); 45 | m_readerPos += nbytes; 46 | } 47 | 48 | std::unique_ptr sub_reader(zsize_t nbytes) override 49 | { 50 | auto reader = m_reader->sub_reader(m_readerPos, nbytes); 51 | m_readerPos += nbytes; 52 | return reader; 53 | } 54 | 55 | 56 | private: // data 57 | std::shared_ptr m_reader; 58 | offset_t m_readerPos; 59 | }; 60 | 61 | } // namespace zim 62 | 63 | #endif // ZIM_RAWSTREAMREADER_H 64 | -------------------------------------------------------------------------------- /src/reader.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2017-2020 Matthieu Gautier 3 | * Copyright (C) 2020 Veloman Yunkan 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #ifndef ZIM_READER_H_ 22 | #define ZIM_READER_H_ 23 | 24 | #include 25 | #include 26 | 27 | #include "zim_types.h" 28 | #include "endian_tools.h" 29 | #include "debug.h" 30 | 31 | #include "buffer.h" 32 | 33 | namespace zim { 34 | 35 | class LIBZIM_PRIVATE_API Reader { 36 | public: 37 | Reader() {}; 38 | 39 | // Returns the full size of data accessible via this reader object 40 | virtual zsize_t size() const = 0; 41 | 42 | // Returns the memory consumption by this reader object 43 | virtual size_t getMemorySize() const = 0; 44 | 45 | virtual ~Reader() {}; 46 | 47 | void read(char* dest, offset_t offset, zsize_t size) const { 48 | if (can_read(offset, size)) { 49 | if (size) { 50 | // Do the actuall read only if we have a size to read 51 | readImpl(dest, offset, size); 52 | } 53 | return; 54 | } 55 | throw std::runtime_error("Cannot read after the end of the reader"); 56 | } 57 | 58 | template 59 | T read_uint(offset_t offset) const { 60 | ASSERT(offset.v, <, size().v); 61 | ASSERT(offset.v+sizeof(T), <=, size().v); 62 | char tmp_buf[sizeof(T)]; 63 | read(tmp_buf, offset, zsize_t(sizeof(T))); 64 | return fromLittleEndian(tmp_buf); 65 | } 66 | 67 | char read(offset_t offset) const { 68 | if (can_read(offset, zsize_t(1))) { 69 | return readImpl(offset); 70 | } 71 | throw std::runtime_error("Cannot read after the end of the reader"); 72 | } 73 | 74 | virtual const Buffer get_buffer(offset_t offset, zsize_t size) const = 
0; 75 | const Buffer get_buffer(offset_t offset) const { 76 | return get_buffer(offset, zsize_t(size().v-offset.v)); 77 | } 78 | virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; 79 | std::unique_ptr sub_reader(offset_t offset) const { 80 | return sub_reader(offset, zsize_t(size().v-offset.v)); 81 | } 82 | virtual offset_t offset() const = 0; 83 | 84 | bool can_read(offset_t offset, zsize_t size) const; 85 | 86 | private: 87 | // Implementation of the read method. 88 | // Check of the validity of the offset/size has already been done. 89 | virtual void readImpl(char* dest, offset_t offset, zsize_t size) const = 0; 90 | 91 | // Implementation of the read method. 92 | // Check of the validity of the offset has already been done. 93 | virtual char readImpl(offset_t offset) const = 0; 94 | }; 95 | 96 | }; 97 | 98 | #endif // ZIM_READER_H_ 99 | -------------------------------------------------------------------------------- /src/tools.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016-2020 Matthieu Gautier 3 | * Copyright (C) 2021 Maneesh P M 4 | * Copyright (C) 2013-2016 Emmanuel Engelhart 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 3 of the License, or 9 | * any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301, USA. 
20 | */ 21 | 22 | #ifndef OPENZIM_LIBZIM_TOOLS_H 23 | #define OPENZIM_LIBZIM_TOOLS_H 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include "config.h" 30 | 31 | #include 32 | #include "zim/writer/item.h" 33 | 34 | #if defined(ENABLE_XAPIAN) 35 | namespace Xapian { 36 | class Database; 37 | } 38 | #endif // ENABLE_XAPIAN 39 | namespace zim { 40 | bool isCompressibleMimetype(const std::string& mimetype); 41 | uint32_t LIBZIM_PRIVATE_API countWords(const std::string& text); 42 | void LIBZIM_PRIVATE_API microsleep(int microseconds); 43 | 44 | std::tuple LIBZIM_PRIVATE_API parseLongPath(const std::string& longPath); 45 | 46 | // Parse a illustration path ("Illustration_x@1") to a size. 47 | unsigned int LIBZIM_PRIVATE_API parseIllustrationPathToSize(const std::string& s); 48 | 49 | /** Return a random number from range [0, max] 50 | * 51 | * This function is threadsafe 52 | **/ 53 | uint32_t LIBZIM_PRIVATE_API randomNumber(uint32_t max); 54 | 55 | std::vector split(const std::string & str, 56 | const std::string & delims=" *-"); 57 | 58 | std::map read_valuesmap(const std::string& s); 59 | 60 | using MimeCounterType = std::map; 61 | MimeCounterType LIBZIM_PRIVATE_API parseMimetypeCounter(const std::string& counterData); 62 | 63 | template 64 | entry_index_type countMimeType(const std::string& counterData, Filter filter) { 65 | entry_index_type count = 0; 66 | for (auto& pair: parseMimetypeCounter(counterData)) { 67 | if (filter(pair.first)) { 68 | count += pair.second; 69 | } 70 | } 71 | return count; 72 | } 73 | 74 | namespace writer { 75 | class Dirent; 76 | bool isFrontArticle(const Dirent* dirent, const Hints& hints); 77 | } 78 | 79 | // Xapian based tools 80 | #if defined(ENABLE_XAPIAN) 81 | std::string LIBZIM_PRIVATE_API removeAccents(const std::string& text); 82 | bool getDbFromAccessInfo(zim::ItemDataDirectAccessInfo accessInfo, Xapian::Database& database); 83 | #endif 84 | } 85 | 86 | #endif // OPENZIM_LIBZIM_TOOLS_H 87 | 
-------------------------------------------------------------------------------- /src/uuid.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Maneesh P M 3 | * Copyright (C) 2018-2020 Matthieu Gautier 4 | * Copyright (C) 2009 Tommi Maekitalo 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation; either version 2 of the 9 | * License, or (at your option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but 12 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 13 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 14 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 19 | * 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include // necessary to have the new types 27 | #include "log.h" 28 | #include "md5.h" 29 | 30 | #ifdef _WIN32 31 | 32 | # include 33 | # include 34 | int gettimeofday(struct timeval* tp, void* tzp) { 35 | DWORD t; 36 | t = timeGetTime(); 37 | tp->tv_sec = t / 1000; 38 | tp->tv_usec = t % 1000; 39 | return 0; 40 | } 41 | 42 | #define getpid GetCurrentProcessId 43 | 44 | #else 45 | # include 46 | #endif 47 | 48 | log_define("zim.uuid") 49 | 50 | namespace zim 51 | { 52 | namespace 53 | { 54 | char hex[] = "0123456789abcdef"; 55 | inline char hi(char v) 56 | { return hex[(v >> 4) & 0xf]; } 57 | 58 | inline char lo(char v) 59 | { return hex[v & 0xf]; } 60 | } 61 | 62 | Uuid Uuid::generate(std::string value) 63 | { 64 | Uuid ret; 65 | struct zim_MD5_CTX md5ctx; 66 | zim_MD5Init(&md5ctx); 67 | 68 | 
if ( value.empty() ) { 69 | struct timeval tv; 70 | gettimeofday(&tv, 0); 71 | 72 | clock_t c = clock(); 73 | 74 | zim_MD5Update(&md5ctx, reinterpret_cast(&c), sizeof(clock_t)); 75 | zim_MD5Update(&md5ctx, reinterpret_cast(&tv), sizeof(struct timeval)); 76 | } else { 77 | zim_MD5Update(&md5ctx, reinterpret_cast(value.data()), value.size()); 78 | } 79 | zim_MD5Final(reinterpret_cast(&ret.data[0]), &md5ctx); 80 | 81 | log_debug("generated uuid: " << ret.data); 82 | 83 | return ret; 84 | } 85 | 86 | Uuid::operator std::string() const 87 | { 88 | std::ostringstream out; 89 | zim::operator<<(out, *this); 90 | return out.str(); 91 | } 92 | 93 | std::ostream& operator<< (std::ostream& out, const Uuid& uuid) 94 | { 95 | for (unsigned n = 0; n < 4; ++n) 96 | out << hi(uuid.data[n]) << lo(uuid.data[n]); 97 | out << '-'; 98 | for (unsigned n = 4; n < 6; ++n) 99 | out << hi(uuid.data[n]) << lo(uuid.data[n]); 100 | out << '-'; 101 | for (unsigned n = 6; n < 8; ++n) 102 | out << hi(uuid.data[n]) << lo(uuid.data[n]); 103 | out << '-'; 104 | for (unsigned n = 8; n < 10; ++n) 105 | out << hi(uuid.data[n]) << lo(uuid.data[n]); 106 | out << '-'; 107 | for (unsigned n = 10; n < 16; ++n) 108 | out << hi(uuid.data[n]) << lo(uuid.data[n]); 109 | return out; 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /src/version.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Emmanuel Engelhart 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #if defined(ENABLE_XAPIAN) 30 | #include 31 | #include 32 | #endif 33 | 34 | namespace zim 35 | { 36 | LibVersions getVersions() { 37 | LibVersions versions = { 38 | { "libzim", LIBZIM_VERSION }, 39 | { "libzstd", ZSTD_VERSION_STRING }, 40 | { "liblzma", LZMA_VERSION_STRING } 41 | }; 42 | 43 | #if defined(ENABLE_XAPIAN) 44 | // Libxapian is not a mandatory dependence 45 | versions.push_back({ "libxapian", XAPIAN_VERSION }); 46 | 47 | // U_ICU_VERSION does not include the patch level if 0 48 | versions.push_back({"libicu", Formatter() << U_ICU_VERSION_MAJOR_NUM << "." 49 | << U_ICU_VERSION_MINOR_NUM << "." 50 | << U_ICU_VERSION_PATCHLEVEL_NUM}); 51 | #endif 52 | 53 | return versions; 54 | } 55 | 56 | void printVersions(std::ostream& out) { 57 | LibVersions versions = getVersions(); 58 | for (const auto& iter : versions) { 59 | out << (iter != versions.front() ? 
"+ " : "") << 60 | iter.first << " " << iter.second << std::endl; 61 | } 62 | } 63 | 64 | } //namespace zim 65 | -------------------------------------------------------------------------------- /src/writer/clusterWorker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "clusterWorker.h" 21 | 22 | #include "cluster.h" 23 | 24 | namespace zim 25 | { 26 | namespace writer 27 | { 28 | 29 | void ClusterTask::run(CreatorData* data) { 30 | cluster->close(); 31 | }; 32 | 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/writer/clusterWorker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_CLUSTER_WORKER_H 21 | #define OPENZIM_LIBZIM_CLUSTER_WORKER_H 22 | 23 | #include 24 | #include "workers.h" 25 | 26 | namespace zim { 27 | namespace writer { 28 | 29 | class Cluster; 30 | 31 | class ClusterTask : public TrackableTask { 32 | public: 33 | ClusterTask(const ClusterTask&) = delete; 34 | ClusterTask& operator=(const ClusterTask&) = delete; 35 | explicit ClusterTask(Cluster* cluster) : 36 | cluster(cluster) 37 | {}; 38 | virtual ~ClusterTask() = default; 39 | 40 | virtual void run(CreatorData* data); 41 | 42 | private: 43 | Cluster* cluster; 44 | }; 45 | 46 | } 47 | } 48 | 49 | #endif // OPENZIM_LIBZIM_QUEUE_H 50 | -------------------------------------------------------------------------------- /src/writer/contentProvider.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | 22 | #include "../fs.h" 23 | 24 | const zim::size_type BUFFER_SIZE(1024*1024); 25 | 26 | namespace zim 27 | { 28 | namespace writer 29 | { 30 | Blob StringProvider::feed() 31 | { 32 | if (feeded) { 33 | return Blob(nullptr, 0); 34 | } 35 | feeded = true; 36 | return Blob(content.data(), content.size()); 37 | } 38 | 39 | Blob SharedStringProvider::feed() 40 | { 41 | if (feeded) { 42 | return Blob(nullptr, 0); 43 | } 44 | feeded = true; 45 | return Blob(content->data(), content->size()); 46 | } 47 | 48 | FileProvider::FileProvider(const std::string& filepath) 49 | : filepath(filepath), 50 | buffer(new char[BUFFER_SIZE]), 51 | fd(new DEFAULTFS::FD(DEFAULTFS::openFile(filepath))), 52 | offset(0) 53 | { 54 | size = fd->getSize().v; 55 | } 56 | 57 | FileProvider::~FileProvider() = default; 58 | 59 | Blob FileProvider::feed() 60 | { 61 | auto sizeToRead = std::min(BUFFER_SIZE, size-offset); 62 | if (!sizeToRead) { 63 | return Blob(nullptr, 0); 64 | } 65 | 66 | if(fd->readAt(buffer.get(), zim::zsize_t(sizeToRead), zim::offset_t(offset)) == zim::zsize_t(-1)) { 67 | throw std::runtime_error("Error reading file " + filepath); 68 | } 69 | offset += sizeToRead; 70 | return Blob(buffer.get(), sizeToRead); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/writer/counterHandler.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #include "counterHandler.h" 21 | #include "creatordata.h" 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | using namespace zim::writer; 28 | 29 | CounterHandler::CounterHandler(CreatorData* data) 30 | : mp_creatorData(data) 31 | {} 32 | 33 | CounterHandler::~CounterHandler() = default; 34 | 35 | void CounterHandler::start() { 36 | } 37 | 38 | void CounterHandler::stop() { 39 | } 40 | 41 | DirentHandler::Dirents CounterHandler::createDirents() const { 42 | Dirents ret; 43 | ret.push_back(mp_creatorData->createDirent(NS::M, "Counter", "text/plain", "")); 44 | return ret; 45 | } 46 | 47 | DirentHandler::ContentProviders CounterHandler::getContentProviders() const { 48 | ContentProviders ret; 49 | Formatter fmt; 50 | bool first = true; 51 | for(auto pair: m_mimetypeCounter) { 52 | if (! 
first) { 53 | fmt << ";"; 54 | } 55 | fmt << pair.first << "=" << pair.second; 56 | first = false; 57 | } 58 | ret.push_back(std::unique_ptr(new StringProvider(fmt))); 59 | return ret; 60 | } 61 | 62 | void CounterHandler::handle(Dirent* dirent, const Hints& hints) 63 | { 64 | } 65 | 66 | void CounterHandler::handle(Dirent* dirent, std::shared_ptr item) 67 | { 68 | if (dirent->getNamespace() != NS::C) { 69 | return; 70 | } 71 | auto mimetype = item->getMimeType(); 72 | if (mimetype.empty()) { 73 | return; 74 | } 75 | m_mimetypeCounter[mimetype] += 1; 76 | } 77 | -------------------------------------------------------------------------------- /src/writer/counterHandler.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 
18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_COUNTER_HANDLER_H 21 | #define OPENZIM_LIBZIM_COUNTER_HANDLER_H 22 | 23 | #include "handler.h" 24 | 25 | #include 26 | 27 | namespace zim { 28 | namespace writer { 29 | 30 | 31 | class CounterHandler : public DirentHandler { 32 | public: 33 | typedef std::map Counter; 34 | 35 | explicit CounterHandler(CreatorData* data); 36 | virtual ~CounterHandler(); 37 | 38 | void start() override; 39 | void stop() override; 40 | bool isCompressible() override { return true; } 41 | ContentProviders getContentProviders() const override; 42 | void handle(Dirent* dirent, std::shared_ptr item) override; 43 | void handle(Dirent* dirent, const Hints& hints) override; 44 | 45 | private: 46 | Dirents createDirents() const override; 47 | CreatorData* mp_creatorData; 48 | Counter m_mimetypeCounter; 49 | }; 50 | 51 | } 52 | } 53 | 54 | #endif // OPENZIM_LIBZIM_COUNTER_HANDLER_H 55 | -------------------------------------------------------------------------------- /src/writer/direntPool.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #ifndef ZIM_WRITER_DIRENTPOOL_H 21 | #define ZIM_WRITER_DIRENTPOOL_H 22 | 23 | #include "debug.h" 24 | #include "_dirent.h" 25 | 26 | namespace zim 27 | { 28 | namespace writer { 29 | class DirentPool { 30 | private: 31 | std::vector pools; 32 | uint16_t direntIndex; 33 | 34 | void allocate_new_pool() { 35 | pools.push_back(reinterpret_cast(new char[sizeof(Dirent)*0xFFFF])); 36 | direntIndex = 0; 37 | } 38 | static void destroyPoolBlock(Dirent* pool, uint16_t count=0xFFFF) { 39 | for (auto i = 0U; i < count; i++) { 40 | try { 41 | pool[i].~Dirent(); 42 | } catch (...){ /*discard*/ } 43 | } 44 | delete [] (reinterpret_cast(pool)); 45 | } 46 | 47 | /* Return a *NOT constructed* pointer to a dirent */ 48 | Dirent* getDirentSlot() { 49 | if (direntIndex == 0xFFFF) { 50 | allocate_new_pool(); 51 | } 52 | auto dirent = pools.back() + direntIndex++; 53 | return dirent; 54 | } 55 | 56 | public: 57 | DirentPool() : 58 | direntIndex(0xFFFF) 59 | {} 60 | DirentPool(const DirentPool&) = delete; 61 | DirentPool& operator=(const DirentPool&) = delete; 62 | ~DirentPool() { 63 | auto nbPools = pools.size(); 64 | if (nbPools == 0) { 65 | return; 66 | } 67 | // Delete all but last pools (add call the destructors of the dirents) 68 | for (auto i = 0U; i 3 | * Copyright (C) 2009 Tommi Maekitalo 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #include 22 | #include 23 | #include "defaultIndexData.h" 24 | 25 | namespace zim 26 | { 27 | namespace writer 28 | { 29 | std::shared_ptr Item::getIndexData() const 30 | { 31 | if (getMimeType().find("text/html")!=0) { 32 | return nullptr; 33 | } 34 | 35 | auto provider = getContentProvider(); 36 | return std::make_shared(std::move(provider), getTitle()); 37 | } 38 | 39 | Hints Item::getHints() const { 40 | return Hints(); 41 | } 42 | 43 | Hints Item::getAmendedHints() const { 44 | auto hints = getHints(); 45 | 46 | // If not FRONT_ARTICLE hints is given, determine it from the mimetype. 47 | if (hints.find(FRONT_ARTICLE) == hints.end()) { 48 | hints[FRONT_ARTICLE] = (getMimeType().find("text/html") == 0); 49 | } 50 | 51 | // If not COMPRESS hints is given, determine it from the mimetype. 
52 | if (hints.find(COMPRESS) == hints.end()) { 53 | hints[COMPRESS] = isCompressibleMimetype(getMimeType()); 54 | } 55 | return hints; 56 | } 57 | 58 | std::unique_ptr StringItem::getContentProvider() const 59 | { 60 | auto shared_string = std::shared_ptr(shared_from_this(), &content); 61 | return std::unique_ptr(new SharedStringProvider(shared_string)); 62 | } 63 | 64 | std::unique_ptr FileItem::getContentProvider() const 65 | { 66 | return std::unique_ptr(new FileProvider(filepath)); 67 | } 68 | 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/writer/queue.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2016-2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 
18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_QUEUE_H 21 | #define OPENZIM_LIBZIM_QUEUE_H 22 | 23 | #define MAX_QUEUE_SIZE 10 24 | 25 | #include 26 | #include 27 | #include "../tools.h" 28 | 29 | template 30 | class Queue { 31 | public: 32 | Queue() = default; 33 | virtual ~Queue() = default; 34 | virtual bool isEmpty(); 35 | virtual size_t size(); 36 | virtual void pushToQueue(const T& element); 37 | virtual bool getHead(T &element); 38 | virtual bool popFromQueue(T &element); 39 | 40 | protected: 41 | std::queue m_realQueue; 42 | std::mutex m_queueMutex; 43 | 44 | private: 45 | // Make this queue non copyable 46 | Queue(const Queue&); 47 | Queue& operator=(const Queue&); 48 | }; 49 | 50 | template 51 | bool Queue::isEmpty() { 52 | std::lock_guard l(m_queueMutex); 53 | return m_realQueue.empty(); 54 | } 55 | 56 | template 57 | size_t Queue::size() { 58 | std::lock_guard l(m_queueMutex); 59 | return m_realQueue.size(); 60 | } 61 | 62 | template 63 | void Queue::pushToQueue(const T &element) { 64 | unsigned int wait = 0; 65 | unsigned int queueSize = 0; 66 | 67 | do { 68 | zim::microsleep(wait); 69 | queueSize = size(); 70 | wait += 10; 71 | } while (queueSize > MAX_QUEUE_SIZE); 72 | 73 | std::lock_guard l(m_queueMutex); 74 | m_realQueue.push(element); 75 | } 76 | 77 | template 78 | bool Queue::getHead(T &element) { 79 | std::lock_guard l(m_queueMutex); 80 | if (m_realQueue.empty()) { 81 | return false; 82 | } 83 | element = m_realQueue.front(); 84 | return true; 85 | } 86 | 87 | template 88 | bool Queue::popFromQueue(T &element) { 89 | std::lock_guard l(m_queueMutex); 90 | if (m_realQueue.empty()) { 91 | return false; 92 | } 93 | 94 | element = m_realQueue.front(); 95 | m_realQueue.pop(); 96 | 97 | return true; 98 | } 99 | 100 | #endif // OPENZIM_LIBZIM_QUEUE_H 101 | -------------------------------------------------------------------------------- /src/writer/titleListingHandler.cpp: -------------------------------------------------------------------------------- 1 | /* 
2 | * Copyright (C) 2020-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #include "titleListingHandler.h" 21 | #include "creatordata.h" 22 | 23 | #include "../endian_tools.h" 24 | #include "tools.h" 25 | 26 | #include 27 | #include 28 | 29 | using namespace zim::writer; 30 | 31 | namespace { 32 | 33 | class ListingProvider : public ContentProvider { 34 | public: 35 | explicit ListingProvider(const TitleListingHandler::Dirents* dirents) 36 | : mp_dirents(dirents), 37 | m_it(dirents->begin()) 38 | {} 39 | 40 | zim::size_type getSize() const override { 41 | return mp_dirents->size() * sizeof(zim::entry_index_type); 42 | } 43 | 44 | zim::Blob feed() override { 45 | if (m_it == mp_dirents->end()) { 46 | return zim::Blob(nullptr, 0); 47 | } 48 | zim::toLittleEndian((*m_it)->getIdx().v, buffer); 49 | m_it++; 50 | return zim::Blob(buffer, sizeof(zim::entry_index_type)); 51 | } 52 | 53 | private: 54 | const TitleListingHandler::Dirents* mp_dirents; 55 | char buffer[sizeof(zim::entry_index_type)]; 56 | TitleListingHandler::Dirents::const_iterator m_it; 57 | }; 58 | 59 | } // end of anonymous namespace 60 | 61 | TitleListingHandler::TitleListingHandler(CreatorData* data) 62 | : mp_creatorData(data) 63 | {} 64 | 65 | 
TitleListingHandler::~TitleListingHandler() = default; 66 | 67 | void TitleListingHandler::start() { 68 | } 69 | /* Drops the dirents flagged as removed, then orders the remaining ones with TitleCompare. */ 70 | void TitleListingHandler::stop() { 71 | m_handledDirents.erase( 72 | std::remove_if(m_handledDirents.begin(), m_handledDirents.end(), [](const Dirent* d) { return d->isRemoved(); }), 73 | m_handledDirents.end()); 74 | std::sort(m_handledDirents.begin(), m_handledDirents.end(), TitleCompare()); 75 | } 76 | /* The handler owns a single dirent: X/listing/titleOrdered/v1. */ 77 | DirentHandler::Dirents TitleListingHandler::createDirents() const { 78 | Dirents ret; 79 | ret.push_back(mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v1", "application/octet-stream+zimlisting", "")); 80 | return ret; 81 | } 82 | /* The provider keeps a pointer to m_handledDirents, so it reads whatever the vector contains when it is consumed. */ 83 | DirentHandler::ContentProviders TitleListingHandler::getContentProviders() const { 84 | ContentProviders ret; 85 | ret.push_back(std::unique_ptr(new ListingProvider(&m_handledDirents))); 86 | return ret; 87 | } 88 | 89 | void TitleListingHandler::handle(Dirent* dirent, std::shared_ptr item) 90 | { 91 | handle(dirent, item->getAmendedHints()); 92 | } 93 | 94 | /* Only dirents for which isFrontArticle(dirent, hints) is true are recorded for the listing. */ 95 | void TitleListingHandler::handle(Dirent* dirent, const Hints& hints) 96 | { 97 | if (isFrontArticle(dirent, hints)) { 98 | m_handledDirents.push_back(dirent); 99 | } 100 | } 101 | 102 | -------------------------------------------------------------------------------- /src/writer/titleListingHandler.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_LISTING_HANDLER_H 21 | #define OPENZIM_LIBZIM_LISTING_HANDLER_H 22 | 23 | #include "handler.h" 24 | #include "_dirent.h" 25 | 26 | #include 27 | 28 | namespace zim { 29 | namespace writer { 30 | 31 | struct TitleCompare { 32 | bool operator() (const Dirent* d1, const Dirent* d2) const { 33 | return compareTitle(d1, d2); 34 | } 35 | }; 36 | 37 | // This handler is in charge of the title listing. 38 | // NOTE(review): this comment used to say the handler creates both the "classic" old V0 title listing (for ALL entries) and 39 | // the V1 title listing (for front articles only). The implementation in titleListingHandler.cpp only creates "listing/titleOrdered/v1" and only records front articles, so the V0 listing is presumably produced elsewhere - confirm. 40 | class TitleListingHandler : public DirentHandler { 41 | public: 42 | explicit TitleListingHandler(CreatorData* data); 43 | virtual ~TitleListingHandler(); 44 | 45 | void start() override; 46 | void stop() override; 47 | bool isCompressible() override { return false; } 48 | ContentProviders getContentProviders() const override; 49 | void handle(Dirent* dirent, std::shared_ptr item) override; 50 | void handle(Dirent* dirent, const Hints& hints) override; 51 | 52 | protected: 53 | Dirents createDirents() const override; 54 | CreatorData* mp_creatorData; 55 | Dirents m_handledDirents; 56 | }; 57 | } 58 | } 59 | 60 | #endif // OPENZIM_LIBZIM_LISTING_HANDLER_H 61 | -------------------------------------------------------------------------------- /src/writer/workers.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019-2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later 
version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "workers.h" 21 | #include "cluster.h" 22 | #include "creatordata.h" 23 | 24 | #include "../tools.h" 25 | 26 | #ifdef _WIN32 27 | #include 28 | #else 29 | #include 30 | #endif 31 | 32 | namespace zim 33 | { 34 | namespace writer 35 | { 36 | /* Worker loop: pops tasks from creatorData->taskList and runs them until a null task is popped or creatorData->isErrored() becomes true. The value passed to microsleep() grows by 100 on every iteration and is reset to 0 after a task has run. Any exception is recorded with creatorData->addError(). */ 37 | void* taskRunner(void* arg) { 38 | auto creatorData = static_cast(arg); 39 | unsigned int wait = 0; 40 | try { 41 | while(!creatorData->isErrored()) { 42 | std::shared_ptr task; 43 | microsleep(wait); 44 | wait += 100; 45 | if (creatorData->taskList.popFromQueue(task)) { 46 | if (!task) { 47 | return nullptr; 48 | } 49 | task->run(creatorData); 50 | wait = 0; 51 | } 52 | } 53 | } catch (...) { 54 | creatorData->addError(std::current_exception()); 55 | } 56 | return nullptr; 57 | } 58 | /* Writer loop: peeks at the head of creatorData->clusterToWrite. A null head ends the loop; a head cluster that is not closed yet is left in the queue and checked again on a later iteration; otherwise the cluster is popped, given the current position of out_fd as its offset, written and its data cleared. Any exception is recorded with creatorData->addError(). */ 59 | void* clusterWriter(void* arg) { 60 | auto creatorData = static_cast(arg); 61 | Cluster* cluster; 62 | unsigned int wait = 0; 63 | try { 64 | while(!creatorData->isErrored()) { 65 | microsleep(wait); 66 | wait += 100; 67 | if(creatorData->clusterToWrite.getHead(cluster)) { 68 | if (cluster == nullptr) { 69 | // All clusters written, we can quit 70 | return nullptr; 71 | } 72 | if (not cluster->isClosed()) { 73 | continue; 74 | } 75 | creatorData->clusterToWrite.popFromQueue(cluster); 76 | cluster->setOffset(offset_t(lseek(creatorData->out_fd, 0, SEEK_CUR))); 77 | cluster->write(creatorData->out_fd); 78 | cluster->clear_data(); 79 | wait = 0; 80 | } 81 | } 82 | } catch(...) 
{ 83 | creatorData->addError(std::current_exception()); 84 | } 85 | return nullptr; 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/writer/workers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019-2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_WORKERS_H 21 | #define OPENZIM_LIBZIM_WORKERS_H 22 | 23 | #include "tools.h" 24 | #include "creatordata.h" 25 | 26 | namespace zim { 27 | namespace writer { 28 | /* Abstract unit of work: run() is called with the CreatorData the task operates on. */ 29 | class Task { 30 | public: 31 | Task() = default; 32 | virtual ~Task() = default; 33 | 34 | virtual void run(CreatorData* data) = 0; 35 | }; 36 | /* Task base that counts its live instances in a static atomic: the constructor increments waitingTaskCount and the destructor decrements it. waitNoMoreTask() polls that counter. */ 37 | template 38 | class TrackableTask: public Task { 39 | public: 40 | TrackableTask(const TrackableTask&) = delete; 41 | TrackableTask& operator=(const TrackableTask&) = delete; 42 | TrackableTask() { ++waitingTaskCount; } 43 | virtual ~TrackableTask() { --waitingTaskCount;} 44 | 45 | static void waitNoMoreTask(const CreatorData* data) { 46 | // Wait until all tasks have been done. 47 | // If we are in error state, threads have been stopped and waitingTaskCount 48 | // will never reach 0, so no need to wait. 
49 | unsigned int wait = 0; 50 | do { 51 | microsleep(wait); 52 | wait += 10; 53 | } while(waitingTaskCount.load() > 0 && !data->isErrored()); 54 | } 55 | 56 | private: 57 | static std::atomic waitingTaskCount; 58 | }; 59 | 60 | template 61 | std::atomic zim::writer::TrackableTask::waitingTaskCount(0); 62 | 63 | void* taskRunner(void* data); 64 | void* clusterWriter(void* data); 65 | 66 | } 67 | } 68 | 69 | #endif // OPENZIM_LIBZIM_WORKERS_H 70 | -------------------------------------------------------------------------------- /src/writer/xapianHandler.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 
18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_XAPIAN_HANDLER_H 21 | #define OPENZIM_LIBZIM_XAPIAN_HANDLER_H 22 | 23 | #include "handler.h" 24 | 25 | namespace zim { 26 | namespace writer { 27 | 28 | class XapianIndexer; 29 | /* DirentHandler that holds two XapianIndexer instances (mp_fulltextIndexer and mp_titleIndexer). NOTE(review): the role of withFullTextIndex is defined in the .cpp, which is not visible here - presumably it enables the full-text indexer; confirm. */ 30 | class XapianHandler : public DirentHandler { 31 | public: 32 | XapianHandler(CreatorData* data, bool withFullTextIndex); 33 | virtual ~XapianHandler(); 34 | 35 | void start() override; 36 | void stop() override; 37 | bool isCompressible() override { return false; } 38 | ContentProviders getContentProviders() const override; 39 | void handle(Dirent* dirent, std::shared_ptr item) override; 40 | void handle(Dirent* dirent, const Hints& hints) override; 41 | 42 | protected: 43 | Dirents createDirents() const override; 44 | 45 | private: // methods 46 | void indexTitle(Dirent* dirent); 47 | void waitNoMoreTask() const; 48 | 49 | private: // data 50 | std::unique_ptr mp_fulltextIndexer; 51 | std::unique_ptr mp_titleIndexer; 52 | CreatorData* mp_creatorData; 53 | }; 54 | 55 | } 56 | } 57 | 58 | #endif // OPENZIM_LIBZIM_XAPIAN_HANDLER_H 59 | -------------------------------------------------------------------------------- /src/writer/xapianIndexer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Maneesh P M 3 | * Copyright (C) 2018-2021 Matthieu Gautier 4 | * Copyright (C) 2011 Emmanuel Engelhart 5 | * 6 | * This program is free software; you can redistribute it and/or modify 7 | * it under the terms of the GNU General Public License as published by 8 | * the Free Software Foundation; either version 3 of the License, or 9 | * any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 
15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 19 | * MA 02110-1301, USA. 20 | */ 21 | 22 | #ifndef LIBZIM_WRITER_XAPIANINDEXER_H 23 | #define LIBZIM_WRITER_XAPIANINDEXER_H 24 | 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | 32 | namespace zim { 33 | namespace writer { 34 | 35 | class IndexTask; 36 | /* Selects what an indexer is used for: titles only, or full content. */ 37 | enum class IndexingMode { 38 | TITLE, 39 | FULL 40 | }; 41 | /* Holds the Xapian::WritableDatabase being filled together with the stemmer language, stopper and index path. `empty` starts as true and is what is_empty() reports. IndexTask is a friend so that it can reach the protected members (database, stopper, stemmer_language, empty). */ 42 | class XapianIndexer 43 | { 44 | public: 45 | XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode mode, bool verbose); 46 | virtual ~XapianIndexer(); 47 | std::string getIndexPath() { return indexPath; } 48 | void indexingPrelude(); 49 | void indexingPostlude(); 50 | bool is_empty() { return empty; } 51 | 52 | void indexTitle(const std::string& path, const std::string& title, const std::string& targetPath = ""); 53 | 54 | protected: 55 | Xapian::WritableDatabase writableDatabase; 56 | bool empty {true}; 57 | std::string stemmer_language; 58 | Xapian::SimpleStopper stopper; 59 | std::string indexPath; 60 | std::string language; 61 | std::string stopwords; 62 | IndexingMode indexingMode; 63 | 64 | friend class zim::writer::IndexTask; 65 | }; 66 | 67 | } 68 | } 69 | 70 | #endif // LIBZIM_WRITER_XAPIANINDEXER_H 71 | -------------------------------------------------------------------------------- /src/writer/xapianWorker.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Maneesh P M 3 | * Copyright (C) 2020-2021 Matthieu Gautier 4 | * 5 | * This program is free software; you can redistribute it and/or 6 | * modify it under the terms of the GNU General Public License as 7 | * published by the Free Software Foundation; either version 2 of the 8 | * License, or (at your option) any later version. 
9 | * 10 | * This program is distributed in the hope that it will be useful, but 11 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 12 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 13 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 14 | * 15 | * You should have received a copy of the GNU General Public License 16 | * along with this program; if not, write to the Free Software 17 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 18 | * 19 | */ 20 | 21 | #include "xapianWorker.h" 22 | #include "creatordata.h" 23 | 24 | #include "xapianIndexer.h" 25 | 26 | #include 27 | #include 28 | #include 29 | /* Serialises the add_document() call (and the update of `empty`) made at the end of IndexTask::run(). */ 30 | static std::mutex s_dbaccessLock; 31 | 32 | namespace zim 33 | { 34 | namespace writer 35 | { 36 | /* Weights passed to the term generator: keywords use a fixed factor of 3; the title factor is contentLength / 500 + 1 (integer division), so it grows with the size of the indexed content. */ 37 | const unsigned int keywordsBoostFactor = 3; 38 | inline unsigned int getTitleBoostFactor(const unsigned int contentLength) 39 | { 40 | return contentLength / 500 + 1; 41 | } 42 | /* Builds one Xapian document for the item and adds it to the indexer's database. Returns immediately when the item has no index data. Document data is "C/" + m_path; value slot 0 holds the title, slot 1 the word count and slot 2 the serialised geo position when one is present. Content, title and keywords are indexed without positions. The `data` parameter is not used in this body. */ 43 | void IndexTask::run(CreatorData* data) { 44 | if (!mp_indexData->hasIndexData()) { 45 | return; 46 | } 47 | Xapian::Stem stemmer; 48 | Xapian::TermGenerator indexer; 49 | indexer.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM); 50 | try { 51 | stemmer = Xapian::Stem(mp_indexer->stemmer_language); 52 | indexer.set_stemmer(stemmer); 53 | indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); 54 | } catch (...) { 55 | // No stemming for language. 
56 | } 57 | indexer.set_stopper(&mp_indexer->stopper); 58 | indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); 59 | 60 | Xapian::Document document; 61 | indexer.set_document(document); 62 | 63 | std::string fullPath = "C/" + m_path; 64 | document.set_data(fullPath); 65 | document.add_value(0, mp_indexData->getTitle()); 66 | document.add_value(1, Formatter() << mp_indexData->getWordCount()); 67 | 68 | auto geoInfo = mp_indexData->getGeoPosition(); 69 | if (std::get<0>(geoInfo)) { 70 | auto geoPosition = Xapian::LatLongCoord( 71 | std::get<1>(geoInfo), std::get<2>(geoInfo)).serialise(); 72 | document.add_value(2, geoPosition); 73 | } 74 | 75 | /* Index the content */ 76 | auto indexContent = mp_indexData->getContent(); 77 | if (!indexContent.empty()) { 78 | indexer.index_text_without_positions(indexContent); 79 | } 80 | 81 | /* Index the title */ 82 | auto indexTitle = mp_indexData->getTitle(); 83 | if (!indexTitle.empty()) { 84 | indexer.index_text_without_positions( 85 | indexTitle, getTitleBoostFactor(indexContent.size())); 86 | } 87 | 88 | /* Index the keywords */ 89 | auto indexKeywords = mp_indexData->getKeywords(); 90 | if (!indexKeywords.empty()) { 91 | indexer.index_text_without_positions(indexKeywords, keywordsBoostFactor); 92 | } 93 | 94 | std::lock_guard l(s_dbaccessLock); 95 | mp_indexer->writableDatabase.add_document(document); 96 | mp_indexer->empty = false; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/writer/xapianWorker.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or modify 5 | * it under the terms of the GNU General Public License as published by 6 | * the Free Software Foundation; either version 3 of the License, or 7 | * any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 17 | * MA 02110-1301, USA. 18 | */ 19 | 20 | #ifndef OPENZIM_LIBZIM_XAPIAN_WORKER_H 21 | #define OPENZIM_LIBZIM_XAPIAN_WORKER_H 22 | 23 | #include 24 | #include 25 | #include "workers.h" 26 | #include 27 | 28 | namespace zim { 29 | namespace writer { 30 | 31 | class Item; 32 | class XapianIndexer; 33 | /* Trackable task that indexes one item: it keeps the item's index data (shared), a copy of the item path, and a raw pointer to the XapianIndexer that receives the document. The destructor is defaulted, so the indexer is not deleted by the task. */ 34 | class IndexTask : public TrackableTask { 35 | public: 36 | IndexTask(const IndexTask&) = delete; 37 | IndexTask& operator=(const IndexTask&) = delete; 38 | IndexTask(std::shared_ptr indexData, const std::string& path, XapianIndexer* indexer) : 39 | mp_indexData(indexData), 40 | m_path(path), 41 | mp_indexer(indexer) 42 | {} 43 | virtual ~IndexTask() = default; 44 | 45 | virtual void run(CreatorData* data); 46 | 47 | private: 48 | std::shared_ptr mp_indexData; 49 | std::string m_path; 50 | XapianIndexer* mp_indexer; 51 | }; 52 | 53 | } 54 | } 55 | 56 | #endif // OPENZIM_LIBZIM_XAPIAN_WORKER_H 57 | -------------------------------------------------------------------------------- /src/xapian/htmlparse.h: -------------------------------------------------------------------------------- 1 | /* htmlparse.h: simple HTML parser for omega indexer 2 | * 3 | * Copyright 1999,2000,2001 BrightStation PLC 4 | * Copyright 2002,2006,2008 Olly Betts 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation; either version 2 of the 9 | * License, or (at your option) any later 
version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 19 | * USA 20 | */ 21 | 22 | #ifndef OMEGA_INCLUDED_HTMLPARSE_H 23 | #define OMEGA_INCLUDED_HTMLPARSE_H 24 | 25 | #include 26 | #include "config.h" 27 | 28 | #include 29 | #include 30 | 31 | using std::string; 32 | using std::map; 33 | 34 | namespace zim { 35 | /* Base HTML parser. process_text(), opening_tag() and closing_tag() are virtual hooks with empty default bodies, meant to be overridden by subclasses; parse_html() is virtual as well. NOTE(review): parse_html() presumably drives those hooks - its definition is not visible here, confirm in the .cc file. */ 36 | class LIBZIM_PRIVATE_API HtmlParser { 37 | map parameters; 38 | protected: 39 | void decode_entities(string &s); 40 | bool in_script; 41 | string charset; 42 | static map named_ents; 43 | 44 | bool get_parameter(const string & param, string & value); 45 | public: 46 | virtual void process_text(const string &/*text*/) { } 47 | virtual void opening_tag(const string &/*tag*/) { } 48 | virtual void closing_tag(const string &/*tag*/) { } 49 | virtual void parse_html(const string &text); 50 | HtmlParser(); 51 | virtual ~HtmlParser() { } 52 | }; 53 | 54 | }; 55 | 56 | #endif // OMEGA_INCLUDED_HTMLPARSE_H 57 | -------------------------------------------------------------------------------- /src/xapian/myhtmlparse.h: -------------------------------------------------------------------------------- 1 | /* myhtmlparse.h: subclass of HtmlParser for extracting text 2 | * 3 | * Copyright 1999,2000,2001 BrightStation PLC 4 | * Copyright 2002,2003,2004,2006,2008 Olly Betts 5 | * 6 | * This program is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU General Public License as 8 | * published by the Free Software Foundation; either version 2 of the 9 | * License, or (at your option) any later 
version. 10 | * 11 | * This program is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU General Public License for more details. 15 | * 16 | * You should have received a copy of the GNU General Public License 17 | * along with this program; if not, write to the Free Software 18 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 19 | * USA 20 | */ 21 | 22 | #ifndef OMEGA_INCLUDED_MYHTMLPARSE_H 23 | #define OMEGA_INCLUDED_MYHTMLPARSE_H 24 | 25 | #include 26 | #include "htmlparse.h" 27 | 28 | // FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but 29 | // not in all charsets and perhaps spans of all \xa0 should become a single 30 | // \xa0? 31 | #define WHITESPACE " \t\n\r" 32 | 33 | namespace zim { 34 | /* HtmlParser subclass that exposes its parsing state and the extracted strings (title, sample, keywords, dump) as public members. reset() restores the same initial values as the constructor and empties the four strings. */ 35 | class LIBZIM_PRIVATE_API MyHtmlParser : public HtmlParser { 36 | public: 37 | bool in_script_tag; 38 | bool in_style_tag; 39 | bool pending_space; 40 | bool indexing_allowed; 41 | bool charset_from_meta; 42 | float latitude, longitude; 43 | bool has_geoPosition; 44 | string title, sample, keywords, dump; 45 | void process_text(const string &text); 46 | void opening_tag(const string &tag); 47 | void closing_tag(const string &tag); 48 | using HtmlParser::parse_html; 49 | void parse_html(const string &text, const string &charset_, 50 | bool charset_from_meta_); 51 | MyHtmlParser() : 52 | in_script_tag(false), 53 | in_style_tag(false), 54 | pending_space(false), 55 | indexing_allowed(true), 56 | charset_from_meta(false), 57 | latitude(0), longitude(0), has_geoPosition(false) { } 58 | 59 | void reset() { 60 | in_script_tag = false; 61 | in_style_tag = false; 62 | pending_space = false; 63 | indexing_allowed = true; 64 | charset_from_meta = false; 65 | latitude = longitude = 0; 66 | has_geoPosition = false; 67 | title.resize(0); 68 | sample.resize(0); 69 | keywords.resize(0); 70 
| dump.resize(0); 71 | } 72 | }; 73 | 74 | }; 75 | 76 | #endif // OMEGA_INCLUDED_MYHTMLPARSE_H 77 | -------------------------------------------------------------------------------- /src/zim_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018-2021 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | 21 | #ifndef ZIM_TYPES_H 22 | #define ZIM_TYPES_H 23 | 24 | #include 25 | 26 | #include 27 | 28 | #ifdef __GNUC__ 29 | #define PACKED __attribute__((packed)) 30 | #else 31 | #define PACKED 32 | #endif 33 | /* Strong-typedef helper: wraps a single value `v` of the base type. Construction from the base type and the conversions to bool and to the base type are all explicit; only ==, pre-increment and post-increment are defined here. */ 34 | template 35 | struct REAL_TYPEDEF{ 36 | typedef B base_type; 37 | typedef S SELF; 38 | 39 | B v; 40 | REAL_TYPEDEF() : v(0) {}; 41 | explicit REAL_TYPEDEF(B v) : v(v) {}; 42 | explicit inline operator bool() const { return v != 0; } 43 | explicit inline operator B() const { return v; } 44 | 45 | inline bool operator==(const REAL_TYPEDEF& rhs) const 46 | { return v == rhs.v; } 47 | 48 | inline REAL_TYPEDEF& operator++() 49 | { v++; return *this; } 50 | 51 | inline REAL_TYPEDEF operator++(int) 52 | { return REAL_TYPEDEF(v++); } 53 | } PACKED; 54 | 55 | template 56 | std::ostream& operator<<(std::ostream& os, const 
REAL_TYPEDEF& obj) 57 | { 58 | os << obj.v; 59 | return os; 60 | } 61 | 62 | namespace zim { 63 | /* TYPEDEF(NAME, TYPE) declares NAME as a distinct type wrapping TYPE (same size, checked by the static_assert) and defines its +=, +, -=, - and ordering/inequality operators. The macro is undefined again after the six declarations below. */ 64 | #define TYPEDEF(NAME, TYPE) struct NAME : public REAL_TYPEDEF { \ 65 | explicit NAME(TYPE v=0) : REAL_TYPEDEF(v) {}; } PACKED; \ 66 | static_assert(sizeof(NAME) == sizeof(TYPE), ""); \ 67 | inline NAME& operator+= (NAME& lhs, const NAME& rhs) { lhs.v += rhs.v; return lhs; } \ 68 | inline NAME& operator+= (NAME& lhs, const TYPE& rhs) { lhs.v += rhs; return lhs; } \ 69 | inline NAME operator+(NAME lhs, const NAME& rhs) { lhs += rhs; return lhs; } \ 70 | inline NAME& operator-=(NAME& lhs, const NAME& rhs) { lhs.v -= rhs.v; return lhs; } \ 71 | inline NAME operator-(NAME lhs, const NAME& rhs) { lhs -= rhs; return lhs; } \ 72 | inline bool operator< (const NAME& lhs, const NAME& rhs) { return lhs.v < rhs.v; } \ 73 | inline bool operator> (const NAME& lhs, const NAME& rhs) { return rhs < lhs; } \ 74 | inline bool operator<=(const NAME& lhs, const NAME& rhs) { return !(lhs > rhs); } \ 75 | inline bool operator>=(const NAME& lhs, const NAME& rhs) { return !(lhs < rhs); } \ 76 | inline bool operator!=(const NAME& lhs, const NAME& rhs) { return !(lhs == rhs); } 77 | 78 | 79 | TYPEDEF(entry_index_t, entry_index_type) 80 | TYPEDEF(title_index_t, entry_index_type) 81 | TYPEDEF(cluster_index_t, cluster_index_type) 82 | TYPEDEF(blob_index_t, blob_index_type) 83 | 84 | TYPEDEF(zsize_t, size_type) 85 | TYPEDEF(offset_t, offset_type) 86 | 87 | #undef TYPEDEF 88 | /* Mixed arithmetic: a size can be added to an offset, yielding an offset. */ 89 | inline offset_t& operator+= (offset_t& lhs, const zsize_t& rhs) 90 | { 91 | lhs.v += rhs.v; 92 | return lhs; 93 | } 94 | 95 | inline offset_t operator+(offset_t lhs, const zsize_t& rhs) 96 | { 97 | lhs += rhs; 98 | return lhs; 99 | } 100 | 101 | }; 102 | 103 | #endif //ZIM_TYPES_H 104 | -------------------------------------------------------------------------------- /static/meson.build: -------------------------------------------------------------------------------- 1 | 2 | resources_list = 'resources_list.txt' 3 | 4 
| lib_resources = custom_target('resources', 5 | input: resources_list, 6 | output: ['libzim-resources.cpp', 'libzim-resources.h'], 7 | command:[res_compiler, 8 | '--cxxfile', '@OUTPUT0@', 9 | '--hfile', '@OUTPUT1@', 10 | '--source_dir', '@OUTDIR@', 11 | '@INPUT@'] 12 | ) 13 | -------------------------------------------------------------------------------- /static/resources_list.txt: -------------------------------------------------------------------------------- 1 | stopwords/af 2 | stopwords/ar 3 | stopwords/bg 4 | stopwords/bn 5 | stopwords/br 6 | stopwords/ca 7 | stopwords/cs 8 | stopwords/da 9 | stopwords/de 10 | stopwords/el 11 | stopwords/en 12 | stopwords/eo 13 | stopwords/es 14 | stopwords/et 15 | stopwords/eu 16 | stopwords/fa 17 | stopwords/fi 18 | stopwords/fr 19 | stopwords/ga 20 | stopwords/gl 21 | stopwords/gu 22 | stopwords/ha 23 | stopwords/he 24 | stopwords/hi 25 | stopwords/hr 26 | stopwords/hu 27 | stopwords/hy 28 | stopwords/id 29 | stopwords/it 30 | stopwords/ja 31 | stopwords/ko 32 | stopwords/ku 33 | stopwords/la 34 | stopwords/lt 35 | stopwords/lv 36 | stopwords/mr 37 | stopwords/ms 38 | stopwords/nl 39 | stopwords/no 40 | stopwords/pl 41 | stopwords/pt 42 | stopwords/ro 43 | stopwords/ru 44 | stopwords/sk 45 | stopwords/sl 46 | stopwords/so 47 | stopwords/st 48 | stopwords/sv 49 | stopwords/sw 50 | stopwords/th 51 | stopwords/tl 52 | stopwords/tr 53 | stopwords/uk 54 | stopwords/ur 55 | stopwords/vi 56 | stopwords/yo 57 | stopwords/zh 58 | stopwords/zu -------------------------------------------------------------------------------- /static/stopwords/af: -------------------------------------------------------------------------------- 1 | 'n 2 | aan 3 | af 4 | al 5 | as 6 | baie 7 | by 8 | daar 9 | dag 10 | dat 11 | die 12 | dit 13 | een 14 | ek 15 | en 16 | gaan 17 | gesê 18 | haar 19 | het 20 | hom 21 | hulle 22 | hy 23 | in 24 | is 25 | jou 26 | jy 27 | kan 28 | kom 29 | ma 30 | maar 31 | met 32 | my 33 | na 34 | nie 35 | om 36 | ons 37 | 
op 38 | saam 39 | sal 40 | se 41 | sien 42 | so 43 | sy 44 | te 45 | toe 46 | uit 47 | van 48 | vir 49 | was 50 | wat 51 | ʼn -------------------------------------------------------------------------------- /static/stopwords/bg: -------------------------------------------------------------------------------- 1 | а 2 | автентичен 3 | аз 4 | ако 5 | ала 6 | бе 7 | без 8 | беше 9 | би 10 | бивш 11 | бивша 12 | бившо 13 | бил 14 | била 15 | били 16 | било 17 | благодаря 18 | близо 19 | бъдат 20 | бъде 21 | бяха 22 | в 23 | вас 24 | ваш 25 | ваша 26 | вероятно 27 | вече 28 | взема 29 | ви 30 | вие 31 | винаги 32 | внимава 33 | време 34 | все 35 | всеки 36 | всички 37 | всичко 38 | всяка 39 | във 40 | въпреки 41 | върху 42 | г 43 | ги 44 | главен 45 | главна 46 | главно 47 | глас 48 | го 49 | година 50 | години 51 | годишен 52 | д 53 | да 54 | дали 55 | два 56 | двама 57 | двамата 58 | две 59 | двете 60 | ден 61 | днес 62 | дни 63 | до 64 | добра 65 | добре 66 | добро 67 | добър 68 | докато 69 | докога 70 | дори 71 | досега 72 | доста 73 | друг 74 | друга 75 | други 76 | е 77 | евтин 78 | едва 79 | един 80 | една 81 | еднаква 82 | еднакви 83 | еднакъв 84 | едно 85 | екип 86 | ето 87 | живот 88 | за 89 | забавям 90 | зад 91 | заедно 92 | заради 93 | засега 94 | заспал 95 | затова 96 | защо 97 | защото 98 | и 99 | из 100 | или 101 | им 102 | има 103 | имат 104 | иска 105 | й 106 | каза 107 | как 108 | каква 109 | какво 110 | както 111 | какъв 112 | като 113 | кога 114 | когато 115 | което 116 | които 117 | кой 118 | който 119 | колко 120 | която 121 | къде 122 | където 123 | към 124 | лесен 125 | лесно 126 | ли 127 | лош 128 | м 129 | май 130 | малко 131 | ме 132 | между 133 | мек 134 | мен 135 | месец 136 | ми 137 | много 138 | мнозина 139 | мога 140 | могат 141 | може 142 | мокър 143 | моля 144 | момента 145 | му 146 | н 147 | на 148 | над 149 | назад 150 | най 151 | направи 152 | напред 153 | например 154 | нас 155 | не 156 | него 157 | нещо 158 | нея 159 | ни 160 | ние 
161 | никой 162 | нито 163 | нищо 164 | но 165 | нов 166 | нова 167 | нови 168 | новина 169 | някои 170 | някой 171 | няколко 172 | няма 173 | обаче 174 | около 175 | освен 176 | особено 177 | от 178 | отгоре 179 | отново 180 | още 181 | пак 182 | по 183 | повече 184 | повечето 185 | под 186 | поне 187 | поради 188 | после 189 | почти 190 | прави 191 | пред 192 | преди 193 | през 194 | при 195 | пък 196 | първата 197 | първи 198 | първо 199 | пъти 200 | равен 201 | равна 202 | с 203 | са 204 | сам 205 | само 206 | се 207 | сега 208 | си 209 | син 210 | скоро 211 | след 212 | следващ 213 | сме 214 | смях 215 | според 216 | сред 217 | срещу 218 | сте 219 | съм 220 | със 221 | също 222 | т 223 | т.н. 224 | тази 225 | така 226 | такива 227 | такъв 228 | там 229 | твой 230 | те 231 | тези 232 | ти 233 | то 234 | това 235 | тогава 236 | този 237 | той 238 | толкова 239 | точно 240 | три 241 | трябва 242 | тук 243 | тъй 244 | тя 245 | тях 246 | у 247 | утре 248 | харесва 249 | хиляди 250 | ч 251 | часа 252 | че 253 | често 254 | чрез 255 | ще 256 | щом 257 | юмрук 258 | я 259 | як -------------------------------------------------------------------------------- /static/stopwords/ca: -------------------------------------------------------------------------------- 1 | a 2 | abans 3 | ací 4 | ah 5 | així 6 | això 7 | al 8 | aleshores 9 | algun 10 | alguna 11 | algunes 12 | alguns 13 | alhora 14 | allà 15 | allí 16 | allò 17 | als 18 | altra 19 | altre 20 | altres 21 | amb 22 | ambdues 23 | ambdós 24 | anar 25 | ans 26 | apa 27 | aquell 28 | aquella 29 | aquelles 30 | aquells 31 | aquest 32 | aquesta 33 | aquestes 34 | aquests 35 | aquí 36 | baix 37 | bastant 38 | bé 39 | cada 40 | cadascuna 41 | cadascunes 42 | cadascuns 43 | cadascú 44 | com 45 | consegueixo 46 | conseguim 47 | conseguir 48 | consigueix 49 | consigueixen 50 | consigueixes 51 | contra 52 | d'un 53 | d'una 54 | d'unes 55 | d'uns 56 | dalt 57 | de 58 | del 59 | dels 60 | des 61 | des de 62 | després 63 | dins 
64 | dintre 65 | donat 66 | doncs 67 | durant 68 | e 69 | eh 70 | el 71 | elles 72 | ells 73 | els 74 | em 75 | en 76 | encara 77 | ens 78 | entre 79 | era 80 | erem 81 | eren 82 | eres 83 | es 84 | esta 85 | estan 86 | estat 87 | estava 88 | estaven 89 | estem 90 | esteu 91 | estic 92 | està 93 | estàvem 94 | estàveu 95 | et 96 | etc 97 | ets 98 | fa 99 | faig 100 | fan 101 | fas 102 | fem 103 | fer 104 | feu 105 | fi 106 | fins 107 | fora 108 | gairebé 109 | ha 110 | han 111 | has 112 | haver 113 | havia 114 | he 115 | hem 116 | heu 117 | hi 118 | ho 119 | i 120 | igual 121 | iguals 122 | inclòs 123 | ja 124 | jo 125 | l'hi 126 | la 127 | les 128 | li 129 | li'n 130 | llarg 131 | llavors 132 | m'he 133 | ma 134 | mal 135 | malgrat 136 | mateix 137 | mateixa 138 | mateixes 139 | mateixos 140 | me 141 | mentre 142 | meu 143 | meus 144 | meva 145 | meves 146 | mode 147 | molt 148 | molta 149 | moltes 150 | molts 151 | mon 152 | mons 153 | més 154 | n'he 155 | n'hi 156 | ne 157 | ni 158 | no 159 | nogensmenys 160 | només 161 | nosaltres 162 | nostra 163 | nostre 164 | nostres 165 | o 166 | oh 167 | oi 168 | on 169 | pas 170 | pel 171 | pels 172 | per 173 | per que 174 | perquè 175 | però 176 | poc 177 | poca 178 | pocs 179 | podem 180 | poden 181 | poder 182 | podeu 183 | poques 184 | potser 185 | primer 186 | propi 187 | puc 188 | qual 189 | quals 190 | quan 191 | quant 192 | que 193 | quelcom 194 | qui 195 | quin 196 | quina 197 | quines 198 | quins 199 | què 200 | s'ha 201 | s'han 202 | sa 203 | sabem 204 | saben 205 | saber 206 | sabeu 207 | sap 208 | saps 209 | semblant 210 | semblants 211 | sense 212 | ser 213 | ses 214 | seu 215 | seus 216 | seva 217 | seves 218 | si 219 | sobre 220 | sobretot 221 | soc 222 | solament 223 | sols 224 | som 225 | son 226 | sons 227 | sota 228 | sou 229 | sóc 230 | són 231 | t'ha 232 | t'han 233 | t'he 234 | ta 235 | tal 236 | també 237 | tampoc 238 | tan 239 | tant 240 | tanta 241 | tantes 242 | te 243 | tene 244 | tenim 245 | 
tenir 246 | teniu 247 | teu 248 | teus 249 | teva 250 | teves 251 | tinc 252 | ton 253 | tons 254 | tot 255 | tota 256 | totes 257 | tots 258 | un 259 | una 260 | unes 261 | uns 262 | us 263 | va 264 | vaig 265 | vam 266 | van 267 | vas 268 | veu 269 | vosaltres 270 | vostra 271 | vostre 272 | vostres 273 | érem 274 | éreu 275 | és 276 | éssent 277 | últim 278 | ús -------------------------------------------------------------------------------- /static/stopwords/da: -------------------------------------------------------------------------------- 1 | ad 2 | af 3 | aldrig 4 | alle 5 | alt 6 | anden 7 | andet 8 | andre 9 | at 10 | bare 11 | begge 12 | blev 13 | blive 14 | bliver 15 | da 16 | de 17 | dem 18 | den 19 | denne 20 | der 21 | deres 22 | det 23 | dette 24 | dig 25 | din 26 | dine 27 | disse 28 | dit 29 | dog 30 | du 31 | efter 32 | ej 33 | eller 34 | en 35 | end 36 | ene 37 | eneste 38 | enhver 39 | er 40 | et 41 | far 42 | fem 43 | fik 44 | fire 45 | flere 46 | fleste 47 | for 48 | fordi 49 | forrige 50 | fra 51 | få 52 | får 53 | før 54 | god 55 | godt 56 | ham 57 | han 58 | hans 59 | har 60 | havde 61 | have 62 | hej 63 | helt 64 | hende 65 | hendes 66 | her 67 | hos 68 | hun 69 | hvad 70 | hvem 71 | hver 72 | hvilken 73 | hvis 74 | hvor 75 | hvordan 76 | hvorfor 77 | hvornår 78 | i 79 | ikke 80 | ind 81 | ingen 82 | intet 83 | ja 84 | jeg 85 | jer 86 | jeres 87 | jo 88 | kan 89 | kom 90 | komme 91 | kommer 92 | kun 93 | kunne 94 | lad 95 | lav 96 | lidt 97 | lige 98 | lille 99 | man 100 | mand 101 | mange 102 | med 103 | meget 104 | men 105 | mens 106 | mere 107 | mig 108 | min 109 | mine 110 | mit 111 | mod 112 | må 113 | ned 114 | nej 115 | ni 116 | nogen 117 | noget 118 | nogle 119 | nu 120 | ny 121 | nyt 122 | når 123 | nær 124 | næste 125 | næsten 126 | og 127 | også 128 | okay 129 | om 130 | op 131 | os 132 | otte 133 | over 134 | på 135 | se 136 | seks 137 | selv 138 | ser 139 | ses 140 | sig 141 | sige 142 | sin 143 | sine 144 | sit 145 | skal 
146 | skulle 147 | som 148 | stor 149 | store 150 | syv 151 | så 152 | sådan 153 | tag 154 | tage 155 | thi 156 | ti 157 | til 158 | to 159 | tre 160 | ud 161 | under 162 | var 163 | ved 164 | vi 165 | vil 166 | ville 167 | vor 168 | vores 169 | være 170 | været -------------------------------------------------------------------------------- /static/stopwords/eo: -------------------------------------------------------------------------------- 1 | adiaŭ 2 | ajn 3 | al 4 | ankoraŭ 5 | antaŭ 6 | aŭ 7 | bonan 8 | bonvole 9 | bonvolu 10 | bv 11 | ci 12 | cia 13 | cian 14 | cin 15 | d-ro 16 | da 17 | de 18 | dek 19 | deka 20 | do 21 | doktor' 22 | doktoro 23 | du 24 | dua 25 | dum 26 | eble 27 | ekz 28 | ekzemple 29 | en 30 | estas 31 | estis 32 | estos 33 | estu 34 | estus 35 | eĉ 36 | f-no 37 | feliĉan 38 | for 39 | fraŭlino 40 | ha 41 | havas 42 | havis 43 | havos 44 | havu 45 | havus 46 | he 47 | ho 48 | hu 49 | ili 50 | ilia 51 | ilian 52 | ilin 53 | inter 54 | io 55 | ion 56 | iu 57 | iujn 58 | iun 59 | ja 60 | jam 61 | je 62 | jes 63 | k 64 | kaj 65 | ke 66 | kio 67 | kion 68 | kiu 69 | kiujn 70 | kiun 71 | kvankam 72 | kvar 73 | kvara 74 | kvazaŭ 75 | kvin 76 | kvina 77 | la 78 | li 79 | lia 80 | lian 81 | lin 82 | malantaŭ 83 | male 84 | malgraŭ 85 | mem 86 | mi 87 | mia 88 | mian 89 | min 90 | minus 91 | naŭ 92 | naŭa 93 | ne 94 | nek 95 | nenio 96 | nenion 97 | neniu 98 | neniun 99 | nepre 100 | ni 101 | nia 102 | nian 103 | nin 104 | nu 105 | nun 106 | nur 107 | ok 108 | oka 109 | oni 110 | onia 111 | onian 112 | onin 113 | plej 114 | pli 115 | plu 116 | plus 117 | por 118 | post 119 | preter 120 | s-no 121 | s-ro 122 | se 123 | sed 124 | sep 125 | sepa 126 | ses 127 | sesa 128 | si 129 | sia 130 | sian 131 | sin 132 | sinjor' 133 | sinjorino 134 | sinjoro 135 | sub 136 | super 137 | supren 138 | sur 139 | tamen 140 | tio 141 | tion 142 | tiu 143 | tiujn 144 | tiun 145 | tra 146 | tri 147 | tria 148 | tuj 149 | tute 150 | unu 151 | unua 152 | ve 153 | 
verŝajne 154 | vi 155 | via 156 | vian 157 | vin 158 | ĉi 159 | ĉio 160 | ĉion 161 | ĉiu 162 | ĉiujn 163 | ĉiun 164 | ĉu 165 | ĝi 166 | ĝia 167 | ĝian 168 | ĝin 169 | ĝis 170 | ĵus 171 | ŝi 172 | ŝia 173 | ŝin -------------------------------------------------------------------------------- /static/stopwords/et: -------------------------------------------------------------------------------- 1 | aga 2 | ei 3 | et 4 | ja 5 | jah 6 | kas 7 | kui 8 | kõik 9 | ma 10 | me 11 | mida 12 | midagi 13 | mind 14 | minu 15 | mis 16 | mu 17 | mul 18 | mulle 19 | nad 20 | nii 21 | oled 22 | olen 23 | oli 24 | oma 25 | on 26 | pole 27 | sa 28 | seda 29 | see 30 | selle 31 | siin 32 | siis 33 | ta 34 | te 35 | ära -------------------------------------------------------------------------------- /static/stopwords/eu: -------------------------------------------------------------------------------- 1 | al 2 | anitz 3 | arabera 4 | asko 5 | baina 6 | bat 7 | batean 8 | batek 9 | bati 10 | batzuei 11 | batzuek 12 | batzuetan 13 | batzuk 14 | bera 15 | beraiek 16 | berau 17 | berauek 18 | bere 19 | berori 20 | beroriek 21 | beste 22 | bezala 23 | da 24 | dago 25 | dira 26 | ditu 27 | du 28 | dute 29 | edo 30 | egin 31 | ere 32 | eta 33 | eurak 34 | ez 35 | gainera 36 | gu 37 | gutxi 38 | guzti 39 | haiei 40 | haiek 41 | haietan 42 | hainbeste 43 | hala 44 | han 45 | handik 46 | hango 47 | hara 48 | hari 49 | hark 50 | hartan 51 | hau 52 | hauei 53 | hauek 54 | hauetan 55 | hemen 56 | hemendik 57 | hemengo 58 | hi 59 | hona 60 | honek 61 | honela 62 | honetan 63 | honi 64 | hor 65 | hori 66 | horiei 67 | horiek 68 | horietan 69 | horko 70 | horra 71 | horrek 72 | horrela 73 | horretan 74 | horri 75 | hortik 76 | hura 77 | izan 78 | ni 79 | noiz 80 | nola 81 | non 82 | nondik 83 | nongo 84 | nor 85 | nora 86 | ze 87 | zein 88 | zen 89 | zenbait 90 | zenbat 91 | zer 92 | zergatik 93 | ziren 94 | zituen 95 | zu 96 | zuek 97 | zuen 98 | zuten 
-------------------------------------------------------------------------------- /static/stopwords/ga: -------------------------------------------------------------------------------- 1 | a 2 | ach 3 | ag 4 | agus 5 | an 6 | aon 7 | ar 8 | arna 9 | as 10 | b' 11 | ba 12 | beirt 13 | bhúr 14 | caoga 15 | ceathair 16 | ceathrar 17 | chomh 18 | chtó 19 | chuig 20 | chun 21 | cois 22 | céad 23 | cúig 24 | cúigear 25 | d' 26 | daichead 27 | dar 28 | de 29 | deich 30 | deichniúr 31 | den 32 | dhá 33 | do 34 | don 35 | dtí 36 | dá 37 | dár 38 | dó 39 | faoi 40 | faoin 41 | faoina 42 | faoinár 43 | fara 44 | fiche 45 | gach 46 | gan 47 | go 48 | gur 49 | haon 50 | hocht 51 | i 52 | iad 53 | idir 54 | in 55 | ina 56 | ins 57 | inár 58 | is 59 | le 60 | leis 61 | lena 62 | lenár 63 | m' 64 | mar 65 | mo 66 | mé 67 | na 68 | nach 69 | naoi 70 | naonúr 71 | ná 72 | ní 73 | níor 74 | nó 75 | nócha 76 | ocht 77 | ochtar 78 | os 79 | roimh 80 | sa 81 | seacht 82 | seachtar 83 | seachtó 84 | seasca 85 | seisear 86 | siad 87 | sibh 88 | sinn 89 | sna 90 | sé 91 | sí 92 | tar 93 | thar 94 | thú 95 | triúr 96 | trí 97 | trína 98 | trínár 99 | tríocha 100 | tú 101 | um 102 | ár 103 | é 104 | éis 105 | í 106 | ó 107 | ón 108 | óna 109 | ónár -------------------------------------------------------------------------------- /static/stopwords/gl: -------------------------------------------------------------------------------- 1 | a 2 | alí 3 | ao 4 | aos 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | as 12 | así 13 | aínda 14 | ben 15 | cando 16 | che 17 | co 18 | coa 19 | coas 20 | comigo 21 | con 22 | connosco 23 | contigo 24 | convosco 25 | cos 26 | cun 27 | cunha 28 | cunhas 29 | cuns 30 | da 31 | dalgunha 32 | dalgunhas 33 | dalgún 34 | dalgúns 35 | das 36 | de 37 | del 38 | dela 39 | delas 40 | deles 41 | desde 42 | deste 43 | do 44 | dos 45 | dun 46 | dunha 47 | dunhas 48 | duns 49 | e 50 | el 51 | ela 52 | elas 53 | eles 54 | en 55 | era 56 | eran 57 | esa 
58 | esas 59 | ese 60 | eses 61 | esta 62 | estaba 63 | estar 64 | este 65 | estes 66 | estiven 67 | estou 68 | está 69 | están 70 | eu 71 | facer 72 | foi 73 | foron 74 | fun 75 | había 76 | hai 77 | iso 78 | isto 79 | la 80 | las 81 | lle 82 | lles 83 | lo 84 | los 85 | mais 86 | me 87 | meu 88 | meus 89 | min 90 | miña 91 | miñas 92 | moi 93 | na 94 | nas 95 | neste 96 | nin 97 | no 98 | non 99 | nos 100 | nosa 101 | nosas 102 | noso 103 | nosos 104 | nun 105 | nunha 106 | nunhas 107 | nuns 108 | nós 109 | o 110 | os 111 | ou 112 | para 113 | pero 114 | pode 115 | pois 116 | pola 117 | polas 118 | polo 119 | polos 120 | por 121 | que 122 | se 123 | senón 124 | ser 125 | seu 126 | seus 127 | sexa 128 | sido 129 | sobre 130 | súa 131 | súas 132 | tamén 133 | tan 134 | te 135 | ten 136 | ter 137 | teu 138 | teus 139 | teñen 140 | teño 141 | ti 142 | tido 143 | tiven 144 | tiña 145 | túa 146 | túas 147 | un 148 | unha 149 | unhas 150 | uns 151 | vos 152 | vosa 153 | vosas 154 | voso 155 | vosos 156 | vós 157 | á 158 | é 159 | ó 160 | ós -------------------------------------------------------------------------------- /static/stopwords/gu: -------------------------------------------------------------------------------- 1 | અંગે 2 | અંદર 3 | અથવા 4 | અને 5 | અમને 6 | અમારું 7 | અમે 8 | અહીં 9 | આ 10 | આગળ 11 | આથી 12 | આનું 13 | આને 14 | આપણને 15 | આપણું 16 | આપણે 17 | આપી 18 | આર 19 | આવી 20 | આવે 21 | ઉપર 22 | ઉભા 23 | ઊંચે 24 | ઊભું 25 | એ 26 | એક 27 | એન 28 | એના 29 | એનાં 30 | એની 31 | એનું 32 | એને 33 | એનો 34 | એમ 35 | એવા 36 | એવાં 37 | એવી 38 | એવું 39 | એવો 40 | ઓછું 41 | કંઈક 42 | કઈ 43 | કયું 44 | કયો 45 | કરતાં 46 | કરવું 47 | કરી 48 | કરીએ 49 | કરું 50 | કરે 51 | કરેલું 52 | કર્યા 53 | કર્યાં 54 | કર્યું 55 | કર્યો 56 | કાંઈ 57 | કે 58 | કેટલું 59 | કેમ 60 | કેવી 61 | કેવું 62 | કોઈ 63 | કોઈક 64 | કોણ 65 | કોણે 66 | કોને 67 | ક્યાં 68 | ક્યારે 69 | ખૂબ 70 | ગઈ 71 | ગયા 72 | ગયાં 73 | ગયું 74 | ગયો 75 | ઘણું 76 | છ 77 | છતાં 78 | છીએ 79 | છું 80 | છે 81 | 
છેક 82 | છો 83 | જ 84 | જાય 85 | જી 86 | જે 87 | જેટલું 88 | જેને 89 | જેમ 90 | જેવી 91 | જેવું 92 | જેવો 93 | જો 94 | જોઈએ 95 | જ્યાં 96 | જ્યારે 97 | ઝાઝું 98 | તને 99 | તમને 100 | તમારું 101 | તમે 102 | તા 103 | તારાથી 104 | તારામાં 105 | તારું 106 | તું 107 | તે 108 | તેં 109 | તેઓ 110 | તેણે 111 | તેથી 112 | તેના 113 | તેની 114 | તેનું 115 | તેને 116 | તેમ 117 | તેમનું 118 | તેમને 119 | તેવી 120 | તેવું 121 | તો 122 | ત્યાં 123 | ત્યારે 124 | થઇ 125 | થઈ 126 | થઈએ 127 | થતા 128 | થતાં 129 | થતી 130 | થતું 131 | થતો 132 | થયા 133 | થયાં 134 | થયું 135 | થયેલું 136 | થયો 137 | થવું 138 | થાઉં 139 | થાઓ 140 | થાય 141 | થી 142 | થોડું 143 | દરેક 144 | ન 145 | નં 146 | નં. 147 | નથી 148 | નહિ 149 | નહી 150 | નહીં 151 | ના 152 | ની 153 | નીચે 154 | નું 155 | ને 156 | નો 157 | પછી 158 | પણ 159 | પર 160 | પરંતુ 161 | પહેલાં 162 | પાછળ 163 | પાસે 164 | પોતાનું 165 | પ્રત્યેક 166 | ફક્ત 167 | ફરી 168 | ફરીથી 169 | બંને 170 | બધા 171 | બધું 172 | બની 173 | બહાર 174 | બહુ 175 | બાદ 176 | બે 177 | મને 178 | મા 179 | માં 180 | માટે 181 | માત્ર 182 | મારું 183 | મી 184 | મૂકવું 185 | મૂકી 186 | મૂક્યા 187 | મૂક્યાં 188 | મૂક્યું 189 | મેં 190 | રહી 191 | રહે 192 | રહેવું 193 | રહ્યા 194 | રહ્યાં 195 | રહ્યો 196 | રીતે 197 | રૂ. 
198 | રૂા 199 | લેતા 200 | લેતું 201 | લેવા 202 | વગેરે 203 | વધુ 204 | શકે 205 | શા 206 | શું 207 | સરખું 208 | સામે 209 | સુધી 210 | હતા 211 | હતાં 212 | હતી 213 | હતું 214 | હવે 215 | હશે 216 | હશો 217 | હા 218 | હું 219 | હો 220 | હોઈ 221 | હોઈશ 222 | હોઈશું 223 | હોય 224 | હોવા -------------------------------------------------------------------------------- /static/stopwords/ha: -------------------------------------------------------------------------------- 1 | a 2 | amma 3 | ba 4 | ban 5 | ce 6 | cikin 7 | da 8 | don 9 | ga 10 | in 11 | ina 12 | ita 13 | ji 14 | ka 15 | ko 16 | kuma 17 | lokacin 18 | ma 19 | mai 20 | na 21 | ne 22 | ni 23 | sai 24 | shi 25 | su 26 | suka 27 | sun 28 | ta 29 | tafi 30 | take 31 | tana 32 | wani 33 | wannan 34 | wata 35 | ya 36 | yake 37 | yana 38 | yi 39 | za -------------------------------------------------------------------------------- /static/stopwords/he: -------------------------------------------------------------------------------- 1 | אבל 2 | או 3 | אולי 4 | אותה 5 | אותו 6 | אותי 7 | אותך 8 | אותם 9 | אותן 10 | אותנו 11 | אז 12 | אחר 13 | אחרות 14 | אחרי 15 | אחריכן 16 | אחרים 17 | אחרת 18 | אי 19 | איזה 20 | איך 21 | אין 22 | איפה 23 | איתה 24 | איתו 25 | איתי 26 | איתך 27 | איתכם 28 | איתכן 29 | איתם 30 | איתן 31 | איתנו 32 | אך 33 | אל 34 | אלה 35 | אלו 36 | אם 37 | אנחנו 38 | אני 39 | אס 40 | אף 41 | אצל 42 | אשר 43 | את 44 | אתה 45 | אתכם 46 | אתכן 47 | אתם 48 | אתן 49 | באיזומידה 50 | באמצע 51 | באמצעות 52 | בגלל 53 | בין 54 | בלי 55 | במידה 56 | במקוםשבו 57 | ברם 58 | בשביל 59 | בשעהש 60 | בתוך 61 | גם 62 | דרך 63 | הוא 64 | היא 65 | היה 66 | היכן 67 | היתה 68 | היתי 69 | הם 70 | הן 71 | הנה 72 | הסיבהשבגללה 73 | הרי 74 | ואילו 75 | ואת 76 | זאת 77 | זה 78 | זות 79 | יהיה 80 | יוכל 81 | יוכלו 82 | יותרמדי 83 | יכול 84 | יכולה 85 | יכולות 86 | יכולים 87 | יכל 88 | יכלה 89 | יכלו 90 | יש 91 | כאן 92 | כאשר 93 | כולם 94 | כולן 95 | כזה 96 | כי 97 | כיצד 98 | כך 99 | ככה 100 | כל 101 | כלל 102 | כמו 103 | כן 104 
| כפי 105 | כש 106 | לא 107 | לאו 108 | לאיזותכלית 109 | לאן 110 | לבין 111 | לה 112 | להיות 113 | להם 114 | להן 115 | לו 116 | לי 117 | לכם 118 | לכן 119 | למה 120 | למטה 121 | למעלה 122 | למקוםשבו 123 | למרות 124 | לנו 125 | לעבר 126 | לעיכן 127 | לפיכך 128 | לפני 129 | מאד 130 | מאחורי 131 | מאיזוסיבה 132 | מאין 133 | מאיפה 134 | מבלי 135 | מבעד 136 | מדוע 137 | מה 138 | מהיכן 139 | מול 140 | מחוץ 141 | מי 142 | מכאן 143 | מכיוון 144 | מלבד 145 | מן 146 | מנין 147 | מסוגל 148 | מעט 149 | מעטים 150 | מעל 151 | מצד 152 | מקוםבו 153 | מתחת 154 | מתי 155 | נגד 156 | נגר 157 | נו 158 | עד 159 | עז 160 | על 161 | עלי 162 | עליה 163 | עליהם 164 | עליהן 165 | עליו 166 | עליך 167 | עליכם 168 | עלינו 169 | עם 170 | עצמה 171 | עצמהם 172 | עצמהן 173 | עצמו 174 | עצמי 175 | עצמם 176 | עצמן 177 | עצמנו 178 | פה 179 | רק 180 | שוב 181 | של 182 | שלה 183 | שלהם 184 | שלהן 185 | שלו 186 | שלי 187 | שלך 188 | שלכה 189 | שלכם 190 | שלכן 191 | שלנו 192 | שם 193 | תהיה 194 | תחת -------------------------------------------------------------------------------- /static/stopwords/hi: -------------------------------------------------------------------------------- 1 | अंदर 2 | अत 3 | अदि 4 | अप 5 | अपना 6 | अपनि 7 | अपनी 8 | अपने 9 | अभि 10 | अभी 11 | आदि 12 | आप 13 | इंहिं 14 | इंहें 15 | इंहों 16 | इतयादि 17 | इत्यादि 18 | इन 19 | इनका 20 | इन्हीं 21 | इन्हें 22 | इन्हों 23 | इस 24 | इसका 25 | इसकि 26 | इसकी 27 | इसके 28 | इसमें 29 | इसि 30 | इसी 31 | इसे 32 | उंहिं 33 | उंहें 34 | उंहों 35 | उन 36 | उनका 37 | उनकि 38 | उनकी 39 | उनके 40 | उनको 41 | उन्हीं 42 | उन्हें 43 | उन्हों 44 | उस 45 | उसके 46 | उसि 47 | उसी 48 | उसे 49 | एक 50 | एवं 51 | एस 52 | एसे 53 | ऐसे 54 | ओर 55 | और 56 | कइ 57 | कई 58 | कर 59 | करता 60 | करते 61 | करना 62 | करने 63 | करें 64 | कहते 65 | कहा 66 | का 67 | काफि 68 | काफ़ी 69 | कि 70 | किंहें 71 | किंहों 72 | कितना 73 | किन्हें 74 | किन्हों 75 | किया 76 | किर 77 | किस 78 | किसि 79 | किसी 80 | किसे 81 | की 82 | कुछ 83 | कुल 84 | के 85 | को 86 | कोइ 87 | कोई 
88 | कोन 89 | कोनसा 90 | कौन 91 | कौनसा 92 | गया 93 | घर 94 | जब 95 | जहाँ 96 | जहां 97 | जा 98 | जिंहें 99 | जिंहों 100 | जितना 101 | जिधर 102 | जिन 103 | जिन्हें 104 | जिन्हों 105 | जिस 106 | जिसे 107 | जीधर 108 | जेसा 109 | जेसे 110 | जैसा 111 | जैसे 112 | जो 113 | तक 114 | तब 115 | तरह 116 | तिंहें 117 | तिंहों 118 | तिन 119 | तिन्हें 120 | तिन्हों 121 | तिस 122 | तिसे 123 | तो 124 | था 125 | थि 126 | थी 127 | थे 128 | दबारा 129 | दवारा 130 | दिया 131 | दुसरा 132 | दुसरे 133 | दूसरे 134 | दो 135 | द्वारा 136 | न 137 | नहिं 138 | नहीं 139 | ना 140 | निचे 141 | निहायत 142 | नीचे 143 | ने 144 | पर 145 | पहले 146 | पुरा 147 | पूरा 148 | पे 149 | फिर 150 | बनि 151 | बनी 152 | बहि 153 | बही 154 | बहुत 155 | बाद 156 | बाला 157 | बिलकुल 158 | भि 159 | भितर 160 | भी 161 | भीतर 162 | मगर 163 | मानो 164 | मे 165 | में 166 | यदि 167 | यह 168 | यहाँ 169 | यहां 170 | यहि 171 | यही 172 | या 173 | यिह 174 | ये 175 | रखें 176 | रवासा 177 | रहा 178 | रहे 179 | ऱ्वासा 180 | लिए 181 | लिये 182 | लेकिन 183 | व 184 | वगेरह 185 | वरग 186 | वर्ग 187 | वह 188 | वहाँ 189 | वहां 190 | वहिं 191 | वहीं 192 | वाले 193 | वुह 194 | वे 195 | वग़ैरह 196 | संग 197 | सकता 198 | सकते 199 | सबसे 200 | सभि 201 | सभी 202 | साथ 203 | साबुत 204 | साभ 205 | सारा 206 | से 207 | सो 208 | हि 209 | ही 210 | हुअ 211 | हुआ 212 | हुइ 213 | हुई 214 | हुए 215 | हे 216 | हें 217 | है 218 | हैं 219 | हो 220 | होता 221 | होति 222 | होती 223 | होते 224 | होना 225 | होने -------------------------------------------------------------------------------- /static/stopwords/hr: -------------------------------------------------------------------------------- 1 | a 2 | ako 3 | ali 4 | bi 5 | bih 6 | bila 7 | bili 8 | bilo 9 | bio 10 | bismo 11 | biste 12 | biti 13 | bumo 14 | da 15 | do 16 | duž 17 | ga 18 | hoće 19 | hoćemo 20 | hoćete 21 | hoćeš 22 | hoću 23 | i 24 | iako 25 | ih 26 | ili 27 | iz 28 | ja 29 | je 30 | jedna 31 | jedne 32 | jedno 33 | jer 34 | jesam 35 | jesi 36 | jesmo 37 | jest 38 | jeste 39 | jesu 40 | 
jim 41 | joj 42 | još 43 | ju 44 | kada 45 | kako 46 | kao 47 | koja 48 | koje 49 | koji 50 | kojima 51 | koju 52 | kroz 53 | li 54 | me 55 | mene 56 | meni 57 | mi 58 | mimo 59 | moj 60 | moja 61 | moje 62 | mu 63 | na 64 | nad 65 | nakon 66 | nam 67 | nama 68 | nas 69 | naš 70 | naša 71 | naše 72 | našeg 73 | ne 74 | nego 75 | neka 76 | neki 77 | nekog 78 | neku 79 | nema 80 | netko 81 | neće 82 | nećemo 83 | nećete 84 | nećeš 85 | neću 86 | nešto 87 | ni 88 | nije 89 | nikoga 90 | nikoje 91 | nikoju 92 | nisam 93 | nisi 94 | nismo 95 | niste 96 | nisu 97 | njega 98 | njegov 99 | njegova 100 | njegovo 101 | njemu 102 | njezin 103 | njezina 104 | njezino 105 | njih 106 | njihov 107 | njihova 108 | njihovo 109 | njim 110 | njima 111 | njoj 112 | nju 113 | no 114 | o 115 | od 116 | odmah 117 | on 118 | ona 119 | oni 120 | ono 121 | ova 122 | pa 123 | pak 124 | po 125 | pod 126 | pored 127 | prije 128 | s 129 | sa 130 | sam 131 | samo 132 | se 133 | sebe 134 | sebi 135 | si 136 | smo 137 | ste 138 | su 139 | sve 140 | svi 141 | svog 142 | svoj 143 | svoja 144 | svoje 145 | svom 146 | ta 147 | tada 148 | taj 149 | tako 150 | te 151 | tebe 152 | tebi 153 | ti 154 | to 155 | toj 156 | tome 157 | tu 158 | tvoj 159 | tvoja 160 | tvoje 161 | u 162 | uz 163 | vam 164 | vama 165 | vas 166 | vaš 167 | vaša 168 | vaše 169 | već 170 | vi 171 | vrlo 172 | za 173 | zar 174 | će 175 | ćemo 176 | ćete 177 | ćeš 178 | ću 179 | što -------------------------------------------------------------------------------- /static/stopwords/hy: -------------------------------------------------------------------------------- 1 | այդ 2 | այլ 3 | այն 4 | այս 5 | դու 6 | դուք 7 | եմ 8 | են 9 | ենք 10 | ես 11 | եք 12 | է 13 | էի 14 | էին 15 | էինք 16 | էիր 17 | էիք 18 | էր 19 | ըստ 20 | թ 21 | ի 22 | ին 23 | իսկ 24 | իր 25 | կամ 26 | համար 27 | հետ 28 | հետո 29 | մենք 30 | մեջ 31 | մի 32 | ն 33 | նա 34 | նաև 35 | նրա 36 | նրանք 37 | որ 38 | որը 39 | որոնք 40 | որպես 41 | ու 42 | ում 43 | պիտի 44 | 
վրա 45 | և -------------------------------------------------------------------------------- /static/stopwords/ja: -------------------------------------------------------------------------------- 1 | あそこ 2 | あっ 3 | あの 4 | あのかた 5 | あの人 6 | あり 7 | あります 8 | ある 9 | あれ 10 | い 11 | いう 12 | います 13 | いる 14 | う 15 | うち 16 | え 17 | お 18 | および 19 | おり 20 | おります 21 | か 22 | かつて 23 | から 24 | が 25 | き 26 | ここ 27 | こちら 28 | こと 29 | この 30 | これ 31 | これら 32 | さ 33 | さらに 34 | し 35 | しかし 36 | する 37 | ず 38 | せ 39 | せる 40 | そこ 41 | そして 42 | その 43 | その他 44 | その後 45 | それ 46 | それぞれ 47 | それで 48 | た 49 | ただし 50 | たち 51 | ため 52 | たり 53 | だ 54 | だっ 55 | だれ 56 | つ 57 | て 58 | で 59 | でき 60 | できる 61 | です 62 | では 63 | でも 64 | と 65 | という 66 | といった 67 | とき 68 | ところ 69 | として 70 | とともに 71 | とも 72 | と共に 73 | どこ 74 | どの 75 | な 76 | ない 77 | なお 78 | なかっ 79 | ながら 80 | なく 81 | なっ 82 | など 83 | なに 84 | なら 85 | なり 86 | なる 87 | なん 88 | に 89 | において 90 | における 91 | について 92 | にて 93 | によって 94 | により 95 | による 96 | に対して 97 | に対する 98 | に関する 99 | の 100 | ので 101 | のみ 102 | は 103 | ば 104 | へ 105 | ほか 106 | ほとんど 107 | ほど 108 | ます 109 | また 110 | または 111 | まで 112 | も 113 | もの 114 | ものの 115 | や 116 | よう 117 | より 118 | ら 119 | られ 120 | られる 121 | れ 122 | れる 123 | を 124 | ん 125 | 何 126 | 及び 127 | 彼 128 | 彼女 129 | 我々 130 | 特に 131 | 私 132 | 私達 133 | 貴方 134 | 貴方方 -------------------------------------------------------------------------------- /static/stopwords/ku: -------------------------------------------------------------------------------- 1 | ئێمە 2 | ئێوە 3 | ئەم 4 | ئەو 5 | ئەوان 6 | ئەوەی 7 | بۆ 8 | بێ 9 | بێجگە 10 | بە 11 | بەبێ 12 | بەدەم 13 | بەردەم 14 | بەرلە 15 | بەرەوی 16 | بەرەوە 17 | بەلای 18 | بەپێی 19 | تۆ 20 | تێ 21 | جگە 22 | دوای 23 | دوو 24 | دە 25 | دەکات 26 | دەگەڵ 27 | سەر 28 | لێ 29 | لە 30 | لەبابەت 31 | لەباتی 32 | لەبارەی 33 | لەبرێتی 34 | لەبن 35 | لەبەر 36 | لەبەینی 37 | لەدەم 38 | لەرێ 39 | لەرێگا 40 | لەرەوی 41 | لەسەر 42 | لەلایەن 43 | لەناو 44 | لەنێو 45 | لەو 46 | لەپێناوی 47 | لەژێر 48 | لەگەڵ 49 
| من 50 | ناو 51 | نێوان 52 | هەر 53 | هەروەها 54 | و 55 | وەک 56 | پاش 57 | پێ 58 | پێش 59 | چەند 60 | کرد 61 | کە 62 | ی -------------------------------------------------------------------------------- /static/stopwords/la: -------------------------------------------------------------------------------- 1 | a 2 | ab 3 | ac 4 | ad 5 | at 6 | atque 7 | aut 8 | autem 9 | cum 10 | de 11 | dum 12 | e 13 | erant 14 | erat 15 | est 16 | et 17 | etiam 18 | ex 19 | haec 20 | hic 21 | hoc 22 | in 23 | ita 24 | me 25 | nec 26 | neque 27 | non 28 | per 29 | qua 30 | quae 31 | quam 32 | qui 33 | quibus 34 | quidem 35 | quo 36 | quod 37 | re 38 | rebus 39 | rem 40 | res 41 | sed 42 | si 43 | sic 44 | sunt 45 | tamen 46 | tandem 47 | te 48 | ut 49 | vel -------------------------------------------------------------------------------- /static/stopwords/lv: -------------------------------------------------------------------------------- 1 | aiz 2 | ap 3 | apakš 4 | apakšpus 5 | ar 6 | arī 7 | augšpus 8 | bet 9 | bez 10 | bija 11 | biji 12 | biju 13 | bijām 14 | bijāt 15 | būs 16 | būsi 17 | būsiet 18 | būsim 19 | būt 20 | būšu 21 | caur 22 | diemžēl 23 | diezin 24 | droši 25 | dēļ 26 | esam 27 | esat 28 | esi 29 | esmu 30 | gan 31 | gar 32 | iekam 33 | iekams 34 | iekām 35 | iekāms 36 | iekš 37 | iekšpus 38 | ik 39 | ir 40 | it 41 | itin 42 | iz 43 | ja 44 | jau 45 | jeb 46 | jebšu 47 | jel 48 | jo 49 | jā 50 | ka 51 | kamēr 52 | kaut 53 | kolīdz 54 | kopš 55 | kā 56 | kļuva 57 | kļuvi 58 | kļuvu 59 | kļuvām 60 | kļuvāt 61 | kļūs 62 | kļūsi 63 | kļūsiet 64 | kļūsim 65 | kļūst 66 | kļūstam 67 | kļūstat 68 | kļūsti 69 | kļūstu 70 | kļūt 71 | kļūšu 72 | labad 73 | lai 74 | lejpus 75 | līdz 76 | līdzko 77 | ne 78 | nebūt 79 | nedz 80 | nekā 81 | nevis 82 | nezin 83 | no 84 | nu 85 | nē 86 | otrpus 87 | pa 88 | par 89 | pat 90 | pie 91 | pirms 92 | pret 93 | priekš 94 | pār 95 | pēc 96 | starp 97 | tad 98 | tak 99 | tapi 100 | taps 101 | tapsi 102 | tapsiet 103 | tapsim 104 | tapt 105 
| tapāt 106 | tapšu 107 | taču 108 | te 109 | tiec 110 | tiek 111 | tiekam 112 | tiekat 113 | tieku 114 | tik 115 | tika 116 | tikai 117 | tiki 118 | tikko 119 | tiklab 120 | tiklīdz 121 | tiks 122 | tiksiet 123 | tiksim 124 | tikt 125 | tiku 126 | tikvien 127 | tikām 128 | tikāt 129 | tikšu 130 | tomēr 131 | topat 132 | turpretim 133 | turpretī 134 | tā 135 | tādēļ 136 | tālab 137 | tāpēc 138 | un 139 | uz 140 | vai 141 | var 142 | varat 143 | varēja 144 | varēji 145 | varēju 146 | varējām 147 | varējāt 148 | varēs 149 | varēsi 150 | varēsiet 151 | varēsim 152 | varēt 153 | varēšu 154 | vien 155 | virs 156 | virspus 157 | vis 158 | viņpus 159 | zem 160 | ārpus 161 | šaipus -------------------------------------------------------------------------------- /static/stopwords/mr: -------------------------------------------------------------------------------- 1 | अधिक 2 | अनेक 3 | अशी 4 | असलयाचे 5 | असलेल्या 6 | असा 7 | असून 8 | असे 9 | आज 10 | आणि 11 | आता 12 | आपल्या 13 | आला 14 | आली 15 | आले 16 | आहे 17 | आहेत 18 | एक 19 | एका 20 | कमी 21 | करणयात 22 | करून 23 | का 24 | काम 25 | काय 26 | काही 27 | किवा 28 | की 29 | केला 30 | केली 31 | केले 32 | कोटी 33 | गेल्या 34 | घेऊन 35 | जात 36 | झाला 37 | झाली 38 | झाले 39 | झालेल्या 40 | टा 41 | डॉ 42 | तर 43 | तरी 44 | तसेच 45 | ता 46 | ती 47 | तीन 48 | ते 49 | तो 50 | त्या 51 | त्याचा 52 | त्याची 53 | त्याच्या 54 | त्याना 55 | त्यानी 56 | त्यामुळे 57 | त्री 58 | दिली 59 | दोन 60 | न 61 | नाही 62 | निर्ण्य 63 | पण 64 | पम 65 | परयतन 66 | पाटील 67 | म 68 | मात्र 69 | माहिती 70 | मी 71 | मुबी 72 | म्हणजे 73 | म्हणाले 74 | म्हणून 75 | या 76 | याचा 77 | याची 78 | याच्या 79 | याना 80 | यानी 81 | येणार 82 | येत 83 | येथील 84 | येथे 85 | लाख 86 | व 87 | व्यकत 88 | सर्व 89 | सागित्ले 90 | सुरू 91 | हजार 92 | हा 93 | ही 94 | हे 95 | होणार 96 | होत 97 | होता 98 | होती 99 | होते -------------------------------------------------------------------------------- /static/stopwords/no: 
-------------------------------------------------------------------------------- 1 | alle 2 | andre 3 | arbeid 4 | at 5 | av 6 | bare 7 | begge 8 | ble 9 | blei 10 | bli 11 | blir 12 | blitt 13 | bort 14 | bra 15 | bruke 16 | både 17 | båe 18 | da 19 | de 20 | deg 21 | dei 22 | deim 23 | deira 24 | deires 25 | dem 26 | den 27 | denne 28 | der 29 | dere 30 | deres 31 | det 32 | dette 33 | di 34 | din 35 | disse 36 | ditt 37 | du 38 | dykk 39 | dykkar 40 | då 41 | eg 42 | ein 43 | eit 44 | eitt 45 | eller 46 | elles 47 | en 48 | ene 49 | eneste 50 | enhver 51 | enn 52 | er 53 | et 54 | ett 55 | etter 56 | folk 57 | for 58 | fordi 59 | forsûke 60 | fra 61 | få 62 | før 63 | fûr 64 | fûrst 65 | gjorde 66 | gjûre 67 | god 68 | gå 69 | ha 70 | hadde 71 | han 72 | hans 73 | har 74 | hennar 75 | henne 76 | hennes 77 | her 78 | hjå 79 | ho 80 | hoe 81 | honom 82 | hoss 83 | hossen 84 | hun 85 | hva 86 | hvem 87 | hver 88 | hvilke 89 | hvilken 90 | hvis 91 | hvor 92 | hvordan 93 | hvorfor 94 | i 95 | ikke 96 | ikkje 97 | ingen 98 | ingi 99 | inkje 100 | inn 101 | innen 102 | inni 103 | ja 104 | jeg 105 | kan 106 | kom 107 | korleis 108 | korso 109 | kun 110 | kunne 111 | kva 112 | kvar 113 | kvarhelst 114 | kven 115 | kvi 116 | kvifor 117 | lage 118 | lang 119 | lik 120 | like 121 | makt 122 | man 123 | mange 124 | me 125 | med 126 | medan 127 | meg 128 | meget 129 | mellom 130 | men 131 | mens 132 | mer 133 | mest 134 | mi 135 | min 136 | mine 137 | mitt 138 | mot 139 | mye 140 | mykje 141 | må 142 | måte 143 | navn 144 | ned 145 | nei 146 | no 147 | noe 148 | noen 149 | noka 150 | noko 151 | nokon 152 | nokor 153 | nokre 154 | ny 155 | nå 156 | når 157 | og 158 | også 159 | om 160 | opp 161 | oss 162 | over 163 | part 164 | punkt 165 | på 166 | rett 167 | riktig 168 | samme 169 | sant 170 | seg 171 | selv 172 | si 173 | sia 174 | sidan 175 | siden 176 | sin 177 | sine 178 | sist 179 | sitt 180 | sjøl 181 | skal 182 | skulle 183 | slik 184 | slutt 185 | so 186 | som 187 | 
somme 188 | somt 189 | start 190 | stille 191 | så 192 | sånn 193 | tid 194 | til 195 | tilbake 196 | tilstand 197 | um 198 | under 199 | upp 200 | ut 201 | uten 202 | var 203 | vart 204 | varte 205 | ved 206 | verdi 207 | vere 208 | verte 209 | vi 210 | vil 211 | ville 212 | vite 213 | vore 214 | vors 215 | vort 216 | vår 217 | være 218 | vært 219 | vöre 220 | vört 221 | å -------------------------------------------------------------------------------- /static/stopwords/pl: -------------------------------------------------------------------------------- 1 | a 2 | aby 3 | ach 4 | acz 5 | aczkolwiek 6 | aj 7 | albo 8 | ale 9 | ależ 10 | ani 11 | aż 12 | bardziej 13 | bardzo 14 | bez 15 | bo 16 | bowiem 17 | by 18 | byli 19 | bym 20 | bynajmniej 21 | być 22 | był 23 | była 24 | było 25 | były 26 | będzie 27 | będą 28 | cali 29 | cała 30 | cały 31 | chce 32 | choć 33 | ci 34 | ciebie 35 | cię 36 | co 37 | cokolwiek 38 | coraz 39 | coś 40 | czasami 41 | czasem 42 | czemu 43 | czy 44 | czyli 45 | często 46 | daleko 47 | dla 48 | dlaczego 49 | dlatego 50 | do 51 | dobrze 52 | dokąd 53 | dość 54 | dr 55 | dużo 56 | dwa 57 | dwaj 58 | dwie 59 | dwoje 60 | dzisiaj 61 | dziś 62 | gdy 63 | gdyby 64 | gdyż 65 | gdzie 66 | gdziekolwiek 67 | gdzieś 68 | go 69 | godz 70 | hab 71 | i 72 | ich 73 | ii 74 | iii 75 | ile 76 | im 77 | inna 78 | inne 79 | inny 80 | innych 81 | inż 82 | iv 83 | ix 84 | iż 85 | ja 86 | jak 87 | jakaś 88 | jakby 89 | jaki 90 | jakichś 91 | jakie 92 | jakiś 93 | jakiż 94 | jakkolwiek 95 | jako 96 | jakoś 97 | je 98 | jeden 99 | jedna 100 | jednak 101 | jednakże 102 | jedno 103 | jednym 104 | jedynie 105 | jego 106 | jej 107 | jemu 108 | jest 109 | jestem 110 | jeszcze 111 | jeśli 112 | jeżeli 113 | już 114 | ją 115 | każdy 116 | kiedy 117 | kierunku 118 | kilka 119 | kilku 120 | kimś 121 | kto 122 | ktokolwiek 123 | ktoś 124 | która 125 | które 126 | którego 127 | której 128 | który 129 | których 130 | którym 131 | którzy 132 | ku 133 | lat 134 | lecz 135 
| lub 136 | ma 137 | mają 138 | mam 139 | mamy 140 | mało 141 | mgr 142 | mi 143 | miał 144 | mimo 145 | między 146 | mnie 147 | mną 148 | mogą 149 | moi 150 | moim 151 | moja 152 | moje 153 | może 154 | możliwe 155 | można 156 | mu 157 | musi 158 | my 159 | mój 160 | na 161 | nad 162 | nam 163 | nami 164 | nas 165 | nasi 166 | nasz 167 | nasza 168 | nasze 169 | naszego 170 | naszych 171 | natomiast 172 | natychmiast 173 | nawet 174 | nic 175 | nich 176 | nie 177 | niech 178 | niego 179 | niej 180 | niemu 181 | nigdy 182 | nim 183 | nimi 184 | nią 185 | niż 186 | no 187 | nowe 188 | np 189 | nr 190 | o 191 | o.o. 192 | obok 193 | od 194 | ok 195 | około 196 | on 197 | ona 198 | one 199 | oni 200 | ono 201 | oraz 202 | oto 203 | owszem 204 | pan 205 | pana 206 | pani 207 | pl 208 | po 209 | pod 210 | podczas 211 | pomimo 212 | ponad 213 | ponieważ 214 | powinien 215 | powinna 216 | powinni 217 | powinno 218 | poza 219 | prawie 220 | prof 221 | przecież 222 | przed 223 | przede 224 | przedtem 225 | przez 226 | przy 227 | raz 228 | razie 229 | roku 230 | również 231 | sam 232 | sama 233 | się 234 | skąd 235 | sobie 236 | sobą 237 | sposób 238 | swoje 239 | są 240 | ta 241 | tak 242 | taka 243 | taki 244 | takich 245 | takie 246 | także 247 | tam 248 | te 249 | tego 250 | tej 251 | tel 252 | temu 253 | ten 254 | teraz 255 | też 256 | to 257 | tobie 258 | tobą 259 | toteż 260 | totobą 261 | trzeba 262 | tu 263 | tutaj 264 | twoi 265 | twoim 266 | twoja 267 | twoje 268 | twym 269 | twój 270 | ty 271 | tych 272 | tylko 273 | tym 274 | tys 275 | tzw 276 | tę 277 | u 278 | ul 279 | vi 280 | vii 281 | viii 282 | vol 283 | w 284 | wam 285 | wami 286 | was 287 | wasi 288 | wasz 289 | wasza 290 | wasze 291 | we 292 | według 293 | wie 294 | wiele 295 | wielu 296 | więc 297 | więcej 298 | wszyscy 299 | wszystkich 300 | wszystkie 301 | wszystkim 302 | wszystko 303 | wtedy 304 | www 305 | wy 306 | właśnie 307 | wśród 308 | xi 309 | xii 310 | xiii 311 | xiv 312 | xv 313 | z 314 | za 
315 | zapewne 316 | zawsze 317 | zaś 318 | ze 319 | zeznowu 320 | znowu 321 | znów 322 | został 323 | zł 324 | żaden 325 | żadna 326 | żadne 327 | żadnych 328 | że 329 | żeby -------------------------------------------------------------------------------- /static/stopwords/so: -------------------------------------------------------------------------------- 1 | aad 2 | albaabkii 3 | atabo 4 | ay 5 | ayaa 6 | ayee 7 | ayuu 8 | dhan 9 | hadana 10 | in 11 | inuu 12 | isku 13 | jiray 14 | jirtay 15 | ka 16 | kale 17 | kasoo 18 | ku 19 | kuu 20 | lakin 21 | markii 22 | oo 23 | si 24 | soo 25 | uga 26 | ugu 27 | uu 28 | waa 29 | waxa 30 | waxuu -------------------------------------------------------------------------------- /static/stopwords/st: -------------------------------------------------------------------------------- 1 | a 2 | ba 3 | bane 4 | bona 5 | e 6 | ea 7 | eaba 8 | empa 9 | ena 10 | ha 11 | hae 12 | hape 13 | ho 14 | hore 15 | ka 16 | ke 17 | la 18 | le 19 | li 20 | me 21 | mo 22 | moo 23 | ne 24 | o 25 | oa 26 | re 27 | sa 28 | se 29 | tloha 30 | tsa 31 | tse -------------------------------------------------------------------------------- /static/stopwords/sw: -------------------------------------------------------------------------------- 1 | akasema 2 | alikuwa 3 | alisema 4 | baada 5 | basi 6 | bila 7 | cha 8 | chini 9 | hadi 10 | hapo 11 | hata 12 | hivyo 13 | hiyo 14 | huku 15 | huo 16 | ili 17 | ilikuwa 18 | juu 19 | kama 20 | karibu 21 | katika 22 | kila 23 | kima 24 | kisha 25 | kubwa 26 | kutoka 27 | kuwa 28 | kwa 29 | kwamba 30 | kwenda 31 | kwenye 32 | la 33 | lakini 34 | mara 35 | mdogo 36 | mimi 37 | mkubwa 38 | mmoja 39 | moja 40 | muda 41 | mwenye 42 | na 43 | naye 44 | ndani 45 | ng 46 | ni 47 | nini 48 | nonkungu 49 | pamoja 50 | pia 51 | sana 52 | sasa 53 | sauti 54 | tafadhali 55 | tena 56 | tu 57 | vile 58 | wa 59 | wakati 60 | wake 61 | walikuwa 62 | wao 63 | watu 64 | wengine 65 | wote 66 | ya 67 | yake 68 | yangu 69 | yao 70 | yeye 
71 | yule 72 | za 73 | zaidi 74 | zake -------------------------------------------------------------------------------- /static/stopwords/th: -------------------------------------------------------------------------------- 1 | กล่าว 2 | กว่า 3 | กัน 4 | กับ 5 | การ 6 | ก็ 7 | ก่อน 8 | ขณะ 9 | ขอ 10 | ของ 11 | ขึ้น 12 | คง 13 | ครั้ง 14 | ความ 15 | คือ 16 | จะ 17 | จัด 18 | จาก 19 | จึง 20 | ช่วง 21 | ซึ่ง 22 | ดัง 23 | ด้วย 24 | ด้าน 25 | ตั้ง 26 | ตั้งแต่ 27 | ตาม 28 | ต่อ 29 | ต่าง 30 | ต่างๆ 31 | ต้อง 32 | ถึง 33 | ถูก 34 | ถ้า 35 | ทั้ง 36 | ทั้งนี้ 37 | ทาง 38 | ทำ 39 | ทำให้ 40 | ที่ 41 | ที่สุด 42 | ทุก 43 | นอกจาก 44 | นัก 45 | นั้น 46 | นำ 47 | นี้ 48 | น่า 49 | บาง 50 | ผล 51 | ผ่าน 52 | พบ 53 | พร้อม 54 | มา 55 | มาก 56 | มี 57 | ยัง 58 | รวม 59 | ระหว่าง 60 | รับ 61 | ราย 62 | ร่วม 63 | ลง 64 | วัน 65 | ว่า 66 | สำหรับ 67 | สุด 68 | ส่ง 69 | ส่วน 70 | หนึ่ง 71 | หรือ 72 | หลัง 73 | หลังจาก 74 | หลาย 75 | หาก 76 | อยาก 77 | อยู่ 78 | อย่าง 79 | ออก 80 | อะไร 81 | อาจ 82 | อีก 83 | เขา 84 | เข้า 85 | เคย 86 | เฉพาะ 87 | เช่น 88 | เดียว 89 | เดียวกัน 90 | เนื่องจาก 91 | เปิด 92 | เปิดเผย 93 | เป็น 94 | เป็นการ 95 | เพราะ 96 | เพื่อ 97 | เมื่อ 98 | เรา 99 | เริ่ม 100 | เลย 101 | เห็น 102 | เอง 103 | แต่ 104 | แบบ 105 | แรก 106 | และ 107 | แล้ว 108 | แห่ง 109 | โดย 110 | ใน 111 | ให้ 112 | ได้ 113 | ไป 114 | ไม่ 115 | ไว้ -------------------------------------------------------------------------------- /static/stopwords/tl: -------------------------------------------------------------------------------- 1 | akin 2 | aking 3 | ako 4 | alin 5 | am 6 | amin 7 | aming 8 | ang 9 | ano 10 | anumang 11 | apat 12 | at 13 | atin 14 | ating 15 | ay 16 | bababa 17 | bago 18 | bakit 19 | bawat 20 | bilang 21 | dahil 22 | dalawa 23 | dapat 24 | din 25 | dito 26 | doon 27 | gagawin 28 | gayunman 29 | ginagawa 30 | ginawa 31 | ginawang 32 | gumawa 33 | gusto 34 | habang 35 | hanggang 36 | hindi 37 | huwag 38 | iba 39 | ibaba 40 | ibabaw 41 | ibig 42 | ikaw 43 | ilagay 44 | 
ilalim 45 | ilan 46 | inyong 47 | isa 48 | isang 49 | itaas 50 | ito 51 | iyo 52 | iyon 53 | iyong 54 | ka 55 | kahit 56 | kailangan 57 | kailanman 58 | kami 59 | kanila 60 | kanilang 61 | kanino 62 | kanya 63 | kanyang 64 | kapag 65 | kapwa 66 | karamihan 67 | katiyakan 68 | katulad 69 | kaya 70 | kaysa 71 | ko 72 | kong 73 | kulang 74 | kumuha 75 | kung 76 | laban 77 | lahat 78 | lamang 79 | likod 80 | lima 81 | maaari 82 | maaaring 83 | maging 84 | mahusay 85 | makita 86 | marami 87 | marapat 88 | masyado 89 | may 90 | mayroon 91 | mga 92 | minsan 93 | mismo 94 | mula 95 | muli 96 | na 97 | nabanggit 98 | naging 99 | nagkaroon 100 | nais 101 | nakita 102 | namin 103 | napaka 104 | narito 105 | nasaan 106 | ng 107 | ngayon 108 | ni 109 | nila 110 | nilang 111 | nito 112 | niya 113 | niyang 114 | noon 115 | o 116 | pa 117 | paano 118 | pababa 119 | paggawa 120 | pagitan 121 | pagkakaroon 122 | pagkatapos 123 | palabas 124 | pamamagitan 125 | panahon 126 | pangalawa 127 | para 128 | paraan 129 | pareho 130 | pataas 131 | pero 132 | pumunta 133 | pumupunta 134 | sa 135 | saan 136 | sabi 137 | sabihin 138 | sarili 139 | sila 140 | sino 141 | siya 142 | tatlo 143 | tayo 144 | tulad 145 | tungkol 146 | una 147 | walang -------------------------------------------------------------------------------- /static/stopwords/uk: -------------------------------------------------------------------------------- 1 | авжеж 2 | адже 3 | але 4 | б 5 | без 6 | був 7 | була 8 | були 9 | було 10 | бути 11 | більш 12 | вам 13 | вас 14 | весь 15 | вздовж 16 | ви 17 | вниз 18 | внизу 19 | вона 20 | вони 21 | воно 22 | все 23 | всередині 24 | всіх 25 | від 26 | він 27 | да 28 | давай 29 | давати 30 | де 31 | дещо 32 | для 33 | до 34 | з 35 | завжди 36 | замість 37 | й 38 | коли 39 | ледве 40 | майже 41 | ми 42 | навколо 43 | навіть 44 | нам 45 | от 46 | отже 47 | отож 48 | поза 49 | про 50 | під 51 | та 52 | так 53 | такий 54 | також 55 | те 56 | ти 57 | тобто 58 | тож 59 | тощо 60 | хоча 61 
| це 62 | цей 63 | чи 64 | чого 65 | що 66 | як 67 | який 68 | якої 69 | є 70 | із 71 | інших 72 | їх 73 | її -------------------------------------------------------------------------------- /static/stopwords/yo: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | bá 4 | bí 5 | bẹ̀rẹ̀ 6 | fún 7 | fẹ́ 8 | gbogbo 9 | inú 10 | jù 11 | jẹ 12 | jẹ́ 13 | kan 14 | kì 15 | kí 16 | kò 17 | láti 18 | lè 19 | lọ 20 | mi 21 | mo 22 | máa 23 | mọ̀ 24 | ni 25 | náà 26 | ní 27 | nígbà 28 | nítorí 29 | nǹkan 30 | o 31 | padà 32 | pé 33 | púpọ̀ 34 | pẹ̀lú 35 | rẹ̀ 36 | sì 37 | sí 38 | sínú 39 | ṣ 40 | ti 41 | tí 42 | wà 43 | wá 44 | wọn 45 | wọ́n 46 | yìí 47 | àti 48 | àwọn 49 | é 50 | í 51 | òun 52 | ó 53 | ń 54 | ńlá 55 | ṣe 56 | ṣé 57 | ṣùgbọ́n 58 | ẹmọ́ 59 | ọjọ́ 60 | ọ̀pọ̀lọpọ̀ -------------------------------------------------------------------------------- /static/stopwords/zu: -------------------------------------------------------------------------------- 1 | futhi 2 | kahle 3 | kakhulu 4 | kanye 5 | khona 6 | kodwa 7 | kungani 8 | kusho 9 | la 10 | lakhe 11 | lapho 12 | mina 13 | ngesikhathi 14 | nje 15 | phansi 16 | phezulu 17 | u 18 | ukuba 19 | ukuthi 20 | ukuze 21 | uma 22 | wahamba 23 | wakhe 24 | wami 25 | wase 26 | wathi 27 | yakhe 28 | zakhe 29 | zonke -------------------------------------------------------------------------------- /subprojects/gtest.wrap: -------------------------------------------------------------------------------- 1 | [wrap-file] 2 | directory = googletest-1.14.0 3 | source_url = https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz 4 | source_filename = gtest-1.14.0.tar.gz 5 | source_hash = 8ad598c73ad796e0d8280b082cebd82a630d73e73cd3c70057938a6501bba5d7 6 | patch_filename = gtest_1.14.0-1_patch.zip 7 | patch_url = https://wrapdb.mesonbuild.com/v2/gtest_1.14.0-1/get_patch 8 | patch_hash = 2e693c7d3f9370a7aa6dac802bada0874d3198ad4cfdf75647b818f691182b50 9 | source_fallback_url = 
https://github.com/mesonbuild/wrapdb/releases/download/gtest_1.14.0-1/gtest-1.14.0.tar.gz 10 | wrapdb_version = 1.14.0-1 11 | 12 | [provide] 13 | gtest = gtest_dep 14 | gtest_main = gtest_main_dep 15 | gmock = gmock_dep 16 | gmock_main = gmock_main_dep 17 | -------------------------------------------------------------------------------- /subprojects/liblzma.wrap: -------------------------------------------------------------------------------- 1 | [wrap-file] 2 | directory = xz-5.2.11 3 | source_url = http://tukaani.org/xz/xz-5.2.11.tar.xz 4 | source_filename = xz-5.2.11.tar.xz 5 | source_hash = 503b4a9fb405e70e1d3912e418fdffe5de27e713e58925fb67e12d20d03a77bc 6 | patch_filename = liblzma_5.2.11-2_patch.zip 7 | patch_url = https://wrapdb.mesonbuild.com/v2/liblzma_5.2.11-2/get_patch 8 | patch_hash = 1a1a008b2f3a81e7332d8d5d28df16df70488038901496fe73c734afab74c645 9 | source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/liblzma_5.2.11-2/xz-5.2.11.tar.xz 10 | wrapdb_version = 5.2.11-2 11 | 12 | [provide] 13 | liblzma = lzma_dep 14 | -------------------------------------------------------------------------------- /subprojects/zstd.wrap: -------------------------------------------------------------------------------- 1 | [wrap-file] 2 | directory = zstd-1.5.4 3 | source_url = https://github.com/facebook/zstd/releases/download/v1.5.4/zstd-1.5.4.tar.gz 4 | source_filename = zstd-1.5.4.tar.gz 5 | source_hash = 0f470992aedad543126d06efab344dc5f3e171893810455787d38347343a4424 6 | patch_filename = zstd_1.5.4-1_patch.zip 7 | patch_url = https://wrapdb.mesonbuild.com/v2/zstd_1.5.4-1/get_patch 8 | patch_hash = 34cf5d5255918631b9a90dfc472e51295715ebd92249c5c19dd374c3ca279571 9 | wrapdb_version = 1.5.4-1 10 | 11 | [provide] 12 | libzstd = libzstd_dep 13 | -------------------------------------------------------------------------------- /test/bufferstreamer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * 
Copyright (C) 2020 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "buffer.h" 21 | #include "bufferstreamer.h" 22 | #include "endian_tools.h" 23 | 24 | #include "gtest/gtest.h" 25 | 26 | namespace 27 | { 28 | 29 | using namespace zim; 30 | 31 | //////////////////////////////////////////////////////////////////////////////// 32 | // BufferStreamer 33 | //////////////////////////////////////////////////////////////////////////////// 34 | 35 | TEST(BufferStreamer, shouldJustWork) 36 | { 37 | char data[] = "abcdefghijklmnopqrstuvwxyz"; 38 | zim::toLittleEndian(uint32_t(1234), data); 39 | zim::toLittleEndian(int64_t(-987654321), data+18); 40 | 41 | auto buffer = Buffer::makeBuffer(data, zsize_t(sizeof(data))); 42 | zim::BufferStreamer bds(buffer, zsize_t(sizeof(data))); 43 | 44 | ASSERT_EQ(1234U, bds.read()); 45 | 46 | ASSERT_EQ(data + 4, bds.current()); 47 | const auto blob1 = std::string(bds.current(), 4); 48 | bds.skip(zsize_t(4)); 49 | ASSERT_EQ("efgh", blob1); 50 | 51 | ASSERT_EQ(data + 8, bds.current()); 52 | const auto blob2 = std::string(bds.current(), 10); 53 | bds.skip(zsize_t(10)); 54 | ASSERT_EQ("ijklmnopqr", blob2); 55 | 56 | ASSERT_EQ(-987654321, bds.read()); 57 | } 58 
| 59 | } // unnamed namespace 60 | -------------------------------------------------------------------------------- /test/compression.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | #include 22 | #include "gtest/gtest.h" 23 | 24 | #include 25 | 26 | #include "../src/compression.h" 27 | 28 | namespace 29 | { 30 | 31 | template 32 | class CompressionTest : public testing::Test { 33 | protected: 34 | typedef zim::Compressor CompressorT; 35 | typedef zim::Uncompressor DecompressorT; 36 | }; 37 | 38 | using CompressionAlgo = ::testing::Types< 39 | ZSTD_INFO 40 | >; 41 | 42 | TYPED_TEST_SUITE(CompressionTest, CompressionAlgo); 43 | 44 | TYPED_TEST(CompressionTest, compress) { 45 | std::string data; 46 | data.reserve(100000); 47 | for (int i=0; i<100000; i++) { 48 | data.append(1, (char)(i%256)); 49 | } 50 | data[99999] = 0; 51 | 52 | auto initialSizes = std::vector{32, 1024, 1024*1024}; 53 | auto chunkSizes = std::vector{32, 512, 1024*1024}; 54 | for (auto initialSize: initialSizes) { 55 | for (auto chunkSize: chunkSizes) { 56 | typename 
TestFixture::CompressorT compressor(initialSize); 57 | { 58 | bool first=true; 59 | unsigned long size = data.size(); 60 | size_t offset = 0; 61 | while (size) { 62 | if (first) { 63 | compressor.init(const_cast(data.c_str())); 64 | first = false; 65 | } 66 | auto adjustedChunkSize = std::min(size, chunkSize); 67 | compressor.feed(data.c_str()+offset, adjustedChunkSize); 68 | offset += adjustedChunkSize; 69 | size -= adjustedChunkSize; 70 | } 71 | } 72 | 73 | zim::zsize_t comp_size; 74 | auto comp_data = compressor.get_data(&comp_size); 75 | 76 | typename TestFixture::DecompressorT decompressor(initialSize); 77 | { 78 | bool first=true; 79 | unsigned long size = comp_size.v; 80 | size_t offset = 0; 81 | while (size) { 82 | if (first) { 83 | decompressor.init(comp_data.get()); 84 | first = false; 85 | } 86 | auto adjustedChunkSize = std::min(size, chunkSize); 87 | decompressor.feed(comp_data.get()+offset, adjustedChunkSize); 88 | offset += adjustedChunkSize; 89 | size -= adjustedChunkSize; 90 | } 91 | } 92 | 93 | zim::zsize_t decomp_size; 94 | auto decomp_data = decompressor.get_data(&decomp_size); 95 | 96 | ASSERT_EQ(decomp_size.v, data.size()); 97 | ASSERT_EQ(data, std::string(decomp_data.get(), decomp_size.v)); 98 | } 99 | } 100 | } 101 | 102 | 103 | } // namespace 104 | -------------------------------------------------------------------------------- /test/decoderstreamreader.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Veloman Yunkan 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 
8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "decoderstreamreader.h" 21 | #include "buffer_reader.h" 22 | 23 | #include "gtest/gtest.h" 24 | 25 | namespace 26 | { 27 | 28 | template 29 | std::string 30 | compress(const std::string& data) 31 | { 32 | zim::Compressor compressor(data.size()); 33 | compressor.init(const_cast(data.c_str())); 34 | compressor.feed(data.c_str(), data.size()); 35 | zim::zsize_t comp_size; 36 | const auto comp_data = compressor.get_data(&comp_size); 37 | return std::string(comp_data.get(), comp_size.v); 38 | } 39 | 40 | std::string operator*(const std::string& s, unsigned N) 41 | { 42 | std::string result; 43 | for (unsigned i=0; i 54 | class DecoderStreamReaderTest : public testing::Test { 55 | protected: 56 | typedef T CompressionInfo; 57 | }; 58 | 59 | using CompressionTypes = ::testing::Types< 60 | ZSTD_INFO 61 | >; 62 | 63 | TYPED_TEST_SUITE(DecoderStreamReaderTest, CompressionTypes); 64 | 65 | TYPED_TEST(DecoderStreamReaderTest, justCompressedData) { 66 | typedef typename TestFixture::CompressionInfo CompressionInfo; 67 | 68 | const int N = 10; 69 | const std::string s("DecoderStreamReader should work correctly"); 70 | const std::string compDataStr = compress(s*N); 71 | auto compData = zim::Buffer::makeBuffer(compDataStr.data(), zim::zsize_t(compDataStr.size())); 72 | 73 | auto compReader = std::make_shared(compData); 74 | zim::DecoderStreamReader dds(compReader); 75 | for (int i=0; iget_buffer(zim::offset_t(0), 
zim::zsize_t(s.size())))) << "i: " << i; 79 | } 80 | } 81 | 82 | TYPED_TEST(DecoderStreamReaderTest, compressedDataFollowedByGarbage) { 83 | typedef typename TestFixture::CompressionInfo CompressionInfo; 84 | 85 | const int N = 10; 86 | const std::string s("DecoderStreamReader should work correctly"); 87 | std::string compDataStr = compress(s*N); 88 | compDataStr += std::string(10, '\0'); 89 | 90 | auto compData = zim::Buffer::makeBuffer(compDataStr.data(), zim::zsize_t(compDataStr.size())); 91 | auto compReader = std::make_shared(compData); 92 | 93 | zim::DecoderStreamReader dds(compReader); 94 | for (int i=0; iget_buffer(zim::offset_t(0), zim::zsize_t(s.size())))) << "i: " << i; 98 | } 99 | } 100 | 101 | } // unnamed namespace 102 | -------------------------------------------------------------------------------- /test/header.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2009 Tommi Maekitalo 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include 21 | #ifdef _WIN32 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #endif 29 | 30 | #include 31 | #include 32 | 33 | #include "gtest/gtest.h" 34 | 35 | #include "../src/fileheader.h" 36 | #include "../src/buffer.h" 37 | #include "../src/buffer_reader.h" 38 | 39 | #include "tools.h" 40 | 41 | namespace 42 | { 43 | 44 | using zim::unittests::TempFile; 45 | using zim::unittests::write_to_buffer; 46 | 47 | TEST(HeaderTest, read_write_header) 48 | { 49 | zim::Fileheader header; 50 | header.setUuid("123456789\0abcd\nf"); 51 | header.setArticleCount(4711); 52 | header.setPathPtrPos(12345); 53 | header.setTitleIdxPos(23456); 54 | header.setClusterCount(14); 55 | header.setClusterPtrPos(45678); 56 | header.setMainPage(11); 57 | header.setLayoutPage(13); 58 | header.setMimeListPos(72); 59 | 60 | ASSERT_EQ(header.getUuid(), "123456789\0abcd\nf"); 61 | ASSERT_EQ(header.getArticleCount(), 4711U); 62 | ASSERT_EQ(header.getPathPtrPos(), 12345U); 63 | ASSERT_EQ(header.getTitleIdxPos(), 23456U); 64 | ASSERT_EQ(header.getClusterCount(), 14U); 65 | ASSERT_EQ(header.getClusterPtrPos(), 45678U); 66 | ASSERT_EQ(header.getMainPage(), 11U); 67 | ASSERT_EQ(header.getLayoutPage(), 13U); 68 | ASSERT_EQ(header.getMimeListPos(), 72U); 69 | 70 | auto buffer = write_to_buffer(header); 71 | zim::Fileheader header2; 72 | header2.read(zim::BufferReader(buffer)); 73 | 74 | ASSERT_EQ(header2.getUuid(), "123456789\0abcd\nf"); 75 | ASSERT_EQ(header2.getArticleCount(), 4711U); 76 | ASSERT_EQ(header2.getPathPtrPos(), 12345U); 77 | ASSERT_EQ(header2.getTitleIdxPos(), 23456U); 78 | ASSERT_EQ(header2.getClusterCount(), 14U); 79 | ASSERT_EQ(header2.getClusterPtrPos(), 45678U); 80 | 
ASSERT_EQ(header2.getMainPage(), 11U); 81 | ASSERT_EQ(header2.getLayoutPage(), 13U); 82 | } 83 | 84 | } // namespace 85 | -------------------------------------------------------------------------------- /test/meson.build: -------------------------------------------------------------------------------- 1 | tests = [ 2 | 'log', 3 | 'lrucache', 4 | 'concurrentcache', 5 | 'cluster', 6 | 'creator', 7 | 'dirent', 8 | 'error_in_creator', 9 | 'header', 10 | 'uuid', 11 | 'archive', 12 | 'iterator', 13 | 'reader', 14 | 'find', 15 | 'compression', 16 | 'dirent_lookup', 17 | 'istreamreader', 18 | 'decoderstreamreader', 19 | 'rawstreamreader', 20 | 'bufferstreamer', 21 | 'parseLongPath', 22 | 'random', 23 | 'tooltesting', 24 | 'tinyString', 25 | 'suggestion_iterator', 26 | 'indexing_criteria', 27 | 'counterParsing' 28 | ] 29 | 30 | if xapian_dep.found() 31 | tests += ['search', 'defaultIndexdata', 'search_iterator', 'suggestion'] 32 | endif 33 | 34 | datadir = get_option('test_data_dir') 35 | if datadir == 'none' 36 | test_cpp_args = ['-DWITH_TEST_DATA=0'] 37 | else 38 | test_cpp_args = ['-DWITH_TEST_DATA=1'] 39 | if datadir == '' 40 | # We need to download the test data. 
41 | datadir = join_paths(meson.current_build_dir(), 'data') 42 | endif 43 | run_target('download_test_data', command : [test_data_downloader, '--remove-top-dir', datadir]) 44 | endif 45 | 46 | testenv = environment() 47 | testenv.set('ZIM_TEST_DATA_DIR', datadir) 48 | 49 | if cpp.get_id() == 'gcc' and cpp.version().version_compare('>=12.0.0') and cpp.version().version_compare('<13.0.0') 50 | test_cpp_args += ['-Wno-error=restrict'] 51 | endif 52 | 53 | if gtest_dep.found() and not meson.is_cross_build() 54 | foreach test_name : tests 55 | test_exe = executable(test_name, [test_name+'.cpp', 'tools.cpp'], 56 | implicit_include_directories: false, 57 | include_directories: [include_directory, src_directory], 58 | link_with: libzim, 59 | cpp_args: test_cpp_args, 60 | dependencies: deps + [gtest_dep], 61 | build_rpath: '$ORIGIN') 62 | test(test_name, test_exe, timeout : 120, env: testenv) 63 | endforeach 64 | endif 65 | -------------------------------------------------------------------------------- /test/parseLongPath.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2020 Matthieu Gautier mgautier@kymeria.fr 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "gtest/gtest.h" 21 | #include 22 | #include 23 | 24 | namespace zim { 25 | std::tuple parseLongPath(const std::string& longPath); 26 | }; 27 | 28 | using zim::parseLongPath; 29 | 30 | namespace 31 | { 32 | TEST(ParseLongPathTest, invalid) 33 | { 34 | ASSERT_THROW(parseLongPath(""), std::runtime_error); 35 | ASSERT_THROW(parseLongPath("AB"), std::runtime_error); 36 | ASSERT_THROW(parseLongPath("AB/path"), std::runtime_error); 37 | ASSERT_THROW(parseLongPath("/"), std::runtime_error); 38 | ASSERT_THROW(parseLongPath("//"), std::runtime_error); 39 | ASSERT_THROW(parseLongPath("/AB"), std::runtime_error); 40 | ASSERT_THROW(parseLongPath("AB/"), std::runtime_error); 41 | ASSERT_THROW(parseLongPath("/AB/path"), std::runtime_error); 42 | ASSERT_THROW(parseLongPath("//A/path"), std::runtime_error); 43 | } 44 | 45 | TEST(ParseLongPathTest, valid) 46 | { 47 | ASSERT_EQ(parseLongPath("A/path"), std::make_tuple('A', "path")); 48 | ASSERT_EQ(parseLongPath("A/p"), std::make_tuple('A', "p")); 49 | ASSERT_EQ(parseLongPath("/B/path"), std::make_tuple('B', "path")); 50 | ASSERT_EQ(parseLongPath("/B/p"), std::make_tuple('B', "p")); 51 | ASSERT_EQ(parseLongPath("C//path"), std::make_tuple('C', "/path")); 52 | ASSERT_EQ(parseLongPath("/C//path"), std::make_tuple('C', "/path")); 53 | ASSERT_EQ(parseLongPath("L/path/with/separator"), std::make_tuple('L', "path/with/separator")); 54 | ASSERT_EQ(parseLongPath("L//path/with/separator"), std::make_tuple('L', "/path/with/separator")); 55 | ASSERT_EQ(parseLongPath("A"), std::make_tuple('A', "")); 56 | ASSERT_EQ(parseLongPath("/A"), std::make_tuple('A', "")); 57 | ASSERT_EQ(parseLongPath("A/"), std::make_tuple('A', "")); 58 | ASSERT_EQ(parseLongPath("/A/"), std::make_tuple('A', "")); 59 | 
} 60 | }; 61 | -------------------------------------------------------------------------------- /test/random.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2021 Matthieu Gautier mgautier@kymeria.fr 3 | * 4 | * This program is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU General Public License as 6 | * published by the Free Software Foundation; either version 2 of the 7 | * License, or (at your option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but 10 | * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied 11 | * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and 12 | * NON-INFRINGEMENT. See the GNU General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with this program; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | */ 19 | 20 | #include "gtest/gtest.h" 21 | #include 22 | 23 | namespace zim { 24 | uint32_t randomNumber(uint32_t max); 25 | }; 26 | 27 | using namespace zim; 28 | 29 | namespace 30 | { 31 | TEST(Random, smallMax) 32 | { 33 | for(auto i=0; i<1000; i++) { 34 | ASSERT_EQ(randomNumber(0), 0U); 35 | } 36 | 37 | 38 | for(auto i=0; i<1000; i++) { 39 | auto r = randomNumber(1); 40 | ASSERT_TRUE(r>=0U && r<=1U) << r; 41 | } 42 | } 43 | 44 | TEST(Random, distribution) 45 | { 46 | const uint32_t NB_NUMBERS = 1000000; 47 | const uint32_t NB_BUCKETS = 100; 48 | const uint32_t BUCKET_SIZE = NB_NUMBERS/NB_BUCKETS; 49 | const uint32_t MAX_RANDOM = 1000000; 50 | std::vector distribution(NB_BUCKETS); 51 | 52 | for (auto i=0U; i(Buffer::makeBuffer(data, zsize_t(sizeof(data)))); 43 | 44 | RawStreamReader rdr(reader); 45 | 46 | ASSERT_EQ(1234U, rdr.read()); 47 | auto subbuffer = 
rdr.sub_reader(zsize_t(4))->get_buffer(offset_t(0), zsize_t(4)); 48 | ASSERT_EQ("efgh", toString(subbuffer)); 49 | subbuffer = rdr.sub_reader(zsize_t(10))->get_buffer(offset_t(0), zsize_t(10)); 50 | ASSERT_EQ("ijklmnopqr", toString(subbuffer)); 51 | ASSERT_EQ(-987654321, rdr.read()); 52 | } 53 | 54 | } // unnamed namespace 55 | --------------------------------------------------------------------------------