├── .cirrus.yml ├── .clang-format ├── .github └── workflows │ └── wheels.yml ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── README.md ├── prepare_dataset.py ├── requirements.txt ├── run-jagger.py ├── run-multiprocess-jagger.py └── run-vaporetto.py ├── bootstrap-cpp-llvm-mingw-cross.sh ├── bootstrap-cpp-python.sh ├── cmake ├── ClangClCMakeCompileRules.cmake ├── aarch64-linux-gnu.toolchain ├── clang-cl-msvc-windows.cmake ├── clang-cl-msvc-wsl.cmake ├── llvm-mingw-cross.cmake ├── llvm-mingw-win64.cmake ├── mingw64-cross.cmake └── sanitizers │ ├── FindASan.cmake │ ├── FindMSan.cmake │ ├── FindSanitizers.cmake │ ├── FindTSan.cmake │ ├── FindUBSan.cmake │ ├── asan-wrapper │ └── sanitize-helpers.cmake ├── cpp_cli └── jagger-app.cc ├── data ├── Makefile ├── README.md ├── emoji-kaomoji.csv ├── kaomoji-list.txt └── to_mecab_feature.py ├── example ├── Makefile ├── batch_tokenize.py └── simple_tokenize.py ├── jagger.BSD ├── jagger.GPL ├── jagger.LGPL ├── jagger.png ├── jagger ├── __init__.py ├── ccedar_core.h ├── jagger.h ├── main.py └── python-binding-jagger.cc ├── pyproject.toml ├── python-binding-train-jagger.cc ├── setup.py └── train ├── CMakeLists.txt ├── README.md ├── bootstrap-linux.sh ├── bootstrap-llvm-mingw-cross.sh ├── tagging.py ├── train_jagger.cc └── vcsetup.bat /.cirrus.yml: -------------------------------------------------------------------------------- 1 | build_and_store_wheels: &BUILD_AND_STORE_WHEELS 2 | install_cibuildwheel_script: 3 | - python -m pip install cibuildwheel==2.16.2 4 | run_cibuildwheel_script: 5 | - cibuildwheel 6 | wheels_artifacts: 7 | path: "wheelhouse/*" 8 | 9 | # Upload only for tagged commit 10 | only_if: $CIRRUS_TAG != '' 11 | publish_script: 12 | - python -m pip install twine 13 | - python -m twine upload --repository-url https://upload.pypi.org/legacy/ --username __token__ wheelhouse/*.whl 14 | 15 | 16 | linux_aarch64_task: 17 | name: Build Linux aarch64 wheels. 
18 | compute_engine_instance: 19 | image_project: cirrus-images 20 | image: family/docker-builder-arm64 21 | architecture: arm64 22 | platform: linux 23 | cpu: 4 24 | memory: 4G 25 | environment: 26 | TWINE_PASSWORD: ENCRYPTED[c06553df9d8d52500784b41f5c45afce5329d3397cf8e7e2a6dd82990e3f32b4362efabc028a213eacf9fc0181acfb86] 27 | 28 | install_pre_requirements_script: 29 | - apt install -y python3-venv python-is-python3 30 | <<: *BUILD_AND_STORE_WHEELS 31 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: Google 3 | IndentWidth: 2 4 | TabWidth: 2 5 | UseTab: Never 6 | BreakBeforeBraces: Attach 7 | Standard: Cpp11 8 | -------------------------------------------------------------------------------- /.github/workflows/wheels.yml: -------------------------------------------------------------------------------- 1 | name: Build and upload to PyPI 2 | 3 | # Build on every branch push, tag push, and pull request change: 4 | on: [push, pull_request] 5 | 6 | jobs: 7 | 8 | build_wheels: 9 | name: Build wheels on ${{ matrix.os }} 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest, windows-latest, macos-latest] 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | fetch-tags: true # Optional, use if you use setuptools_scm 20 | 21 | - name: Build wheels 22 | uses: pypa/cibuildwheel@v2.16.5 23 | # to supply options, put them in 'env', like: 24 | # env: 25 | # CIBW_SOME_OPTION: value 26 | # Disable building PyPy wheels on all platforms 27 | env: 28 | CIBW_ARCHS_MACOS: "x86_64 universal2 arm64" 29 | CIBW_ARCHS_WINDOWS: "AMD64 x86" 30 | # disable aarm64 build since its too slow to build(docker + qemu) 31 | CIBW_ARCHS_LINUX: "x86_64 i686" 32 | # it looks cibuildwheel fails to add version string to wheel file for python 3.6, so skip it 33 | CIBW_SKIP: pp* 34 | 35 | - uses: 
actions/upload-artifact@v4 36 | with: 37 | name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} 38 | path: ./wheelhouse/*.whl 39 | 40 | # It looks cibuildwheels did not clean build folder(CMake), and it results to Windows arm64 build failure(trying to reuse x86 build of .obj) 41 | # So supply separated build job for Windows ARM64 build 42 | # TODO: clean build folder using CIBW_BEFORE_ALL? 43 | build_win_arm64_wheels: 44 | name: Build ARM64 wheels on Windows. 45 | runs-on: windows-latest 46 | steps: 47 | - uses: actions/checkout@v4 48 | with: 49 | fetch-depth: 0 50 | fetch-tags: true # Optional, use if you use setuptools_scm 51 | 52 | - name: Build wheels 53 | uses: pypa/cibuildwheel@v2.16.5 54 | # to supply options, put them in 'env', like: 55 | # env: 56 | # CIBW_SOME_OPTION: value 57 | # Disable building PyPy wheels on all platforms 58 | env: 59 | CIBW_ARCHS_WINDOWS: "ARM64" 60 | CIBW_SKIP: pp* 61 | 62 | - uses: actions/upload-artifact@v4 63 | with: 64 | name: cibw-wheels-windows-arm64 # fixed name: this job defines no matrix, so matrix.os carries no value here; still matches the cibw-* download pattern 65 | path: ./wheelhouse/*.whl 66 | 67 | make_sdist: 68 | name: Make SDist 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v4 72 | with: 73 | fetch-depth: 0 # Optional, use if you use setuptools_scm 74 | fetch-tags: true # Optional, use if you use setuptools_scm 75 | 76 | - name: Build SDist 77 | run: pipx run build --sdist 78 | 79 | - uses: actions/upload-artifact@v4 80 | with: 81 | name: cibw-sdist 82 | path: dist/*.tar.gz 83 | 84 | upload_all: 85 | needs: [build_wheels, build_win_arm64_wheels, make_sdist] # wait for every artifact-producing job (including the ARM64 wheels) before publishing 86 | runs-on: ubuntu-latest 87 | environment: release 88 | permissions: 89 | # IMPORTANT: this permission is mandatory for trusted publishing 90 | id-token: write 91 | # upload to PyPI on every tag starting with 'v' 92 | # NOTE: Without github.event_name & github.ref check, `upload_all` task is still triggered on 'main' branch push. 
93 | # (then get 'Branch "main" is not allowed to deploy to release due to environment protection rules.' error) 94 | # So still do event_name and github.ref check. 95 | # TODO: Make it work only using Github `environment` feature. 96 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') 97 | # alternatively, to publish when a GitHub Release is created, use the following rule: 98 | # if: github.event_name == 'push' && github.event.action == 'published' 99 | steps: 100 | - uses: actions/download-artifact@v4 101 | with: 102 | pattern: cibw-* 103 | path: dist 104 | merge-multiple: true 105 | 106 | - uses: pypa/gh-action-pypi-publish@release/v1 107 | with: 108 | # Use Trusted Publisher feature: 109 | # https://docs.pypi.org/trusted-publishers/ 110 | # so no use of PYPI_API_TOKEN 111 | #password: ${{ secrets.PYPI_API_TOKEN }} 112 | # 113 | # Avoid race condition when using multiple CIs 114 | skip-existing: true 115 | verbose: true 116 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | 3 | set(EXE_TARGET "jagger") 4 | set(PY_TARGET "jagger_ext") 5 | project(${EXE_TARGET} CXX) 6 | 7 | option(JAGGER_WITH_PYTHON "Build Python module(For developer)." On) 8 | option( 9 | JAGGER_PREFER_LOCAL_PYTHON_INSTALLATION 10 | "Prefer locally-installed Python interpreter than system or conda/brew installed Python. Please specify your Python interpreter with `Python3_EXECUTABLE` cmake option if you enable this option." 
11 | OFF) 12 | 13 | 14 | # cmake modules 15 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) 16 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/sanitizers) 17 | find_package(Sanitizers) # Address sanitizer (-DSANITIZE_ADDRESS=ON) 18 | 19 | set(CMAKE_CXX_STANDARD 11) 20 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 21 | set(CMAKE_CXX_EXTENSIONS OFF) 22 | 23 | # Build standalone .so(for developer) 24 | if (JAGGER_WITH_PYTHON) 25 | 26 | if(JAGGER_PREFER_LOCAL_PYTHON_INSTALLATION) 27 | #message(STATUS "Local Python") 28 | set(Python3_FIND_FRAMEWORK NEVER) # Do not search framework python 29 | set(Python3_FIND_STRATEGY LOCATION) 30 | set(Python3_FIND_REGISTRY NEVER) # Windows only 31 | else() 32 | set(Python3_FIND_FRAMEWORK LAST 33 | )# Prefer Brew/Conda to Apple framework python 34 | endif() 35 | 36 | find_package( 37 | Python3 38 | COMPONENTS Interpreter Development 39 | REQUIRED) 40 | 41 | find_package(pybind11 CONFIG REQUIRED) # REQUIRED: pybind11_add_module below is called unconditionally, so fail here with a clear message 42 | 43 | # pybind11 method: 44 | pybind11_add_module(${PY_TARGET} jagger/python-binding-jagger.cc) 45 | 46 | # copy the built module file to jagger/ after the build. 47 | add_custom_command( 48 | TARGET ${PY_TARGET} 49 | POST_BUILD 50 | COMMAND "${CMAKE_COMMAND}" -E copy "$<TARGET_FILE:${PY_TARGET}>" 51 | "${CMAKE_SOURCE_DIR}/jagger/$<TARGET_FILE_NAME:${PY_TARGET}>" 52 | COMMENT "copying jagger python module file to jagger/" 53 | VERBATIM) 54 | 55 | endif() 56 | 57 | 58 | add_executable(${EXE_TARGET} cpp_cli/jagger-app.cc) 59 | add_sanitizers(${EXE_TARGET}) 60 | 61 | target_include_directories(${EXE_TARGET} PRIVATE jagger) 62 | 63 | # enable mmap by default. 64 | target_compile_definitions(${EXE_TARGET} PRIVATE "JAGGER_USE_MMAP_IO") 65 | 66 | # [VisualStudio] 67 | if(WIN32) 68 | # Set ${EXE_TARGET} as a startup project for VS IDE 69 | set_property(DIRECTORY PROPERTY VS_STARTUP_PROJECT ${EXE_TARGET}) 70 | 71 | # For easier debugging in VS IDE(cmake 3.8.0 or later required) Set working 72 | # directory where CMakeLists.txt is placed. 
73 | if(CMAKE_VERSION VERSION_GREATER 3.8.0) 74 | set_target_properties( 75 | ${EXE_TARGET} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY 76 | "${CMAKE_CURRENT_SOURCE_DIR}") 77 | endif() 78 | endif() 79 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2023, Light Transport Entertainment, Inc. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include setup.py 3 | include README.md 4 | include LICENSE 5 | include jagger.png 6 | include jagger.BSD 7 | include jagger.GPL 8 | include jagger.LGPL 9 | include jagger/ccedar_core.h 10 | include jagger/jagger.h 11 | include jagger/python-binding-jagger.cc 12 | include jagger/__init__.py 13 | include jagger/main.py 14 | include jagger/_version.py 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jagger-python 2 | 3 |
4 | 5 |
6 | 7 | 8 | Python binding for Jagger(C++ implementation of Pattern-based Japanese Morphological Analyzer) : https://www.tkl.iis.u-tokyo.ac.jp/~ynaga/jagger/index.en.html 9 | 10 | ## Install 11 | 12 | ``` 13 | $ python -m pip install jagger 14 | ``` 15 | 16 | This does not install model files. 17 | 18 | You can download precompiled KWDLC model from https://github.com/lighttransport/jagger-python/releases/download/v0.1.0/model_kwdlc.tar.gz 19 | (Note that KWDLC has unclear license/TermOfUse. Use it at your own risk) 20 | 21 | ## Example 22 | 23 | ```py 24 | import jagger 25 | 26 | model_path = "model/kwdlc/patterns" 27 | 28 | tokenizer = jagger.Jagger() 29 | tokenizer.load_model(model_path) 30 | 31 | text = "吾輩は猫である。名前はまだない。" 32 | toks = tokenizer.tokenize(text) 33 | 34 | for tok in toks: 35 | print(tok.surface(), tok.feature()) 36 | print("EOS") 37 | 38 | """ 39 | 吾輩 名詞,普通名詞,*,*,吾輩,わがはい,代表表記:我が輩/わがはい カテゴリ:人 40 | は 助詞,副助詞,*,*,は,は,* 41 | 猫 名詞,普通名詞,*,*,猫,ねこ,* 42 | である 判定詞,*,判定詞,デアル列基本形,だ,である,* 43 | 。 特殊,句点,*,*,。,。,* 44 | 名前 名詞,普通名詞,*,*,名前,なまえ,* 45 | は 助詞,副助詞,*,*,は,は,* 46 | まだ 副詞,*,*,*,まだ,まだ,* 47 | ない 形容詞,*,イ形容詞アウオ段,基本形,ない,ない,* 48 | 。 特殊,句点,*,*,。,。,* 49 | EOS 50 | """ 51 | 52 | # print tags 53 | for tok in toks: 54 | # print tag(split feature() by comma) 55 | print(tok.surface()) 56 | for i in range(tok.n_tags()): 57 | print(" tag[{}] = {}".format(i, tok.tag(i))) 58 | print("EOS") 59 | ``` 60 | 61 | ## Batch processing(experimental) 62 | 63 | `tokenize_batch` tokenizes multiple lines(delimited by newline('\n', '\r', or '\r\n')) at once. 64 | Splitting lines is done in C++ side. 65 | 66 | ```py 67 | import jagger 68 | 69 | model_path = "model/kwdlc/patterns" 70 | 71 | tokenizer = jagger.Jagger() 72 | tokenizer.load_model(model_path) 73 | 74 | text = """ 75 | 吾輩は猫である。 76 | 名前はまだない。 77 | 明日の天気は晴れです。 78 | """ 79 | 80 | # optional: set C++ threads(CPU cores) to use 81 | # default: Use all CPU cores. 
82 | # tokenizer.set_threads(4) 83 | 84 | toks_list = tokenizer.tokenize_batch(text) 85 | 86 | for toks in toks_list: 87 | for tok in toks: 88 | print(tok.surface(), tok.feature()) 89 | 90 | ``` 91 | 92 | ## Train a model. 93 | 94 | Python interface for training a model is not provided yet. 95 | For now, you can build C++ trainer cli using CMake(Windows supported). 96 | See `train/` for details. 97 | 98 | ## Limitation 99 | 100 | Single line string must be less than 262,144 bytes(~= 87,000 UTF-8 Japanese chars). 101 | 102 | ## Jagger version 103 | 104 | Jagger version used in this Python binding is 105 | 106 | 2023-02-18 107 | 108 | ## For developer 109 | 110 | Edit `dev_mode=True` in to enable asan + debug build 111 | 112 | Run python script with 113 | 114 | ``` 115 | $ LD_PRELOAD=$(gcc -print-file-name=libasan.so) python FILE.py 116 | 117 | or 118 | 119 | $ LD_PRELOAD=$(clang -print-file-name=libclang_rt.asan-x86_64.so) python FILE.py 120 | ``` 121 | 122 | ### Releasing 123 | 124 | Version is created automatically using `setuptools_scm`. 125 | 126 | * tag it: `git tag vX.Y.Z` 127 | * push tag: `git push --tags` 128 | 129 | 130 | ## TODO 131 | 132 | - [ ] Provide a model file trained from Wikipedia, UniDic, etc(clearer & permissive licencing&TermOfUse). 133 | - Use GiNZA for morphological analysis. 134 | - [x] Split feature vector(CSV) considering quote char when extracting tags. 135 | - e.g. 'a,b,"c,d",e' => ["a", "b", "c,d", "e"] 136 | - [ ] Optimize C++ <-> Python interface 137 | - [ ] string_view(or read-only string literal) for tag str. 138 | - [ ] pickle support(for exchanging Python object when using multiprocessing) 139 | - https://pybind11.readthedocs.io/en/latest/advanced/classes.html#pickling-support 140 | 141 | ## License 142 | 143 | Python binding is available under 2-clause BSD licence. 144 | 145 | Jagger and `ccedar_core.h` are licensed under GPLv2/LGPLv2.1/BSD triple licenses. 
146 | 147 | ### Third party licences 148 | 149 | * stack_container.h: BSD like license. 150 | * nanocsv.h MIT license. 151 | 152 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark jagger-python 2 | 3 | ## Dataset 4 | 5 | Wiki40b 6 | 7 | ## Requirements 8 | 9 | * Python 10 | * Conda 11 | 12 | ## Install 13 | 14 | ``` 15 | $ python -m pip install -r requirements.txt 16 | ``` 17 | 18 | ## Prepare data 19 | 20 | We use huggingface datasets to download wiki40b. 21 | 22 | Run `prepare_dataset.py` 23 | 24 | 25 | ## Benchmark in Jagger 26 | 27 | Download and extract dictionary. https://github.com/lighttransport/jagger-python/releases/download/v0.1.0/model_kwdlc.tar.gz 28 | 29 | Then, 30 | 31 | ``` 32 | $ python run-jagger.py 33 | ``` 34 | 35 | ## Benchmark in Vaporetto 36 | 37 | ``` 38 | $ wget https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz 39 | $ tar xvf bccwj-suw+unidic_pos+pron.tar.xz 40 | ``` 41 | 42 | ``` 43 | $ python run-vaporetto.py 44 | ``` 45 | 46 | EoL. 47 | -------------------------------------------------------------------------------- /benchmark/prepare_dataset.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import tqdm 3 | 4 | dss = datasets.load_dataset("range3/wiki40b-ja") 5 | print(dss) 6 | 7 | 8 | f = open("output-wiki.txt", 'w') 9 | 10 | for example in tqdm.tqdm(dss['train']): 11 | texts = example['text'].split() 12 | 13 | # extract paragraph only. 
14 | in_paragraph = False 15 | 16 | txts_result = [] 17 | for text in texts: 18 | if in_paragraph: 19 | text = text.replace("_NEWLINE_", '\n') 20 | f.write(text + '\n') 21 | in_paragraph = False 22 | 23 | if text == "_START_PARAGRAPH_": 24 | in_paragraph = True 25 | -------------------------------------------------------------------------------- /benchmark/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | jagger 3 | tqdm 4 | vaporetto 5 | zstandard 6 | -------------------------------------------------------------------------------- /benchmark/run-jagger.py: -------------------------------------------------------------------------------- 1 | import jagger 2 | import tqdm 3 | import time 4 | 5 | model_path = "model/kwdlc/patterns" 6 | 7 | tokenizer = jagger.Jagger() 8 | tokenizer.load_model(model_path) 9 | #tokenizer.set_threads(16) 10 | 11 | lines = open("output-wiki.txt", 'r', encoding='utf8').readlines() 12 | 13 | s = time.time() 14 | for line in tqdm.tqdm(lines): 15 | toks = tokenizer.tokenize(line) 16 | 17 | e = time.time() 18 | print("Jagger: Total {} secs".format(e - s)) 19 | 20 | #total_secs = 0 21 | #nlines_per_batch = 1024*128 22 | #for i in tqdm.tqdm(range(0, len(lines), nlines_per_batch)): 23 | # text = '\n'.join(lines[i:i+nlines_per_batch]) 24 | # 25 | # print("run jagger for {} lines.".format(nlines_per_batch)) 26 | # s = time.time() 27 | # toks_list = tokenizer.tokenize_batch(text) 28 | # e = time.time() 29 | # print("{} secs".format(e - s)) 30 | # 31 | # total_secs += (e - s) 32 | # 33 | # # print result 34 | # #for toks in toks_list: 35 | # # for tok in toks: 36 | # # print(tok.surface(), tok.feature()) 37 | # print("Total processing time: {} secs".format(total_secs)) 38 | -------------------------------------------------------------------------------- /benchmark/run-multiprocess-jagger.py: -------------------------------------------------------------------------------- 1 | import 
jagger 2 | import concurrent.futures 3 | from multiprocessing import cpu_count 4 | import os 5 | import sys 6 | from tqdm import tqdm 7 | 8 | model_path = "model/kwdlc/patterns" 9 | 10 | tokenizer = jagger.Jagger() 11 | tokenizer.load_model(model_path) 12 | #tokenizer.set_threads(16) 13 | 14 | lines = open("output-wiki.txt", 'r', encoding='utf8').readlines() 15 | 16 | # Use half of CPU cores 17 | num_process = max(1, cpu_count() // 2) 18 | 19 | nlines_per_batch = 1000 20 | 21 | def run(lines): 22 | # TODO: Accept List[str] as input for tokenize_batch 23 | toks_list = tokenizer.tokenize_batch(''.join(lines)) 24 | 25 | # NOTE: Cannot return tokenized result at the moment. List[List[PyToken]]] fails pickle serialization 26 | # So process toks_list here and convert to pure Python object if you want to return something. 27 | return None 28 | 29 | 30 | total_ticks = max(1, len(lines) // nlines_per_batch) 31 | with tqdm(total=total_ticks) as pbar: 32 | with concurrent.futures.ProcessPoolExecutor(max_workers=num_process) as executor: 33 | futures = {executor.submit(run, lines[i:i+nlines_per_batch]): i for i in range(0, len(lines), nlines_per_batch)} 34 | 35 | results = {} 36 | for future in concurrent.futures.as_completed(futures): 37 | arg = futures[future] 38 | result = future.result() 39 | pbar.update(1) 40 | -------------------------------------------------------------------------------- /benchmark/run-vaporetto.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import vaporetto 4 | import zstandard 5 | import tqdm 6 | 7 | dctx = zstandard.ZstdDecompressor() 8 | with open('bccwj-suw+unidic_pos+pron/bccwj-suw+unidic_pos+pron.model.zst', 'rb') as fp: 9 | with dctx.stream_reader(fp) as dict_reader: 10 | tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True) 11 | 12 | lines = open("output-wiki.txt", 'r', encoding='utf8').readlines() 13 | 14 | s = time.time() 15 | for line in tqdm.tqdm(lines): 16 | 
toks = tokenizer.tokenize(line) 17 | 18 | e = time.time() 19 | 20 | print("Vaporetto: Total {} secs".format(e - s)) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /bootstrap-cpp-llvm-mingw-cross.sh: -------------------------------------------------------------------------------- 1 | # llvm-mingw cross compile 2 | # Assume Ninja is installed on your system 3 | curdir=`pwd` 4 | 5 | # Set path to llvm-mingw in env var. 6 | # https://github.com/mstorsjo/llvm-mingw 7 | export LLVM_MINGW_DIR=/mnt/data/local/llvm-mingw-20231128-ucrt-ubuntu-20.04-x86_64/ 8 | 9 | builddir=${curdir}/build-llvm-mingw 10 | 11 | rm -rf ${builddir} 12 | mkdir ${builddir} 13 | 14 | cd ${builddir} && cmake \ 15 | -DCMAKE_TOOLCHAIN_FILE=${curdir}/cmake/llvm-mingw-cross.cmake \ 16 | -G "Ninja" \ 17 | -DCMAKE_VERBOSE_MAKEFILE=1 \ 18 | .. 19 | 20 | cd ${curdir} 21 | -------------------------------------------------------------------------------- /bootstrap-cpp-python.sh: -------------------------------------------------------------------------------- 1 | curdir=`pwd` 2 | 3 | builddir=${curdir}/build_python_module 4 | 5 | rm -rf ${builddir} 6 | mkdir ${builddir} 7 | 8 | # set path to pybind11 9 | # If you install pybind11 through pip, its usually installed to /pybind11. 10 | pybind11_path=`python -c "import site; print (site.getsitepackages()[0])"` 11 | 12 | CC=clang CXX=clang++ pybind11_DIR=${pybind11_path}/pybind11 cmake -B${builddir} -S. \ 13 | -DJAGGER_WITH_PYTHON=1 \ 14 | -DCMAKE_VERBOSE_MAKEFILE=1 15 | -------------------------------------------------------------------------------- /cmake/ClangClCMakeCompileRules.cmake: -------------------------------------------------------------------------------- 1 | # macOS paths usually start with /Users/*. Unfortunately, clang-cl interprets 2 | # paths starting with /U as macro undefines, so we need to put a -- before the 3 | # input file path to force it to be treated as a path. 
CMake's compilation rules 4 | # should be tweaked accordingly, but until that's done, and to support older 5 | # CMake versions, overriding compilation rules works well enough. This file will 6 | # be included by cmake after the default compilation rules have already been set 7 | # up, so we can just modify them instead of duplicating them entirely. 8 | string(REPLACE "-c " "-c -- " CMAKE_C_COMPILE_OBJECT "${CMAKE_C_COMPILE_OBJECT}") 9 | string(REPLACE "-c " "-c -- " CMAKE_CXX_COMPILE_OBJECT "${CMAKE_CXX_COMPILE_OBJECT}") 10 | -------------------------------------------------------------------------------- /cmake/aarch64-linux-gnu.toolchain: -------------------------------------------------------------------------------- 1 | set(CMAKE_SYSTEM_NAME Linux) 2 | set(CMAKE_SYSTEM_PROCESSOR aarch64) 3 | set(CMAKE_C_COMPILER_TARGET aarch64-linux-gnu) 4 | 5 | set(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu/) 6 | 7 | # Sync with GitHub Actions config 8 | set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) 9 | set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) 10 | 11 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 12 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 13 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 14 | set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) 15 | -------------------------------------------------------------------------------- /cmake/clang-cl-msvc-windows.cmake: -------------------------------------------------------------------------------- 1 | # From llvm/cmake/platforms/WinMsvc.cmake 2 | # Modified to use clang-cl on native Windows. 3 | 4 | # Cross toolchain configuration for using clang-cl on non-Windows hosts to 5 | # target MSVC. 
6 | # 7 | # Usage: 8 | # cmake -G Ninja 9 | # -DCMAKE_TOOLCHAIN_FILE=/path/to/this/file 10 | # -DHOST_ARCH=[aarch64|arm64|armv7|arm|i686|x86|x86_64|x64] 11 | # -DLLVM_NATIVE_TOOLCHAIN=/path/to/llvm/installation 12 | # -DMSVC_BASE=/path/to/MSVC/system/libraries/and/includes 13 | # -DWINSDK_BASE=/path/to/windows-sdk 14 | # -DWINSDK_VER=windows sdk version folder name 15 | # 16 | # HOST_ARCH: 17 | # The architecture to build for. 18 | # 19 | # LLVM_NATIVE_TOOLCHAIN: 20 | # *Absolute path* to a folder containing the toolchain which will be used to 21 | # build. At a minimum, this folder should have a bin directory with a 22 | # copy of clang-cl, clang, clang++, and lld-link, as well as a lib directory 23 | # containing clang's system resource directory. 24 | # 25 | # MSVC_BASE: 26 | # *Absolute path* to the folder containing MSVC headers and system libraries. 27 | # The layout of the folder matches that which is intalled by MSVC 2017 on 28 | # Windows, and should look like this: 29 | # 30 | # ${MSVC_BASE} 31 | # include 32 | # vector 33 | # stdint.h 34 | # etc... 35 | # lib 36 | # x64 37 | # libcmt.lib 38 | # msvcrt.lib 39 | # etc... 40 | # x86 41 | # libcmt.lib 42 | # msvcrt.lib 43 | # etc... 44 | # 45 | # For versions of MSVC < 2017, or where you have a hermetic toolchain in a 46 | # custom format, you must use symlinks or restructure it to look like the above. 47 | # 48 | # WINSDK_BASE: 49 | # Together with WINSDK_VER, determines the location of Windows SDK headers 50 | # and libraries. 51 | # 52 | # WINSDK_VER: 53 | # Together with WINSDK_BASE, determines the locations of Windows SDK headers 54 | # and libraries. 55 | # 56 | # WINSDK_BASE and WINSDK_VER work together to define a folder layout that matches 57 | # that of the Windows SDK installation on a standard Windows machine. It should 58 | # match the layout described below. 
59 | # 60 | # Note that if you install Windows SDK to a windows machine and simply copy the 61 | # files, it will already be in the correct layout. 62 | # 63 | # ${WINSDK_BASE} 64 | # Include 65 | # ${WINSDK_VER} 66 | # shared 67 | # ucrt 68 | # um 69 | # windows.h 70 | # etc... 71 | # Lib 72 | # ${WINSDK_VER} 73 | # ucrt 74 | # x64 75 | # x86 76 | # ucrt.lib 77 | # etc... 78 | # um 79 | # x64 80 | # x86 81 | # kernel32.lib 82 | # etc 83 | # 84 | # IMPORTANT: In order for this to work, you will need a valid copy of the Windows 85 | # SDK and C++ STL headers and libraries on your host. Additionally, since the 86 | # Windows libraries and headers are not case-correct, this toolchain file sets 87 | # up a VFS overlay for the SDK headers and case-correcting symlinks for the 88 | # libraries when running on a case-sensitive filesystem. 89 | 90 | 91 | # When configuring CMake with a toolchain file against a top-level CMakeLists.txt, 92 | # it will actually run CMake many times, once for each small test program used to 93 | # determine what features a compiler supports. Unfortunately, none of these 94 | # invocations share a CMakeCache.txt with the top-level invocation, meaning they 95 | # won't see the value of any arguments the user passed via -D. Since these are 96 | # necessary to properly configure MSVC in both the top-level configuration as well as 97 | # all feature-test invocations, we set environment variables with the values so that 98 | # these environments get inherited by child invocations. We can switch to 99 | # CMAKE_TRY_COMPILE_PLATFORM_VARIABLES once our minimum supported CMake version 100 | # is 3.6 or greater. 
101 | function(init_user_prop prop) 102 | if(${prop}) 103 | set(ENV{_${prop}} "${${prop}}") 104 | else() 105 | set(${prop} "$ENV{_${prop}}" PARENT_SCOPE) 106 | endif() 107 | endfunction() 108 | 109 | function(generate_winsdk_vfs_overlay winsdk_include_dir output_path) 110 | set(include_dirs) 111 | file(GLOB_RECURSE entries LIST_DIRECTORIES true "${winsdk_include_dir}/*") 112 | foreach(entry ${entries}) 113 | if(IS_DIRECTORY "${entry}") 114 | list(APPEND include_dirs "${entry}") 115 | endif() 116 | endforeach() 117 | 118 | file(WRITE "${output_path}" "version: 0\n") 119 | file(APPEND "${output_path}" "case-sensitive: false\n") 120 | file(APPEND "${output_path}" "roots:\n") 121 | 122 | foreach(dir ${include_dirs}) 123 | file(GLOB headers RELATIVE "${dir}" "${dir}/*.h") 124 | if(NOT headers) 125 | continue() 126 | endif() 127 | 128 | file(APPEND "${output_path}" " - name: \"${dir}\"\n") 129 | file(APPEND "${output_path}" " type: directory\n") 130 | file(APPEND "${output_path}" " contents:\n") 131 | 132 | foreach(header ${headers}) 133 | file(APPEND "${output_path}" " - name: \"${header}\"\n") 134 | file(APPEND "${output_path}" " type: file\n") 135 | file(APPEND "${output_path}" " external-contents: \"${dir}/${header}\"\n") 136 | endforeach() 137 | endforeach() 138 | endfunction() 139 | 140 | function(generate_winsdk_lib_symlinks winsdk_um_lib_dir output_dir) 141 | execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${output_dir}") 142 | file(GLOB libraries RELATIVE "${winsdk_um_lib_dir}" "${winsdk_um_lib_dir}/*") 143 | foreach(library ${libraries}) 144 | string(TOLOWER "${library}" all_lowercase_symlink_name) 145 | if(NOT library STREQUAL all_lowercase_symlink_name) 146 | execute_process(COMMAND "${CMAKE_COMMAND}" 147 | -E create_symlink 148 | "${winsdk_um_lib_dir}/${library}" 149 | "${output_dir}/${all_lowercase_symlink_name}") 150 | endif() 151 | 152 | get_filename_component(name_we "${library}" NAME_WE) 153 | get_filename_component(ext "${library}" EXT) 
154 | string(TOLOWER "${ext}" lowercase_ext) 155 | set(lowercase_ext_symlink_name "${name_we}${lowercase_ext}") 156 | if(NOT library STREQUAL lowercase_ext_symlink_name AND 157 | NOT all_lowercase_symlink_name STREQUAL lowercase_ext_symlink_name) 158 | execute_process(COMMAND "${CMAKE_COMMAND}" 159 | -E create_symlink 160 | "${winsdk_um_lib_dir}/${library}" 161 | "${output_dir}/${lowercase_ext_symlink_name}") 162 | endif() 163 | endforeach() 164 | endfunction() 165 | 166 | set(CMAKE_SYSTEM_NAME Windows) 167 | set(CMAKE_SYSTEM_VERSION 10.0) 168 | set(CMAKE_SYSTEM_PROCESSOR AMD64) 169 | 170 | init_user_prop(HOST_ARCH) 171 | init_user_prop(LLVM_NATIVE_TOOLCHAIN) 172 | init_user_prop(MSVC_BASE) 173 | init_user_prop(WINSDK_BASE) 174 | init_user_prop(WINSDK_VER) 175 | 176 | if(NOT HOST_ARCH) 177 | set(HOST_ARCH x86_64) 178 | endif() 179 | if(HOST_ARCH STREQUAL "aarch64" OR HOST_ARCH STREQUAL "arm64") 180 | set(TRIPLE_ARCH "aarch64") 181 | set(WINSDK_ARCH "arm64") 182 | elseif(HOST_ARCH STREQUAL "armv7" OR HOST_ARCH STREQUAL "arm") 183 | set(TRIPLE_ARCH "armv7") 184 | set(WINSDK_ARCH "arm") 185 | elseif(HOST_ARCH STREQUAL "i686" OR HOST_ARCH STREQUAL "x86") 186 | set(TRIPLE_ARCH "i686") 187 | set(WINSDK_ARCH "x86") 188 | elseif(HOST_ARCH STREQUAL "x86_64" OR HOST_ARCH STREQUAL "x64") 189 | set(TRIPLE_ARCH "x86_64") 190 | set(WINSDK_ARCH "x64") 191 | else() 192 | message(SEND_ERROR "Unknown host architecture ${HOST_ARCH}. 
Must be aarch64 (or arm64), armv7 (or arm), i686 (or x86), or x86_64 (or x64).") 193 | endif() 194 | 195 | set(MSVC_INCLUDE "${MSVC_BASE}/include") 196 | set(ATLMFC_INCLUDE "${MSVC_BASE}/atlmfc/include") 197 | set(MSVC_LIB "${MSVC_BASE}/lib") 198 | set(ATLMFC_LIB "${MSVC_BASE}/atlmfc/lib") 199 | set(WINSDK_INCLUDE "${WINSDK_BASE}/Include/${WINSDK_VER}") 200 | set(WINSDK_LIB "${WINSDK_BASE}/Lib/${WINSDK_VER}") 201 | 202 | # Do some sanity checking to make sure we can find a native toolchain and 203 | # that the Windows SDK / MSVC STL directories look kosher. 204 | if(NOT EXISTS "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl.exe" OR 205 | NOT EXISTS "${LLVM_NATIVE_TOOLCHAIN}/bin/lld-link.exe") 206 | message(SEND_ERROR 207 | "LLVM_NATIVE_TOOLCHAIN folder '${LLVM_NATIVE_TOOLCHAIN}' does not " 208 | "point to a valid directory containing bin/clang-cl.exe and bin/lld-link.exe " 209 | "binaries") 210 | endif() 211 | 212 | if(NOT EXISTS "${MSVC_BASE}" OR 213 | NOT EXISTS "${MSVC_INCLUDE}" OR 214 | NOT EXISTS "${MSVC_LIB}") 215 | message(SEND_ERROR 216 | "CMake variable MSVC_BASE must point to a folder containing MSVC " 217 | "system headers and libraries") 218 | endif() 219 | 220 | if(NOT EXISTS "${WINSDK_BASE}" OR 221 | NOT EXISTS "${WINSDK_INCLUDE}" OR 222 | NOT EXISTS "${WINSDK_LIB}") 223 | message(SEND_ERROR 224 | "CMake variable WINSDK_BASE and WINSDK_VER must resolve to a valid " 225 | "Windows SDK installation") 226 | endif() 227 | 228 | if(NOT EXISTS "${WINSDK_INCLUDE}/um/Windows.h") 229 | message(SEND_ERROR "Cannot find Windows.h") 230 | endif() 231 | if(NOT EXISTS "${WINSDK_INCLUDE}/um/WINDOWS.H") 232 | set(case_sensitive_filesystem TRUE) 233 | endif() 234 | 235 | set(CMAKE_C_COMPILER "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl.exe" CACHE FILEPATH "") 236 | set(CMAKE_CXX_COMPILER "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl.exe" CACHE FILEPATH "") 237 | set(CMAKE_LINKER "${LLVM_NATIVE_TOOLCHAIN}/bin/lld-link.exe" CACHE FILEPATH "") 238 | 239 | # Even though we're cross-compiling, 
we need some native tools (e.g. llvm-tblgen), and those 240 | # native tools have to be built before we can start doing the cross-build. LLVM supports 241 | # a CROSS_TOOLCHAIN_FLAGS_NATIVE argument which consists of a list of flags to pass to CMake 242 | # when configuring the NATIVE portion of the cross-build. By default we construct this so 243 | # that it points to the tools in the same location as the native clang-cl that we're using. 244 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_ASM_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang") 245 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_C_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang") 246 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_CXX_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang++") 247 | 248 | set(CROSS_TOOLCHAIN_FLAGS_NATIVE "${_CTF_NATIVE_DEFAULT}" CACHE STRING "") 249 | 250 | set(COMPILE_FLAGS 251 | -D_CRT_SECURE_NO_WARNINGS 252 | --target=${TRIPLE_ARCH}-windows-msvc 253 | -fms-compatibility-version=19.11 254 | -imsvc "\"${ATLMFC_INCLUDE}\"" 255 | -imsvc "\"${MSVC_INCLUDE}\"" 256 | -imsvc "\"${WINSDK_INCLUDE}/ucrt\"" 257 | -imsvc "\"${WINSDK_INCLUDE}/shared\"" 258 | -imsvc "\"${WINSDK_INCLUDE}/um\"" 259 | -imsvc "\"${WINSDK_INCLUDE}/winrt\"") 260 | 261 | if(case_sensitive_filesystem) 262 | # Ensure all sub-configures use the top-level VFS overlay instead of generating their own. 263 | init_user_prop(winsdk_vfs_overlay_path) 264 | if(NOT winsdk_vfs_overlay_path) 265 | set(winsdk_vfs_overlay_path "${CMAKE_BINARY_DIR}/winsdk_vfs_overlay.yaml") 266 | generate_winsdk_vfs_overlay("${WINSDK_BASE}/Include/${WINSDK_VER}" "${winsdk_vfs_overlay_path}") 267 | init_user_prop(winsdk_vfs_overlay_path) 268 | endif() 269 | list(APPEND COMPILE_FLAGS 270 | -Xclang -ivfsoverlay -Xclang "${winsdk_vfs_overlay_path}") 271 | endif() 272 | 273 | string(REPLACE ";" " " COMPILE_FLAGS "${COMPILE_FLAGS}") 274 | 275 | # We need to preserve any flags that were passed in by the user. 
However, we 276 | # can't append to CMAKE_C_FLAGS and friends directly, because toolchain files 277 | # will be re-invoked on each reconfigure and therefore need to be idempotent. 278 | # The assignments to the _INITIAL cache variables don't use FORCE, so they'll 279 | # only be populated on the initial configure, and their values won't change 280 | # afterward. 281 | set(_CMAKE_C_FLAGS_INITIAL "${CMAKE_C_FLAGS}" CACHE STRING "") 282 | set(CMAKE_C_FLAGS "${_CMAKE_C_FLAGS_INITIAL} ${COMPILE_FLAGS}" CACHE STRING "" FORCE) 283 | 284 | set(_CMAKE_CXX_FLAGS_INITIAL "${CMAKE_CXX_FLAGS}" CACHE STRING "") 285 | set(CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS_INITIAL} ${COMPILE_FLAGS}" CACHE STRING "" FORCE) 286 | 287 | set(LINK_FLAGS 288 | # Prevent CMake from attempting to invoke mt.exe. It only recognizes the slashed form and not the dashed form. 289 | /manifest:no 290 | 291 | -libpath:"${ATLMFC_LIB}/${WINSDK_ARCH}" 292 | -libpath:"${MSVC_LIB}/${WINSDK_ARCH}" 293 | -libpath:"${WINSDK_LIB}/ucrt/${WINSDK_ARCH}" 294 | -libpath:"${WINSDK_LIB}/um/${WINSDK_ARCH}") 295 | 296 | if(case_sensitive_filesystem) 297 | # Ensure all sub-configures use the top-level symlinks dir instead of generating their own. 298 | init_user_prop(winsdk_lib_symlinks_dir) 299 | if(NOT winsdk_lib_symlinks_dir) 300 | set(winsdk_lib_symlinks_dir "${CMAKE_BINARY_DIR}/winsdk_lib_symlinks") 301 | generate_winsdk_lib_symlinks("${WINSDK_BASE}/Lib/${WINSDK_VER}/um/${WINSDK_ARCH}" "${winsdk_lib_symlinks_dir}") 302 | init_user_prop(winsdk_lib_symlinks_dir) 303 | endif() 304 | list(APPEND LINK_FLAGS 305 | -libpath:"${winsdk_lib_symlinks_dir}") 306 | endif() 307 | 308 | string(REPLACE ";" " " LINK_FLAGS "${LINK_FLAGS}") 309 | 310 | # See explanation for compiler flags above for the _INITIAL variables. 
311 | set(_CMAKE_EXE_LINKER_FLAGS_INITIAL "${CMAKE_EXE_LINKER_FLAGS}" CACHE STRING "") 312 | set(CMAKE_EXE_LINKER_FLAGS "${_CMAKE_EXE_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 313 | 314 | set(_CMAKE_MODULE_LINKER_FLAGS_INITIAL "${CMAKE_MODULE_LINKER_FLAGS}" CACHE STRING "") 315 | set(CMAKE_MODULE_LINKER_FLAGS "${_CMAKE_MODULE_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 316 | 317 | set(_CMAKE_SHARED_LINKER_FLAGS_INITIAL "${CMAKE_SHARED_LINKER_FLAGS}" CACHE STRING "") 318 | set(CMAKE_SHARED_LINKER_FLAGS "${_CMAKE_SHARED_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 319 | 320 | # CMake populates these with a bunch of unnecessary libraries, which requires 321 | # extra case-correcting symlinks and what not. Instead, let projects explicitly 322 | # control which libraries they require. 323 | set(CMAKE_C_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) 324 | set(CMAKE_CXX_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) 325 | 326 | # Allow clang-cl to work with macOS paths. 327 | set(CMAKE_USER_MAKE_RULES_OVERRIDE "${CMAKE_CURRENT_LIST_DIR}/ClangClCMakeCompileRules.cmake") 328 | -------------------------------------------------------------------------------- /cmake/clang-cl-msvc-wsl.cmake: -------------------------------------------------------------------------------- 1 | # From llvm/cmake/platforms/WinMsvc.cmake 2 | # Modified to use clang-cl on native Windows. 3 | 4 | # Cross toolchain configuration for using clang-cl on non-Windows hosts to 5 | # target MSVC. 6 | # 7 | # Usage: 8 | # cmake -G Ninja 9 | # -DCMAKE_TOOLCHAIN_FILE=/path/to/this/file 10 | # -DHOST_ARCH=[aarch64|arm64|armv7|arm|i686|x86|x86_64|x64] 11 | # -DLLVM_NATIVE_TOOLCHAIN=/path/to/llvm/installation 12 | # -DMSVC_BASE=/path/to/MSVC/system/libraries/and/includes 13 | # -DWINSDK_BASE=/path/to/windows-sdk 14 | # -DWINSDK_VER=windows sdk version folder name 15 | # 16 | # HOST_ARCH: 17 | # The architecture to build for. 
18 | # 19 | # LLVM_NATIVE_TOOLCHAIN: 20 | # *Absolute path* to a folder containing the toolchain which will be used to 21 | # build. At a minimum, this folder should have a bin directory with a 22 | # copy of clang-cl, clang, clang++, and lld-link, as well as a lib directory 23 | # containing clang's system resource directory. 24 | # 25 | # MSVC_BASE: 26 | # *Absolute path* to the folder containing MSVC headers and system libraries. 27 | # The layout of the folder matches that which is installed by MSVC 2017 on 28 | # Windows, and should look like this: 29 | # 30 | # ${MSVC_BASE} 31 | # include 32 | # vector 33 | # stdint.h 34 | # etc... 35 | # lib 36 | # x64 37 | # libcmt.lib 38 | # msvcrt.lib 39 | # etc... 40 | # x86 41 | # libcmt.lib 42 | # msvcrt.lib 43 | # etc... 44 | # 45 | # For versions of MSVC < 2017, or where you have a hermetic toolchain in a 46 | # custom format, you must use symlinks or restructure it to look like the above. 47 | # 48 | # WINSDK_BASE: 49 | # Together with WINSDK_VER, determines the location of Windows SDK headers 50 | # and libraries. 51 | # 52 | # WINSDK_VER: 53 | # Together with WINSDK_BASE, determines the locations of Windows SDK headers 54 | # and libraries. 55 | # 56 | # WINSDK_BASE and WINSDK_VER work together to define a folder layout that matches 57 | # that of the Windows SDK installation on a standard Windows machine. It should 58 | # match the layout described below. 59 | # 60 | # Note that if you install Windows SDK to a Windows machine and simply copy the 61 | # files, it will already be in the correct layout. 62 | # 63 | # ${WINSDK_BASE} 64 | # Include 65 | # ${WINSDK_VER} 66 | # shared 67 | # ucrt 68 | # um 69 | # windows.h 70 | # etc... 71 | # Lib 72 | # ${WINSDK_VER} 73 | # ucrt 74 | # x64 75 | # x86 76 | # ucrt.lib 77 | # etc... 
78 | # um 79 | # x64 80 | # x86 81 | # kernel32.lib 82 | # etc 83 | # 84 | # IMPORTANT: In order for this to work, you will need a valid copy of the Windows 85 | # SDK and C++ STL headers and libraries on your host. Additionally, since the 86 | # Windows libraries and headers are not case-correct, this toolchain file sets 87 | # up a VFS overlay for the SDK headers and case-correcting symlinks for the 88 | # libraries when running on a case-sensitive filesystem. 89 | 90 | 91 | # When configuring CMake with a toolchain file against a top-level CMakeLists.txt, 92 | # it will actually run CMake many times, once for each small test program used to 93 | # determine what features a compiler supports. Unfortunately, none of these 94 | # invocations share a CMakeCache.txt with the top-level invocation, meaning they 95 | # won't see the value of any arguments the user passed via -D. Since these are 96 | # necessary to properly configure MSVC in both the top-level configuration as well as 97 | # all feature-test invocations, we set environment variables with the values so that 98 | # these environments get inherited by child invocations. We can switch to 99 | # CMAKE_TRY_COMPILE_PLATFORM_VARIABLES once our minimum supported CMake version 100 | # is 3.6 or greater. 
101 | function(init_user_prop prop) 102 | if(${prop}) 103 | set(ENV{_${prop}} "${${prop}}") 104 | else() 105 | set(${prop} "$ENV{_${prop}}" PARENT_SCOPE) 106 | endif() 107 | endfunction() 108 | 109 | function(generate_winsdk_vfs_overlay winsdk_include_dir output_path) 110 | set(include_dirs) 111 | file(GLOB_RECURSE entries LIST_DIRECTORIES true "${winsdk_include_dir}/*") 112 | foreach(entry ${entries}) 113 | if(IS_DIRECTORY "${entry}") 114 | list(APPEND include_dirs "${entry}") 115 | endif() 116 | endforeach() 117 | 118 | file(WRITE "${output_path}" "version: 0\n") 119 | file(APPEND "${output_path}" "case-sensitive: false\n") 120 | file(APPEND "${output_path}" "roots:\n") 121 | 122 | foreach(dir ${include_dirs}) 123 | file(GLOB headers RELATIVE "${dir}" "${dir}/*.h") 124 | if(NOT headers) 125 | continue() 126 | endif() 127 | 128 | file(APPEND "${output_path}" " - name: \"${dir}\"\n") 129 | file(APPEND "${output_path}" " type: directory\n") 130 | file(APPEND "${output_path}" " contents:\n") 131 | 132 | foreach(header ${headers}) 133 | file(APPEND "${output_path}" " - name: \"${header}\"\n") 134 | file(APPEND "${output_path}" " type: file\n") 135 | file(APPEND "${output_path}" " external-contents: \"${dir}/${header}\"\n") 136 | endforeach() 137 | endforeach() 138 | endfunction() 139 | 140 | function(generate_winsdk_lib_symlinks winsdk_um_lib_dir output_dir) 141 | execute_process(COMMAND "${CMAKE_COMMAND}" -E make_directory "${output_dir}") 142 | file(GLOB libraries RELATIVE "${winsdk_um_lib_dir}" "${winsdk_um_lib_dir}/*") 143 | foreach(library ${libraries}) 144 | string(TOLOWER "${library}" all_lowercase_symlink_name) 145 | if(NOT library STREQUAL all_lowercase_symlink_name) 146 | execute_process(COMMAND "${CMAKE_COMMAND}" 147 | -E create_symlink 148 | "${winsdk_um_lib_dir}/${library}" 149 | "${output_dir}/${all_lowercase_symlink_name}") 150 | endif() 151 | 152 | get_filename_component(name_we "${library}" NAME_WE) 153 | get_filename_component(ext "${library}" EXT) 
154 | string(TOLOWER "${ext}" lowercase_ext) 155 | set(lowercase_ext_symlink_name "${name_we}${lowercase_ext}") 156 | if(NOT library STREQUAL lowercase_ext_symlink_name AND 157 | NOT all_lowercase_symlink_name STREQUAL lowercase_ext_symlink_name) 158 | execute_process(COMMAND "${CMAKE_COMMAND}" 159 | -E create_symlink 160 | "${winsdk_um_lib_dir}/${library}" 161 | "${output_dir}/${lowercase_ext_symlink_name}") 162 | endif() 163 | endforeach() 164 | endfunction() 165 | 166 | set(CMAKE_SYSTEM_NAME Windows) 167 | set(CMAKE_SYSTEM_VERSION 10.0) 168 | set(CMAKE_SYSTEM_PROCESSOR AMD64) 169 | 170 | init_user_prop(HOST_ARCH) 171 | init_user_prop(LLVM_NATIVE_TOOLCHAIN) 172 | init_user_prop(MSVC_BASE) 173 | init_user_prop(WINSDK_BASE) 174 | init_user_prop(WINSDK_VER) 175 | 176 | if(NOT HOST_ARCH) 177 | set(HOST_ARCH x86_64) 178 | endif() 179 | if(HOST_ARCH STREQUAL "aarch64" OR HOST_ARCH STREQUAL "arm64") 180 | set(TRIPLE_ARCH "aarch64") 181 | set(WINSDK_ARCH "arm64") 182 | elseif(HOST_ARCH STREQUAL "armv7" OR HOST_ARCH STREQUAL "arm") 183 | set(TRIPLE_ARCH "armv7") 184 | set(WINSDK_ARCH "arm") 185 | elseif(HOST_ARCH STREQUAL "i686" OR HOST_ARCH STREQUAL "x86") 186 | set(TRIPLE_ARCH "i686") 187 | set(WINSDK_ARCH "x86") 188 | elseif(HOST_ARCH STREQUAL "x86_64" OR HOST_ARCH STREQUAL "x64") 189 | set(TRIPLE_ARCH "x86_64") 190 | set(WINSDK_ARCH "x64") 191 | else() 192 | message(SEND_ERROR "Unknown host architecture ${HOST_ARCH}. 
Must be aarch64 (or arm64), armv7 (or arm), i686 (or x86), or x86_64 (or x64).") 193 | endif() 194 | 195 | set(MSVC_INCLUDE "${MSVC_BASE}/include") 196 | set(ATLMFC_INCLUDE "${MSVC_BASE}/atlmfc/include") 197 | set(MSVC_LIB "${MSVC_BASE}/lib") 198 | set(ATLMFC_LIB "${MSVC_BASE}/atlmfc/lib") 199 | set(WINSDK_INCLUDE "${WINSDK_BASE}/Include/${WINSDK_VER}") 200 | set(WINSDK_LIB "${WINSDK_BASE}/Lib/${WINSDK_VER}") 201 | 202 | # Do some sanity checking to make sure we can find a native toolchain and 203 | # that the Windows SDK / MSVC STL directories look kosher. 204 | if(NOT EXISTS "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl" OR 205 | NOT EXISTS "${LLVM_NATIVE_TOOLCHAIN}/bin/lld-link") 206 | message(SEND_ERROR 207 | "LLVM_NATIVE_TOOLCHAIN folder '${LLVM_NATIVE_TOOLCHAIN}' does not " 208 | "point to a valid directory containing bin/clang-cl and bin/lld-link " 209 | "binaries") 210 | endif() 211 | 212 | if(NOT EXISTS "${MSVC_BASE}" OR 213 | NOT EXISTS "${MSVC_INCLUDE}" OR 214 | NOT EXISTS "${MSVC_LIB}") 215 | message(SEND_ERROR 216 | "CMake variable MSVC_BASE must point to a folder containing MSVC " 217 | "system headers and libraries") 218 | endif() 219 | 220 | if(NOT EXISTS "${WINSDK_BASE}" OR 221 | NOT EXISTS "${WINSDK_INCLUDE}" OR 222 | NOT EXISTS "${WINSDK_LIB}") 223 | message(SEND_ERROR 224 | "CMake variable WINSDK_BASE and WINSDK_VER must resolve to a valid " 225 | "Windows SDK installation") 226 | endif() 227 | 228 | if(NOT EXISTS "${WINSDK_INCLUDE}/um/Windows.h") 229 | message(SEND_ERROR "Cannot find Windows.h") 230 | endif() 231 | if(NOT EXISTS "${WINSDK_INCLUDE}/um/WINDOWS.H") 232 | set(case_sensitive_filesystem TRUE) 233 | endif() 234 | 235 | set(CMAKE_C_COMPILER "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl" CACHE FILEPATH "") 236 | set(CMAKE_CXX_COMPILER "${LLVM_NATIVE_TOOLCHAIN}/bin/clang-cl" CACHE FILEPATH "") 237 | set(CMAKE_LINKER "${LLVM_NATIVE_TOOLCHAIN}/bin/lld-link" CACHE FILEPATH "") 238 | 239 | # Even though we're cross-compiling, we need some native tools 
(e.g. llvm-tblgen), and those 240 | # native tools have to be built before we can start doing the cross-build. LLVM supports 241 | # a CROSS_TOOLCHAIN_FLAGS_NATIVE argument which consists of a list of flags to pass to CMake 242 | # when configuring the NATIVE portion of the cross-build. By default we construct this so 243 | # that it points to the tools in the same location as the native clang-cl that we're using. 244 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_ASM_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang") 245 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_C_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang") 246 | list(APPEND _CTF_NATIVE_DEFAULT "-DCMAKE_CXX_COMPILER=${LLVM_NATIVE_TOOLCHAIN}/bin/clang++") 247 | 248 | set(CROSS_TOOLCHAIN_FLAGS_NATIVE "${_CTF_NATIVE_DEFAULT}" CACHE STRING "") 249 | 250 | set(COMPILE_FLAGS 251 | -D_CRT_SECURE_NO_WARNINGS 252 | --target=${TRIPLE_ARCH}-windows-msvc 253 | -fms-compatibility-version=19.11 254 | -imsvc "\"${ATLMFC_INCLUDE}\"" 255 | -imsvc "\"${MSVC_INCLUDE}\"" 256 | -imsvc "\"${WINSDK_INCLUDE}/ucrt\"" 257 | -imsvc "\"${WINSDK_INCLUDE}/shared\"" 258 | -imsvc "\"${WINSDK_INCLUDE}/um\"" 259 | -imsvc "\"${WINSDK_INCLUDE}/winrt\"") 260 | 261 | if(case_sensitive_filesystem) 262 | # Ensure all sub-configures use the top-level VFS overlay instead of generating their own. 263 | init_user_prop(winsdk_vfs_overlay_path) 264 | if(NOT winsdk_vfs_overlay_path) 265 | set(winsdk_vfs_overlay_path "${CMAKE_BINARY_DIR}/winsdk_vfs_overlay.yaml") 266 | generate_winsdk_vfs_overlay("${WINSDK_BASE}/Include/${WINSDK_VER}" "${winsdk_vfs_overlay_path}") 267 | init_user_prop(winsdk_vfs_overlay_path) 268 | endif() 269 | list(APPEND COMPILE_FLAGS 270 | -Xclang -ivfsoverlay -Xclang "${winsdk_vfs_overlay_path}") 271 | endif() 272 | 273 | string(REPLACE ";" " " COMPILE_FLAGS "${COMPILE_FLAGS}") 274 | 275 | # We need to preserve any flags that were passed in by the user. 
However, we 276 | # can't append to CMAKE_C_FLAGS and friends directly, because toolchain files 277 | # will be re-invoked on each reconfigure and therefore need to be idempotent. 278 | # The assignments to the _INITIAL cache variables don't use FORCE, so they'll 279 | # only be populated on the initial configure, and their values won't change 280 | # afterward. 281 | set(_CMAKE_C_FLAGS_INITIAL "${CMAKE_C_FLAGS}" CACHE STRING "") 282 | set(CMAKE_C_FLAGS "${_CMAKE_C_FLAGS_INITIAL} ${COMPILE_FLAGS}" CACHE STRING "" FORCE) 283 | 284 | set(_CMAKE_CXX_FLAGS_INITIAL "${CMAKE_CXX_FLAGS}" CACHE STRING "") 285 | set(CMAKE_CXX_FLAGS "${_CMAKE_CXX_FLAGS_INITIAL} ${COMPILE_FLAGS}" CACHE STRING "" FORCE) 286 | 287 | set(LINK_FLAGS 288 | # Prevent CMake from attempting to invoke mt.exe. It only recognizes the slashed form and not the dashed form. 289 | /manifest:no 290 | 291 | -libpath:"${ATLMFC_LIB}/${WINSDK_ARCH}" 292 | -libpath:"${MSVC_LIB}/${WINSDK_ARCH}" 293 | -libpath:"${WINSDK_LIB}/ucrt/${WINSDK_ARCH}" 294 | -libpath:"${WINSDK_LIB}/um/${WINSDK_ARCH}") 295 | 296 | if(case_sensitive_filesystem) 297 | # Ensure all sub-configures use the top-level symlinks dir instead of generating their own. 298 | init_user_prop(winsdk_lib_symlinks_dir) 299 | if(NOT winsdk_lib_symlinks_dir) 300 | set(winsdk_lib_symlinks_dir "${CMAKE_BINARY_DIR}/winsdk_lib_symlinks") 301 | generate_winsdk_lib_symlinks("${WINSDK_BASE}/Lib/${WINSDK_VER}/um/${WINSDK_ARCH}" "${winsdk_lib_symlinks_dir}") 302 | init_user_prop(winsdk_lib_symlinks_dir) 303 | endif() 304 | list(APPEND LINK_FLAGS 305 | -libpath:"${winsdk_lib_symlinks_dir}") 306 | endif() 307 | 308 | string(REPLACE ";" " " LINK_FLAGS "${LINK_FLAGS}") 309 | 310 | # See explanation for compiler flags above for the _INITIAL variables. 
311 | set(_CMAKE_EXE_LINKER_FLAGS_INITIAL "${CMAKE_EXE_LINKER_FLAGS}" CACHE STRING "") 312 | set(CMAKE_EXE_LINKER_FLAGS "${_CMAKE_EXE_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 313 | 314 | set(_CMAKE_MODULE_LINKER_FLAGS_INITIAL "${CMAKE_MODULE_LINKER_FLAGS}" CACHE STRING "") 315 | set(CMAKE_MODULE_LINKER_FLAGS "${_CMAKE_MODULE_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 316 | 317 | set(_CMAKE_SHARED_LINKER_FLAGS_INITIAL "${CMAKE_SHARED_LINKER_FLAGS}" CACHE STRING "") 318 | set(CMAKE_SHARED_LINKER_FLAGS "${_CMAKE_SHARED_LINKER_FLAGS_INITIAL} ${LINK_FLAGS}" CACHE STRING "" FORCE) 319 | 320 | # CMake populates these with a bunch of unnecessary libraries, which requires 321 | # extra case-correcting symlinks and what not. Instead, let projects explicitly 322 | # control which libraries they require. 323 | set(CMAKE_C_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) 324 | set(CMAKE_CXX_STANDARD_LIBRARIES "" CACHE STRING "" FORCE) 325 | 326 | # Allow clang-cl to work with macOS paths. 
327 | set(CMAKE_USER_MAKE_RULES_OVERRIDE "${CMAKE_CURRENT_LIST_DIR}/ClangClCMakeCompileRules.cmake") 328 | -------------------------------------------------------------------------------- /cmake/llvm-mingw-cross.cmake: -------------------------------------------------------------------------------- 1 | SET(CMAKE_SYSTEM_NAME Windows) 2 | 3 | IF (DEFINED ENV{LLVM_MINGW_DIR}) 4 | SET(LLVM_MINGW_ROOT "$ENV{LLVM_MINGW_DIR}") 5 | ELSE () 6 | SET(LLVM_MINGW_ROOT "/mnt/data/local/llvm-mingw-20200325-ubuntu-18.04") 7 | ENDIF() 8 | 9 | 10 | SET(CMAKE_C_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-clang) 11 | SET(CMAKE_CXX_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-clang++) 12 | SET(CMAKE_RC_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-windres) 13 | 14 | #SET(CMAKE_C_LINK_EXECUTABLE x86_64-w64-mingw32-gcc) 15 | #SET(CMAKE_CXX_LINK_EXECUTABLE x86_64-w64-mingw32-g++) 16 | 17 | SET(CMAKE_FIND_ROOT_PATH ${LLVM_MINGW_ROOT}/x86_64-w64-mingw32) 18 | 19 | # We may need some advanced thread APIs to compile, so enable 0x601(Win7) if required. 
20 | # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_WIN32_WINNT=0x601") 21 | 22 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 23 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 24 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 25 | -------------------------------------------------------------------------------- /cmake/llvm-mingw-win64.cmake: -------------------------------------------------------------------------------- 1 | SET(CMAKE_SYSTEM_NAME Windows) 2 | 3 | IF (DEFINED ENV{LLVM_MINGW_DIR}) 4 | SET(LLVM_MINGW_ROOT "$ENV{LLVM_MINGW_DIR}") 5 | ELSE () 6 | SET(LLVM_MINGW_ROOT "C:/ProgramData/llvm-mingw") 7 | ENDIF() 8 | 9 | SET(CMAKE_C_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-clang.exe) 10 | SET(CMAKE_CXX_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-clang++.exe) 11 | SET(CMAKE_RC_COMPILER ${LLVM_MINGW_ROOT}/bin/x86_64-w64-mingw32-windres.exe) 12 | 13 | SET(CMAKE_FIND_ROOT_PATH ${LLVM_MINGW_ROOT}/x86_64-w64-mingw32) 14 | 15 | # We may need some advanced thread APIs to compile; if so, uncomment the next line to define _WIN32_WINNT=0x601 (Win7). 16 | # SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_WIN32_WINNT=0x601") 17 | 18 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 19 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 20 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 21 | -------------------------------------------------------------------------------- /cmake/mingw64-cross.cmake: -------------------------------------------------------------------------------- 1 | SET(CMAKE_SYSTEM_NAME Windows) 2 | 3 | IF (DEFINED ENV{MINGW_GCC_DIR}) 4 | SET(MINGW_GCC_ROOT "$ENV{MINGW_GCC_DIR}") 5 | ELSE () 6 | # Assume mingw cross compiler is installed in your system 7 | SET(MINGW_GCC_ROOT "/usr") 8 | ENDIF() 9 | 10 | # win32 may fail to compile with C++11 threads. 
11 | 12 | SET(CMAKE_C_COMPILER ${MINGW_GCC_ROOT}/bin/x86_64-w64-mingw32-gcc-posix) 13 | SET(CMAKE_CXX_COMPILER ${MINGW_GCC_ROOT}/bin/x86_64-w64-mingw32-g++-posix) 14 | SET(CMAKE_RC_COMPILER ${MINGW_GCC_ROOT}/bin/x86_64-w64-mingw32-windres) 15 | 16 | SET(CMAKE_FIND_ROOT_PATH ${MINGW_GCC_ROOT}/x86_64-w64-mingw32) 17 | 18 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 19 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 20 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 21 | -------------------------------------------------------------------------------- /cmake/sanitizers/FindASan.cmake: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 4 | # 2013 Matthew Arsenault 5 | # 2015-2016 RWTH Aachen University, Federal Republic of Germany 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 
24 | 25 | option(SANITIZE_ADDRESS "Enable AddressSanitizer for sanitized targets." Off) 26 | 27 | set(FLAG_CANDIDATES 28 | # Clang 3.2+ use this version. The no-omit-frame-pointer option is optional. 29 | "-g -fsanitize=address -fno-omit-frame-pointer" 30 | "-g -fsanitize=address" 31 | 32 | # Older deprecated flag for ASan 33 | "-g -faddress-sanitizer" 34 | ) 35 | 36 | 37 | if (SANITIZE_ADDRESS AND (SANITIZE_THREAD OR SANITIZE_MEMORY)) 38 | message(FATAL_ERROR "AddressSanitizer is not compatible with " 39 | "ThreadSanitizer or MemorySanitizer.") 40 | endif () 41 | 42 | 43 | include(sanitize-helpers) 44 | 45 | if (SANITIZE_ADDRESS) 46 | sanitizer_check_compiler_flags("${FLAG_CANDIDATES}" "AddressSanitizer" 47 | "ASan") 48 | 49 | find_program(ASan_WRAPPER "asan-wrapper" PATHS ${CMAKE_MODULE_PATH}) 50 | mark_as_advanced(ASan_WRAPPER) 51 | endif () 52 | 53 | function (add_sanitize_address TARGET) 54 | if (NOT SANITIZE_ADDRESS) 55 | return() 56 | endif () 57 | 58 | sanitizer_add_flags(${TARGET} "AddressSanitizer" "ASan") 59 | endfunction () 60 | -------------------------------------------------------------------------------- /cmake/sanitizers/FindMSan.cmake: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 4 | # 2013 Matthew Arsenault 5 | # 2015-2016 RWTH Aachen University, Federal Republic of Germany 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or 
substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | option(SANITIZE_MEMORY "Enable MemorySanitizer for sanitized targets." Off) 26 | 27 | set(FLAG_CANDIDATES 28 | "-g -fsanitize=memory" 29 | ) 30 | 31 | 32 | include(sanitize-helpers) 33 | 34 | if (SANITIZE_MEMORY) # evaluated at module scope, outside any per-target function, so the warnings below describe the global option 35 | if (NOT CMAKE_SYSTEM_NAME STREQUAL "Linux") 36 | message(WARNING "MemorySanitizer disabled because " 37 | "MemorySanitizer is supported for Linux systems only.") 38 | set(SANITIZE_MEMORY Off CACHE BOOL 39 | "Enable MemorySanitizer for sanitized targets." FORCE) 40 | elseif (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) 41 | message(WARNING "MemorySanitizer disabled because " 42 | "MemorySanitizer is supported for 64bit systems only.") 43 | set(SANITIZE_MEMORY Off CACHE BOOL 44 | "Enable MemorySanitizer for sanitized targets." 
FORCE) 45 | else () 46 | sanitizer_check_compiler_flags("${FLAG_CANDIDATES}" "MemorySanitizer" 47 | "MSan") 48 | endif () 49 | endif () 50 | 51 | function (add_sanitize_memory TARGET) 52 | if (NOT SANITIZE_MEMORY) 53 | return() 54 | endif () 55 | 56 | sanitizer_add_flags(${TARGET} "MemorySanitizer" "MSan") 57 | endfunction () 58 | -------------------------------------------------------------------------------- /cmake/sanitizers/FindSanitizers.cmake: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 4 | # 2013 Matthew Arsenault 5 | # 2015-2016 RWTH Aachen University, Federal Republic of Germany 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in all 15 | # copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | # SOFTWARE. 24 | 25 | # If any of the used compiler is a GNU compiler, add a second option to static 26 | # link against the sanitizers. 
option(SANITIZE_LINK_STATIC "Try to link static against sanitizers." Off)




# Forward QUIET to the per-sanitizer find modules when this module itself was
# requested quietly.
set(FIND_QUIETLY_FLAG "")
if (DEFINED Sanitizers_FIND_QUIETLY)
    set(FIND_QUIETLY_FLAG "QUIET")
endif ()

find_package(ASan ${FIND_QUIETLY_FLAG})
find_package(TSan ${FIND_QUIETLY_FLAG})
find_package(MSan ${FIND_QUIETLY_FLAG})
find_package(UBSan ${FIND_QUIETLY_FLAG})




# sanitizer_add_blacklist_file(<file>)
#
# Resolves FILE (relative paths are taken from CMAKE_CURRENT_SOURCE_DIR) to a
# real path and probes the compilers for "-fsanitize-blacklist=<file>". The
# result is stored under the "SanBlist" prefix by
# sanitizer_check_compiler_flags().
function(sanitizer_add_blacklist_file FILE)
    if(NOT IS_ABSOLUTE ${FILE})
        set(FILE "${CMAKE_CURRENT_SOURCE_DIR}/${FILE}")
    endif()
    get_filename_component(FILE "${FILE}" REALPATH)

    sanitizer_check_compiler_flags("-fsanitize-blacklist=${FILE}"
        "SanitizerBlacklist" "SanBlist")
endfunction()

# add_sanitizers(<target>...)
#
# Applies every enabled sanitizer (address, thread, memory, undefined) to each
# listed target. A target that cannot be sanitized (interface library, or
# sources compiled by more than one compiler) is skipped with a warning; the
# remaining targets are still processed.
function(add_sanitizers ...)
    # If no sanitizer is enabled, return immediately.
    if (NOT (SANITIZE_ADDRESS OR SANITIZE_MEMORY OR SANITIZE_THREAD OR
        SANITIZE_UNDEFINED))
        return()
    endif ()

    foreach (TARGET ${ARGV})
        # Check if this target will be compiled by exactly one compiler. Other-
        # wise sanitizers can't be used and a warning should be printed once.
        get_target_property(TARGET_TYPE ${TARGET} TYPE)
        if (TARGET_TYPE STREQUAL "INTERFACE_LIBRARY")
            message(WARNING "Can't use any sanitizers for target ${TARGET}, "
                    "because it is an interface library and cannot be "
                    "compiled directly.")
            # Skip only this target. A return() here would silently drop the
            # sanitizers for every target listed after it. (continue()
            # requires CMake >= 3.2.)
            continue()
        endif ()
        sanitizer_target_compilers(${TARGET} TARGET_COMPILER)
        list(LENGTH TARGET_COMPILER NUM_COMPILERS)
        if (NUM_COMPILERS GREATER 1)
            message(WARNING "Can't use any sanitizers for target ${TARGET}, "
                    "because it will be compiled by incompatible compilers. "
                    "Target will be compiled without sanitizers.")
            # Same reasoning as above: only this target is left unsanitized.
            continue()

        # If the target is compiled by no or no known compiler, give a warning.
        elseif (NUM_COMPILERS EQUAL 0)
            message(WARNING "Sanitizers for target ${TARGET} may not be"
                    " usable, because it uses no or an unknown compiler. "
                    "This is a false warning for targets using only "
                    "object lib(s) as input.")
        endif ()

        # Add sanitizers for target.
        add_sanitize_address(${TARGET})
        add_sanitize_thread(${TARGET})
        add_sanitize_memory(${TARGET})
        add_sanitize_undefined(${TARGET})
    endforeach ()
endfunction(add_sanitizers)

--------------------------------------------------------------------------------
/cmake/sanitizers/FindTSan.cmake:
--------------------------------------------------------------------------------
# The MIT License (MIT)
#
# Copyright (c)
#   2013 Matthew Arsenault
#   2015-2016 RWTH Aachen University, Federal Republic of Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

option(SANITIZE_THREAD "Enable ThreadSanitizer for sanitized targets." Off)

set(FLAG_CANDIDATES
    "-g -fsanitize=thread"
)


# ThreadSanitizer is not compatible with MemorySanitizer.
if (SANITIZE_THREAD AND SANITIZE_MEMORY)
    message(FATAL_ERROR "ThreadSanitizer is not compatible with "
        "MemorySanitizer.")
endif ()


include(sanitize-helpers)

# Platform gate: the option is forced off (with a warning) outside Linux/macOS
# and on non-64-bit builds; otherwise the compilers are probed for the flags.
# The variable expansions are quoted so that an empty value (e.g. no language
# enabled yet, so CMAKE_SIZEOF_VOID_P is unset) evaluates to false instead of
# producing a malformed if() expression.
if (SANITIZE_THREAD)
    if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" AND
        NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
        # This runs at file scope, so no target name is available here.
        message(WARNING "ThreadSanitizer disabled because "
          "ThreadSanitizer is supported for Linux systems and macOS only.")
        set(SANITIZE_THREAD Off CACHE BOOL
            "Enable ThreadSanitizer for sanitized targets." FORCE)
    elseif (NOT "${CMAKE_SIZEOF_VOID_P}" EQUAL 8)
        message(WARNING "ThreadSanitizer disabled because "
            "ThreadSanitizer is supported for 64bit systems only.")
        set(SANITIZE_THREAD Off CACHE BOOL
            "Enable ThreadSanitizer for sanitized targets." FORCE)
    else ()
        sanitizer_check_compiler_flags("${FLAG_CANDIDATES}" "ThreadSanitizer"
            "TSan")
    endif ()
endif ()

# add_sanitize_thread(<target>)
#
# Hands TARGET to sanitizer_add_flags() with the "ThreadSanitizer" / "TSan"
# name and prefix. Does nothing when SANITIZE_THREAD is off.
function (add_sanitize_thread TARGET)
    if (NOT SANITIZE_THREAD)
        return()
    endif ()

    sanitizer_add_flags(${TARGET} "ThreadSanitizer" "TSan")
endfunction ()

--------------------------------------------------------------------------------
/cmake/sanitizers/FindUBSan.cmake:
--------------------------------------------------------------------------------
# The MIT License (MIT)
#
# Copyright (c)
#   2013 Matthew Arsenault
#   2015-2016 RWTH Aachen University, Federal Republic of Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

option(SANITIZE_UNDEFINED
    "Enable UndefinedBehaviorSanitizer for sanitized targets." Off)

set(FLAG_CANDIDATES
    "-g -fsanitize=undefined"
)


include(sanitize-helpers)

if (SANITIZE_UNDEFINED)
    sanitizer_check_compiler_flags("${FLAG_CANDIDATES}"
        "UndefinedBehaviorSanitizer" "UBSan")
endif ()

# add_sanitize_undefined(<target>)
#
# Hands TARGET to sanitizer_add_flags() with the "UndefinedBehaviorSanitizer" /
# "UBSan" name and prefix. Does nothing when SANITIZE_UNDEFINED is off.
function (add_sanitize_undefined TARGET)
    if (NOT SANITIZE_UNDEFINED)
        return()
    endif ()

    sanitizer_add_flags(${TARGET} "UndefinedBehaviorSanitizer" "UBSan")
endfunction ()

--------------------------------------------------------------------------------
/cmake/sanitizers/asan-wrapper:
--------------------------------------------------------------------------------
#!/bin/sh

# The MIT License (MIT)
#
# Copyright (c)
#   2013 Matthew Arsenault
#   2015-2016 RWTH Aachen University, Federal Republic of Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# This script is a wrapper for AddressSanitizer. In some special cases you need
# to preload AddressSanitizer to avoid error messages - e.g. if you're
# preloading another library to your application. At the moment this script will
# only do something, if we're running on a Linux platform. OSX might not be
# affected.
#
# Usage: asan-wrapper <application> [args...]


# Exit immediately, if platform is not Linux.
# "$@" is quoted so arguments containing spaces or glob characters reach the
# application unchanged.
if [ "$(uname)" != "Linux" ]
then
    exec "$@"
fi


# Get the used libasan of the application ($1). If a libasan was found, it will
# be prepended to LD_PRELOAD.
libasan=$(ldd "$1" | grep libasan | sed "s/^[[:space:]]//" | cut -d' ' -f1)
if [ -n "$libasan" ]
then
    if [ -n "$LD_PRELOAD" ]
    then
        export LD_PRELOAD="$libasan:$LD_PRELOAD"
    else
        export LD_PRELOAD="$libasan"
    fi
fi

# Execute the application.
exec "$@"

--------------------------------------------------------------------------------
/cmake/sanitizers/sanitize-helpers.cmake:
--------------------------------------------------------------------------------
# The MIT License (MIT)
#
# Copyright (c)
#   2013 Matthew Arsenault
#   2015-2016 RWTH Aachen University, Federal Republic of Germany
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Helper function to get the language of a source file.
#
#   FILE        path of the source file to classify
#   RETURN_VAR  name of the caller's variable that receives the result: the
#               first enabled language whose CMAKE_<LANG>_SOURCE_FILE_EXTENSIONS
#               list contains the file's lower-cased last extension, or "" when
#               the file has no extension or no enabled language claims it
function (sanitizer_lang_of_source FILE RETURN_VAR)
    get_filename_component(LONGEST_EXT "${FILE}" EXT)
    # If extension is empty return. This can happen for extensionless headers
    if("${LONGEST_EXT}" STREQUAL "")
       set(${RETURN_VAR} "" PARENT_SCOPE)
       return()
    endif()
    # Get shortest extension as some files can have dot in their names
    string(REGEX REPLACE "^.*(\\.[^.]+)$" "\\1" FILE_EXT ${LONGEST_EXT})
    string(TOLOWER "${FILE_EXT}" FILE_EXT)
    # Drop the leading dot before looking the extension up.
    string(SUBSTRING "${FILE_EXT}" 1 -1 FILE_EXT)

    get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
    foreach (LANG ${ENABLED_LANGUAGES})
        list(FIND CMAKE_${LANG}_SOURCE_FILE_EXTENSIONS "${FILE_EXT}" TEMP)
        # list(FIND) yields -1 when the extension is not in the list.
        if (NOT ${TEMP} EQUAL -1)
            set(${RETURN_VAR} "${LANG}" PARENT_SCOPE)
            return()
        endif ()
    endforeach()

    set(${RETURN_VAR} "" PARENT_SCOPE)
endfunction ()


# Helper function to get compilers used by a target.
#
#   TARGET      target whose SOURCES property is inspected
#   RETURN_VAR  name of the caller's variable that receives the de-duplicated
#               list of CMAKE_<LANG>_COMPILER_ID values for the target's sources
function (sanitizer_target_compilers TARGET RETURN_VAR)
    # Check if all sources for target use the same compiler. If a target uses
    # e.g. C and Fortran mixed and uses different compilers (e.g. clang and
    # gfortran) this can trigger huge problems, because different compilers may
    # use different implementations for sanitizers.
    set(BUFFER "")
    get_target_property(TSOURCES ${TARGET} SOURCES)
    foreach (FILE ${TSOURCES})
        # If expression was found, FILE is a generator-expression for an object
        # library. Object libraries will be ignored.
        string(REGEX MATCH "TARGET_OBJECTS:([^ >]+)" _file ${FILE})
        if ("${_file}" STREQUAL "")
            sanitizer_lang_of_source(${FILE} LANG)
            # Sources with no recognised language contribute no compiler.
            if (LANG)
                list(APPEND BUFFER ${CMAKE_${LANG}_COMPILER_ID})
            endif ()
        endif ()
    endforeach ()

    list(REMOVE_DUPLICATES BUFFER)
    set(${RETURN_VAR} "${BUFFER}" PARENT_SCOPE)
endfunction ()


# Helper function to check compiler flags for language compiler.
#
#   FLAG      flag string handed to the language's check_*_compiler_flag()
#   LANG      "C", "CXX" or "Fortran"; any other value performs no check
#   VARIABLE  name of the result variable passed through to the check
function (sanitizer_check_compiler_flag FLAG LANG VARIABLE)
    if (${LANG} STREQUAL "C")
        include(CheckCCompilerFlag)
        check_c_compiler_flag("${FLAG}" ${VARIABLE})

    elseif (${LANG} STREQUAL "CXX")
        include(CheckCXXCompilerFlag)
        check_cxx_compiler_flag("${FLAG}" ${VARIABLE})

    elseif (${LANG} STREQUAL "Fortran")
        # CheckFortranCompilerFlag was introduced in CMake 3.x. To be compatible
        # with older Cmake versions, we will check if this module is present
        # before we use it. Otherwise we will define Fortran coverage support as
        # not available.
        include(CheckFortranCompilerFlag OPTIONAL RESULT_VARIABLE INCLUDED)
        if (INCLUDED)
            check_fortran_compiler_flag("${FLAG}" ${VARIABLE})
        elseif (NOT CMAKE_REQUIRED_QUIET)
            # Module unavailable: only report the failure; VARIABLE is not set
            # in this branch.
            message(STATUS "Performing Test ${VARIABLE}")
            message(STATUS "Performing Test ${VARIABLE}"
                " - Failed (Check not supported)")
        endif ()
    endif()
endfunction ()


# Helper function to test compiler flags.
#
#   FLAG_CANDIDATES  list of flag strings, tried in order; the first one the
#                    compiler accepts wins
#   NAME             human-readable sanitizer name used in messages
#   PREFIX           short prefix; the result is cached as
#                    <PREFIX>_<COMPILER_ID>_FLAGS (empty when no candidate works)
function (sanitizer_check_compiler_flags FLAG_CANDIDATES NAME PREFIX)
    set(CMAKE_REQUIRED_QUIET ${${PREFIX}_FIND_QUIETLY})

    get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
    foreach (LANG ${ENABLED_LANGUAGES})
        # Sanitizer flags are not dependent on language, but the used compiler.
        # So instead of searching flags foreach language, search flags foreach
        # compiler used.
        set(COMPILER ${CMAKE_${LANG}_COMPILER_ID})
        if (NOT DEFINED ${PREFIX}_${COMPILER}_FLAGS)
            foreach (FLAG ${FLAG_CANDIDATES})
                if(NOT CMAKE_REQUIRED_QUIET)
                    message(STATUS "Try ${COMPILER} ${NAME} flag = [${FLAG}]")
                endif()

                set(CMAKE_REQUIRED_FLAGS "${FLAG}")
                # Drop the cached result so each candidate is really re-tested.
                unset(${PREFIX}_FLAG_DETECTED CACHE)
                sanitizer_check_compiler_flag("${FLAG}" ${LANG}
                    ${PREFIX}_FLAG_DETECTED)

                if (${PREFIX}_FLAG_DETECTED)
                    # If compiler is a GNU compiler, search for static flag, if
                    # SANITIZE_LINK_STATIC is enabled.
                    # COMPILER is quoted: an empty compiler ID would otherwise
                    # leave STREQUAL without a left operand and abort configure.
                    if (SANITIZE_LINK_STATIC AND ("${COMPILER}" STREQUAL "GNU"))
                        string(TOLOWER ${PREFIX} PREFIX_lower)
                        sanitizer_check_compiler_flag(
                            "-static-lib${PREFIX_lower}" ${LANG}
                            ${PREFIX}_STATIC_FLAG_DETECTED)

                        if (${PREFIX}_STATIC_FLAG_DETECTED)
                            set(FLAG "-static-lib${PREFIX_lower} ${FLAG}")
                        endif ()
                    endif ()

                    set(${PREFIX}_${COMPILER}_FLAGS "${FLAG}" CACHE STRING
                        "${NAME} flags for ${COMPILER} compiler.")
                    mark_as_advanced(${PREFIX}_${COMPILER}_FLAGS)
                    break()
                endif ()
            endforeach ()

            if (NOT ${PREFIX}_FLAG_DETECTED)
                # Cache an empty value so the probe is not repeated and
                # sanitizer_add_flags() skips targets using this compiler.
                set(${PREFIX}_${COMPILER}_FLAGS "" CACHE STRING
                    "${NAME} flags for ${COMPILER} compiler.")
                mark_as_advanced(${PREFIX}_${COMPILER}_FLAGS)

                message(WARNING "${NAME} is not available for ${COMPILER} "
                        "compiler. Targets using this compiler will be "
                        "compiled without ${NAME}.")
            endif ()
        endif ()
    endforeach ()
endfunction ()


# Helper to assign sanitizer flags for TARGET.
#
#   TARGET  target that receives the flags
#   NAME    human-readable sanitizer name (not used in the body)
#   PREFIX  prefix under which sanitizer_check_compiler_flags() cached the flags
function (sanitizer_add_flags TARGET NAME PREFIX)
    # Get list of compilers used by target and check, if sanitizer is available
    # for this target. Other compiler checks like check for conflicting
    # compilers will be done in add_sanitizers function.
    sanitizer_target_compilers(${TARGET} TARGET_COMPILER)
    if ("${${PREFIX}_${TARGET_COMPILER}_FLAGS}" STREQUAL "")
        return()
    endif()

    # Set compile- and link-flags for target.
    set_property(TARGET ${TARGET} APPEND_STRING
        PROPERTY COMPILE_FLAGS " ${${PREFIX}_${TARGET_COMPILER}_FLAGS}")
    set_property(TARGET ${TARGET} APPEND_STRING
        PROPERTY COMPILE_FLAGS " ${SanBlist_${TARGET_COMPILER}_FLAGS}")
    set_property(TARGET ${TARGET} APPEND_STRING
        PROPERTY LINK_FLAGS " ${${PREFIX}_${TARGET_COMPILER}_FLAGS}")
endfunction ()

--------------------------------------------------------------------------------
/cpp_cli/jagger-app.cc:
--------------------------------------------------------------------------------
// Jagger -- deterministic pattern-based Japanese tagger
//  $Id: jagger.cc 2031 2023-02-17 21:47:05Z ynaga $
// Copyright (c) 2022 Naoki Yoshinaga
// Modification by Copyright 2023 - Present, Light Transport Entertainment Inc.
5 | #include "jagger.h" 6 | 7 | static const size_t MAX_KEY_BITS = 14; 8 | static const size_t MAX_FEATURE_BITS = 7; 9 | 10 | #ifdef _WIN32 11 | static std::wstring UTF8ToWchar(const std::string &str) { 12 | int wstr_size = 13 | MultiByteToWideChar(CP_UTF8, 0, str.data(), int(str.size()), nullptr, 0); 14 | std::wstring wstr(size_t(wstr_size), 0); 15 | MultiByteToWideChar(CP_UTF8, 0, str.data(), int(str.size()), &wstr[0], 16 | int(wstr.size())); 17 | return wstr; 18 | } 19 | 20 | static std::string WcharToUTF8(const std::wstring &wstr) { 21 | int str_size = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), int(wstr.size()), 22 | nullptr, 0, nullptr, nullptr); 23 | std::string str(size_t(str_size), 0); 24 | WideCharToMultiByte(CP_UTF8, 0, wstr.data(), int(wstr.size()), &str[0], 25 | int(str.size()), nullptr, nullptr); 26 | return str; 27 | } 28 | #endif 29 | 30 | 31 | static bool FileExists(const std::string &filepath) { 32 | 33 | bool ret{false}; 34 | #ifdef JAGGER_ANDROID_LOAD_FROM_ASSETS 35 | if (asset_manager) { 36 | AAsset *asset = AAssetManager_open(asset_manager, filepath.c_str(), 37 | AASSET_MODE_STREAMING); 38 | if (!asset) { 39 | return false; 40 | } 41 | AAsset_close(asset); 42 | ret = true; 43 | } else { 44 | return false; 45 | } 46 | #else 47 | #ifdef _WIN32 48 | #if defined(_MSC_VER) || defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) 49 | FILE *fp = nullptr; 50 | errno_t err = _wfopen_s(&fp, UTF8ToWchar(filepath).c_str(), L"rb"); 51 | if (err != 0) { 52 | return false; 53 | } 54 | #else 55 | FILE *fp = nullptr; 56 | errno_t err = fopen_s(&fp, filepath.c_str(), "rb"); 57 | if (err != 0) { 58 | return false; 59 | } 60 | #endif 61 | 62 | #else 63 | FILE *fp = fopen(filepath.c_str(), "rb"); 64 | #endif 65 | if (fp) { 66 | ret = true; 67 | fclose(fp); 68 | } else { 69 | ret = false; 70 | } 71 | #endif 72 | 73 | return ret; 74 | } 75 | 76 | 77 | 78 | namespace ccedar { 79 | class da_ : public ccedar::da { 80 | public: 81 | struct utf8_feeder { // feed one UTF-8 
character by one while mapping codes 82 | const char *p, * const end; 83 | utf8_feeder (const char *key_, const char *end_) : p (key_), end (end_) {} 84 | int read (int &b) const { return p == end ? 0 : unicode (p, b); } 85 | void advance (const int b) { p += b; } 86 | }; 87 | int longestPrefixSearchWithPOS (const char* key, const char* const end, int fi_prev, const uint16_t* const c2i, size_t from = 0) const { 88 | size_t from_ = 0; 89 | int n (0), i (0), b (0); 90 | for (utf8_feeder f (key, end); (i = c2i[f.read (b)]); f.advance (b)) { 91 | size_t pos = 0; 92 | const int n_ = traverse (&i, from, pos, pos + 1); 93 | if (n_ == CEDAR_NO_VALUE) continue; 94 | if (n_ == CEDAR_NO_PATH) break; 95 | from_ = from; 96 | n = n_; 97 | } 98 | // ad-hock matching at the moment; it prefers POS-ending patterns 99 | if (! fi_prev) return n; 100 | for (const node* const array_ = reinterpret_cast (array ()); 101 | ; from = array_[from].check) { // hopefully, in the cache 102 | const int n_ = exactMatchSearch (&fi_prev, 1, from); 103 | if (n_ != CEDAR_NO_VALUE) return n_; 104 | if (from == from_) return n; 105 | } 106 | } 107 | }; 108 | } 109 | 110 | namespace jagger { 111 | class tagger { 112 | private: 113 | ccedar::da_ da; 114 | uint16_t* c2i; // mapping from utf8, BOS, unk to character ID 115 | uint64_t* p2f; // mapping from pattern ID to feature strings 116 | char* fs; // feature strings 117 | std::vector > mmaped; 118 | static inline void write_string (char* &p, const char* s, size_t len = 0) { 119 | #ifdef USE_COMPACT_DICT 120 | if (! 
len) { 121 | len = *reinterpret_cast (s); 122 | s += sizeof (uint16_t); 123 | } 124 | #endif 125 | std::memcpy (p, s, len); 126 | p += len; 127 | } 128 | static inline void write_buffer (char* &p, char* buf, const size_t limit) { 129 | if (p - buf <= limit) return; 130 | ::write (1, buf, static_cast (p - buf)); 131 | p = buf; 132 | } 133 | template 134 | static inline void write_array (T& data, const std::string& fn) { 135 | FILE *fp = std::fopen (fn.c_str (), "wb"); 136 | if (! fp) my_errx (1, "no such file: %s", fn.c_str ()); 137 | std::fwrite (&data[0], sizeof (typename T::value_type), data.size (), fp); 138 | std::fclose (fp); 139 | } 140 | void* read_array (const std::string& fn, size_t &bufsize) { 141 | int fd = ::open (fn.c_str (), O_RDONLY); 142 | if (fd == -1) my_errx (1, "no such file: %s", fn.c_str ()); 143 | // get size and read; 144 | const size_t size = ::lseek (fd, 0, SEEK_END); 145 | ::lseek (fd, 0, SEEK_SET); 146 | #if defined(_WIN32) 147 | HANDLE hFile = reinterpret_cast(_get_osfhandle(fd)); 148 | HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); 149 | if (hMapping == NULL) { 150 | my_errx(1, "CreateFileMappingA failed for: %s", fn.c_str()); 151 | } 152 | void *data = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); 153 | if (!data) { 154 | my_errx(1, "MapViewOfFile failed for: %s", fn.c_str()); 155 | } 156 | CloseHandle(hMapping); 157 | #else 158 | void *data = ::mmap (0, size, PROT_READ, MAP_SHARED, fd, 0); 159 | if (!data) { 160 | my_errx(1, "mmap failed for: %s", fn.c_str()); 161 | } 162 | #endif 163 | ::close (fd); 164 | mmaped.push_back (std::make_pair (data, size)); 165 | bufsize = size; 166 | return data; 167 | } 168 | public: 169 | tagger () : da (), c2i (0), p2f (0), fs (0), mmaped () {} 170 | ~tagger () { 171 | for (size_t i = 0; i < mmaped.size (); ++i) 172 | #if defined(_WIN32) 173 | if (!UnmapViewOfFile(mmaped[i].first)) { 174 | fprintf(stderr, "jagger: warn: UnmapViewOfFile failed."); 175 | } 176 | #else 
177 | ::munmap (mmaped[i].first, mmaped[i].second); 178 | #endif 179 | } 180 | void read_model (const std::string& m) { // read patterns to memory 181 | const std::string da_fn (m + ".da"), c2i_fn (m + ".c2i"), p2f_fn (m + ".p2f"), fs_fn (m + ".fs"); 182 | //struct stat st; 183 | //if (::stat (da_fn.c_str (), &st) != 0) { // compile 184 | if (!FileExists(da_fn)) { 185 | std::fprintf (stderr, "building DA trie from patterns.."); 186 | std::vector c2i_; // mapping from utf8, BOS, unk to char ID 187 | std::vector p2f_; // mapping from pattern ID to feature str 188 | std::vector fs_; // feature strings 189 | sbag_t fbag ("\tBOS"); 190 | #ifdef USE_COMPACT_DICT 191 | fbag.to_i (FEAT_UNK); 192 | sbag_t fbag_ (",*,*,*\n"); 193 | #else 194 | sbag_t fbag_ ((std::string (FEAT_UNK) + ",*,*,*\n").c_str ()); 195 | #endif 196 | std::map fs2pid; 197 | fs2pid.insert (std::make_pair ((1ull << 32) | 2, fs2pid.size ())); 198 | p2f_.push_back ((1ull << 32) | 2); 199 | // count each character to obtain dense mapping 200 | std::vector > counter (CP_MAX + 3); 201 | for (int u = 0; u < counter.size (); ++u) // allow 43 bits for counting 202 | counter[u] = std::make_pair (0, u); 203 | std::vector > keys; 204 | char *line = 0; 205 | simple_reader reader (m.c_str ()); 206 | while (const size_t len = reader.gets (&line)) { // find pos offset 207 | // pattern format: COUNT PATTEN PREV_POS BYTES CHAR_TYPE FEATURES 208 | char *p (line), * const p_end (p + len); 209 | const size_t count = std::strtoul (p, &p, 10); 210 | const char *pat = ++p; 211 | for (int b = 0; *p != '\t'; p += b) 212 | counter[unicode (p, b)].first += count + 1; 213 | size_t fi_prev = 0; 214 | const char* f_prev = p; // starting with '\t' 215 | if (*++p != '\t') { // with pos context 216 | p = const_cast (skip_to (p, 1, '\t')) - 1; 217 | fi_prev = fbag.to_i (f_prev, p - f_prev) + 1; 218 | if (fi_prev + CP_MAX == counter.size ()) // new part-of-speech 219 | counter.push_back (std::make_pair (0, (fi_prev + CP_MAX))); 220 | 
counter[fi_prev + CP_MAX].first += count + 1; 221 | } 222 | const size_t bytes = std::strtoul (++p, &p, 10); 223 | const size_t ctype = std::strtoul (++p, &p, 10); 224 | const char* f = p; // starting with '\t' 225 | p = const_cast (skip_to (p, NUM_POS_FIELD, ',')) - 1; 226 | const size_t fi_ = fbag.to_i (f, p - f) + 1; 227 | #ifndef USE_COMPACT_DICT 228 | p = const_cast (f); 229 | #endif 230 | const size_t fi = fbag_.to_i (p, p_end - p) + 1; 231 | if (fi_ + CP_MAX == counter.size ()) // new part-of-speech 232 | counter.push_back (std::make_pair (0, fi_ + CP_MAX)); 233 | std::pair ::iterator, bool> itb 234 | = fs2pid.insert (std::make_pair ((fi << 32) | fi_, fs2pid.size ())); 235 | if (itb.second) p2f_.push_back ((fi << 32) | fi_); 236 | keys.push_back (std::make_pair (std::string (pat, f_prev - pat), 237 | (((bytes << 23) | ((ctype & 0x7) << 20) | (itb.first->second & 0xfffff)) << 12) | fi_prev)); 238 | } 239 | // save c2i 240 | std::sort (counter.begin () + 1, counter.end (), std::greater > ()); 241 | c2i_.resize (counter.size ()); 242 | for (unsigned int i = 1; i < counter.size () && counter[i].first; ++i) 243 | c2i_[counter[i].second] = static_cast (i); 244 | // save feature strings 245 | std::vector offsets; 246 | #ifdef USE_COMPACT_DICT 247 | fbag.serialize (fs_, offsets); // required only for compact dict 248 | #endif 249 | fbag_.serialize (fs_, offsets); 250 | write_array (fs_, fs_fn); 251 | // save mapping from morpheme ID to morpheme feature strings 252 | for (size_t i = 0; i < p2f_.size (); ++i) { 253 | #ifdef USE_COMPACT_DICT 254 | p2f_[i] = (offsets[(p2f_[i] >> 32) - 1 + fbag.size ()] << 34) | 255 | (offsets[(p2f_[i] & 0xffffffff) - 1] << MAX_KEY_BITS) | 256 | #else 257 | const std::string& f = fbag_.to_s ((p2f_[i] >> 32) - 1); 258 | const char* q = skip_to (f.c_str (), NUM_POS_FIELD, ',') - 1; 259 | p2f_[i] = (offsets[(p2f_[i] >> 32) - 1] << 34) | 260 | (fbag_.to_s ((p2f_[i] >> 32) - 1).size () << (MAX_KEY_BITS + MAX_FEATURE_BITS)) | 261 | (q - 
f.c_str ()) << MAX_KEY_BITS | 262 | #endif 263 | c2i_[(p2f_[i] & 0xffffffff) + CP_MAX]; 264 | } 265 | write_array (p2f_, p2f_fn); 266 | // save pattern trie 267 | for (std::vector >::const_iterator it = keys.begin (); it != keys.end (); ++it) { 268 | std::vector key; 269 | for (int offset (0), b (0); offset < it->first.size (); offset += b) 270 | key.push_back (c2i_[unicode (&it->first[offset], b)]); 271 | if (it->second & 0xfff) 272 | key.push_back (c2i_[(it->second & 0xfff) + CP_MAX]); 273 | da.update (&key[0], key.size ()) = it->second >> 12; 274 | } 275 | c2i_.resize (CP_MAX + 2); // chop most of part-of-speech mapping 276 | write_array (c2i_, c2i_fn); 277 | da.save (da_fn.c_str ()); 278 | std::fprintf (stderr, "done.\n"); 279 | } 280 | size_t bufsize; 281 | const void *da_buf = read_array(da_fn, bufsize); 282 | da.set_array (da_buf, bufsize); 283 | c2i = static_cast (read_array (c2i_fn, bufsize)); 284 | p2f = static_cast (read_array (p2f_fn, bufsize)); 285 | fs = static_cast (read_array (fs_fn, bufsize)); 286 | } 287 | template 288 | void run () const { 289 | if (BUF_SIZE_ == 0) std::fprintf (stderr, "(input: stdin)\n"); 290 | char _res[BUF_SIZE], *_ptr (&_res[0]), *line (0); 291 | simple_reader reader; 292 | while (const size_t len = reader.gets (&line)) { 293 | int bytes (0), bytes_prev (0), id (0), ctype (0), ctype_prev (0); 294 | uint64_t offsets = c2i[CP_MAX + 1]; 295 | bool bos (true), ret (line[len - 1] == '\n'), concat (false); 296 | for (const char *p (line), * const p_end (p + len - ret); p != p_end; bytes_prev = bytes, ctype_prev = ctype, offsets = p2f[static_cast (id)], p += bytes) { 297 | const int r = da.longestPrefixSearchWithPOS (p, p_end, offsets & 0x3fff, &c2i[0]); // found word 298 | id = r & 0xfffff; 299 | bytes = (r >> 23) ? (r >> 23) : u8_len (p); 300 | ctype = (r >> 20) & 0x7; // 0: num|unk / 1: alpha / 2: kana / 3: other 301 | if (! 
bos) { // word that may concat with the future context 302 | if (ctype_prev != ctype || // different character types 303 | ctype_prev == 3 || // seen words in non-num/alpha/kana 304 | (ctype_prev == 2 && bytes_prev + bytes >= 18)) { 305 | if (POS_TAGGING) { 306 | #ifdef USE_COMPACT_DICT 307 | write_string (_ptr, &fs[((offsets >> MAX_KEY_BITS) & 0xfffff)]); 308 | if (concat) 309 | write_string (_ptr, ",*,*,*\n", 7); 310 | else 311 | write_string (_ptr, &fs[(offsets >> 34)]); 312 | #else 313 | if (concat) { 314 | write_string (_ptr, &fs[(offsets >> 34)], (offsets >> MAX_KEY_BITS) & 0x7f); 315 | write_string (_ptr, ",*,*,*\n", 7); 316 | } else 317 | write_string (_ptr, &fs[(offsets >> 34)], (offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff); 318 | #endif 319 | concat = false; 320 | } else 321 | write_string (_ptr, " ", 1); 322 | } else 323 | concat = true; 324 | } else 325 | bos = false; 326 | write_string (_ptr, p, static_cast (bytes)); 327 | } 328 | if (! bos) // output fs of last token 329 | if (POS_TAGGING) { 330 | #ifdef USE_COMPACT_DICT 331 | write_string (_ptr, &fs[((offsets >> MAX_KEY_BITS) & 0xfffff)]); 332 | if (concat) 333 | write_string (_ptr, ",*,*,*\n", 7); 334 | else 335 | write_string (_ptr, &fs[(offsets >> 34)]); 336 | #else 337 | if (concat) { 338 | write_string (_ptr, &fs[(offsets >> 34)], (offsets >> MAX_KEY_BITS) & 0x7f); 339 | write_string (_ptr, ",*,*,*\n", 7); 340 | } else 341 | write_string (_ptr, &fs[(offsets >> 34)], (offsets >> (MAX_KEY_BITS + MAX_FEATURE_BITS)) & 0x3ff); 342 | #endif 343 | } 344 | write_string (_ptr, POS_TAGGING ? "EOS\n" : "\n", POS_TAGGING ? 
4 : 1); 345 | write_buffer (_ptr, &_res[0], BUF_SIZE_); 346 | } 347 | write_buffer (_ptr, &_res[0], 0); 348 | } 349 | }; 350 | } 351 | 352 | int main (int argc, char** argv) { 353 | std::string model (JAGGER_DEFAULT_MODEL "/patterns"); 354 | bool tag (true), fbf (false); 355 | #if 0 356 | { // options (minimal) 357 | extern char *optarg; 358 | for (int opt = 0; (opt = getopt (argc, argv, "m:wfh")) != -1;) 359 | switch (opt) { 360 | case 'm': model = optarg; model += "/patterns"; break; 361 | case 'w': tag = false; break; 362 | case 'f': fbf = true; break; 363 | case 'h': 364 | my_errx (1, "Pattern-based Jappanese Morphological Analyzer\nUsage: %s -m dir [-wf] < input\n\nOptions:\n -m dir\tpattern directory (default: " JAGGER_DEFAULT_MODEL ")\n -w\tperform only segmentation\n -f\tfull buffering (fast but not interactive)", argv[0]); 365 | } 366 | } 367 | #else 368 | { 369 | if ((argc < 2) || (std::string(argv[1]) == "-h")) { 370 | my_errx (1, "Pattern-based Jappanese Morphological Analyzer\nUsage: %s -m dir [-wf] < input\n\nOptions:\n -m dir\tpattern directory (default: " JAGGER_DEFAULT_MODEL ")\n -w\tperform only segmentation\n -f\tfull buffering (fast but not interactive)", argv[0]); 371 | 372 | } 373 | 374 | for (size_t i = 1; i < argc; i++) { 375 | std::string arg = argv[i]; 376 | 377 | if (arg == "-m") { 378 | if ((i + 1) >= argc) { 379 | my_errx(1, "%s: model filename is missing.\n", argv[0]); 380 | } 381 | model = argv[i+1]; 382 | i++; 383 | } else if (arg == "-w") { 384 | tag = false; 385 | } else if (arg == "-f") { 386 | fbf = true; 387 | } 388 | } 389 | } 390 | #endif 391 | jagger::tagger jagger; 392 | jagger.read_model (model); 393 | switch ((fbf << 4) | tag) { 394 | case 0x00: jagger.run <0, false> (); break; 395 | case 0x01: jagger.run <0, true> (); break; 396 | case 0x10: jagger.run <(BUF_SIZE >> 1), false> (); break; 397 | case 0x11: jagger.run <(BUF_SIZE >> 1), true> (); break; 398 | } 399 | return 0; 400 | } 401 | 
-------------------------------------------------------------------------------- /data/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python to_mecab_feature.py 3 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 顔文字と絵文字の単語を追加します 2 | 3 | 絵文字は python emoji から一覧を取得します. 4 | kaomoji-list.txt is based on https://kaomoji.ru/en/ 5 | 6 | いくつかの顔文字は ',' を含むため, それらは quote します. 7 | (jagger_train 側で quote は適切に対応されます) 8 | 9 | ## TODO 10 | 11 | * [ ] 絵文字のカテゴリを設定する 12 | 13 | EoL. 14 | 15 | 16 | -------------------------------------------------------------------------------- /data/kaomoji-list.txt: -------------------------------------------------------------------------------- 1 | 楽しい, Joy, Positive Emotion 2 | (* ^ ω ^) 3 | (´ ∀ ` *) 4 | ٩(◕‿◕。)۶ 5 | ☆*:.。.o(≧▽≦)o.。.:*☆ 6 | (o^▽^o) 7 | (⌒▽⌒)☆ 8 | <( ̄︶ ̄)> 9 | 。.:☆*:・'(*⌒―⌒*))) 10 | ヽ(・∀・)ノ 11 | (´。• ω •。`) 12 | ( ̄ω ̄) 13 | `;:゛;`;・(°ε° ) 14 | (o・ω・o) 15 | (@^◡^) 16 | ヽ(*・ω・)ノ 17 | (o_ _)ノ彡☆ 18 | (^人^) 19 | (o´▽`o) 20 | (*´▽`*) 21 | 。゚( ゚^∀^゚)゚。 22 | ( ´ ω ` ) 23 | (((o(*°▽°*)o))) 24 | (≧◡≦) 25 | (o´∀`o) 26 | (´• ω •`) 27 | (^▽^) 28 | (⌒ω⌒) 29 | ∑d(°∀°d) 30 | ╰(▔∀▔)╯ 31 | (─‿‿─) 32 | (*^‿^*) 33 | ヽ(o^ ^o)ノ 34 | (✯◡✯) 35 | (◕‿◕) 36 | (*≧ω≦*) 37 | (☆▽☆) 38 | (⌒‿⌒) 39 | \(≧▽≦)/ 40 | ヽ(o^▽^o)ノ 41 | ☆ ~('▽^人) 42 | (*°▽°*) 43 | ٩(。•́‿•̀。)۶ 44 | (✧ω✧) 45 | ヽ(*⌒▽⌒*)ノ 46 | (´。• ᵕ •。`) 47 | ( ´ ▽ ` ) 48 | ( ̄▽ ̄) 49 | ╰(*´︶`*)╯ 50 | ヽ(>∀<☆)ノ 51 | o(≧▽≦)o 52 | (☆ω☆) 53 | (っ˘ω˘ς ) 54 | \( ̄▽ ̄)/ 55 | (*¯︶¯*) 56 | \(^▽^)/ 57 | ٩(◕‿◕)۶ 58 | (o˘◡˘o) 59 | \(★ω★)/ 60 | \(^ヮ^)/ 61 | (〃^▽^〃) 62 | (╯✧▽✧)╯ 63 | o(>ω<)o 64 | o( ❛ᴗ❛ )o 65 | 。゚(TヮT)゚。 66 | ( ‾́ ◡ ‾́ ) 67 | (ノ´ヮ`)ノ*: ・゚ 68 | (b ᵔ▽ᵔ)b 69 | (๑˃ᴗ˂)ﻭ 70 | (๑˘︶˘๑) 71 | ( ˙꒳​˙ ) 72 | (*꒦ິ꒳꒦ີ) 73 | °˖✧◝(⁰▿⁰)◜✧˖° 74 | (´・ᴗ・ ` ) 75 | (ノ◕ヮ◕)ノ*:・゚✧ 76 | („• ֊ •„) 77 | (.❛ ᴗ ❛.) 
78 | (⁀ᗢ⁀) 79 | (¬‿¬ ) 80 | (¬‿¬ ) 81 | (* ̄▽ ̄)b 82 | ( ˙▿˙ ) 83 | (¯▿¯) 84 | ( ◕▿◕ ) 85 | \(٥⁀▽⁀ )/ 86 | („• ᴗ •„) 87 | (ᵔ◡ᵔ) 88 | ( ´ ▿ ` ) 89 | (๑>◡<๑) 90 | ( = ⩊ = ) 91 | ( ´ ꒳ ` ) 92 | ⸜( ´ ꒳ ` )⸝ 93 | ⸜(⸝⸝⸝´꒳`⸝⸝⸝)⸝ 94 | ⸜(*ˊᗜˋ*)⸝ 95 | ⸜( *ˊᵕˋ* )⸝ 96 | 97 | 愛, Love, Positive Emotion 98 | (ノ´ з `)ノ 99 | (♡μ_μ) 100 | (*^^*)♡ 101 | ☆⌒ヽ(*'、^*)chu 102 | (♡-_-♡) 103 | ( ̄ε ̄@) 104 | ヽ(♡‿♡)ノ 105 | ( ´ ∀ `)ノ~ ♡ 106 | (─‿‿─)♡ 107 | (´。• ᵕ •。`) ♡ 108 | (*♡∀♡) 109 | (。・//ε//・。) 110 | (´ ω `♡) 111 | ♡( ◡‿◡ ) 112 | (◕‿◕)♡ 113 | (/▽\*)。o○♡ 114 | (ღ˘⌣˘ღ) 115 | (♡°▽°♡) 116 | ♡(。- ω -) 117 | ♡ ~('▽^人) 118 | (´• ω •`) ♡ 119 | (´ ε ` )♡ 120 | (´。• ω •。`) ♡ 121 | ( ´ ▽ ` ).。o♡ 122 | ╰(*´︶`*)╯♡ 123 | (*˘︶˘*).。.:*♡ 124 | (♡˙︶˙♡) 125 | ♡\( ̄▽ ̄)/♡ 126 | (≧◡≦) ♡ 127 | (⌒▽⌒)♡ 128 | (*¯ ³¯*)♡ 129 | (っ˘з(˘⌣˘ ) ♡ 130 | ♡ (˘▽˘>ԅ( ˘⌣˘) 131 | ( ˘⌣˘)♡(˘⌣˘ ) 132 | (/^-^(^ ^*)/ ♡ 133 | ٩(♡ε♡)۶ 134 | σ(≧ε≦σ) ♡ 135 | ♡ (⇀ 3 ↼) 136 | ♡ ( ̄З ̄) 137 | (❤ω❤) 138 | (˘∀˘)/(μ‿μ) ❤ 139 | ❤ (ɔˆз(ˆ⌣ˆc) 140 | (´♡‿♡`) 141 | (°◡°♡) 142 | Σ>―(〃°ω°〃)♡→ 143 | (´,,•ω•,,)♡ 144 | (´꒳`)♡ 145 | ♡(>ᴗ•) 146 | 147 | 恥ずかしい, Embarassement, Positive Emotion 148 | (⌒_⌒;) 149 | (o^ ^o) 150 | (*/ω\) 151 | (*/。\) 152 | (*/_\) 153 | (*ノωノ) 154 | (o-_-o) 155 | (*μ_μ) 156 | ( ◡‿◡ *) 157 | (ᵔ.ᵔ) 158 | (*ノ∀`*) 159 | (//▽//) 160 | (//ω//) 161 | (ノ*°▽°*) 162 | (*^.^*) 163 | (*ノ▽ノ) 164 | ( ̄▽ ̄*)ゞ 165 | (⁄ ⁄•⁄ω⁄•⁄ ⁄) 166 | (*/▽\*) 167 | (⁄ ⁄>⁄ ▽ ⁄<⁄ ⁄) 168 | („ಡωಡ„) 169 | (ง ื▿ ื)ว 170 | ( 〃▽〃) 171 | (/▿\ ) 172 | (/// ̄  ̄///) 173 | 174 | 同情, Sympathy, Positive Emotion 175 | (ノ_<。)ヾ(´ ▽ ` ) 176 | 。・゚・(ノД`)ヽ( ̄ω ̄ ) 177 | ρ(- ω -、)ヾ( ̄ω ̄; ) 178 | ヽ( ̄ω ̄(。。 )ゝ 179 | (*´ I `)ノ゚(ノД`゚)゚。 180 | ヽ(~_~(・_・ )ゝ 181 | (ノ_;)ヾ(´ ∀ ` ) 182 | (; ω ; )ヾ(´∀`* ) 183 | (*´ー)ノ(ノд`) 184 | (´-ω-`( _ _ ) 185 | (っ´ω`)ノ(╥ω╥) 186 | (o・_・)ノ”(ノ_<、) 187 | 188 | 不満, Dissatisfaction, Negative Emotion 189 | (#><) 190 | (;⌣̀_⌣́) 191 | ☆o(><;)○ 192 | ( ̄  ̄|||) 193 | (; ̄Д ̄) 194 | ( ̄□ ̄」) 195 | (# ̄0 ̄) 196 | (# ̄ω ̄) 197 | (¬_¬;) 198 | (>m<) 199 | (」°ロ°)」 200 | 
(〃>_<;〃) 201 | (^^#) 202 | (︶︹︺) 203 | ( ̄ヘ ̄) 204 | <( ̄ ﹌  ̄)> 205 | ( ̄︿ ̄) 206 | (>﹏<) 207 | (--_--) 208 | 凸( ̄ヘ ̄) 209 | ヾ(  ̄O ̄)ツ 210 | (⇀‸↼‶) 211 | o(>< )o 212 | (」><)」 213 | (ᗒᗣᗕ)՞ 214 | (눈_눈) 215 | 216 | 怒り, Anger, Negative Emotion 217 | (#`Д´) 218 | (`皿´#) 219 | ( ` ω ´ ) 220 | ヽ( `д´*)ノ 221 | (・`ω´・) 222 | (`ー´) 223 | ヽ(`⌒´メ)ノ 224 | 凸(`△´#) 225 | ( `ε´ ) 226 | ψ( ` ∇ ´ )ψ 227 | ヾ(`ヘ´)ノ゙ 228 | ヽ(‵﹏´)ノ 229 | (メ` ロ ´) 230 | (╬`益´) 231 | ┌∩┐(◣_◢)┌∩┐ 232 | 凸( ` ロ ´ )凸 233 | Σ(▼□▼メ) 234 | (°ㅂ°╬) 235 | ψ(▼へ▼メ)~→ 236 | (ノ°益°)ノ 237 | (҂ `з´ ) 238 | (‡▼益▼) 239 | (҂` ロ ´)凸 240 | ((╬◣﹏◢)) 241 | ٩(╬ʘ益ʘ╬)۶ 242 | (╬ Ò﹏Ó) 243 | \\٩(๑`^´๑)۶// 244 | (凸ಠ益ಠ)凸 245 | ↑_(ΦwΦ)Ψ 246 | ←~(Ψ▼ー▼)∈ 247 | ୧((#Φ益Φ#))୨ 248 | ٩(ఠ益ఠ)۶ 249 | (ノಥ益ಥ)ノ 250 | (≖、≖╬) 251 | 252 | 悲しい, Sadness, Negative Emotion 253 | (ノ_<。) 254 | (-_-) 255 | (´-ω-`) 256 | .・゚゚・(/ω\)・゚゚・. 257 | (μ_μ) 258 | (ノД`) 259 | (-ω-、) 260 | 。゜゜(´O`) ゜゜。 261 | o(TヘTo) 262 | ( ; ω ; ) 263 | (。╯︵╰。) 264 | 。・゚゚*(>д<)*゚゚・。 265 | ( ゚,_ゝ`) 266 | (个_个) 267 | (╯︵╰,) 268 | 。・゚(゚><゚)゚・。 269 | ( ╥ω╥ ) 270 | (╯_╰) 271 | (╥_╥) 272 | .。・゚゚・(>_<)・゚゚・。. 
273 | (/ˍ・、) 274 | (ノ_<、) 275 | (╥﹏╥) 276 | 。゚(。ノωヽ。)゚。 277 | (つω`。) 278 | (。T ω T。) 279 | (ノω・、) 280 | ・゚・(。>ω<。)・゚・ 281 | (T_T) 282 | (>_<) 283 | (っ˘̩╭╮˘̩)っ 284 | 。゚・ (>﹏<) ・゚。 285 | o(〒﹏〒)o 286 | (。•́︿•̀。) 287 | (ಥ﹏ಥ) 288 | (ಡ‸ಡ) 289 | 290 | 痛み, Pain, Negative Emotion 291 | ~(>_<~) 292 | ☆⌒(> _ <) 293 | ☆⌒(>。<) 294 | (☆_@) 295 | (×_×) 296 | (x_x) 297 | (×_×)⌒☆ 298 | (x_x)⌒☆ 299 | (×﹏×) 300 | ☆(#××) 301 | (+_+) 302 | [ ± _ ± ] 303 | ٩(× ×)۶ 304 | _:(´ཀ`」 ∠):_ 305 | (メ﹏メ) 306 | 307 | 怖い, Fear, Negative Emotion 308 | (ノωヽ) 309 | (/。\) 310 | (ノ_ヽ) 311 | ..・ヾ(。><)シ 312 | (″ロ゛) 313 | (;;;*_*) 314 | (・人・) 315 | \(〇_o)/ 316 | (/ω\) 317 | (/_\) 318 | 〜(><)〜 319 | Σ(°△°|||)︴ 320 | (((><))) 321 | {{ (>_<) }} 322 | \(º □ º l|l)/ 323 | 〣( ºΔº )〣 324 | ▓▒░(°◡°)░▒▓ 325 | 326 | 無関心, Indifference, Neutral Emotion 327 | ヽ(ー_ー )ノ 328 | ヽ(´ー` )┌ 329 | ┐(‘~` )┌ 330 | ヽ(  ̄д ̄)ノ 331 | ┐( ̄ヘ ̄)┌ 332 | ヽ( ̄~ ̄ )ノ 333 | ╮( ̄_ ̄)╭ 334 | ヽ(ˇヘˇ)ノ 335 | ┐( ̄~ ̄)┌ 336 | ┐(︶▽︶)┌ 337 | ╮( ̄~ ̄)╭ 338 | ¯\_(ツ)_/¯ 339 | ┐( ´ д ` )┌ 340 | ╮(︶︿︶)╭ 341 | ┐( ̄∀ ̄)┌ 342 | ┐( ˘ 、 ˘ )┌ 343 | ╮(︶▽︶)╭ 344 | ╮( ˘ 、 ˘ )╭ 345 | ┐( ˘_˘ )┌ 346 | ╮( ˘_˘ )╭ 347 | ┐( ̄ヮ ̄)┌ 348 | ᕕ( ᐛ )ᕗ 349 | ┐(シ)┌ 350 | 351 | 困惑, Confusion, Neutral Emotion 352 | ( ̄ω ̄;) 353 | σ( ̄、 ̄〃) 354 | ( ̄~ ̄;) 355 | (-_-;)・・・ 356 | ┐('~`;)┌ 357 | (・_・ヾ 358 | (〃 ̄ω ̄〃ゞ 359 | ┐( ̄ヘ ̄;)┌ 360 | (・_・;) 361 | ( ̄_ ̄)・・・ 362 | ╮( ̄ω ̄;)╭ 363 | (¯ . ¯;) 364 | (@_@) 365 | (・・;)ゞ 366 | Σ( ̄。 ̄ノ) 367 | (・・ ) ? 368 | (•ิ_•ิ)? 369 | (◎ ◎)ゞ 370 | (ーー;) 371 | ლ(ಠ_ಠ ლ) 372 | ლ(¯ロ¯"ლ) 373 | (¯ . ¯٥) 374 | (¯ ¯٥) 375 | 376 | 疑い, Doubt, Neutral Emotion 377 | (¬_¬) 378 | (→_→) 379 | (¬ ¬) 380 | (¬‿¬ ) 381 | (¬_¬ ) 382 | (←_←) 383 | (¬ ¬ ) 384 | (¬‿¬ ) 385 | (↼_↼) 386 | (⇀_⇀) 387 | (ᓀ ᓀ) 388 | 389 | 驚き, Surprise, Neutral Emotion 390 | w(°o°)w 391 | ヽ(°〇°)ノ 392 | Σ(O_O) 393 | Σ(°ロ°) 394 | (⊙_⊙) 395 | (o_O) 396 | (O_O;) 397 | (O.O) 398 | (°ロ°) ! 399 | (o_O) ! 
400 | (□_□) 401 | Σ(□_□) 402 | ∑(O_O;) 403 | ( : ౦ ‸ ౦ : ) 404 | 405 | 挨拶, Greeting, Various Actions 406 | (*・ω・)ノ 407 | ( ̄▽ ̄)ノ 408 | (°▽°)/ 409 | ( ´ ∀ ` )ノ 410 | (^-^*)/ 411 | (@´ー`)ノ゙ 412 | (´• ω •`)ノ 413 | ( ° ∀ ° )ノ゙ 414 | ヾ(*'▽'*) 415 | \(⌒▽⌒) 416 | ヾ(☆▽☆) 417 | ( ´ ▽ ` )ノ 418 | (^0^)ノ 419 | ~ヾ(・ω・) 420 | (・∀・)ノ 421 | ヾ(・ω・*) 422 | (*°ー°)ノ 423 | (・_・)ノ 424 | (o´ω`o)ノ 425 | ( ´ ▽ ` )/ 426 | ( ̄ω ̄)/ 427 | ( ´ ω ` )ノ゙ 428 | (⌒ω⌒)ノ 429 | (o^ ^o)/ 430 | (≧▽≦)/ 431 | (✧∀✧)/ 432 | (o´▽`o)ノ 433 | ( ̄▽ ̄)/ 434 | 435 | 抱擁, Hugging, Various Actions 436 | (づ ̄ ³ ̄)づ 437 | (つ≧▽≦)つ 438 | (つ✧ω✧)つ 439 | (づ ◕‿◕ )づ 440 | (⊃。•́‿•̀。)⊃ 441 | (つ . •́ _ʖ •̀ .)つ 442 | (っಠ‿ಠ)っ 443 | (づ◡﹏◡)づ 444 | ⊂(´• ω •`⊂) 445 | ⊂(・ω・*⊂) 446 | ⊂( ̄▽ ̄)⊃ 447 | ⊂( ´ ▽ ` )⊃ 448 | ( ~*-*)~ 449 | (ノ= ⩊ = )ノ 450 | (っ ᵔ◡ᵔ)っ 451 | (っ╹ᆺ╹)っ 452 | 453 | ウインク, Winking, Various Actions 454 | (^_~) 455 | ( ゚o⌒) 456 | (^_-)≡☆ 457 | (^ω~) 458 | (>ω^) 459 | (~人^) 460 | (^_-) 461 | ( -_・) 462 | (^_<)〜☆ 463 | (^人<)〜☆ 464 | ☆⌒(≧▽​° ) 465 | ☆⌒(ゝ。∂) 466 | (^_<) 467 | (^_−)☆ 468 | (・ω<)☆ 469 | (^.~)☆ 470 | (^.~) 471 | (。•̀ᴗ-)✧ 472 | (>ᴗ•) 473 | 474 | お詫び, Apologizing, Various Actions 475 | (^_~) 476 | m(_ _)m 477 | (シ_ _)シ 478 | m(. .)m 479 | <(_ _)> 480 | 人(_ _*) 481 | (*_ _)人 482 | m(_ _;m) 483 | (m;_ _)m 484 | (シ. .)シ 485 | 486 | 鼻血, Nosebleeding, Various Actions 487 | (* ̄ii ̄) 488 | ( ̄ハ ̄*) 489 | \( ̄ハ ̄) 490 | (^་།^) 491 | (^〃^) 492 | ( ̄ ¨ヽ ̄) 493 | ( ̄ ; ̄) 494 | ( ̄ ;; ̄) 495 | 496 | 隠れる, Hiding, Various Actions 497 | |・ω・) 498 | ヘ(・_| 499 | |ω・)ノ 500 | ヾ(・| 501 | |д・) 502 | |_ ̄)) 503 | |▽//) 504 | ┬┴┬┴┤(・_├┬┴┬┴ 505 | ┬┴┬┴┤・ω・)ノ 506 | ┬┴┬┴┤( ͡° ͜ʖ├┬┴┬┴ 507 | ┬┴┬┴┤(・_├┬┴┬┴ 508 | |_・) 509 | |・д・)ノ 510 | |ʘ‿ʘ)╯ 511 | 512 | メモ, Writing, Various Actions 513 | __φ(..) 514 | (  ̄ー ̄)φ__ 515 | __φ(。。) 516 | __φ(..;) 517 | ヾ( `ー´)シφ__ 518 | __〆( ̄ー ̄ ) 519 | ....φ(・∀・*) 520 | ___〆(・∀・) 521 | ( ^▽^)ψ__ 522 | ....φ(︶▽︶)φ.... 523 | ( . 
.)φ__ 524 | __φ(◎◎ヘ) 525 | 526 | 走る, Running, Various Actions 527 | ☆ミ(o*・ω・)ノ 528 | C= C= C= C= C=┌(;・ω・)┘ 529 | ─=≡Σ((( つ><)つ 530 | ε=ε=ε=ε=┌(; ̄▽ ̄)┘ 531 | ε=ε=┌( >_<)┘ 532 | C= C= C= C=┌( `ー´)┘ 533 | ε===(っ≧ω≦)っ 534 | ヽ( ̄д ̄;)ノ=3=3=3 535 | 。。。ミヽ(。><)ノ 536 | 537 | 寝る, Running, Various Actions 538 | [(--)]..zzZ 539 | (-_-) zzZ 540 | (∪。∪)。。。zzZ 541 | (-ω-) zzZ 542 | ( ̄o ̄) zzZZzzZZ 543 | (( _ _ ))..zzzZZ 544 | ( ̄ρ ̄)..zzZZ 545 | (-.-)...zzz 546 | (_ _*) Z z z 547 | (x . x) ~~zzZ 548 | 549 | 猫, Cat, Animals 550 | (=^・ω・^=) 551 | (=^・ェ・^=) 552 | (=①ω①=) 553 | ( =ω=)..nyaa 554 | (= ; ェ ; =) 555 | (=`ω´=) 556 | (=^‥^=) 557 | ( =ノωヽ=) 558 | (=⌒‿‿⌒=) 559 | (=^ ◡ ^=) 560 | (=^-ω-^=) 561 | ヾ(=`ω´=)ノ” 562 | (^• ω •^) 563 | (/ =ω=)/ 564 | ฅ(•ㅅ•❀)ฅ 565 | ฅ(• ɪ •)ฅ 566 | ଲ(ⓛ ω ⓛ)ଲ 567 | (^=◕ᴥ◕=^) 568 | ( =ω= ) 569 | ଲ(ⓛ ω ⓛ)ଲ 570 | (^=◕ᴥ◕=^) 571 | ( =ω= ) 572 | (^˵◕ω◕˵^) 573 | (^◔ᴥ◔^) 574 | (^◕ᴥ◕^) 575 | ต(=ω=)ต 576 | ( Φ ω Φ ) 577 | ฅ(^◕ᴥ◕^)ฅ 578 | 579 | 熊, Bear, Animals 580 | ( ´(エ)ˋ ) 581 | (* ̄(エ) ̄*) 582 | ヽ( ̄(エ) ̄)ノ 583 | (/ ̄(エ) ̄)/ 584 | ( ̄(エ) ̄) 585 | ヽ( ˋ(エ)´ )ノ 586 | ⊂( ̄(エ) ̄)⊃ 587 | (/(エ)\) 588 | ⊂(´(ェ)ˋ)⊃ 589 | (/-(エ)-\) 590 | (/°(エ)°)/ 591 | ʕ ᵔᴥᵔ ʔ 592 | ʕ •ᴥ• ʔ 593 | ʕ •̀ ω •́ ʔ 594 | ʕ •̀ o •́ ʔ 595 | ʕಠᴥಠʔ 596 | 597 | 犬, Dog, Animals 598 | ∪^ェ^∪ 599 | ∪・ω・∪ 600 | ∪ ̄- ̄∪ 601 | ∪・ェ・∪ 602 | U^皿^U 603 | UTェTU 604 | U^ェ^U 605 | V●ᴥ●V 606 | U・ᴥ・U 607 | 608 | うさぎ, Rabbit, Animals 609 | /(≧ x ≦)\ 610 | /(・ × ・)\ 611 | /(=´x`=)\ 612 | /(^ x ^)\ 613 | /(=・ x ・=)\ 614 | /(^ × ^)\ 615 | /(>×<)\ 616 | /(˃ᆺ˂)\ 617 | ૮ ˶ᵔ ᵕ ᵔ˶ ა 618 | ૮₍ ˶• ༝ •˶ ₎ა 619 | 620 | 豚, Pig, Animals 621 | ( ´(00)ˋ ) 622 | ( ̄(ω) ̄) 623 | ヽ( ˋ(00)´ )ノ 624 | ( ´(oo)ˋ ) 625 | \( ̄(oo) ̄)/ 626 | 。゚(゚´(00)`゚)゚。 627 | ( ̄(00) ̄) 628 | (ˆ(oo)ˆ) 629 | 630 | 鳥, Bird, Animals 631 | ( ̄Θ ̄) 632 | (`・Θ・´) 633 | ( ˋ Θ ´ ) 634 | (◉Θ◉) 635 | \( ˋ Θ ´ )/ 636 | (・θ・) 637 | (・Θ・) 638 | ヾ( ̄◇ ̄)ノ〃 639 | (・Θ・) 640 | 641 | 魚, Fish, Animals 642 | (°)#))<< 643 | <・ )))><< 644 | ζ°)))彡 645 | >°))))彡 646 | (°))<< 
647 | >^)))<~~ 648 | ≧( ° ° )≦ 649 | 650 | 蜘蛛, Spider, Animals 651 | /╲/\╭(ఠఠ益ఠఠ)╮/\╱\ 652 | /╲/\╭(ರರ⌓ರರ)╮/\╱\ 653 | /╲/\╭༼ ººل͟ºº ༽╮/\╱\ 654 | /╲/\╭( ͡°͡° ͜ʖ ͡°͡°)╮/\╱\ 655 | /╲/\╭[ ᴼᴼ ౪ ᴼᴼ]╮/\╱\ 656 | /╲/\( •̀ ω •́ )/\╱\ 657 | /╲/\╭[☉﹏☉]╮/\╱\ 658 | 659 | 仲間, Friends, Other Types 660 | ヾ(・ω・)メ(・ω・)ノ 661 | ヽ(∀° )人( °∀)ノ 662 | ヽ( ⌒o⌒)人(⌒-⌒ )ノ 663 | (*^ω^)八(⌒▽⌒)八(-‿‿- )ヽ 664 | \(^∀^)メ(^∀^)ノ 665 | ヾ( ̄ー ̄(≧ω≦*)ゝ 666 | ヽ( ⌒ω⌒)人(=^‥^= )ノ 667 | ヽ(≧◡≦)八(o^ ^o)ノ 668 | (*・∀・)爻(・∀・*) 669 | 。*:☆(・ω・人・ω・)。:゜☆。 670 | o(^^o)(o^^o)(o^^o)(o^^)o 671 | ((( ̄( ̄( ̄▽ ̄) ̄) ̄))) 672 | (°(°ω(°ω°(☆ω☆)°ω°)ω°)°) 673 | ヾ(・ω・`)ノヾ(´・ω・)ノ゛ 674 | Ψ( `∀)(∀´ )Ψ 675 | (っ˘▽˘)(˘▽˘)˘▽˘ς) 676 | (((*°▽°*)八(*°▽°*))) 677 | ☆ヾ(*´・∀・)ノヾ(・∀・`*)ノ☆ 678 | (*^ω^)人(^ω^*) 679 | ٩(๑・ิᴗ・ิ)۶٩(・ิᴗ・ิ๑)۶ 680 | (☞°ヮ°)☞ ☜(°ヮ°☜) 681 | \(▽ ̄ \ ( ̄▽ ̄) /  ̄▽)/ 682 | \( ˙▿˙ )/\( ˙▿˙ )/ 683 | 684 | 敵, Enemies, Other Types 685 | ヽ( ・∀・)ノ_θ彡☆Σ(ノ `Д´)ノ 686 | (*´∇`)┌θ☆(ノ>_<)ノ 687 | (  ̄ω ̄)ノ゙⌒☆ミ(o _ _)o 688 | (*`0´)θ☆(メ°皿°)ノ 689 | (o¬‿¬o )...☆ミ(*x_x) 690 | (╬ ̄皿 ̄)=○#( ̄#)3 ̄) 691 | (; -_-)――――――C<―_-) 692 | <(  ̄︿ ̄)︵θ︵θ︵☆(>口<-) 693 | ( ̄ε(# ̄)☆╰╮o( ̄▽ ̄///) 694 | ヽ(>_<ヽ) ―⊂|=0ヘ(^‿^ ) 695 | ヘ(>_<ヘ) ¬o( ̄‿ ̄メ) 696 | ,,(((  ̄□)_/ \_(○ ̄ ))),, 697 | (҂` ロ ´)︻デ═一 \(º □ º l|l)/ 698 | (╯°Д°)╯︵ /(.□ . \) 699 | (¬_¬'')ԅ( ̄ε ̄ԅ) 700 | /( .□.)\ ︵╰(°益°)╯︵ /(.□. 
/) 701 | (ノ-.-)ノ….((((((((((((●~* ( >_<) 702 | !!(メ ̄  ̄)_θ☆°0°)/ 703 | (`⌒*)O-(`⌒´Q) 704 | (((ง’ω’)و三 ง’ω’)ڡ≡ ☆⌒ミ((x_x) 705 | (งಠ_ಠ)ง σ( •̀ ω •́ σ) 706 | (っ•﹏•)っ ✴==≡눈٩(`皿´҂)ง 707 | (「• ω •)「 (⌒ω⌒`) 708 | ( °ᴗ°)~ð (/❛o❛\) 709 | 710 | 武器, Weapons, Other Types 711 | ( ・∀・)・・・--------☆ 712 | (/-_・)/D・・・・・------ → 713 | (^ω^)ノ゙(((((((((●~* 714 | ( -ω-)/占~~~~~ 715 | (/・・)ノ   (( く ((へ 716 | ―⊂|=0ヘ(^^ ) 717 | ○∞∞∞∞ヽ(^ー^ ) 718 | (; ・_・)――――C 719 | (ಠ o ಠ)¤=[]:::::> 720 | (*^^)/~~~~~~~~~~◎ 721 | ¬o( ̄- ̄メ) 722 | ―(T_T)→ 723 | (((  ̄□)_/ 724 | (メ` ロ ´)︻デ═一 725 | ( ´-ω・)︻┻┳══━一 726 | (メ ̄▽ ̄)︻┳═一 727 | ✴==≡눈٩(`皿´҂)ง 728 | Q(`⌒´Q) 729 | 730 | 魔法, Magic, Other Types 731 | (ノ ˘_˘)ノ ζ|||ζ ζ|||ζ ζ|||ζ 732 | (ノ≧∀≦)ノ ‥…━━━★ 733 | (ノ>ω<)ノ :。・:*:・゚’★,。・:*:・゚’☆ 734 | (ノ°∀°)ノ⌒・*:.。. .。.:*・゜゚・*☆ 735 | ╰( ͡° ͜ʖ ͡° )つ──☆*:・゚ 736 | (# ̄□ ̄)o━∈・・━━━━☆ 737 | (⊃。•́‿•̀。)⊃━✿✿✿✿✿✿ 738 | (∩ᄑ_ᄑ)⊃━☆゚*・。*・:≡( ε:) 739 | (/ ̄ー ̄)/~~☆’.・.・:★’.・.・:☆ 740 | (∩` ロ ´)⊃━炎炎炎炎炎 741 | 742 | 食べ物, Food, Other Types 743 | (っ˘ڡ˘ς) 744 | ( o˘◡˘o) ┌iii┐ 745 | ( ’ω’)旦~~ 746 | ( ˘▽˘)っ♨ 747 | ♨o(>_<)o♨ 748 | ( ・ω・)o-{{[〃]}} 749 | ( ・ω・)⊃-[二二] 750 | ( ・・)つ―{}@{}@{}- 751 | ( ・・)つ-●●● 752 | (*´ー`)旦 旦( ̄ω ̄*) 753 | (*´з`)口゚。゚口(・∀・ ) 754 | ( o^ ^o)且 且(´ω`*) 755 | (  ̄▽ ̄)[] [](≧▽≦ ) 756 | ( *^^)o∀*∀o(^^* ) 757 | ( ^^)_旦~~  ~~U_(^^ ) 758 | (* ̄▽ ̄)旦 且(´∀`*) 759 | -●●●-c(・・ ) 760 | ( ・・)つ―●○◎- 761 | 762 | 音楽, Music, Other Types 763 | ヾ(´〇`)ノ♪♪♪ 764 | ヘ( ̄ω ̄ヘ) 765 | (〜 ̄▽ ̄)〜 766 | 〜( ̄▽ ̄〜) 767 | ヽ(o´∀`)ノ♪♬ 768 | (ノ≧∀≦)ノ 769 | ♪ヽ(^^ヽ)♪ 770 | ♪(/_ _ )/♪ 771 | ♪♬((d⌒ω⌒b))♬♪ 772 | └( ̄- ̄└)) 773 | ((┘ ̄ω ̄)┘ 774 | √( ̄‥ ̄√) 775 | └(^^)┐ 776 | ┌(^^)┘ 777 | \( ̄▽ ̄)\ 778 | /( ̄▽ ̄)/ 779 | ( ̄▽ ̄)/♫•*¨*•.¸¸♪ 780 | (^_^♪) 781 | (~˘▽˘)~ 782 | ~(˘▽˘~) 783 | ヾ(⌐■_■)ノ♪ 784 | (〜 ̄△ ̄)〜 785 | (~‾▽‾)~ 786 | ~(˘▽˘)~ 787 | 乁( • ω •乁) 788 | (「• ω •)「 789 | ⁽⁽◝( • ω • )◜⁾⁾ 790 | ✺◟( • ω • )◞✺ 791 | ♬♫♪◖(● o ●)◗♪♫♬ 792 | ( ˘ ɜ˘) ♬♪♫ 793 | ♪♪♪ ヽ(ˇ∀ˇ )ゞ 794 | (ˇ▽ˇ)ノ♪♬♫ 795 | 796 | ゲーム, Games, Other Types 797 | ( ^^)p_____|_o____q(^^ ) 798 | 
(/o^)/ °⊥ \(^o\) 799 | !(;゚o゚)o/ ̄ ̄ ̄ ̄ ̄ ̄ ̄~ >゚))))彡 800 | ヽ(^o^)ρ┳┻┳°σ(^o^)ノ 801 | (/_^)/  ● \(^_\) 802 | "( (≡|≡))_/ \_((≡|≡) )" 803 | ( ノ-_-)ノ゙_□ VS □_ヾ(^-^ヽ) 804 | ヽ(;^ ^)ノ゙ ......___〇 805 | (=O*_*)=O Q(*_*Q) 806 | Ю ○三 \( ̄^ ̄\) 807 | (˙ω˙)🎮(˙∀˙)🎮 808 | 809 | 顔, Faces, Other Types 810 | ( ͡° ͜ʖ ͡°) 811 | ( ͡° ʖ̯ ͡°) 812 | ( ͠° ͟ʖ ͡°) 813 | ( ͡ᵔ ͜ʖ ͡ᵔ) 814 | ( . •́ _ʖ •̀ .) 815 | ( ఠ ͟ʖ ఠ) 816 | ( ͡ಠ ʖ̯ ͡ಠ) 817 | ( ಠ ʖ̯ ಠ) 818 | ( ಠ ͜ʖ ಠ) 819 | ( ಥ ʖ̯ ಥ) 820 | ( ͡• ͜ʖ ͡• ) 821 | ( ・ิ ͜ʖ ・ิ) 822 | ( ͡ ͜ʖ ͡ ) 823 | (≖ ͜ʖ≖) 824 | (ʘ ʖ̯ ʘ) 825 | (ʘ ͟ʖ ʘ) 826 | (ʘ ͜ʖ ʘ) 827 | (;´༎ຶٹ༎ຶ`) 828 | 829 | 寝起き, waking up, Special 830 | ٩(ˊ〇ˋ*)و 831 | 832 | 敬礼, military salutation, Special 833 | ( ̄^ ̄)ゞ 834 | 835 | ちゃぶ台返し, throwing table in anger, Special 836 | (╯°益°)╯彡┻━┻ 837 | (╮°-°)╮┳━━┳ ( ╯°□°)╯ ┻━━┻ 838 | 839 | ちゃぶ台戻し, putting table back, Special 840 | ┬─┬ノ( º _ ºノ) 841 | 842 | 寝そべり, lying emotion, Special 843 | _(:3 」∠)_ 844 | ∠( ᐛ 」∠)_ 845 | 846 | その他, todo, Special 847 | (-‸ლ) 848 | (oT-T)尸 849 | ( ͡° ͜ʖ ͡°) 850 | [̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅] 851 | (ಠ_ಠ) 852 | ౦0o 。 (‾́。‾́ )y~~ 853 | ( ̄﹃ ̄) 854 | (๑ᵔ⤙ᵔ๑) 855 | (x(x_(x_x(O_o)x_x)_x)x) 856 | ( ・ω・)☞ 857 | (⌐■_■) 858 | (◕‿◕✿) 859 | (  ̄.)o-  【 TV 】 860 | `、ヽ`ヽ`、ヽ(ノ><)ノ `、ヽ`☂ヽ`、ヽ 861 | ‿︵‿︵‿︵‿ヽ(°□° )ノ︵‿︵‿︵‿︵ 862 | ( • )( • )ԅ(≖‿≖ԅ) 863 | ( ^▽^)っ✂╰⋃╯ 864 | 〜〜(/ ̄▽)/ 〜ф 865 | ଘ(੭ˊᵕˋ)੭* ੈ✩‧₊˚ 866 | ଘ(੭ˊ꒳​ˋ)੭✧ 867 | -------------------------------------------------------------------------------- /data/to_mecab_feature.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | # emoji > 2.0 4 | import emoji 5 | from kaomoji.kaomoji import Kaomoji 6 | 7 | 8 | # 9 | # kaomoji 10 | # 11 | lines = open("kaomoji-list.txt", 'r', encoding="utf-8").readlines() 12 | 13 | new_section = True 14 | header = None 15 | classifier0 = None 16 | classifier1 = None 17 | classifier2 = None 18 | 19 | for line in lines: 20 | line = line.strip() 21 | if len(line) == 0: 22 | new_section = True 
23 | continue 24 | 25 | if new_section: 26 | tup = line.split(',') 27 | assert len(tup) == 3 28 | classifier0 = tup[0].lstrip().rstrip() 29 | classifier1 = tup[1].lstrip().rstrip() 30 | classifier2 = tup[2].lstrip().rstrip() 31 | new_section = False 32 | continue 33 | 34 | surface = line 35 | 36 | assert classifier0 is not None 37 | assert classifier1 is not None 38 | assert classifier2 is not None 39 | 40 | if ',' in surface: 41 | assert '"' not in surface 42 | 43 | surface = '"' + surface + '"' 44 | 45 | feature_str = "{},*,*,*,顔文字,{},{},{},*,*,{},{},顔文字".format(surface, classifier0, classifier1, classifier2, surface, surface) 46 | 47 | print(feature_str) 48 | 49 | 50 | # 51 | # emoji 52 | # 53 | 54 | for e in emoji.EMOJI_DATA.keys(): 55 | feature_str = "{},*,*,*,絵文字,*,*,*,*,*,{},{},絵文字".format(e, e, e) 56 | 57 | print(feature_str) 58 | -------------------------------------------------------------------------------- /example/Makefile: -------------------------------------------------------------------------------- 1 | # Set asan 2 | #LD_PRELOAD_CMD=`gcc -print-file-name=libasan.so` 3 | #LD_PRELOAD_CMD=`clang -print-file-name=libclang_rt.asan-x86_64.so` 4 | 5 | all: 6 | #LD_PRELOAD=$(LD_PRELOAD_CMD) python simple_tokenize.py 7 | python simple_tokenize.py 8 | 9 | -------------------------------------------------------------------------------- /example/batch_tokenize.py: -------------------------------------------------------------------------------- 1 | import jagger 2 | 3 | model_path = "model/kwdlc/patterns" 4 | 5 | tokenizer = jagger.Jagger() 6 | tokenizer.load_model(model_path) 7 | 8 | text = """ 9 | 吾輩は猫である。 10 | 名前はまだない。 11 | 明日の天気は晴れです。 12 | """ 13 | 14 | # optional: set C++ threads(CPU cores) to use 15 | # default: Use all CPU cores. 
16 | tokenizer.set_threads(4) 17 | 18 | toks_list = tokenizer.tokenize_batch(text) 19 | 20 | for toks in toks_list: 21 | for tok in toks: 22 | print(tok) 23 | 24 | print("EOS") 25 | 26 | -------------------------------------------------------------------------------- /example/simple_tokenize.py: -------------------------------------------------------------------------------- 1 | import jagger 2 | 3 | model_path = "model/kwdlc/patterns" 4 | 5 | tokenizer = jagger.Jagger() 6 | tokenizer.load_model(model_path) 7 | 8 | text = "吾輩は猫である。名前はまだない。" 9 | toks = tokenizer.tokenize(text) 10 | 11 | for tok in toks: 12 | # print surface + TAB + feature 13 | print(tok) 14 | 15 | # or you can print surface and feature independently. 16 | #print(tok.surface(), tok.feature()) 17 | 18 | print("EOS") 19 | 20 | 21 | for tok in toks: 22 | print(tok.surface()) 23 | 24 | # print tag(split feature string by comma) 25 | # 26 | # optional: Set quote char(UTF-8 single char) in feature string(CSV line). default '"' 27 | # set_quote_char() must be called for each Token instance, since 28 | # tag decomposition from feature string is done on the fly. 29 | # 30 | # tok.set_quote_char('\'') 31 | # tok.set_quote_char('”') # zenkaku-quote 32 | 33 | for i in range(tok.n_tags()): 34 | print(" tag[{}] = {}".format(i, tok.tag(i))) 35 | 36 | print("EOS") 37 | -------------------------------------------------------------------------------- /jagger.BSD: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022 Naoki Yoshinaga 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the 14 | distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /jagger.GPL: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. 
(Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. 
We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /jagger.LGPL: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price.
Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. 
We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. 
A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 
127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. 
You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. 
But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. 
You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 
258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. 
Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 
317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. 
Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. 
For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. 
The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 
446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS 459 | 460 | How to Apply These Terms to Your New Libraries 461 | 462 | If you develop a new library, and you want it to be of the greatest 463 | possible use to the public, we recommend making it free software that 464 | everyone can redistribute and change. You can do so by permitting 465 | redistribution under these terms (or, alternatively, under the terms of the 466 | ordinary General Public License). 467 | 468 | To apply these terms, attach the following notices to the library. It is 469 | safest to attach them to the start of each source file to most effectively 470 | convey the exclusion of warranty; and each file should have at least the 471 | "copyright" line and a pointer to where the full notice is found. 472 | 473 | 474 | Copyright (C) 475 | 476 | This library is free software; you can redistribute it and/or 477 | modify it under the terms of the GNU Lesser General Public 478 | License as published by the Free Software Foundation; either 479 | version 2.1 of the License, or (at your option) any later version. 480 | 481 | This library is distributed in the hope that it will be useful, 482 | but WITHOUT ANY WARRANTY; without even the implied warranty of 483 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 484 | Lesser General Public License for more details. 485 | 486 | You should have received a copy of the GNU Lesser General Public 487 | License along with this library; if not, write to the Free Software 488 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 489 | 490 | Also add information on how to contact you by electronic and paper mail. 491 | 492 | You should also get your employer (if you work as a programmer) or your 493 | school, if any, to sign a "copyright disclaimer" for the library, if 494 | necessary. Here is a sample; alter the names: 495 | 496 | Yoyodyne, Inc., hereby disclaims all copyright interest in the 497 | library `Frob' (a library for tweaking knobs) written by James Random Hacker. 498 | 499 | , 1 April 1990 500 | Ty Coon, President of Vice 501 | 502 | That's all there is to it! 503 | -------------------------------------------------------------------------------- /jagger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lighttransport/jagger-python/ec01614a131e7d161930f1823b7ce1585630933b/jagger.png -------------------------------------------------------------------------------- /jagger/__init__.py: -------------------------------------------------------------------------------- 1 | from jagger_ext import * 2 | 3 | # load setuptools_scm generated _version.py 4 | try: 5 | from ._version import version, __version__ 6 | from ._version import version_tuple 7 | except: 8 | __version__ = version = '0.0.0.dev' 9 | __version_tuple__ = version_tuple = (0, 0, 0, 'dev', 'git') 10 | 11 | import os 12 | import sys 13 | from pathlib import Path 14 | 15 | class Jagger: 16 | def __init__(self): 17 | 18 | self._tagger = JaggerExt() 19 | 20 | def load_model(self, dict_path: Path): 21 | self._tagger.load_model(str(dict_path)) 22 | 23 | def tokenize(self, s: str): 24 | return self._tagger.tokenize(s) 25 | 26 | def tokenize_batch(self, s: str): 27 | if
isinstance(s, list): 28 | s = '\n'.join(s) 29 | # strip redundant '\n'(if input is a list of text which endswith '\n' 30 | s.replace('\n\n', '\n') 31 | 32 | return self._tagger.tokenize_batch(s) 33 | 34 | def set_threads(self, n: int): 35 | return self._tagger.set_threads(n) 36 | 37 | 38 | -------------------------------------------------------------------------------- /jagger/ccedar_core.h: -------------------------------------------------------------------------------- 1 | // ccedar -- C++ implementation of Character-wise, Efficiently-updatable Double ARray trie (minimal version for Jagger) 2 | // $Id: ccedar_core.h 2025 2022-12-16 06:18:29Z ynaga $ 3 | // Copyright (c) 2022 Naoki Yoshinaga 4 | #ifndef CCEDAR_CORE_H 5 | #define CCEDAR_CORE_H 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace ccedar { 12 | // typedefs 13 | template struct to_unsigned; 14 | template <> struct to_unsigned { typedef unsigned char type; }; 15 | template <> struct to_unsigned { typedef unsigned int type; }; 16 | template size_t key_len (const T* p); 17 | template <> size_t key_len (const char *p) { return std::strlen (p); } 18 | // dynamic double array 19 | template 25 | class da { 26 | public: 27 | enum { MAX_KEY_CODE = 1 << MAX_KEY_BITS, MAX_ALLOC_SIZE = MAX_KEY_CODE << 4 }; 28 | enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH }; 29 | typedef typename to_unsigned ::type ukey_type; 30 | typedef value_type result_type; 31 | struct node { 32 | union { int base; value_type value; }; // negative means prev empty index 33 | int check; // negative means next empty index 34 | node (int base_ = 0, int check_ = 0) : base (base_), check (check_) {} 35 | }; 36 | struct ninfo { // x1.5 update speed; x2 memory (8n -> 16n); can be 12n 37 | ukey_type sibling; // right sibling (= 0 if not exist) 38 | ukey_type child; // first child 39 | ninfo () : sibling (0), child (0) {} 40 | }; 41 | struct block { // a block w/ sizeof (key_type) << 8 elements 42 | int 
prev; // prev block; 3 bytes 43 | int next; // next block; 3 bytes 44 | int num; // # empty elements; 0 - sizeof (key_type) << 8 45 | int ok; // minimum # branching failed to locate - 1; soft limit 46 | int trial; // # trial 47 | int ehead; // first empty item 48 | block () : prev (0), next (0), num (MAX_KEY_CODE), ok (MAX_KEY_CODE), trial (0), ehead (0) {} 49 | }; 50 | da () : _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _ok () 51 | { _initialize (); } 52 | ~da () { clear (); } 53 | // interfance 54 | template 55 | T exactMatchSearch (const key_type* key) const 56 | { return exactMatchSearch (key, key_len (key)); } 57 | template 58 | T exactMatchSearch (const key_type* key, size_t len, size_t from = 0) const { 59 | union { int i; value_type x; } b; 60 | size_t pos = 0; 61 | b.i = _find (key, from, pos, len); 62 | if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE; 63 | T result; 64 | _set_result (&result, b.x, len, from); 65 | return result; 66 | } 67 | template 68 | size_t commonPrefixSearch (const key_type* key, T* result, size_t result_len) const 69 | { return commonPrefixSearch (key, result, result_len, key_len (key)); } 70 | template 71 | size_t commonPrefixSearch (const key_type* key, T* result, size_t result_len, size_t len, size_t from = 0) const { 72 | size_t num = 0; 73 | for (size_t pos = 0; pos < len; ) { 74 | union { int i; value_type x; } b; 75 | b.i = _find (key, from, pos, pos + 1); 76 | if (b.i == CEDAR_NO_VALUE) continue; 77 | if (b.i == CEDAR_NO_PATH) return num; 78 | if (num < result_len) _set_result (&result[num], b.x, pos, from); 79 | ++num; 80 | } 81 | return num; 82 | } 83 | value_type traverse (const key_type* key, size_t& from, size_t& pos) const 84 | { return traverse (key, from, pos, key_len (key)); } 85 | value_type traverse (const key_type* key, size_t& from, size_t& pos, size_t len) const { 86 | union { int i; value_type x; } b; 87 | b.i = _find (key, from, pos, len); 88 | return b.x; 89 | } 90 | value_type& update 
(const key_type* key) 91 | { return update (key, key_len (key)); } 92 | value_type& update (const key_type* key, size_t len, value_type val = value_type (0)) 93 | { size_t from (0), pos (0); return update (key, from, pos, len, val); } 94 | value_type& update (const key_type* key, size_t& from, size_t& pos, size_t len, value_type val) { 95 | if (! len && ! from) 96 | _err (__FILE__, __LINE__, "failed to insert zero-length key\n"); 97 | for (const ukey_type* const key_ = reinterpret_cast (key); 98 | pos < len; ++pos) { 99 | from = static_cast (_follow (from, key_[pos])); 100 | } 101 | const int to = _follow (from, 0); 102 | return _array[to].value += val; 103 | } 104 | int save (const char* fn, const char* mode = "wb") const { 105 | FILE* fp = std::fopen (fn, mode); 106 | if (! fp) return -1; 107 | std::fwrite (_array.data(), sizeof (node), static_cast (_size), fp); 108 | std::fclose (fp); 109 | return 0; 110 | } 111 | int open (const char* fn, const char* mode = "rb") { 112 | FILE* fp = std::fopen (fn, mode); 113 | if (! fp) return -1; 114 | // get size 115 | if (std::fseek (fp, 0, SEEK_END) != 0) return -1; 116 | const size_t size_ = static_cast (std::ftell (fp)) / sizeof (node); 117 | if (std::fseek (fp, 0, SEEK_SET) != 0) return -1; 118 | // set array 119 | //_array = static_cast (std::malloc (sizeof (node) * size_)); 120 | _array.resize(size_); 121 | if (size_ != std::fread (_array.data(), sizeof (node), size_, fp)) return -1; 122 | std::fclose (fp); 123 | _size = static_cast (size_); 124 | return 0; 125 | } 126 | void set_array (const void* p, size_t nbytes_) { 127 | clear (false); 128 | //_array = const_cast(static_cast (p)); 129 | _array.resize(nbytes_ / sizeof(node)); 130 | memcpy(_array.data(), p, nbytes_); 131 | _size = static_cast (nbytes_ / sizeof(node)); 132 | //_no_delete = true; 133 | } 134 | const void* array () const { return _array.data(); } 135 | void clear (const bool reuse = true) { 136 | //if (_array && ! 
_no_delete) std::free (_array); 137 | //if (_ninfo) std::free (_ninfo); 138 | //if (_block) std::free (_block); 139 | //_array = 0; _ninfo = 0; _block = 0; 140 | _array.clear(); 141 | _ninfo.clear(); 142 | _block.clear(); 143 | 144 | _bheadF = _bheadC = _bheadO = _capacity = _size = 0; 145 | if (reuse) _initialize (); 146 | //_no_delete = false; 147 | } 148 | private: 149 | // currently disabled; implement these if you need 150 | da (const da&); 151 | da& operator= (const da&); 152 | //node* _array; 153 | //ninfo* _ninfo; 154 | //block* _block; 155 | std::vector _array; 156 | std::vector _ninfo; 157 | std::vector _block; 158 | int _bheadF{0}; // first block of Full; 0 159 | int _bheadC{0}; // first block of Closed; 0 if no Closed 160 | int _bheadO{0}; // first block of Open; 0 if no Open 161 | int _capacity{0}; 162 | int _size{0}; 163 | //int _no_delete{false}; // deprecated 164 | int _ok[MAX_KEY_CODE + 1]; 165 | // 166 | static void _err (const char* fn, const int ln, const char* msg) 167 | { std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg); std::exit (1); } 168 | #if 0 169 | template 170 | static void _realloc_array (T*& p, const int size_n, const int size_p = 0) { 171 | void* tmp = std::realloc (p, sizeof (T) * static_cast (size_n)); 172 | if (! 
tmp) 173 | std::free (p), _err (__FILE__, __LINE__, "memory reallocation failed\n"); 174 | p = static_cast (tmp); 175 | static const T T0 = T (); 176 | for (T* q (p + size_p), * const r (p + size_n); q != r; ++q) *q = T0; 177 | } 178 | #endif 179 | template 180 | static void _resize_array (std::vector & p, const int size_n, const int size_p = 0) { 181 | p.resize(size_n); 182 | 183 | static const T T0 = T (); 184 | for (size_t i = size_p; i < size_n; i++) { 185 | p[i] = T0; 186 | } 187 | } 188 | 189 | void _initialize () { // initialize the first special block 190 | _resize_array (_array, MAX_KEY_CODE, MAX_KEY_CODE); 191 | _resize_array (_ninfo, MAX_KEY_CODE); 192 | _resize_array (_block, 1); 193 | _array[0] = node (0, -1); 194 | for (int i = 1; i < MAX_KEY_CODE; ++i) 195 | _array[i] = node (i == 1 ? -(MAX_KEY_CODE - 1) : - (i - 1), i == (MAX_KEY_CODE - 1) ? -1 : - (i + 1)); 196 | _block[0].ehead = 1; // bug fix for erase 197 | _capacity = _size = MAX_KEY_CODE; 198 | for (size_t i = 0; i <= MAX_KEY_CODE; ++i) _ok[i] = static_cast (i); 199 | } 200 | // follow/create edge 201 | int _follow (size_t& from, const ukey_type& label) { 202 | int to = 0; 203 | const int base = _array[from].base; 204 | if (base < 0 || _array[to = base ^ label].check < 0) { 205 | to = _pop_enode (base, label, static_cast (from)); 206 | _push_sibling (from, to ^ label, label, base >= 0); 207 | } else if (_array[to].check != static_cast (from)) 208 | to = _resolve (from, base, label); 209 | return to; 210 | } 211 | // find key from double array 212 | int _find (const key_type* key, size_t& from, size_t& pos, const size_t len) const { 213 | for (const ukey_type* const key_ = reinterpret_cast (key); 214 | pos < len; ) { // follow link 215 | size_t to = static_cast (_array[from].base); to ^= key_[pos]; 216 | if (_array[to].check != static_cast (from)) return CEDAR_NO_PATH; 217 | ++pos; 218 | from = to; 219 | } 220 | const node n = _array[_array[from].base ^ 0]; 221 | if (n.check != static_cast 
(from)) return CEDAR_NO_VALUE; 222 | return n.base; 223 | } 224 | void _set_result (result_type* x, value_type r, size_t = 0, size_t = 0) const 225 | { *x = r; } 226 | void _pop_block (const int bi, int& head_in, const bool last) { 227 | if (last) { // last one poped; Closed or Open 228 | head_in = 0; 229 | } else { 230 | const block& b = _block[bi]; 231 | _block[b.prev].next = b.next; 232 | _block[b.next].prev = b.prev; 233 | if (bi == head_in) head_in = b.next; 234 | } 235 | } 236 | void _push_block (const int bi, int& head_out, const bool empty) { 237 | block& b = _block[bi]; 238 | if (empty) { // the destination is empty 239 | head_out = b.prev = b.next = bi; 240 | } else { // use most recently pushed 241 | int& tail_out = _block[head_out].prev; 242 | b.prev = tail_out; 243 | b.next = head_out; 244 | head_out = tail_out = _block[tail_out].next = bi; 245 | } 246 | } 247 | int _add_block () { 248 | if (_size == _capacity) { // allocate memory if needed 249 | _capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size; 250 | _resize_array (_array, _capacity, _capacity); 251 | _resize_array (_ninfo, _capacity, _size); 252 | _resize_array (_block, _capacity >> MAX_KEY_BITS, _size >> MAX_KEY_BITS); 253 | } 254 | _block[_size >> MAX_KEY_BITS].ehead = _size; 255 | _array[_size] = node (- (_size + (MAX_KEY_CODE - 1)), - (_size + 1)); 256 | for (int i = _size + 1; i < _size + (MAX_KEY_CODE - 1); ++i) 257 | _array[i] = node (-(i - 1), -(i + 1)); 258 | _array[_size + (MAX_KEY_CODE - 1)] = node (- (_size + (MAX_KEY_CODE - 2)), -_size); 259 | _push_block (_size >> MAX_KEY_BITS, _bheadO, ! _bheadO); // append to block Open 260 | _size += MAX_KEY_CODE; 261 | return (_size >> MAX_KEY_BITS) - 1; 262 | } 263 | // transfer block from one start w/ head_in to one start w/ head_out 264 | void _transfer_block (const int bi, int& head_in, int& head_out) { 265 | _pop_block (bi, head_in, bi == _block[bi].next); 266 | _push_block (bi, head_out, ! 
head_out && _block[bi].num); 267 | } 268 | // pop empty node from block; never transfer the special block (bi = 0) 269 | int _pop_enode (const int base, const ukey_type label, const int from) { 270 | const int e = base < 0 ? _find_place () : base ^ label; 271 | const int bi = e >> MAX_KEY_BITS; 272 | node& n = _array[e]; 273 | block& b = _block[bi]; 274 | if (--b.num == 0) { 275 | if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full 276 | } else { // release empty node from empty ring 277 | _array[-n.base].check = n.check; 278 | _array[-n.check].base = n.base; 279 | if (e == b.ehead) b.ehead = -n.check; // set ehead 280 | if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed 281 | _transfer_block (bi, _bheadO, _bheadC); 282 | } 283 | // initialize the released node 284 | if (label) n.base = -1; else n.value = value_type (0); n.check = from; 285 | if (base < 0) _array[from].base = e ^ label; 286 | return e; 287 | } 288 | // push empty node into empty ring 289 | void _push_enode (const int e) { 290 | const int bi = e >> MAX_KEY_BITS; 291 | block& b = _block[bi]; 292 | if (++b.num == 1) { // Full to Closed 293 | b.ehead = e; 294 | _array[e] = node (-e, -e); 295 | if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed 296 | } else { 297 | const int prev = b.ehead; 298 | const int next = -_array[prev].check; 299 | _array[e] = node (-prev, -next); 300 | _array[prev].check = _array[next].base = -e; 301 | if (b.num == 2 || b.trial == MAX_TRIAL) // Closed to Open 302 | if (bi) _transfer_block (bi, _bheadC, _bheadO); 303 | b.trial = 0; 304 | } 305 | if (b.ok < _ok[b.num]) b.ok = _ok[b.num]; 306 | _ninfo[e] = ninfo (); // reset ninfo; no child, no sibling 307 | } 308 | // push label to from's child 309 | void _push_sibling (const size_t from, const int base, const ukey_type label, const bool flag = true) { 310 | ukey_type* c = &_ninfo[from].child; 311 | if (flag && ! 
*c) 312 | c = &_ninfo[base ^ *c].sibling; 313 | _ninfo[base ^ label].sibling = *c, *c = label; 314 | } 315 | // pop label from from's child 316 | void _pop_sibling (const size_t from, const int base, const ukey_type label) { 317 | ukey_type* c = &_ninfo[from].child; 318 | while (*c != label) c = &_ninfo[base ^ *c].sibling; 319 | *c = _ninfo[base ^ label].sibling; 320 | } 321 | // check whether to replace branching w/ the newly added node 322 | bool _consult (const int base_n, const int base_p, ukey_type c_n, ukey_type c_p) const { 323 | do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false; 324 | while ((c_n = _ninfo[base_n ^ c_n].sibling)); 325 | return true; 326 | } 327 | // enumerate (equal to or more than one) child nodes 328 | ukey_type* _set_child (ukey_type* p, const int base, ukey_type c, const int label = -1) { 329 | --p; 330 | if (! c) { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal 331 | if (label != -1) *++p = static_cast (label); 332 | while (c) { *++p = c; c = _ninfo[base ^ c].sibling; } 333 | return p; 334 | } 335 | // explore new block to settle down 336 | int _find_place () { 337 | if (_bheadC) return _block[_bheadC].ehead; 338 | if (_bheadO) return _block[_bheadO].ehead; 339 | return _add_block () << MAX_KEY_BITS; 340 | } 341 | int _find_place (const ukey_type* const first, const ukey_type* const last) { 342 | if (int bi = _bheadO) { 343 | const int bz = _block[_bheadO].prev; 344 | const int nc = static_cast (last - first + 1); 345 | while (1) { // set candidate block 346 | block& b = _block[bi]; 347 | if (b.num >= nc && nc <= b.ok) // explore configuration 348 | for (int e = b.ehead;;) { 349 | const int base = e ^ *first; 350 | for (const ukey_type* p = first; _array[base ^ *++p].check < 0; ) 351 | if (p == last) return b.ehead = e; // no conflict 352 | if ((e = -_array[e].check) == b.ehead) break; 353 | } 354 | b.ok = nc - 1; // mod 355 | if (b.ok < _ok[b.num]) _ok[b.num] = b.ok; 356 | const int bi_ = b.next; 357 | if (++b.trial 
== MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC); 358 | if (bi == bz) break; 359 | bi = bi_; 360 | }; 361 | } 362 | return _add_block () << MAX_KEY_BITS; 363 | } 364 | // resolve conflict on base_n ^ label_n = base_p ^ label_p 365 | int _resolve (size_t& from_n, const int base_n, const ukey_type label_n) { 366 | // examine siblings of conflicted nodes 367 | const int to_pn = base_n ^ label_n; 368 | const int from_p = _array[to_pn].check; 369 | const int base_p = _array[from_p].base; 370 | const bool flag // whether to replace siblings of newly added 371 | = _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child); 372 | ukey_type child[MAX_KEY_CODE]; 373 | ukey_type* const first = &child[0]; 374 | ukey_type* const last = 375 | flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n) 376 | : _set_child (first, base_p, _ninfo[from_p].child); 377 | const int base = 378 | (first == last ? _find_place () : _find_place (first, last)) ^ *first; 379 | // replace & modify empty list 380 | const int from = flag ? static_cast (from_n) : from_p; 381 | const int base_ = flag ? base_n : base_p; 382 | if (flag && *first == label_n) _ninfo[from].child = label_n; // new child 383 | _array[from].base = base; // new base 384 | for (const ukey_type* p = first; p <= last; ++p) { // to_ => to 385 | const int to = _pop_enode (base, *p, from); 386 | const int to_ = base_ ^ *p; 387 | _ninfo[to].sibling = (p == last ? 0 : *(p + 1)); 388 | if (flag && to_ == to_pn) continue; // skip newcomer (no child) 389 | node& n = _array[to]; 390 | node& n_ = _array[to_]; 391 | if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix 392 | ukey_type c = _ninfo[to].child = _ninfo[to_].child; 393 | do _array[n.base ^ c].check = to; // adjust grand son's check 394 | while ((c = _ninfo[n.base ^ c].sibling)); 395 | } 396 | if (! flag && to_ == static_cast (from_n)) // parent node moved 397 | from_n = static_cast (to); // bug fix 398 | if (! 
flag && to_ == to_pn) { // the address is immediately used 399 | _push_sibling (from_n, to_pn ^ label_n, label_n); 400 | _ninfo[to_].child = 0; // remember to reset child 401 | if (label_n) n_.base = -1; else n_.value = value_type (0); 402 | n_.check = static_cast (from_n); 403 | } else 404 | _push_enode (to_); 405 | } 406 | return flag ? base ^ label_n : to_pn; 407 | } 408 | }; 409 | } 410 | #endif 411 | -------------------------------------------------------------------------------- /jagger/jagger.h: -------------------------------------------------------------------------------- 1 | // Jagger -- deterministic pattern-based Japanese tagger 2 | // $Id: jagger.h 2028 2023-01-30 05:39:25Z ynaga $ 3 | // Copyright (c) 2022 Naoki Yoshinaga 4 | #ifndef JAGGER_H 5 | #define JAGGER_H 6 | 7 | #ifdef _WIN32 8 | #define WIN32_LEAN_AND_MEAN 9 | #ifndef NOMINMAX 10 | #define NOMINMAX 11 | #endif 12 | #include 13 | #include 14 | #include 15 | #endif 16 | 17 | #if !defined(_WIN32) 18 | #include 19 | #endif 20 | 21 | #if defined(JAGGER_USE_MMAP_IO) 22 | #if !defined(_WIN32) 23 | #include 24 | #include 25 | //#include 26 | #endif 27 | #endif 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | // 42 | #include "ccedar_core.h" 43 | 44 | #ifdef HAVE_CONFIG_H 45 | #include "config.h" 46 | #endif 47 | 48 | #ifndef JAGGER_DEFAULT_MODEL 49 | #define JAGGER_DEFAULT_MODEL "model/kwdlc" 50 | #endif 51 | 52 | #ifndef NUM_POS_FIELD 53 | // mecab style 54 | #define NUM_POS_FIELD 4 55 | #endif 56 | 57 | 58 | 59 | static void my_errx(int retcode, const char *fmt, const char *s) 60 | { 61 | fprintf(stderr, "jagger: "); 62 | fprintf(stderr, fmt, s); 63 | fprintf(stderr, "\n"); 64 | exit(retcode); 65 | } 66 | 67 | static const size_t BUF_SIZE = 1 << 18; 68 | static const size_t CP_MAX = 0x10ffff; 69 | static const size_t MAX_PLEN = 1 << 6; 70 | 71 | static const char* FEAT_UNK = 
"\x09\xE5\x90\x8D\xE8\xA9\x9E\x2C\xE6\x99\xAE\xE9\x80\x9A\xE5\x90\x8D\xE8\xA9\x9E\x2C\x2A\x2C\x2A"; 72 | 73 | // compute length of UTF8 character *p 74 | static inline int u8_len (const char *p) { 75 | static const uint8_t u8bytes[256] = { // must be static to tame compilers 76 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 77 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 78 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 79 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 80 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 81 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 82 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 83 | 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 84 | }; 85 | return u8bytes[static_cast (*p)]; 86 | } 87 | 88 | // examine UTF8 sequence p consist of only num / alpha / kana characters 89 | static inline int char_type (const char* p, const char* end, const ccedar::da & chars) { 90 | int b (u8_len (p)), n (chars.exactMatchSearch (p, b)); 91 | if (n == -1) return 3; 92 | while ((p += b) != end) 93 | if (chars.exactMatchSearch (p, b = u8_len (p)) != n) return 3; 94 | return n; 95 | } 96 | 97 | // convert UTF-8 char to code point 98 | static inline int unicode (const char* p, int& b) { 99 | const unsigned char *p_ = reinterpret_cast (p); 100 | const int p0 (p_[0]), p1 (p_[1]), p2 (p_[2]), p3 (p_[3]); 101 | switch (b = u8_len (p)) { 102 | case 1: return p0 & 0x7f; 103 | case 2: return ((p0 & 0x1f) << 6) | (p1 & 0x3f); 104 | case 3: return ((p0 & 0xf) << 12) | ((p1 & 0x3f) << 6) | (p2 & 0x3f); 105 | case 4: return ((p0 & 0x7) << 18) | ((p1 & 0x3f) << 12) | ((p2 & 0x3f) << 6) | (p3 & 0x3f); 106 | default: my_errx (1, "UTF-8 decode error: %s", p); 107 | } 108 | return 0; 109 | } 110 | 111 | static const char* skip_to (const char* p, const size_t n, const char c) { 112 | for (size_t i = 0; i < 
n; ++i, ++p) 113 | while (*p != c && *p != '\n') ++p; 114 | return p; 115 | } 116 | 117 | class sbag_t { 118 | private: 119 | ccedar::da _str2id; 120 | std::vector _id2str; 121 | public: 122 | sbag_t () : _str2id (), _id2str () {} 123 | sbag_t (const char *f) : _str2id (), _id2str () { to_i (f, std::strlen (f)); } 124 | ~sbag_t () {} 125 | const std::string& to_s (const size_t fi) const { return _id2str[fi]; } 126 | size_t to_i (const std::string& f) { return to_i (f.c_str (), f.size ()); } 127 | size_t to_i (const char *f, const size_t len) { 128 | int &n = _str2id.update (f, len); 129 | if (n) return n - 1; 130 | _id2str.push_back (std::string (f, len)); 131 | return static_cast ((n = static_cast (_id2str.size ())) - 1); 132 | } 133 | int find (const char* f, const size_t len) const 134 | { return _str2id.exactMatchSearch (f, len); } 135 | size_t size () const { return _id2str.size (); } 136 | void serialize (std::vector & ret, std::vector & offsets) { 137 | for (std::vector ::const_iterator it = _id2str.begin (); 138 | it != _id2str.end (); ++it) { 139 | const uint16_t len = static_cast (it->size ()); 140 | size_t offset = ret.size (); 141 | offsets.push_back (offset); 142 | #ifdef USE_COMPACT_DICT 143 | ret.resize (offset + sizeof (uint16_t) + len); 144 | std::memcpy (&ret[offset], &len, sizeof (uint16_t)); 145 | offset += sizeof (uint16_t); 146 | #else 147 | ret.resize (offset + len); 148 | #endif 149 | std::memcpy (&ret[offset], it->c_str (), len); 150 | } 151 | } 152 | }; 153 | 154 | class simple_reader { 155 | private: 156 | const int _fd; 157 | char* _buf; 158 | size_t _start, _end, _size, _capacity; // ..._start..._end..._size..._capacity 159 | public: 160 | simple_reader (const char* fn = 0, size_t size = BUF_SIZE) : _fd (fn ? 
::open (fn, O_RDONLY) : 0), _buf (static_cast (std::malloc (sizeof (char) * size))), _start (0), _end (0), _size (::read (_fd, _buf, size)), _capacity (size) 161 | { if (_fd == -1) std::free (_buf), my_errx (1, "no such file: %s", fn); } 162 | ~simple_reader () { std::free (_buf); } 163 | size_t gets (char** line) { 164 | if (! _size) return 0; 165 | do { // search '\n' in the buffer 166 | if (void *p = std::memchr (_buf + _end, '\n', _size - _end)) { 167 | *line = _buf + _start; 168 | _start = _end = static_cast (p) - _buf + 1; 169 | return _buf + _end - *line; 170 | } 171 | _end = _size - _start; 172 | if (_start) { // prepare space for loading more data 173 | std::memmove (_buf, _buf + _start, _size - _start); 174 | _size -= _start; _start = 0; 175 | } else // buffer is too short to read a single line 176 | _buf = static_cast (std::realloc (_buf, _capacity <<= 1)); 177 | if (size_t size = ::read (_fd, _buf + _size, _capacity - _size)) { 178 | _size += size; // read some data 179 | } else { // EOF or premature INPUT 180 | *line = _buf + _start; 181 | _size = 0; // end loop 182 | return _end - _start; 183 | } 184 | } while (1); 185 | } 186 | }; 187 | #endif 188 | -------------------------------------------------------------------------------- /jagger/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import jagger 4 | import argparse 5 | 6 | def is_valid_model_file(parser, arg): 7 | if not os.path.isfile(arg): 8 | parser.error('The model file `{}` does not exist!'.format(arg)) 9 | else: 10 | return arg 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser("Python binding of Jagger.") 14 | parser.add_argument('-m', '--model', metavar='FILE', type=lambda x: is_valid_model_file(parser, x), default=jagger.default_model_path, 15 | help="Path to model(dict) file. 
When `-m/--model` is not speicified and the default model file(``) is not found in the system, it will raise an error.".format(jagger.default_model_path)) 16 | 17 | args = parser.parse_args() 18 | 19 | print(args.model) 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | # NOTE: setuptools_scm>=8 is not supported in py3.6 cibuildwheel env. 4 | # so use older setuptools_scm for a while 5 | #"setuptools>=64", 6 | #"setuptools_scm>=8", 7 | "setuptools>=45", 8 | "setuptools_scm[toml]<8", 9 | "wheel", 10 | "pybind11>=2.10.0", 11 | ] 12 | build-backend = "setuptools.build_meta" 13 | 14 | [tool.black] 15 | line-length = 140 16 | 17 | [project] 18 | name = "jagger" 19 | 20 | # Use setuptools_scm 21 | dynamic = ["version"] 22 | 23 | readme = {file = "README.md", content-type = "text/markdown"} 24 | 25 | 26 | [project.scripts] 27 | jagger = "jagger:main" 28 | 29 | [tool.setuptools_scm] 30 | # setuptools_scm>=8 31 | #version_file = "jagger/_version.py" 32 | 33 | # setuptools_scm<8 34 | write_to = "jagger/_version.py" 35 | -------------------------------------------------------------------------------- /python-binding-train-jagger.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from setuptools import setup 4 | from pybind11.setup_helpers import Pybind11Extension 5 | 6 | # Should be False in the release 7 | dev_mode = False 8 | 9 | jagger_compile_args=[ 10 | ] 11 | 12 | if sys.platform.startswith('win32'): 13 | # Assume MSVC 14 | pass 15 | else: 16 | jagger_compile_args.append("-std=c++11") 17 | 18 | 19 | if 
dev_mode:
    jagger_compile_args.append('-O0')
    jagger_compile_args.append('-g')
    jagger_compile_args.append('-fsanitize=address')

ext_modules = [
    Pybind11Extension("jagger_ext", ["jagger/python-binding-jagger.cc"],
      include_dirs=['.'],
      extra_compile_args=jagger_compile_args,
    ),
]

setup(
    name="jagger",
    packages=['jagger'],
    # version is now set by setuptools_scm
    #version="v0.1.17",
    ext_modules=ext_modules,
    long_description=open("./README.md", 'r', encoding='utf8').read(),
    long_description_content_type='text/markdown',
    # NOTE: entry_points are set in pyproject.toml
    #entry_points={
    #    'console_scripts': [
    #        "jagger=jagger.main:main"
    #    ]
    #},
    license_files= ('LICENSE', 'jagger.BSD', 'jagger.GPL', 'jagger.LGPL'),
    install_requires=[])

--------------------------------------------------------------------------------
/train/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build script for the standalone `train_jagger` pattern extractor.
cmake_minimum_required(VERSION 3.16)

set(EXE_TARGET "train_jagger")
project(${EXE_TARGET} CXX)

# cmake modules (shared with the top-level project, one directory up)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../cmake)
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/../cmake/sanitizers)
find_package(Sanitizers) # Address sanitizer (-DSANITIZE_ADDRESS=ON)


set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

add_executable(${EXE_TARGET} train_jagger.cc)

# find_package(Sanitizers) only provides add_sanitizers(); the flags selected
# with -DSANITIZE_ADDRESS=ON etc. are applied only to targets passed to it.
# Guarded so configuration still succeeds when the module is unavailable.
if(COMMAND add_sanitizers)
  add_sanitizers(${EXE_TARGET})
endif()

# jagger.h / ccedar_core.h live in the Python package directory.
target_include_directories(${EXE_TARGET} PRIVATE ../jagger)

target_compile_definitions(${EXE_TARGET} PRIVATE "JAGGER_DEFAULT_MODEL=\"/usr/local/lib/jagger/model/kwdlc\"")

# Number of leading comma-separated POS fields in a feature string
# (train_jagger.cc uses it to cut the POS-only part of each feature).
target_compile_definitions(${EXE_TARGET} PRIVATE "NUM_POS_FIELD=4")
#target_compile_definitions(${EXE_TARGET} PRIVATE "USE_JUMANDIC=1")

# [VisualStudio]
if(WIN32)
  # Set ${EXE_TARGET} as a startup project for VS IDE
  set_property(DIRECTORY PROPERTY VS_STARTUP_PROJECT ${EXE_TARGET})

  # For easier debugging in VS IDE(cmake 3.8.0 or later required) Set working
  # directory where CMakeLists.txt is placed.
  if(CMAKE_VERSION VERSION_GREATER 3.8.0)
    set_target_properties(
      ${EXE_TARGET} PROPERTIES VS_DEBUGGER_WORKING_DIRECTORY
                               "${CMAKE_CURRENT_SOURCE_DIR}")
  endif()
endif()

--------------------------------------------------------------------------------
/train/README.md:
--------------------------------------------------------------------------------
# train model

## Requirements

Unix-like system or Windows.

## Train with KWDLC

See Jagger site for details: https://www.tkl.iis.u-tokyo.ac.jp/~ynaga/jagger/index.en.html

```
$ gawk '{ printf "%s", ($1 == "EOS") ? "\n" : $1 }' model/kwdlc/train.JAG > model/kwdlc/train
$ gawk '{ printf "%s", ($1 == "EOS") ? "\n" : $1 }' model/kwdlc/dev.JAG > model/kwdlc/dev
$ gawk '{ printf "%s", ($1 == "EOS") ? "\n" : $1 }' model/kwdlc/test.JAG > model/kwdlc/test

$ find mecab-jumandic-7.0-20130310 -name "*.csv" | sort | xargs cat > model/kwdlc/dict
$ ./build/train_jagger model/kwdlc/dict model/kwdlc/train.JAG > model/kwdlc/patterns

```

## Train with Vaporetto(W.I.P.)

```
$ python -m pip install vaporetto
$ python -m pip install zstandard

# Download precompiled model
$ wget https://github.com/daac-tools/vaporetto-models/releases/download/v0.5.0/bccwj-suw+unidic_pos+pron.tar.xz
$ tar xvf bccwj-suw+unidic_pos+pron.tar.xz
```

T.B.W.


## Train with CharShu

T.B.W.
--------------------------------------------------------------------------------
/train/bootstrap-linux.sh:
--------------------------------------------------------------------------------
# Configure a native build of train_jagger in ./build.
# Run from the train/ directory (the source dir is given as `-S.`).
curdir=`pwd`

builddir=${curdir}/build

# Always start from a clean build tree.
rm -rf ${builddir}
mkdir ${builddir}

cmake -B${builddir} -S. \
  -DSANITIZE_ADDRESS=0 \
  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
  -DCMAKE_VERBOSE_MAKEFILE=1

--------------------------------------------------------------------------------
/train/bootstrap-llvm-mingw-cross.sh:
--------------------------------------------------------------------------------
# llvm-mingw cross compile
# Assume Ninja is installed on your system
# Run from the train/ directory (the source dir is given as `..` from the build dir).
curdir=`pwd`

# Set path to llvm-mingw in env var.
# https://github.com/mstorsjo/llvm-mingw
export LLVM_MINGW_DIR=/mnt/data/local/llvm-mingw-20231128-ucrt-ubuntu-20.04-x86_64/

builddir=${curdir}/build-llvm-mingw

# Always start from a clean build tree.
rm -rf ${builddir}
mkdir ${builddir}

# The toolchain files are shared with the top-level project and live in
# <repo>/cmake, i.e. one level above train/ (same location that
# train/CMakeLists.txt adds to CMAKE_MODULE_PATH).
cd ${builddir} && cmake \
  -DCMAKE_TOOLCHAIN_FILE=${curdir}/../cmake/llvm-mingw-cross.cmake \
  -G "Ninja" \
  -DCMAKE_VERBOSE_MAKEFILE=1 \
  ..
19 | 20 | cd ${curdir} 21 | -------------------------------------------------------------------------------- /train/tagging.py: -------------------------------------------------------------------------------- 1 | import vaporetto 2 | import zstandard 3 | 4 | dict_path = 'bccwj-suw+unidic_pos+pron/bccwj-suw+unidic_pos+pron.model.zst' 5 | 6 | dctx = zstandard.ZstdDecompressor() 7 | with open(dict_path, 'rb') as fp: 8 | with dctx.stream_reader(fp) as dict_reader: 9 | tokenizer = vaporetto.Vaporetto(dict_reader.read(), predict_tags = True) 10 | 11 | text = '吾輩は猫である' 12 | 13 | toks = tokenizer.tokenize(text) 14 | 15 | for tok in toks: 16 | print("{}\t{}".format(tok.surface(), tok.tag(0))) 17 | 18 | 19 | # Print with jagger friendly format 20 | -------------------------------------------------------------------------------- /train/train_jagger.cc: -------------------------------------------------------------------------------- 1 | // Jagger -- deterministic pattern-based Japanese tagger 2 | // $Id: train_jagger.cc 2031 2023-02-17 21:47:05Z ynaga $ 3 | // Copyright (c) 2022 Naoki Yoshinaga 4 | #include 5 | 6 | #ifdef HAVE_CONFIG_H 7 | #include "config.h" 8 | #endif 9 | 10 | static const char* chars_[] = {"0123456789〇一二三四五六七八九十百千万億兆数・", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ@:/.", "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺーヽヾヿ", 0}; 11 | 12 | struct triple { 13 | int first, second, third; 14 | triple (const int first_, const int second_, const int third_) : first (first_), second (second_), third (third_) {} 15 | }; 16 | 17 | int main (int argc, char** argv) { 18 | std::string train, dict; 19 | { // options (minimal) 20 | if (argc < 3) { 21 | fprintf(stderr, "Usage: %s dict train\n", argv[0]); 22 | exit(-1); 23 | } 24 | dict = argv[1]; 25 | train = argv[2]; 26 | //extern char *optarg; 27 | //extern int optind; 28 | //for (int opt = 0; (opt = getopt (argc, argv, "d:")) != -1; ) 29 | // if (opt == 'd') dict = 
optarg; 30 | //if (optind == argc) errx (1, " extract patterns for Jagger from dictionary and training data\nUsage: %s -d dict train > patterns\n\nOptions:\n -d dict\tdictionary csv", argv[0]); 31 | //train = argv[optind]; 32 | } 33 | ccedar::da chars; 34 | sbag_t fbag, pbag_; 35 | std::vector > si2fi2fi; 36 | std::vector si2fi; 37 | std::vector > > pi2fi2sc; 38 | std::vector fi2c; 39 | size_t max_plen = 0; 40 | char* line = 0; 41 | std::fprintf (stderr, "reading seed patterns from dictionary..."); 42 | { // read seeds from dictionary 43 | simple_reader reader (dict.c_str ()); 44 | while (const size_t len = reader.gets (&line)) { 45 | const char *p (line), *seed (p), *end (p + len - 1); 46 | const bool quoted = *p++ == '"'; 47 | if (quoted) 48 | while (*p != '"') ++p; // for words including , 49 | p = skip_to (p, 1, ','); 50 | max_plen = std::max (static_cast (p - seed - (quoted ? 3 : 1)), max_plen); 51 | const int pi = pbag_.to_i (quoted ? seed + 1 : seed, p - seed - (quoted ? 3 : 1)); 52 | if (pi == si2fi2fi.size ()) si2fi2fi.push_back (std::map ()); 53 | const char *f = skip_to (p, 3, ','); // read features 54 | p = skip_to (f, NUM_POS_FIELD, ',') - 1; 55 | si2fi2fi[pi].insert (std::make_pair (fbag.to_i (f, p - f), 56 | fbag.to_i (f, end - f))); // may not unique 57 | } 58 | fi2c.resize (fbag.size (), 0); 59 | } 60 | std::fprintf (stderr, "done; %zu words, %zu features\n", si2fi2fi.size (), fbag.size ()); 61 | std::fprintf (stderr, "regarding num / alpha / kana as seed patterns..."); 62 | for (int i (0), b (0); chars_[i]; ++i) // read seeds from num / alpha / kana 63 | for (const char *p = &chars_[i][0]; *p; p += b) { 64 | chars.update (p, b = u8_len (p)) = i; 65 | pbag_.to_i (p, b); 66 | } 67 | pi2fi2sc.resize (pbag_.size ()); 68 | const int num_seed = static_cast (pbag_.size ()); 69 | std::fprintf (stderr, "done; # seeds = %d\n", num_seed); 70 | { // enumerate patterns 71 | std::fprintf (stderr, "mining patterns from training data..."); 72 | std::vector 
tokens, pis; 73 | std::string sent; 74 | simple_reader reader (train.c_str ()); 75 | while (const size_t len = reader.gets (&line)) { 76 | if (std::strncmp (line, "EOS\n", 4) == 0) { 77 | char *p (&sent[0]), *end (&sent[0] + sent.size ()); 78 | std::string f_prev ("\tBOS"); 79 | for (std::vector ::const_iterator it = tokens.begin (); it != tokens.end (); ++it, pis.clear ()) { 80 | const int tlen (it->first), fi (it->second), fi_ (it->third); 81 | for (char *q = p + tlen; q <= std::min (p + max_plen, end); q += u8_len (q)) { 82 | pis.push_back (triple (pbag_.to_i (p, q - p), fi, tlen)); 83 | const bool first = pis.back ().first >= pi2fi2sc.size (); 84 | pis.push_back (triple (pbag_.to_i (std::string (p, q - p) + f_prev), fi, tlen)); 85 | if (first) break; // new pattern 86 | } 87 | const int n_ = pbag_.find (p, tlen); // reject tokens > max_plen 88 | if ((n_ == -1 || n_ > num_seed) && char_type (p, p + tlen, chars) != 0) { // POS-only pattern for unseen tokens 89 | if (fi2c.size () <= fi_) fi2c.resize (fi + 1); 90 | fi2c[fi_] += 1; 91 | const int fi__ = fbag.to_i (fbag.to_s (fi_) + ",*.*,*"); 92 | pis.push_back (triple (pbag_.to_i (f_prev), fi__, 0)); 93 | } 94 | pi2fi2sc.resize (pbag_.size ()); 95 | for (std::vector ::const_iterator jt = pis.begin (); jt != pis.end (); ++jt) 96 | ++pi2fi2sc[jt->first].insert (std::make_pair (jt->second, std::make_pair (jt->third, 0))).first->second.second; 97 | f_prev = "\t" + fbag.to_s (fi_); 98 | p += tlen; 99 | } 100 | tokens.clear (); 101 | sent.clear (); 102 | } else { // token 103 | const char *t (line), *f (skip_to (t, 1, '\t')), *p (skip_to (f, NUM_POS_FIELD, ',') - 1), *end (line + len - 1); 104 | tokens.push_back (triple (f - 1 - t, fbag.to_i (f, end - f), fbag.to_i (f, p - f))); 105 | sent += std::string (t, f - 1 - t); 106 | } 107 | } 108 | } 109 | std::fprintf (stderr, "done; %zu pattern candidates\n", pbag_.size ()); 110 | std::map > pi2sf; 111 | ccedar::da patterns; 112 | std::vector > counter; 113 | std::vector > 
pis; 114 | { // pruning patterns 115 | long max_fi = std::max_element (fi2c.begin (), fi2c.end ()) - fi2c.begin (); 116 | for (int i = 0; i < pi2fi2sc.size (); ++i) 117 | pis.push_back (std::make_pair (pbag_.to_s (i), i)); 118 | std::sort (pis.begin (), pis.end ()); 119 | std::fprintf (stderr, "pruning patterns..."); 120 | for (int i = 0; i < pis.size (); ++i) { 121 | const int pi = pis[i].second; 122 | const std::string& p = pbag_.to_s (pi); 123 | int bytes (p.size ()), fi (max_fi), count (0); 124 | if (pi2fi2sc[pi].empty ()) { // unseen patterns (seeds) 125 | if (pi < si2fi2fi.size ()) { // words in dictionary 126 | const std::map & fi2fi = si2fi2fi[pi]; 127 | std::map ::const_iterator jt (fi2fi.begin ()), jt_end (fi2fi.end ()); 128 | size_t max_fic (fi2c[jt->first]), fi_ (jt->first); 129 | for (++jt; jt != jt_end; ++jt) 130 | if (fi2c[jt->first] > max_fic || (fi2c[jt->first] == max_fic)) 131 | fi_ = jt->first, max_fic = fi2c[fi_]; 132 | fi = fi2fi.find (fi_)->second; 133 | } 134 | } else { // perform pruning for seen patterns 135 | const std::map >& fi2sc = pi2fi2sc[pi]; 136 | std::vector s2c (max_plen + 1, 0); 137 | for (std::map >::const_iterator jt = fi2sc.begin (); 138 | jt != fi2sc.end (); ++jt) // bytes to count for pi 139 | s2c[jt->second.first] += jt->second.second, 140 | count += jt->second.second; 141 | size_t max_count = 0; 142 | for (std::vector ::iterator it = s2c.begin (); it != s2c.end (); ++it) 143 | if (*it >= max_count) // =: prefer longer match 144 | max_count = *it, 145 | bytes = std::distance (s2c.begin (), it); 146 | size_t max_sfc = 0; 147 | for (std::map >::const_iterator jt = fi2sc.begin (); 148 | jt != fi2sc.end (); ++jt) 149 | if (jt->second.first == bytes && jt->second.second > max_sfc) 150 | fi = jt->first, max_sfc = jt->second.second; 151 | ccedar::da ::result_type result[MAX_PLEN]; 152 | const int num = patterns.commonPrefixSearch (p.c_str (), &result[0], max_plen, p.size ()); 153 | if (num > 0 && std::make_pair (bytes, fi) == 
pi2sf[result[num - 1]]) // && count < 70) 154 | continue; 155 | } 156 | counter.push_back (std::make_pair (count, -i)); 157 | pi2sf.insert (std::make_pair (pi, std::make_pair (bytes, fi))); 158 | patterns.update (p.c_str (), p.size ()) = static_cast (pi); 159 | } 160 | std::fprintf (stderr, "done; %zu -> %zu patterns\n", pi2fi2sc.size (), pi2sf.size ()); 161 | } 162 | { // output patterns from frequent one to rare one 163 | std::sort (counter.rbegin (), counter.rend ()); 164 | for (std::vector >::const_iterator it = counter.begin (); 165 | it != counter.end (); ++it) { 166 | const size_t pi (pis[-it->second].second), count (it->first), bytes (pi2sf[pi].first); 167 | const std::string &w (pbag_.to_s (pi)), &f (fbag.to_s (pi2sf[pi].second)); 168 | const int ctype = bytes ? char_type (&w[0], &w[0] + bytes, chars) : 0; 169 | std::fprintf (stdout, "%zu\t%s\t%s%zu\t%d\t%s\n", count, w.c_str (), w.find ("\t") == std::string::npos ? "\t" : "", bytes, ctype, f.c_str ()); 170 | } 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /train/vcsetup.bat: -------------------------------------------------------------------------------- 1 | rmdir /s /q build 2 | mkdir build 3 | 4 | cmake -G "Visual Studio 17 2022" -A x64 -Bbuild -H. 5 | --------------------------------------------------------------------------------