├── .build.cmd ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── appveyor.yml ├── build-wheels.sh ├── docs ├── Makefile ├── conf.py └── index.rst ├── rust ├── Cargo.toml ├── rust_fst.h └── src │ ├── lib.rs │ ├── map.rs │ ├── set.rs │ └── util.rs ├── rust_fst ├── __init__.py ├── common.py ├── lib.py ├── map.py └── set.py ├── rust_setuptools.py ├── setup.py ├── test-requirements.txt ├── tests ├── test_map.py └── test_set.py └── tox.ini /.build.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: To build extensions for 64 bit Python 3, we need to configure environment 3 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 4 | :: MS Windows SDK for Windows 7 and .NET Framework 4 5 | :: 6 | :: More details at: 7 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 8 | 9 | IF "%DISTUTILS_USE_SDK%"=="1" ( 10 | ECHO Configuring environment to build with MSVC on a 64bit architecture 11 | ECHO Using Windows SDK 7.1 12 | "C:\Program Files\Microsoft SDKs\Windows\v7.1\Setup\WindowsSdkVer.exe" -q -version:v7.1 13 | CALL "C:\Program Files\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release 14 | SET MSSdk=1 15 | REM Need the following to allow tox to see the SDK compiler 16 | SET TOX_TESTENV_PASSENV=DISTUTILS_USE_SDK MSSdk INCLUDE LIB 17 | ) ELSE ( 18 | ECHO Using default MSVC build environment 19 | ) 20 | 21 | CALL %* 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .cache 3 | /build 4 | /dist 5 | /*.egg-info 6 | *.pyc 7 | Cargo.lock 8 | fstwrapper/target 9 | .tox 10 | /wheelhouse 11 | /docs/_build 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | notifications: 2 | email: false 3 | 
4 | matrix: 5 | include: 6 | - os: osx 7 | - os: linux 8 | sudo: required 9 | services: 10 | - docker 11 | env: DOCKER_IMAGE=quay.io/pypa/manylinux1_x86_64 12 | - sudo: required 13 | services: 14 | - docker 15 | env: DOCKER_IMAGE=quay.io/pypa/manylinux1_i686 16 | PRE_CMD=linux32 17 | 18 | install: 19 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then docker pull $DOCKER_IMAGE; fi 20 | 21 | script: 22 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./build-wheels.sh $TRAVIS_OS_NAME; fi 23 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then docker run --rm -v `pwd`:/io $DOCKER_IMAGE $PRE_CMD /io/build-wheels.sh $TRAVIS_OS_NAME; fi 24 | - ls wheelhouse/ 25 | 26 | before_deploy: 27 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo pip install appveyor-artifacts; fi 28 | - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then appveyor-artifacts -c $(git rev-parse HEAD) -o jbaiter -n python-rust-fst download; fi 29 | # Don't deploy sdist package when building OSX wheels 30 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYPI_DISTRIBUTIONS='check'; fi 31 | - mkdir -p dist_; mv dist/rust_fst*.whl dist_; rm -rf dist; mv dist_ dist; 32 | - mv wheelhouse/rust_fst*.whl dist; ls dist 33 | # 'twine' installation fails for the old infrastructure because of the 34 | # missing 'sudo', so we install it manually here 35 | - sudo pip install twine 36 | 37 | deploy: 38 | - provider: releases 39 | api_key: 40 | secure: 
0MY7OzVqfPyEdynSZR2ke8VBEjMyAh6MHlXh9jlfdROgaoZ/TUNPlmlDtV+Dd/ZG/nIjcakoIWaFA3/Ta8ekYievoi/vtvdrecHem/O/9zmLglWKJwD3hukUw3w1/YngquzYqlKJrX9pYX3io3hFTUzxDx/5DXYiuRA7q8cvgmztAnDKtEqbRRMVc24Q54mbDmF000sELgumjAaCH3Ronu9f5zXIhrLHDp/gHAlH3LQ19/FNn7rNvx+ZfpLlHYN0RFunXuG7OcDsQMWv/TEUy+rjTV16z5N2xPmonFij37/w5bMgBjPPHTafky24avqoZvGaRDoCkQCU3STrl+pK/ljcnq7Et9Q4p1RPod0ypkh43FANUWsszB5AmTabDxINjFe7UTSsLnzOk7kDXmGU55ZQe87+x1l/GFm6CdZUrL4zkxzN5vA4COe9CGYC6EXieQaOADa62B9PmsbrZuaEUBVOWfSJuMXrTiIKjhB5gmqAuGNsRea8sIAdd9M4mMthLm0rLFrRIH/+W7jANRN7xiL1jchXkMLMn/Fse8jusxXfMF0DYFUYOXfro+ILGw6kCyLkgK5xhArKYahlUqVrHCSQA3PDG3G7F21HtYXhV0ZFnyrwiXZTOpKshMW7TLGYcnhbHbXuyHW4JpoFcFRuHmCZODBtQrfF/KbEAZ06P+8= 41 | file_glob: true 42 | file: ${HOME}/build/${TRAVIS_REPO_SLUG}/dist/rust_fst*.whl 43 | skip_cleanup: true 44 | on: 45 | tags: true 46 | repo: jbaiter/python-rust-fst 47 | - provider: pypi 48 | user: jbaiter 49 | password: 50 | secure: rjroeCazKXEzIc9ROgu6ifqXU6ZN2ihGGuUTE9lZ3dW9zwdfCYS3eCMhW9FrM/hXIj9Sd7SSAl7onHfGj7Fe2R7PkR8BFKwYUKsn8E2R6vnsUzQC23Cb5SXlEWFBIbZm7ATmsgiPxX61jHXXn/PupHM3kU1FRaBLADXhkO6flA2tZKFXSZ947RR+CQKKKJlkoW4gn7g8a+azWHMY6dDU7RBRHtC3Yz27ggM2iYIgdZvCfYLAJlsX+puxPk5YLZJus52zQ6xo7nG7xEkI66EsPuL98f979aGLhxrRA2aqvQXFr3CXMkXf2EHe23bbIspd+blShUnIGHPbmx1ODd+0tW0FiyDkciju+Q43Ffq/mW8jrUXPC5QCiTIQbBBQGUa9n6jV/SrKHA+HBQWUhwKEYO7kP+7LU1wRBtIje2woKj4XLFh7uN/UacT65adzQEpQmXxxNntjJrievosrlAssK2aqZtAiZNXJNWU0Ig6FV/mm9JgN/bdJEesiGZWmrJigSqk3OupwznwawRGtcndKFNgxDAbNahdgpgqyH3HCDvP35+3kUmu9s9TdXU9xuZee0vf2XY/THTQTg27zAHUNEBXUQf8as1N7f+MWkHE4zdizAErIyD5nNSaJgK5ixsEB6WUkZ/kVYKIFwyeTXtkn31F5/TxoCESC/XbmRPeE8r8= 51 | skip_cleanup: true 52 | on: 53 | tags: true 54 | repo: jbaiter/python-rust-fst 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Andrew Gallant 4 | 5 | Permission is hereby granted, free of charge, to any 
person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-rust-fst 2 | 3 | [![appveyor](https://ci.appveyor.com/api/projects/status/github/jbaiter/python-rust-fst)](https://ci.appveyor.com/project/jbaiter/python-rust-fst) 4 | [![travis](https://travis-ci.org/jbaiter/python-rust-fst.svg)](https://travis-ci.org/jbaiter/python-rust-fst) 5 | [![pypi downloads](https://img.shields.io/pypi/dm/rust_fst.svg?maxAge=2592000)](https://pypi.python.org/pypi/rust-fst) 6 | [![pypi version](https://img.shields.io/pypi/v/rust_fst.svg?maxAge=2592000)](https://pypi.python.org/pypi/rust_fst) 7 | [![pypi wheel](https://img.shields.io/pypi/wheel/rust_fst.svg?maxAge=2592000)](https://pypi.python.org/pypi/rust_fst) 8 | 9 | Python bindings for [burntsushi's][1] [fst crate][2] ([rustdocs][3]) 10 | for FST-backed sets and maps. 
11 | 12 | For reasons why you might want to consider using it, see BurntSushi's great 13 | article on ["Index[ing] 1,600,000,000 Keys with Automata and Rust"][4]. 14 | 15 | **tl;dr**: 16 | - Work with larger-than-memory sets 17 | - Perform fuzzy search using Levenshtein automata 18 | 19 | 20 | ## Installation 21 | `rust_fst` is available as a binary wheel for the most common platforms (Linux 22 | 64bit x86, Windows 32/64bit x86 and OSX 64bit x86) and thus **does not require 23 | a Rust installation.** 24 | 25 | Just run `pip install rust_fst` to install the latest stable version of the 26 | package. 27 | 28 | 29 | ## Development 30 | - You will need: 31 | * Python >= 3.3, or Python/PyPy >= 2.7, with development headers installed 32 | * Rust nightly (install via [rustup][5]) 33 | - Run `rustup override add nightly` to add an override for rustup to use the 34 | nightly channel for the repository 35 | - Install with pip, but without the `-e` flag (editable installs do not work!) 36 | - Run tests with `py.test python-rust-fst/tests` from the directory above the 37 | repository root; when run from inside the root, the installed (and compiled) 38 | package will not be used. 
39 | 40 | 41 | ## Status 42 | The package exposes almost all functionality of the `fst` crate, except for: 43 | 44 | - Combining the results of slicing, `search` and `search_re` with set operations 45 | - Using raw transducers 46 | 47 | 48 | ## Examples 49 | ```python 50 | from rust_fst import Map, Set 51 | 52 | # Building a set in memory 53 | keys = ["fa", "fo", "fob", "focus", "foo", "food", "foul"] 54 | s = Set.from_iter(keys) 55 | 56 | # Fuzzy searches on the set 57 | matches = list(s.search(term="foo", max_dist=1)) 58 | assert matches == ["fo", "fob", "foo", "food"] 59 | 60 | # Searching with a regular expression 61 | matches = list(s.search_re(r'f\w{2}')) 62 | assert matches == ["fob", "foo"] 63 | 64 | # Store map on disk, requiring only constant memory for querying 65 | items = [("bruce", 1), ("clarence", 2), ("stevie", 3)] 66 | m = Map.from_iter(items, path="/tmp/map.fst") 67 | 68 | # Find all items whose key is greater or equal (in lexicographical sense) to 69 | # 'clarence' 70 | matches = dict(m['clarence':]) 71 | assert matches == {'clarence': 2, 'stevie': 3} 72 | 73 | # Create a map from a file input, using generators/yield 74 | # The input file must be sorted on the first column, and look roughly like 75 | # keyA 123 76 | # keyB 456 77 | def file_iterator(fpath): 78 | with open(fpath, 'rt') as fp: 79 | for line in fp: 80 | key, value = line.strip().split() 81 | yield key, int(value) 82 | m = Map.from_iter( file_iterator('/your/input/file/'), '/your/mmapped/output.fst') 83 | 84 | # re-open a file you built previously with from_iter() 85 | m = Map(path='/path/to/existing.fst') 86 | ``` 87 | 88 | 89 | ## Documentation 90 | Head over to [readthedocs.org][6] for the API documentation. 
91 | 92 | If you want to know more about performance characteristics, memory usage 93 | and about the implementation details, please head over to the 94 | [documentation for the Rust crate][2] 95 | 96 | 97 | [1]: http://burntsushi.net 98 | [2]: https://github.com/BurntSushi/fst 99 | [3]: http://burntsushi.net/rustdoc/fst/ 100 | [4]: http://blog.burntsushi.net/transducers/ 101 | [5]: https://www.rustup.rs/ 102 | [6]: https://rust-fst.readthedocs.org/ 103 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYARCH: "" 4 | RUST_TARGET: "i686-pc-windows-msvc" 5 | - PYARCH: "-x64" 6 | RUST_TARGET: "x86_64-pc-windows-msvc" 7 | 8 | install: 9 | - SET PYTHON=C:\Python27%PYARCH% 10 | - "%PYTHON%\\python -m pip install -U pip setuptools wheel" 11 | # Install development version of milksnake that fixes Windows dylib discovery 12 | - "%PYTHON%\\python -m pip install -U git+https://github.com/getsentry/milksnake" 13 | - ps: Start-FileDownload "https://static.rust-lang.org/dist/rust-nightly-${env:RUST_TARGET}.exe" 14 | - rust-nightly-%RUST_TARGET%.exe /VERYSILENT /NORESTART /DIR="C:\Program Files (x86)\Rust" 15 | - SET PATH=%PATH%;C:\Program Files (x86)\Rust\bin 16 | - SET PATH=%PATH%;C:\MinGW\bin 17 | 18 | build_script: 19 | - "%PYTHON%\\python -m pip -v wheel . -w .\\wheelhouse" 20 | # Rename the wheel so it is valid for all Python versions 21 | # This is possible since our shared library does not link against any 22 | # Python ABI 23 | # TODO: This leaves the old Python ABI/ABI tags in the 'WHEEL' file inside 24 | # of the wheel. However, `pip` does not seem to validate against that 25 | # currently, so we're safe until they change it... 
26 | - ps: | 27 | get-childItem wheelhouse\*-cp*-win*.whl | rename-item -newname { 28 | $_.name -replace '-cp.*?-cp.*?-','-py2.py3-none-' 29 | } 30 | - dir wheelhouse 31 | 32 | test_script: 33 | - SET PROJPATH=C:\Projects\python-rust-fst 34 | - cd c:\projects 35 | - ps: | 36 | $PyVersions = @("27", "33", "34", "35") 37 | foreach ($pyver in $PyVersions) { 38 | $python = "C:\\Python$pyver${env:PYARCH}\\python" 39 | &$python -m pip install -U pip setuptools wheel cffi pytest decorator psutil 40 | &$python -m pip -v install rust_fst --no-index -f "${env:PROJPATH}\wheelhouse" 41 | &$python -m pytest "${env:PROJPATH}\tests" 42 | } 43 | 44 | artifacts: 45 | - path: wheelhouse\rust_fst*.whl 46 | -------------------------------------------------------------------------------- /build-wheels.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e -x 3 | 4 | function install_rust { 5 | # $1 is the Rust release channel. Only this script is fetched: the earlier sh.rustup.rs download wrote to the same path and was overwritten before it could run. 6 | curl https://static.rust-lang.org/rustup.sh > /tmp/rustup.sh 7 | chmod +x /tmp/rustup.sh 8 | /tmp/rustup.sh -y --disable-sudo --channel=$1 9 | } 10 | 11 | function clean_project { 12 | # Remove compiled files that might cause conflicts 13 | pushd /io/ 14 | rm -rf .cache .eggs rust_fst/_ffi.py build *.egg-info 15 | find ./ -name "__pycache__" -type d -print0 |xargs -0 rm -rf 16 | find ./ -name "*.pyc" -type f -print0 |xargs -0 rm -rf 17 | find ./ -name "*.so" -type f -print0 |xargs -0 rm -rf 18 | popd 19 | } 20 | 21 | RUST_CHANNEL=nightly 22 | 23 | # It doesn't matter with which Python version we build the wheel, so we 24 | # use the oldest supported one 25 | if [[ $1 == "osx" ]]; then 26 | brew update 27 | brew install 28 | pip install -U pip setuptools wheel 29 | install_rust $RUST_CHANNEL 30 | pip wheel . 
-w ./wheelhouse 31 | pip install -v rust_fst --no-index -f ./wheelhouse 32 | pip install -r "test-requirements.txt" 33 | cd ../ 34 | py.test ./python-rust-fst/tests 35 | else 36 | PYBIN=/opt/python/cp27-cp27m/bin 37 | # Clean build files 38 | clean_project 39 | 40 | install_rust $RUST_CHANNEL 41 | 42 | # Remove old wheels 43 | rm -rf /io/wheelhouse/* || echo "No old wheels to delete" 44 | 45 | # Install libraries needed for compiling the extension 46 | yum -q -y install libffi-devel 47 | 48 | # Compile wheel 49 | ${PYBIN}/python -m pip wheel /io/ -w /wheelhouse/ 50 | 51 | # Move pure wheels to target directory 52 | mkdir -p /io/wheelhouse 53 | mv /wheelhouse/*any.whl /io/wheelhouse || echo "No pure wheels to move" 54 | 55 | # Bundle external shared libraries into the wheel 56 | for whl in /wheelhouse/*.whl; do 57 | auditwheel repair $whl -w /io/wheelhouse/ 58 | done 59 | 60 | # Set permissions on wheels 61 | chmod -R a+rw /io/wheelhouse 62 | 63 | # Install packages and test with all Python versions 64 | for PYBIN in /opt/python/*/bin/; do 65 | ${PYBIN}/python -m pip install cffi 66 | ${PYBIN}/python -m pip install -r "/io/test-requirements.txt" 67 | ${PYBIN}/python -m pip install rust_fst --no-index -f /io/wheelhouse 68 | ${PYBIN}/python -m pytest --verbose /io/tests 69 | clean_project 70 | done 71 | fi 72 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. 
Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make <target>' where <target> is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " epub3 to make an epub3" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " 
linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | @echo " dummy to check syntax errors of document sources" 51 | 52 | .PHONY: clean 53 | clean: 54 | rm -rf $(BUILDDIR)/* 55 | 56 | .PHONY: html 57 | html: 58 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 61 | 62 | .PHONY: dirhtml 63 | dirhtml: 64 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 65 | @echo 66 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 67 | 68 | .PHONY: singlehtml 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | .PHONY: pickle 75 | pickle: 76 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 77 | @echo 78 | @echo "Build finished; now you can process the pickle files." 79 | 80 | .PHONY: json 81 | json: 82 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 83 | @echo 84 | @echo "Build finished; now you can process the JSON files." 85 | 86 | .PHONY: htmlhelp 87 | htmlhelp: 88 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 89 | @echo 90 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 91 | ".hhp project file in $(BUILDDIR)/htmlhelp." 
92 | 93 | .PHONY: qthelp 94 | qthelp: 95 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 96 | @echo 97 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 98 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 99 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rust-fst.qhcp" 100 | @echo "To view the help file:" 101 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rust-fst.qhc" 102 | 103 | .PHONY: applehelp 104 | applehelp: 105 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 106 | @echo 107 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 108 | @echo "N.B. You won't be able to view it unless you put it in" \ 109 | "~/Library/Documentation/Help or install it in your application" \ 110 | "bundle." 111 | 112 | .PHONY: devhelp 113 | devhelp: 114 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 115 | @echo 116 | @echo "Build finished." 117 | @echo "To view the help file:" 118 | @echo "# mkdir -p $$HOME/.local/share/devhelp/rust-fst" 119 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rust-fst" 120 | @echo "# devhelp" 121 | 122 | .PHONY: epub 123 | epub: 124 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 125 | @echo 126 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 127 | 128 | .PHONY: epub3 129 | epub3: 130 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 131 | @echo 132 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 133 | 134 | .PHONY: latex 135 | latex: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo 138 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 139 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 140 | "(use \`make latexpdf' here to do that automatically)." 
141 | 142 | .PHONY: latexpdf 143 | latexpdf: 144 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 145 | @echo "Running LaTeX files through pdflatex..." 146 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 147 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 148 | 149 | .PHONY: latexpdfja 150 | latexpdfja: 151 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 152 | @echo "Running LaTeX files through platex and dvipdfmx..." 153 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 154 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 155 | 156 | .PHONY: text 157 | text: 158 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 159 | @echo 160 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 161 | 162 | .PHONY: man 163 | man: 164 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 165 | @echo 166 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 167 | 168 | .PHONY: texinfo 169 | texinfo: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo 172 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 173 | @echo "Run \`make' in that directory to run these through makeinfo" \ 174 | "(use \`make info' here to do that automatically)." 175 | 176 | .PHONY: info 177 | info: 178 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 179 | @echo "Running Texinfo files through makeinfo..." 180 | make -C $(BUILDDIR)/texinfo info 181 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 182 | 183 | .PHONY: gettext 184 | gettext: 185 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 186 | @echo 187 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 188 | 189 | .PHONY: changes 190 | changes: 191 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 192 | @echo 193 | @echo "The overview file is in $(BUILDDIR)/changes." 
194 | 195 | .PHONY: linkcheck 196 | linkcheck: 197 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 198 | @echo 199 | @echo "Link check complete; look for any errors in the above output " \ 200 | "or in $(BUILDDIR)/linkcheck/output.txt." 201 | 202 | .PHONY: doctest 203 | doctest: 204 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 205 | @echo "Testing of doctests in the sources finished, look at the " \ 206 | "results in $(BUILDDIR)/doctest/output.txt." 207 | 208 | .PHONY: coverage 209 | coverage: 210 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 211 | @echo "Testing of coverage in the sources finished, look at the " \ 212 | "results in $(BUILDDIR)/coverage/python.txt." 213 | 214 | .PHONY: xml 215 | xml: 216 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 217 | @echo 218 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 219 | 220 | .PHONY: pseudoxml 221 | pseudoxml: 222 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 223 | @echo 224 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 225 | 226 | .PHONY: dummy 227 | dummy: 228 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 229 | @echo 230 | @echo "Build finished. Dummy builder generates no files." 231 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # rust-fst documentation build configuration file, created by 4 | # sphinx-quickstart on Wed May 25 21:05:04 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys 16 | import os 17 | try: 18 | from unittest.mock import MagicMock 19 | except ImportError: # unittest.mock unavailable; fall back to the standalone `mock` package 20 | from mock import Mock as MagicMock 21 | sys.path.insert(0, os.path.abspath('..')) 22 | # Stand-in module object: attribute lookups that reach __getattr__ return a fresh Mock. 23 | class Mock(MagicMock): 24 | @classmethod 25 | def __getattr__(cls, name): 26 | return Mock() 27 | # Registered in sys.modules before `import rust_fst` below, so that import does not need the real module (presumably the compiled FFI extension -- confirm). 28 | MOCK_MODULES = ['rust_fst._ffi'] 29 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 30 | import rust_fst 31 | 32 | import sphinx_rtd_theme 33 | 34 | # If extensions (or modules to document with autodoc) are in another directory, 35 | # add these directories to sys.path here. If the directory is relative to the 36 | # documentation root, use os.path.abspath to make it absolute, like shown here. 37 | 38 | # -- General configuration ------------------------------------------------ 39 | 40 | # If your documentation needs a minimal Sphinx version, state it here. 41 | #needs_sphinx = '1.0' 42 | 43 | # Add any Sphinx extension module names here, as strings. They can be 44 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 45 | # ones. 46 | extensions = [ 47 | 'sphinx.ext.autodoc', 48 | 'sphinx.ext.intersphinx', 49 | 'sphinx.ext.todo', 50 | 'sphinx.ext.mathjax', 51 | 'sphinx.ext.viewcode', 52 | ] 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ['_templates'] 56 | 57 | # The suffix(es) of source filenames. 58 | # You can specify multiple suffixes as a list of strings: 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The encoding of source files. 63 | #source_encoding = 'utf-8-sig' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 
69 | project = u'rust-fst' 70 | copyright = u'2016, Johannes Baiter' 71 | author = u'Johannes Baiter' 72 | 73 | # The version info for the project you're documenting, acts as replacement for 74 | # |version| and |release|, also used in various other places throughout the 75 | # built documents. 76 | # 77 | # The short X.Y version. 78 | version = u'0.1.2' 79 | # The full version, including alpha/beta/rc tags. 80 | release = u'0.1.2' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = None 88 | 89 | # There are two options for replacing |today|: either, you set today to some 90 | # non-false value, then it is used: 91 | #today = '' 92 | # Else, today_fmt is used as the format for a strftime call. 93 | #today_fmt = '%B %d, %Y' 94 | 95 | # List of patterns, relative to source directory, that match files and 96 | # directories to ignore when looking for source files. 97 | # These patterns also affect html_static_path and html_extra_path. 98 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 99 | 100 | # The reST default role (used for this markup: `text`) to use for all 101 | # documents. 102 | #default_role = None 103 | 104 | # If true, '()' will be appended to :func: etc. cross-reference text. 105 | #add_function_parentheses = True 106 | 107 | # If true, the current module name will be prepended to all description 108 | # unit titles (such as .. function::). 109 | #add_module_names = True 110 | 111 | # If true, sectionauthor and moduleauthor directives will be shown in the 112 | # output. They are ignored by default. 113 | #show_authors = False 114 | 115 | # The name of the Pygments (syntax highlighting) style to use. 116 | pygments_style = 'sphinx' 117 | 118 | # A list of ignored prefixes for module index sorting. 
119 | #modindex_common_prefix = [] 120 | 121 | # If true, keep warnings as "system message" paragraphs in the built documents. 122 | #keep_warnings = False 123 | 124 | # If true, `todo` and `todoList` produce output, else they produce nothing. 125 | todo_include_todos = True 126 | 127 | 128 | # -- Options for HTML output ---------------------------------------------- 129 | 130 | # The theme to use for HTML and HTML Help pages. See the documentation for 131 | # a list of builtin themes. 132 | html_theme = 'sphinx_rtd_theme' 133 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 134 | 135 | # Theme options are theme-specific and customize the look and feel of a theme 136 | # further. For a list of options available for each theme, see the 137 | # documentation. 138 | #html_theme_options = {} 139 | 140 | # Add any paths that contain custom themes here, relative to this directory. 141 | #html_theme_path = [] 142 | 143 | # The name for this set of Sphinx documents. 144 | # "<project> v<release> documentation" by default. 145 | #html_title = u'rust-fst v0.1' 146 | 147 | # A shorter title for the navigation bar. Default is the same as html_title. 148 | #html_short_title = None 149 | 150 | # The name of an image file (relative to this directory) to place at the top 151 | # of the sidebar. 152 | #html_logo = None 153 | 154 | # The name of an image file (relative to this directory) to use as a favicon of 155 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 156 | # pixels large. 157 | #html_favicon = None 158 | 159 | # Add any paths that contain custom static files (such as style sheets) here, 160 | # relative to this directory. They are copied after the builtin static files, 161 | # so a file named "default.css" will overwrite the builtin "default.css". 162 | html_static_path = ['_static'] 163 | 164 | # Add any extra paths that contain custom files (such as robots.txt or 165 | # .htaccess) here, relative to this directory. 
These files are copied 166 | # directly to the root of the documentation. 167 | #html_extra_path = [] 168 | 169 | # If not None, a 'Last updated on:' timestamp is inserted at every page 170 | # bottom, using the given strftime format. 171 | # The empty string is equivalent to '%b %d, %Y'. 172 | #html_last_updated_fmt = None 173 | 174 | # If true, SmartyPants will be used to convert quotes and dashes to 175 | # typographically correct entities. 176 | #html_use_smartypants = True 177 | 178 | # Custom sidebar templates, maps document names to template names. 179 | #html_sidebars = {} 180 | 181 | # Additional templates that should be rendered to pages, maps page names to 182 | # template names. 183 | #html_additional_pages = {} 184 | 185 | # If false, no module index is generated. 186 | #html_domain_indices = True 187 | 188 | # If false, no index is generated. 189 | #html_use_index = True 190 | 191 | # If true, the index is split into individual pages for each letter. 192 | #html_split_index = False 193 | 194 | # If true, links to the reST sources are added to the pages. 195 | #html_show_sourcelink = True 196 | 197 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 198 | #html_show_sphinx = True 199 | 200 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 201 | #html_show_copyright = True 202 | 203 | # If true, an OpenSearch description file will be output, and all pages will 204 | # contain a <link> tag referring to it. The value of this option must be the 205 | # base URL from which the finished HTML is served. 206 | #html_use_opensearch = '' 207 | 208 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 209 | #html_file_suffix = None 210 | 211 | # Language to be used for generating the HTML full-text search index. 
212 | # Sphinx supports the following languages: 213 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 214 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' 215 | #html_search_language = 'en' 216 | 217 | # A dictionary with options for the search language support, empty by default. 218 | # 'ja' uses this config value. 219 | # 'zh' user can custom change `jieba` dictionary path. 220 | #html_search_options = {'type': 'default'} 221 | 222 | # The name of a javascript file (relative to the configuration directory) that 223 | # implements a search results scorer. If empty, the default will be used. 224 | #html_search_scorer = 'scorer.js' 225 | 226 | # Output file base name for HTML help builder. 227 | htmlhelp_basename = 'rust-fstdoc' 228 | 229 | # -- Options for LaTeX output --------------------------------------------- 230 | 231 | latex_elements = { 232 | # The paper size ('letterpaper' or 'a4paper'). 233 | #'papersize': 'letterpaper', 234 | 235 | # The font size ('10pt', '11pt' or '12pt'). 236 | #'pointsize': '10pt', 237 | 238 | # Additional stuff for the LaTeX preamble. 239 | #'preamble': '', 240 | 241 | # Latex figure (float) alignment 242 | #'figure_align': 'htbp', 243 | } 244 | 245 | # Grouping the document tree into LaTeX files. List of tuples 246 | # (source start file, target name, title, 247 | # author, documentclass [howto, manual, or own class]). 248 | latex_documents = [ 249 | (master_doc, 'rust-fst.tex', u'rust-fst Documentation', 250 | u'Johannes Baiter', 'manual'), 251 | ] 252 | 253 | # The name of an image file (relative to this directory) to place at the top of 254 | # the title page. 255 | #latex_logo = None 256 | 257 | # For "manual" documents, if this is true, then toplevel headings are parts, 258 | # not chapters. 259 | #latex_use_parts = False 260 | 261 | # If true, show page references after internal links. 262 | #latex_show_pagerefs = False 263 | 264 | # If true, show URL addresses after external links. 
265 | #latex_show_urls = False 266 | 267 | # Documents to append as an appendix to all manuals. 268 | #latex_appendices = [] 269 | 270 | # If false, no module index is generated. 271 | #latex_domain_indices = True 272 | 273 | 274 | # -- Options for manual page output --------------------------------------- 275 | 276 | # One entry per manual page. List of tuples 277 | # (source start file, name, description, authors, manual section). 278 | man_pages = [ 279 | (master_doc, 'rust-fst', u'rust-fst Documentation', 280 | [author], 1) 281 | ] 282 | 283 | # If true, show URL addresses after external links. 284 | #man_show_urls = False 285 | 286 | 287 | # -- Options for Texinfo output ------------------------------------------- 288 | 289 | # Grouping the document tree into Texinfo files. List of tuples 290 | # (source start file, target name, title, author, 291 | # dir menu entry, description, category) 292 | texinfo_documents = [ 293 | (master_doc, 'rust-fst', u'rust-fst Documentation', 294 | author, 'rust-fst', 'One line description of project.', 295 | 'Miscellaneous'), 296 | ] 297 | 298 | # Documents to append as an appendix to all manuals. 299 | #texinfo_appendices = [] 300 | 301 | # If false, no module index is generated. 302 | #texinfo_domain_indices = True 303 | 304 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 305 | #texinfo_show_urls = 'footnote' 306 | 307 | # If true, do not generate a @detailmenu in the "Top" node's menu. 308 | #texinfo_no_detailmenu = False 309 | 310 | 311 | # Example configuration for intersphinx: refer to the Python standard library. 312 | intersphinx_mapping = {'python': ('https://docs.python.org/', None)} 313 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. rust-fst documentation master file, created by 2 | sphinx-quickstart on Wed May 25 21:05:04 2016. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to rust-fst's documentation! 7 | ==================================== 8 | 9 | Python bindings for `burntsushi's`_ `fst crate`_ (rustdocs_) for FST-backed 10 | sets and maps. 11 | 12 | For reasons why you might want to consider using it, see BurntSushi's great 13 | article on `"Index[ing] 1,600,000,000 Keys with Automata and Rust" <http://blog.burntsushi.net/transducers/>`__. 14 | 15 | If you want to know more about performance characteristics, memory usage 16 | and about the implementation details, please head over to the 17 | `documentation for the Rust crate <http://burntsushi.net/rustdoc/fst/>`_. 18 | 19 | **tl;dr**: 20 | 21 | - Work with larger-than-memory sets 22 | - Perform fuzzy search using Levenshtein automata 23 | 24 | Installation 25 | ------------ 26 | 27 | - You will need: 28 | 29 | - Python >= 3.3, Python or PyPy >= 2.7 with development headers 30 | installed 31 | - Rust nightly (install via rustup_) 32 | 33 | - Clone the repository. Installation with ``pip install git+...`` does 34 | not work 35 | currently 36 | - Run ``rustup override add nightly`` to add an override for rustup to 37 | use the 38 | nightly channel for the repository 39 | - Run ``python setup.py bdist_wheel`` to generate a wheel 40 | - Install the wheel with 41 | ``pip install dist/rust_fst-0.1-py3-none-any.whl`` 42 | 43 | Status 44 | ------ 45 | 46 | The package exposes almost all functionality of the ``fst`` crate, 47 | except for: 48 | 49 | - Combining the results of slicing, ``search`` and ``search_re`` with 50 | set operations 51 | - Using raw transducers 52 | 53 | Examples 54 | -------- 55 | 56 | ..
code:: python 57 | 58 | from rust_fst import Map, Set 59 | 60 | # Building a set in memory 61 | keys = ["fa", "fo", "fob", "focus", "foo", "food", "foul"] 62 | s = Set.from_iter(keys) 63 | 64 | # Fuzzy searches on the set 65 | matches = list(s.search(term="foo", max_dist=1)) 66 | assert matches == ["fo", "fob", "foo", "food"] 67 | 68 | # Searching with a regular expression 69 | matches = list(s.search_re(r'f\w{2}')) 70 | assert matches == ["fob", "foo"] 71 | 72 | # Store map on disk, requiring only constant memory for querying 73 | items = [("bruce", 1), ("clarence", 2), ("stevie", 3)] 74 | m = Map.from_iter(items, path="/tmp/map.fst") 75 | 76 | # Find all items whose key is greater or equal (in lexicographical sense) to 77 | # 'clarence' 78 | matches = dict(m['clarence':]) 79 | assert matches == {'clarence': 2, 'stevie': 3} 80 | 81 | # Create a map from a file input, using generators/yield 82 | # The input file must be sorted on the first column, and look roughly like 83 | # keyA 123 84 | # keyB 456 85 | def file_iterator(fpath): 86 | with open(fpath, 'rt') as fp: 87 | for line in fp: 88 | key, value = line.strip().split() 89 | yield key, int(value) 90 | m = Map.from_iter( file_iterator('/your/input/file/'), '/your/mmapped/output.fst') 91 | 92 | # re-open a file you built previously with from_iter() 93 | m = Map(path='/path/to/existing.fst') 94 | 95 | API Reference 96 | ------------- 97 | 98 | .. autoclass:: rust_fst.Set 99 | :member-order: bysource 100 | :members: 101 | :special-members: 102 | :exclude-members: __weakref__ 103 | 104 | .. autoclass:: rust_fst.Map 105 | :member-order: bysource 106 | :members: 107 | :special-members: 108 | :exclude-members: __weakref__ 109 | 110 | .. _burntsushi's: http://burntsushi.net 111 | .. _fst crate: https://github.com/BurntSushi/fst 112 | .. _rustdocs: http://burntsushi.net/rustdoc/fst/ 113 | .. 
_rustup: https://www.rustup.rs/ 114 | -------------------------------------------------------------------------------- /rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | 3 | name = "rust_fst" 4 | version = "0.3.0" 5 | authors = ["Johannes Baiter "] 6 | 7 | [lib] 8 | name = "rust_fst" 9 | crate-type = ["cdylib"] 10 | 11 | [dependencies] 12 | libc = "0.2" 13 | fst = "^0.3.5" 14 | fst-levenshtein = "^0.2.1" 15 | fst-regex = "^0.2.2" 16 | -------------------------------------------------------------------------------- /rust/rust_fst.h: -------------------------------------------------------------------------------- 1 | /** =============================== 2 | Utility 3 | =============================== **/ 4 | 5 | typedef struct { 6 | bool has_error; 7 | char* error_type; 8 | char* error_description; 9 | char* error_display; 10 | char* error_debug; 11 | } Context; 12 | 13 | typedef struct BufWriter BufWriter; 14 | typedef struct Levenshtein Levenshtein; 15 | typedef struct Regex Regex; 16 | 17 | Levenshtein* fst_levenshtein_new(Context*, char*, uint32_t); 18 | void fst_levenshtein_free(Levenshtein*); 19 | 20 | Regex* fst_regex_new(Context*, char*); 21 | void fst_regex_free(Regex*); 22 | 23 | Context* fst_context_new(); 24 | void fst_context_free(Context*); 25 | 26 | void fst_string_free(char*); 27 | 28 | BufWriter* fst_bufwriter_new(Context*, char*); 29 | void fst_bufwriter_free(BufWriter*); 30 | 31 | 32 | /** =============================== 33 | Set 34 | =============================== **/ 35 | 36 | typedef struct FileSetBuilder FileSetBuilder; 37 | typedef struct MemSetBuilder MemSetBuilder; 38 | typedef struct Set Set; 39 | typedef struct SetStream SetStream; 40 | typedef struct SetLevStream SetLevStream; 41 | typedef struct SetRegexStream SetRegexStream; 42 | typedef struct SetOpBuilder SetOpBuilder; 43 | typedef struct SetUnion SetUnion; 44 | typedef struct SetIntersection SetIntersection; 45 | 
typedef struct SetDifference SetDifference; 46 | typedef struct SetSymmetricDifference SetSymmetricDifference; 47 | typedef struct SetStreamBuilder SetStreamBuilder; 48 | 49 | FileSetBuilder* fst_filesetbuilder_new(Context*, BufWriter*); 50 | void fst_filesetbuilder_insert(Context*, FileSetBuilder*, char*); 51 | void fst_filesetbuilder_finish(Context*, FileSetBuilder*); 52 | 53 | MemSetBuilder* fst_memsetbuilder_new(); 54 | bool fst_memsetbuilder_insert(Context*, MemSetBuilder*, char*); 55 | Set* fst_memsetbuilder_finish(Context*, MemSetBuilder*); 56 | 57 | Set* fst_set_open(Context*, char*); 58 | bool fst_set_contains(Set*, char*); 59 | size_t fst_set_len(Set*); 60 | bool fst_set_isdisjoint(Set*, Set*); 61 | bool fst_set_issubset(Set*, Set*); 62 | bool fst_set_issuperset(Set*, Set*); 63 | SetStream* fst_set_stream(Set*); 64 | SetLevStream* fst_set_levsearch(Set*, Levenshtein*); 65 | SetRegexStream* fst_set_regexsearch(Set*, Regex*); 66 | SetOpBuilder* fst_set_make_opbuilder(Set*); 67 | void fst_set_free(Set*); 68 | 69 | char* fst_set_stream_next(SetStream*); 70 | void fst_set_stream_free(SetStream*); 71 | 72 | char* fst_set_levstream_next(SetLevStream*); 73 | void fst_set_levstream_free(SetLevStream*); 74 | 75 | char* fst_set_regexstream_next(SetRegexStream*); 76 | void fst_set_regexstream_free(SetRegexStream*); 77 | 78 | void fst_set_opbuilder_push(SetOpBuilder*, Set*); 79 | void fst_set_opbuilder_free(SetOpBuilder*); 80 | SetUnion* fst_set_opbuilder_union(SetOpBuilder*); 81 | SetIntersection* fst_set_opbuilder_intersection(SetOpBuilder*); 82 | SetDifference* fst_set_opbuilder_difference(SetOpBuilder*); 83 | SetSymmetricDifference* fst_set_opbuilder_symmetricdifference( 84 | SetOpBuilder*); 85 | 86 | char* fst_set_union_next(SetUnion*); 87 | void fst_set_union_free(SetUnion*); 88 | 89 | char* fst_set_intersection_next(SetIntersection*); 90 | void fst_set_intersection_free(SetIntersection*); 91 | 92 | char* fst_set_difference_next(SetDifference*); 93 | void 
fst_set_difference_free(SetDifference*); 94 | 95 | char* fst_set_symmetricdifference_next(SetSymmetricDifference*); 96 | void fst_set_symmetricdifference_free(SetSymmetricDifference*); 97 | 98 | SetStreamBuilder* fst_set_streambuilder_new(Set*); 99 | SetStreamBuilder* fst_set_streambuilder_add_ge(SetStreamBuilder*, char*); 100 | SetStreamBuilder* fst_set_streambuilder_add_lt(SetStreamBuilder*, char*); 101 | SetStream* fst_set_streambuilder_finish(SetStreamBuilder*); 102 | 103 | 104 | /** =============================== 105 | Map 106 | =============================== **/ 107 | 108 | typedef struct { 109 | char* key; 110 | uint64_t value; 111 | } MapItem; 112 | 113 | typedef struct { 114 | size_t index; 115 | uint64_t value; 116 | } IndexedValue; 117 | 118 | typedef struct { 119 | char* key; 120 | size_t num_values; 121 | IndexedValue* values; 122 | } MapOpItem; 123 | 124 | 125 | typedef struct FileMapBuilder FileMapBuilder; 126 | typedef struct MemMapBuilder MemMapBuilder; 127 | typedef struct Map Map; 128 | typedef struct MapStream MapStream; 129 | typedef struct MapLevStream MapLevStream; 130 | typedef struct MapRegexStream MapRegexStream; 131 | typedef struct MapKeyStream MapKeyStream; 132 | typedef struct MapValueStream MapValueStream; 133 | typedef struct MapOpBuilder MapOpBuilder; 134 | typedef struct MapUnion MapUnion; 135 | typedef struct MapIntersection MapIntersection; 136 | typedef struct MapDifference MapDifference; 137 | typedef struct MapSymmetricDifference MapSymmetricDifference; 138 | typedef struct MapStreamBuilder MapStreamBuilder; 139 | 140 | FileMapBuilder* fst_filemapbuilder_new(Context*, BufWriter*); 141 | bool fst_filemapbuilder_insert(Context*, FileMapBuilder*, char*, uint64_t); 142 | bool fst_filemapbuilder_finish(Context*, FileMapBuilder*); 143 | 144 | MemMapBuilder* fst_memmapbuilder_new(); 145 | bool fst_memmapbuilder_insert(Context*, MemMapBuilder*, char*, uint64_t); 146 | Map* fst_memmapbuilder_finish(Context*, MemMapBuilder*); 147 | 
148 | Map* fst_map_open(Context*, char*); 149 | void fst_map_free(Map*); 150 | uint64_t fst_map_get(Context*, Map*, char*); 151 | size_t fst_map_len(Map*); 152 | bool fst_map_contains(Map*, char*); 153 | MapStream* fst_map_stream(Map*); 154 | MapKeyStream* fst_map_keys(Map*); 155 | MapValueStream* fst_map_values(Map*); 156 | MapLevStream* fst_map_levsearch(Map*, Levenshtein*); 157 | MapRegexStream* fst_map_regexsearch(Map*, Regex*); 158 | MapOpBuilder* fst_map_make_opbuilder(Map*); 159 | 160 | MapItem* fst_mapstream_next(MapStream*); 161 | void fst_mapstream_free(MapStream*); 162 | void fst_mapitem_free(MapItem*); 163 | 164 | char* fst_mapkeys_next(MapKeyStream*); 165 | void fst_mapkeys_free(MapKeyStream*); 166 | 167 | uint64_t fst_mapvalues_next(Context*, MapValueStream*); 168 | void fst_mapvalues_free(MapValueStream*); 169 | 170 | MapItem* fst_map_levstream_next(MapLevStream*); 171 | void fst_map_levstream_free(MapLevStream*); 172 | 173 | MapItem* fst_map_regexstream_next(MapRegexStream*); 174 | void fst_map_regexstream_free(MapRegexStream*); 175 | 176 | void fst_map_opbuilder_push(MapOpBuilder*, Map*); 177 | void fst_map_opbuilder_free(MapOpBuilder*); 178 | MapUnion* fst_map_opbuilder_union(MapOpBuilder*); 179 | MapIntersection* fst_map_opbuilder_intersection(MapOpBuilder*); 180 | MapDifference* fst_map_opbuilder_difference(MapOpBuilder*); 181 | MapSymmetricDifference* fst_map_opbuilder_symmetricdifference( 182 | MapOpBuilder*); 183 | void fst_map_opitem_free(MapOpItem*); 184 | 185 | MapOpItem* fst_map_union_next(MapUnion*); 186 | void fst_map_union_free(MapUnion*); 187 | 188 | MapOpItem* fst_map_intersection_next(MapIntersection*); 189 | void fst_map_intersection_free(MapIntersection*); 190 | 191 | MapOpItem* fst_map_difference_next(MapDifference*); 192 | void fst_map_difference_free(MapDifference*); 193 | 194 | MapOpItem* fst_map_symmetricdifference_next(MapSymmetricDifference*); 195 | void fst_map_symmetricdifference_free(MapSymmetricDifference*); 196 | 197 | 
MapStreamBuilder* fst_map_streambuilder_new(Map*); 198 | MapStreamBuilder* fst_map_streambuilder_add_ge(MapStreamBuilder*, char*); 199 | MapStreamBuilder* fst_map_streambuilder_add_lt(MapStreamBuilder*, char*); 200 | MapStream* fst_map_streambuilder_finish(MapStreamBuilder*); 201 | -------------------------------------------------------------------------------- /rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![crate_type = "dylib"] 2 | #![feature(core_intrinsics)] 3 | 4 | extern crate libc; 5 | extern crate fst; 6 | extern crate fst_regex; 7 | extern crate fst_levenshtein; 8 | 9 | 10 | /// Get an immutable reference from a raw pointer 11 | macro_rules! ref_from_ptr { 12 | ($p:ident) => (unsafe { 13 | assert!(!$p.is_null()); 14 | &*$p 15 | }) 16 | } 17 | 18 | /// Get a mutable reference from a raw pointer 19 | macro_rules! mutref_from_ptr { 20 | ($p:ident) => (unsafe { 21 | assert!(!$p.is_null()); 22 | &mut *$p 23 | }) 24 | } 25 | 26 | /// Get the object referenced by the raw pointer 27 | macro_rules! val_from_ptr { 28 | ($p:ident) => (unsafe { 29 | assert!(!$p.is_null()); 30 | Box::from_raw($p) 31 | }) 32 | } 33 | 34 | /// Declare a function that frees a struct's memory 35 | macro_rules! make_free_fn { 36 | ($name:ident, $t:ty) => ( 37 | #[no_mangle] 38 | pub extern fn $name(ptr: $t) { 39 | assert!(!ptr.is_null()); 40 | val_from_ptr!(ptr); 41 | } 42 | ) 43 | } 44 | 45 | /// Declare a function that returns the next item from a set stream 46 | macro_rules! set_make_next_fn { 47 | ($name:ident, $t:ty) => ( 48 | #[no_mangle] 49 | pub extern fn $name(ptr: $t) -> *const libc::c_char { 50 | let stream = mutref_from_ptr!(ptr); 51 | match stream.next() { 52 | Some(val) => ::std::ffi::CString::new(val).unwrap().into_raw(), 53 | None => ::std::ptr::null() 54 | } 55 | } 56 | ) 57 | } 58 | 59 | /// Declare a function that returns the next item from a map stream 60 | macro_rules! 
map_make_next_fn { 61 | ($name:ident, $t:ty) => ( 62 | #[no_mangle] 63 | pub extern fn $name(ptr: $t) -> *mut MapItem { 64 | let stream = mutref_from_ptr!(ptr); 65 | match stream.next() { 66 | Some((k, v)) => to_raw_ptr( 67 | MapItem { key: ::std::ffi::CString::new(k).unwrap().into_raw(), 68 | value: v }), 69 | None => ::std::ptr::null_mut() 70 | } 71 | } 72 | ) 73 | } 74 | 75 | /// Declare a function that returns the next item from a map stream 76 | macro_rules! mapop_make_next_fn { 77 | ($name:ident, $t:ty) => ( 78 | #[no_mangle] 79 | pub extern fn $name(ptr: $t) -> *const MapOpItem { 80 | let stream = mutref_from_ptr!(ptr); 81 | match stream.next() { 82 | Some((k, vs)) => { 83 | let vals: Vec = (0..vs.len()).map(|idx| { 84 | CIndexedValue { index: vs[idx].index, 85 | value: vs[idx].value } 86 | }).collect(); 87 | let mut vals_boxed: Box<[CIndexedValue]> = vals.into_boxed_slice(); 88 | let vals_ptr: *const CIndexedValue = vals_boxed.as_ptr(); 89 | mem::forget(vals_boxed); 90 | to_raw_ptr(MapOpItem { 91 | key: ::std::ffi::CString::new(k).unwrap().into_raw(), 92 | num_values: vs.len(), 93 | values: vals_ptr }) 94 | }, 95 | None => ::std::ptr::null_mut() 96 | } 97 | } 98 | ) 99 | } 100 | 101 | /// Evaluate an expression and in case of an error, store information about the error in the passed 102 | /// Context struct and return a default value. 103 | macro_rules! 
with_context { 104 | ($ctx_ptr:ident, $default_rval:expr, $e:expr) => {{ 105 | let ctx = mutref_from_ptr!($ctx_ptr); 106 | ctx.has_error = false; 107 | match $e { 108 | Ok(val) => val, 109 | Err(err) => { 110 | ctx.has_error = true; 111 | ctx.error_type = $crate::util::str_to_cstr($crate::util::get_typename(&err)); 112 | ctx.error_debug = $crate::util::str_to_cstr(&format!("{:?}", err)); 113 | ctx.error_display = $crate::util::str_to_cstr(&format!("{}", err)); 114 | ctx.error_description = $crate::util::str_to_cstr(err.description()); 115 | return $default_rval; 116 | } 117 | } 118 | }} 119 | } 120 | 121 | pub mod util; 122 | pub mod set; 123 | pub mod map; 124 | -------------------------------------------------------------------------------- /rust/src/map.rs: -------------------------------------------------------------------------------- 1 | extern crate libc; 2 | 3 | use std::error::Error; 4 | use std::fs::File; 5 | use std::io; 6 | use std::mem; 7 | use std::ptr; 8 | use fst::{IntoStreamer, Streamer, Map, MapBuilder}; 9 | use fst::map; 10 | use fst_levenshtein::Levenshtein; 11 | use fst_regex::Regex; 12 | 13 | use util::{Context, str_to_cstr, cstr_to_str, to_raw_ptr}; 14 | 15 | 16 | #[repr(C)] 17 | #[derive(Debug)] 18 | #[allow(dead_code)] 19 | pub struct MapItem { 20 | key: *const libc::c_char, 21 | value: u64, 22 | } 23 | 24 | #[repr(C)] 25 | #[derive(Debug)] 26 | #[allow(dead_code)] 27 | pub struct MapOpItem { 28 | key: *const libc::c_char, 29 | num_values: libc::size_t, 30 | values: *const CIndexedValue 31 | } 32 | 33 | #[repr(C)] 34 | #[derive(Debug)] 35 | #[allow(dead_code)] 36 | pub struct CIndexedValue { 37 | index: libc::size_t, 38 | value: u64, 39 | } 40 | 41 | pub type FileMapBuilder = MapBuilder<&'static mut io::BufWriter>; 42 | pub type MemMapBuilder = MapBuilder>; 43 | pub type MapLevStream = map::Stream<'static, &'static Levenshtein>; 44 | pub type MapRegexStream = map::Stream<'static, &'static Regex>; 45 | 46 | 47 | #[no_mangle] 48 | pub extern 
"C" fn fst_filemapbuilder_new(ctx: *mut Context, 49 | wtr_ptr: *mut io::BufWriter) 50 | -> *mut FileMapBuilder { 51 | let wtr = mutref_from_ptr!(wtr_ptr); 52 | to_raw_ptr(with_context!(ctx, ptr::null_mut(), 53 | MapBuilder::new(wtr))) 54 | } 55 | 56 | #[no_mangle] 57 | pub extern "C" fn fst_filemapbuilder_insert(ctx: *mut Context, 58 | ptr: *mut FileMapBuilder, 59 | key: *mut libc::c_char, 60 | val: u64) 61 | -> bool { 62 | let builder = mutref_from_ptr!(ptr); 63 | with_context!(ctx, false, builder.insert(cstr_to_str(key), val)); 64 | true 65 | } 66 | 67 | #[no_mangle] 68 | pub extern "C" fn fst_filemapbuilder_finish(ctx: *mut Context, ptr: *mut FileMapBuilder) -> bool { 69 | let builder = val_from_ptr!(ptr); 70 | with_context!(ctx, false, builder.finish()); 71 | true 72 | } 73 | 74 | #[no_mangle] 75 | pub extern "C" fn fst_memmapbuilder_new() -> *mut MemMapBuilder { 76 | to_raw_ptr(MapBuilder::memory()) 77 | } 78 | 79 | #[no_mangle] 80 | pub extern "C" fn fst_memmapbuilder_insert(ctx: *mut Context, 81 | ptr: *mut MemMapBuilder, 82 | key: *mut libc::c_char, 83 | val: u64) 84 | -> bool { 85 | let builder = mutref_from_ptr!(ptr); 86 | with_context!(ctx, false, builder.insert(cstr_to_str(key), val)); 87 | true 88 | } 89 | 90 | #[no_mangle] 91 | pub extern "C" fn fst_memmapbuilder_finish(ctx: *mut Context, ptr: *mut MemMapBuilder) -> *mut Map { 92 | let builder = val_from_ptr!(ptr); 93 | let data = with_context!(ctx, ptr::null_mut(), builder.into_inner()); 94 | let map = with_context!(ctx, ptr::null_mut(), Map::from_bytes(data)); 95 | to_raw_ptr(map) 96 | } 97 | 98 | #[no_mangle] 99 | #[allow(unused_unsafe)] 100 | pub unsafe extern "C" fn fst_map_open(ctx: *mut Context, path: *mut libc::c_char) -> *mut Map { 101 | let path = cstr_to_str(path); 102 | let map = with_context!(ctx, ptr::null_mut(), Map::from_path(path)); 103 | to_raw_ptr(map) 104 | } 105 | make_free_fn!(fst_map_free, *mut Map); 106 | 107 | #[no_mangle] 108 | pub extern "C" fn fst_map_len(ptr: *mut Map) -> 
libc::size_t { 109 | ref_from_ptr!(ptr).len() 110 | } 111 | 112 | #[no_mangle] 113 | pub extern "C" fn fst_map_contains(ptr: *mut Map, key: *mut libc::c_char) -> bool { 114 | ref_from_ptr!(ptr).contains_key(cstr_to_str(key)) 115 | } 116 | 117 | #[no_mangle] 118 | pub extern "C" fn fst_map_stream(ptr: *mut Map) -> *mut map::Stream<'static> { 119 | to_raw_ptr(ref_from_ptr!(ptr).stream()) 120 | } 121 | make_free_fn!(fst_mapstream_free, *mut map::Stream); 122 | map_make_next_fn!(fst_mapstream_next, *mut map::Stream); 123 | make_free_fn!(fst_mapitem_free, *mut MapItem); 124 | 125 | #[no_mangle] 126 | pub extern "C" fn fst_map_get(ctx: *mut Context, 127 | ptr: *mut Map, 128 | key: *mut libc::c_char) 129 | -> u64 { 130 | let key = cstr_to_str(key); 131 | let ctx = mutref_from_ptr!(ctx); 132 | ctx.clear(); 133 | match ref_from_ptr!(ptr).get(key) { 134 | Some(val) => val, 135 | None => { 136 | let msg = str_to_cstr(&format!("Key '{}' not in map.", key)); 137 | ctx.has_error = true; 138 | ctx.error_type = str_to_cstr("py::KeyError"); 139 | ctx.error_display = msg; 140 | return 0; 141 | } 142 | } 143 | } 144 | 145 | #[no_mangle] 146 | pub extern "C" fn fst_map_keys(ptr: *mut Map) -> *mut map::Keys<'static> { 147 | to_raw_ptr(ref_from_ptr!(ptr).keys()) 148 | } 149 | make_free_fn!(fst_mapkeys_free, *mut map::Keys); 150 | set_make_next_fn!(fst_mapkeys_next, *mut map::Keys); 151 | 152 | #[no_mangle] 153 | pub extern "C" fn fst_map_values(ptr: *mut Map) -> *mut map::Values<'static> { 154 | to_raw_ptr(ref_from_ptr!(ptr).values()) 155 | } 156 | make_free_fn!(fst_mapvalues_free, *mut map::Values); 157 | 158 | #[no_mangle] 159 | pub extern "C" fn fst_mapvalues_next(ctx: *mut Context, ptr: *mut map::Values) -> u64 { 160 | let ctx = mutref_from_ptr!(ctx); 161 | ctx.clear(); 162 | match mutref_from_ptr!(ptr).next() { 163 | Some(val) => val, 164 | None => { 165 | let msg = str_to_cstr("No more values."); 166 | ctx.has_error = true; 167 | ctx.error_type = str_to_cstr("StopIteration"); 168 
| ctx.error_display = msg; 169 | return 0; 170 | } 171 | } 172 | } 173 | 174 | #[no_mangle] 175 | pub extern "C" fn fst_map_levsearch(map_ptr: *mut Map, 176 | lev_ptr: *mut Levenshtein) 177 | -> *mut MapLevStream { 178 | let map = mutref_from_ptr!(map_ptr); 179 | let lev = ref_from_ptr!(lev_ptr); 180 | to_raw_ptr(map.search(lev).into_stream()) 181 | } 182 | make_free_fn!(fst_map_levstream_free, *mut MapLevStream); 183 | map_make_next_fn!(fst_map_levstream_next, *mut MapLevStream); 184 | 185 | 186 | #[no_mangle] 187 | pub extern "C" fn fst_map_regexsearch(map_ptr: *mut Map, regex_ptr: *mut Regex) 188 | -> *mut MapRegexStream { 189 | let map = mutref_from_ptr!(map_ptr); 190 | let regex = ref_from_ptr!(regex_ptr); 191 | to_raw_ptr(map.search(regex).into_stream()) 192 | } 193 | make_free_fn!(fst_map_regexstream_free, *mut MapRegexStream); 194 | map_make_next_fn!(fst_map_regexstream_next, *mut MapRegexStream); 195 | 196 | 197 | #[no_mangle] 198 | pub extern "C" fn fst_map_make_opbuilder(ptr: *mut Map) -> *mut map::OpBuilder<'static> { 199 | let map = ref_from_ptr!(ptr); 200 | let ob = map.op(); 201 | to_raw_ptr(ob) 202 | } 203 | make_free_fn!(fst_map_opbuilder_free, *mut map::OpBuilder); 204 | make_free_fn!(fst_map_opitem_free, *mut MapOpItem); 205 | 206 | #[no_mangle] 207 | pub extern "C" fn fst_map_opbuilder_push(ptr: *mut map::OpBuilder, map_ptr: *mut Map) { 208 | let map = ref_from_ptr!(map_ptr); 209 | let ob = mutref_from_ptr!(ptr); 210 | ob.push(map); 211 | } 212 | 213 | #[no_mangle] 214 | pub extern "C" fn fst_map_opbuilder_union(ptr: *mut map::OpBuilder) 215 | -> *mut map::Union { 216 | let ob = val_from_ptr!(ptr); 217 | to_raw_ptr(ob.union()) 218 | } 219 | make_free_fn!(fst_map_union_free, *mut map::Union); 220 | mapop_make_next_fn!(fst_map_union_next, *mut map::Union); 221 | 222 | #[no_mangle] 223 | pub extern "C" fn fst_map_opbuilder_intersection(ptr: *mut map::OpBuilder) 224 | -> *mut map::Intersection { 225 | let ob = val_from_ptr!(ptr); 226 | 
to_raw_ptr(ob.intersection()) 227 | } 228 | make_free_fn!(fst_map_intersection_free, *mut map::Intersection); 229 | mapop_make_next_fn!(fst_map_intersection_next, *mut map::Intersection); 230 | 231 | #[no_mangle] 232 | pub extern "C" fn fst_map_opbuilder_difference(ptr: *mut map::OpBuilder) 233 | -> *mut map::Difference { 234 | let ob = val_from_ptr!(ptr); 235 | to_raw_ptr(ob.difference()) 236 | } 237 | make_free_fn!(fst_map_difference_free, *mut map::Difference); 238 | mapop_make_next_fn!(fst_map_difference_next, *mut map::Difference); 239 | 240 | #[no_mangle] 241 | pub extern "C" fn fst_map_opbuilder_symmetricdifference 242 | (ptr: *mut map::OpBuilder) 243 | -> *mut map::SymmetricDifference { 244 | let ob = val_from_ptr!(ptr); 245 | to_raw_ptr(ob.symmetric_difference()) 246 | } 247 | make_free_fn!(fst_map_symmetricdifference_free, *mut map::SymmetricDifference); 248 | mapop_make_next_fn!(fst_map_symmetricdifference_next, *mut map::SymmetricDifference); 249 | 250 | 251 | #[no_mangle] 252 | pub extern "C" fn fst_map_streambuilder_new(ptr: *mut Map) -> *mut map::StreamBuilder<'static> { 253 | let map = ref_from_ptr!(ptr); 254 | to_raw_ptr(map.range()) 255 | } 256 | 257 | #[no_mangle] 258 | pub extern "C" fn fst_map_streambuilder_add_ge(ptr: *mut map::StreamBuilder<'static>, 259 | c_bound: *mut libc::c_char) 260 | -> *mut map::StreamBuilder<'static> { 261 | let sb = val_from_ptr!(ptr); 262 | to_raw_ptr(sb.ge(cstr_to_str(c_bound))) 263 | } 264 | 265 | #[no_mangle] 266 | pub extern "C" fn fst_map_streambuilder_add_lt(ptr: *mut map::StreamBuilder<'static>, 267 | c_bound: *mut libc::c_char) 268 | -> *mut map::StreamBuilder<'static> { 269 | let sb = val_from_ptr!(ptr); 270 | to_raw_ptr(sb.lt(cstr_to_str(c_bound))) 271 | } 272 | 273 | #[no_mangle] 274 | pub extern "C" fn fst_map_streambuilder_finish(ptr: *mut map::StreamBuilder<'static>) 275 | -> *mut map::Stream { 276 | let sb = val_from_ptr!(ptr); 277 | to_raw_ptr(sb.into_stream()) 278 | } 279 | 
-------------------------------------------------------------------------------- /rust/src/set.rs: -------------------------------------------------------------------------------- 1 | extern crate libc; 2 | 3 | use std::error::Error; 4 | use std::fs::File; 5 | use std::io; 6 | use std::ptr; 7 | use fst::{IntoStreamer, Streamer, Set, SetBuilder}; 8 | use fst::set; 9 | use fst_levenshtein::Levenshtein; 10 | use fst_regex::Regex; 11 | 12 | use util::{Context, cstr_to_str, to_raw_ptr}; 13 | 14 | 15 | pub type FileSetBuilder = SetBuilder<&'static mut io::BufWriter<File>>; 16 | pub type MemSetBuilder = SetBuilder<Vec<u8>>; 17 | pub type SetLevStream = set::Stream<'static, &'static Levenshtein>; 18 | pub type SetRegexStream = set::Stream<'static, &'static Regex>; 19 | 20 | 21 | #[no_mangle] 22 | pub extern "C" fn fst_filesetbuilder_new(ctx: *mut Context, 23 | wtr_ptr: *mut io::BufWriter<File>) 24 | -> *mut FileSetBuilder { 25 | let wtr = mutref_from_ptr!(wtr_ptr); 26 | to_raw_ptr(with_context!(ctx, ptr::null_mut(), SetBuilder::new(wtr))) 27 | } 28 | 29 | #[no_mangle] 30 | pub extern "C" fn fst_filesetbuilder_insert(ctx: *mut Context, 31 | ptr: *mut FileSetBuilder, 32 | s: *mut libc::c_char) 33 | -> bool { 34 | let build = mutref_from_ptr!(ptr); 35 | with_context!(ctx, false, build.insert(cstr_to_str(s))); 36 | true 37 | } 38 | 39 | #[no_mangle] 40 | pub extern "C" fn fst_filesetbuilder_finish(ctx: *mut Context, ptr: *mut FileSetBuilder) -> bool { 41 | let build = val_from_ptr!(ptr); 42 | with_context!(ctx, false, build.finish()); 43 | true 44 | } 45 | 46 | #[no_mangle] 47 | pub extern "C" fn fst_memsetbuilder_new() -> *mut MemSetBuilder { 48 | to_raw_ptr(SetBuilder::memory()) 49 | } 50 | 51 | #[no_mangle] 52 | pub extern "C" fn fst_memsetbuilder_insert(ctx: *mut Context, 53 | ptr: *mut MemSetBuilder, 54 | s: *mut libc::c_char) 55 | -> bool { 56 | let build = mutref_from_ptr!(ptr); 57 | with_context!(ctx, false, build.insert(cstr_to_str(s))); 58 | true 59 | } 60 | 61 | #[no_mangle] 62 | pub
extern "C" fn fst_memsetbuilder_finish(ctx: *mut Context, ptr: *mut MemSetBuilder) -> *mut Set { 63 | let build = val_from_ptr!(ptr); 64 | let data = with_context!(ctx, ptr::null_mut(), build.into_inner()); 65 | let set = with_context!(ctx, ptr::null_mut(), Set::from_bytes(data)); 66 | to_raw_ptr(set) 67 | } 68 | 69 | #[no_mangle] 70 | #[allow(unused_unsafe)] 71 | pub unsafe extern "C" fn fst_set_open(ctx: *mut Context, cpath: *mut libc::c_char) -> *mut Set { 72 | let path = cstr_to_str(cpath); 73 | let set = with_context!(ctx, ptr::null_mut(), Set::from_path(path)); 74 | to_raw_ptr(set) 75 | } 76 | make_free_fn!(fst_set_free, *mut Set); 77 | 78 | 79 | #[no_mangle] 80 | pub extern "C" fn fst_set_contains(ptr: *mut Set, s: *mut libc::c_char) -> bool { 81 | let set = mutref_from_ptr!(ptr); 82 | set.contains(cstr_to_str(s)) 83 | } 84 | 85 | #[no_mangle] 86 | pub extern "C" fn fst_set_stream(ptr: *mut Set) -> *mut set::Stream<'static> { 87 | let set = mutref_from_ptr!(ptr); 88 | to_raw_ptr(set.stream()) 89 | } 90 | make_free_fn!(fst_set_stream_free, *mut set::Stream); 91 | set_make_next_fn!(fst_set_stream_next, *mut set::Stream); 92 | 93 | #[no_mangle] 94 | pub extern "C" fn fst_set_len(ptr: *mut Set) -> libc::size_t { 95 | let set = mutref_from_ptr!(ptr); 96 | set.len() 97 | } 98 | 99 | #[no_mangle] 100 | pub extern "C" fn fst_set_isdisjoint(self_ptr: *mut Set, oth_ptr: *mut Set) -> bool { 101 | let slf = ref_from_ptr!(self_ptr); 102 | let oth = ref_from_ptr!(oth_ptr); 103 | slf.is_disjoint(oth) 104 | } 105 | 106 | #[no_mangle] 107 | pub extern "C" fn fst_set_issubset(self_ptr: *mut Set, oth_ptr: *mut Set) -> bool { 108 | let slf = ref_from_ptr!(self_ptr); 109 | let oth = ref_from_ptr!(oth_ptr); 110 | slf.is_subset(oth) 111 | } 112 | 113 | #[no_mangle] 114 | pub extern "C" fn fst_set_issuperset(self_ptr: *mut Set, oth_ptr: *mut Set) -> bool { 115 | let slf = ref_from_ptr!(self_ptr); 116 | let oth = ref_from_ptr!(oth_ptr); 117 | slf.is_superset(oth) 118 | } 119 | 120 | 
#[no_mangle] 121 | pub extern "C" fn fst_set_levsearch(set_ptr: *mut Set, 122 | lev_ptr: *mut Levenshtein) 123 | -> *mut SetLevStream { 124 | let set = mutref_from_ptr!(set_ptr); 125 | let lev = ref_from_ptr!(lev_ptr); 126 | to_raw_ptr(set.search(lev).into_stream()) 127 | } 128 | make_free_fn!(fst_set_levstream_free, *mut SetLevStream); 129 | set_make_next_fn!(fst_set_levstream_next, *mut SetLevStream); 130 | 131 | #[no_mangle] 132 | pub extern "C" fn fst_set_regexsearch(set_ptr: *mut Set, regex_ptr: *mut Regex) 133 | -> *mut SetRegexStream { 134 | let set = mutref_from_ptr!(set_ptr); 135 | let regex = ref_from_ptr!(regex_ptr); 136 | to_raw_ptr(set.search(regex).into_stream()) 137 | } 138 | make_free_fn!(fst_set_regexstream_free, *mut SetRegexStream); 139 | set_make_next_fn!(fst_set_regexstream_next, *mut SetRegexStream); 140 | 141 | #[no_mangle] 142 | pub extern "C" fn fst_set_make_opbuilder(ptr: *mut Set) -> *mut set::OpBuilder<'static> { 143 | let set = ref_from_ptr!(ptr); 144 | let ob = set.op(); 145 | to_raw_ptr(ob) 146 | } 147 | make_free_fn!(fst_set_opbuilder_free, *mut set::OpBuilder); 148 | 149 | #[no_mangle] 150 | pub extern "C" fn fst_set_opbuilder_push(ptr: *mut set::OpBuilder, set_ptr: *mut Set) { 151 | let set = ref_from_ptr!(set_ptr); 152 | let ob = mutref_from_ptr!(ptr); 153 | ob.push(set); 154 | } 155 | 156 | #[no_mangle] 157 | pub extern "C" fn fst_set_opbuilder_union(ptr: *mut set::OpBuilder) 158 | -> *mut set::Union { 159 | let ob = val_from_ptr!(ptr); 160 | to_raw_ptr(ob.union()) 161 | } 162 | make_free_fn!(fst_set_union_free, *mut set::Union); 163 | set_make_next_fn!(fst_set_union_next, *mut set::Union); 164 | 165 | #[no_mangle] 166 | pub extern "C" fn fst_set_opbuilder_intersection(ptr: *mut set::OpBuilder) 167 | -> *mut set::Intersection { 168 | let ob = val_from_ptr!(ptr); 169 | to_raw_ptr(ob.intersection()) 170 | } 171 | make_free_fn!(fst_set_intersection_free, *mut set::Intersection); 172 | set_make_next_fn!(fst_set_intersection_next, 
*mut set::Intersection); 173 | 174 | #[no_mangle] 175 | pub extern "C" fn fst_set_opbuilder_difference(ptr: *mut set::OpBuilder) 176 | -> *mut set::Difference { 177 | let ob = val_from_ptr!(ptr); 178 | to_raw_ptr(ob.difference()) 179 | } 180 | make_free_fn!(fst_set_difference_free, *mut set::Difference); 181 | set_make_next_fn!(fst_set_difference_next, *mut set::Difference); 182 | 183 | #[no_mangle] 184 | pub extern "C" fn fst_set_opbuilder_symmetricdifference 185 | (ptr: *mut set::OpBuilder) 186 | -> *mut set::SymmetricDifference { 187 | let ob = val_from_ptr!(ptr); 188 | to_raw_ptr(ob.symmetric_difference()) 189 | } 190 | make_free_fn!(fst_set_symmetricdifference_free, *mut set::SymmetricDifference); 191 | set_make_next_fn!(fst_set_symmetricdifference_next, *mut set::SymmetricDifference); 192 | 193 | 194 | #[no_mangle] 195 | pub extern "C" fn fst_set_streambuilder_new(ptr: *mut Set) -> *mut set::StreamBuilder<'static> { 196 | let set = ref_from_ptr!(ptr); 197 | to_raw_ptr(set.range()) 198 | } 199 | 200 | #[no_mangle] 201 | pub extern "C" fn fst_set_streambuilder_add_ge(ptr: *mut set::StreamBuilder<'static>, 202 | c_bound: *mut libc::c_char) 203 | -> *mut set::StreamBuilder<'static> { 204 | let sb = val_from_ptr!(ptr); 205 | to_raw_ptr(sb.ge(cstr_to_str(c_bound))) 206 | } 207 | 208 | #[no_mangle] 209 | pub extern "C" fn fst_set_streambuilder_add_lt(ptr: *mut set::StreamBuilder<'static>, 210 | c_bound: *mut libc::c_char) 211 | -> *mut set::StreamBuilder<'static> { 212 | let sb = val_from_ptr!(ptr); 213 | to_raw_ptr(sb.lt(cstr_to_str(c_bound))) 214 | } 215 | 216 | #[no_mangle] 217 | pub extern "C" fn fst_set_streambuilder_finish(ptr: *mut set::StreamBuilder<'static>) 218 | -> *mut set::Stream { 219 | let sb = val_from_ptr!(ptr); 220 | to_raw_ptr(sb.into_stream()) 221 | } 222 | -------------------------------------------------------------------------------- /rust/src/util.rs: -------------------------------------------------------------------------------- 1 | extern 
crate libc;
extern crate fst_levenshtein;
extern crate fst_regex;


use std::error::Error;
use std::ffi::{CStr, CString};
use std::fs::File;
use std::intrinsics;
use std::io;
use std::ptr;
use fst_regex::Regex;
use fst_levenshtein::Levenshtein;


/// Exposes information about errors over the ABI
#[repr(C)]
pub struct Context {
    /// Set when the last call failed; the string fields then describe why.
    pub has_error: bool,
    pub error_type: *mut libc::c_char,
    pub error_debug: *mut libc::c_char,
    pub error_display: *mut libc::c_char,
    pub error_description: *mut libc::c_char,
}

impl Context {
    /// Reset the error flag and release every stored error string.
    ///
    /// Each pointer is set back to null after it is freed. Previously the
    /// fields kept pointing at freed memory, so a second `clear` freed the
    /// same strings again.
    pub fn clear(&mut self) {
        self.has_error = false;
        Context::release(&mut self.error_type);
        Context::release(&mut self.error_debug);
        Context::release(&mut self.error_display);
        Context::release(&mut self.error_description);
    }

    /// Free the string in `slot`, if any, and leave null behind.
    fn release(slot: &mut *mut libc::c_char) {
        if !slot.is_null() {
            fst_string_free(*slot);
            *slot = ptr::null_mut();
        }
    }
}


/// Borrow a C string as `&str`.
///
/// NOTE(review): the `unwrap` panics on input that is not valid UTF-8, and
/// this helper is called from `extern "C"` functions. The signature leaves no
/// room to report an error, so the behavior is unchanged here.
pub fn cstr_to_str<'a>(s: *mut libc::c_char) -> &'a str {
    let cstr = unsafe { CStr::from_ptr(s) };
    cstr.to_str().unwrap()
}

/// Copy `string` into a heap-allocated C string owned by the caller.
///
/// Release it with `fst_string_free`. Panics if `string` contains a NUL byte.
pub fn str_to_cstr(string: &str) -> *mut libc::c_char {
    CString::new(string).unwrap().into_raw()
}

/// Move `v` onto the heap and hand out the raw pointer to it.
pub fn to_raw_ptr<T>(v: T) -> *mut T {
    Box::into_raw(Box::new(v))
}

// FIXME: This requires the nightly channel, isn't there a better way to
//        get this information?
60 | pub fn get_typename(_: &T) -> &'static str { 61 | unsafe { intrinsics::type_name::() } 62 | } 63 | 64 | #[no_mangle] 65 | pub extern "C" fn fst_context_new() -> *mut Context { 66 | to_raw_ptr(Context { 67 | has_error: false, 68 | error_type: ptr::null_mut(), 69 | error_description: ptr::null_mut(), 70 | error_display: ptr::null_mut(), 71 | error_debug: ptr::null_mut(), 72 | }) 73 | } 74 | make_free_fn!(fst_context_free, *mut Context); 75 | 76 | #[no_mangle] 77 | pub extern "C" fn fst_string_free(s: *mut libc::c_char) { 78 | unsafe { CString::from_raw(s) }; 79 | } 80 | 81 | #[no_mangle] 82 | pub extern "C" fn fst_bufwriter_new(ctx: *mut Context, 83 | s: *mut libc::c_char) 84 | -> *mut io::BufWriter { 85 | let path = cstr_to_str(s); 86 | let file = with_context!(ctx, ptr::null_mut(), File::create(path)); 87 | to_raw_ptr(io::BufWriter::new(file)) 88 | } 89 | make_free_fn!(fst_bufwriter_free, *mut io::BufWriter); 90 | 91 | 92 | #[no_mangle] 93 | pub extern "C" fn fst_levenshtein_new(ctx: *mut Context, 94 | c_key: *mut libc::c_char, 95 | max_dist: u32) 96 | -> *mut Levenshtein { 97 | let key = cstr_to_str(c_key); 98 | let lev = with_context!(ctx, ptr::null_mut(), 99 | Levenshtein::new(key, max_dist)); 100 | to_raw_ptr(lev) 101 | } 102 | make_free_fn!(fst_levenshtein_free, *mut Levenshtein); 103 | 104 | #[no_mangle] 105 | pub extern "C" fn fst_regex_new(ctx: *mut Context, c_pat: *mut libc::c_char) -> *mut Regex { 106 | let pat = cstr_to_str(c_pat); 107 | let re = with_context!(ctx, ptr::null_mut(), Regex::new(pat)); 108 | to_raw_ptr(re) 109 | } 110 | make_free_fn!(fst_regex_free, *mut Regex); 111 | -------------------------------------------------------------------------------- /rust_fst/__init__.py: -------------------------------------------------------------------------------- 1 | from .set import Set 2 | from .map import Map 3 | 4 | __all__ = ["Set", "Map"] 5 | -------------------------------------------------------------------------------- /rust_fst/common.py: 
-------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | from .lib import ffi, lib 4 | 5 | 6 | class StreamIterator(object): 7 | def __init__(self, stream_ptr, next_fn, free_fn, autom_ptr=None, 8 | autom_free_fn=None, ctx_ptr=None): 9 | self._free_fn = free_fn 10 | self._ptr = ffi.gc(stream_ptr, free_fn) 11 | self._next_fn = next_fn 12 | if autom_ptr: 13 | self._autom_ptr = ffi.gc(autom_ptr, autom_free_fn) 14 | self._autom_free_fn = autom_free_fn 15 | else: 16 | self._autom_ptr = None 17 | self._ctx = ctx_ptr 18 | 19 | def _free(self): 20 | self._free_fn(self._ptr) 21 | # Clear GC hook to prevent double-free 22 | ffi.gc(self._ptr, None) 23 | self._ptr = None 24 | if self._autom_ptr: 25 | self._autom_free_fn(self._autom_ptr) 26 | ffi.gc(self._autom_ptr, None) 27 | self._autom_ptr = None 28 | 29 | def __iter__(self): 30 | return self 31 | 32 | def next(self): 33 | return self.__next__() 34 | 35 | def __next__(self): 36 | raise NotImplementedError 37 | 38 | 39 | class KeyStreamIterator(StreamIterator): 40 | def __next__(self): 41 | c_str = self._next_fn(self._ptr) 42 | if c_str == ffi.NULL: 43 | self._free() 44 | raise StopIteration 45 | py_str = ffi.string(c_str).decode('utf8') 46 | lib.fst_string_free(c_str) 47 | return py_str 48 | 49 | 50 | class ValueStreamIterator(StreamIterator): 51 | def __next__(self): 52 | val = self._next_fn(self._ctx, self._ptr) 53 | if val == 0 and self._ctx.has_error: 54 | self._free() 55 | raise StopIteration 56 | return val 57 | 58 | 59 | class MapItemStreamIterator(StreamIterator): 60 | def __next__(self): 61 | itm = self._next_fn(self._ptr) 62 | if itm == ffi.NULL: 63 | self._free() 64 | raise StopIteration 65 | key = ffi.string(itm.key).decode('utf8') 66 | value = itm.value 67 | lib.fst_string_free(itm.key) 68 | lib.fst_mapitem_free(itm) 69 | return (key, value) 70 | 71 | 72 | IndexedValue = namedtuple("IndexedValue", ("index", "value")) 73 | 74 | 75 | class 
MapOpItemStreamIterator(StreamIterator): 76 | def __next__(self): 77 | itm = self._next_fn(self._ptr) 78 | if itm == ffi.NULL: 79 | self._free() 80 | raise StopIteration 81 | key = ffi.string(itm.key).decode('utf8') 82 | values = [] 83 | for n in range(itm.num_values): 84 | rust_val = itm.values[n] 85 | values.append(IndexedValue(rust_val.index, rust_val.value)) 86 | lib.fst_string_free(itm.key) 87 | lib.fst_map_opitem_free(itm) 88 | return (key, tuple(values)) 89 | -------------------------------------------------------------------------------- /rust_fst/lib.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | from ._native import ffi, lib 5 | 6 | 7 | class FstError(Exception): 8 | pass 9 | 10 | 11 | class TransducerError(FstError): 12 | pass 13 | 14 | 15 | class RegexError(FstError): 16 | pass 17 | 18 | 19 | class LevenshteinError(FstError): 20 | pass 21 | 22 | 23 | class IoError(FstError): 24 | pass 25 | 26 | 27 | EXCEPTION_MAP = { 28 | 'std::io::error::Error': OSError, 29 | 'fst::error::Error': FstError, 30 | 'fst::error::Error::Fst': TransducerError, 31 | 'fst_regex::error::Error': RegexError, 32 | 'fst_levenshtein::error::Error': LevenshteinError, 33 | 'fst::error::Error::Io': IoError, 34 | 'py::KeyError': KeyError 35 | } 36 | 37 | 38 | def checked_call(fn, ctx, *args): 39 | res = fn(ctx, *args) 40 | if not ctx.has_error: 41 | return res 42 | type_str = ffi.string(ctx.error_type).decode('utf8') 43 | if ctx.error_display != ffi.NULL: 44 | msg = ffi.string(ctx.error_display).decode('utf8').replace('\n', ' ') 45 | else: 46 | msg = None 47 | err_type = EXCEPTION_MAP.get(type_str) 48 | if err_type is FstError: 49 | if ctx.error_description != ffi.NULL: 50 | desc_str = ffi.string(ctx.error_description).decode('utf8') 51 | else: 52 | desc_str = None 53 | enum_val = re.match(r'(\w+)\(.*?\)', desc_str, re.DOTALL).group(1) 54 | err_type = EXCEPTION_MAP.get("{}::{}".format(type_str, 
enum_val)) 55 | if err_type is None: 56 | msg = "{}: {}".format(enum_val, msg) 57 | if err_type is None: 58 | err_type = FstError 59 | raise err_type(msg) 60 | -------------------------------------------------------------------------------- /rust_fst/map.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | from .common import (KeyStreamIterator, ValueStreamIterator, 4 | MapItemStreamIterator, MapOpItemStreamIterator) 5 | from .lib import ffi, lib, checked_call 6 | 7 | 8 | class MapBuilder(object): 9 | def insert(self, val): 10 | raise NotImplementedError 11 | 12 | def finish(self): 13 | raise NotImplementedError 14 | 15 | 16 | class FileMapBuilder(MapBuilder): 17 | def __init__(self, path): 18 | self._ctx = lib.fst_context_new() 19 | self._writer_p = checked_call( 20 | lib.fst_bufwriter_new, self._ctx, path.encode('utf8')) 21 | self._builder_p = checked_call( 22 | lib.fst_filemapbuilder_new, self._ctx, self._writer_p) 23 | 24 | def insert(self, key, val): 25 | c_key = ffi.new("char[]", key.encode('utf8')) 26 | checked_call(lib.fst_filemapbuilder_insert, 27 | self._ctx, self._builder_p, c_key, val) 28 | 29 | def finish(self): 30 | checked_call(lib.fst_filemapbuilder_finish, 31 | self._ctx, self._builder_p) 32 | lib.fst_bufwriter_free(self._writer_p) 33 | lib.fst_context_free(self._ctx) 34 | 35 | 36 | class MemMapBuilder(MapBuilder): 37 | def __init__(self): 38 | self._ctx = lib.fst_context_new() 39 | self._ptr = lib.fst_memmapbuilder_new() 40 | self._map_ptr = None 41 | 42 | def insert(self, key, val): 43 | c_key = ffi.new("char[]", key.encode('utf8')) 44 | checked_call(lib.fst_memmapbuilder_insert, self._ctx, self._ptr, 45 | c_key, val) 46 | 47 | def finish(self): 48 | self._map_ptr = checked_call(lib.fst_memmapbuilder_finish, 49 | self._ctx, self._ptr) 50 | lib.fst_context_free(self._ctx) 51 | self._ctx = None 52 | self._ptr = None 53 | 54 | def get_map(self): 55 | if self._map_ptr 
is None: 56 | raise ValueError("The builder has to be finished first.") 57 | return Map(_pointer=self._map_ptr) 58 | 59 | 60 | class OpBuilder(object): 61 | def __init__(self, map_ptr): 62 | # NOTE: No need for `ffi.gc`, since the struct will be free'd 63 | # once we call union/intersection/difference 64 | self._ptr = lib.fst_map_make_opbuilder(map_ptr) 65 | 66 | def push(self, map_ptr): 67 | lib.fst_map_opbuilder_push(self._ptr, map_ptr) 68 | 69 | def union(self): 70 | stream_ptr = lib.fst_map_opbuilder_union(self._ptr) 71 | return MapOpItemStreamIterator( 72 | stream_ptr, lib.fst_map_union_next, lib.fst_map_union_free) 73 | 74 | def intersection(self): 75 | stream_ptr = lib.fst_map_opbuilder_intersection(self._ptr) 76 | return MapOpItemStreamIterator( 77 | stream_ptr, lib.fst_map_intersection_next, 78 | lib.fst_map_intersection_free) 79 | 80 | def difference(self): 81 | stream_ptr = lib.fst_map_opbuilder_difference(self._ptr) 82 | return MapOpItemStreamIterator( 83 | stream_ptr, lib.fst_map_difference_next, 84 | lib.fst_map_difference_free) 85 | 86 | def symmetric_difference(self): 87 | stream_ptr = lib.fst_map_opbuilder_symmetricdifference(self._ptr) 88 | return MapOpItemStreamIterator( 89 | stream_ptr, lib.fst_map_symmetricdifference_next, 90 | lib.fst_map_symmetricdifference_free) 91 | 92 | 93 | class Map(object): 94 | """ An immutable map of unicode keys to unsigned integer values backed 95 | by a finite state transducer. 96 | 97 | The map can either be constructed in memory or on disk. For large datasets 98 | it is recommended to store it on disk, since memory usage will be constant 99 | due to the file being memory-mapped. 100 | 101 | To build a map, use the :py:meth:`from_iter` classmethod and pass it an 102 | iterator and (optionally) a path where the map should be stored. If the 103 | latter is missing, the map will be built in memory. 
104 | 105 | In addition to querying the map for single keys, the following operations 106 | are supported: 107 | 108 | * Range queries with slicing syntax (i.e. `myset['c':'f']` will return an 109 | iterator over all items in the map whose keys start with 'c', 'd' or 'e') 110 | * Performing fuzzy searches on the map keys bounded by Levenshtein edit 111 | distance 112 | * Performing a search on the map keys with a regular expression 113 | * Performing set operations on multiple maps, e.g. to find different 114 | values for common keys 115 | 116 | A few caveats must be kept in mind: 117 | 118 | * Once constructed, a Map can never be modified. 119 | * Maps must be built with iterators of lexicographically sorted 120 | (str/unicode, int) tuples, where the integer value must be positive. 121 | """ 122 | 123 | @staticmethod 124 | @contextmanager 125 | def build(path=None): 126 | """ Context manager to build a new map. 127 | 128 | Call :py:meth:`insert` on the returned builder object to insert 129 | new items into the mapp. Keep in mind that insertion must happen in 130 | lexicographical order, otherwise an exception will be thrown. 131 | 132 | :param path: Path to build mapp in, or `None` if set should be built 133 | in memory 134 | :returns: :py:class:`MapBuilder` 135 | """ 136 | if path: 137 | builder = FileMapBuilder(path) 138 | else: 139 | builder = MemMapBuilder() 140 | yield builder 141 | builder.finish() 142 | 143 | @classmethod 144 | def from_iter(cls, it, path=None): 145 | """ Build a new map from an iterator. 146 | 147 | Keep in mind that the iterator must return lexicographically sorted 148 | (key, value) pairs, where the keys are unicode strings and the values 149 | unsigned integers. 
150 | 151 | :param it: Iterator to build map with 152 | :type it: iterator over (str/unicode, int) pairs, where int >= 0 153 | :param path: Path to build map in, or `None` if set should be built 154 | in memory 155 | :returns: The finished map 156 | :rtype: :py:class:`Map` 157 | """ 158 | if isinstance(it, dict): 159 | it = sorted(it.items(), key=lambda x: x[0]) 160 | with cls.build(path) as builder: 161 | for key, val in it: 162 | builder.insert(key, val) 163 | if path: 164 | return cls(path=path) 165 | else: 166 | return builder.get_map() 167 | 168 | def __init__(self, path=None, _pointer=None): 169 | """ Load a map from a given file. 170 | 171 | :param path: Path to map on disk 172 | """ 173 | self._ctx = ffi.gc(lib.fst_context_new(), lib.fst_context_free) 174 | if path: 175 | s = checked_call(lib.fst_map_open, self._ctx, 176 | ffi.new("char[]", path.encode('utf8'))) 177 | else: 178 | s = _pointer 179 | self._ptr = ffi.gc(s, lib.fst_map_free) 180 | 181 | def __contains__(self, val): 182 | return lib.fst_map_contains( 183 | self._ptr, ffi.new("char[]", val.encode('utf8'))) 184 | 185 | def __getitem__(self, key): 186 | """ Get the value for a key or a range of (key, value) pairs. 187 | 188 | If the key is a slice object (e.g. `mymap['a':'f']`) an iterator 189 | over all matching items in the map will be returned. 190 | 191 | .. important:: 192 | Slicing follows the semantics for numerical indices, i.e. the 193 | `stop` value is **exclusive**. For example, `mymap['a':'c']` will 194 | return items whose key begins with 'a' or 'b', but **not** 'c'. 
195 | 196 | :param key: The key to retrieve the value for or a range of 197 | unicode strings 198 | :returns: The value or an iterator over matching items 199 | """ 200 | if isinstance(key, slice): 201 | s = key 202 | if s.start and s.stop and s.start > s.stop: 203 | raise ValueError( 204 | "Start key must be lexicographically smaller than stop.") 205 | sb_ptr = lib.fst_map_streambuilder_new(self._ptr) 206 | if s.start: 207 | c_start = ffi.new("char[]", s.start.encode('utf8')) 208 | sb_ptr = lib.fst_map_streambuilder_add_ge(sb_ptr, c_start) 209 | if s.stop: 210 | c_stop = ffi.new("char[]", s.stop.encode('utf8')) 211 | sb_ptr = lib.fst_map_streambuilder_add_lt(sb_ptr, c_stop) 212 | stream_ptr = lib.fst_map_streambuilder_finish(sb_ptr) 213 | return MapItemStreamIterator(stream_ptr, lib.fst_mapstream_next, 214 | lib.fst_mapstream_free) 215 | else: 216 | return checked_call(lib.fst_map_get, self._ctx, self._ptr, 217 | ffi.new("char[]", key.encode('utf8'))) 218 | 219 | def __iter__(self): 220 | return self.keys() 221 | 222 | def __len__(self): 223 | return int(lib.fst_map_len(self._ptr)) 224 | 225 | def keys(self): 226 | """ Get an iterator over all keys in the map. """ 227 | stream_ptr = lib.fst_map_keys(self._ptr) 228 | return KeyStreamIterator(stream_ptr, lib.fst_mapkeys_next, 229 | lib.fst_mapkeys_free) 230 | 231 | def values(self): 232 | """ Get an iterator over all values in the map. """ 233 | stream_ptr = lib.fst_map_values(self._ptr) 234 | return ValueStreamIterator(stream_ptr, lib.fst_mapvalues_next, 235 | lib.fst_mapvalues_free, ctx_ptr=self._ctx) 236 | 237 | def items(self): 238 | """ Get an iterator over all (key, value) pairs in the map. """ 239 | stream_ptr = lib.fst_map_stream(self._ptr) 240 | return MapItemStreamIterator(stream_ptr, lib.fst_mapstream_next, 241 | lib.fst_mapstream_free) 242 | 243 | def search_re(self, pattern): 244 | """ Search the map with a regular expression. 
245 | 246 | Note that the regular expression syntax is not Python's, but the one 247 | supported by the `regex` Rust crate, which is almost identical 248 | to the engine of the RE2 engine. 249 | 250 | For a documentation of the syntax, see: 251 | http://doc.rust-lang.org/regex/regex/index.html#syntax 252 | 253 | Due to limitations of the underlying FST, only a subset of this syntax 254 | is supported. Most notably absent are: 255 | - Lazy quantifiers (r'*?', r'+?') 256 | - Word boundaries (r'\b') 257 | - Other zero-width assertions (r'^', r'$') 258 | For background on these limitations, consult the documentation of 259 | the Rust crate: http://burntsushi.net/rustdoc/fst/struct.Regex.html 260 | 261 | :param pattern: A regular expression 262 | :returns: An iterator over all items with matching keys in 263 | the set 264 | :rtype: :py:class:`MapItemStreamIterator` 265 | """ 266 | re_ptr = checked_call( 267 | lib.fst_regex_new, self._ctx, 268 | ffi.new("char[]", pattern.encode('utf8'))) 269 | stream_ptr = lib.fst_map_regexsearch(self._ptr, re_ptr) 270 | return MapItemStreamIterator(stream_ptr, lib.fst_map_regexstream_next, 271 | lib.fst_map_regexstream_free, re_ptr, 272 | lib.fst_regex_free) 273 | 274 | def search(self, term, max_dist): 275 | """ Search the map with a Levenshtein automaton. 
276 | 277 | :param term: The search term 278 | :param max_dist: The maximum edit distance for search results 279 | :returns: Matching (key, value) items in the map 280 | :rtype: :py:class:`MapItemStreamIterator` 281 | """ 282 | lev_ptr = checked_call( 283 | lib.fst_levenshtein_new, self._ctx, 284 | ffi.new("char[]", term.encode('utf8')), max_dist) 285 | stream_ptr = lib.fst_map_levsearch(self._ptr, lev_ptr) 286 | return MapItemStreamIterator(stream_ptr, lib.fst_map_levstream_next, 287 | lib.fst_map_levstream_free, lev_ptr, 288 | lib.fst_levenshtein_free) 289 | 290 | def _make_opbuilder(self, *others): 291 | opbuilder = OpBuilder(self._ptr) 292 | for oth in others: 293 | opbuilder.push(oth._ptr) 294 | return opbuilder 295 | 296 | def union(self, *others): 297 | """ Get an iterator over the items in the union of this map and others. 298 | 299 | The iterator will return pairs of `(key, [IndexedValue])`, where 300 | the latter is a list of different values for the key in the different 301 | maps, represented as a tuple of the map index and the value in the 302 | map. 303 | 304 | :param others: List of :py:class:`Map` objects 305 | :returns: Iterator over all items in all maps in lexicographical 306 | order 307 | """ 308 | return self._make_opbuilder(*others).union() 309 | 310 | def intersection(self, *others): 311 | """ Get an iterator over the items in the intersection of this map and 312 | others. 313 | 314 | The iterator will return pairs of `(key, [IndexedValue])`, where 315 | the latter is a list of different values for the key in the different 316 | maps, represented as a tuple of the map index and the value in the 317 | map. 
318 | 319 | :param others: List of :py:class:`Map` objects 320 | :returns: Iterator over all items whose key exists in all of the 321 | passed maps in lexicographical order 322 | """ 323 | return self._make_opbuilder(*others).intersection() 324 | 325 | def difference(self, *others): 326 | """ Get an iterator over the items in the difference of this map and 327 | others. 328 | 329 | The iterator will return pairs of `(key, [IndexedValue])`, where 330 | the latter is a list of different values for the key in the different 331 | maps, represented as a tuple of the map index and the value in the 332 | map. 333 | 334 | :param others: List of :py:class:`Map` objects 335 | :returns: Iterator over all items whose key exists in this map, 336 | but in none of the other maps, in lexicographical order 337 | """ 338 | return self._make_opbuilder(*others).difference() 339 | 340 | def symmetric_difference(self, *others): 341 | """ Get an iterator over the items in the symmetric difference of this 342 | map and others. 343 | 344 | The iterator will return pairs of `(key, [IndexedValue])`, where 345 | the latter is a list of different values for the key in the different 346 | maps, represented as a tuple of the map index and the value in the 347 | map. 
348 | 349 | :param others: List of :py:class:`Map` objects 350 | :returns: Iterator over all items whose key exists in only one of 351 | the maps in lexicographical order 352 | """ 353 | return self._make_opbuilder(*others).symmetric_difference() 354 | -------------------------------------------------------------------------------- /rust_fst/set.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | from .common import KeyStreamIterator 4 | from .lib import ffi, lib, checked_call 5 | 6 | 7 | class SetBuilder(object): 8 | def insert(self, val): 9 | raise NotImplementedError 10 | 11 | def finish(self): 12 | raise NotImplementedError 13 | 14 | 15 | class FileSetBuilder(SetBuilder): 16 | def __init__(self, path): 17 | self._ctx = lib.fst_context_new() 18 | self._writer_p = checked_call( 19 | lib.fst_bufwriter_new, self._ctx, path.encode('utf8')) 20 | self._builder_p = checked_call( 21 | lib.fst_filesetbuilder_new, self._ctx, self._writer_p) 22 | 23 | def insert(self, val): 24 | c_str = ffi.new("char[]", val.encode('utf8')) 25 | checked_call(lib.fst_filesetbuilder_insert, 26 | self._ctx, self._builder_p, c_str) 27 | 28 | def finish(self): 29 | checked_call(lib.fst_filesetbuilder_finish, 30 | self._ctx, self._builder_p) 31 | lib.fst_bufwriter_free(self._writer_p) 32 | lib.fst_context_free(self._ctx) 33 | 34 | 35 | class MemSetBuilder(SetBuilder): 36 | def __init__(self): 37 | self._ctx = lib.fst_context_new() 38 | self._ptr = lib.fst_memsetbuilder_new() 39 | self._set_ptr = None 40 | 41 | def insert(self, val): 42 | c_str = ffi.new("char[]", val.encode('utf8')) 43 | checked_call(lib.fst_memsetbuilder_insert, self._ctx, self._ptr, c_str) 44 | 45 | def finish(self): 46 | self._set_ptr = checked_call(lib.fst_memsetbuilder_finish, 47 | self._ctx, self._ptr) 48 | lib.fst_context_free(self._ctx) 49 | self._ctx = None 50 | self._ptr = None 51 | 52 | def get_set(self): 53 | if self._set_ptr is None: 
54 | raise ValueError("The builder has to be finished first.") 55 | return Set(None, _pointer=self._set_ptr) 56 | 57 | 58 | class OpBuilder(object): 59 | def __init__(self, set_ptr): 60 | # NOTE: No need for `ffi.gc`, since the struct will be free'd 61 | # once we call union/intersection/difference 62 | self._ptr = lib.fst_set_make_opbuilder(set_ptr) 63 | 64 | def push(self, set_ptr): 65 | lib.fst_set_opbuilder_push(self._ptr, set_ptr) 66 | 67 | def union(self): 68 | stream_ptr = lib.fst_set_opbuilder_union(self._ptr) 69 | return KeyStreamIterator(stream_ptr, lib.fst_set_union_next, 70 | lib.fst_set_union_free) 71 | 72 | def intersection(self): 73 | stream_ptr = lib.fst_set_opbuilder_intersection(self._ptr) 74 | return KeyStreamIterator(stream_ptr, lib.fst_set_intersection_next, 75 | lib.fst_set_intersection_free) 76 | 77 | def difference(self): 78 | stream_ptr = lib.fst_set_opbuilder_difference(self._ptr) 79 | return KeyStreamIterator(stream_ptr, lib.fst_set_difference_next, 80 | lib.fst_set_difference_free) 81 | 82 | def symmetric_difference(self): 83 | stream_ptr = lib.fst_set_opbuilder_symmetricdifference(self._ptr) 84 | return KeyStreamIterator(stream_ptr, 85 | lib.fst_set_symmetricdifference_next, 86 | lib.fst_set_symmetricdifference_free) 87 | 88 | 89 | class Set(object): 90 | """ An immutable ordered string set backed by a finite state transducer. 91 | 92 | The set can either be constructed in memory or on disk. For large datasets 93 | it is recommended to store it on disk, since memory usage will be constant 94 | due to the file being memory-mapped. 95 | 96 | To build a set, use the :py:meth:`from_iter` classmethod and pass it an 97 | iterator and (optionally) a path where the set should be stored. If the 98 | latter is missing, the set will be built in memory. 99 | 100 | The interface follows the built-in `set` type, with a few additions: 101 | 102 | * Range queries with slicing syntax (i.e. 
`myset['c':'f']` will return an 103 | iterator over all items in the set that start with 'c', 'd' or 'e') 104 | * Performing fuzzy searches on the set bounded by Levenshtein edit distance 105 | * Performing a search with a regular expression 106 | 107 | A few caveats must be kept in mind: 108 | 109 | * Once constructed, a Set can never be modified. 110 | * Sets must be built with iterators of lexicographically sorted 111 | unicode strings 112 | """ 113 | 114 | @staticmethod 115 | @contextmanager 116 | def build(path=None): 117 | """ Context manager to build a new set. 118 | 119 | Call :py:meth:`insert` on the returned builder object to insert 120 | new items into the set. Keep in mind that insertion must happen in 121 | lexicographical order, otherwise an exception will be thrown. 122 | 123 | :param path: Path to build set in, or `None` if set should be built 124 | in memory 125 | :returns: :py:class:`SetBuilder` 126 | """ 127 | if path: 128 | builder = FileSetBuilder(path) 129 | else: 130 | builder = MemSetBuilder() 131 | yield builder 132 | builder.finish() 133 | 134 | @classmethod 135 | def from_iter(cls, it, path=None): 136 | """ Build a new set from an iterator. 137 | 138 | Keep in mind that the iterator must return unicode strings in 139 | lexicographical order, otherwise an exception will be thrown. 140 | 141 | :param it: Iterator to build set with 142 | :type it: iterator over unicode strings 143 | :param path: Path to build set in, or `None` if set should be built 144 | in memory 145 | :returns: The finished set 146 | :rtype: :py:class:`Set` 147 | """ 148 | with cls.build(path) as builder: 149 | for key in it: 150 | builder.insert(key) 151 | if path: 152 | return cls(path=path) 153 | else: 154 | return builder.get_set() 155 | 156 | def __init__(self, path, _pointer=None): 157 | """ Load a set from a given file. 
158 | 159 | :param path: Path to set on disk 160 | """ 161 | self._ctx = ffi.gc(lib.fst_context_new(), lib.fst_context_free) 162 | if path: 163 | s = checked_call(lib.fst_set_open, self._ctx, 164 | ffi.new("char[]", path.encode('utf8'))) 165 | else: 166 | s = _pointer 167 | self._ptr = ffi.gc(s, lib.fst_set_free) 168 | 169 | def __contains__(self, val): 170 | """ Check if the set contains the value. """ 171 | return lib.fst_set_contains( 172 | self._ptr, ffi.new("char[]", val.encode('utf8'))) 173 | 174 | def __iter__(self): 175 | """ Get an iterator over all keys in the set in lexicographical order. 176 | 177 | """ 178 | stream_ptr = lib.fst_set_stream(self._ptr) 179 | return KeyStreamIterator(stream_ptr, lib.fst_set_stream_next, 180 | lib.fst_set_stream_free) 181 | 182 | def __len__(self): 183 | """ Get the number of keys in the set. """ 184 | return int(lib.fst_set_len(self._ptr)) 185 | 186 | def __getitem__(self, s): 187 | """ Get an iterator over a range of set contents. 188 | 189 | Start and stop indices of the slice must be unicode strings. 190 | 191 | .. important:: 192 | Slicing follows the semantics for numerical indices, i.e. the 193 | `stop` value is **exclusive**. For example, given the set 194 | `s = Set.from_iter(["bar", "baz", "foo", "moo"])`, `s['b': 'f']` 195 | will only return `"bar"` and `"baz"`. 196 | 197 | :param s: A slice that specifies the range of the set to retrieve 198 | :type s: :py:class:`slice` 199 | """ 200 | if not isinstance(s, slice): 201 | raise ValueError( 202 | "Value must be a string slice (e.g. 
`['foo':]`)") 203 | if s.start and s.stop and s.start > s.stop: 204 | raise ValueError( 205 | "Start key must be lexicographically smaller than stop.") 206 | sb_ptr = lib.fst_set_streambuilder_new(self._ptr) 207 | if s.start: 208 | c_start = ffi.new("char[]", s.start.encode('utf8')) 209 | sb_ptr = lib.fst_set_streambuilder_add_ge(sb_ptr, c_start) 210 | if s.stop: 211 | c_stop = ffi.new("char[]", s.stop.encode('utf8')) 212 | sb_ptr = lib.fst_set_streambuilder_add_lt(sb_ptr, c_stop) 213 | stream_ptr = lib.fst_set_streambuilder_finish(sb_ptr) 214 | return KeyStreamIterator(stream_ptr, lib.fst_set_stream_next, 215 | lib.fst_set_stream_free) 216 | 217 | def _make_opbuilder(self, *others): 218 | opbuilder = OpBuilder(self._ptr) 219 | for oth in others: 220 | opbuilder.push(oth._ptr) 221 | return opbuilder 222 | 223 | def union(self, *others): 224 | """ Get an iterator over the keys in the union of this set and others. 225 | 226 | :param others: List of :py:class:`Set` objects 227 | :returns: Iterator over all keys in all sets in lexicographical 228 | order 229 | """ 230 | return self._make_opbuilder(*others).union() 231 | 232 | def intersection(self, *others): 233 | """ Get an iterator over the keys in the intersection of this set and 234 | others. 235 | 236 | :param others: List of :py:class:`Set` objects 237 | :returns: Iterator over all keys that exist in all of the passed 238 | sets in lexicographical order 239 | """ 240 | return self._make_opbuilder(*others).intersection() 241 | 242 | def difference(self, *others): 243 | """ Get an iterator over the keys in the difference of this set and 244 | others.
245 | 246 | :param others: List of :py:class:`Set` objects 247 | :returns: Iterator over all keys that exist in this set, but in 248 | none of the other sets, in lexicographical order 249 | """ 250 | return self._make_opbuilder(*others).difference() 251 | 252 | def symmetric_difference(self, *others): 253 | """ Get an iterator over the keys in the symmetric difference of this 254 | set and others. 255 | 256 | :param others: List of :py:class:`Set` objects 257 | :returns: Iterator over all keys that exist in only one of the 258 | sets in lexicographical order 259 | """ 260 | return self._make_opbuilder(*others).symmetric_difference() 261 | 262 | def issubset(self, other): 263 | """ Check if this set is a subset of another set. 264 | 265 | :param other: Another set 266 | :type other: :py:class:`Set` 267 | :rtype: bool 268 | """ 269 | return bool(lib.fst_set_issubset(self._ptr, other._ptr)) 270 | 271 | def issuperset(self, other): 272 | """ Check if this set is a superset of another set. 273 | 274 | :param other: Another set 275 | :type other: :py:class:`Set` 276 | :rtype: bool 277 | """ 278 | return bool(lib.fst_set_issuperset(self._ptr, other._ptr)) 279 | 280 | def isdisjoint(self, other): 281 | """ Check if this set is disjoint from another set. 282 | 283 | :param other: Another set 284 | :type other: :py:class:`Set` 285 | :rtype: bool 286 | """ 287 | return bool(lib.fst_set_isdisjoint(self._ptr, other._ptr)) 288 | 289 | def search_re(self, pattern): 290 | """ Search the set with a regular expression. 291 | 292 | Note that the regular expression syntax is not Python's, but the one 293 | supported by the `regex` Rust crate, which is almost identical 294 | to that of the RE2 engine. 295 | 296 | For documentation of the syntax, see: 297 | http://doc.rust-lang.org/regex/regex/index.html#syntax 298 | 299 | Due to limitations of the underlying FST, only a subset of this syntax 300 | is supported.
Most notably absent are: 301 | 302 | * Lazy quantifiers (``r'*?'``, ``r'+?'``) 303 | * Word boundaries (``r'\\b'``) 304 | * Other zero-width assertions (``r'^'``, ``r'$'``) 305 | 306 | For background on these limitations, consult the documentation of 307 | the Rust crate: http://burntsushi.net/rustdoc/fst/struct.Regex.html 308 | 309 | :param pattern: A regular expression 310 | :returns: An iterator over all matching keys in the set 311 | :rtype: :py:class:`KeyStreamIterator` 312 | """ 313 | re_ptr = checked_call( 314 | lib.fst_regex_new, self._ctx, 315 | ffi.new("char[]", pattern.encode('utf8'))) 316 | stream_ptr = lib.fst_set_regexsearch(self._ptr, re_ptr) 317 | return KeyStreamIterator(stream_ptr, lib.fst_set_regexstream_next, 318 | lib.fst_set_regexstream_free, re_ptr, 319 | lib.fst_regex_free) 320 | 321 | def search(self, term, max_dist): 322 | """ Search the set with a Levenshtein automaton. 323 | 324 | :param term: The search term 325 | :param max_dist: The maximum edit distance for search results 326 | :returns: Iterator over matching values in the set 327 | :rtype: :py:class:`KeyStreamIterator` 328 | """ 329 | lev_ptr = checked_call( 330 | lib.fst_levenshtein_new, self._ctx, 331 | ffi.new("char[]", term.encode('utf8')), max_dist) 332 | stream_ptr = lib.fst_set_levsearch(self._ptr, lev_ptr) 333 | return KeyStreamIterator(stream_ptr, lib.fst_set_levstream_next, 334 | lib.fst_set_levstream_free, lev_ptr, 335 | lib.fst_levenshtein_free) 336 | -------------------------------------------------------------------------------- /rust_setuptools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Based on code by Armin Ronacher and James Salter 3 | 4 | https://github.com/mitsuhiko/rust-setuptools 5 | https://github.com/novocaine/rust-python-ext 6 | """ 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | import shutil 12 | import subprocess 13 | 14 | from distutils.cmd 
import Command 15 | from distutils.command.install_lib import install_lib 16 | from distutils.dist import Distribution 17 | 18 | 19 | if sys.platform == 'win32': 20 | DYNAMIC_LIB_SUFFIX = '.dll' 21 | elif sys.platform == 'darwin': 22 | DYNAMIC_LIB_SUFFIX = '.dylib' 23 | else: 24 | DYNAMIC_LIB_SUFFIX = '.so' 25 | 26 | 27 | class RustDistribution(Distribution): 28 | 29 | def __init__(self, attrs=None): 30 | Distribution.__init__(self, attrs) 31 | self.ext_modules = [] 32 | 33 | def has_ext_modules(self): 34 | return True 35 | 36 | 37 | class RustBuildCommand(Command): 38 | description = 'build rust crates into Python extensions' 39 | 40 | user_options = [] 41 | 42 | def initialize_options(self): 43 | for k, v in self.__class__.rust_build_args.items(): 44 | setattr(self, k, v) 45 | 46 | def finalize_options(self): 47 | pass 48 | 49 | def run(self): 50 | # Force binary wheel 51 | self.distribution.has_ext_modules = lambda: True 52 | self.distribution.ext_modules = [] 53 | 54 | # Make sure that if pythonXX-sys is used, it builds against the 55 | # current executing python interpreter. 56 | bindir = os.path.dirname(sys.executable) 57 | if sys.platform == 'win32': 58 | path_sep = ';' 59 | else: 60 | path_sep = ':' 61 | 62 | env = dict(os.environ) 63 | env.update({ 64 | # disables rust's pkg-config seeking for specified packages, 65 | # which causes pythonXX-sys to fall back to detecting the 66 | # interpreter from the path. 67 | 'PYTHON_2.7_NO_PKG_CONFIG': '1', 68 | 'PATH': bindir + path_sep + env.get('PATH', '') 69 | }) 70 | 71 | for crate_path, dest in self.cargo_crates: 72 | # Execute cargo. 
73 | try: 74 | toml = os.path.join(crate_path, 'Cargo.toml') 75 | args = ['cargo', 'build', '--manifest-path', toml] 76 | if not self.debug: 77 | args.append('--release') 78 | args.extend(list(self.extra_cargo_args or [])) 79 | if not self.quiet: 80 | print(' '.join(args), file=sys.stderr) 81 | output = subprocess.check_output(args, env=env) 82 | except subprocess.CalledProcessError as e: 83 | msg = 'cargo failed with code: %d\n%s' % (e.returncode, e.output) 84 | raise Exception(msg) 85 | except OSError: 86 | raise Exception( 87 | 'Unable to execute cargo - this package requires rust to ' 88 | 'be installed and cargo to be on the PATH') 89 | 90 | if not self.quiet: 91 | print(output, file=sys.stderr) 92 | 93 | # Find the shared library that cargo hopefully produced and copy 94 | # it into the build directory as if it were produced by 95 | # build_ext. 96 | if self.debug: 97 | suffix = 'debug' 98 | else: 99 | suffix = 'release' 100 | 101 | dylib_path = os.path.join(crate_path, 'target/', suffix) 102 | 103 | # Ask build_ext where the shared library would go if it had built it, 104 | # then copy it there.
105 | build_ext = self.get_finalized_command('build_ext') 106 | 107 | target = os.path.dirname(build_ext.get_ext_fullpath('x')) 108 | try: 109 | os.makedirs(target) 110 | except OSError: 111 | pass 112 | 113 | target = os.path.join(target, dest) 114 | 115 | for filename in os.listdir(dylib_path): 116 | if filename.endswith(DYNAMIC_LIB_SUFFIX): 117 | shutil.copy(os.path.join(dylib_path, filename), 118 | os.path.join(target, filename)) 119 | 120 | 121 | def build_rust_cmdclass(crates, debug=False, 122 | extra_cargo_args=None, quiet=False): 123 | class _RustBuildCommand(RustBuildCommand): 124 | rust_build_args = { 125 | 'cargo_crates': crates, 126 | 'debug': debug, 127 | 'extra_cargo_args': extra_cargo_args, 128 | 'quiet': quiet, 129 | } 130 | return _RustBuildCommand 131 | 132 | 133 | def build_install_lib_cmdclass(base=None): 134 | if base is None: 135 | base = install_lib 136 | class _RustInstallLibCommand(base): 137 | def build(self): 138 | base.build(self) 139 | if not self.skip_build: 140 | self.run_command('build_rust') 141 | return _RustInstallLibCommand 142 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | from setuptools import setup 4 | 5 | # The AppVeyor build doesn't use rustup, so we run cargo directly there 6 | if platform.system() == 'Windows': 7 | BUILD_CMD = ['cargo', 'build', '--release'] 8 | else: 9 | BUILD_CMD = ['rustup', 'run', 'nightly', 'cargo', 'build', '--release'] 10 | 11 | 12 | def build_native(spec): 13 | build = spec.add_external_build(cmd=BUILD_CMD, path='./rust') 14 | spec.add_cffi_module( 15 | module_path='rust_fst._native', 16 | dylib=lambda: build.find_dylib('rust_fst', in_path='target/release'), 17 | header_filename=lambda: build.find_header('rust_fst.h', in_path='./'), 18 | rtld_flags=['NOW', 'NODELETE'] 19 | ) 20 | 21 | 22 | setup( 23 | name='rust-fst', 24 | version='0.2.0dev', 
25 | author='Johannes Baiter', 26 | author_email='johannes.baiter@gmail.com', 27 | description=('Python bindings for the Rust `fst` create, providing sets ' 28 | 'and maps backed by finite state transducers.'), 29 | license='MIT', 30 | keywords=['fst', 'rust', 'levenshtein', 'automata', 'transducer', 31 | 'data_structures'], 32 | url='https://github.com/jbaiter/python-rust-fst', 33 | tests_require=['pytest', 'psutil', 'decorator'], 34 | packages=['rust_fst'], 35 | zip_safe=False, 36 | platforms='any', 37 | setup_requires=['milksnake'], 38 | install_requires=['milksnake'], 39 | milksnake_tasks=[build_native], 40 | classifiers=[ 41 | 'Development Status :: 4 - Beta', 42 | 'Intended Audience :: Developers', 43 | 'Intended Audience :: Science/Research', 44 | 'License :: OSI Approved :: MIT License', 45 | 'Topic :: Text Processing :: Indexing'] 46 | ) 47 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | decorator 3 | psutil 4 | -------------------------------------------------------------------------------- /tests/test_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | import rust_fst.lib as lib 5 | from rust_fst import Map 6 | 7 | 8 | TEST_ITEMS = [(u"möö", 1), (u"bar", 2), (u"baz", 1337), (u"foo", 2**16)] 9 | 10 | 11 | def do_build(path=None, items=TEST_ITEMS, sorted_=True): 12 | if sorted_: 13 | it = sorted(items) 14 | else: 15 | it = items 16 | return Map.from_iter(it=it, path=path) 17 | 18 | 19 | @pytest.fixture 20 | def fst_map(): 21 | return do_build() 22 | 23 | 24 | def test_build(tmpdir): 25 | fst_path = tmpdir.join('test.fst') 26 | do_build(str(fst_path)) 27 | assert fst_path.exists() 28 | 29 | 30 | def test_build_outoforder(tmpdir): 31 | fst_path = str(tmpdir.join('test.fst')) 32 | with 
pytest.raises(lib.TransducerError): 33 | do_build(fst_path, sorted_=False) 34 | 35 | 36 | def test_build_baddir(): 37 | fst_path = "/guaranteed-to-not-exist/set.fst" 38 | with pytest.raises(OSError): 39 | do_build(fst_path) 40 | 41 | 42 | def test_build_memory(fst_map): 43 | assert len(fst_map) == 4 44 | 45 | 46 | def test_map_contains(fst_map): 47 | for key, _ in TEST_ITEMS: 48 | assert key in fst_map 49 | 50 | 51 | def test_map_items(fst_map): 52 | items = list(fst_map.items()) 53 | assert items == sorted(TEST_ITEMS) 54 | 55 | 56 | def test_map_getitem(fst_map): 57 | for key, val in TEST_ITEMS: 58 | assert fst_map[key] == val 59 | 60 | 61 | def test_map_keys(fst_map): 62 | keys = list(fst_map.keys()) 63 | assert keys == sorted([k for k, _ in TEST_ITEMS]) 64 | 65 | 66 | def test_map_iter(fst_map): 67 | assert list(fst_map) == sorted([k for k, _ in TEST_ITEMS]) 68 | 69 | 70 | def test_map_values(fst_map): 71 | values = list(fst_map.values()) 72 | assert values == [v for _, v in sorted(TEST_ITEMS)] 73 | 74 | 75 | def test_map_search(fst_map): 76 | matches = list(fst_map.search("bam", 1)) 77 | assert matches == [(u"bar", 2), (u"baz", 1337)] 78 | 79 | 80 | def test_search_re(fst_map): 81 | matches = dict(fst_map.search_re(r'ba.*')) 82 | assert matches == {"bar": 2, "baz": 1337} 83 | 84 | 85 | def test_bad_pattern(fst_map): 86 | with pytest.raises(lib.RegexError): 87 | list(fst_map.search_re(r'ba.*?')) 88 | 89 | 90 | def test_map_union(): 91 | a = Map.from_iter({'bar': 8, 'baz': 16}) 92 | b = Map.from_iter({'bar': 32, 'moo': 64}) 93 | u = dict(a.union(b)) 94 | assert len(u) == 3 95 | bar_itms = [(itm.index, itm.value) for itm in u['bar']] 96 | assert bar_itms == [(0, 8), (1, 32)] 97 | baz_itms = [(itm.index, itm.value) for itm in u['baz']] 98 | assert baz_itms == [(0, 16)] 99 | moo_itms = [(itm.index, itm.value) for itm in u['moo']] 100 | assert moo_itms == [(1, 64)] 101 | 102 | 103 | def test_map_intersection(): 104 | a = Map.from_iter({'bar': 8, 'baz': 16}) 105 | b = 
Map.from_iter({'bar': 32, 'moo': 64}) 106 | i = dict(a.intersection(b)) 107 | assert len(i) == 1 108 | assert i['bar'] == ((0, 8), (1, 32)) 109 | 110 | 111 | def test_map_difference(): 112 | a = Map.from_iter({'bar': 8, 'baz': 16}) 113 | b = Map.from_iter({'bar': 32, 'moo': 64}) 114 | d = dict(a.difference(b)) 115 | assert len(d) == 1 116 | assert d['baz'] == ((0, 16),) 117 | 118 | 119 | def test_map_symmetric_difference(): 120 | a = Map.from_iter({'bar': 8, 'baz': 16}) 121 | b = Map.from_iter({'bar': 32, 'moo': 64}) 122 | s = dict(a.symmetric_difference(b)) 123 | assert len(s) == 2 124 | assert s['baz'] == ((0, 16),) 125 | assert s['moo'] == ((1, 64),) 126 | 127 | 128 | def test_range(fst_map): 129 | assert dict(fst_map['f':]) == {'foo': 2**16, u'möö': 1} 130 | assert dict(fst_map[:'m']) == {'bar': 2, 'baz': 1337, 'foo': 2**16} 131 | assert dict(fst_map['baz':'m']) == {'baz': 1337, 'foo': 2**16} 132 | with pytest.raises(ValueError): 133 | fst_map['c':'a'] 134 | -------------------------------------------------------------------------------- /tests/test_set.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | import rust_fst.lib as lib 5 | from rust_fst import Set 6 | 7 | 8 | TEST_KEYS = [u"möö", "bar", "baz", "foo"] 9 | 10 | 11 | def do_build(path, keys=TEST_KEYS, sorted_=True): 12 | with Set.build(path) as builder: 13 | for key in (sorted(keys) if sorted_ else keys): 14 | builder.insert(key) 15 | 16 | 17 | @pytest.fixture 18 | def fst_set(tmpdir): 19 | fst_path = str(tmpdir.join('test.fst')) 20 | do_build(fst_path) 21 | return Set(fst_path) 22 | 23 | 24 | def test_build(tmpdir): 25 | fst_path = tmpdir.join('test.fst') 26 | do_build(str(fst_path)) 27 | assert fst_path.exists() 28 | 29 | 30 | def test_build_outoforder(tmpdir): 31 | fst_path = str(tmpdir.join('test.fst')) 32 | with pytest.raises(lib.TransducerError): 33 | do_build(fst_path, sorted_=False) 34 | 35 | 36 | def 
test_build_baddir(): 37 | fst_path = "/guaranteed-to-not-exist/set.fst" 38 | with pytest.raises(OSError): 39 | with Set.build(fst_path) as builder: 40 | for key in sorted(TEST_KEYS): 41 | builder.insert(key) 42 | 43 | 44 | def test_build_memory(): 45 | memset = Set.from_iter(sorted(TEST_KEYS)) 46 | assert len(memset) == 4 47 | 48 | 49 | def test_load_badfile(tmpdir): 50 | bad_path = tmpdir.join("bad.fst") 51 | with bad_path.open('wb') as fp: 52 | fp.write(b'\xFF'*16) 53 | with pytest.raises(lib.TransducerError): 54 | Set(str(bad_path)) 55 | 56 | 57 | def test_iter(fst_set): 58 | stored_keys = list(fst_set) 59 | assert stored_keys == sorted(TEST_KEYS) 60 | 61 | 62 | def test_len(fst_set): 63 | assert len(fst_set) == 4 64 | 65 | 66 | def test_contains(fst_set): 67 | for key in TEST_KEYS: 68 | assert key in fst_set 69 | 70 | 71 | def test_issubset(tmpdir, fst_set): 72 | oth_path = tmpdir.join('other.fst') 73 | do_build(str(oth_path), keys=TEST_KEYS[:-2]) 74 | other_set = Set(str(oth_path)) 75 | assert other_set.issubset(fst_set) 76 | assert fst_set.issubset(fst_set) 77 | 78 | 79 | def test_issuperset(tmpdir, fst_set): 80 | oth_path = tmpdir.join('other.fst') 81 | do_build(str(oth_path), keys=TEST_KEYS[:-2]) 82 | other_set = Set(str(oth_path)) 83 | assert fst_set.issuperset(other_set) 84 | assert fst_set.issuperset(fst_set) 85 | 86 | 87 | def test_isdisjoint(tmpdir, fst_set): 88 | oth_path = tmpdir.join('other.fst') 89 | do_build(str(oth_path), keys=[u'ene', u'mene']) 90 | other_set = Set(str(oth_path)) 91 | assert fst_set.isdisjoint(other_set) 92 | assert other_set.isdisjoint(fst_set) 93 | assert not fst_set.isdisjoint(fst_set) 94 | assert not fst_set.issuperset(other_set) 95 | assert not fst_set.issubset(other_set) 96 | 97 | 98 | def test_search(fst_set): 99 | matches = list(fst_set.search("bam", 1)) 100 | assert matches == ["bar", "baz"] 101 | 102 | 103 | def test_levautomaton_too_big(fst_set): 104 | with pytest.raises(lib.LevenshteinError): 105 | 
next(fst_set.search("areallylongstring", 8)) 106 | 107 | 108 | def test_search_re(fst_set): 109 | matches = list(fst_set.search_re(r'ba.*')) 110 | assert matches == ["bar", "baz"] 111 | 112 | 113 | def test_bad_pattern(fst_set): 114 | with pytest.raises(lib.RegexError): 115 | list(fst_set.search_re(r'ba.*?')) 116 | 117 | 118 | def test_union(): 119 | a = Set.from_iter(["bar", "foo"]) 120 | b = Set.from_iter(["baz", "foo"]) 121 | assert list(a.union(b)) == ["bar", "baz", "foo"] 122 | 123 | 124 | def test_difference(): 125 | a = Set.from_iter(["bar", "foo"]) 126 | b = Set.from_iter(["baz", "foo"]) 127 | assert list(a.difference(b)) == ["bar"] 128 | 129 | 130 | def test_symmetric_difference(): 131 | a = Set.from_iter(["bar", "foo"]) 132 | b = Set.from_iter(["baz", "foo"]) 133 | assert list(a.symmetric_difference(b)) == ["bar", "baz"] 134 | 135 | 136 | def test_intersection(): 137 | a = Set.from_iter(["bar", "foo"]) 138 | b = Set.from_iter(["baz", "foo"]) 139 | assert list(a.intersection(b)) == ["foo"] 140 | 141 | 142 | def test_range(fst_set): 143 | assert list(fst_set['f':]) == ['foo', u'möö'] 144 | assert list(fst_set[:'m']) == ['bar', 'baz', 'foo'] 145 | assert list(fst_set['baz':'m']) == ['baz', 'foo'] 146 | with pytest.raises(ValueError): 147 | fst_set['c':'a'] 148 | with pytest.raises(ValueError): 149 | fst_set['c'] 150 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27, py34, py36, pypy 8 | 9 | [testenv] 10 | commands = py.test 11 | deps = 12 | pytest 13 | cffi 14 | --------------------------------------------------------------------------------