├── .github └── workflows │ ├── build.yml │ └── deploy.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── MANIFEST.in ├── README.md ├── cortex.yaml ├── images └── logo.png ├── poetry.lock ├── pydeduplines ├── __init__.py └── pydeduplines.pyi ├── pyproject.toml ├── src └── lib.rs └── tests ├── __init__.py └── test_pydeduplines.py /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | lint: 7 | if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags') 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout 11 | uses: actions/checkout@v3 12 | - name: Install latest rust 13 | uses: actions-rs/toolchain@v1 14 | with: 15 | toolchain: stable 16 | profile: minimal 17 | override: true 18 | components: clippy 19 | - name: Lint with clippy 20 | uses: actions-rs/cargo@v1 21 | with: 22 | command: clippy 23 | args: --all-targets --all-features 24 | test: 25 | runs-on: ${{ matrix.os }} 26 | needs: lint 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | python-version: 31 | - '3.7' 32 | - '3.8' 33 | - '3.9' 34 | - '3.10' 35 | - '3.11' 36 | os: 37 | - ubuntu-latest 38 | - macos-latest 39 | - windows-latest 40 | steps: 41 | - name: Checkout 42 | uses: actions/checkout@v3 43 | - name: Set up Python ${{ matrix.python-version }} 44 | uses: actions/setup-python@v3 45 | with: 46 | python-version: ${{ matrix.python-version }} 47 | - name: Install Poetry 48 | uses: abatilo/actions-poetry@v2.1.3 49 | - name: Install Rust 50 | uses: actions-rs/toolchain@v1 51 | with: 52 | profile: minimal 53 | toolchain: stable 54 | override: true 55 | - name: Install dependencies 56 | run: poetry install 57 | - name: Build Python package 58 | run: poetry run maturin develop 59 | - name: Test 60 | run: poetry run pytest -Werror tests 61 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | release: 4 | types: 5 | - released 6 | jobs: 7 | deploy: 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: 13 | - '3.7' 14 | - '3.8' 15 | - '3.9' 16 | - '3.10' 17 | - '3.11' 18 | os: 19 | - ubuntu-latest 20 | - macos-latest 21 | - windows-latest 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | - name: Set up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Install Rust 30 | uses: actions-rs/toolchain@v1 31 | with: 32 | profile: minimal 33 | toolchain: stable 34 | override: true 35 | - name: Install Cross-compilers (macOS) 36 | if: matrix.os == 'macos-latest' 37 | run: | 38 | rustup target add x86_64-apple-darwin 39 | rustup target add aarch64-apple-darwin 40 | - name: Publish Package 41 | uses: PyO3/maturin-action@v1 42 | with: 43 | command: publish 44 | args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }} 45 | env: 46 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 47 | if: matrix.os != 'macos-latest' 48 | - name: Publish macOS (x86_64) Package 49 | if: matrix.os == 'macos-latest' 50 | uses: PyO3/maturin-action@v1 51 | with: 52 | command: publish 53 | args: --username=__token__ --interpreter=python${{ 
matrix.python-version }} --target=x86_64-apple-darwin --no-sdist 54 | env: 55 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} 56 | - name: Publish macOS (arm64) Package 57 | if: matrix.os == 'macos-latest' 58 | uses: PyO3/maturin-action@v1 59 | with: 60 | command: publish 61 | args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist 62 | env: 63 | MATURIN_PASSWORD: ${{ secrets.pypi_password }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # conflict temp files 32 | *.py.orig 33 | *.mock 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | coverage_html_report/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 65 | 66 | *.iml 67 | 68 | ## Directory-based project format: 69 | .idea/ 70 | 71 | # Tests 72 | generic_tests.py 73 | cloudflare_test.py 74 | 75 | ############################ 76 | #Eclipse Specific GitIgnore# 77 | ############################ 78 | *.pydevproject 79 | .project 80 | .metadata 81 | bin/** 82 | tmp/** 83 | tmp/**/* 84 | *.tmp 85 | *.bak 86 | *.swp 87 | *~.nib 88 | local.properties 89 | .classpath 90 | .settings/ 91 | .loadpath 92 | 93 | 94 | # Git mergetool traces 95 | *.orig 96 | 97 | # VS Code internal directory 98 | .vscode/ 99 | 100 | *.dat 101 | *.code-workspace 102 | .history 103 | 104 | # Intsights development playground 105 | playground/ 106 | 107 | pytest-report\.csv 108 | *.cppimporthash 109 | .rendered.* 110 | Databases.db 111 | 112 | # Node.js 113 | dist/ 114 | node_modules/ 115 | coverage/ 116 | 117 | # Generated by Cargo 118 | # will have compiled files and executables 119 | /target/ 120 | 121 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 122 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 123 | Cargo.lock 124 | 125 | # These are backup files generated by rustfmt 126 | **/*.rs.bk 127 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "PyDeduplines" 3 | version = "0.6.1" 4 | authors = ["Gal Ben David "] 5 | edition = "2021" 6 | description = "Python library for a duplicate lines removal written in Rust" 7 | readme = "README.md" 8 | repository = "https://github.com/intsights/PyDeduplines" 9 | homepage = "https://github.com/intsights/PyDeduplines" 10 | license 
= "MIT" 11 | keywords = [ 12 | "unique", 13 | "lines", 14 | "rust", 15 | "pyo3", 16 | ] 17 | 18 | [package.metadata.maturin] 19 | 20 | [lib] 21 | name = "pydeduplines" 22 | crate-type = ["cdylib"] 23 | 24 | [dependencies.pyo3] 25 | version = "0.15.1" 26 | features = ["extension-module"] 27 | 28 | [dependencies] 29 | ahash = "0.7" 30 | bytecount = {version = "0.6", features = ["runtime-dispatch-simd"]} 31 | crossbeam-deque = "0.8" 32 | crossbeam-utils = "0.8" 33 | memchr = "2" 34 | parking_lot = "0.12" 35 | 36 | [profile.release] 37 | lto = true 38 | panic = "abort" 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Intsights 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Cargo.toml 2 | include pyproject.toml 3 | recursive-include src * 4 | recursive-include pydeduplines * 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 
3 | ![Logo](images/logo.png)
4 | 
5 | 
6 | Python library for a duplicate lines removal written in Rust
7 | 
8 | 
9 | 10 | ![license](https://img.shields.io/badge/MIT-License-blue) 11 | ![Python](https://img.shields.io/badge/Python-3.7%20%7C%203.8%20%7C%203.9-blue) 12 | ![OS](https://img.shields.io/badge/OS-Mac%20%7C%20Linux%20%7C%20Windows-blue) 13 | ![Build](https://github.com/intsights/PyDeduplines/workflows/Build/badge.svg) 14 | [![PyPi](https://img.shields.io/pypi/v/PyDeduplines.svg)](https://pypi.org/project/PyDeduplines/) 15 | 16 | ## Table of Contents 17 | 18 | - [Table of Contents](#table-of-contents) 19 | - [About The Project](#about-the-project) 20 | - [Built With](#built-with) 21 | - [Performance](#performance) 22 | - [Deduplicating](#deduplicating) 23 | - [Added Lines](#added-lines) 24 | - [Installation](#installation) 25 | - [Documentation](#documentation) 26 | - [Usage](#usage) 27 | - [License](#license) 28 | - [Contact](#contact) 29 | 30 | 31 | ## About The Project 32 | 33 | This library is used to manipulate the lines of files. To achieve speed and efficiency, the library is written in Rust. 34 | 35 | There are two functions in the library: 36 | - `compute_unique_lines` - This function takes a list of input file paths and an output file path, iterates over the input file paths and writes unique lines to the output file. 37 | - `compute_added_lines` - This function takes three arguments `first_file_path`, `second_file_path` and `output_file_path`, and writes to the output file only lines that appeared in the second file but not in the first. 38 | 39 | 40 | ### Built With 41 | 42 | * [pyo3](https://github.com/PyO3/pyo3) 43 | * [crossbeam](https://github.com/crossbeam-rs/crossbeam) 44 | * [ahash](https://github.com/tkaitchuck/aHash) 45 | * [parking_lot](https://github.com/Amanieu/parking_lot) 46 | * [memchr](https://github.com/BurntSushi/memchr) 47 | * [bytecount](https://github.com/llogiq/bytecount) 48 | 49 | 50 | ### Performance 51 | 52 | #### Deduplicating 53 | | Library | Function | Time | Peak Memory | 54 | | ------------- | ------------- | ------------- | ------------- | 55 | | [GNU Sort](https://www.gnu.org/software/coreutils/) | sort -u -o output 500mb_one 500mb_two | 37.35s | 8,261mb | 56 | | [PyDeduplines](https://github.com/intsights/PyDeduplines) | compute_unique_lines('./workdir', ['500mb_one', '500mb_two'], 'output', 16) | 4.55s | 685mb | 57 | 58 | #### Added Lines 59 | | Library | Function | Time | Peak Memory | 60 | | ------------- | ------------- | ------------- | ------------- | 61 | | [GNU Sort](https://www.gnu.org/software/coreutils/) | comm -1 -3 <(sort 500mb_one) <(sort 500mb_two) > output.txt | 26.53s | 4,132mb | 62 | | [PyDeduplines](https://github.com/intsights/PyDeduplines) | compute_added_lines('./workdir', '500mb_one', '500mb_two', 'output', 16) | 3.95s | 314mb | 63 | 64 | 65 | ### Installation 66 | 67 | ```sh 68 | pip3 install PyDeduplines 69 | ``` 70 | 71 | 72 | ## Documentation 73 | 74 | ```python 75 | def compute_unique_lines( 76 | working_directory: str, 77 | file_paths: typing.List[str], 78 | output_file_path: str, 79 | number_of_splits: int, 80 | number_of_threads: int = 0, 81 | ) -> None: ... 82 | ``` 83 | - `working_directory` - A file path of a directory to work in. Each split file would be created in this directory. 84 | - `file_paths` - A list of strings containing the input file paths to iterate over and to calculate unique values for. 85 | - `output_file_path` - The path where the unique lines will be written. 
86 | - `number_of_splits` - The number of smaller part files to split each input file into. This parameter is central to how the library works: the more splits, the lower the peak memory consumption, but also the more part files are open at once.
87 | - `number_of_threads` - Number of parallel threads. *0* means use one thread per CPU core. A value greater than *1* increases the number of part files created for each input file (the total is `number_of_threads * number_of_splits` parts per file).
88 | 
89 | ```python
90 | def compute_added_lines(
91 |     working_directory: str,
92 |     first_file_path: str,
93 |     second_file_path: str,
94 |     output_file_path: str,
95 |     number_of_splits: int,
96 |     number_of_threads: int = 0,
97 | ) -> None: ...
98 | ```
99 | - `working_directory` - A path to a directory to work in. Each split file will be created in this directory.
100 | - `first_file_path` - A path to the first file to be iterated over.
101 | - `second_file_path` - A file path to iterate over and find lines that do not exist in the first file.
102 | - `output_file_path` - A path to the output file that contains the lines that appeared in the second file but not in the first.
103 | - `number_of_splits` - The number of smaller part files to split each input file into. This parameter is central to how the library works: the more splits, the lower the peak memory consumption, but also the more part files are open at once.
104 | - `number_of_threads` - Number of parallel threads. *0* means use one thread per CPU core. A value greater than *1* increases the number of part files created for each input file (the total is `number_of_threads * number_of_splits` parts per file).
105 | 
106 | ## Usage
107 | 
108 | ```python
109 | import pydeduplines
110 | 
111 | 
112 | pydeduplines.compute_unique_lines(
113 |     working_directory='tmp',
114 |     file_paths=[
115 |         '500mb_one',
116 |         '500mb_two',
117 |     ],
118 |     output_file_path='output',
119 |     number_of_splits=4,
120 | )
121 | 
122 | pydeduplines.compute_added_lines(
123 |     working_directory='tmp',
124 |     first_file_path='500mb_one',
125 |     second_file_path='500mb_two',
126 |     output_file_path='output',
127 |     number_of_splits=4,
128 | )
129 | ```
130 | 
131 | 
132 | ## License
133 | 
134 | Distributed under the MIT License. See `LICENSE` for more information. 
135 | 136 | 137 | ## Contact 138 | 139 | Gal Ben David - gal@intsights.com 140 | 141 | Project Link: [https://github.com/intsights/PyDeduplines](https://github.com/intsights/PyDeduplines) 142 | -------------------------------------------------------------------------------- /cortex.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | info: 3 | title: Pydeduplines 4 | description: Python library for a duplicate lines removal written in C++ 5 | x-cortex-git: 6 | github: 7 | alias: intsightsorg 8 | repository: Intsights/PyDeduplines 9 | x-cortex-tag: pydeduplines 10 | x-cortex-type: service 11 | x-cortex-domain-parents: 12 | - tag: threatintel-platform-delivery 13 | x-cortex-groups: 14 | - exposure:external-ship 15 | - target:library 16 | openapi: 3.0.1 17 | servers: 18 | - url: "/" 19 | -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyDeduplines/759f7df1f154ba56217af446c3a782e47cadb409/images/logo.png -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | [[package]] 2 | name = "colorama" 3 | version = "0.4.6" 4 | description = "Cross-platform colored terminal text." 5 | category = "dev" 6 | optional = false 7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 8 | 9 | [[package]] 10 | name = "exceptiongroup" 11 | version = "1.2.0" 12 | description = "Backport of PEP 654 (exception groups)" 13 | category = "dev" 14 | optional = false 15 | python-versions = ">=3.7" 16 | 17 | [package.extras] 18 | test = ["pytest (>=6)"] 19 | 20 | [[package]] 21 | name = "importlib-metadata" 22 | version = "6.7.0" 23 | description = "Read metadata from Python packages" 24 | category = "dev" 25 | optional = false 26 | python-versions = ">=3.7" 27 | 28 | [package.dependencies] 29 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 30 | zipp = ">=0.5" 31 | 32 | [package.extras] 33 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 34 | perf = ["ipython"] 35 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] 36 | 37 | [[package]] 38 | name = "iniconfig" 39 | version = "2.0.0" 40 | description = "brain-dead simple config-ini parsing" 41 | category = "dev" 42 | optional = false 43 | python-versions = ">=3.7" 44 | 45 | [[package]] 46 | name = "maturin" 47 | version = "1.4.0" 48 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages" 49 | category = "dev" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 55 | 56 | [package.extras] 57 | zig = ["ziglang (>=0.10.0,<0.11.0)"] 58 | patchelf = ["patchelf"] 59 | 60 | [[package]] 61 | name = "packaging" 62 | version = "23.2" 63 | description = "Core utilities for Python packages" 64 | category = "dev" 65 | optional = false 66 | python-versions = ">=3.7" 67 | 68 | [[package]] 69 | name = "pluggy" 70 | version = 
"1.2.0" 71 | description = "plugin and hook calling mechanisms for python" 72 | category = "dev" 73 | optional = false 74 | python-versions = ">=3.7" 75 | 76 | [package.dependencies] 77 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 78 | 79 | [package.extras] 80 | dev = ["pre-commit", "tox"] 81 | testing = ["pytest", "pytest-benchmark"] 82 | 83 | [[package]] 84 | name = "pytest" 85 | version = "7.4.4" 86 | description = "pytest: simple powerful testing with Python" 87 | category = "dev" 88 | optional = false 89 | python-versions = ">=3.7" 90 | 91 | [package.dependencies] 92 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 93 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 94 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 95 | iniconfig = "*" 96 | packaging = "*" 97 | pluggy = ">=0.12,<2.0" 98 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 99 | 100 | [package.extras] 101 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] 102 | 103 | [[package]] 104 | name = "pytest-runner" 105 | version = "6.0.1" 106 | description = "Invoke py.test as distutils command with dependency resolution" 107 | category = "dev" 108 | optional = false 109 | python-versions = ">=3.7" 110 | 111 | [package.extras] 112 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] 113 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] 114 | 115 | [[package]] 116 | name = "tomli" 117 | version = "2.0.1" 118 | description = "A lil' TOML parser" 119 | category = "dev" 120 | optional = false 121 | python-versions = ">=3.7" 122 | 123 | [[package]] 124 | name = "typing-extensions" 125 | version = "4.7.1" 126 | description = "Backported and Experimental Type Hints for Python 3.7+" 127 | category = "dev" 128 | optional = false 129 | python-versions = ">=3.7" 130 | 131 | [[package]] 132 | name = "zipp" 133 | version = "3.15.0" 134 | description = "Backport of pathlib-compatible object wrapper for zip files" 135 | category = "dev" 136 | optional = false 137 | python-versions = ">=3.7" 138 | 139 | [package.extras] 140 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"] 141 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] 142 | 143 | [metadata] 144 | lock-version = "1.1" 145 | python-versions = "^3.7" 146 | content-hash = "d3751775f5a48f55874329689185792d15525d44f15678cc3bfeb66b5dea0d3d" 147 | 148 | [metadata.files] 149 | colorama = [] 150 | exceptiongroup = [] 151 | importlib-metadata = [] 152 | iniconfig = [] 153 | maturin = [] 154 | packaging = [] 155 | pluggy = [] 156 | pytest = [] 157 | pytest-runner = [] 158 | tomli = [] 159 | typing-extensions = [] 160 | zipp = [] 161 | -------------------------------------------------------------------------------- /pydeduplines/__init__.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import pathlib 4 | import typing 5 
| import shutil 6 | 7 | from . import pydeduplines 8 | 9 | 10 | def compute_unique_lines( 11 | working_directory: str, 12 | file_paths: typing.List[str], 13 | output_file_path: str, 14 | number_of_splits: int, 15 | number_of_threads: int = 0, 16 | ) -> None: 17 | try: 18 | os.makedirs( 19 | name=working_directory, 20 | exist_ok=True, 21 | ) 22 | 23 | for file_path in file_paths: 24 | if not os.path.exists( 25 | path=file_path, 26 | ): 27 | raise FileNotFoundError(f'Could not find file: {file_path}') 28 | 29 | output_file_folder = pathlib.Path(output_file_path).parent 30 | if not os.access( 31 | path=output_file_folder, 32 | mode=os.W_OK, 33 | ): 34 | raise PermissionError(f'Could not write to the output file folder: {output_file_folder}') 35 | 36 | if number_of_threads <= 0: 37 | number_of_threads = multiprocessing.cpu_count() 38 | 39 | return pydeduplines.compute_unique_lines( 40 | working_directory, 41 | file_paths, 42 | output_file_path, 43 | number_of_splits, 44 | number_of_threads, 45 | ) 46 | finally: 47 | shutil.rmtree( 48 | path=working_directory, 49 | ) 50 | 51 | 52 | def compute_added_lines( 53 | working_directory: str, 54 | first_file_path: str, 55 | second_file_path: str, 56 | output_file_path: str, 57 | number_of_splits: int, 58 | number_of_threads: int = 0, 59 | ) -> None: 60 | try: 61 | os.makedirs( 62 | name=working_directory, 63 | exist_ok=True, 64 | ) 65 | 66 | for file_path in [ 67 | first_file_path, 68 | second_file_path, 69 | ]: 70 | if not os.path.exists( 71 | path=file_path, 72 | ): 73 | raise FileNotFoundError(f'Could not find file: {file_path}') 74 | 75 | output_file_folder = pathlib.Path(output_file_path).parent 76 | if not os.access( 77 | path=output_file_folder, 78 | mode=os.W_OK, 79 | ): 80 | raise PermissionError(f'Could not write to the output file folder: {output_file_folder}') 81 | 82 | if number_of_threads <= 0: 83 | number_of_threads = multiprocessing.cpu_count() 84 | 85 | return pydeduplines.compute_added_lines( 86 | working_directory, 87 | first_file_path, 88 | second_file_path, 89 | output_file_path, 90 | number_of_splits, 91 | number_of_threads, 92 | ) 93 | finally: 94 | shutil.rmtree( 95 | path=working_directory, 96 | ) 97 | -------------------------------------------------------------------------------- /pydeduplines/pydeduplines.pyi: -------------------------------------------------------------------------------- 1 | import typing 2 | 3 | 4 | def compute_unique_lines( 5 | working_directory: str, 6 | file_paths: typing.List[str], 7 | output_file_path: str, 8 | number_of_splits: int, 9 | number_of_threads: int = 0, 10 | ) -> None: ... 11 | 12 | 13 | def compute_added_lines( 14 | working_directory: str, 15 | first_file_path: str, 16 | second_file_path: str, 17 | output_file_path: str, 18 | number_of_splits: int, 19 | number_of_threads: int = 0, 20 | ) -> None: ... 
21 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.11,<0.12"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | sdist-include = [ 7 | "src", 8 | "Cargo.toml", 9 | "pydeduplines", 10 | ] 11 | 12 | [tool.poetry] 13 | name = "PyDeduplines" 14 | version = "0.6.1" 15 | authors = ["Gal Ben David "] 16 | description = "Python library for a duplicate lines removal written in Rust" 17 | readme = "README.md" 18 | repository = "https://github.com/intsights/PyDeduplines" 19 | license = "MIT" 20 | keywords = [ 21 | "unique", 22 | "lines", 23 | "rust", 24 | "pyo3", 25 | ] 26 | 27 | [tool.poetry.dependencies] 28 | python = "^3.7" 29 | 30 | [tool.poetry.dev-dependencies] 31 | pytest = "*" 32 | wheel = "*" 33 | pytest-runner = "*" 34 | maturin = "*" 35 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use ahash::AHashSet; 2 | use crossbeam_deque::{Steal, Worker}; 3 | use crossbeam_utils::thread as crossbeam_thread; 4 | use memchr::memchr_iter; 5 | use parking_lot::Mutex; 6 | use pyo3::exceptions::PyRuntimeError; 7 | use pyo3::prelude::*; 8 | use pyo3::wrap_pyfunction; 9 | use std::fs; 10 | use std::fs::File; 11 | use std::io::{BufReader, BufWriter}; 12 | use std::io::prelude::*; 13 | use std::path::{PathBuf, Path}; 14 | use std::sync::Arc; 15 | use std::sync::atomic::{AtomicUsize, AtomicBool, Ordering}; 16 | use std::thread; 17 | use std::time; 18 | 19 | const OUTPUT_FILE_BUFFER_SIZE: usize = 1024 * 1024 * 10; 20 | 21 | fn split_file( 22 | working_directory: &Path, 23 | input_file_path: &Path, 24 | prefix: String, 25 | num_parts: usize, 26 | should_stop: &AtomicBool, 27 | ) -> PyResult<()> { 28 | let mut output_files = Vec::with_capacity(num_parts); 29 | for i in 0..num_parts { 30 | let part_output_file_path = working_directory.join(format!("{}{}", prefix, i)); 31 | let output_file = File::create(part_output_file_path) 32 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create part_output_file_path: {:?}", err)))?; 33 | output_files.push(BufWriter::new(output_file)); 34 | } 35 | 36 | let input_file = File::open(input_file_path) 37 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open input_file_path: {:?}", err)))?; 38 | let mut input_file = BufReader::new(input_file); 39 | 40 | let mut bytes = vec![]; 41 | while !should_stop.load(Ordering::Relaxed) { 42 | let buf = input_file.fill_buf()?; 43 | let consumed = buf.len(); 44 | if consumed == 0 { 45 | break; 46 | } 47 | 48 | let mut prev_index = 0; 49 | for current_index in memchr_iter(b'\n', buf) { 50 | unsafe { 51 | let line = buf.get_unchecked(prev_index..=current_index); 52 | let current_index = line.iter().map(|x| *x as usize).sum::() % num_parts; 53 | 54 | output_files.get_unchecked_mut(current_index).write_all(line) 55 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_files[index]: {:?}", err)))?; 56 | } 57 | 58 | prev_index = current_index + 1; 59 | } 60 | 61 | if prev_index < buf.len() { 62 | bytes.extend_from_slice(&buf[prev_index..]); 63 | input_file.consume(consumed); 64 | input_file.read_until(b'\n', &mut bytes)?; 65 | if !bytes.is_empty() { 66 | if !bytes.ends_with(b"\n") { 67 | bytes.push(b'\n'); 68 | } 69 | 70 | let index = bytes.iter().map(|x| *x as usize).sum::() % num_parts; 71 | 
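                // Descriptive note: the leftover bytes form a line that did not end inside this
                // buffer fill; read_until above completes it, a trailing newline is appended if
                // missing, and it is routed with the same rule as full lines — sum of the line's
                // byte values modulo num_parts — so identical lines always land in the same part
                // file regardless of which input file they came from.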
72 | unsafe { 73 | output_files.get_unchecked_mut(index).write_all(&bytes) 74 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_files[index]: {:?}", err)))?; 75 | } 76 | } 77 | bytes.clear(); 78 | } else { 79 | input_file.consume(consumed); 80 | } 81 | }; 82 | 83 | Ok(()) 84 | } 85 | 86 | fn compute_part_added_lines( 87 | first_file_path: &Path, 88 | second_file_path: &Path, 89 | output_file: Arc>>, 90 | should_stop: &AtomicBool, 91 | ) -> PyResult<()> { 92 | let first_file_data = std::fs::read(first_file_path) 93 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open first_file_path: {:?}", err)))?; 94 | 95 | let number_of_lines = bytecount::count(&first_file_data, b'\n'); 96 | let mut lines_set = AHashSet::with_capacity(number_of_lines); 97 | let mut prev_index = 0; 98 | for current_index in memchr_iter(b'\n', &first_file_data) { 99 | unsafe { 100 | lines_set.insert(first_file_data.get_unchecked(prev_index..current_index)); 101 | prev_index = current_index + 1; 102 | 103 | if should_stop.load(Ordering::Relaxed) { 104 | return Ok(()); 105 | } 106 | } 107 | } 108 | 109 | let second_file = File::open(second_file_path) 110 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open second_file_path: {:?}", err)))?; 111 | let mut second_file = BufReader::new(second_file); 112 | let mut output_file_buffer = Vec::with_capacity(OUTPUT_FILE_BUFFER_SIZE + 1); 113 | let mut bytes = vec![]; 114 | while !should_stop.load(Ordering::Relaxed) { 115 | let buf = second_file.fill_buf()?; 116 | let consumed = buf.len(); 117 | if consumed == 0 { 118 | break; 119 | } 120 | 121 | let mut prev_index = 0; 122 | for current_index in memchr_iter(b'\n', buf) { 123 | unsafe { 124 | let line = buf.get_unchecked(prev_index..current_index); 125 | if !lines_set.contains(line) { 126 | if output_file_buffer.len() + line.len() + 1 > OUTPUT_FILE_BUFFER_SIZE { 127 | output_file.lock().write_all(&output_file_buffer) 128 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?; 129 | output_file_buffer.clear(); 130 | } 131 | output_file_buffer.extend_from_slice(buf.get_unchecked(prev_index..=current_index)); 132 | } 133 | 134 | prev_index = current_index + 1; 135 | } 136 | } 137 | 138 | if prev_index < buf.len() { 139 | bytes.extend_from_slice(&buf[prev_index..]); 140 | second_file.consume(consumed); 141 | second_file.read_until(b'\n', &mut bytes)?; 142 | if bytes.len() > 1 && !lines_set.contains(&bytes[..bytes.len() - 1]) { 143 | output_file.lock().write_all(&bytes) 144 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?; 145 | } 146 | bytes.clear(); 147 | } else { 148 | second_file.consume(consumed); 149 | } 150 | } 151 | if !output_file_buffer.is_empty() { 152 | output_file.lock().write_all(&output_file_buffer) 153 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?; 154 | } 155 | 156 | Ok(()) 157 | } 158 | 159 | fn compute_part_unique_lines( 160 | file_paths: Vec, 161 | output_file: Arc>>, 162 | should_stop: &AtomicBool, 163 | ) -> PyResult<()> { 164 | let mut total_number_of_bytes = 0; 165 | for file_path in file_paths.iter() { 166 | let metadata = fs::metadata(file_path) 167 | .map_err(|err| PyRuntimeError::new_err(format!("Could not get file_path metadata: {:?}", err)))?; 168 | total_number_of_bytes += metadata.len() as usize + file_paths.len(); 169 | } 170 | 171 | let mut file_data = Vec::with_capacity(total_number_of_bytes); 172 
| for file_path in file_paths.iter() { 173 | let current_file_data = std::fs::read(file_path) 174 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open current_file_data: {:?}", err)))?; 175 | file_data.extend_from_slice(¤t_file_data); 176 | } 177 | 178 | let total_number_of_lines = bytecount::count(&file_data, b'\n'); 179 | let mut lines_set = AHashSet::with_capacity(total_number_of_lines); 180 | let mut output_file_buffer = Vec::with_capacity(OUTPUT_FILE_BUFFER_SIZE + 1); 181 | 182 | let mut prev_index = 0; 183 | for current_index in memchr_iter(b'\n', &file_data) { 184 | unsafe { 185 | let record = file_data.get_unchecked(prev_index..=current_index); 186 | prev_index = current_index + 1; 187 | 188 | if lines_set.insert(record) { 189 | if output_file_buffer.len() + record.len() > OUTPUT_FILE_BUFFER_SIZE { 190 | output_file.lock().write_all(&output_file_buffer) 191 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?; 192 | output_file_buffer.clear(); 193 | 194 | if should_stop.load(Ordering::Relaxed) { 195 | return Ok(()); 196 | } 197 | } 198 | output_file_buffer.extend_from_slice(record); 199 | } 200 | } 201 | } 202 | if !output_file_buffer.is_empty() { 203 | output_file.lock().write_all(&output_file_buffer) 204 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?; 205 | } 206 | 207 | Ok(()) 208 | } 209 | 210 | #[pyfunction] 211 | fn compute_added_lines( 212 | py: Python, 213 | working_directory: PathBuf, 214 | first_file_path: PathBuf, 215 | second_file_path: PathBuf, 216 | output_file_path: PathBuf, 217 | number_of_splits: usize, 218 | number_of_threads: usize, 219 | ) -> PyResult<()> { 220 | let num_parts = number_of_threads * number_of_splits; 221 | 222 | let mut python_signal_result = Ok(()); 223 | let results = Arc::new(Mutex::new(Vec::new())); 224 | let should_stop = AtomicBool::new(false); 225 | let working_threads = AtomicUsize::new(2); 226 | 227 | crossbeam_thread::scope( 228 | |s| { 229 | s.spawn( 230 | |_| { 231 | let result = split_file( 232 | working_directory.as_path(), 233 | first_file_path.as_path(), 234 | "first_".to_string(), 235 | num_parts, 236 | &should_stop, 237 | ); 238 | results.lock().push(result); 239 | working_threads.fetch_sub(1, Ordering::Relaxed); 240 | } 241 | ); 242 | s.spawn( 243 | |_| { 244 | let result = split_file( 245 | working_directory.as_path(), 246 | second_file_path.as_path(), 247 | "second_".to_string(), 248 | num_parts, 249 | &should_stop, 250 | ); 251 | results.lock().push(result); 252 | working_threads.fetch_sub(1, Ordering::Relaxed); 253 | 254 | } 255 | ); 256 | while working_threads.load(Ordering::Relaxed) != 0 { 257 | python_signal_result = py.check_signals(); 258 | if python_signal_result.is_err() { 259 | should_stop.store(true, Ordering::Relaxed); 260 | 261 | break; 262 | } 263 | 264 | thread::sleep(time::Duration::from_millis(100)); 265 | } 266 | } 267 | ).map_err(|err| PyRuntimeError::new_err(format!("Splitting thread pool has paniced: {:?}", err)))?; 268 | python_signal_result?; 269 | for result in results.lock().drain(..) 
{ 270 | result?; 271 | } 272 | 273 | let mut python_signal_result = Ok(()); 274 | let working_threads = AtomicUsize::new(num_parts); 275 | let output_file = File::create(output_file_path) 276 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create output_file_path: {:?}", err)))?; 277 | let output_file = Arc::new(Mutex::new(BufWriter::new(output_file))); 278 | 279 | crossbeam_thread::scope( 280 | |s| { 281 | let worker = Worker::new_lifo(); 282 | let stealer = worker.stealer(); 283 | 284 | for i in 0..num_parts { 285 | worker.push( 286 | ( 287 | i, 288 | output_file.clone(), 289 | &should_stop, 290 | &working_threads, 291 | &working_directory, 292 | ) 293 | ); 294 | } 295 | 296 | for _ in 0..number_of_threads { 297 | let stealer = stealer.clone(); 298 | let results = results.clone(); 299 | s.spawn( 300 | move |_| { 301 | while let Steal::Success( 302 | ( 303 | i, 304 | output_file, 305 | should_stop, 306 | working_threads, 307 | working_directory, 308 | ) 309 | ) = stealer.steal() { 310 | let result = compute_part_added_lines( 311 | working_directory.join(format!("first_{}", i)).as_path(), 312 | working_directory.join(format!("second_{}", i)).as_path(), 313 | output_file, 314 | should_stop, 315 | ); 316 | results.lock().push(result); 317 | working_threads.fetch_sub(1, Ordering::Relaxed); 318 | } 319 | } 320 | ); 321 | } 322 | 323 | while working_threads.load(Ordering::Relaxed) != 0 { 324 | python_signal_result = py.check_signals(); 325 | if python_signal_result.is_err() { 326 | should_stop.store(true, Ordering::Relaxed); 327 | 328 | break; 329 | } 330 | 331 | thread::sleep(time::Duration::from_millis(100)); 332 | } 333 | } 334 | ).map_err(|err| PyRuntimeError::new_err(format!("Computing added lines thread pool has paniced: {:?}", err)))?; 335 | python_signal_result?; 336 | for result in results.lock().drain(..) 
{ 337 | result?; 338 | } 339 | 340 | Ok(()) 341 | } 342 | 343 | #[pyfunction] 344 | fn compute_unique_lines( 345 | py: Python, 346 | working_directory: PathBuf, 347 | file_paths: Vec, 348 | output_file_path: PathBuf, 349 | number_of_splits: usize, 350 | number_of_threads: usize, 351 | ) -> PyResult<()> { 352 | let num_parts = number_of_threads * number_of_splits; 353 | 354 | let mut python_signal_result = Ok(()); 355 | let results = Arc::new(Mutex::new(Vec::new())); 356 | let should_stop = AtomicBool::new(false); 357 | let working_threads = AtomicUsize::new(file_paths.len()); 358 | 359 | crossbeam_thread::scope( 360 | |s| { 361 | let file_paths = file_paths.to_vec(); 362 | for (i, file_path) in file_paths.into_iter().enumerate() { 363 | let working_directory = &working_directory; 364 | let working_threads = &working_threads; 365 | let should_stop = &should_stop; 366 | let results = results.clone(); 367 | s.spawn( 368 | move |_| { 369 | let result = split_file( 370 | working_directory.as_path(), 371 | file_path.as_path(), 372 | format!("{}_", i), 373 | num_parts, 374 | should_stop, 375 | ); 376 | results.lock().push(result); 377 | working_threads.fetch_sub(1, Ordering::Relaxed); 378 | } 379 | ); 380 | } 381 | 382 | while working_threads.load(Ordering::Relaxed) != 0 { 383 | python_signal_result = py.check_signals(); 384 | if python_signal_result.is_err() { 385 | should_stop.store(true, Ordering::Relaxed); 386 | 387 | break; 388 | } 389 | 390 | thread::sleep(time::Duration::from_millis(100)); 391 | } 392 | } 393 | ).map_err(|err| PyRuntimeError::new_err(format!("Splitting thread pool has paniced: {:?}", err)))?; 394 | python_signal_result?; 395 | for result in results.lock().drain(..) { 396 | result?; 397 | } 398 | 399 | let mut python_signal_result = Ok(()); 400 | let working_threads = AtomicUsize::new(num_parts); 401 | let output_file = File::create(output_file_path) 402 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create output_file_path: {:?}", err)))?; 403 | let output_file = Arc::new(Mutex::new(BufWriter::new(output_file))); 404 | 405 | crossbeam_thread::scope( 406 | |s| { 407 | let file_paths = file_paths.to_vec(); 408 | let worker = Worker::new_lifo(); 409 | let stealer = worker.stealer(); 410 | 411 | for part_number in 0..num_parts { 412 | let mut part_file_paths = Vec::new(); 413 | for file_path_index in 0..file_paths.len() { 414 | part_file_paths.push( 415 | working_directory.join(format!("{}_{}", file_path_index, part_number)) 416 | ); 417 | } 418 | worker.push( 419 | ( 420 | part_file_paths, 421 | output_file.clone(), 422 | &should_stop, 423 | &working_threads, 424 | ) 425 | ); 426 | } 427 | 428 | for _ in 0..number_of_threads { 429 | let stealer = stealer.clone(); 430 | let results = results.clone(); 431 | s.spawn( 432 | move |_| { 433 | while let Steal::Success( 434 | ( 435 | part_file_paths, 436 | output_file, 437 | should_stop, 438 | working_threads, 439 | ) 440 | ) = stealer.steal() { 441 | let result = compute_part_unique_lines( 442 | part_file_paths, 443 | output_file, 444 | should_stop, 445 | ); 446 | results.lock().push(result); 447 | working_threads.fetch_sub(1, Ordering::Relaxed); 448 | } 449 | } 450 | ); 451 | } 452 | 453 | while working_threads.load(Ordering::Relaxed) != 0 { 454 | python_signal_result = py.check_signals(); 455 | if python_signal_result.is_err() { 456 | should_stop.store(true, Ordering::Relaxed); 457 | 458 | break; 459 | } 460 | 461 | thread::sleep(time::Duration::from_millis(100)); 462 | } 463 | } 464 | ).map_err(|err| 
PyRuntimeError::new_err(format!("Computing unique lines thread pool has paniced: {:?}", err)))?; 465 | python_signal_result?; 466 | for result in results.lock().drain(..) { 467 | result?; 468 | } 469 | 470 | Ok(()) 471 | } 472 | 473 | #[pymodule] 474 | fn pydeduplines(_py: Python, m: &PyModule) -> PyResult<()> { 475 | m.add_function(wrap_pyfunction!(compute_added_lines, m)?)?; 476 | m.add_function(wrap_pyfunction!(compute_unique_lines, m)?)?; 477 | 478 | Ok(()) 479 | } 480 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Intsights/PyDeduplines/759f7df1f154ba56217af446c3a782e47cadb409/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_pydeduplines.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import contextlib 3 | import pytest 4 | import random 5 | 6 | import pydeduplines 7 | 8 | 9 | @pytest.mark.parametrize( 10 | 'number_of_threads', 11 | [ 12 | 0, 13 | 1, 14 | 2, 15 | ] 16 | ) 17 | @pytest.mark.parametrize( 18 | 'number_of_splits', 19 | [ 20 | 1, 21 | 2, 22 | ] 23 | ) 24 | def test_compute_unique_lines_one_file( 25 | number_of_threads, 26 | number_of_splits, 27 | ): 28 | with contextlib.ExitStack() as stack: 29 | test_input_file_one = stack.enter_context( 30 | tempfile.NamedTemporaryFile('wb') 31 | ) 32 | test_output_file = stack.enter_context( 33 | tempfile.NamedTemporaryFile('rb') 34 | ) 35 | 36 | lines = [ 37 | f'line{i}'.encode() 38 | for i in range(11000) 39 | ] 40 | random.shuffle(lines) 41 | 42 | test_input_file_one.file.write(b'\n'.join(lines * 2)) 43 | test_input_file_one.file.flush() 44 | 45 | tempdir = tempfile.mkdtemp() 46 | pydeduplines.compute_unique_lines( 47 | working_directory=tempdir, 48 | file_paths=[ 49 | test_input_file_one.name, 50 | ], 51 | output_file_path=test_output_file.name, 52 | number_of_splits=number_of_splits, 53 | number_of_threads=number_of_threads, 54 | ) 55 | unique_file_data = test_output_file.read() 56 | 57 | assert sorted(unique_file_data.split(b'\n')[:-1]) == sorted(lines) 58 | 59 | 60 | @pytest.mark.parametrize( 61 | 'number_of_threads', 62 | [ 63 | 0, 64 | 1, 65 | 2, 66 | ] 67 | ) 68 | @pytest.mark.parametrize( 69 | 'number_of_splits', 70 | [ 71 | 1, 72 | 2, 73 | ] 74 | ) 75 | def test_compute_unique_lines_two_files( 76 | number_of_threads, 77 | number_of_splits, 78 | ): 79 | with contextlib.ExitStack() as stack: 80 | test_input_file_one = stack.enter_context( 81 | tempfile.NamedTemporaryFile('wb') 82 | ) 83 | test_input_file_two = stack.enter_context( 84 | tempfile.NamedTemporaryFile('wb') 85 | ) 86 | test_output_file = stack.enter_context( 87 | tempfile.NamedTemporaryFile('rb') 88 | ) 89 | 90 | lines = [ 91 | f'line{i}'.encode() 92 | for i in range(11000) 93 | ] 94 | random.shuffle(lines) 95 | 96 | test_input_file_one.file.write(b'\n'.join(lines[:10000])) 97 | test_input_file_one.file.flush() 98 | 99 | test_input_file_two.file.write(b'\n'.join(lines[:11000])) 100 | test_input_file_two.file.flush() 101 | 102 | tempdir = tempfile.mkdtemp() 103 | pydeduplines.compute_unique_lines( 104 | working_directory=tempdir, 105 | file_paths=[ 106 | test_input_file_one.name, 107 | test_input_file_two.name, 108 | ], 109 | output_file_path=test_output_file.name, 110 | number_of_splits=number_of_splits, 111 | number_of_threads=number_of_threads, 112 | ) 113 | unique_file_data 
= test_output_file.read() 114 | 115 | assert sorted(unique_file_data.split(b'\n')[:-1]) == sorted(lines) 116 | 117 | 118 | @pytest.mark.parametrize( 119 | 'number_of_threads', 120 | [ 121 | 0, 122 | 1, 123 | 2, 124 | ] 125 | ) 126 | @pytest.mark.parametrize( 127 | 'number_of_splits', 128 | [ 129 | 1, 130 | 2, 131 | ] 132 | ) 133 | def test_compute_added_lines( 134 | number_of_threads, 135 | number_of_splits, 136 | ): 137 | with contextlib.ExitStack() as stack: 138 | test_input_file_one = stack.enter_context( 139 | tempfile.NamedTemporaryFile('wb') 140 | ) 141 | test_input_file_two = stack.enter_context( 142 | tempfile.NamedTemporaryFile('wb') 143 | ) 144 | test_output_file = stack.enter_context( 145 | tempfile.NamedTemporaryFile('rb') 146 | ) 147 | 148 | lines = [ 149 | f'line{i}'.encode() 150 | for i in range(11000) 151 | ] 152 | random.shuffle(lines) 153 | 154 | test_input_file_one.file.write(b'\n'.join(lines[:10000])) 155 | test_input_file_one.file.flush() 156 | 157 | test_input_file_two.file.write(b'\n'.join(lines[:11000])) 158 | test_input_file_two.file.flush() 159 | 160 | tempdir = tempfile.mkdtemp() 161 | pydeduplines.compute_added_lines( 162 | working_directory=tempdir, 163 | first_file_path=test_input_file_one.name, 164 | second_file_path=test_input_file_two.name, 165 | output_file_path=test_output_file.name, 166 | number_of_splits=number_of_splits, 167 | number_of_threads=number_of_threads, 168 | ) 169 | added_lines_file_data = test_output_file.read() 170 | assert sorted(added_lines_file_data.split(b'\n')[:-1]) == sorted(lines[10000:]) 171 | --------------------------------------------------------------------------------