├── .github
│   └── workflows
│       ├── build.yml
│       └── deploy.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── MANIFEST.in
├── README.md
├── cortex.yaml
├── images
│   └── logo.png
├── poetry.lock
├── pydeduplines
│   ├── __init__.py
│   └── pydeduplines.pyi
├── pyproject.toml
├── src
│   └── lib.rs
└── tests
    ├── __init__.py
    └── test_pydeduplines.py
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 | on:
3 |   - push
4 |   - pull_request
5 | jobs:
6 |   lint:
7 |     if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
8 |     runs-on: ubuntu-latest
9 |     steps:
10 |       - name: Checkout
11 |         uses: actions/checkout@v3
12 |       - name: Install latest rust
13 |         uses: actions-rs/toolchain@v1
14 |         with:
15 |           toolchain: stable
16 |           profile: minimal
17 |           override: true
18 |           components: clippy
19 |       - name: Lint with clippy
20 |         uses: actions-rs/cargo@v1
21 |         with:
22 |           command: clippy
23 |           args: --all-targets --all-features
24 |   test:
25 |     runs-on: ${{ matrix.os }}
26 |     needs: lint
27 |     strategy:
28 |       fail-fast: false
29 |       matrix:
30 |         python-version:
31 |           - '3.7'
32 |           - '3.8'
33 |           - '3.9'
34 |           - '3.10'
35 |           - '3.11'
36 |         os:
37 |           - ubuntu-latest
38 |           - macos-latest
39 |           - windows-latest
40 |     steps:
41 |       - name: Checkout
42 |         uses: actions/checkout@v3
43 |       - name: Set up Python ${{ matrix.python-version }}
44 |         uses: actions/setup-python@v3
45 |         with:
46 |           python-version: ${{ matrix.python-version }}
47 |       - name: Install Poetry
48 |         uses: abatilo/actions-poetry@v2.1.3
49 |       - name: Install Rust
50 |         uses: actions-rs/toolchain@v1
51 |         with:
52 |           profile: minimal
53 |           toolchain: stable
54 |           override: true
55 |       - name: Install dependencies
56 |         run: poetry install
57 |       - name: Build Python package
58 |         run: poetry run maturin develop
59 |       - name: Test
60 |         run: poetry run pytest -Werror tests
61 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 | on:
3 |   release:
4 |     types:
5 |       - released
6 | jobs:
7 |   deploy:
8 |     runs-on: ${{ matrix.os }}
9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         python-version:
13 |           - '3.7'
14 |           - '3.8'
15 |           - '3.9'
16 |           - '3.10'
17 |           - '3.11'
18 |         os:
19 |           - ubuntu-latest
20 |           - macos-latest
21 |           - windows-latest
22 |     steps:
23 |       - name: Checkout
24 |         uses: actions/checkout@v3
25 |       - name: Set up Python ${{ matrix.python-version }}
26 |         uses: actions/setup-python@v4
27 |         with:
28 |           python-version: ${{ matrix.python-version }}
29 |       - name: Install Rust
30 |         uses: actions-rs/toolchain@v1
31 |         with:
32 |           profile: minimal
33 |           toolchain: stable
34 |           override: true
35 |       - name: Install Cross-compilers (macOS)
36 |         if: matrix.os == 'macos-latest'
37 |         run: |
38 |           rustup target add x86_64-apple-darwin
39 |           rustup target add aarch64-apple-darwin
40 |       - name: Publish Package
41 |         if: matrix.os != 'macos-latest'
42 |         uses: PyO3/maturin-action@v1
43 |         with:
44 |           command: publish
45 |           args: --username=__token__ ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.7' && '' || '--no-sdist' }} --interpreter=python${{ !startsWith(matrix.os, 'windows') && matrix.python-version || '' }}
46 |         env:
47 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
48 |       - name: Publish macOS (x86_64) Package
49 |         if: matrix.os == 'macos-latest'
50 |         uses: PyO3/maturin-action@v1
51 |         with:
52 |           command: publish
53 |           args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=x86_64-apple-darwin --no-sdist
54 |         env:
55 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
56 |       - name: Publish macOS (arm64) Package
57 |         if: matrix.os == 'macos-latest'
58 |         uses: PyO3/maturin-action@v1
59 |         with:
60 |           command: publish
61 |           args: --username=__token__ --interpreter=python${{ matrix.python-version }} --target=aarch64-apple-darwin --no-sdist
62 |         env:
63 |           MATURIN_PASSWORD: ${{ secrets.pypi_password }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 | # Distribution / packaging
8 | .Python
9 | env/
10 | build/
11 | develop-eggs/
12 | dist/
13 | downloads/
14 | eggs/
15 | .eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 |
25 | # PyInstaller
26 | # Usually these files are written by a python script from a template
27 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 |
31 | # conflict temp files
32 | *.py.orig
33 | *.mock
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | coverage_html_report/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .pytest_cache/
50 |
51 | # Translations
52 | *.mo
53 | *.pot
54 |
55 | # Django stuff:
56 | *.log
57 |
58 | # Sphinx documentation
59 | docs/_build/
60 |
61 | # PyBuilder
62 | target/
63 |
64 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
65 |
66 | *.iml
67 |
68 | ## Directory-based project format:
69 | .idea/
70 |
71 | # Tests
72 | generic_tests.py
73 | cloudflare_test.py
74 |
75 | ############################
76 | #Eclipse Specific GitIgnore#
77 | ############################
78 | *.pydevproject
79 | .project
80 | .metadata
81 | bin/**
82 | tmp/**
83 | tmp/**/*
84 | *.tmp
85 | *.bak
86 | *.swp
87 | *~.nib
88 | local.properties
89 | .classpath
90 | .settings/
91 | .loadpath
92 |
93 |
94 | # Git mergetool traces
95 | *.orig
96 |
97 | # VS Code internal directory
98 | .vscode/
99 |
100 | *.dat
101 | *.code-workspace
102 | .history
103 |
104 | # Intsights development playground
105 | playground/
106 |
107 | pytest-report\.csv
108 | *.cppimporthash
109 | .rendered.*
110 | Databases.db
111 |
112 | # Node.js
113 | dist/
114 | node_modules/
115 | coverage/
116 |
117 | # Generated by Cargo
118 | # will have compiled files and executables
119 | /target/
120 |
121 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
122 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
123 | Cargo.lock
124 |
125 | # These are backup files generated by rustfmt
126 | **/*.rs.bk
127 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "PyDeduplines"
3 | version = "0.6.1"
4 | authors = ["Gal Ben David <gal@intsights.com>"]
5 | edition = "2021"
6 | description = "Python library for duplicate line removal, written in Rust"
7 | readme = "README.md"
8 | repository = "https://github.com/intsights/PyDeduplines"
9 | homepage = "https://github.com/intsights/PyDeduplines"
10 | license = "MIT"
11 | keywords = [
12 | "unique",
13 | "lines",
14 | "rust",
15 | "pyo3",
16 | ]
17 |
18 | [package.metadata.maturin]
19 |
20 | [lib]
21 | name = "pydeduplines"
22 | crate-type = ["cdylib"]
23 |
24 | [dependencies.pyo3]
25 | version = "0.15.1"
26 | features = ["extension-module"]
27 |
28 | [dependencies]
29 | ahash = "0.7"
30 | bytecount = {version = "0.6", features = ["runtime-dispatch-simd"]}
31 | crossbeam-deque = "0.8"
32 | crossbeam-utils = "0.8"
33 | memchr = "2"
34 | parking_lot = "0.12"
35 |
36 | [profile.release]
37 | lto = true
38 | panic = "abort"
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Intsights
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Cargo.toml
2 | include pyproject.toml
3 | recursive-include src *
4 | recursive-include pydeduplines *
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![PyDeduplines logo](images/logo.png)
2 |
3 | Python library for duplicate line removal, written in Rust
4 |
5 | [PyPI](https://pypi.org/project/PyDeduplines/)
6 |
16 | ## Table of Contents
17 |
18 | - [Table of Contents](#table-of-contents)
19 | - [About The Project](#about-the-project)
20 | - [Built With](#built-with)
21 | - [Performance](#performance)
22 | - [Deduplicating](#deduplicating)
23 | - [Added Lines](#added-lines)
24 | - [Installation](#installation)
25 | - [Documentation](#documentation)
26 | - [Usage](#usage)
27 | - [License](#license)
28 | - [Contact](#contact)
29 |
30 |
31 | ## About The Project
32 |
33 | This library manipulates the lines of text files: it removes duplicate lines across files and finds the lines added between two files. To achieve speed and efficiency, it is written in Rust.
34 |
35 | There are two functions in the library:
36 | - `compute_unique_lines` - Takes a list of input file paths and an output file path, iterates over the input files and writes each unique line to the output file.
37 | - `compute_added_lines` - Takes a `first_file_path`, a `second_file_path` and an `output_file_path`, and writes to the output file only the lines that appear in the second file but not in the first.
38 |
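For a concrete sense of the semantics, here is a minimal sketch of `compute_added_lines` (the file names, the `workdir` directory and the tiny inputs are made up for illustration):

```python
import pydeduplines

# 'new.txt' contains one line that 'old.txt' does not
with open('old.txt', 'w') as old_file:
    old_file.write('alpha\nbeta\n')
with open('new.txt', 'w') as new_file:
    new_file.write('alpha\nbeta\ngamma\n')

pydeduplines.compute_added_lines(
    working_directory='workdir',
    first_file_path='old.txt',
    second_file_path='new.txt',
    output_file_path='added.txt',
    number_of_splits=1,
)

with open('added.txt') as added_file:
    print(added_file.read())  # expected to print only: gamma
```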
39 |
40 | ### Built With
41 |
42 | * [pyo3](https://github.com/PyO3/pyo3)
43 | * [crossbeam](https://github.com/crossbeam-rs/crossbeam)
44 | * [ahash](https://github.com/tkaitchuck/aHash)
45 | * [parking_lot](https://github.com/Amanieu/parking_lot)
46 | * [memchr](https://github.com/BurntSushi/memchr)
47 | * [bytecount](https://github.com/llogiq/bytecount)
48 |
49 |
50 | ### Performance
51 |
52 | #### Deduplicating
53 | | Library | Invocation | Time | Peak Memory |
54 | | ------------- | ------------- | ------------- | ------------- |
55 | | [GNU Sort](https://www.gnu.org/software/coreutils/) | sort -u -o output 500mb_one 500mb_two | 37.35s | 8,261 MB |
56 | | [PyDeduplines](https://github.com/intsights/PyDeduplines) | compute_unique_lines('./workdir', ['500mb_one', '500mb_two'], 'output', 16) | 4.55s | 685 MB |
57 |
58 | #### Added Lines
59 | | Library | Invocation | Time | Peak Memory |
60 | | ------------- | ------------- | ------------- | ------------- |
61 | | [GNU coreutils](https://www.gnu.org/software/coreutils/) | comm -1 -3 <(sort 500mb_one) <(sort 500mb_two) > output.txt | 26.53s | 4,132 MB |
62 | | [PyDeduplines](https://github.com/intsights/PyDeduplines) | compute_added_lines('./workdir', '500mb_one', '500mb_two', 'output', 16) | 3.95s | 314 MB |
63 |
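The harness used to produce these numbers is not part of the repository; a rough sketch of how a similar (smaller) comparison could be reproduced for `compute_unique_lines` is shown below, with file names and sizes chosen arbitrarily:

```python
import random
import time

import pydeduplines

# generate two overlapping input files (far smaller than the 500 MB files above)
for file_name in ('input_one', 'input_two'):
    with open(file_name, 'w') as input_file:
        input_file.writelines(
            f'line{random.randrange(1_000_000)}\n'
            for _ in range(2_000_000)
        )

start = time.perf_counter()
pydeduplines.compute_unique_lines(
    working_directory='workdir',
    file_paths=['input_one', 'input_two'],
    output_file_path='output',
    number_of_splits=16,
)
print(f'compute_unique_lines: {time.perf_counter() - start:.2f}s')
```

Peak memory can be observed with an external tool such as GNU `/usr/bin/time -v`, which reports the maximum resident set size.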
64 |
65 | ### Installation
66 |
67 | ```sh
68 | pip3 install PyDeduplines
69 | ```
70 |
71 |
72 | ## Documentation
73 |
74 | ```python
75 | def compute_unique_lines(
76 |     working_directory: str,
77 |     file_paths: typing.List[str],
78 |     output_file_path: str,
79 |     number_of_splits: int,
80 |     number_of_threads: int = 0,
81 | ) -> None: ...
82 | ```
83 | - `working_directory` - A path to a directory to work in. The intermediate split files are created in this directory.
84 | - `file_paths` - A list of input file paths to iterate over and deduplicate.
85 | - `output_file_path` - The path the unique lines are written to.
86 | - `number_of_splits` - How many part files each input file is split into per thread. This parameter is the core idea of the library: the more splits, the lower the peak memory consumption, but also the more files are open at once.
87 | - `number_of_threads` - The number of parallel threads. *0* means use as many threads as there are CPU cores. Using more than one thread also increases the number of split files per input file.
88 |
89 | ```python
90 | def compute_added_lines(
91 |     working_directory: str,
92 |     first_file_path: str,
93 |     second_file_path: str,
94 |     output_file_path: str,
95 |     number_of_splits: int,
96 |     number_of_threads: int = 0,
97 | ) -> None: ...
98 | ```
99 | - `working_directory` - A path to a directory to work in. The intermediate split files are created in this directory.
100 | - `first_file_path` - A path to the file to use as the baseline.
101 | - `second_file_path` - A path to the file to scan for lines that do not exist in the first file.
102 | - `output_file_path` - A path to the output file that receives the lines that appear in the second file but not in the first.
103 | - `number_of_splits` - How many part files each input file is split into per thread. This parameter is the core idea of the library: the more splits, the lower the peak memory consumption, but also the more files are open at once.
104 | - `number_of_threads` - The number of parallel threads. *0* means use as many threads as there are CPU cores. Using more than one thread also increases the number of split files per input file.
105 |
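A note on how these two parameters interact, based on the wrapper in `pydeduplines/__init__.py` and the Rust implementation: passing `number_of_threads=0` makes the wrapper fall back to `multiprocessing.cpu_count()`, each input file is split into `number_of_splits * number_of_threads` part files inside `working_directory`, and the working directory is deleted again when the call returns. A small sketch, with illustrative numbers:

```python
import multiprocessing

number_of_splits = 4
number_of_threads = 0  # 0 -> resolved to multiprocessing.cpu_count() by the wrapper

resolved_threads = number_of_threads if number_of_threads > 0 else multiprocessing.cpu_count()
parts_per_input_file = number_of_splits * resolved_threads

# e.g. on an 8-core machine: 4 * 8 = 32 part files per input file in working_directory
print(parts_per_input_file)
```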
106 | ## Usage
107 |
108 | ```python
109 | import pydeduplines
110 |
111 |
112 | pydeduplines.compute_unique_lines(
113 |     working_directory='tmp',
114 |     file_paths=[
115 |         '500mb_one',
116 |         '500mb_two',
117 |     ],
118 |     output_file_path='output',
119 |     number_of_splits=4,
120 | )
121 |
122 | pydeduplines.compute_added_lines(
123 |     working_directory='tmp',
124 |     first_file_path='500mb_one',
125 |     second_file_path='500mb_two',
126 |     output_file_path='output',
127 |     number_of_splits=4,
128 | )
129 | ```
130 |
131 |
132 | ## License
133 |
134 | Distributed under the MIT License. See `LICENSE` for more information.
135 |
136 |
137 | ## Contact
138 |
139 | Gal Ben David - gal@intsights.com
140 |
141 | Project Link: [https://github.com/intsights/PyDeduplines](https://github.com/intsights/PyDeduplines)
142 |
--------------------------------------------------------------------------------
/cortex.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | info:
3 |   title: Pydeduplines
4 |   description: Python library for duplicate line removal, written in Rust
5 |   x-cortex-git:
6 |     github:
7 |       alias: intsightsorg
8 |       repository: Intsights/PyDeduplines
9 |   x-cortex-tag: pydeduplines
10 |   x-cortex-type: service
11 |   x-cortex-domain-parents:
12 |     - tag: threatintel-platform-delivery
13 |   x-cortex-groups:
14 |     - exposure:external-ship
15 |     - target:library
16 | openapi: 3.0.1
17 | servers:
18 |   - url: "/"
19 |
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDeduplines/759f7df1f154ba56217af446c3a782e47cadb409/images/logo.png
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
1 | [[package]]
2 | name = "colorama"
3 | version = "0.4.6"
4 | description = "Cross-platform colored terminal text."
5 | category = "dev"
6 | optional = false
7 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
8 |
9 | [[package]]
10 | name = "exceptiongroup"
11 | version = "1.2.0"
12 | description = "Backport of PEP 654 (exception groups)"
13 | category = "dev"
14 | optional = false
15 | python-versions = ">=3.7"
16 |
17 | [package.extras]
18 | test = ["pytest (>=6)"]
19 |
20 | [[package]]
21 | name = "importlib-metadata"
22 | version = "6.7.0"
23 | description = "Read metadata from Python packages"
24 | category = "dev"
25 | optional = false
26 | python-versions = ">=3.7"
27 |
28 | [package.dependencies]
29 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""}
30 | zipp = ">=0.5"
31 |
32 | [package.extras]
33 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
34 | perf = ["ipython"]
35 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-ruff", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"]
36 |
37 | [[package]]
38 | name = "iniconfig"
39 | version = "2.0.0"
40 | description = "brain-dead simple config-ini parsing"
41 | category = "dev"
42 | optional = false
43 | python-versions = ">=3.7"
44 |
45 | [[package]]
46 | name = "maturin"
47 | version = "1.4.0"
48 | description = "Build and publish crates with pyo3, rust-cpython and cffi bindings as well as rust binaries as python packages"
49 | category = "dev"
50 | optional = false
51 | python-versions = ">=3.7"
52 |
53 | [package.dependencies]
54 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
55 |
56 | [package.extras]
57 | zig = ["ziglang (>=0.10.0,<0.11.0)"]
58 | patchelf = ["patchelf"]
59 |
60 | [[package]]
61 | name = "packaging"
62 | version = "23.2"
63 | description = "Core utilities for Python packages"
64 | category = "dev"
65 | optional = false
66 | python-versions = ">=3.7"
67 |
68 | [[package]]
69 | name = "pluggy"
70 | version = "1.2.0"
71 | description = "plugin and hook calling mechanisms for python"
72 | category = "dev"
73 | optional = false
74 | python-versions = ">=3.7"
75 |
76 | [package.dependencies]
77 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
78 |
79 | [package.extras]
80 | dev = ["pre-commit", "tox"]
81 | testing = ["pytest", "pytest-benchmark"]
82 |
83 | [[package]]
84 | name = "pytest"
85 | version = "7.4.4"
86 | description = "pytest: simple powerful testing with Python"
87 | category = "dev"
88 | optional = false
89 | python-versions = ">=3.7"
90 |
91 | [package.dependencies]
92 | colorama = {version = "*", markers = "sys_platform == \"win32\""}
93 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
94 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""}
95 | iniconfig = "*"
96 | packaging = "*"
97 | pluggy = ">=0.12,<2.0"
98 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
99 |
100 | [package.extras]
101 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
102 |
103 | [[package]]
104 | name = "pytest-runner"
105 | version = "6.0.1"
106 | description = "Invoke py.test as distutils command with dependency resolution"
107 | category = "dev"
108 | optional = false
109 | python-versions = ">=3.7"
110 |
111 | [package.extras]
112 | docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"]
113 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-virtualenv", "types-setuptools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"]
114 |
115 | [[package]]
116 | name = "tomli"
117 | version = "2.0.1"
118 | description = "A lil' TOML parser"
119 | category = "dev"
120 | optional = false
121 | python-versions = ">=3.7"
122 |
123 | [[package]]
124 | name = "typing-extensions"
125 | version = "4.7.1"
126 | description = "Backported and Experimental Type Hints for Python 3.7+"
127 | category = "dev"
128 | optional = false
129 | python-versions = ">=3.7"
130 |
131 | [[package]]
132 | name = "zipp"
133 | version = "3.15.0"
134 | description = "Backport of pathlib-compatible object wrapper for zip files"
135 | category = "dev"
136 | optional = false
137 | python-versions = ">=3.7"
138 |
139 | [package.extras]
140 | docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "sphinx-lint", "jaraco.tidelift (>=1.4)"]
141 | testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "jaraco.functools", "more-itertools", "big-o", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"]
142 |
143 | [metadata]
144 | lock-version = "1.1"
145 | python-versions = "^3.7"
146 | content-hash = "d3751775f5a48f55874329689185792d15525d44f15678cc3bfeb66b5dea0d3d"
147 |
148 | [metadata.files]
149 | colorama = []
150 | exceptiongroup = []
151 | importlib-metadata = []
152 | iniconfig = []
153 | maturin = []
154 | packaging = []
155 | pluggy = []
156 | pytest = []
157 | pytest-runner = []
158 | tomli = []
159 | typing-extensions = []
160 | zipp = []
161 |
--------------------------------------------------------------------------------
/pydeduplines/__init__.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import os
3 | import pathlib
4 | import typing
5 | import shutil
6 |
7 | from . import pydeduplines
8 |
9 |
10 | def compute_unique_lines(
11 |     working_directory: str,
12 |     file_paths: typing.List[str],
13 |     output_file_path: str,
14 |     number_of_splits: int,
15 |     number_of_threads: int = 0,
16 | ) -> None:
17 |     try:
18 |         os.makedirs(
19 |             name=working_directory,
20 |             exist_ok=True,
21 |         )
22 |
23 |         for file_path in file_paths:
24 |             if not os.path.exists(
25 |                 path=file_path,
26 |             ):
27 |                 raise FileNotFoundError(f'Could not find file: {file_path}')
28 |
29 |         output_file_folder = pathlib.Path(output_file_path).parent
30 |         if not os.access(
31 |             path=output_file_folder,
32 |             mode=os.W_OK,
33 |         ):
34 |             raise PermissionError(f'Could not write to the output file folder: {output_file_folder}')
35 |
36 |         if number_of_threads <= 0:
37 |             number_of_threads = multiprocessing.cpu_count()
38 |
39 |         return pydeduplines.compute_unique_lines(
40 |             working_directory,
41 |             file_paths,
42 |             output_file_path,
43 |             number_of_splits,
44 |             number_of_threads,
45 |         )
46 |     finally:
47 |         shutil.rmtree(
48 |             path=working_directory,
49 |         )
50 |
51 |
52 | def compute_added_lines(
53 |     working_directory: str,
54 |     first_file_path: str,
55 |     second_file_path: str,
56 |     output_file_path: str,
57 |     number_of_splits: int,
58 |     number_of_threads: int = 0,
59 | ) -> None:
60 |     try:
61 |         os.makedirs(
62 |             name=working_directory,
63 |             exist_ok=True,
64 |         )
65 |
66 |         for file_path in [
67 |             first_file_path,
68 |             second_file_path,
69 |         ]:
70 |             if not os.path.exists(
71 |                 path=file_path,
72 |             ):
73 |                 raise FileNotFoundError(f'Could not find file: {file_path}')
74 |
75 |         output_file_folder = pathlib.Path(output_file_path).parent
76 |         if not os.access(
77 |             path=output_file_folder,
78 |             mode=os.W_OK,
79 |         ):
80 |             raise PermissionError(f'Could not write to the output file folder: {output_file_folder}')
81 |
82 |         if number_of_threads <= 0:
83 |             number_of_threads = multiprocessing.cpu_count()
84 |
85 |         return pydeduplines.compute_added_lines(
86 |             working_directory,
87 |             first_file_path,
88 |             second_file_path,
89 |             output_file_path,
90 |             number_of_splits,
91 |             number_of_threads,
92 |         )
93 |     finally:
94 |         shutil.rmtree(
95 |             path=working_directory,
96 |         )
97 |
--------------------------------------------------------------------------------
/pydeduplines/pydeduplines.pyi:
--------------------------------------------------------------------------------
1 | import typing
2 |
3 |
4 | def compute_unique_lines(
5 |     working_directory: str,
6 |     file_paths: typing.List[str],
7 |     output_file_path: str,
8 |     number_of_splits: int,
9 |     number_of_threads: int = 0,
10 | ) -> None: ...
11 |
12 |
13 | def compute_added_lines(
14 |     working_directory: str,
15 |     first_file_path: str,
16 |     second_file_path: str,
17 |     output_file_path: str,
18 |     number_of_splits: int,
19 |     number_of_threads: int = 0,
20 | ) -> None: ...
21 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin>=0.11,<0.12"]
3 | build-backend = "maturin"
4 |
5 | [tool.maturin]
6 | sdist-include = [
7 | "src",
8 | "Cargo.toml",
9 | "pydeduplines",
10 | ]
11 |
12 | [tool.poetry]
13 | name = "PyDeduplines"
14 | version = "0.6.1"
15 | authors = ["Gal Ben David <gal@intsights.com>"]
16 | description = "Python library for duplicate line removal, written in Rust"
17 | readme = "README.md"
18 | repository = "https://github.com/intsights/PyDeduplines"
19 | license = "MIT"
20 | keywords = [
21 | "unique",
22 | "lines",
23 | "rust",
24 | "pyo3",
25 | ]
26 |
27 | [tool.poetry.dependencies]
28 | python = "^3.7"
29 |
30 | [tool.poetry.dev-dependencies]
31 | pytest = "*"
32 | wheel = "*"
33 | pytest-runner = "*"
34 | maturin = "*"
35 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | use ahash::AHashSet;
2 | use crossbeam_deque::{Steal, Worker};
3 | use crossbeam_utils::thread as crossbeam_thread;
4 | use memchr::memchr_iter;
5 | use parking_lot::Mutex;
6 | use pyo3::exceptions::PyRuntimeError;
7 | use pyo3::prelude::*;
8 | use pyo3::wrap_pyfunction;
9 | use std::fs;
10 | use std::fs::File;
11 | use std::io::{BufReader, BufWriter};
12 | use std::io::prelude::*;
13 | use std::path::{PathBuf, Path};
14 | use std::sync::Arc;
15 | use std::sync::atomic::{AtomicUsize, AtomicBool, Ordering};
16 | use std::thread;
17 | use std::time;
18 |
19 | const OUTPUT_FILE_BUFFER_SIZE: usize = 1024 * 1024 * 10;
20 |
21 | fn split_file(
22 | working_directory: &Path,
23 | input_file_path: &Path,
24 | prefix: String,
25 | num_parts: usize,
26 | should_stop: &AtomicBool,
27 | ) -> PyResult<()> {
28 | let mut output_files = Vec::with_capacity(num_parts);
29 | for i in 0..num_parts {
30 | let part_output_file_path = working_directory.join(format!("{}{}", prefix, i));
31 | let output_file = File::create(part_output_file_path)
32 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create part_output_file_path: {:?}", err)))?;
33 | output_files.push(BufWriter::new(output_file));
34 | }
35 |
36 | let input_file = File::open(input_file_path)
37 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open input_file_path: {:?}", err)))?;
38 | let mut input_file = BufReader::new(input_file);
39 |
40 | let mut bytes = vec![];
41 | while !should_stop.load(Ordering::Relaxed) {
42 | let buf = input_file.fill_buf()?;
43 | let consumed = buf.len();
44 | if consumed == 0 {
45 | break;
46 | }
47 |
48 | let mut prev_index = 0;
49 | for current_index in memchr_iter(b'\n', buf) {
50 | unsafe {
51 | let line = buf.get_unchecked(prev_index..=current_index);
52 | let current_index = line.iter().map(|x| *x as usize).sum::<usize>() % num_parts;
53 |
54 | output_files.get_unchecked_mut(current_index).write_all(line)
55 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_files[index]: {:?}", err)))?;
56 | }
57 |
58 | prev_index = current_index + 1;
59 | }
60 |
61 | if prev_index < buf.len() {
62 | bytes.extend_from_slice(&buf[prev_index..]);
63 | input_file.consume(consumed);
64 | input_file.read_until(b'\n', &mut bytes)?;
65 | if !bytes.is_empty() {
66 | if !bytes.ends_with(b"\n") {
67 | bytes.push(b'\n');
68 | }
69 |
70 | let index = bytes.iter().map(|x| *x as usize).sum::<usize>() % num_parts;
71 |
72 | unsafe {
73 | output_files.get_unchecked_mut(index).write_all(&bytes)
74 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_files[index]: {:?}", err)))?;
75 | }
76 | }
77 | bytes.clear();
78 | } else {
79 | input_file.consume(consumed);
80 | }
81 | };
82 |
83 | Ok(())
84 | }
85 |
86 | fn compute_part_added_lines(
87 | first_file_path: &Path,
88 | second_file_path: &Path,
89 | output_file: Arc<Mutex<BufWriter<File>>>,
90 | should_stop: &AtomicBool,
91 | ) -> PyResult<()> {
92 | let first_file_data = std::fs::read(first_file_path)
93 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open first_file_path: {:?}", err)))?;
94 |
95 | let number_of_lines = bytecount::count(&first_file_data, b'\n');
96 | let mut lines_set = AHashSet::with_capacity(number_of_lines);
97 | let mut prev_index = 0;
98 | for current_index in memchr_iter(b'\n', &first_file_data) {
99 | unsafe {
100 | lines_set.insert(first_file_data.get_unchecked(prev_index..current_index));
101 | prev_index = current_index + 1;
102 |
103 | if should_stop.load(Ordering::Relaxed) {
104 | return Ok(());
105 | }
106 | }
107 | }
108 |
109 | let second_file = File::open(second_file_path)
110 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open second_file_path: {:?}", err)))?;
111 | let mut second_file = BufReader::new(second_file);
112 | let mut output_file_buffer = Vec::with_capacity(OUTPUT_FILE_BUFFER_SIZE + 1);
113 | let mut bytes = vec![];
114 | while !should_stop.load(Ordering::Relaxed) {
115 | let buf = second_file.fill_buf()?;
116 | let consumed = buf.len();
117 | if consumed == 0 {
118 | break;
119 | }
120 |
121 | let mut prev_index = 0;
122 | for current_index in memchr_iter(b'\n', buf) {
123 | unsafe {
124 | let line = buf.get_unchecked(prev_index..current_index);
125 | if !lines_set.contains(line) {
126 | if output_file_buffer.len() + line.len() + 1 > OUTPUT_FILE_BUFFER_SIZE {
127 | output_file.lock().write_all(&output_file_buffer)
128 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?;
129 | output_file_buffer.clear();
130 | }
131 | output_file_buffer.extend_from_slice(buf.get_unchecked(prev_index..=current_index));
132 | }
133 |
134 | prev_index = current_index + 1;
135 | }
136 | }
137 |
138 | if prev_index < buf.len() {
139 | bytes.extend_from_slice(&buf[prev_index..]);
140 | second_file.consume(consumed);
141 | second_file.read_until(b'\n', &mut bytes)?;
142 | if bytes.len() > 1 && !lines_set.contains(&bytes[..bytes.len() - 1]) {
143 | output_file.lock().write_all(&bytes)
144 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?;
145 | }
146 | bytes.clear();
147 | } else {
148 | second_file.consume(consumed);
149 | }
150 | }
151 | if !output_file_buffer.is_empty() {
152 | output_file.lock().write_all(&output_file_buffer)
153 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?;
154 | }
155 |
156 | Ok(())
157 | }
158 |
159 | fn compute_part_unique_lines(
160 | file_paths: Vec<PathBuf>,
161 | output_file: Arc<Mutex<BufWriter<File>>>,
162 | should_stop: &AtomicBool,
163 | ) -> PyResult<()> {
164 | let mut total_number_of_bytes = 0;
165 | for file_path in file_paths.iter() {
166 | let metadata = fs::metadata(file_path)
167 | .map_err(|err| PyRuntimeError::new_err(format!("Could not get file_path metadata: {:?}", err)))?;
168 | total_number_of_bytes += metadata.len() as usize + file_paths.len();
169 | }
170 |
171 | let mut file_data = Vec::with_capacity(total_number_of_bytes);
172 | for file_path in file_paths.iter() {
173 | let current_file_data = std::fs::read(file_path)
174 | .map_err(|err| PyRuntimeError::new_err(format!("Could not open current_file_data: {:?}", err)))?;
175 | file_data.extend_from_slice(&current_file_data);
176 | }
177 |
178 | let total_number_of_lines = bytecount::count(&file_data, b'\n');
179 | let mut lines_set = AHashSet::with_capacity(total_number_of_lines);
180 | let mut output_file_buffer = Vec::with_capacity(OUTPUT_FILE_BUFFER_SIZE + 1);
181 |
182 | let mut prev_index = 0;
183 | for current_index in memchr_iter(b'\n', &file_data) {
184 | unsafe {
185 | let record = file_data.get_unchecked(prev_index..=current_index);
186 | prev_index = current_index + 1;
187 |
188 | if lines_set.insert(record) {
189 | if output_file_buffer.len() + record.len() > OUTPUT_FILE_BUFFER_SIZE {
190 | output_file.lock().write_all(&output_file_buffer)
191 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?;
192 | output_file_buffer.clear();
193 |
194 | if should_stop.load(Ordering::Relaxed) {
195 | return Ok(());
196 | }
197 | }
198 | output_file_buffer.extend_from_slice(record);
199 | }
200 | }
201 | }
202 | if !output_file_buffer.is_empty() {
203 | output_file.lock().write_all(&output_file_buffer)
204 | .map_err(|err| PyRuntimeError::new_err(format!("Could not write to output_file_locked: {:?}", err)))?;
205 | }
206 |
207 | Ok(())
208 | }
209 |
210 | #[pyfunction]
211 | fn compute_added_lines(
212 | py: Python,
213 | working_directory: PathBuf,
214 | first_file_path: PathBuf,
215 | second_file_path: PathBuf,
216 | output_file_path: PathBuf,
217 | number_of_splits: usize,
218 | number_of_threads: usize,
219 | ) -> PyResult<()> {
220 | let num_parts = number_of_threads * number_of_splits;
221 |
222 | let mut python_signal_result = Ok(());
223 | let results = Arc::new(Mutex::new(Vec::new()));
224 | let should_stop = AtomicBool::new(false);
225 | let working_threads = AtomicUsize::new(2);
226 |
227 | crossbeam_thread::scope(
228 | |s| {
229 | s.spawn(
230 | |_| {
231 | let result = split_file(
232 | working_directory.as_path(),
233 | first_file_path.as_path(),
234 | "first_".to_string(),
235 | num_parts,
236 | &should_stop,
237 | );
238 | results.lock().push(result);
239 | working_threads.fetch_sub(1, Ordering::Relaxed);
240 | }
241 | );
242 | s.spawn(
243 | |_| {
244 | let result = split_file(
245 | working_directory.as_path(),
246 | second_file_path.as_path(),
247 | "second_".to_string(),
248 | num_parts,
249 | &should_stop,
250 | );
251 | results.lock().push(result);
252 | working_threads.fetch_sub(1, Ordering::Relaxed);
253 |
254 | }
255 | );
256 | while working_threads.load(Ordering::Relaxed) != 0 {
257 | python_signal_result = py.check_signals();
258 | if python_signal_result.is_err() {
259 | should_stop.store(true, Ordering::Relaxed);
260 |
261 | break;
262 | }
263 |
264 | thread::sleep(time::Duration::from_millis(100));
265 | }
266 | }
267 | ).map_err(|err| PyRuntimeError::new_err(format!("Splitting thread pool has panicked: {:?}", err)))?;
268 | python_signal_result?;
269 | for result in results.lock().drain(..) {
270 | result?;
271 | }
272 |
273 | let mut python_signal_result = Ok(());
274 | let working_threads = AtomicUsize::new(num_parts);
275 | let output_file = File::create(output_file_path)
276 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create output_file_path: {:?}", err)))?;
277 | let output_file = Arc::new(Mutex::new(BufWriter::new(output_file)));
278 |
279 | crossbeam_thread::scope(
280 | |s| {
281 | let worker = Worker::new_lifo();
282 | let stealer = worker.stealer();
283 |
284 | for i in 0..num_parts {
285 | worker.push(
286 | (
287 | i,
288 | output_file.clone(),
289 | &should_stop,
290 | &working_threads,
291 | &working_directory,
292 | )
293 | );
294 | }
295 |
296 | for _ in 0..number_of_threads {
297 | let stealer = stealer.clone();
298 | let results = results.clone();
299 | s.spawn(
300 | move |_| {
301 | while let Steal::Success(
302 | (
303 | i,
304 | output_file,
305 | should_stop,
306 | working_threads,
307 | working_directory,
308 | )
309 | ) = stealer.steal() {
310 | let result = compute_part_added_lines(
311 | working_directory.join(format!("first_{}", i)).as_path(),
312 | working_directory.join(format!("second_{}", i)).as_path(),
313 | output_file,
314 | should_stop,
315 | );
316 | results.lock().push(result);
317 | working_threads.fetch_sub(1, Ordering::Relaxed);
318 | }
319 | }
320 | );
321 | }
322 |
323 | while working_threads.load(Ordering::Relaxed) != 0 {
324 | python_signal_result = py.check_signals();
325 | if python_signal_result.is_err() {
326 | should_stop.store(true, Ordering::Relaxed);
327 |
328 | break;
329 | }
330 |
331 | thread::sleep(time::Duration::from_millis(100));
332 | }
333 | }
334 | ).map_err(|err| PyRuntimeError::new_err(format!("Computing added lines thread pool has panicked: {:?}", err)))?;
335 | python_signal_result?;
336 | for result in results.lock().drain(..) {
337 | result?;
338 | }
339 |
340 | Ok(())
341 | }
342 |
343 | #[pyfunction]
344 | fn compute_unique_lines(
345 | py: Python,
346 | working_directory: PathBuf,
347 | file_paths: Vec<PathBuf>,
348 | output_file_path: PathBuf,
349 | number_of_splits: usize,
350 | number_of_threads: usize,
351 | ) -> PyResult<()> {
352 | let num_parts = number_of_threads * number_of_splits;
353 |
354 | let mut python_signal_result = Ok(());
355 | let results = Arc::new(Mutex::new(Vec::new()));
356 | let should_stop = AtomicBool::new(false);
357 | let working_threads = AtomicUsize::new(file_paths.len());
358 |
359 | crossbeam_thread::scope(
360 | |s| {
361 | let file_paths = file_paths.to_vec();
362 | for (i, file_path) in file_paths.into_iter().enumerate() {
363 | let working_directory = &working_directory;
364 | let working_threads = &working_threads;
365 | let should_stop = &should_stop;
366 | let results = results.clone();
367 | s.spawn(
368 | move |_| {
369 | let result = split_file(
370 | working_directory.as_path(),
371 | file_path.as_path(),
372 | format!("{}_", i),
373 | num_parts,
374 | should_stop,
375 | );
376 | results.lock().push(result);
377 | working_threads.fetch_sub(1, Ordering::Relaxed);
378 | }
379 | );
380 | }
381 |
382 | while working_threads.load(Ordering::Relaxed) != 0 {
383 | python_signal_result = py.check_signals();
384 | if python_signal_result.is_err() {
385 | should_stop.store(true, Ordering::Relaxed);
386 |
387 | break;
388 | }
389 |
390 | thread::sleep(time::Duration::from_millis(100));
391 | }
392 | }
393 | ).map_err(|err| PyRuntimeError::new_err(format!("Splitting thread pool has panicked: {:?}", err)))?;
394 | python_signal_result?;
395 | for result in results.lock().drain(..) {
396 | result?;
397 | }
398 |
399 | let mut python_signal_result = Ok(());
400 | let working_threads = AtomicUsize::new(num_parts);
401 | let output_file = File::create(output_file_path)
402 | .map_err(|err| PyRuntimeError::new_err(format!("Could not create output_file_path: {:?}", err)))?;
403 | let output_file = Arc::new(Mutex::new(BufWriter::new(output_file)));
404 |
405 | crossbeam_thread::scope(
406 | |s| {
407 | let file_paths = file_paths.to_vec();
408 | let worker = Worker::new_lifo();
409 | let stealer = worker.stealer();
410 |
411 | for part_number in 0..num_parts {
412 | let mut part_file_paths = Vec::new();
413 | for file_path_index in 0..file_paths.len() {
414 | part_file_paths.push(
415 | working_directory.join(format!("{}_{}", file_path_index, part_number))
416 | );
417 | }
418 | worker.push(
419 | (
420 | part_file_paths,
421 | output_file.clone(),
422 | &should_stop,
423 | &working_threads,
424 | )
425 | );
426 | }
427 |
428 | for _ in 0..number_of_threads {
429 | let stealer = stealer.clone();
430 | let results = results.clone();
431 | s.spawn(
432 | move |_| {
433 | while let Steal::Success(
434 | (
435 | part_file_paths,
436 | output_file,
437 | should_stop,
438 | working_threads,
439 | )
440 | ) = stealer.steal() {
441 | let result = compute_part_unique_lines(
442 | part_file_paths,
443 | output_file,
444 | should_stop,
445 | );
446 | results.lock().push(result);
447 | working_threads.fetch_sub(1, Ordering::Relaxed);
448 | }
449 | }
450 | );
451 | }
452 |
453 | while working_threads.load(Ordering::Relaxed) != 0 {
454 | python_signal_result = py.check_signals();
455 | if python_signal_result.is_err() {
456 | should_stop.store(true, Ordering::Relaxed);
457 |
458 | break;
459 | }
460 |
461 | thread::sleep(time::Duration::from_millis(100));
462 | }
463 | }
464 | ).map_err(|err| PyRuntimeError::new_err(format!("Computing unique lines thread pool has panicked: {:?}", err)))?;
465 | python_signal_result?;
466 | for result in results.lock().drain(..) {
467 | result?;
468 | }
469 |
470 | Ok(())
471 | }
472 |
473 | #[pymodule]
474 | fn pydeduplines(_py: Python, m: &PyModule) -> PyResult<()> {
475 | m.add_function(wrap_pyfunction!(compute_added_lines, m)?)?;
476 | m.add_function(wrap_pyfunction!(compute_unique_lines, m)?)?;
477 |
478 | Ok(())
479 | }
480 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Intsights/PyDeduplines/759f7df1f154ba56217af446c3a782e47cadb409/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_pydeduplines.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | import contextlib
3 | import pytest
4 | import random
5 |
6 | import pydeduplines
7 |
8 |
9 | @pytest.mark.parametrize(
10 | 'number_of_threads',
11 | [
12 | 0,
13 | 1,
14 | 2,
15 | ]
16 | )
17 | @pytest.mark.parametrize(
18 | 'number_of_splits',
19 | [
20 | 1,
21 | 2,
22 | ]
23 | )
24 | def test_compute_unique_lines_one_file(
25 | number_of_threads,
26 | number_of_splits,
27 | ):
28 | with contextlib.ExitStack() as stack:
29 | test_input_file_one = stack.enter_context(
30 | tempfile.NamedTemporaryFile('wb')
31 | )
32 | test_output_file = stack.enter_context(
33 | tempfile.NamedTemporaryFile('rb')
34 | )
35 |
36 | lines = [
37 | f'line{i}'.encode()
38 | for i in range(11000)
39 | ]
40 | random.shuffle(lines)
41 |
42 | test_input_file_one.file.write(b'\n'.join(lines * 2))
43 | test_input_file_one.file.flush()
44 |
45 | tempdir = tempfile.mkdtemp()
46 | pydeduplines.compute_unique_lines(
47 | working_directory=tempdir,
48 | file_paths=[
49 | test_input_file_one.name,
50 | ],
51 | output_file_path=test_output_file.name,
52 | number_of_splits=number_of_splits,
53 | number_of_threads=number_of_threads,
54 | )
55 | unique_file_data = test_output_file.read()
56 |
57 | assert sorted(unique_file_data.split(b'\n')[:-1]) == sorted(lines)
58 |
59 |
60 | @pytest.mark.parametrize(
61 | 'number_of_threads',
62 | [
63 | 0,
64 | 1,
65 | 2,
66 | ]
67 | )
68 | @pytest.mark.parametrize(
69 | 'number_of_splits',
70 | [
71 | 1,
72 | 2,
73 | ]
74 | )
75 | def test_compute_unique_lines_two_files(
76 | number_of_threads,
77 | number_of_splits,
78 | ):
79 | with contextlib.ExitStack() as stack:
80 | test_input_file_one = stack.enter_context(
81 | tempfile.NamedTemporaryFile('wb')
82 | )
83 | test_input_file_two = stack.enter_context(
84 | tempfile.NamedTemporaryFile('wb')
85 | )
86 | test_output_file = stack.enter_context(
87 | tempfile.NamedTemporaryFile('rb')
88 | )
89 |
90 | lines = [
91 | f'line{i}'.encode()
92 | for i in range(11000)
93 | ]
94 | random.shuffle(lines)
95 |
96 | test_input_file_one.file.write(b'\n'.join(lines[:10000]))
97 | test_input_file_one.file.flush()
98 |
99 | test_input_file_two.file.write(b'\n'.join(lines[:11000]))
100 | test_input_file_two.file.flush()
101 |
102 | tempdir = tempfile.mkdtemp()
103 | pydeduplines.compute_unique_lines(
104 | working_directory=tempdir,
105 | file_paths=[
106 | test_input_file_one.name,
107 | test_input_file_two.name,
108 | ],
109 | output_file_path=test_output_file.name,
110 | number_of_splits=number_of_splits,
111 | number_of_threads=number_of_threads,
112 | )
113 | unique_file_data = test_output_file.read()
114 |
115 | assert sorted(unique_file_data.split(b'\n')[:-1]) == sorted(lines)
116 |
117 |
118 | @pytest.mark.parametrize(
119 | 'number_of_threads',
120 | [
121 | 0,
122 | 1,
123 | 2,
124 | ]
125 | )
126 | @pytest.mark.parametrize(
127 | 'number_of_splits',
128 | [
129 | 1,
130 | 2,
131 | ]
132 | )
133 | def test_compute_added_lines(
134 | number_of_threads,
135 | number_of_splits,
136 | ):
137 | with contextlib.ExitStack() as stack:
138 | test_input_file_one = stack.enter_context(
139 | tempfile.NamedTemporaryFile('wb')
140 | )
141 | test_input_file_two = stack.enter_context(
142 | tempfile.NamedTemporaryFile('wb')
143 | )
144 | test_output_file = stack.enter_context(
145 | tempfile.NamedTemporaryFile('rb')
146 | )
147 |
148 | lines = [
149 | f'line{i}'.encode()
150 | for i in range(11000)
151 | ]
152 | random.shuffle(lines)
153 |
154 | test_input_file_one.file.write(b'\n'.join(lines[:10000]))
155 | test_input_file_one.file.flush()
156 |
157 | test_input_file_two.file.write(b'\n'.join(lines[:11000]))
158 | test_input_file_two.file.flush()
159 |
160 | tempdir = tempfile.mkdtemp()
161 | pydeduplines.compute_added_lines(
162 | working_directory=tempdir,
163 | first_file_path=test_input_file_one.name,
164 | second_file_path=test_input_file_two.name,
165 | output_file_path=test_output_file.name,
166 | number_of_splits=number_of_splits,
167 | number_of_threads=number_of_threads,
168 | )
169 | added_lines_file_data = test_output_file.read()
170 | assert sorted(added_lines_file_data.split(b'\n')[:-1]) == sorted(lines[10000:])
171 |
--------------------------------------------------------------------------------