├── .gitignore ├── requirements.txt ├── esdedupe ├── __main__.py ├── __init__.py ├── utils.py ├── cmd.py ├── cli.py └── esdedupe.py ├── requirements-dev.txt ├── .github └── workflows │ └── ruff.yml ├── scripts └── lint ├── Dockerfile.test ├── Dockerfile ├── docker-compose.yml ├── setup.py ├── Makefile ├── tests ├── test_esdedupe_noop.py ├── test_esdedupe_simple.py ├── test_esdedupe_parallel.py └── test_esdedupe_timestamp.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | __pycache__ 3 | *.pyc 4 | build/* 5 | dist/* 6 | esdedupe.egg-info/* 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ujson 2 | tqdm 3 | psutil 4 | elasticsearch>=8.0.0 5 | requests 6 | urllib3>=1.26.2,<2 7 | python-benedict 8 | -------------------------------------------------------------------------------- /esdedupe/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | from .cmd import main 5 | 6 | if __name__ == "__main__": 7 | main() 8 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | ujson 2 | tqdm 3 | psutil 4 | requests 5 | elasticsearch 6 | mock 7 | pytest 8 | pytest-cov 9 | pytest-pep8 10 | autopep8 11 | flake8 12 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: [ push, pull_request ] 3 | jobs: 4 | ruff: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v4 8 | - uses: astral-sh/ruff-action@v3 9 | -------------------------------------------------------------------------------- /esdedupe/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | # flake8: noqa 6 | 7 | from __future__ import absolute_import, division, print_function, unicode_literals 8 | 9 | __VERSION__ = "2.0.0" 10 | 11 | from .esdedupe import Esdedupe 12 | -------------------------------------------------------------------------------- /scripts/lint: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | function main { 5 | cd "$(dirname "$0")/.." 6 | local ROOT=$(pwd) 7 | local PROJ=esdedupe 8 | 9 | local SOURCES="*.py $PROJ/*.py $PROJ/*/*.py tests/*.py" 10 | 11 | pycodestyle --ignore=E221,E241,E251,E722,E741,W504 $SOURCES 12 | flake8 $SOURCES 13 | } 14 | 15 | main "$@" 16 | -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM debian:12-slim AS builder 2 | ENV LANG=C.UTF-8 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update -qq && apt-get install --no-install-recommends -y python3-pip python3-venv python3-dev make g++ \ 5 | && python3 -m venv /opt/venv 6 | ENV PATH="/opt/venv/bin:$PATH" 7 | ADD requirements* /tmp/ 8 | RUN pip3 install wheel && pip3 install -r /tmp/requirements.txt && pip3 install -r /tmp/requirements-dev.txt 9 | WORKDIR /app 10 | ADD . 
/app/ 11 | RUN python3 setup.py install 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:12-slim AS builder 2 | ENV LANG=C.UTF-8 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | RUN apt-get update -qq && apt-get install --no-install-recommends -y python3-venv python3-setuptools python3-dev make g++ \ 5 | && python3 -m venv /opt/venv 6 | ENV PATH="/opt/venv/bin:$PATH" 7 | ADD . /tmp/ 8 | RUN cd /tmp && pip install wheel && pip install -r /tmp/requirements.txt && python3 setup.py install 9 | 10 | FROM debian:12-slim 11 | ENV LANG=C.UTF-8 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | RUN apt-get update \ 15 | && apt-get install --no-install-recommends -y python3\ 16 | && apt-get clean && rm -rf /var/lib/apt/lists/* 17 | 18 | COPY --from=builder /opt/venv /opt/venv 19 | COPY --from=builder /usr/local/bin/ /usr/local/bin/ 20 | 21 | CMD ["/usr/local/bin/esdedupe"] 22 | 23 | -------------------------------------------------------------------------------- /esdedupe/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | import os.path 6 | import psutil 7 | 8 | 9 | SEC_PER_UNIT = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800} 10 | 11 | 12 | def bytes_fmt(num, suffix="B"): 13 | for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: 14 | if abs(num) < 1024.0: 15 | return "%3.1f%s%s" % (num, unit, suffix) 16 | num /= 1024.0 17 | return "%.1f%s%s" % (num, "Y", suffix) 18 | 19 | 20 | def memusage(): 21 | process = psutil.Process(os.getpid()) 22 | rss = process.memory_info().rss 23 | return bytes_fmt(rss) 24 | 25 | 26 | # convert simple time representation to seconds, e.g. 5m, 1h 27 | def time_to_sec(s): 28 | return int(s[:-1]) * SEC_PER_UNIT[s[-1]] 29 | 30 | 31 | # format datetime into Elastic's strict_date_optional_time 32 | def to_es_date(dt): 33 | return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z") 34 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3' 3 | services: 4 | es01: 5 | image: docker.elastic.co/elasticsearch/elasticsearch:7.10.2 6 | container_name: es01 7 | environment: 8 | - node.name=es01 9 | - cluster.name=es-docker-cluster 10 | - discovery.type=single-node 11 | - bootstrap.memory_lock=true 12 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 13 | ulimits: 14 | memlock: 15 | soft: -1 16 | hard: -1 17 | volumes: 18 | - data01:/usr/share/elasticsearch/data 19 | ports: 20 | - 9200:9200 21 | networks: 22 | - elastic 23 | test: 24 | build: 25 | context: . 
26 | dockerfile: Dockerfile.test 27 | volumes: 28 | - ./:/app 29 | command: python3 -m pytest -v --capture=no tests/ 30 | depends_on: 31 | - es01 32 | 33 | volumes: 34 | data01: 35 | driver: local 36 | 37 | networks: 38 | elastic: 39 | driver: bridge 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Always prefer setuptools over distutils 2 | from setuptools import setup, find_packages 3 | import pathlib 4 | 5 | import esdedupe 6 | 7 | here = pathlib.Path(__file__).parent.resolve() 8 | 9 | # Get the long description from the README file 10 | long_description = (here / "README.md").read_text(encoding="utf-8") 11 | 12 | 13 | setup( 14 | author="Tomas Barton", 15 | author_email="barton.tomas@gmail.com", 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: Apache Software License", 19 | "Operating System :: OS Independent", 20 | ], 21 | description="A tool for duplicate removal from Elasticsearch", 22 | entry_points={ 23 | "console_scripts": "esdedupe=esdedupe.cmd:main", 24 | }, 25 | install_requires=["elasticsearch>=8.0.0", "psutil", "tqdm", "ujson", "requests", "python-benedict"], 26 | license="Apache License 2.0", 27 | keywords="elasticsearch", 28 | long_description=long_description, 29 | long_description_content_type="text/markdown", 30 | name="esdedupe", 31 | packages=find_packages(), 32 | url="https://github.com/deric/es-dedupe", 33 | version=esdedupe.__VERSION__, 34 | ) 35 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NAME ?= es-dedupe 2 | REGISTRY ?= deric 3 | 4 | all: clean test 5 | 6 | build: 7 | grep "^FROM" Dockerfile | awk '{ print $$2 }' | uniq | xargs -P2 -n1 docker pull 8 | docker build -t $(NAME) . 9 | 10 | define RELEASE 11 | git tag "v$(1)" 12 | git push 13 | git push --tags 14 | docker tag $(NAME) $(REGISTRY)/$(NAME):v$(1) 15 | docker tag $(NAME) $(REGISTRY)/$(NAME):latest 16 | docker push $(REGISTRY)/$(NAME) 17 | endef 18 | 19 | shell: build 20 | docker run --entrypoint /bin/bash -it $(NAME) 21 | 22 | release: build 23 | $(call RELEASE,$(v)) 24 | 25 | dev: 26 | pip3 install -r requirements.txt -r requirements-dev.txt 27 | 28 | # auto-correct indentation issues 29 | fix: 30 | autopep8 esdedupe/ --recursive --in-place 31 | 32 | lint: 33 | flake8 esdedupe/ 34 | 35 | package: 36 | python3 setup.py sdist bdist_wheel 37 | 38 | test: 39 | docker build -f Dockerfile.test -t esdedupe-test . 40 | docker run -v $(shell pwd):/app --entrypoint /bin/bash -it esdedupe-test 41 | 42 | clean: 43 | find . -name '*.pyc' -exec rm --force {} + 44 | find . -name '*.pyo' -exec rm --force {} + 45 | find . 
-name '*~' -exec rm --force {} + 46 | rm -rf esdedupe.egg-info dist build 47 | 48 | .PHONY: clean test build 49 | -------------------------------------------------------------------------------- /tests/test_esdedupe_noop.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from elasticsearch import Elasticsearch 3 | import random 4 | import string 5 | import time 6 | 7 | import esdedupe 8 | from esdedupe.cli import ArgumentParser 9 | 10 | INDEX = "test-noop" 11 | 12 | 13 | def random_string(length): 14 | # Random string composed of ASCII letters and digits 15 | letters = string.ascii_letters + string.digits 16 | return "".join(random.SystemRandom().choice(letters) for i in range(length)) 17 | 18 | 19 | @pytest.fixture() 20 | def dedupe(): 21 | print("setup") 22 | es = Elasticsearch() 23 | 24 | # ignore 400 caused by IndexAlreadyExistsException when creating an index 25 | es.indices.create(index=INDEX, ignore=400, wait_for_active_shards=1) 26 | print("Created index {}".format(INDEX)) 27 | 28 | # fill with documents 29 | for i in range(10): 30 | es.create(index=INDEX, id=random_string(8), body={"name": "foo"}) 31 | for i in range(10): 32 | es.create(index=INDEX, id=random_string(8), body={"name": "bar"}) 33 | 34 | yield "dedupe" 35 | 36 | # cleanup 37 | es.indices.delete(index=INDEX, ignore=400) 38 | 39 | 40 | def test_es_ping(): 41 | es = Elasticsearch(["localhost"]) 42 | assert es.ping() 43 | 44 | 45 | class TestDedupe: 46 | def test_docs(self, dedupe): 47 | es = Elasticsearch() 48 | res = es.count(index=INDEX) 49 | # make sure Elasticsearch has indexed the inserted documents 50 | 51 | i = 0 52 | while res["count"] < 20: 53 | time.sleep(1) 54 | i += 1 55 | res = es.count(index=INDEX) 56 | if i > 3: 57 | assert False 58 | 59 | dedupe = esdedupe.Esdedupe() 60 | parser = ArgumentParser() 61 | dedupe.run( 62 | parser.parse_args( 63 | ["-i", INDEX, "--field", "name", "--log-stream-stdout", "--noop"] 64 | ) 65 | ) 66 | 67 | assert es.count(index=INDEX)["count"] == 20 68 | -------------------------------------------------------------------------------- /tests/test_esdedupe_simple.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from elasticsearch import Elasticsearch 3 | import random 4 | import string 5 | import time 6 | 7 | import esdedupe 8 | from esdedupe.cli import ArgumentParser 9 | 10 | INDEX = "test-index" 11 | 12 | 13 | def random_string(length): 14 | # Random string composed of ASCII letters and digits 15 | letters = string.ascii_letters + string.digits 16 | return "".join(random.SystemRandom().choice(letters) for i in range(length)) 17 | 18 | 19 | @pytest.fixture() 20 | def dedupe(): 21 | print("setup") 22 | es = Elasticsearch() 23 | 24 | # ignore 400 caused by IndexAlreadyExistsException when creating an index 25 | es.indices.create(index=INDEX, ignore=400, wait_for_active_shards=1) 26 | print("Created index {}".format(INDEX)) 27 | 28 | # fill with documents 29 | for i in range(10): 30 | es.create(index=INDEX, id=random_string(8), body={"name": "foo"}) 31 | for i in range(10): 32 | es.create(index=INDEX, id=random_string(8), body={"name": "bar"}) 33 | 34 | yield "dedupe" 35 | 36 | # cleanup 37 | es.indices.delete(index=INDEX, ignore=400) 38 | 39 | 40 | def test_es_ping(): 41 | es = Elasticsearch(["localhost"]) 42 | assert es.ping() 43 | 44 | 45 | class TestDedupe: 46 | def test_docs(self, dedupe): 47 | es = Elasticsearch() 48 | res = es.count(index=INDEX) 49 | # make sure 
Elasticsearch has indexed the inserted documents 50 | 51 | i = 0 52 | while res["count"] < 20: 53 | time.sleep(1) 54 | i += 1 55 | res = es.count(index=INDEX) 56 | if i > 3: 57 | assert False 58 | 59 | dedupe = esdedupe.Esdedupe() 60 | parser = ArgumentParser() 61 | dedupe.run( 62 | parser.parse_args( 63 | ["-i", INDEX, "--field", "name", "--log-stream-stdout", "--no-progress"] 64 | ) 65 | ) 66 | 67 | i = 0 68 | while res["count"] == 20: 69 | time.sleep(1) 70 | i += 1 71 | res = es.count(index=INDEX) 72 | if i > 3: 73 | assert False 74 | 75 | assert es.count(index=INDEX)["count"] == 2 76 | -------------------------------------------------------------------------------- /tests/test_esdedupe_parallel.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from elasticsearch import Elasticsearch 3 | import random 4 | import string 5 | import time 6 | 7 | import esdedupe 8 | from esdedupe.cli import ArgumentParser 9 | 10 | INDEX = "test-parallel" 11 | 12 | 13 | def random_string(length): 14 | # Random string composed of ASCII letters and digits 15 | letters = string.ascii_letters + string.digits 16 | return "".join(random.SystemRandom().choice(letters) for i in range(length)) 17 | 18 | 19 | @pytest.fixture() 20 | def dedupe(): 21 | print("setup") 22 | es = Elasticsearch() 23 | 24 | # ignore 400 caused by IndexAlreadyExistsException when creating an index 25 | es.indices.create(index=INDEX, ignore=400, wait_for_active_shards=1) 26 | print("Created index {}".format(INDEX)) 27 | 28 | # fill with documents 29 | for i in range(50): 30 | es.create(index=INDEX, id=random_string(8), body={"name": "foo"}) 31 | for i in range(50): 32 | es.create(index=INDEX, id=random_string(8), body={"name": "bar"}) 33 | 34 | yield "dedupe" 35 | 36 | # cleanup 37 | es.indices.delete(index=INDEX, ignore=400) 38 | 39 | 40 | def test_es_ping(): 41 | es = Elasticsearch(["localhost"]) 42 | assert es.ping() 43 | 44 | 45 | class TestDedupe: 46 | def test_docs(self, dedupe): 47 | es = Elasticsearch() 48 | res = es.count(index=INDEX) 49 | # make sure Elasticsearch has indexed the inserted documents 50 | 51 | i = 0 52 | while res["count"] < 100: 53 | time.sleep(1) 54 | i += 1 55 | res = es.count(index=INDEX) 56 | if i > 3: 57 | assert False 58 | 59 | dedupe = esdedupe.Esdedupe() 60 | parser = ArgumentParser() 61 | dedupe.run( 62 | parser.parse_args( 63 | [ 64 | "-i", 65 | INDEX, 66 | "--field", 67 | "name", 68 | "--log-stream-stdout", 69 | "-j 4", 70 | "--no-progress", 71 | ] 72 | ) 73 | ) 74 | 75 | i = 0 76 | while res["count"] == 100: 77 | time.sleep(1) 78 | i += 1 79 | res = es.count(index=INDEX) 80 | if i > 3: 81 | assert False 82 | 83 | assert es.count(index=INDEX)["count"] == 2 84 | -------------------------------------------------------------------------------- /esdedupe/cmd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | from __future__ import absolute_import, division, print_function, unicode_literals 5 | 6 | import os 7 | import sys 8 | 9 | from logging import DEBUG, INFO, WARN, Formatter, StreamHandler, getLogger 10 | from logging.handlers import SysLogHandler 11 | from sys import stderr, stdout 12 | 13 | from . 
import __VERSION__ 14 | from .esdedupe import Esdedupe 15 | from .cli import ArgumentParser 16 | 17 | 18 | def setup_logging(args, default_log_level=INFO, es_log=WARN): 19 | fmt = "%(asctime)s [%(thread)d] %(levelname)-5s %(name)s %(message)s" 20 | formatter = Formatter(fmt=fmt, datefmt="%Y-%m-%dT%H:%M:%S ") 21 | stream = stdout if args.log_stream_stdout else stderr 22 | handler = StreamHandler(stream=stream) 23 | handler.setFormatter(formatter) 24 | logger = getLogger() 25 | logger.addHandler(handler) 26 | 27 | if args.log_syslog: 28 | fmt = ( 29 | "esdedupe[%(process)-5s:%(thread)d]: %(name)s " 30 | "%(levelname)-5s %(message)s" 31 | ) 32 | handler = SysLogHandler( 33 | address=args.syslog_device, facility=args.syslog_facility 34 | ) 35 | handler.setFormatter(Formatter(fmt=fmt)) 36 | logger.addHandler(handler) 37 | 38 | logger.level = DEBUG if args.debug else default_log_level 39 | 40 | # elasticsearch scroll output is too verbose 41 | getLogger("elasticsearch").level = es_log 42 | 43 | 44 | def loglevel(level): 45 | return { 46 | "NOTSET": 0, 47 | "DEBUG": 10, 48 | "INFO": 20, 49 | "WARN": 30, 50 | "WARNING": 30, 51 | "ERROR": 40, 52 | "CRITICAL": 50, 53 | }[level.upper()] 54 | 55 | 56 | def main(): 57 | parser = ArgumentParser( 58 | description="Elastic duplicates deleter", add_help=True, prog="esdedupe" 59 | ) 60 | args = parser.parse_args(sys.argv[1:]) 61 | 62 | if args.version: 63 | print("esdedupe {}".format(__VERSION__)) 64 | os._exit(0) 65 | 66 | setup_logging(args, loglevel(args.level), loglevel(args.es_level)) 67 | try: 68 | dedupe = Esdedupe() 69 | dedupe.run(args) 70 | except KeyboardInterrupt: 71 | print("Interrupted by Keyboard") 72 | try: 73 | sys.exit(0) 74 | except SystemExit: 75 | os._exit(0) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /tests/test_esdedupe_timestamp.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from elasticsearch import Elasticsearch 3 | import random 4 | import string 5 | import time 6 | 7 | import esdedupe 8 | from logging import INFO 9 | from esdedupe.cli import ArgumentParser 10 | from esdedupe.cmd import setup_logging 11 | 12 | INDEX = "test-timeseries" 13 | 14 | 15 | def random_string(length): 16 | # Random string composed of ASCII letters and digits 17 | letters = string.ascii_letters + string.digits 18 | return "".join(random.SystemRandom().choice(letters) for i in range(length)) 19 | 20 | 21 | @pytest.fixture() 22 | def dedupe(): 23 | print("setup") 24 | es = Elasticsearch() 25 | 26 | # ignore 400 caused by IndexAlreadyExistsException when creating an index 27 | es.indices.create(index=INDEX, ignore=400, wait_for_active_shards=1) 28 | print("Created index {}".format(INDEX)) 29 | 30 | # fill with documents 31 | for i in range(5): 32 | es.create( 33 | index=INDEX, 34 | id=random_string(8), 35 | body={"timestamp": "2021-01-01T01:01:00.000Z", "name": "foo"}, 36 | ) 37 | for i in range(5): 38 | es.create( 39 | index=INDEX, 40 | id=random_string(8), 41 | body={"timestamp": "2021-01-01T01:05:00.000Z", "name": "bar"}, 42 | ) 43 | for i in range(5): 44 | es.create( 45 | index=INDEX, 46 | id=random_string(8), 47 | body={"timestamp": "2021-01-01T01:12:00.000Z", "name": "baz"}, 48 | ) 49 | for i in range(5): 50 | es.create( 51 | index=INDEX, 52 | id=random_string(8), 53 | body={"timestamp": "2021-01-01T01:13:00.000Z", "name": "boo"}, 54 | ) 55 | 56 | yield "dedupe" 57 | 58 | # cleanup 59 | 
es.indices.delete(index=INDEX, ignore=400) 60 | 61 | 62 | def test_es_ping(): 63 | es = Elasticsearch(["localhost"]) 64 | assert es.ping() 65 | 66 | 67 | class TestDedupe: 68 | def test_docs(self, dedupe): 69 | es = Elasticsearch() 70 | res = es.count(index=INDEX) 71 | # make sure Elasticsearch has indexed the inserted documents 72 | i = 0 73 | while res["count"] < 20: 74 | time.sleep(1) 75 | i += 1 76 | res = es.count(index=INDEX) 77 | if i > 3: 78 | assert False 79 | print("doc count: {}".format(res["count"])) 80 | 81 | dedupe = esdedupe.Esdedupe() 82 | parser = ArgumentParser() 83 | args = parser.parse_args( 84 | [ 85 | "-i", 86 | INDEX, 87 | "-f", 88 | "name", 89 | "-T", 90 | "timestamp", 91 | "-w 5m", 92 | "-F 2021-01-01T01:01:00", 93 | "-U 2021-01-01T01:20:00", 94 | ] 95 | ) 96 | setup_logging(args, INFO, INFO) 97 | dedupe.run(args) 98 | 99 | i = 0 100 | while res["count"] > 19: 101 | time.sleep(1) 102 | i += 1 103 | print(res["count"]) 104 | res = es.count(index=INDEX) 105 | if i > 3: 106 | assert False 107 | 108 | assert es.count(index=INDEX)["count"] == 4 109 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ES-dedupe 2 | 3 | [![](https://images.microbadger.com/badges/version/deric/es-dedupe.svg)](https://microbadger.com/images/deric/es-dedupe) 4 | [![](https://images.microbadger.com/badges/image/deric/es-dedupe.svg)](https://microbadger.com/images/deric/es-dedupe) 5 | 6 | A tool for removing duplicated documents that are grouped by some unique field (e.g. `--field Uuid`). 7 | 8 | ## Usage 9 | 10 | Use `-h/--help` to see supported options: 11 | ``` 12 | docker run --rm deric/es-dedupe:latest esdedupe --help 13 | ``` 14 | Remove duplicates from index `exact-index-name` while searching for unique `Uuid` field: 15 | 16 | ``` 17 | docker run --rm deric/es-dedupe:latest esdedupe -H localhost -P 9200 -i exact-index-name -f Uuid > es_dedupe.log 2>&1 18 | ``` 19 | 20 | ## Multiple unique fields 21 | 22 | Build a local index using `md5(time,device_id)` as a unique key. This might require a significant amount of memory: the mapping is stored as a Python dict with string keys, so depending on the size of your index it can easily grow to gigabytes (see the sketch after the examples below). 23 | 24 | 25 | ```bash 26 | esdedupe --host localhost --field time,device_id -i my_index --noop 27 | ``` 28 | 29 | 30 | ## Examples 31 | 32 | A more advanced example with documents containing timestamps: 
33 | 34 | ```bash 35 | esdedupe -H localhost -f request_id -i nginx_access_logs-2021.01.29 -b 10000 --timestamp Timestamp --since "2021-01-29T15:30:00.000Z" --until "2021-01-29T16:30:00.000Z" --flush 1500 --request_timeout 180 36 | 2021-02-01T19:58:25 [139754520647488] INFO esdedupe elastic: es01, host: localhost, version: 7.6.0 37 | 2021-02-01T19:58:25 [139754520647488] INFO esdedupe Unique fields: ['request_id'] 38 | 2021-02-01T19:58:25 [139754520647488] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.01.29, batch size: 10000 39 | 2021-02-01T19:59:16 [139754520647488] INFO esdedupe Scanned 987,892 unique documents 40 | 2021-02-01T19:59:16 [139754520647488] INFO esdedupe Memory usage: 414.0MB 41 | 2021-02-01T20:00:03 [139754520647488] INFO esdedupe Scanned 1,950,957 unique documents 42 | 2021-02-01T20:00:03 [139754520647488] INFO esdedupe Memory usage: 695.0MB 43 | 2021-02-01T20:00:46 [139754520647488] INFO esdedupe Scanned 2,861,671 unique documents 44 | 2021-02-01T20:00:46 [139754520647488] INFO esdedupe Memory usage: 1007.3MB 45 | 2021-02-01T20:01:37 [139754520647488] INFO esdedupe Scanned 3,579,286 unique documents 46 | 2021-02-01T20:01:37 [139754520647488] INFO esdedupe Memory usage: 1.2GB 47 | 2021-02-01T20:02:16 [139754520647488] INFO esdedupe Found 810,993 duplicates out of 4,833,500 docs, unique documents: 4,022,507 (16.8% duplicates) 48 | 100%█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 810001/810993 [7:39:44<00:26, 37.16docs/s] 49 | 2021-02-02T03:42:01 [139754520647488] INFO esdedupe Deleted 1,621,986/810,993 documents 50 | 2021-02-02T03:42:01 [139754520647488] INFO esdedupe Successfully completed duplicates removal. Took: 7:43:36.313482 51 | ``` 52 | 53 | 54 | WARNING: Running huge bulk operations on Elastic cluster might influence performance of your cluster or even crash some nodes if heap is not large enough. 
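For reference, the sketch below shows roughly how the in-memory duplicates mapping is built (a condensed version of `Esdedupe.build_index`; the `hits` iterable is a hypothetical stand-in for documents returned by Elasticsearch's scroll API):

```python
import hashlib

# docs_hash maps a key derived from the unique field(s) to all matching
# document IDs; any key with more than one ID marks a group of duplicates,
# and every document in the group except the first one gets deleted.
docs_hash = {}

def doc_key(hit, unique_fields):
    # A single unique field is used directly as the key; multiple fields
    # are concatenated and hashed with MD5 to keep the key size bounded.
    if len(unique_fields) == 1:
        return str(hit["_source"][unique_fields[0]])
    combined = "".join(str(hit["_source"][f]) for f in unique_fields)
    return hashlib.md5(combined.encode("utf-8")).digest()

for hit in hits:
    docs_hash.setdefault(doc_key(hit, ["time", "device_id"]), []).append(hit["_id"])
```

Every scanned document adds an entry to this dict, which is why memory grows with the number of unique keys and why restricting the scan to a time window, as shown below, keeps usage bounded.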
55 | 56 | A sliding window `-w / --window` could be used to prevent running out of memory on larger indexes (if you have a timestamp field): 57 | 58 | ```bash 59 | $ esdedupe -H localhost -f request_id -i nginx_access_logs-2021.02.01 -b 10000 --timestamp Timestamp --since 2021-02-01T00:00:00 --until 2021-02-01T10:30:00 --flush 2500 --request_timeout 180 -w 10m --es-level WARN 60 | 2021-02-07T01:27:07 [140045012879168] INFO esdedupe Found 1,544 duplicates out of 162,805 docs, unique documents: 161,261 (0.9% duplicates) 61 | 0%| | 1/1544 [00:17<7:25:23, 17.32s/docs]2021-02-07T01:27:25 [140045012879168] INFO esdedupe Deleted 3,088 documents (including shard replicas) 62 | 2021-02-07T01:27:25 [140045012879168] INFO esdedupe Using window 10m, from: 2021-02-01T09:30:00.000Z until: 2021-02-01T09:40:00.000Z 63 | 2021-02-07T01:27:25 [140045012879168] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.02.01, batch size: 10000 64 | 100%|██████████| 1544/1544 [00:18<00:00, 83.11docs/s] 65 | 2021-02-07T01:27:33 [140045012879168] INFO esdedupe Found 1,338 duplicates out of 162,882 docs, unique documents: 161,544 (0.8% duplicates) 66 | 0%| | 1/1338 [00:19<7:23:17, 19.89s/docs]2021-02-07T01:27:53 [140045012879168] INFO esdedupe Deleted 2,676 documents (including shard replicas) 67 | 2021-02-07T01:27:53 [140045012879168] INFO esdedupe Using window 10m, from: 2021-02-01T09:40:00.000Z until: 2021-02-01T09:50:00.000Z 68 | 2021-02-07T01:27:53 [140045012879168] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.02.01, batch size: 10000 69 | 100%|██████████| 1338/1338 [00:20<00:00, 64.36docs/s] 70 | 2021-02-07T01:28:02 [140045012879168] INFO esdedupe Found 1,321 duplicates out of 165,664 docs, unique documents: 164,343 (0.8% duplicates) 71 | 0%| | 1/1321 [00:13<4:56:58, 13.50s/docs]2021-02-07T01:28:15 [140045012879168] INFO esdedupe Deleted 2,642 documents (including shard replicas) 72 | 2021-02-07T01:28:15 [140045012879168] INFO esdedupe Using window 10m, from: 2021-02-01T09:50:00.000Z until: 2021-02-01T10:00:00.000Z 73 | 2021-02-07T01:28:15 [140045012879168] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.02.01, batch size: 10000 74 | 100%|██████████| 1321/1321 [00:14<00:00, 88.39docs/s] 75 | 2021-02-07T01:28:25 [140045012879168] INFO esdedupe Found 1,291 duplicates out of 168,842 docs, unique documents: 167,551 (0.8% duplicates) 76 | 0%| | 1/1291 [00:12<4:20:59, 12.14s/docs]2021-02-07T01:28:37 [140045012879168] INFO esdedupe Deleted 2,582 documents (including shard replicas) 77 | 2021-02-07T01:28:37 [140045012879168] INFO esdedupe Using window 10m, from: 2021-02-01T10:00:00.000Z until: 2021-02-01T10:10:00.000Z 78 | 2021-02-07T01:28:37 [140045012879168] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.02.01, batch size: 10000 79 | 100%|██████████| 1291/1291 [00:15<00:00, 82.91docs/s] 80 | 2021-02-07T01:28:48 [140045012879168] INFO esdedupe Found 1,371 duplicates out of 173,650 docs, unique documents: 172,279 (0.8% duplicates) 81 | 0%| | 1/1371 [00:18<7:07:57, 18.74s/docs]2021-02-07T01:29:07 [140045012879168] INFO esdedupe Deleted 2,742 documents (including shard replicas) 82 | 2021-02-07T01:29:07 [140045012879168] INFO esdedupe Using window 10m, from: 2021-02-01T10:10:00.000Z until: 2021-02-01T10:20:00.000Z 83 | 2021-02-07T01:29:07 [140045012879168] INFO esdedupe Building documents mapping on index: nginx_access_logs-2021.02.01, batch size: 10000 84 | 100%|██████████| 1371/1371 [00:19<00:00, 68.59docs/s] 85 | 
2021-02-07T01:29:16 [140045012879168] INFO esdedupe Found 1,340 duplicates out of 183,592 docs, unique documents: 182,252 (0.7% duplicates) 86 | 0%| | 1/1340 [00:21<8:00:21, 21.52s/docs]2021-02-07T01:29:38 [140045012879168] INFO esdedupe Deleted 2,680 documents (including shard replicas) 87 | 2021-02-07T01:29:38 [140045012879168] INFO esdedupe Altogether 14115806 documents were removed (including doc replicas) 88 | 2021-02-07T01:29:38 [140045012879168] INFO esdedupe Total time: 1 day, 10:15:43.528495 89 | 90 | ``` 91 | 92 | ## Requirements 93 | For the installation use the tools provided by your operating system. 94 | 95 | On Linux this can be one of the following: yum, dnf, apt, yast, emerge, ... 96 | 97 | * Install Python 3 98 | * Install the matching ujson and requests packages for your Python version (e.g. python3-ujson, python3-requests) 99 | 100 | 101 | On Windows you are pretty much on your own, but fear not, you can do the following ;-) 102 | 103 | * Download and install Python 3 from https://www.python.org/ . 104 | * Open a console terminal, head to your copy of the es-dedupe repository, then run: 105 | pip install -r requirements.txt 106 | 107 | 108 | ## Testing 109 | 110 | Tests can be run from a Docker container. You can use the supplied `docker-compose` file: 111 | ```bash 112 | docker-compose up 113 | ``` 114 | 115 | Manually run tests: 116 | ```bash 117 | pip3 install -r requirements-dev.txt 118 | python3 -m pytest -v --capture=no tests/ 119 | ``` 120 | 121 | 122 | ## History 123 | 124 | Originally written in bash, which performed terribly due to slow JSON processing with pipes and `jq`. Python with `ujson` seems to be better suited to this task. 125 | -------------------------------------------------------------------------------- /esdedupe/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | import datetime 6 | 7 | from argparse import ArgumentParser as _Base 8 | 9 | 10 | class ArgumentParser(_Base): 11 | def __init__(self, *args, **kwargs): 12 | super(ArgumentParser, self).__init__(*args, **kwargs) 13 | 14 | def parse_args(self, args): 15 | self.add_argument( 16 | "-a", 17 | "--all", 18 | action="store_true", 19 | dest="all", 20 | default=True, 21 | help="All indexes from given date till today", 22 | ) 23 | self.add_argument( 24 | "-b", 25 | "--batch", 26 | dest="batch", 27 | default=1000, 28 | type=int, 29 | help="Number of documents retrieved using one search request", 30 | ) 31 | self.add_argument( 32 | "-H", 33 | "--host", 34 | dest="host", 35 | default="localhost", 36 | help="Elasticsearch hostname", 37 | metavar="host", 38 | ) 39 | self.add_argument( 40 | "-f", 41 | "--field", 42 | dest="field", 43 | default="Uuid", 44 | help="Field in ES that is supposed to be unique", 45 | metavar="field", 46 | ) 47 | self.add_argument( 48 | "--flush", 49 | dest="flush", 50 | default=500, 51 | type=int, 52 | help="Number of records sent in one bulk request", 53 | ) 54 | self.add_argument( 55 | "-i", 56 | "--index", 57 | dest="index", 58 | default="", 59 | help="Elasticsearch full index name, implies NOT --all", 60 | metavar="index", 61 | ) 62 | self.add_argument( 63 | "-I", 64 | "--indexexclude", 65 | dest="indexexclude", 66 | default="", 67 | help="""Elasticsearch regular expression of index 68 | name that is to be excluded, only useful with --all""", 69 | metavar="indexexclude-regexp", 70 | ) 71 | self.add_argument( 72 | "-j", 73 | "--threads", 74 | dest="threads", 75 | default=1, 76 | 
type=int, 77 | help="""Number of threads to execute delete queries, 78 | when 1, sequential delete will be used. Note: parallel 79 | delete can easily overload the cluster""", 80 | ) 81 | self.add_argument( 82 | "-p", 83 | "--prefix", 84 | dest="prefix", 85 | default="*", 86 | help="Elasticsearch index prefix", 87 | metavar="prefix", 88 | ) 89 | self.add_argument( 90 | "-S", 91 | "--prefixseparator", 92 | dest="prefixseparator", 93 | default="-", 94 | help="""Elasticsearch index prefix separator to use 95 | between prefix, idxname and *""", 96 | metavar="prefixsep", 97 | ) 98 | self.add_argument( 99 | "-P", 100 | "--port", 101 | dest="port", 102 | default=9200, 103 | type=int, 104 | help="Elasticsearch port", 105 | metavar="port", 106 | ) 107 | self.add_argument( 108 | "-t", "--doc_type", dest="doc_type", default=None, help="ES document type" 109 | ) 110 | self.add_argument( 111 | "-T", "--timestamp", dest="timestamp", default=None, help="Timestamp field" 112 | ) 113 | self.add_argument( 114 | "-F", 115 | "--since", 116 | dest="since", 117 | type=lambda s: datetime.datetime.strptime(s.lstrip(), "%Y-%m-%dT%H:%M:%S"), 118 | default=None, 119 | help="Search from given timestamp", 120 | ) 121 | self.add_argument( 122 | "-U", 123 | "--until", 124 | dest="until", 125 | default=None, 126 | type=lambda s: datetime.datetime.strptime(s.lstrip(), "%Y-%m-%dT%H:%M:%S"), 127 | help="Search until given timestamp", 128 | ) 129 | self.add_argument( 130 | "-w", 131 | "--window", 132 | dest="window", 133 | default=None, 134 | help="Time window; requires the --timestamp, --since and --until flags", 135 | ) 136 | self.add_argument( 137 | "-v", 138 | "--version", 139 | action="store_true", 140 | dest="version", 141 | default=False, 142 | help="Print version and exit", 143 | ) 144 | self.add_argument( 145 | "--fail-fast", 146 | action="store_true", 147 | dest="fail_fast", 148 | default=False, 149 | help="Exit on exception from Elasticsearch", 150 | ) 151 | self.add_argument( 152 | "-r", 153 | "--max_retries", 154 | dest="max_retries", 155 | default=3, 156 | type=int, 157 | help="Maximum retries for rejected bulk delete", 158 | ) 159 | self.add_argument( 160 | "--initial_backoff", 161 | dest="initial_backoff", 162 | default=2, 163 | type=int, 164 | help="""Number of seconds we should wait before the first retry. 
165 | Any subsequent retries will be powers of 166 | initial_backoff * 2**retry_number""", 167 | ) 168 | self.add_argument( 169 | "--scroll", 170 | dest="scroll", 171 | default="10m", 172 | help="Specify how long a consistent view of the index should be maintained for scrolled search", 173 | ) 174 | self.add_argument( 175 | "--request_timeout", 176 | dest="request_timeout", 177 | default=60, 178 | type=int, 179 | help="Elasticsearch timeout in seconds", 180 | ) 181 | self.add_argument( 182 | "-d", 183 | "--debug", 184 | action="store_true", 185 | dest="debug", 186 | default=False, 187 | help="Enable debugging", 188 | ) 189 | self.add_argument( 190 | "--no-check", 191 | action="store_true", 192 | dest="no_check", 193 | default=False, 194 | help="Disable check & removal of duplicates found afterwards with a standard search query", 195 | ) 196 | self.add_argument( 197 | "-l", 198 | "--level", 199 | dest="level", 200 | default="INFO", 201 | help="Python logging level (DEBUG, INFO, WARN, ERROR, CRITICAL)", 202 | ) 203 | self.add_argument( 204 | "--es-level", 205 | dest="es_level", 206 | default="WARN", 207 | help="Elasticsearch logging level (DEBUG, INFO, WARN, ERROR, CRITICAL)", 208 | ) 209 | self.add_argument( 210 | "--log_dupl", 211 | dest="log_dupl", 212 | default=None, 213 | help="File to store duplicates mapping in JSON format", 214 | ) 215 | self.add_argument( 216 | "--log_done", 217 | dest="log_done", 218 | default="es_dedupe.done", 219 | help="Logfile containing all document IDs that remained in ES", 220 | ) 221 | self.add_argument( 222 | "--check_log", dest="check", help="Verify that documents have been deleted" 223 | ) 224 | self.add_argument( 225 | "-n", 226 | "--noop", 227 | action="store_true", 228 | dest="noop", 229 | default=False, 230 | help="Do not take any destructive action (only print delete queries)", 231 | ) 232 | self.add_argument("--user", dest="user", default=None, help="HTTP auth user") 233 | self.add_argument( 234 | "--password", dest="password", default=None, help="HTTP auth password" 235 | ) 236 | self.add_argument( 237 | "--ssl", action="store_true", dest="ssl", default=False, help="Use SSL" 238 | ) 239 | self.add_argument( 240 | "-k", 241 | "--insecure", 242 | action="store_false", 243 | dest="cert_verify", 244 | default=True, 245 | help="Allow connections to ES without server certificate verification", 246 | ) 247 | self.add_argument( 248 | "--log-stream-stdout", 249 | action="store_true", 250 | default=False, 251 | help="Log to stdout instead of stderr", 252 | ) 253 | _help = "Send logging data to syslog in addition to stderr" 254 | self.add_argument( 255 | "--log-syslog", action="store_true", default=False, help=_help 256 | ) 257 | self.add_argument("--syslog-device", default="/dev/log", help="Syslog device") 258 | self.add_argument("--syslog-facility", default="local0", help="Syslog facility") 259 | self.add_argument( 260 | "--mem-report", 261 | dest="mem_report", 262 | default=1000000, 263 | type=int, 264 | help="Report memory usage while parsing, every N documents (default: 1000000)", 265 | ) 266 | self.add_argument( 267 | "--no-progress", 268 | action="store_true", 269 | dest="no_progress", 270 | default=False, 271 | help="Hide progress bar", 272 | ) 273 | print("esdedupe {}".format(" ".join(args))) 274 | try: 275 | return super(ArgumentParser, self).parse_args(args) 276 | except ValueError as e: 277 | # bad date formats etc. surface as ValueError from the type= callbacks 278 | self.error(str(e)) 279 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /esdedupe/esdedupe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | 5 | import hashlib 6 | import time 7 | import tqdm 8 | import ujson 9 | import requests 10 | import sys 11 | 12 | from benedict import benedict 13 | from elasticsearch import Elasticsearch, helpers 14 | from elasticsearch.helpers import parallel_bulk 15 | from elasticsearch.helpers import streaming_bulk 16 | from logging import getLogger 17 | from datetime import timedelta 18 | 19 | from . 
import __VERSION__ 20 | from .utils import memusage, time_to_sec, to_es_date 21 | 22 | 23 | class Esdedupe: 24 | def __init__(self): 25 | self.log = getLogger("esdedupe") 26 | self.total = 0 27 | 28 | # Process documents returned by the current search/scroll 29 | def build_index(self, docs_hash, unique_fields, hit): 30 | hashval = None 31 | hit_benedict = benedict(hit) 32 | _id = hit_benedict["_id"] 33 | # there's no need to hash, if we have just single unique key 34 | if len(unique_fields) > 1: 35 | combined_key = "" 36 | for field in unique_fields: 37 | combined_key += str(hit_benedict["_source"][field]) 38 | hashval = hashlib.md5(combined_key.encode("utf-8")).digest() 39 | else: 40 | hashval = str(hit_benedict["_source"][unique_fields[0]]) 41 | 42 | docs_hash.setdefault(hashval, []).append(_id) 43 | 44 | def elastic_uri(self, args): 45 | if args.host.startswith("http"): 46 | return "{0}:{1}".format(args.host, args.port) 47 | else: 48 | if args.ssl: 49 | return "https://{0}:{1}".format(args.host, args.port) 50 | else: 51 | return "http://{0}:{1}".format(args.host, args.port) 52 | 53 | def ping(self, args): 54 | uri = self.elastic_uri(args) 55 | if not args.cert_verify: 56 | requests.urllib3.disable_warnings() 57 | try: 58 | self.log.debug("GET {0}".format(uri)) 59 | if args.user: 60 | from requests.auth import HTTPBasicAuth 61 | 62 | resp = requests.get( 63 | uri, 64 | auth=HTTPBasicAuth(args.user, args.password), 65 | verify=args.cert_verify, 66 | ) 67 | else: 68 | resp = requests.get(uri, verify=args.cert_verify) 69 | self.log.debug("Response: {0}".format(resp.text)) 70 | if resp.status_code == 200: 71 | return 72 | else: 73 | self.log.error("{0}: {1}".format(uri, resp.text)) 74 | sys.exit(1) 75 | except requests.exceptions.SSLError as e: 76 | self.log.error( 77 | "Certificate verification failed on {0} , use -k to skip checking the certificate".format( 78 | uri 79 | ) 80 | ) 81 | self.log.error(e) 82 | sys.exit(1) 83 | except requests.exceptions.ConnectionError as e: 84 | self.log.error("Connection failed. 
Is ES running on {0} ?".format(uri)) 85 | self.log.error("Check --host argument and --port") 86 | self.log.error(e) 87 | # do not show this terrible traceback 88 | # self.log.error(e) 89 | sys.exit(1) 90 | return 91 | 92 | def run(self, args): 93 | start = time.time() 94 | uri = self.elastic_uri(args) 95 | self.log.info( 96 | "Starting esdedupe: {} - duplicate document removal tool".format( 97 | __VERSION__ 98 | ) 99 | ) 100 | if args.noop: 101 | self.log.info("Running in NOOP mode, no document will be deleted.") 102 | try: 103 | # test connection to Elasticsearch cluster first 104 | self.ping(args) 105 | if args.user: 106 | es = Elasticsearch( 107 | [uri], 108 | basic_auth=(args.user, args.password), 109 | verify_certs=args.cert_verify, 110 | ssl_show_warn=args.cert_verify, 111 | ) 112 | else: 113 | es = Elasticsearch( 114 | hosts=[uri], 115 | verify_certs=args.cert_verify, 116 | ssl_show_warn=args.cert_verify, 117 | ) 118 | 119 | resp = es.info() 120 | self.log.info( 121 | "elastic: {}, host: {}, version: {}".format( 122 | resp["cluster_name"], args.host, resp["version"]["number"] 123 | ) 124 | ) 125 | 126 | docs = {} 127 | dupl = 0 128 | 129 | # one or more fields to form a unique key (primary key) 130 | pk = args.field.split(",") 131 | self.log.info("Unique fields: {}".format(pk)) 132 | 133 | if args.index != "": 134 | index = args.index 135 | # if indexname specifically was set, do not do --all mode 136 | args.all = False 137 | self.process_index(es, docs, pk, dupl, index, args) 138 | 139 | end = time.time() 140 | if args.noop: 141 | self.log.info( 142 | "Simulation finished. Took: {0}".format( 143 | timedelta(seconds=(end - start)) 144 | ) 145 | ) 146 | else: 147 | if dupl > 0: 148 | self.log.info( 149 | """Successfully completed duplicates removal. 
150 | Took: {0}""".format(timedelta(seconds=(end - start))) 151 | ) 152 | else: 153 | self.log.info( 154 | "Total time: {0}".format(timedelta(seconds=(end - start))) 155 | ) 156 | 157 | except ConnectionError as e: 158 | self.log.error(e) 159 | 160 | def process_index(self, es, docs, pk, dupl, index, args): 161 | if args.window: 162 | if not args.timestamp: 163 | self.log.error("Please specify --timestamp field") 164 | sys.exit(1) 165 | if not args.since: 166 | self.log.error( 167 | "Please specify --since timepoint in %Y-%m-%dT%H:%M:%S format" 168 | ) 169 | sys.exit(1) 170 | if not args.until: 171 | self.log.error( 172 | "Please specify --until timepoint in %Y-%m-%dT%H:%M:%S format" 173 | ) 174 | sys.exit(1) 175 | 176 | win = time_to_sec(args.window) 177 | self.log.info( 178 | "Timestamp based search, with window {} from {} until {}".format( 179 | args.window, args.since, args.until 180 | ) 181 | ) 182 | 183 | end = args.until 184 | 185 | currStart = args.since 186 | currEnd = args.since + timedelta(seconds=win) 187 | self.total = 0 188 | # scan & remove using sliding window 189 | while currEnd < end: 190 | docs = {} # avoid deleting same documents again and again 191 | self.log.info( 192 | "Using window {}, from: {} until: {}".format( 193 | args.window, to_es_date(currStart), to_es_date(currEnd) 194 | ) 195 | ) 196 | args.since = currStart 197 | args.until = currEnd 198 | self.total += self.scan_and_remove(es, docs, pk, dupl, index, args) 199 | currStart += timedelta(seconds=win) 200 | currEnd += timedelta(seconds=win) 201 | docs = {} # reset mapping before processing the final partial window 202 | if currStart < end: 203 | self.log.info( 204 | "Last check, from: {} until: {}".format( 205 | to_es_date(currStart), to_es_date(end) 206 | ) 207 | ) 208 | args.since = currStart 209 | args.until = end 210 | self.total += self.scan_and_remove(es, docs, pk, dupl, index, args) 211 | else: 212 | # "normal" index without timestamps 213 | self.total += self.scan_and_remove(es, docs, pk, dupl, index, args) 214 | self.log.info( 215 | "Altogether {} documents were removed (including doc replicas)".format( 216 | self.total 217 | ) 218 | ) 219 | 220 | def scan(self, es, docs_hash, unique_fields, index, args): 221 | i = 0 222 | self.log.info( 223 | "Building documents mapping on index: {}, batch size: {}".format( 224 | index, args.batch 225 | ) 226 | ) 227 | for hit in helpers.scan( 228 | es, 229 | index=index, 230 | size=args.batch, 231 | query=self.es_query(args), 232 | scroll=args.scroll, 233 | request_timeout=args.request_timeout, 234 | ): 235 | self.build_index(docs_hash, unique_fields, hit) 236 | i += 1 237 | if i % args.mem_report == 0: 238 | self.log.debug( 239 | "Scanned {:0,} unique documents, memory usage: {}".format( 240 | len(docs_hash), memusage() 241 | ) 242 | ) 243 | return self.count_duplicates(docs_hash) 244 | 245 | def scan_and_remove(self, es, docs_hash, unique_fields, dupl, index, args): 246 | # find duplicate documents 247 | dupl = self.scan(es, docs_hash, unique_fields, index, args) 248 | if dupl == 0: 249 | self.log.info("No duplicates found") 250 | else: 251 | unique = len(docs_hash) 252 | self.log.info( 253 | "Found {:0,} duplicates out of {:0,} docs, unique documents: {:0,} ({:.1f}% duplicates)".format( 254 | dupl, 255 | dupl + unique, 256 | unique, 257 | dupl / (dupl + unique) * 100, 258 | ) 259 | ) 260 | 261 | if args.log_dupl: 262 | self.save_documents_mapping(docs_hash, args) 263 | if args.noop: 264 | self.log.info( 265 | """In order to print matching IDs to stdout run with 266 | --debug flag or save results to JSON file using 
--log_dupl docs.json""" 267 | ) 268 | if args.debug: 269 | self.print_duplicates(docs_hash, index, es, args) 270 | else: 271 | if args.threads > 1: 272 | return self.parallel_delete(docs_hash, index, es, args, dupl) 273 | else: 274 | # safer option, should avoid overloading elastic 275 | return self.sequential_delete(docs_hash, index, es, args, dupl) 276 | return 0 277 | 278 | def es_query(self, args): 279 | if args.timestamp: 280 | filter = {"format": "strict_date_optional_time"} 281 | if args.since: 282 | # Greater than or equal to 283 | filter["gte"] = to_es_date(args.since) 284 | if args.until: 285 | # Less than 286 | filter["lt"] = to_es_date(args.until) 287 | query = { 288 | "query": {"bool": {"filter": [{"range": {args.timestamp: filter}}]}} 289 | } 290 | return query 291 | else: 292 | return {} 293 | 294 | def print_duplicates(self, docs_hash, index, es, args): 295 | for hashval, ids in docs_hash.items(): 296 | if len(ids) > 1: 297 | # Get the documents that have mapped to the current hashval 298 | matching_docs = es.mget(index=index, body={"ids": ids}) 299 | for doc in matching_docs["docs"]: 300 | print("doc=%s" % doc) 301 | 302 | # For catching Elasticsearch exceptions 303 | def wrapper(self, gen): 304 | while True: 305 | try: 306 | yield next(gen) 307 | except StopIteration: 308 | break 309 | except Exception as e: 310 | # TODO: after catching exception we're unable to continue 311 | # which is good, we don't overload ES cluster 312 | self.log.error(e) 313 | 314 | def sequential_delete(self, docs_hash, index, es, args, duplicates): 315 | if not args.no_progress: 316 | progress = tqdm.tqdm(unit="docs", total=duplicates) 317 | successes = 0 318 | 319 | for success, info in self.wrapper( 320 | streaming_bulk( 321 | es, 322 | self.delete_iterator(docs_hash, index, args), 323 | max_retries=args.max_retries, 324 | initial_backoff=args.initial_backoff, 325 | request_timeout=args.request_timeout, 326 | chunk_size=args.flush, 327 | raise_on_exception=args.fail_fast, 328 | ) 329 | ): 330 | if success: 331 | successes += info["delete"]["_shards"]["successful"] 332 | else: 333 | print("Doc failed", info) 334 | if not args.no_progress: 335 | progress.update(1) 336 | 337 | self.log.info( 338 | "Deleted {:0,} documents (including shard replicas)".format(successes) 339 | ) 340 | return successes 341 | 342 | def parallel_delete(self, docs_hash, index, es, args, duplicates): 343 | if not args.no_progress: 344 | progress = tqdm.tqdm(unit="docs", total=duplicates) 345 | successes = 0 346 | 347 | for success, info in self.wrapper( 348 | parallel_bulk( 349 | es, 350 | self.delete_iterator(docs_hash, index, args), 351 | thread_count=args.threads, 352 | request_timeout=args.request_timeout, 353 | chunk_size=args.flush, 354 | raise_on_exception=args.fail_fast, 355 | ) 356 | ): 357 | if success: 358 | successes += info["delete"]["_shards"]["successful"] 359 | else: 360 | print("Doc failed", info) 361 | if not args.no_progress: 362 | progress.update(1) 363 | 364 | self.log.info( 365 | "Deleted {:0,} documents (including shard replicas)".format(successes) 366 | ) 367 | return successes 368 | 369 | def delete_iterator(self, docs_hash, index, args): 370 | for hashval, ids in docs_hash.items(): 371 | if len(ids) > 1: 372 | i = 0 373 | for doc_id in ids: 374 | if i > 0: # skip first document 375 | doc = {"_op_type": "delete", "_index": index, "_id": doc_id} 376 | if args.doc_type: 377 | doc["_type"] = args.doc_type 378 | yield doc 379 | i += 1 380 | 381 | def count_duplicates(self, docs_hash): 382 | duplicates = 0 383 
| for hashval, ids in docs_hash.items(): 384 | size = len(ids) 385 | if size > 1: 386 | duplicates += size - 1 387 | return duplicates 388 | 389 | def save_documents_mapping(self, docs_hash, args): 390 | self.log.info("Storing documents mapping into: {}".format(args.log_dupl)) 391 | with open(args.log_dupl, "w", encoding="utf8") as ujson_file: 392 | # md5 digest keys are bytes; hex-encode them so the mapping is JSON-serializable 393 | ujson.dump({k.hex() if isinstance(k, bytes) else k: v for k, v in docs_hash.items()}, ujson_file) 394 | --------------------------------------------------------------------------------