├── so ├── files.txt ├── raw_data_prep_script.zip ├── operations │ └── default.json ├── track.json ├── index.json ├── challenges │ └── default.json └── README.md ├── noaa ├── files.txt ├── track.json ├── README.md ├── operations │ └── default.json ├── challenges │ └── default.json ├── index.json └── _tools │ └── process.py ├── pmc ├── files.txt ├── track.py ├── track.json ├── index.json ├── operations │ └── default.json ├── README.md └── challenges │ └── default.json ├── eventdata ├── files.txt ├── operations │ └── default.json ├── track.json ├── challenges │ └── default.json ├── index.json └── README.md ├── geopoint ├── files.txt ├── index.json ├── track.json ├── README.md ├── operations │ └── default.json └── challenges │ └── default.json ├── nested ├── files.txt ├── raw_data_prep_scripts.zip ├── track.json ├── index.json ├── queries.csv ├── operations │ └── default.json ├── README.md ├── challenges │ └── default.json └── track.py ├── nyc_taxis ├── files.txt ├── track.py ├── track.json ├── index.json ├── operations │ └── default.json ├── README.md ├── _tools │ └── parse.py └── challenges │ └── default.json ├── geonames ├── files.txt ├── track.json ├── index.json ├── README.md ├── track.py ├── operations │ └── default.json └── challenges │ └── default.json ├── geopointshape ├── files.txt ├── index.json ├── _tools │ └── parse.py ├── track.json ├── operations │ └── default.json ├── README.md └── challenges │ └── default.json ├── percolator ├── files.txt ├── index.json ├── track.json ├── operations │ └── default.json ├── challenges │ └── default.json └── README.md ├── metricbeat ├── files.txt ├── track.json ├── operations │ └── default.json └── challenges │ └── default.json ├── geoshape ├── files.txt ├── index.json ├── _tools │ └── parse.py ├── README.md ├── operations │ └── default.json ├── track.json └── challenges │ └── default.json ├── http_logs ├── track.py ├── files.txt ├── index.json ├── _tools │ └── unparse.rb ├── README.md ├── track.json └── operations │ └── default.json ├── .gitignore ├── download.sh └── README.md /so/files.txt: -------------------------------------------------------------------------------- 1 | posts.json.bz2 2 | posts-1k.json.bz2 -------------------------------------------------------------------------------- /noaa/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /pmc/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /eventdata/files.txt: -------------------------------------------------------------------------------- 1 | eventdata.json.bz2 2 | eventdata-1k.json.bz2 -------------------------------------------------------------------------------- /geopoint/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /nested/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /nyc_taxis/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 
| documents-1k.json.bz2 -------------------------------------------------------------------------------- /geonames/files.txt: -------------------------------------------------------------------------------- 1 | documents-2.json.bz2 2 | documents-2-1k.json.bz2 -------------------------------------------------------------------------------- /geopointshape/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /percolator/files.txt: -------------------------------------------------------------------------------- 1 | queries-2.json.bz2 2 | queries-2-1k.json.bz2 -------------------------------------------------------------------------------- /metricbeat/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 3 | -------------------------------------------------------------------------------- /so/raw_data_prep_script.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlrx/rally-tracks/master/so/raw_data_prep_script.zip -------------------------------------------------------------------------------- /nested/raw_data_prep_scripts.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlrx/rally-tracks/master/nested/raw_data_prep_scripts.zip -------------------------------------------------------------------------------- /geoshape/files.txt: -------------------------------------------------------------------------------- 1 | linestrings.json.bz2 2 | linestrings-1k.json.bz2 3 | multilinestrings.json.bz2 4 | multilinestrings-1k.json.bz2 5 | polygons.json.bz2 6 | polygons-1k.json.bz2 7 | -------------------------------------------------------------------------------- /so/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | } 7 | -------------------------------------------------------------------------------- /eventdata/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | } 7 | -------------------------------------------------------------------------------- /http_logs/track.py: -------------------------------------------------------------------------------- 1 | def reindex(es, params): 2 | result = es.reindex(body=params.get("body"), request_timeout=params.get("request_timeout")) 3 | return result["total"], "docs" 4 | 5 | def register(registry): 6 | registry.register_runner("reindex", reindex) 7 | -------------------------------------------------------------------------------- /pmc/track.py: -------------------------------------------------------------------------------- 1 | def put_settings(es, params): 2 | es.cluster.put_settings(body=params["body"]) 3 | 4 | 5 | def register(registry): 6 | # register a fallback for older Rally versions 7 | try: 8 | from esrally.driver.runner import PutSettings 9 | except ImportError: 10 | registry.register_runner("put-settings", 
put_settings) 11 | -------------------------------------------------------------------------------- /nyc_taxis/track.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def wait_for_ml_lookback(es, params): 5 | while True: 6 | response = es.xpack.ml.get_datafeed_stats(datafeed_id=params["datafeed-id"]) 7 | if response["datafeeds"][0]["state"] == "stopped": 8 | break 9 | time.sleep(5) 10 | 11 | 12 | def register(registry): 13 | registry.register_runner("wait-for-ml-lookback", wait_for_ml_lookback) 14 | -------------------------------------------------------------------------------- /http_logs/files.txt: -------------------------------------------------------------------------------- 1 | documents-181998.json.bz2 2 | documents-191998.json.bz2 3 | documents-201998.json.bz2 4 | documents-211998.json.bz2 5 | documents-221998.json.bz2 6 | documents-231998.json.bz2 7 | documents-241998.json.bz2 8 | documents-181998-1k.json.bz2 9 | documents-191998-1k.json.bz2 10 | documents-201998-1k.json.bz2 11 | documents-211998-1k.json.bz2 12 | documents-221998-1k.json.bz2 13 | documents-231998-1k.json.bz2 14 | documents-241998-1k.json.bz2 15 | -------------------------------------------------------------------------------- /geoshape/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "shape": { 14 | "type": "geo_shape" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geopoint/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "location": { 14 | "type": "geo_point" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geopointshape/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "location": { 14 | "type": "geo_shape" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geoshape/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | def to_json(f): 7 | for line in f: 8 | try: 9 | d = {} 10 | d["shape"] = line.strip() 11 | print(json.dumps(d)) 12 | except KeyboardInterrupt: 13 | break 14 | except Exception as e: 15 | print("Skipping malformed entry '%s' because of %s" 
%(line, str(e)), file=sys.stderr) 16 | 17 | if sys.argv[1] == "json": 18 | for file_name in sys.argv[2:]: 19 | with open(file_name) as f: 20 | to_json(f) 21 | else: 22 | raise Exception("Expected 'json' but got %s" %sys.argv[1]) 23 | -------------------------------------------------------------------------------- /percolator/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.queries.cache.enabled": false, 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "dynamic": "strict", 13 | "properties": { 14 | "body": { 15 | "type": "text", 16 | "analyzer": "english" 17 | }, 18 | "query": { 19 | "type": "percolator" 20 | } 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /geopointshape/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | def to_json(f): 7 | for line in f: 8 | try: 9 | point = json.loads(line)["location"] 10 | d = {} 11 | d["location"] = "POINT (" + str(point[0]) + " " + str(point[1]) + ")" 12 | print(json.dumps(d)) 13 | except KeyboardInterrupt: 14 | break 15 | except Exception as e: 16 | print("Skipping malformed entry '%s' because of %s" %(line, str(e)), file=sys.stderr) 17 | 18 | if sys.argv[1] == "json": 19 | for file_name in sys.argv[2:]: 20 | with open(file_name) as f: 21 | to_json(f) 22 | else: 23 | raise Exception("Expected 'json' but got %s" %sys.argv[1]) 24 | -------------------------------------------------------------------------------- /metricbeat/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | { 3 | "version": 2, 4 | "description": "Metricbeat data", 5 | "indices": [ 6 | { 7 | "name": "metricbeat", 8 | "body": "index.json" 9 | } 10 | ], 11 | "corpora": [ 12 | { 13 | "name": "metricbeat", 14 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/metricbeat", 15 | "documents": [ 16 | { 17 | "source-file": "documents.json.bz2", 18 | "document-count": 1079600, 19 | "compressed-bytes":91887122, 20 | "uncompressed-bytes":1249705758 21 | } 22 | ] 23 | } 24 | ], 25 | "operations": [ 26 | {{ rally.collect(parts="operations/*.json") }} 27 | ], 28 | "challenges": [ 29 | {{ rally.collect(parts="challenges/*.json") }} 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /pmc/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Full text benchmark with academic papers from PMC", 6 | "indices": [ 7 | { 8 | "name": "pmc", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "pmc", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/pmc", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 574199, 20 | "compressed-bytes": 5928712141, 21 | "uncompressed-bytes": 23256051757 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | 
{{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /nested/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "StackOverflow Q&A stored as nested docs", 6 | "indices": [ 7 | { 8 | "name": "sonested", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "nested", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/nested", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 11203029, 20 | "compressed-bytes": 695293381, 21 | "uncompressed-bytes": 3637747670 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /noaa/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Global daily weather measurements from NOAA", 6 | "indices": [ 7 | { 8 | "name": "weather-data-2016", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "noaa", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/noaa", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 33659481, 20 | "compressed-bytes": 993302204, 21 | "uncompressed-bytes": 9684262698 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /geopoint/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Point coordinates from PlanetOSM", 6 | "indices": [ 7 | { 8 | "name": "osmgeopoints", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geopoint", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geopoint", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 60844404, 20 | "compressed-bytes": 505295401, 21 | "uncompressed-bytes": 2448564579 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | 34 | -------------------------------------------------------------------------------- /so/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Indexing benchmark using up to questions and answers from StackOverflow", 6 | "indices": [ 7 | { 8 | "name": "so", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "so", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/so", 16 | "documents": [ 17 | { 18 | "source-file": "posts.json.bz2", 19 | "document-count": 36062278, 20 | "compressed-bytes": 
9599137228, 21 | "uncompressed-bytes": 35564808298 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /percolator/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Percolator benchmark based on AOL queries", 6 | "indices": [ 7 | { 8 | "name": "queries", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "percolator", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/percolator", 16 | "documents": [ 17 | { 18 | "source-file": "queries-2.json.bz2", 19 | "document-count": 2000000, 20 | "compressed-bytes": 105192, 21 | "uncompressed-bytes": 110039748 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | 34 | -------------------------------------------------------------------------------- /geopointshape/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Point coordinates from PlanetOSM indexed as geoshapes", 6 | "indices": [ 7 | { 8 | "name": "osmgeoshapes", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geopointshape", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geopointshape", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 60844404, 20 | "compressed-bytes": 493367095, 21 | "uncompressed-bytes": 2780550484 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /geonames/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | { 3 | "version": 2, 4 | "description": "POIs from Geonames", 5 | "data-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geonames", 6 | "indices": [ 7 | { 8 | "name": "geonames", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geonames", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geonames", 16 | "documents": [ 17 | { 18 | "source-file": "documents-2.json.bz2", 19 | "document-count": 11396505, 20 | "compressed-bytes": 264698741, 21 | "uncompressed-bytes": 3547614383 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /nyc_taxis/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Taxi rides in New York in 2015", 6 | "indices": [ 7 | { 8 | "name": "nyc_taxis", 9 | "body": "index.json" 
10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "nyc_taxis", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/nyc_taxis", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "#COMMENT": "ML benchmarks rely on the fact that the document count stays constant.", 20 | "document-count": 165346692, 21 | "compressed-bytes": 4812721501, 22 | "uncompressed-bytes": 79802445255 23 | } 24 | ] 25 | } 26 | ], 27 | "operations": [ 28 | {{ rally.collect(parts="operations/*.json") }} 29 | ], 30 | "challenges": [ 31 | {{ rally.collect(parts="challenges/*.json") }} 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /eventdata/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "This benchmark indexes HTTP access logs generated based on sample logs from the elastic.co website using the generator available in https://github.com/elastic/rally-eventdata-track", 6 | "indices": [ 7 | { 8 | "name": "eventdata", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "eventdata", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/eventdata", 16 | "documents": [ 17 | { 18 | "source-file": "eventdata.json.bz2", 19 | "document-count": 20000000, 20 | "compressed-bytes": 791796014, 21 | "uncompressed-bytes": 16437108429 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /so/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "user": { 14 | "type": "keyword" 15 | }, 16 | "creationDate": { 17 | "type": "date" 18 | }, 19 | "title": { 20 | "type": "text" 21 | }, 22 | "questionId": { 23 | "type": "keyword" 24 | }, 25 | "answerId": { 26 | "type": "keyword" 27 | }, 28 | "acceptedAnswerId": { 29 | "type": "keyword" 30 | }, 31 | "tags": { 32 | "type": "keyword" 33 | }, 34 | "body": { 35 | "type": "text" 36 | }, 37 | "type": { 38 | "type": "keyword" 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pmc/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "_source": { 9 | "enabled": {{ source_enabled | default(true) | tojson }} 10 | }, 11 | "dynamic": "strict", 12 | "properties": { 13 | "name": { 14 | "type": "keyword" 15 | }, 16 | "journal": { 17 | "type": "text" 18 | }, 19 | "date": { 20 | "type": "text" 21 | }, 22 | "volume": { 23 | "type": "text" 24 | }, 25 | "issue": { 26 | "type": "text" 27 | }, 28 | "accession": { 29 | "type": "keyword" 30 | }, 31 | "timestamp": { 32 | "type": "date",
33 | "format": "yyyy-MM-dd HH:mm:ss" 34 | }, 35 | "pmid": { 36 | "type": "integer" 37 | }, 38 | "body": { 39 | "type": "text" 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /nested/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.store.type": "{{store_type | default('fs')}}", 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "dynamic": "strict", 10 | "_source": { 11 | "enabled": {{ source_enabled | default(true) | tojson }} 12 | }, 13 | "properties": { 14 | "user": { 15 | "type": "keyword" 16 | }, 17 | "creationDate": { 18 | "type": "date" 19 | }, 20 | "title": { 21 | "type": "text" 22 | }, 23 | "qid": { 24 | "type": "keyword" 25 | }, 26 | "tag": { 27 | "type": "keyword" 28 | }, 29 | "answer_count": { 30 | "type": "integer" 31 | }, 32 | "answers": { 33 | "type": "nested", 34 | "properties": { 35 | "user": { 36 | "type": "keyword" 37 | }, 38 | "date": { 39 | "type": "date" 40 | } 41 | } 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /nested/queries.csv: -------------------------------------------------------------------------------- 1 | java,2012-04-08T21:15:33.873Z 2 | c#,2012-01-02T13:27:55.631Z 3 | javascript,2011-09-29T09:31:37.345Z 4 | php,2012-04-10T03:16:00.727Z 5 | android,2012-04-08T15:02:52.091Z 6 | jquery,2012-04-03T08:39:17.337Z 7 | python,2012-04-09T04:44:32.264Z 8 | html,2012-04-09T05:11:21.702Z 9 | c++,2012-06-01T22:18:55.219Z 10 | ios,2012-04-09T02:15:44.330Z 11 | mysql,2011-07-16T14:27:53.863Z 12 | css,2012-04-09T17:48:32.247Z 13 | sql,2010-09-27T15:38:50.503Z 14 | asp.net,2012-04-09T06:06:10.029Z 15 | objective-c,2012-04-08T22:03:57.592Z 16 | ruby-on-rails,2012-04-08T19:35:07.412Z 17 | .net,2012-04-08T16:37:16.992Z 18 | iphone,2012-04-10T12:45:17.230Z 19 | c,2012-04-10T01:22:20.110Z 20 | arrays,2012-04-09T15:13:45.728Z 21 | sql-server,2012-04-10T07:36:56.023Z 22 | angularjs,2012-04-10T12:26:25.743Z 23 | ruby,2012-04-08T02:18:56.402Z 24 | json,2012-04-08T07:11:47.490Z 25 | ajax,2012-04-08T08:05:14.004Z 26 | regex,2012-04-08T09:13:29.263Z 27 | xml,2012-04-07T21:37:43.372Z 28 | asp.net-mvc,2012-04-09T21:23:43.302Z 29 | r,2012-03-07T11:59:36.114Z 30 | linux,2012-04-09T12:51:50.530Z 31 | wpf,2012-04-08T11:18:54.110Z 32 | django,2012-04-13T02:18:51.407Z 33 | node.js,2012-04-07T04:50:14.554Z 34 | database,2012-04-10T15:53:29.825Z 35 | xcode,2011-10-09T16:32:45.480Z 36 | -------------------------------------------------------------------------------- /http_logs/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "@timestamp": { 14 | "format": "strict_date_optional_time||epoch_second", 15 | "type": "date" 16 | }, 17 | "clientip": { 18 | "type": "ip" 19 | }, 20 | "message": { 21 | "type": "keyword", 22 | "index": false, 23 | "doc_values": false 24 | }, 25 | "request": { 26 | "type": "text", 27 | "fields": { 28 | "raw": { 29 | 
"ignore_above": 256, 30 | "type": "keyword" 31 | } 32 | } 33 | }, 34 | "status": { 35 | "type": "integer" 36 | }, 37 | "size": { 38 | "type": "integer" 39 | }, 40 | "geoip" : { 41 | "properties" : { 42 | "country_name": { "type": "keyword" }, 43 | "city_name": { "type": "keyword" }, 44 | "location" : { "type" : "geo_point" } 45 | } 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /so/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "logs-*", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "operation": "force-merge", 33 | "clients": 1 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /eventdata/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "eventdata", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "operation": "force-merge", 33 | "clients": 1 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /geoshape/README.md: -------------------------------------------------------------------------------- 1 | ## Geoshape track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. 
4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "shape": "LINESTRING(-1.8212114 52.5538901, -1.8205573 52.554324)" 10 | } 11 | ``` 12 | 13 | ### Parameters 14 | 15 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 16 | 17 | * `linestring_bulk_size` (default: 100): The bulk request size for indexing linestrings. 18 | * `multilinestring_bulk_size` (default: 100): The bulk request size for indexing multilinestrings. 19 | * `polygon_bulk_size` (default: 100): The bulk request size for indexing polygons. 20 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 21 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 22 | * `number_of_replicas` (default: 0) 23 | * `number_of_shards` (default: 1) 24 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 25 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 26 | * `cluster_health` (default: "green"): The minimum required cluster health. 27 | 28 | ### License 29 | 30 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 31 | -------------------------------------------------------------------------------- /metricbeat/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(10000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "autohisto_agg", 9 | "operation-type": "search", 10 | "body": { 11 | "size": 0, 12 | "query": { 13 | "range": { 14 | "@timestamp": { 15 | "gte": "23/02/2019", 16 | "lte": "23/02/2019", 17 | "format": "dd/MM/yyyy" 18 | } 19 | } 20 | }, 21 | "aggs": { 22 | "occurrences_over_time": { 23 | "auto_date_histogram": { 24 | "field": "@timestamp", 25 | "buckets": 24 26 | } 27 | } 28 | } 29 | } 30 | }, 31 | { 32 | "name": "date_histogram_agg", 33 | "operation-type": "search", 34 | "body": { 35 | "size": 0, 36 | "query": { 37 | "range": { 38 | "@timestamp": { 39 | "gte": "23/02/2019", 40 | "lte": "23/02/2019", 41 | "format": "dd/MM/yyyy" 42 | } 43 | } 44 | }, 45 | "aggs": { 46 | "occurrences_over_time": { 47 | "date_histogram": { 48 | "field": "@timestamp", 49 | "calendar_interval": "hour" 50 | } 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /geopointshape/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "random", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "polygon", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "geo_shape": { 23 | "location": { 24 | "shape": { 25 | "type": "polygon", 26 | 
"coordinates" : [[ 27 | [-0.1, 49.0], 28 | [5.0, 48.0], 29 | [15.0, 49.0], 30 | [14.0, 60.0], 31 | [-0.1, 61.0], 32 | [-0.1, 49.0] 33 | ]] 34 | } 35 | } 36 | } 37 | } 38 | } 39 | }, 40 | { 41 | "name": "bbox", 42 | "operation-type": "search", 43 | "body": { 44 | "query": { 45 | "geo_shape": { 46 | "location": { 47 | "shape": { 48 | "type": "envelope", 49 | "coordinates" : [[-0.1, 61.0], [15.0, 48.0]] 50 | } 51 | } 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /eventdata/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "@timestamp": { "type": "date" }, 14 | "message": { "type": "text", "index": false }, 15 | "agent": { "type": "keyword", "ignore_above": 256 }, 16 | "bytes": { "type": "integer" }, 17 | "clientip": { "type": "ip" }, 18 | "httpversion": { "type": "keyword", "ignore_above": 256 }, 19 | "response": { "type": "short" }, 20 | "verb": { "type": "keyword", "ignore_above": 256 }, 21 | "tags": { "type": "keyword", "ignore_above": 256 }, 22 | "geoip" : { 23 | "properties" : { 24 | "country_name" : { "type": "keyword" }, 25 | "location" : { "type": "geo_point" } 26 | } 27 | }, 28 | "useragent": { 29 | "properties": { 30 | "name": { "type": "keyword", "ignore_above": 256 }, 31 | "os": { "type": "keyword", "ignore_above": 256 }, 32 | "os_name": { "type": "keyword", "ignore_above": 256 } 33 | } 34 | }, 35 | "request": { 36 | "norms": false, 37 | "type": "text", 38 | "fields": { 39 | "keyword": { "ignore_above": 256, "type": "keyword" } 40 | } 41 | }, 42 | "referrer": { 43 | "norms": false, 44 | "type": "text", 45 | "fields": { 46 | "keyword": { "ignore_above": 256, "type": "keyword" } 47 | } 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /geopointshape/README.md: -------------------------------------------------------------------------------- 1 | ## Geopoint track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. It contains the same data as the geopoint track but indexes all points as geoshapes. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "location": "POINT (-0.1485188 51.5250666)" 10 | } 11 | ``` 12 | 13 | ### Parameters 14 | 15 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 16 | 17 | * `bulk_size` (default: 5000) 18 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 19 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 20 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. 21 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 22 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. 
See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 23 | * `number_of_replicas` (default: 0) 24 | * `number_of_shards` (default: 5) 25 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 26 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 27 | * `cluster_health` (default: "green"): The minimum required cluster health. 28 | 29 | ### License 30 | 31 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 32 | -------------------------------------------------------------------------------- /geoshape/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append-linestrings", 3 | "operation-type": "bulk", 4 | "bulk-size": {{linestring_bulk_size | default(100)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}}, 6 | "corpora": "linestrings" 7 | }, 8 | { 9 | "name": "index-append-multilinestrings", 10 | "operation-type": "bulk", 11 | "bulk-size": {{multilinestring_bulk_size | default(100)}}, 12 | "ingest-percentage": {{ingest_percentage | default(100)}}, 13 | "corpora": "multilinestrings" 14 | }, 15 | { 16 | "name": "index-append-polygons", 17 | "operation-type": "bulk", 18 | "bulk-size": {{polygon_bulk_size | default(100)}}, 19 | "ingest-percentage": {{ingest_percentage | default(100)}}, 20 | "corpora": "polygons" 21 | }, 22 | { 23 | "name": "polygon", 24 | "operation-type": "search", 25 | "index": "osm*", 26 | "body": { 27 | "query": { 28 | "geo_shape": { 29 | "shape": { 30 | "shape": { 31 | "type": "polygon", 32 | "coordinates" : [[ 33 | [-0.1, 49.0], 34 | [5.0, 48.0], 35 | [15.0, 49.0], 36 | [14.0, 60.0], 37 | [-0.1, 61.0], 38 | [-0.1, 49.0] 39 | ]] 40 | } 41 | } 42 | } 43 | } 44 | } 45 | }, 46 | { 47 | "name": "bbox", 48 | "operation-type": "search", 49 | "index": "osm*", 50 | "body": { 51 | "query": { 52 | "geo_shape": { 53 | "shape": { 54 | "shape": { 55 | "type": "envelope", 56 | "coordinates" : [[-0.1, 61.0], [15.0, 48.0]] 57 | } 58 | } 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## https://github.com/github/gitignore/blob/master/Global/OSX.gitignore 2 | 3 | .DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | 22 | # Directories potentially created on remote AFP share 23 | .AppleDB 24 | .AppleDesktop 25 | Network Trash Folder 26 | Temporary Items 27 | .apdisk 28 | 29 | ## kinda based on https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore 30 | 31 | *.iml 32 | 33 | ## Directory-based project format: 34 | .idea/ 35 | 36 | ## https://github.com/github/gitignore/blob/master/Python.gitignore 37 | 38 | # Byte-compiled / optimized / DLL files 39 | __pycache__/ 40 | *.py[cod] 41 | *$py.class 42 | 43 | # C extensions 44 | *.so 45 | 46 | # Distribution / packaging 47 | .Python 48 | env/ 49 | build/ 50 | develop-eggs/ 51 | dist/ 52 
| downloads/ 53 | eggs/ 54 | .eggs/ 55 | lib/ 56 | lib64/ 57 | parts/ 58 | sdist/ 59 | var/ 60 | *.egg-info/ 61 | .installed.cfg 62 | *.egg 63 | 64 | # PyInstaller 65 | # Usually these files are written by a python script from a template 66 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 67 | *.manifest 68 | *.spec 69 | 70 | # Installer logs 71 | pip-log.txt 72 | pip-delete-this-directory.txt 73 | 74 | # Unit test / coverage reports 75 | htmlcov/ 76 | .tox/ 77 | .coverage 78 | .coverage.* 79 | .cache 80 | nosetests.xml 81 | coverage.xml 82 | *,cover 83 | .hypothesis/ 84 | junit-*.xml 85 | 86 | # Translations 87 | *.mo 88 | *.pot 89 | 90 | # Django stuff: 91 | *.log 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | #Pickles 100 | *.pk 101 | 102 | # pyenv 103 | .python-version -------------------------------------------------------------------------------- /nested/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "randomized-nested-queries", 9 | "operation-type": "search", 10 | "param-source": "nested-query-source" 11 | }, 12 | { 13 | "name": "randomized-nested-queries-with-inner-hits_default", 14 | "operation-type": "search", 15 | "param-source": "nested-query-source-with-inner-hits", 16 | "size" : 10, 17 | "inner_hits_size" : 3 18 | }, 19 | { 20 | "name": "randomized-nested-queries-with-inner-hits_default_big_size", 21 | "operation-type": "search", 22 | "param-source": "nested-query-source-with-inner-hits", 23 | "size" : 100, 24 | "inner_hits_size" : 100 25 | }, 26 | { 27 | "name": "randomized-term-queries", 28 | "operation-type": "search", 29 | "param-source": "term-query-source" 30 | }, 31 | { 32 | "name": "randomized-sorted-term-queries", 33 | "operation-type": "search", 34 | "param-source": "sorted-term-query-source" 35 | }, 36 | { 37 | "name": "match-all", 38 | "operation-type": "search", 39 | "body": { 40 | "query": { 41 | "match_all": {} 42 | } 43 | } 44 | }, 45 | { 46 | "name": "nested-date-histo", 47 | "operation-type": "search", 48 | "body": { 49 | "size": 0, 50 | "aggs": { 51 | "answers": { 52 | "nested": { 53 | "path": "answers" 54 | }, 55 | "aggs": { 56 | "date_histo": { 57 | "date_histogram": { 58 | "field": "answers.date", 59 | "calendar_interval": "month" 60 | } 61 | } 62 | } 63 | } 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /geoshape/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Shapes from PlanetOSM", 6 | "indices": [ 7 | { 8 | "name": "osmlinestrings", 9 | "body": "index.json" 10 | }, 11 | { 12 | "name": "osmmultilinestrings", 13 | "body": "index.json" 14 | }, 15 | { 16 | "name": "osmpolygons", 17 | "body": "index.json" 18 | } 19 | ], 20 | "corpora": [ 21 | { 22 | "name": "linestrings", 23 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 24 | "target-index": "osmlinestrings", 25 | "documents": [ 26 | { 27 | "source-file": "linestrings.json.bz2", 28 | "document-count": 20532036, 29 | "compressed-bytes": 3697293598, 30 | "uncompressed-bytes": 12592499821 31 | } 32 | ] 33 | }, 34 | { 35 | "name": 
"multilinestrings", 36 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 37 | "target-index": "osmmultilinestrings", 38 | "documents": [ 39 | { 40 | "source-file": "multilinestrings.json.bz2", 41 | "document-count": 532036, 42 | "compressed-bytes": 1816588880, 43 | "uncompressed-bytes": 5992834062 44 | } 45 | ] 46 | }, 47 | { 48 | "name": "polygons", 49 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 50 | "target-index": "osmpolygons", 51 | "documents": [ 52 | { 53 | "source-file": "polygons.json.bz2", 54 | "document-count": 39459211, 55 | "compressed-bytes": 8835370788, 56 | "uncompressed-bytes": 30178820325 57 | } 58 | ] 59 | } 60 | ], 61 | "operations": [ 62 | {{ rally.collect(parts="operations/*.json") }} 63 | ], 64 | "challenges": [ 65 | {{ rally.collect(parts="challenges/*.json") }} 66 | ] 67 | } 68 | -------------------------------------------------------------------------------- /metricbeat/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "metricbeat", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 0, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "autohisto_agg", 47 | "clients": 1, 48 | "warmup-iterations": 50, 49 | "iterations": 100, 50 | "target-throughput": 2 51 | }, 52 | { 53 | "operation": "date_histogram_agg", 54 | "clients": 1, 55 | "warmup-iterations": 50, 56 | "iterations": 100, 57 | "target-throughput": 2 58 | } 59 | ] 60 | } 61 | 62 | -------------------------------------------------------------------------------- /geopoint/README.md: -------------------------------------------------------------------------------- 1 | ## Geopoint track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "location": [ 10 | -0.1485188, 11 | 51.5250666 12 | ] 13 | } 14 | ``` 15 | 16 | ### Parameters 17 | 18 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 19 | 20 | * `bulk_size` (default: 5000) 21 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 22 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 
23 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 24 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 25 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 26 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 27 | * `number_of_replicas` (default: 0) 28 | * `number_of_shards` (default: 5) 29 | * `max_num_segments`: The maximum number of segments to force-merge to. 30 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 31 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 32 | * `cluster_health` (default: "green"): The minimum required cluster health. 33 | 34 | ### License 35 | 36 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 37 | -------------------------------------------------------------------------------- /pmc/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(500)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(500)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "term", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "term": { 32 | "body": "physician" 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "phrase", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "match_phrase": { 43 | "body": "newspaper coverage" 44 | } 45 | } 46 | } 47 | }, 48 | { 49 | "name": "articles_monthly_agg_uncached", 50 | "operation-type": "search", 51 | "body": { 52 | "size": 0, 53 | "aggs": { 54 | "articles_over_time": { 55 | "date_histogram": { 56 | "field": "timestamp", 57 | "calendar_interval": "month" 58 | } 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "articles_monthly_agg_cached", 65 | "operation-type": "search", 66 | "cache": true, 67 | "body": { 68 | "size": 0, 69 | "aggs": { 70 | "articles_over_time": { 71 | "date_histogram": { 72 | "field": "timestamp", 73 | "calendar_interval": "month" 74 | } 75 | } 76 | } 77 | } 78 | }, 79 | { 80 | 
"name": "scroll", 81 | "operation-type": "search", 82 | "pages": 25, 83 | "results-per-page": 100, 84 | "body": { 85 | "query": { 86 | "match_all": {} 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /geopoint/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "polygon", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "geo_polygon": { 23 | "location": { 24 | "points": [ 25 | [-0.1, 49.0], 26 | [5.0, 48.0], 27 | [15.0, 49.0], 28 | [14.0, 60.0], 29 | [-0.1, 61.0], 30 | [-0.1, 49.0] 31 | ] 32 | } 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "bbox", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "geo_bounding_box": { 43 | "location": { 44 | "top_left": [-0.1, 61.0], 45 | "bottom_right": [15.0, 48.0] 46 | } 47 | } 48 | } 49 | } 50 | }, 51 | { 52 | "name": "distance", 53 | "operation-type": "search", 54 | "body": { 55 | "query": { 56 | "geo_distance": { 57 | "distance": "200km", 58 | "location": [7.0, 55.0] 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "distanceRange", 65 | "operation-type": "search", 66 | "body": { 67 | "query": { 68 | "match_all": {} 69 | }, 70 | "aggs": { 71 | "geo_distance_range_agg": { 72 | "geo_distance": { 73 | "field": "location", 74 | "origin": "55.0, 7.0", 75 | "unit": "km", 76 | "ranges": [ 77 | { 78 | "from": 200, 79 | "to": 400 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | } 86 | } -------------------------------------------------------------------------------- /nyc_taxis/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "_source": { 9 | "enabled": {{ source_enabled | default(true) | tojson }} 10 | }, 11 | "properties": { 12 | "surcharge": { 13 | "scaling_factor": 100, 14 | "type": "scaled_float" 15 | }, 16 | "dropoff_datetime": { 17 | "type": "date", 18 | "format": "yyyy-MM-dd HH:mm:ss" 19 | }, 20 | "trip_type": { 21 | "type": "keyword" 22 | }, 23 | "mta_tax": { 24 | "scaling_factor": 100, 25 | "type": "scaled_float" 26 | }, 27 | "rate_code_id": { 28 | "type": "keyword" 29 | }, 30 | "passenger_count": { 31 | "type": "integer" 32 | }, 33 | "pickup_datetime": { 34 | "type": "date", 35 | "format": "yyyy-MM-dd HH:mm:ss" 36 | }, 37 | "tolls_amount": { 38 | "scaling_factor": 100, 39 | "type": "scaled_float" 40 | }, 41 | "tip_amount": { 42 | "scaling_factor": 100, 43 | "type": "scaled_float" 44 | }, 45 | "payment_type": { 46 | "type": "keyword" 47 | }, 48 | "extra": { 49 | "scaling_factor": 100, 50 | "type": "scaled_float" 51 | }, 52 | "vendor_id": { 53 | "type": "keyword" 54 | }, 55 | "store_and_fwd_flag": { 56 | "type": "keyword" 57 | }, 58 | 
"improvement_surcharge": { 59 | "scaling_factor": 100, 60 | "type": "scaled_float" 61 | }, 62 | "fare_amount": { 63 | "scaling_factor": 100, 64 | "type": "scaled_float" 65 | }, 66 | "ehail_fee": { 67 | "scaling_factor": 100, 68 | "type": "scaled_float" 69 | }, 70 | "cab_color": { 71 | "type": "keyword" 72 | }, 73 | "dropoff_location": { 74 | "type": "geo_point" 75 | }, 76 | "vendor_name": { 77 | "type": "text" 78 | }, 79 | "total_amount": { 80 | "scaling_factor": 100, 81 | "type": "scaled_float" 82 | }, 83 | "trip_distance": { 84 | "scaling_factor": 100, 85 | "type": "scaled_float" 86 | }, 87 | "pickup_location": { 88 | "type": "geo_point" 89 | } 90 | }, 91 | "dynamic": "strict" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # fail this script immediately if any command fails with a non-zero exit code 4 | set -e 5 | # Treat unset env variables as an error 6 | set -u 7 | # fail on pipeline errors, e.g. when grepping 8 | set -o pipefail 9 | 10 | readonly ROOT=".rally/benchmarks" 11 | readonly URL="http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora" 12 | 13 | 14 | # see http://stackoverflow.com/a/246128 15 | SOURCE="${BASH_SOURCE[0]}" 16 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 17 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 18 | SOURCE="$(readlink "$SOURCE")" 19 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 20 | done 21 | readonly CURR_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 22 | 23 | # test number of parameters 24 | if [ $# != 1 ] 25 | then 26 | echo "Usage: $0 TRACK_NAME" 27 | exit 1 28 | fi 29 | 30 | readonly TRACK=$1 31 | 32 | TARGETS=( ) 33 | 34 | # clone track descriptions 35 | readonly REPO_TARGET="${ROOT}/tracks/default" 36 | # add to final tar 37 | TARGETS[${#TARGETS[*]}]="${REPO_TARGET}" 38 | 39 | if [ ! -d "${HOME}/${REPO_TARGET}" ] 40 | then 41 | git clone https://github.com/elastic/rally-tracks.git "${HOME}/${REPO_TARGET}" 42 | fi 43 | 44 | # check if the track actually exists 45 | if [ ! -d "${HOME}/${REPO_TARGET}/${TRACK}" ] 46 | then 47 | echo "Track ${TRACK} does not exist in ${HOME}/${REPO_TARGET}." 48 | exit 1 49 | fi 50 | 51 | # download data (unless it exists locally) 52 | readonly FILES=$(cat ${HOME}/${REPO_TARGET}/${TRACK}/files.txt) 53 | for f in ${FILES}; do 54 | TARGET_ROOT="${ROOT}/data/${TRACK}" 55 | TARGET_PATH="${TARGET_ROOT}/${f}" 56 | mkdir -p "${HOME}/${TARGET_ROOT}" 57 | TARGETS[${#TARGETS[*]}]="${TARGET_PATH}" 58 | if [ ! -f "${HOME}/${TARGET_PATH}" ] 59 | then 60 | curl -o "${HOME}/${TARGET_PATH}" "${URL}/${TRACK}/${f}" 61 | fi 62 | done 63 | 64 | readonly ARCHIVE="rally-track-data-${TRACK}.tar" 65 | # ensure everything is relative to the home directory 66 | # exclude the archive itself to prevent spurious warnings. 67 | tar -C ${HOME} --exclude="${ARCHIVE}" -cf "${ARCHIVE}" ${TARGETS[@]} 68 | 69 | echo "Created data for ${TRACK} in ${ARCHIVE}. Next steps:" 70 | echo "" 71 | echo "1. Copy it to the user home directory on the target machine(s)." 72 | echo "2. Extract with tar -xf ${ARCHIVE} (will be extracted to ~/${ROOT})." 
-------------------------------------------------------------------------------- /noaa/README.md: -------------------------------------------------------------------------------- 1 | ## NOAA track 2 | 3 | This track is based on a [daily weather measurement from NOAA](ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/). 4 | 5 | To recreate the document corpus: 6 | 7 | 1. Download the following files: 8 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2014.csv.gz 9 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2015.csv.gz 10 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2016.csv.gz 11 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt 12 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt 13 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt 14 | 2. Decompress measurement files. For example: `gunzip 2016.csv.gz` 15 | 3. Sort the files by station. For example: `sort --field-separator=',' --key=1,2 -o 2016-sorted.csv 2016.csv` 16 | 4. Execute a script like `_tools/process.py` to create json documents. 17 | 5. Make sure that the JSON documents are randomly ordered. (The script orders measurements of the same station next to each other). This can be achieved with `shuf documents.json > documents1.json`. 18 | 6. Compress the documents json file: `bzip2 -9 -c documents1.json > documents.json.bz2` 19 | 20 | ### Example Document 21 | 22 | ```json 23 | { 24 | "date": "2016-01-01T00:00:00", 25 | "TAVG": 22.9, 26 | "station": { 27 | "elevation": 34.0, 28 | "name": "SHARJAH INTER. AIRP", 29 | "country": "United", 30 | "gsn_flag": "GSN", 31 | "location": { 32 | "lat": 25.333, 33 | "lon": 55.517 34 | }, 35 | "country_code": "AE", 36 | "wmo_id": "41196", 37 | "id": "AE000041196" 38 | }, 39 | "TMIN": 15.5 40 | } 41 | ``` 42 | 43 | ### Parameters 44 | 45 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 46 | 47 | * `bulk_size` (default: 5000) 48 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 49 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 50 | * `number_of_replicas` (default: 0) 51 | * `number_of_shards` (default: 1) 52 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 53 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 54 | * `cluster_health` (default: "green"): The minimum required cluster health. 55 | 56 | ### License 57 | 58 | [US Government Work data license](https://www.usa.gov/government-works) 59 | -------------------------------------------------------------------------------- /nested/README.md: -------------------------------------------------------------------------------- 1 | ## Nested track 2 | 3 | This track is based on a [dump of StackOverflow posts](https://ia800500.us.archive.org/22/items/stackexchange/stackoverflow.com-Posts.7z) retrieved as of June 10, 2016. 4 | 5 | Each question and related answers have been assembled into a single JSON doc containing: 6 | 7 | * qid: a unique ID for a question 8 | * title: a free-text field with the question title 9 | * creationDate: The date the questions was asked 10 | * user: The user's screen name and unique ID combined into a single string 11 | * tag: An array of tags describing the technologies. 
12 | * answers: An array of objects, one per answer, with the following fields: 13 | * date: Date of answer 14 | * user: Answerer's screen name and unique ID combined into a single string 15 | 16 | 17 | Data preparation process: 18 | 19 | * Question and answer entries in the original posts.XML were converted to slimmed-down rows in a CSV and enriched with user names from users.xml 20 | * CSV was sorted by first two columns (questionID and answerID) 21 | * The CSV was converted to the JSON file presented here, combining questions and answers into a single JSON doc. 22 | 23 | These scripts are available in the raw_data_prep_scripts.zip file. 24 | 25 | ### Example Document 26 | 27 | ```json 28 | { 29 | "title": "Are these LAMP permissions secure?", 30 | "qid": "10000023", 31 | "answers": [ 32 | { 33 | "date": "2012-04-04T12:56:34.433", 34 | "user": "larsks (147356)" 35 | } 36 | ], 37 | "tag": [ 38 | "linux", 39 | "apache", 40 | "security", 41 | "ubuntu", 42 | "permissions" 43 | ], 44 | "user": "Trent Scott (600873)", 45 | "creationDate": "2012-04-03T19:26:57.033" 46 | } 47 | ``` 48 | 49 | ### Parameters 50 | 51 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 52 | 53 | * `bulk_size` (default: 5000) 54 | * `bulk_indexing_clients` (default: 4): Number of clients that issue bulk indexing requests. 55 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 56 | * `number_of_replicas` (default: 0) 57 | * `number_of_shards` (default: 1) 58 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 59 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 60 | * `cluster_health` (default: "green"): The minimum required cluster health. 61 | 62 | ### License 63 | 64 | We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/) 65 | 66 | -------------------------------------------------------------------------------- /so/README.md: -------------------------------------------------------------------------------- 1 | ## StackOverflow track 2 | 3 | This dataset is derived from a dump of StackOverflow posts downloaded on June 10th 2016 from 4 | https://ia800500.us.archive.org/22/items/stackexchange/stackoverflow.com-Posts.7z 5 | 6 | Each question and answer has been formatted into a JSON document with the following fields: 7 | 8 | questionId: a unique ID for a question 9 | answerId: a unique ID for an answer 10 | acceptedAnswerId: the unique ID of the answer accepted for the question 11 | title: a free-text field with the question title 12 | creationDate: The date the question was asked 13 | user: The user's unique ID 14 | tags: An array of tags describing the technologies. 15 | body: Field containing the text of the question or answer. 16 | type: Type of post. Either 'question' or 'answer' 17 | 18 | Fields that do not have values have been left out. The body has had text extracted and been 19 | formatted to fit into JSON documents. 20 | 21 | Data preparation process: 22 | * Question and answer entries in the original posts.XML were converted to slimmed-down JSON 23 | documents. 24 | * No enrichment was performed. 25 | These scripts are available in the raw_data_prep_script.zip file.
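Purely as an illustration of the shape of this conversion (the scripts in raw_data_prep_script.zip remain the authoritative reference), a minimal Python sketch could look like the one below. The Stack Exchange dump attribute names used here (`PostTypeId`, `ParentId`, `OwnerUserId`, `AcceptedAnswerId`, ...) are assumptions about the raw dump format and are not taken from this repository:

```python
import json
import re
import sys
import xml.etree.ElementTree as ET


def to_docs(path):
    # Stream Posts.xml and emit one slimmed-down JSON document per post,
    # keeping only the fields listed above; empty fields are left out.
    for _, elem in ET.iterparse(path, events=("end",)):
        if elem.tag != "row":
            continue
        a = elem.attrib
        if a.get("PostTypeId") not in ("1", "2"):
            continue  # skip non question/answer post types
        is_question = a.get("PostTypeId") == "1"
        doc = {
            "type": "question" if is_question else "answer",
            "questionId": a.get("Id") if is_question else a.get("ParentId"),
            "answerId": None if is_question else a.get("Id"),
            "acceptedAnswerId": a.get("AcceptedAnswerId"),
            "title": a.get("Title"),
            "creationDate": a.get("CreationDate"),
            "user": a.get("OwnerUserId"),
            # tags arrive as a single string like "<c#><linq>"; turn them into an array
            "tags": re.findall(r"<([^>]+)>", a.get("Tags", "")) or None,
            # the real corpus additionally strips markup from the body text
            "body": a.get("Body"),
        }
        print(json.dumps({k: v for k, v in doc.items() if v}))
        elem.clear()  # keep memory bounded while streaming the full dump


if __name__ == "__main__":
    to_docs(sys.argv[1])
```

It would be run against the extracted dump (for example `python3 convert_posts.py Posts.xml > posts.json`, with `convert_posts.py` being a hypothetical file name) and the result compressed with bzip2 like the other corpora.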
26 | 27 | ### Example Document 28 | 29 | ```json 30 | { 31 | "user": "45", 32 | "tags": ["c#", "linq", ".net-3.5"], 33 | "questionId": "59", 34 | "creationDate": "2008-08-01T13:14:33.797", 35 | "title": "How do I get a distinct, ordered list of names from a DataTable using LINQ?", 36 | "acceptedAnswerId": "43110", 37 | "type": "question", 38 | "body": "Let's say I have a DataTable with a Name column. I want to have a collection of the unique names ordered alphabetically. The following query ignores the order by clause. var names = (from DataRow dr in dataTable.Rows orderby (string)dr[\"Name\"] select (string)dr[\"Name\"]).Distinct(); Why does the orderby not get enforced? " 39 | } 40 | ``` 41 | 42 | ### Parameters 43 | 44 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 45 | 46 | * `bulk_size` (default: 5000) 47 | * `bulk_indexing_clients` (default: 4): Number of clients that issue bulk indexing requests. 48 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 49 | * `number_of_replicas` (default: 0) 50 | * `number_of_shards` (default: 5) 51 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 52 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 53 | * `cluster_health` (default: "green"): The minimum required cluster health. 54 | 55 | ### License 56 | 57 | We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/) 58 | -------------------------------------------------------------------------------- /eventdata/README.md: -------------------------------------------------------------------------------- 1 | ## EventData track 2 | 3 | This track is based on 20 million Apache access log entries generated based on statistics from sample 4 | elastic.co access logs using the generator available here: https://github.com/elastic/rally-eventdata-track 5 | 6 | The size of the data file is around 15GB, which gives an average JSON record size of 822 bytes. Mappings have been optimized and some of the fields added through `geoip` and `user-agent` enrichment have been removed to achieve a more compact format. 7 | 8 | The purpose of this track is to provide an efficient way to benchmark indexing of this data type as the generator built into the rally-eventdata-track can be CPU intensive.
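If you want to double-check those figures against a local copy of the corpus, a small sketch along these lines (assuming the `eventdata.json.bz2` corpus file has already been downloaded into the working directory) streams the compressed file once and reports the document count and average record size:

```python
import bz2

# Stream the compressed corpus once and report how many documents it contains
# and the average size of a single JSON record, for comparison with the numbers above.
docs = 0
total_bytes = 0
with bz2.open("eventdata.json.bz2", "rb") as corpus:
    for line in corpus:
        docs += 1
        total_bytes += len(line)

print(f"{docs} documents, {total_bytes / docs:.0f} bytes per record on average")
```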
9 | 10 | ### Example Document 11 | 12 | ```json 13 | { 14 | "agent": "\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\"", 15 | "useragent": { 16 | "os": "Mac OS X 10.10.2", 17 | "os_name": "Mac OS X", 18 | "name": "Chrome" 19 | }, 20 | "geoip": { 21 | "country_name": "India", 22 | "location": [80.2833, 13.083300000000008] 23 | }, 24 | "clientip": "122.178.238.140", 25 | "referrer": "\"-\"", 26 | "request": "/apple-touch-icon-144x144.png", 27 | "bytes": 0, 28 | "verb": "GET", 29 | "response": 304, 30 | "httpversion": "1.1", 31 | "@timestamp": "2017-07-03T07:51:49.995Z", 32 | "message": "122.178.238.140 - - [2017-07-03T07:51:49.995Z] \"GET /apple-touch-icon-144x144.png HTTP/1.1\" 304 0 \"-\" \"-\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\"" 33 | } 34 | ``` 35 | 36 | ### Parameters 37 | 38 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 39 | 40 | * `bulk_size` (default: 5000) 41 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 42 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 43 | * `number_of_replicas` (default: 0) 44 | * `number_of_shards` (default: 5) 45 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 46 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 47 | * `cluster_health` (default: "green"): The minimum required cluster health. 48 | 49 | ### License 50 | 51 | This is licensed under the Apache License, version 2 ("ALv2"), quoted below. 52 | 53 | Copyright 2015-2018 Elasticsearch https://www.elastic.co 54 | 55 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 56 | 57 | http://www.apache.org/licenses/LICENSE-2.0 58 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
59 | -------------------------------------------------------------------------------- /nyc_taxis/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(10000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(10000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('update')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "range", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "range": { 32 | "total_amount": { 33 | "gte": 5, 34 | "lt": 15 35 | } 36 | } 37 | } 38 | } 39 | }, 40 | { 41 | "name": "distance_amount_agg", 42 | "operation-type": "search", 43 | "body": { 44 | "size": 0, 45 | "aggs": { 46 | "distance_histo": { 47 | "histogram": { 48 | "field": "distance", 49 | "interval": 1 50 | }, 51 | "aggs": { 52 | "total_amount_stats": { 53 | "stats": { 54 | "field": "total_amount" 55 | } 56 | } 57 | } 58 | } 59 | } 60 | } 61 | }, 62 | { 63 | "name": "autohisto_agg", 64 | "operation-type": "search", 65 | "body": { 66 | "size": 0, 67 | "query": { 68 | "range": { 69 | "dropoff_datetime": { 70 | "gte": "01/01/2015", 71 | "lte": "21/01/2015", 72 | "format": "dd/MM/yyyy" 73 | } 74 | } 75 | }, 76 | "aggs": { 77 | "dropoffs_over_time": { 78 | "auto_date_histogram": { 79 | "field": "dropoff_datetime", 80 | "buckets": 20 81 | } 82 | } 83 | } 84 | } 85 | }, 86 | { 87 | "name": "date_histogram_agg", 88 | "operation-type": "search", 89 | "body": { 90 | "size": 0, 91 | "query": { 92 | "range": { 93 | "dropoff_datetime": { 94 | "gte": "01/01/2015", 95 | "lte": "21/01/2015", 96 | "format": "dd/MM/yyyy" 97 | } 98 | } 99 | }, 100 | "aggs": { 101 | "dropoffs_over_time": { 102 | "date_histogram": { 103 | "field": "dropoff_datetime", 104 | "calendar_interval": "day" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /geonames/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.store.type": "{{store_type | default('fs')}}", 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "dynamic": "strict", 10 | "_source": { 11 | "enabled": {{ source_enabled | default(true) | tojson }} 12 | }, 13 | "properties": { 14 | "elevation": { 15 | "type": "integer" 16 | }, 17 | "name": { 18 | "type": "text", 19 | "fields": { 20 | "raw": { 21 | "type": "keyword" 22 | } 23 | } 24 | }, 25 | "geonameid": { 26 | "type": "long" 27 | }, 28 | "feature_class": { 29 | "type": "text", 30 | "fields": { 31 | "raw": { 32 | "type": "keyword" 33 | } 34 | } 35 | }, 36 | "location": { 37 | "type": "geo_point" 38 | }, 39 | "cc2": { 40 | "type": "text", 41 | "fields": { 42 | "raw": { 43 | "type": "keyword" 44 | } 45 | } 46 | }, 47 | "timezone": { 48 | "type": "text", 49 | "fields": { 50 | "raw": { 51 | "type": "keyword" 52 | } 53 | 
} 54 | }, 55 | "dem": { 56 | "type": "text", 57 | "fields": { 58 | "raw": { 59 | "type": "keyword" 60 | } 61 | } 62 | }, 63 | "country_code": { 64 | "type": "text", 65 | "fielddata": true, 66 | "fields": { 67 | "raw": { 68 | "type": "keyword" 69 | } 70 | } 71 | }, 72 | "admin1_code": { 73 | "type": "text", 74 | "fields": { 75 | "raw": { 76 | "type": "keyword" 77 | } 78 | } 79 | }, 80 | "admin2_code": { 81 | "type": "text", 82 | "fields": { 83 | "raw": { 84 | "type": "keyword" 85 | } 86 | } 87 | }, 88 | "admin3_code": { 89 | "type": "text", 90 | "fields": { 91 | "raw": { 92 | "type": "keyword" 93 | } 94 | } 95 | }, 96 | "admin4_code": { 97 | "type": "text", 98 | "fields": { 99 | "raw": { 100 | "type": "keyword" 101 | } 102 | } 103 | }, 104 | "feature_code": { 105 | "type": "text", 106 | "fields": { 107 | "raw": { 108 | "type": "keyword" 109 | } 110 | } 111 | }, 112 | "alternatenames": { 113 | "type": "text", 114 | "fields": { 115 | "raw": { 116 | "type": "keyword" 117 | } 118 | } 119 | }, 120 | "asciiname": { 121 | "type": "text", 122 | "fields": { 123 | "raw": { 124 | "type": "keyword" 125 | } 126 | } 127 | }, 128 | "population": { 129 | "type": "long" 130 | } 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /http_logs/_tools/unparse.rb: -------------------------------------------------------------------------------- 1 | require "json" 2 | require "time" 3 | 4 | ################ 5 | # 6 | # Reconstructs (un-parses) the existing http_logs corpora (data set). The introduction of ingest node pipelines 7 | # requires the data to be JSON, but un-parsed log lines. This script was used to create the `http_logs_unparsed`, which 8 | # is a mirror copy of "http_logs`, except it is un-parsed AND the timestamp is ISO8601 (not epoch_seconds) 9 | # 10 | # The output of this is is a file with lines of JSON that appear as follows: 11 | # 12 | # {"message" : "30.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /images/info.gif HTTP/1.0\" 200 1251"} 13 | # {"message" : "28.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/images/hm_official.gif HTTP/1.1\" 200 972"} 14 | # {"message" : "17.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/hosts/cfo/images/cfo/cfophot3.jpg HTTP/1.0\" 200 6695"} 15 | # 16 | # Usage: 17 | # 18 | # rm *.unparse.json 19 | # rm *.bz2 20 | # 21 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-181998.json.bz2 22 | # bunzip2 documents-181998.json.bz2 23 | # 24 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-191998.json.bz2 25 | # bunzip2 documents-191998.json.bz2 26 | # 27 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-201998.json.bz2 28 | # bunzip2 documents-201998.json.bz2 29 | # 30 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-211998.json.bz2 31 | # bunzip2 documents-211998.json.bz2 32 | # 33 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-221998.json.bz2 34 | # bunzip2 documents-221998.json.bz2 35 | # 36 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-231998.json.bz2 37 | # bunzip2 documents-231998.json.bz2 38 | # 39 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-241998.json.bz2 40 | # bunzip2 documents-241998.json.bz2 41 | # 42 | # ruby unparse.rb . 
43 | # 44 | # ############# 45 | 46 | def self.getValue(data,key) 47 | data[key].nil? ? "-" : data[key].to_s 48 | end 49 | 50 | threads = 4 51 | running = 0 52 | Dir.glob(File.join(ARGV[0], "*.json")).select do |file| 53 | File.open(file.gsub('json', 'unparsed.json'), 'w') do |json_file| 54 | while running >= threads 55 | sleep 1 56 | end 57 | running = running + 1 58 | Thread.new do 59 | i = 0; 60 | File.open(file).each do |line| 61 | begin 62 | i += 1; 63 | print "." if i % 10000 == 0 64 | data = JSON.parse(line) 65 | logline = getValue(data,'clientip') + " - - [" + Time.at(data['@timestamp'].to_i).iso8601 + "] \\\"" + getValue(data,'request') + "\\\" " + getValue(data,'status') + " " + getValue(data,'size') 66 | json_log_line = "{\"message\" : \"" + logline + "\"}\n" 67 | #TODO: validate this is proper JSON. ~15 rows (.02%) were post modified to remove an invalid '\' char in the resultant JSON 68 | json_file.write(json_log_line) 69 | rescue => e 70 | puts e 71 | end 72 | end 73 | running = running - 1 74 | end 75 | while running > 0 76 | sleep 1 77 | end 78 | end 79 | end -------------------------------------------------------------------------------- /pmc/README.md: -------------------------------------------------------------------------------- 1 | ## PMC track 2 | 3 | This track contains data retrieved from [PMC](https://www.ncbi.nlm.nih.gov/pmc/tools/ftp/). 4 | 5 | Note that we have filtered the data set so only a subset of the articles is included. 6 | 7 | ### Example Document 8 | 9 | Note that the ``body`` content is actually much longer has been shortened here to increase readability. 10 | 11 | ```json 12 | { 13 | "name": "3_Biotech_2015_Dec_13_5(6)_1007-1019", 14 | "journal": "3 Biotech", 15 | "date": "2015 Dec 13", 16 | "volume": "5(6)", 17 | "issue": "1007-1019", 18 | "accession": "PMC4624133", 19 | "timestamp": "2015-10-30 20:08:11", 20 | "pmid": "", 21 | "body": "\n==== Front\n3 Biotech3 Biotech3 Biotech2190-572X2190-5738Springer ..." 22 | } 23 | ``` 24 | 25 | ### Parameters 26 | 27 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 28 | 29 | * `bulk_size` (default: 500) 30 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 31 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 32 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 33 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 34 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 35 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 
36 | * `number_of_replicas` (default: 0) 37 | * `number_of_shards` (default: 5) 38 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 39 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 40 | * [`default_search_timeout`](https://www.elastic.co/guide/en/elasticsearch/reference/6.0/search.html#global-search-timeout) (default: -1) 41 | * `cluster_health` (default: "green"): The minimum required cluster health. 42 | 43 | ### License 44 | 45 | All articles that are included are licensed as CC-BY (http://creativecommons.org/licenses/by/2.0/) 46 | 47 | This data set is licensed under the same terms. Please refer to http://creativecommons.org/licenses/by/2.0/ for details. 48 | 49 | Attribution hint: 50 | 51 | You can download a full list of the author information for each included document from http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/pmc/attribution.txt.bz2 (size: 52.2MB) 52 | -------------------------------------------------------------------------------- /geonames/README.md: -------------------------------------------------------------------------------- 1 | ## Geonames track 2 | 3 | This track is based on a [geonames](http://www.geonames.org/) dump of the file [allCountries.zip](http://download.geonames.org/export/dump/allCountries.zip) retrieved as of April 27, 2017. 4 | 5 | For further details about the semantics of individual fields, please see the [geonames dump README](http://download.geonames.org/export/dump/readme.txt). 6 | 7 | Modifications: 8 | 9 | * The original CSV data have been converted to JSON. 10 | * We combine the original `longitude` and `latitude` fields to a new `location` field of type [geo_point](https://www.elastic.co/guide/en/elasticsearch/reference/current/geo-point.html). 11 | 12 | ### Example Document 13 | 14 | ```json 15 | { 16 | "geonameid": 2986043, 17 | "name": "Pic de Font Blanca", 18 | "asciiname": "Pic de Font Blanca", 19 | "alternatenames": "Pic de Font Blanca,Pic du Port", 20 | "feature_class": "T", 21 | "feature_code": "PK", 22 | "country_code": "AD", 23 | "admin1_code": "00", 24 | "population": 0, 25 | "dem": "2860", 26 | "timezone": "Europe/Andorra", 27 | "location": [ 28 | 1.53335, 29 | 42.64991 30 | ] 31 | } 32 | ``` 33 | 34 | ### Parameters 35 | 36 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 37 | 38 | * `bulk_size` (default: 5000) 39 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 40 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 41 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 42 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 43 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 
44 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 45 | * `number_of_replicas` (default: 0) 46 | * `number_of_shards` (default: 5) 47 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 48 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 49 | * `cluster_health` (default: "green"): The minimum required cluster health. 50 | 51 | ### License 52 | 53 | We use the same license for the data as the original data from Geonames: 54 | 55 | ``` 56 | This work is licensed under a Creative Commons Attribution 3.0 License, 57 | see http://creativecommons.org/licenses/by/3.0/ 58 | The Data is provided "as is" without warranty or any representation of accuracy, timeliness or completeness. 59 | ``` 60 | -------------------------------------------------------------------------------- /geoshape/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osm*", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append-linestrings", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-linestrings-index", 33 | "operation": "refresh", 34 | "index": "osmlinestrings", 35 | "clients": 1 36 | }, 37 | { 38 | "name": "force-merge-linestrings", 39 | "operation": "force-merge", 40 | "index": "osmlinestrings", 41 | "clients": 1 42 | }, 43 | { 44 | "operation": "index-append-multilinestrings", 45 | "warmup-time-period": 120, 46 | "clients": {{bulk_indexing_clients | default(8)}} 47 | }, 48 | { 49 | "name": "refresh-after-multilinestrings-index", 50 | "operation": "refresh", 51 | "index": "osmmultilinestrings", 52 | "clients": 1 53 | }, 54 | { 55 | "name": "force-merge-multilinestrings", 56 | "operation": "force-merge", 57 | "index": "osmmultilinestrings", 58 | "clients": 1 59 | }, 60 | { 61 | "operation": "index-append-polygons", 62 | "warmup-time-period": 120, 63 | "clients": {{bulk_indexing_clients | default(8)}} 64 | }, 65 | { 66 | "name": "refresh-after-polygons-index", 67 | "operation": "refresh", 68 | "index": "osmpolygons", 69 | "clients": 1 70 | }, 71 | { 72 | "name": "force-merge-polygons", 73 | "operation": "force-merge", 74 | "index": "osmpolygons", 75 | "clients": 1 76 | }, 77 | { 78 | "name": "refresh-after-all-indices", 79 | "operation": 
"refresh", 80 | "clients": 1 81 | }, 82 | { 83 | "operation": "polygon", 84 | "clients": 1, 85 | "warmup-iterations": 200, 86 | "iterations": 100, 87 | "target-throughput": 0.3 88 | }, 89 | { 90 | "operation": "bbox", 91 | "clients": 1, 92 | "warmup-iterations": 200, 93 | "iterations": 100, 94 | "target-throughput": 0.25 95 | } 96 | ] 97 | } 98 | -------------------------------------------------------------------------------- /nyc_taxis/README.md: -------------------------------------------------------------------------------- 1 | ## NYC taxis track 2 | 3 | This track contains the rides that have been performed in yellow taxis in New York in 2015. It can be downloaded from http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml. 4 | 5 | This has only been tested with the 2015 dump, but this should work with any dump of the yellow taxis, and should be easy to adapt to the green taxis. 6 | 7 | Once downloaded, you can generate the mappings with: 8 | 9 | ``` 10 | python3 _tools/parse.py mappings 11 | ``` 12 | 13 | And the json documents can be generated with: 14 | 15 | ``` 16 | python3 _tools/parse.py json file_name.csv > documents.json 17 | ``` 18 | 19 | Finally the json docs can be compressed with: 20 | 21 | ``` 22 | bzip2 -k documents.json 23 | ``` 24 | 25 | ### Example Document 26 | 27 | ```json 28 | { 29 | "total_amount": 6.3, 30 | "improvement_surcharge": 0.3, 31 | "pickup_location": [ 32 | -73.92259216308594, 33 | 40.7545280456543 34 | ], 35 | "pickup_datetime": "2015-01-01 00:34:42", 36 | "trip_type": "1", 37 | "dropoff_datetime": "2015-01-01 00:38:34", 38 | "rate_code_id": "1", 39 | "tolls_amount": 0.0, 40 | "dropoff_location": [ 41 | -73.91363525390625, 42 | 40.76552200317383 43 | ], 44 | "passenger_count": 1, 45 | "fare_amount": 5.0, 46 | "extra": 0.5, 47 | "trip_distance": 0.88, 48 | "tip_amount": 0.0, 49 | "store_and_fwd_flag": "N", 50 | "payment_type": "2", 51 | "mta_tax": 0.5, 52 | "vendor_id": "2" 53 | } 54 | ``` 55 | 56 | ### Parameters 57 | 58 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 59 | 60 | * `bulk_size` (default: 10000) 61 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 62 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 63 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 64 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. Only used by the `update` challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 65 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. Only used by the `update` challenge. 66 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. Only used by the `update` challenge. 
67 | * `number_of_replicas` (default: 0) 68 | * `number_of_shards` (default: 1) 69 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 70 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 71 | * `cluster_health` (default: "green"): The minimum required cluster health. 72 | 73 | ### License 74 | 75 | According to the [Open Data Law](https://opendata.cityofnewyork.us/open-data-law/) this data is available as public domain. 76 | -------------------------------------------------------------------------------- /percolator/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "percolator_with_content_president_bush", 9 | "operation-type": "search", 10 | "body": { 11 | "query": { 12 | "percolate": { 13 | "field": "query", 14 | "document": { 15 | "body": "president bush" 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | { 22 | "name": "percolator_with_content_saddam_hussein", 23 | "operation-type": "search", 24 | "body": { 25 | "query": { 26 | "percolate": { 27 | "field": "query", 28 | "document": { 29 | "body": "saddam hussein" 30 | } 31 | } 32 | } 33 | } 34 | }, 35 | { 36 | "name": "percolator_with_content_hurricane_katrina", 37 | "operation-type": "search", 38 | "body": { 39 | "query": { 40 | "percolate": { 41 | "field": "query", 42 | "document": { 43 | "body": "hurricane katrina" 44 | } 45 | } 46 | } 47 | } 48 | }, 49 | { 50 | "name": "percolator_with_content_google", 51 | "operation-type": "search", 52 | "body": { 53 | "query": { 54 | "percolate": { 55 | "field": "query", 56 | "document": { 57 | "body": "google" 58 | } 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "percolator_no_score_with_content_google", 65 | "operation-type": "search", 66 | "body": { 67 | "query": { 68 | "constant_score": { 69 | "filter": { 70 | "percolate": { 71 | "field": "query", 72 | "document": { 73 | "body": "google" 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | }, 81 | { 82 | "name": "percolator_with_highlighting", 83 | "operation-type": "search", 84 | "body": { 85 | "query": { 86 | "percolate": { 87 | "field": "query", 88 | "document": { 89 | "body": "Israeli prime minister Ariel Sharon suffers a massive stroke; he is replaced by acting prime minister Ehud Olmert" 90 | } 91 | } 92 | }, 93 | "highlight": { 94 | "fields": { 95 | "body": {} 96 | } 97 | } 98 | } 99 | }, 100 | { 101 | "name": "percolator_with_content_ignore_me", 102 | "operation-type": "search", 103 | "body": { 104 | "query": { 105 | "percolate": { 106 | "field": "query", 107 | "document": { 108 | "body": "ignore me" 109 | } 110 | } 111 | } 112 | } 113 | }, 114 | { 115 | "name": "percolator_no_score_with_content_ignore_me", 116 | "operation-type": "search", 117 | "body": { 118 | "query": { 119 | "constant_score": { 120 | "filter": { 121 | "percolate": { 122 | "field": "query", 123 | "document": { 124 | "body": "ignore me" 125 | } 126 | } 127 | } 128 | } 129 | } 130 | } 131 | } -------------------------------------------------------------------------------- /nyc_taxis/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | types = {} 7 | for f in 
["vendor_id","cab_color","payment_type","trip_type","rate_code_id","store_and_fwd_flag"]: 8 | types[f] = 'keyword' 9 | for f in ["vendor_name"]: 10 | types[f] = 'text' 11 | for f in ["passenger_count"]: 12 | types[f] = 'integer' 13 | for f in ["pickup_location", "dropoff_location"]: 14 | types[f] = 'geo_point' 15 | for f in ["trip_distance", "fare_amount", "surcharge", "mta_tax", "extra", "ehail_fee", "improvement_surcharge", "tip_amount", "tolls_amount", "total_amount"]: 16 | types[f] = 'scaled_float' 17 | for f in ["pickup_datetime", "dropoff_datetime"]: 18 | types[f] = 'date' 19 | 20 | def write_mappings(): 21 | mappings = {} 22 | for (k, v) in types.items(): 23 | mappings[k] = { "type": v } 24 | if v == 'date': 25 | mappings[k]['format'] = "yyyy-MM-dd HH:mm:ss" 26 | elif v == 'scaled_float': 27 | mappings[k]['scaling_factor'] = 100 28 | mappings = { "properties": mappings } 29 | mappings['_all'] = { "enabled": False } 30 | mappings['dynamic'] = 'strict' 31 | mappings = { "type": mappings } 32 | print(json.dumps(mappings, indent=2)) 33 | 34 | def to_geo_point(d, f): 35 | lat_field = f + "_latitude" 36 | lon_field = f + "_longitude" 37 | if lat_field in d and lon_field in d: 38 | longitude = float(d[lon_field]) 39 | latitude = float(d[lat_field]) 40 | if longitude < -180 or longitude > 180 or latitude < -90 or latitude > 90: 41 | raise Exception("Malformed coordinates") 42 | d[f + '_location'] = [float(d[lon_field]), float(d[lat_field])] 43 | del d[lon_field] 44 | del d[lat_field] 45 | 46 | def to_underscore(s): 47 | s = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s) 48 | return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s).lower() 49 | 50 | def to_json(f): 51 | fields = [] 52 | for field in f.readline().strip().split(','): 53 | field = to_underscore(field) 54 | if field.startswith('tpep_') or field.startswith('lpep_'): 55 | field = field[5:] 56 | elif field == 'ratecode_id': 57 | field = 'rate_code_id' 58 | fields.append(field) 59 | for line in f.readlines(): 60 | cols = line.strip().split(',') 61 | if len(cols) < len(fields): 62 | raise Exception("Cannot parse '%s': number of fields does not match '%s'" %(line, ",".join(fields))) 63 | 64 | try: 65 | d = {} 66 | for i in range(len(fields)): 67 | field = fields[i] 68 | value = cols[i] 69 | if value != '': # the way csv says the field does not exist 70 | d[field] = value 71 | 72 | to_geo_point(d, 'pickup') 73 | to_geo_point(d, 'dropoff') 74 | 75 | for (k, v) in d.items(): 76 | if k not in types: 77 | raise Exception("Unknown field '%s'" %k) 78 | t = types[k] 79 | try: 80 | if t == 'integer': 81 | d[k] = int(v) 82 | elif t == 'float': 83 | d[k] = float(v) 84 | except Exception as cause: 85 | raise Exception("Cannot parse (%s,%s)" %(k, v)) from cause 86 | 87 | print(json.dumps(d)) 88 | except KeyboardInterrupt: 89 | break 90 | except Exception as e: 91 | print("Skipping malformed entry '%s' because of %s" %(line, str(e)), file=sys.stderr) 92 | 93 | if sys.argv[1] == "json": 94 | for file_name in sys.argv[2:]: 95 | with open(file_name) as f: 96 | to_json(f) 97 | elif sys.argv[1] == "mappings": 98 | write_mappings() 99 | else: 100 | raise Exception("Expected 'json' or 'mappings' but got %s" %sys.argv[1]) 101 | -------------------------------------------------------------------------------- /percolator/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. 
We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green and we want to ensure that we don't use the query cache. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "queries", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index", 28 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. As this benchmark is rather about search than indexing this is ok.", 29 | "warmup-time-period": 10, 30 | "clients": {{bulk_indexing_clients | default(8)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "percolator_with_content_president_bush", 48 | "clients": 1, 49 | "warmup-iterations": 100, 50 | "iterations": 100, 51 | "target-throughput": 50 52 | }, 53 | { 54 | "operation": "percolator_with_content_saddam_hussein", 55 | "clients": 1, 56 | "warmup-iterations": 100, 57 | "iterations": 100, 58 | "target-throughput": 50 59 | }, 60 | { 61 | "operation": "percolator_with_content_hurricane_katrina", 62 | "clients": 1, 63 | "warmup-iterations": 100, 64 | "iterations": 100, 65 | "target-throughput": 50 66 | }, 67 | { 68 | "operation": "percolator_with_content_google", 69 | "clients": 1, 70 | "warmup-iterations": 100, 71 | "iterations": 100, 72 | "target-throughput": 35 73 | }, 74 | { 75 | "operation": "percolator_no_score_with_content_google", 76 | "clients": 1, 77 | "warmup-iterations": 100, 78 | "iterations": 100, 79 | "target-throughput": 100 80 | }, 81 | { 82 | "operation": "percolator_with_highlighting", 83 | "clients": 1, 84 | "warmup-iterations": 100, 85 | "iterations": 100, 86 | "target-throughput": 50 87 | }, 88 | { 89 | "operation": "percolator_with_content_ignore_me", 90 | "clients": 1, 91 | "warmup-iterations": 10, 92 | "iterations": 100, 93 | "#COMMENT": "Be aware that we specify *target-interval* here! 
This means we issue one query every 12 seconds", 94 | "target-interval": 12 95 | }, 96 | { 97 | "operation": "percolator_no_score_with_content_ignore_me", 98 | "clients": 1, 99 | "warmup-iterations": 100, 100 | "iterations": 100, 101 | "target-throughput": 15 102 | } 103 | ] 104 | } 105 | -------------------------------------------------------------------------------- /geonames/track.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | 4 | 5 | class QueryParamSource: 6 | # We need to stick to the param source API 7 | # noinspection PyUnusedLocal 8 | def __init__(self, track, params, **kwargs): 9 | self._params = params 10 | self.infinite = True 11 | cwd = os.path.dirname(__file__) 12 | # The terms.txt file has been generated with: 13 | # sed -n '13~250p' [path_to_rally_data]/geonames/documents.json | shuf | sed -e "s/.*name\": \"//;s/\",.*$//" > terms.txt 14 | with open(os.path.join(cwd, "terms.txt"), "r") as ins: 15 | self.terms = [line.strip() for line in ins.readlines()] 16 | 17 | # We need to stick to the param source API 18 | # noinspection PyUnusedLocal 19 | def partition(self, partition_index, total_partitions): 20 | return self 21 | 22 | # Deprecated - only there for BWC reasons with Rally < 1.4.0 23 | def size(self): 24 | return 1 25 | 26 | 27 | class PureTermsQueryParamSource(QueryParamSource): 28 | def params(self): 29 | query_terms = list(self.terms) # copy 30 | query_terms.append(str(random.randint(1, 100))) # avoid caching 31 | result = { 32 | "body": { 33 | "query": { 34 | "terms": { 35 | "name.raw": query_terms 36 | } 37 | } 38 | }, 39 | "index": None 40 | } 41 | if "cache" in self._params: 42 | result["cache"] = self._params["cache"] 43 | 44 | return result 45 | 46 | 47 | class FilteredTermsQueryParamSource(QueryParamSource): 48 | def params(self): 49 | query_terms = list(self.terms) # copy 50 | query_terms.append(str(random.randint(1, 1000))) # avoid caching 51 | result = { 52 | "body": { 53 | "query": { 54 | "bool": { 55 | "must": [ 56 | { 57 | "match": { 58 | "feature_class.raw": "T" 59 | } 60 | } 61 | ], 62 | "filter": [ 63 | { 64 | "terms": { 65 | "name.raw": query_terms 66 | } 67 | } 68 | ] 69 | } 70 | } 71 | }, 72 | "index": None 73 | } 74 | if "cache" in self._params: 75 | result["cache"] = self._params["cache"] 76 | 77 | return result 78 | 79 | 80 | class ProhibitedTermsQueryParamSource(QueryParamSource): 81 | def params(self): 82 | query_terms = list(self.terms) # copy 83 | query_terms.append(str(random.randint(1, 1000))) # avoid caching 84 | result = { 85 | "body": { 86 | "query": { 87 | "bool": { 88 | "must": [ 89 | { 90 | "match": { 91 | "feature_class.raw": "A" 92 | } 93 | } 94 | ], 95 | "must_not": [ 96 | { 97 | "terms": { 98 | "name.raw": query_terms 99 | } 100 | } 101 | ] 102 | } 103 | } 104 | }, 105 | "index": None 106 | } 107 | if "cache" in self._params: 108 | result["cache"] = self._params["cache"] 109 | 110 | return result 111 | 112 | 113 | def refresh(es, params): 114 | es.indices.refresh(index=params.get("index", "_all")) 115 | 116 | 117 | def register(registry): 118 | registry.register_param_source("pure-terms-query-source", PureTermsQueryParamSource) 119 | registry.register_param_source("filtered-terms-query-source", FilteredTermsQueryParamSource) 120 | registry.register_param_source("prohibited-terms-query-source", ProhibitedTermsQueryParamSource) 121 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | rally-tracks 2 | ------------ 3 | 4 | This repository contains the default track specifications for the Elasticsearch benchmarking tool [Rally](https://github.com/elastic/rally). 5 | 6 | Tracks are used to describe benchmarks in Rally. 7 | 8 | You should not need to use this repository directly, except if you want to look under the hood or create your own tracks. We have created a [tutorial on how to create your own tracks](https://esrally.readthedocs.io/en/latest/adding_tracks.html). 9 | 10 | Versioning Scheme 11 | ----------------- 12 | 13 | From time to time, setting and mapping formats change in Elasticsearch. As we want to be able to support multiple versions of Elasticsearch, we also need to version track specifications. Therefore, this repository contains multiple branches. The following examples should give you an idea of how the versioning scheme works: 14 | 15 | * master: tracks on this branch are compatible with the latest development version of Elasticsearch 16 | * 5.0.0-alpha2: compatible with the released version 5.0.0-alpha2. 17 | * 2: compatible with all Elasticsearch releases with the major release number 2 (e.g. 2.1, 2.2, 2.2.1) 18 | * 1.7: compatible with all Elasticsearch releases with the major release number 1 and minor release number 7 (e.g. 1.7.0, 1.7.1, 1.7.2) 19 | 20 | As you can see, branches can match exact release numbers but Rally is also lenient in case settings and mapping formats did not change for a few releases. Rally will try to match in the following order: 21 | 22 | 1. major.minor.patch-extension_label (e.g. 5.0.0-alpha5) 23 | 2. major.minor.patch (e.g. 2.3.1) 24 | 3. major.minor (e.g. 2.3) 25 | 4. major (e.g. 2) 26 | 27 | Apart from that, the master branch is always considered to be compatible with the Elasticsearch master branch. 28 | 29 | To specify the version to check against, add `--distribution-version` when running Rally. If it is not specified, Rally assumes that you want to benchmark against the Elasticsearch master version. 30 | 31 | Example: If you want to benchmark Elasticsearch 5.0.0, run the following command: 32 | 33 | ``` 34 | esrally --distribution-version=5.0.0 35 | ``` 36 | 37 | How to Contribute 38 | ----------------- 39 | 40 | If you want to contribute a track, please ensure that it works against the master version of Elasticsearch (i.e. submit PRs against the master branch). We can then check whether it's feasible to backport the track to earlier Elasticsearch versions. 41 | 42 | See all details in the [contributor guidelines](https://github.com/elastic/rally/blob/master/CONTRIBUTING.md). 43 | 44 | Backporting changes 45 | ------------------- 46 | 47 | If you are a contributor with direct commit access to this repository then please backport your changes. This ensures that tracks do not work only for the latest `master` version of Elasticsearch but also for older versions. Apply backports with cherry-picks. Below you can find a walkthrough: 48 | 49 | Assume we've pushed commit `a7e0937` to master and want to backport it. This is a change to the `noaa` track. Let's check what branches are available for backporting: 50 | 51 | ``` 52 | daniel@io:tracks/default ‹master›$ git branch -r 53 | origin/1 54 | origin/2 55 | origin/5 56 | origin/HEAD -> origin/master 57 | origin/master 58 | ``` 59 | 60 | We'll go backwards starting from branch `5`, then branch `2` and finally branch `1`.
After applying a change, we will test whether the track works as is for an older version of Elasticsearch. 61 | 62 | ``` 63 | git checkout 5 64 | git cherry-pick a7e0937 65 | 66 | # test the change now with an Elasticsearch 5.x distribution 67 | esrally --track=noaa --distribution-version=5.4.3 --test-mode 68 | 69 | # push the change 70 | git push origin 5 71 | ``` 72 | 73 | This particular track uses features that are only available in Elasticsearch 5 and later so we will stop here but the process continues until we've reached the earliest branch. 74 | 75 | Sometimes it is necessary to remove individual operations from a track that are not supported by earlier versions. This graceful fallback is a compromise to allow to run a subset of the track on older versions of Elasticsearch too. If this is necessary then it's best to do these changes in a separate commit. Also, don't forget to cherry-pick this separate commit too to even earlier versions if necessary. 76 | 77 | 78 | License 79 | ------- 80 | 81 | There is no single license for this repository. Licenses are chosen per track. They are typically licensed under the same terms as the source data. See the README files of each track for more details. -------------------------------------------------------------------------------- /nested/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nested-search-challenge", 3 | "description": "Indexes the document corpus for an hour using Elasticsearch default settings. After that randomized nested queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "sonested", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "time-period": 3600, 30 | "clients": {{bulk_indexing_clients | default(4)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "randomized-nested-queries", 48 | "clients": 2, 49 | "target-throughput": 20, 50 | "warmup-iterations": 500, 51 | "iterations": 1000 52 | }, 53 | { 54 | "operation": "randomized-term-queries", 55 | "clients": 2, 56 | "target-throughput": 25, 57 | "warmup-iterations": 500, 58 | "iterations": 200 59 | }, 60 | { 61 | "operation": "randomized-sorted-term-queries", 62 | "clients": 2, 63 | "warmup-iterations": 500, 64 | "target-throughput": 16, 65 | "iterations": 200 66 | }, 67 | { 68 | "operation": "match-all", 69 | "clients": 2, 70 | "target-throughput": 5, 71 | "warmup-iterations": 500, 72 | "iterations": 200 73 | }, 74 | { 75 | "operation": "nested-date-histo", 76 | "clients": 2, 77 | "target-throughput": 1, 78 | "warmup-iterations": 100, 79 | "iterations": 200 80 | }, 81 | { 82 | "operation": "randomized-nested-queries-with-inner-hits_default", 83 | "clients": 2, 84 | "target-throughput": 18, 85 | "warmup-iterations": 500, 86 | "iterations": 1000 87 | }, 88 | { 89 | 
"operation": "randomized-nested-queries-with-inner-hits_default_big_size", 90 | "clients": 2, 91 | "target-throughput": 16, 92 | "warmup-iterations": 500, 93 | "iterations": 1000 94 | } 95 | ] 96 | }, 97 | { 98 | "name": "index-only", 99 | "description": "Indexes the document corpus for an hour using Elasticsearch default settings.", 100 | "schedule": [ 101 | { 102 | "operation": "delete-index" 103 | }, 104 | { 105 | "operation": { 106 | "operation-type": "create-index", 107 | "settings": {{index_settings | default({}) | tojson}} 108 | } 109 | }, 110 | { 111 | "name": "check-cluster-health", 112 | "operation": { 113 | "operation-type": "cluster-health", 114 | "index": "sonested", 115 | "request-params": { 116 | "wait_for_status": "{{cluster_health | default('green')}}", 117 | "wait_for_no_relocating_shards": "true" 118 | } 119 | } 120 | }, 121 | { 122 | "operation": "index-append", 123 | "warmup-time-period": 120, 124 | "time-period": 3600, 125 | "clients": {{bulk_indexing_clients | default(4)}} 126 | }, 127 | { 128 | "name": "refresh-after-index", 129 | "operation": "refresh", 130 | "clients": 1 131 | }, 132 | { 133 | "operation": "force-merge", 134 | "clients": 1 135 | }, 136 | { 137 | "name": "refresh-after-force-merge", 138 | "operation": "refresh", 139 | "clients": 1 140 | } 141 | ] 142 | } 143 | -------------------------------------------------------------------------------- /noaa/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "range_field_big_range", 9 | "operation-type": "search", 10 | "body": { 11 | "query": { 12 | "range": { 13 | "TRANGE": { 14 | "gte": 0, 15 | "lte": 30 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | { 22 | "name": "range_field_small_range", 23 | "operation-type": "search", 24 | "body": { 25 | "query": { 26 | "range": { 27 | "TRANGE": { 28 | "gte": -20, 29 | "lte": -10 30 | } 31 | } 32 | } 33 | } 34 | }, 35 | { 36 | "name": "range_field_conjunction_big_range_small_term_query", 37 | "operation-type": "search", 38 | "body": { 39 | "query": { 40 | "bool": { 41 | "must": [ 42 | { 43 | "term": { 44 | "station.country_code": "JA" 45 | } 46 | }, 47 | { 48 | "range": { 49 | "TRANGE": { 50 | "gte": 0, 51 | "lte": 30 52 | } 53 | } 54 | } 55 | ] 56 | } 57 | } 58 | } 59 | }, 60 | { 61 | "name": "range_field_conjunction_small_range_small_term_query", 62 | "operation-type": "search", 63 | "body": { 64 | "query": { 65 | "bool": { 66 | "must": [ 67 | { 68 | "term": { 69 | "station.country_code": "JA" 70 | } 71 | }, 72 | { 73 | "range": { 74 | "TRANGE": { 75 | "gte": -20, 76 | "lte": -10 77 | } 78 | } 79 | } 80 | ] 81 | } 82 | } 83 | } 84 | }, 85 | { 86 | "name": "range_field_conjunction_small_range_big_term_query", 87 | "operation-type": "search", 88 | "body": { 89 | "query": { 90 | "bool": { 91 | "must": [ 92 | { 93 | "term": { 94 | "station.country_code": "US" 95 | } 96 | }, 97 | { 98 | "range": { 99 | "TRANGE": { 100 | "gte": -20, 101 | "lte": -10 102 | } 103 | } 104 | } 105 | ] 106 | } 107 | } 108 | } 109 | }, 110 | { 111 | "name": "range_field_conjunction_big_range_big_term_query", 112 | "operation-type": "search", 113 | "body": { 114 | "query": { 115 | "bool": { 116 | "must": [ 117 | { 118 | "term": { 119 | "station.country_code": "US" 120 | } 121 | }, 122 | { 123 | "range": { 124 | "TRANGE": { 125 | "gte": 0, 126 | "lte": 30 127 | 
} 128 | } 129 | } 130 | ] 131 | } 132 | } 133 | } 134 | }, 135 | { 136 | "name": "range_field_disjunction_small_range_small_term_query", 137 | "operation-type": "search", 138 | "body": { 139 | "query": { 140 | "bool": { 141 | "should": [ 142 | { 143 | "term": { 144 | "station.country_code": "JA" 145 | } 146 | }, 147 | { 148 | "range": { 149 | "TRANGE": { 150 | "gte": -20, 151 | "lte": -10 152 | } 153 | } 154 | } 155 | ] 156 | } 157 | } 158 | } 159 | }, 160 | { 161 | "name": "range_field_disjunction_big_range_small_term_query", 162 | "operation-type": "search", 163 | "body": { 164 | "query": { 165 | "bool": { 166 | "should": [ 167 | { 168 | "term": { 169 | "station.country_code": "JA" 170 | } 171 | }, 172 | { 173 | "range": { 174 | "TRANGE": { 175 | "gte": 0, 176 | "lte": 30 177 | } 178 | } 179 | } 180 | ] 181 | } 182 | } 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /percolator/README.md: -------------------------------------------------------------------------------- 1 | ## Percolator track 2 | 3 | The queries.json.bz2 file contains list of ES queries that has been randomly generated from the AOL query dataset published in 2006. Only specific queries have been selected and the rest of the file contains dummy queries. Only the query attribute is copied from the AOL query dataset, the rest of the attributes are not in this file. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "query": { 10 | "match": { 11 | "body": { 12 | "query": "costa rica hurricanes" 13 | } 14 | } 15 | } 16 | } 17 | ``` 18 | 19 | ### Parameters 20 | 21 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 22 | 23 | * `bulk_size` (default: 5000) 24 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 25 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 26 | * `number_of_replicas` (default: 0) 27 | * `number_of_shards` (default: 5) 28 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 29 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 30 | * `cluster_health` (default: "green"): The minimum required cluster health. 31 | 32 | ### License 33 | 34 | AOL's original README: 35 | 36 | ``` 37 | 500k User Session Collection 38 | ---------------------------------------------- 39 | This collection is distributed for NON-COMMERCIAL RESEARCH USE ONLY. 40 | Any application of this collection for commercial purposes is STRICTLY PROHIBITED. 41 | 42 | Brief description: 43 | 44 | This collection consists of ~20M web queries collected from ~650k users over three months. 45 | The data is sorted by anonymous user ID and sequentially arranged. 46 | 47 | The goal of this collection is to provide real query log data that is based on real users. It could be used for personalization, query reformulation or other types of search research. 48 | 49 | The data set includes {AnonID, Query, QueryTime, ItemRank, ClickURL}. 50 | AnonID - an anonymous user ID number. 51 | Query - the query issued by the user, case shifted with 52 | most punctuation removed. 53 | QueryTime - the time at which the query was submitted for search. 54 | ItemRank - if the user clicked on a search result, the rank of the 55 | item on which they clicked is listed. 
56 | ClickURL - if the user clicked on a search result, the domain portion of 57 | the URL in the clicked result is listed. 58 | 59 | Each line in the data represents one of two types of events: 60 | 1. A query that was NOT followed by the user clicking on a result item. 61 | 2. A click through on an item in the result list returned from a query. 62 | In the first case (query only) there is data in only the first three columns/fields -- namely AnonID, Query, and QueryTime (see above). 63 | In the second case (click through), there is data in all five columns. For click through events, the query that preceded the click through is included. Note that if a user clicked on more than one result in the list returned from a single query, there will be TWO lines in the data to represent the two events. Also note that if the user requested the next "page" or results for some query, this appears as a subsequent identical query with a later time stamp. 64 | 65 | CAVEAT EMPTOR -- SEXUALLY EXPLICIT DATA! Please be aware that these queries are not filtered to remove any content. Pornography is prevalent on the Web and unfiltered search engine logs contain queries by users who are looking for pornographic material. There are queries in this collection that use SEXUALLY EXPLICIT LANGUAGE. This collection of data is intended for use by mature adults who are not easily offended by the use of pornographic search terms. If you are offended by sexually explicit language you should not read through this data. Also be aware that in some states it may be illegal to expose a minor to this data. Please understand that the data represents REAL WORLD USERS, un-edited and randomly sampled, and that AOL is not the author of this data. 66 | 67 | Basic Collection Statistics 68 | Dates: 69 | 01 March, 2006 - 31 May, 2006 70 | 71 | Normalized queries: 72 | 36,389,567 lines of data 73 | 21,011,340 instances of new queries (w/ or w/o click-through) 74 | 7,887,022 requests for "next page" of results 75 | 19,442,629 user click-through events 76 | 16,946,938 queries w/o user click-through 77 | 10,154,742 unique (normalized) queries 78 | 657,426 unique user ID's 79 | 80 | 81 | Please reference the following publication when using this collection: 82 | 83 | G. Pass, A. Chowdhury, C. Torgeson, "A Picture of Search" The First 84 | International Conference on Scalable Information Systems, Hong Kong, June, 85 | 2006. 86 | 87 | Copyright (2006) AOL 88 | ``` 89 | -------------------------------------------------------------------------------- /http_logs/README.md: -------------------------------------------------------------------------------- 1 | ## HTTP logs track 2 | 3 | This track is based on [Web server logs from the 1998 Football world cup](http://ita.ee.lbl.gov/html/contrib/WorldCup.html). 4 | 5 | Modifications: 6 | 7 | * Applied number-to-IP conversion as suggested in the original readme 8 | * Removed illegal characters in "object_mappings.sort" 9 | * Transformed the source data to a bulk-friendly JSON format (ignoring all entries that 10 | contained unrecognised / problematic characters and invalid IP addresses like "0"; 11 | around 0.001% of the source data was lost due to this approach) 12 | 13 | ### Example Document 14 | 15 | ```json 16 | { 17 | "@timestamp": 898459201, 18 | "clientip": "211.11.9.0", 19 | "request": "GET /english/index.html HTTP/1.0", 20 | "status": 304, 21 | "size": 0 22 | } 23 | ``` 24 | 25 | Alternatively, an `unparsed` set of documents is also provided.
The `unparsed` data set is identical to the standard 26 | data set, except that the timestamp is in ISO 8601 format and all fields are left unparsed in a single `message` field. For example: 27 | 28 | ```json 29 | {"message" : "211.11.9.0 - - [1998-06-21T15:00:01-05:00] \"GET /english/index.html HTTP/1.0\" 304 0"} 30 | ``` 31 | 32 | ### Parameters 33 | 34 | This track allows overriding the following parameters with Rally 0.8.0+ using `--track-params`: 35 | 36 | * `bulk_size` (default: 5000) 37 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 38 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 39 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (a document id is replaced with a sequentially increasing id) and 'random' (a document id is replaced with a random other id). 40 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires running the respective challenge. Combining `conflicts=sequential` and `conflict-probability=0` makes Rally generate index ids by itself, instead of relying on Elasticsearch's automatic id generation. 41 | * `number_of_replicas` (default: 0) 42 | * `number_of_shards` (default: 5) 43 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 44 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 45 | * `cluster_health` (default: "green"): The minimum required cluster health. 46 | * `ingest_pipeline`: Only applicable to `--challenge=append-index-only-with-ingest-pipeline`; selects which ingest 47 | node pipeline to run. Valid options are `'baseline'` (default), `'grok'` and `'geoip'`. For example: `--challenge=append-index-only-with-ingest-pipeline --track-params="ingest_pipeline:'baseline'" ` 48 | 49 | ### License 50 | 51 | Original license text: 52 | 53 | Copyright (C) 1997, 1998, 1999 Hewlett-Packard Company 54 | ALL RIGHTS RESERVED. 55 | 56 | The enclosed software and documentation includes copyrighted works 57 | of Hewlett-Packard Co. For as long as you comply with the following 58 | limitations, you are hereby authorized to (i) use, reproduce, and 59 | modify the software and documentation, and to (ii) distribute the 60 | software and documentation, including modifications, for 61 | non-commercial purposes only. 62 | 63 | 1. The enclosed software and documentation is made available at no 64 | charge in order to advance the general development of 65 | the Internet, the World-Wide Web, and Electronic Commerce. 66 | 67 | 2. You may not delete any copyright notices contained in the 68 | software or documentation. All hard copies, and copies in 69 | source code or object code form, of the software or 70 | documentation (including modifications) must contain at least 71 | one of the copyright notices. 72 | 73 | 3. The enclosed software and documentation has not been subjected 74 | to testing and quality control and is not a Hewlett-Packard Co. 75 | product. At a future time, Hewlett-Packard Co. may or may not 76 | offer a version of the software and documentation as a product. 77 | 78 | 4. THE SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS".
79 | HEWLETT-PACKARD COMPANY DOES NOT WARRANT THAT THE USE, 80 | REPRODUCTION, MODIFICATION OR DISTRIBUTION OF THE SOFTWARE OR 81 | DOCUMENTATION WILL NOT INFRINGE A THIRD PARTY'S INTELLECTUAL 82 | PROPERTY RIGHTS. HP DOES NOT WARRANT THAT THE SOFTWARE OR 83 | DOCUMENTATION IS ERROR FREE. HP DISCLAIMS ALL WARRANTIES, 84 | EXPRESS AND IMPLIED, WITH REGARD TO THE SOFTWARE AND THE 85 | DOCUMENTATION. HP SPECIFICALLY DISCLAIMS ALL WARRANTIES OF 86 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 87 | 88 | 5. HEWLETT-PACKARD COMPANY WILL NOT IN ANY EVENT BE LIABLE FOR ANY 89 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 90 | (INCLUDING LOST PROFITS) RELATED TO ANY USE, REPRODUCTION, 91 | MODIFICATION, OR DISTRIBUTION OF THE SOFTWARE OR DOCUMENTATION. 92 | -------------------------------------------------------------------------------- /noaa/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green and we want to ensure that we don't use the query cache. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "weather-data-2016", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index", 28 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. 
As this benchmark is rather about search than indexing this is ok.", 29 | "warmup-time-period": 10, 30 | "clients": {{bulk_indexing_clients | default(8)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "range_field_big_range", 48 | "clients": 1, 49 | "warmup-iterations": 100, 50 | "iterations": 500, 51 | "target-throughput": 8 52 | }, 53 | { 54 | "operation": "range_field_small_range", 55 | "clients": 1, 56 | "warmup-iterations": 100, 57 | "iterations": 500, 58 | "target-throughput": 10 59 | }, 60 | { 61 | "operation": "range_field_conjunction_big_range_small_term_query", 62 | "clients": 1, 63 | "warmup-iterations": 100, 64 | "iterations": 500, 65 | "target-throughput": 10 66 | }, 67 | { 68 | "operation": "range_field_conjunction_small_range_small_term_query", 69 | "clients": 1, 70 | "warmup-iterations": 100, 71 | "iterations": 500, 72 | "target-throughput": 10 73 | }, 74 | { 75 | "operation": "range_field_conjunction_small_range_big_term_query", 76 | "clients": 1, 77 | "warmup-iterations": 100, 78 | "iterations": 500, 79 | "target-throughput": 4 80 | }, 81 | { 82 | "operation": "range_field_conjunction_big_range_big_term_query", 83 | "clients": 1, 84 | "warmup-iterations": 100, 85 | "iterations": 500, 86 | "target-throughput": 1 87 | }, 88 | { 89 | "operation": "range_field_disjunction_small_range_small_term_query", 90 | "clients": 1, 91 | "warmup-iterations": 100, 92 | "iterations": 500, 93 | "target-throughput": 10 94 | }, 95 | { 96 | "operation": "range_field_disjunction_big_range_small_term_query", 97 | "clients": 1, 98 | "warmup-iterations": 100, 99 | "iterations": 500, 100 | "target-throughput": 6 101 | } 102 | ] 103 | }, 104 | { 105 | "name": "append-no-conflicts-index-only", 106 | "description": "Indexes the whole document corpus using Elasticsearch default settings.", 107 | "schedule": [ 108 | { 109 | "operation": "delete-index" 110 | }, 111 | { 112 | "operation": { 113 | "operation-type": "create-index", 114 | "settings": {{index_settings | default({}) | tojson}} 115 | } 116 | }, 117 | { 118 | "name": "check-cluster-health", 119 | "operation": { 120 | "operation-type": "cluster-health", 121 | "index": "weather-data-2016", 122 | "request-params": { 123 | "wait_for_status": "{{cluster_health | default('green')}}", 124 | "wait_for_no_relocating_shards": "true" 125 | } 126 | } 127 | }, 128 | { 129 | "operation": "index", 130 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. As this benchmark is rather about search than indexing this is ok.", 131 | "warmup-time-period": 10, 132 | "clients": {{bulk_indexing_clients | default(8)}} 133 | }, 134 | { 135 | "name": "refresh-after-index", 136 | "operation": "refresh", 137 | "clients": 1 138 | }, 139 | { 140 | "operation": "force-merge", 141 | "clients": 1 142 | }, 143 | { 144 | "name": "refresh-after-force-merge", 145 | "operation": "refresh", 146 | "clients": 1 147 | } 148 | ] 149 | } 150 | -------------------------------------------------------------------------------- /geopointshape/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. 
We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osmgeoshapes", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "polygon", 47 | "clients": 1, 48 | "warmup-iterations": 200, 49 | "iterations": 100, 50 | "target-throughput": 2 51 | }, 52 | { 53 | "operation": "bbox", 54 | "clients": 1, 55 | "warmup-iterations": 200, 56 | "iterations": 100, 57 | "target-throughput": 2 58 | } 59 | ] 60 | }, 61 | { 62 | "name": "append-no-conflicts-index-only", 63 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 64 | "schedule": [ 65 | { 66 | "operation": "delete-index" 67 | }, 68 | { 69 | "operation": { 70 | "operation-type": "create-index", 71 | "settings": {{index_settings | default({}) | tojson}} 72 | } 73 | }, 74 | { 75 | "name": "check-cluster-health", 76 | "operation": { 77 | "operation-type": "cluster-health", 78 | "index": "osmgeoshapes", 79 | "request-params": { 80 | "wait_for_status": "{{cluster_health | default('green')}}", 81 | "wait_for_no_relocating_shards": "true" 82 | } 83 | } 84 | }, 85 | { 86 | "operation": "index-append", 87 | "warmup-time-period": 120, 88 | "clients": {{bulk_indexing_clients | default(8)}} 89 | }, 90 | { 91 | "name": "refresh-after-index", 92 | "operation": "refresh", 93 | "clients": 1 94 | }, 95 | { 96 | "operation": "force-merge", 97 | "clients": 1 98 | }, 99 | { 100 | "name": "refresh-after-force-merge", 101 | "operation": "refresh", 102 | "clients": 1 103 | } 104 | ] 105 | }, 106 | { 107 | "name": "append-fast-with-conflicts", 108 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 109 | "schedule": [ 110 | { 111 | "operation": "delete-index" 112 | }, 113 | { 114 | "operation": { 115 | "operation-type": "create-index", 116 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 117 | "index.refresh_interval": "30s", 118 | "index.number_of_shards": {{number_of_shards | default(6)}}, 119 | "index.translog.flush_threshold_size": "4g" 120 | }{%- endif %} 121 | } 122 | }, 123 | { 124 | "name": "check-cluster-health", 125 | "operation": { 126 | "operation-type": "cluster-health", 127 | "index": "osmgeoshapes", 128 | "request-params": { 129 | "wait_for_status": "{{cluster_health | default('green')}}", 130 | "wait_for_no_relocating_shards": "true" 131 | } 132 | } 133 | }, 134 | { 135 | "operation": "index-update", 136 | "warmup-time-period": 120, 137 | "clients": {{bulk_indexing_clients | default(8)}} 138 | }, 139 | { 140 | "name": "refresh-after-index", 141 | "operation": "refresh", 142 | "clients": 1 143 | }, 144 | { 145 | "operation": "force-merge", 146 | "clients": 1 147 | }, 148 | { 149 | "name": "refresh-after-force-merge", 150 | "operation": "refresh", 151 | "clients": 1 152 | } 153 | ] 154 | } 155 | -------------------------------------------------------------------------------- /http_logs/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "HTTP server log data", 6 | "#TODO": "Replace index definitions with a template after setting the track version to 2. Explicit index definitions are not necessary anymore.", 7 | "indices": [ 8 | { 9 | "name": "logs-181998", 10 | "body": "index.json" 11 | }, 12 | { 13 | "name": "logs-191998", 14 | "body": "index.json" 15 | }, 16 | { 17 | "name": "logs-201998", 18 | "body": "index.json" 19 | }, 20 | { 21 | "name": "logs-211998", 22 | "body": "index.json" 23 | }, 24 | { 25 | "name": "logs-221998", 26 | "body": "index.json" 27 | }, 28 | { 29 | "name": "logs-231998", 30 | "body": "index.json" 31 | }, 32 | { 33 | "name": "logs-241998", 34 | "body": "index.json" 35 | }, 36 | { 37 | "name": "reindexed-logs", 38 | "body": "index.json" 39 | } 40 | ], 41 | "corpora": [ 42 | {%- if ingest_pipeline is defined and ingest_pipeline == "grok" %} 43 | { 44 | "name": "http_logs_unparsed", 45 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs", 46 | "documents": [ 47 | { 48 | "target-index": "logs-181998", 49 | "source-file": "documents-181998.unparsed.json.bz2", 50 | "document-count": 2708746, 51 | "compressed-bytes": 13064317, 52 | "uncompressed-bytes": 303920342 53 | }, 54 | { 55 | "target-index": "logs-191998", 56 | "source-file": "documents-191998.unparsed.json.bz2", 57 | "document-count": 9697882, 58 | "compressed-bytes": 47211781, 59 | "uncompressed-bytes": 1088378738 60 | }, 61 | { 62 | "target-index": "logs-201998", 63 | "source-file": "documents-201998.unparsed.json.bz2", 64 | "document-count": 13053463, 65 | "compressed-bytes": 63174979, 66 | "uncompressed-bytes": 1456836090 67 | }, 68 | { 69 | "target-index": "logs-211998", 70 | "source-file": "documents-211998.unparsed.json.bz2", 71 | "document-count": 17647279, 72 | "compressed-bytes": 85607179, 73 | "uncompressed-bytes": 1975990671 74 | }, 75 | { 76 | "target-index": "logs-221998", 77 | "source-file": 
"documents-221998.unparsed.json.bz2", 78 | "document-count": 10716760, 79 | "compressed-bytes": 53190976, 80 | "uncompressed-bytes": 1202551382 81 | }, 82 | { 83 | "target-index": "logs-231998", 84 | "source-file": "documents-231998.unparsed.json.bz2", 85 | "document-count": 11961342, 86 | "compressed-bytes": 60705435, 87 | "uncompressed-bytes": 1334381144 88 | }, 89 | { 90 | "target-index": "logs-241998", 91 | "source-file": "documents-241998.unparsed.json.bz2", 92 | "document-count": 181463624, 93 | "compressed-bytes": 897719968, 94 | "uncompressed-bytes": 20563705716 95 | } 96 | ] 97 | } 98 | {%- else %} 99 | { 100 | "name": "http_logs", 101 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs", 102 | "documents": [ 103 | { 104 | "target-index": "logs-181998", 105 | "source-file": "documents-181998.json.bz2", 106 | "document-count": 2708746, 107 | "compressed-bytes": 13815456, 108 | "uncompressed-bytes": 363512754 109 | }, 110 | { 111 | "target-index": "logs-191998", 112 | "source-file": "documents-191998.json.bz2", 113 | "document-count": 9697882, 114 | "compressed-bytes": 49439633, 115 | "uncompressed-bytes": 1301732149 116 | }, 117 | { 118 | "target-index": "logs-201998", 119 | "source-file": "documents-201998.json.bz2", 120 | "document-count": 13053463, 121 | "compressed-bytes": 65623436, 122 | "uncompressed-bytes": 1744012279 123 | }, 124 | { 125 | "target-index": "logs-211998", 126 | "source-file": "documents-211998.json.bz2", 127 | "document-count": 17647279, 128 | "compressed-bytes": 88258230, 129 | "uncompressed-bytes": 2364230815 130 | }, 131 | { 132 | "target-index": "logs-221998", 133 | "source-file": "documents-221998.json.bz2", 134 | "document-count": 10716760, 135 | "compressed-bytes": 54160603, 136 | "uncompressed-bytes": 1438320123 137 | }, 138 | { 139 | "target-index": "logs-231998", 140 | "source-file": "documents-231998.json.bz2", 141 | "document-count": 11961342, 142 | "compressed-bytes": 60927822, 143 | "uncompressed-bytes": 1597530673 144 | }, 145 | { 146 | "target-index": "logs-241998", 147 | "source-file": "documents-241998.json.bz2", 148 | "document-count": 181463624, 149 | "compressed-bytes": 905378242, 150 | "uncompressed-bytes": 24555905444 151 | } 152 | ] 153 | } 154 | {%- endif %} 155 | ], 156 | "operations": [ 157 | {{ rally.collect(parts="operations/*.json") }} 158 | ], 159 | "challenges": [ 160 | {{ rally.collect(parts="challenges/*.json") }} 161 | ] 162 | } 163 | -------------------------------------------------------------------------------- /http_logs/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}}, 6 | "corpora": "http_logs" 7 | }, 8 | { 9 | "name": "index-append-with-ingest-baseline-pipeline", 10 | "operation-type": "bulk", 11 | "bulk-size": {{bulk_size | default(5000)}}, 12 | "ingest-percentage": {{ingest_percentage | default(100)}}, 13 | "pipeline": "http-log-baseline-pipeline", 14 | "corpora": "http_logs" 15 | }, 16 | { 17 | "name": "index-append-with-ingest-grok-pipeline", 18 | "operation-type": "bulk", 19 | "bulk-size": {{bulk_size | default(5000)}}, 20 | "ingest-percentage": {{ingest_percentage | default(100)}}, 21 | "pipeline": "http-log-grok-pipeline", 22 | "corpora": "http_logs_unparsed" 23 | }, 24 | { 25 | "name": "index-append-with-ingest-geoip-pipeline", 26 | "operation-type": 
"bulk", 27 | "bulk-size": {{bulk_size | default(5000)}}, 28 | "ingest-percentage": {{ingest_percentage | default(100)}}, 29 | "pipeline": "http-log-geoip-pipeline", 30 | "corpora": "http_logs" 31 | }, 32 | { 33 | "name": "update", 34 | "operation-type": "bulk", 35 | "bulk-size": {{bulk_size | default(5000)}}, 36 | "ingest-percentage": {{ingest_percentage | default(100)}}, 37 | "conflicts": "{{conflicts | default('random')}}", 38 | "on-conflict": "{{on_conflict | default('update')}}", 39 | "conflict-probability": {{conflict_probability | default(25)}}, 40 | "recency": {{recency | default(0)}}, 41 | "corpora": "http_logs" 42 | }, 43 | { 44 | "name": "default", 45 | "operation-type": "search", 46 | "index": "logs-*", 47 | "body": { 48 | "query": { 49 | "match_all": {} 50 | } 51 | } 52 | }, 53 | { 54 | "name": "term", 55 | "operation-type": "search", 56 | "index": "logs-*", 57 | "body": { 58 | "query": { 59 | "term": { 60 | "request.raw": { 61 | "value": "GET / HTTP/1.0" 62 | } 63 | } 64 | } 65 | } 66 | }, 67 | { 68 | "name": "range", 69 | "operation-type": "search", 70 | "index": "logs-*", 71 | "body": { 72 | "query": { 73 | "range": { 74 | "@timestamp": { 75 | "gte": "now-{{'15-05-1998' | days_ago(now)}}d/d", 76 | "lt": "now/d" 77 | } 78 | } 79 | } 80 | } 81 | }, 82 | { 83 | "name": "hourly_agg", 84 | "operation-type": "search", 85 | "index": "logs-*", 86 | "body": { 87 | "size": 0, 88 | "aggs": { 89 | "by_hour": { 90 | "date_histogram": { 91 | "field": "@timestamp", 92 | "calendar_interval": "hour" 93 | } 94 | } 95 | } 96 | } 97 | }, 98 | { 99 | "name": "scroll", 100 | "operation-type": "search", 101 | "index": "logs-*", 102 | "pages": 25, 103 | "results-per-page": 1000, 104 | "body": { 105 | "query": { 106 | "match_all": {} 107 | } 108 | } 109 | }, 110 | { 111 | "name": "desc_sort_timestamp", 112 | "operation-type": "search", 113 | "index": "logs-*", 114 | "body": { 115 | "query": { 116 | "match_all": {} 117 | }, 118 | "sort" : [ 119 | {"@timestamp" : "desc"} 120 | ] 121 | } 122 | }, 123 | { 124 | "name": "asc_sort_timestamp", 125 | "operation-type": "search", 126 | "index": "logs-*", 127 | "body": { 128 | "query": { 129 | "match_all": {} 130 | }, 131 | "sort" : [ 132 | {"@timestamp" : "asc"} 133 | ] 134 | } 135 | }, 136 | { 137 | "name": "create-http-log-baseline-pipeline", 138 | "operation-type": "put-pipeline", 139 | "id": "http-log-baseline-pipeline", 140 | "body": { 141 | "description": "Process an the documents with a processor that does nothing. Baseline for overhead of pipeline.", 142 | "processors": [ 143 | { 144 | "uppercase": { 145 | "field": "doesnotexist", 146 | "ignore_missing": true 147 | } 148 | } 149 | ] 150 | } 151 | }, 152 | { 153 | "name": "create-http-log-grok-pipeline", 154 | "operation-type": "put-pipeline", 155 | "id": "http-log-grok-pipeline", 156 | "body": { 157 | "description": "Process an http log line with grok. Requires the `unparsed` data set.", 158 | "processors": [ 159 | { 160 | "grok": { 161 | "field": "message", 162 | "patterns": [ 163 | "%{IPORHOST:clientip} %{HTTPDUSER} %{USER} \\[%{TIMESTAMP_ISO8601:@timestamp}\\] \"(?:%{WORD} %{NOTSPACE:request}(?: HTTP/%{NUMBER})?|%{DATA})\" %{NUMBER:status} (?:%{NUMBER:size}|-)" 164 | ] 165 | } 166 | } 167 | ] 168 | } 169 | }, 170 | { 171 | "name": "create-http-log-geoip-pipeline", 172 | "operation-type": "put-pipeline", 173 | "id": "http-log-geoip-pipeline", 174 | "body": { 175 | "description": "Enrich the data with the geo-ip filter. 
Requires --elasticsearch-plugins='ingest-geoip'", 176 | "processors": [ 177 | { 178 | "geoip": { 179 | "field": "clientip", 180 | "properties": [ 181 | "city_name", 182 | "country_name", 183 | "location" 184 | ] 185 | } 186 | } 187 | ] 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /nested/track.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import csv 4 | 5 | 6 | class QueryParamSource: 7 | # We need to stick to the param source API 8 | # noinspection PyUnusedLocal 9 | def __init__(self, track, params, **kwargs): 10 | self._params = params 11 | self.infinite = True 12 | # here we read the queries data file into arrays which we'll then later use randomly. 13 | self.tags = [] 14 | self.dates = [] 15 | # be predictably random. The seed has been chosen by a fair dice roll. ;) 16 | random.seed(4) 17 | cwd = os.path.dirname(__file__) 18 | with open(os.path.join(cwd, "queries.csv"), "r") as ins: 19 | csvreader = csv.reader(ins) 20 | for row in csvreader: 21 | self.tags.append(row[0]) 22 | self.dates.append(row[1]) 23 | 24 | # We need to stick to the param source API 25 | # noinspection PyUnusedLocal 26 | def partition(self, partition_index, total_partitions): 27 | return self 28 | 29 | # Deprecated - only there for BWC reasons with Rally < 1.4.0 30 | def size(self): 31 | return 1 32 | 33 | 34 | class SortedTermQueryParamSource(QueryParamSource): 35 | def params(self): 36 | result = { 37 | "body": { 38 | "query": { 39 | "match": { 40 | "tag": "%s" % random.choice(self.tags) 41 | } 42 | }, 43 | "sort": [ 44 | { 45 | "answers.date": { 46 | "mode": "max", 47 | "order": "desc", 48 | "nested": { 49 | "path": "answers" 50 | } 51 | } 52 | } 53 | ] 54 | }, 55 | "index": None 56 | } 57 | if "cache" in self._params: 58 | result["cache"] = self._params["cache"] 59 | 60 | return result 61 | 62 | 63 | class TermQueryParamSource(QueryParamSource): 64 | def params(self): 65 | result = { 66 | "body": { 67 | "query": { 68 | "match": { 69 | "tag": "%s" % random.choice(self.tags) 70 | } 71 | } 72 | }, 73 | "index": None 74 | } 75 | if "cache" in self._params: 76 | result["cache"] = self._params["cache"] 77 | 78 | return result 79 | 80 | 81 | class NestedQueryParamSource(QueryParamSource): 82 | def params(self): 83 | result = { 84 | "body": { 85 | "query": { 86 | "bool": { 87 | "must": [ 88 | { 89 | "match": { 90 | "tag": "%s" % random.choice(self.tags) 91 | } 92 | }, 93 | { 94 | "nested": { 95 | "path": "answers", 96 | "query": { 97 | "range": { 98 | "answers.date": { 99 | "lte": "%s" % random.choice(self.dates) 100 | } 101 | } 102 | } 103 | } 104 | } 105 | ] 106 | } 107 | } 108 | }, 109 | "index": None 110 | } 111 | if "cache" in self._params: 112 | result["cache"] = self._params["cache"] 113 | 114 | return result 115 | 116 | 117 | class NestedQueryParamSourceWithInnerHits(QueryParamSource): 118 | def params(self): 119 | result = { 120 | "body": { 121 | "query": { 122 | "bool": { 123 | "must": [ 124 | { 125 | "match": { 126 | "tag": "%s" % random.choice(self.tags) 127 | } 128 | }, 129 | { 130 | "nested": { 131 | "path": "answers", 132 | "query": { 133 | "range": { 134 | "answers.date": { 135 | "lte": "%s" % random.choice(self.dates) 136 | } 137 | } 138 | }, 139 | "inner_hits": { 140 | "size": self._params["inner_hits_size"] 141 | } 142 | } 143 | } 144 | ] 145 | } 146 | }, 147 | "size": self._params["size"] 148 | }, 149 | "index": None 150 | } 151 | if "cache" in self._params: 152 | 
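# forward the optional "cache" parameter from the track configuration to the generated search request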
result["cache"] = self._params["cache"] 153 | 154 | return result 155 | 156 | 157 | def register(registry): 158 | registry.register_param_source("nested-query-source", NestedQueryParamSource) 159 | registry.register_param_source("nested-query-source-with-inner-hits", NestedQueryParamSourceWithInnerHits) 160 | registry.register_param_source("term-query-source", TermQueryParamSource) 161 | registry.register_param_source("sorted-term-query-source", SortedTermQueryParamSource) 162 | -------------------------------------------------------------------------------- /noaa/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.queries.cache.enabled": false, 6 | "index.requests.cache.enable": false, 7 | "index.merge.policy.max_merged_segment": "100GB" 8 | }, 9 | "mappings": { 10 | "dynamic": "strict", 11 | "_source": { 12 | "enabled": {{ source_enabled | default(true) | tojson }} 13 | }, 14 | "properties": { 15 | "AWDR": { 16 | "type": "keyword" 17 | }, 18 | "AWND": { 19 | "type": "float" 20 | }, 21 | "DAPR": { 22 | "type": "keyword" 23 | }, 24 | "DASF": { 25 | "type": "keyword" 26 | }, 27 | "DATN": { 28 | "type": "keyword" 29 | }, 30 | "DATX": { 31 | "type": "keyword" 32 | }, 33 | "DWPR": { 34 | "type": "keyword" 35 | }, 36 | "EVAP": { 37 | "type": "float" 38 | }, 39 | "MDPR": { 40 | "type": "float" 41 | }, 42 | "MDSF": { 43 | "type": "keyword" 44 | }, 45 | "MDTN": { 46 | "type": "float" 47 | }, 48 | "MDTRANGE": { 49 | "type": "double_range" 50 | }, 51 | "MDTX": { 52 | "type": "float" 53 | }, 54 | "MNPN": { 55 | "type": "float" 56 | }, 57 | "MXPN": { 58 | "type": "float" 59 | }, 60 | "PGTM": { 61 | "type": "keyword" 62 | }, 63 | "PRCP": { 64 | "type": "float" 65 | }, 66 | "PSUN": { 67 | "type": "keyword" 68 | }, 69 | "SN31": { 70 | "type": "keyword" 71 | }, 72 | "SN32": { 73 | "type": "keyword" 74 | }, 75 | "SN33": { 76 | "type": "keyword" 77 | }, 78 | "SN35": { 79 | "type": "keyword" 80 | }, 81 | "SN36": { 82 | "type": "keyword" 83 | }, 84 | "SN51": { 85 | "type": "keyword" 86 | }, 87 | "SN52": { 88 | "type": "keyword" 89 | }, 90 | "SN53": { 91 | "type": "keyword" 92 | }, 93 | "SN55": { 94 | "type": "keyword" 95 | }, 96 | "SN56": { 97 | "type": "keyword" 98 | }, 99 | "SN57": { 100 | "type": "keyword" 101 | }, 102 | "SNOW": { 103 | "type": "keyword" 104 | }, 105 | "SNWD": { 106 | "type": "keyword" 107 | }, 108 | "SX31": { 109 | "type": "keyword" 110 | }, 111 | "SX32": { 112 | "type": "keyword" 113 | }, 114 | "SX33": { 115 | "type": "keyword" 116 | }, 117 | "SX35": { 118 | "type": "keyword" 119 | }, 120 | "SX36": { 121 | "type": "keyword" 122 | }, 123 | "SX51": { 124 | "type": "keyword" 125 | }, 126 | "SX52": { 127 | "type": "keyword" 128 | }, 129 | "SX53": { 130 | "type": "keyword" 131 | }, 132 | "SX55": { 133 | "type": "keyword" 134 | }, 135 | "SX56": { 136 | "type": "keyword" 137 | }, 138 | "SX57": { 139 | "type": "keyword" 140 | }, 141 | "TAVG": { 142 | "type": "float" 143 | }, 144 | "THIC": { 145 | "type": "float" 146 | }, 147 | "TMAX": { 148 | "type": "float" 149 | }, 150 | "TMIN": { 151 | "type": "float" 152 | }, 153 | "TOBS": { 154 | "type": "float" 155 | }, 156 | "TRANGE": { 157 | "type": "double_range" 158 | }, 159 | "TSUN": { 160 | "type": "keyword" 161 | }, 162 | "WDF2": { 163 | "type": "keyword" 164 | }, 165 | "WDF5": { 166 | "type": "keyword" 167 | }, 168 | "WDFG": { 169 | "type": 
"keyword" 170 | }, 171 | "WDMV": { 172 | "type": "keyword" 173 | }, 174 | "WESD": { 175 | "type": "float" 176 | }, 177 | "WESF": { 178 | "type": "float" 179 | }, 180 | "WSF2": { 181 | "type": "float" 182 | }, 183 | "WSF5": { 184 | "type": "float" 185 | }, 186 | "WSFG": { 187 | "type": "float" 188 | }, 189 | "WSFI": { 190 | "type": "float" 191 | }, 192 | "WT01": { 193 | "type": "keyword" 194 | }, 195 | "WT02": { 196 | "type": "keyword" 197 | }, 198 | "WT03": { 199 | "type": "keyword" 200 | }, 201 | "WT04": { 202 | "type": "keyword" 203 | }, 204 | "WT05": { 205 | "type": "keyword" 206 | }, 207 | "WT06": { 208 | "type": "keyword" 209 | }, 210 | "WT07": { 211 | "type": "keyword" 212 | }, 213 | "WT08": { 214 | "type": "keyword" 215 | }, 216 | "WT09": { 217 | "type": "keyword" 218 | }, 219 | "WT10": { 220 | "type": "keyword" 221 | }, 222 | "WT11": { 223 | "type": "keyword" 224 | }, 225 | "WT17": { 226 | "type": "keyword" 227 | }, 228 | "WT18": { 229 | "type": "keyword" 230 | }, 231 | "date": { 232 | "type": "date" 233 | }, 234 | "station": { 235 | "properties": { 236 | "country": { 237 | "type": "keyword" 238 | }, 239 | "country_code": { 240 | "type": "keyword" 241 | }, 242 | "elevation": { 243 | "type": "float" 244 | }, 245 | "gsn_flag": { 246 | "type": "keyword" 247 | }, 248 | "hcn_crn_flag": { 249 | "type": "keyword" 250 | }, 251 | "id": { 252 | "type": "keyword" 253 | }, 254 | "location": { 255 | "type": "geo_point" 256 | }, 257 | "name": { 258 | "type": "keyword" 259 | }, 260 | "state": { 261 | "type": "keyword" 262 | }, 263 | "state_code": { 264 | "type": "keyword" 265 | }, 266 | "wmo_id": { 267 | "type": "keyword" 268 | } 269 | } 270 | } 271 | } 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /geopoint/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osmgeopoints", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": { 38 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 39 | "max-num-segments": {{max_num_segments}} 40 | {%- endif %} 41 | }, 42 | "clients": 1 43 | }, 44 | { 45 | "name": "refresh-after-force-merge", 46 | "operation": "refresh", 47 | "clients": 1 48 | }, 49 | { 50 | "operation": "polygon", 51 | "clients": 1, 52 | "warmup-iterations": 200, 53 | "iterations": 100, 54 | "target-throughput": 2 55 | }, 56 | { 57 | "operation": "bbox", 58 | "clients": 1, 59 | "warmup-iterations": 200, 60 | "iterations": 100, 61 | "target-throughput": 2 62 | }, 63 | { 64 | "operation": "distance", 65 | "clients": 1, 66 | "warmup-iterations": 200, 67 | "iterations": 100, 68 | "target-throughput": 5 69 | }, 70 | { 71 | "operation": "distanceRange", 72 | "clients": 1, 73 | "warmup-iterations": 200, 74 | "iterations": 100, 75 | "target-throughput": 0.6 76 | } 77 | ] 78 | }, 79 | { 80 | "name": "append-no-conflicts-index-only", 81 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 82 | "schedule": [ 83 | { 84 | "operation": "delete-index" 85 | }, 86 | { 87 | "operation": { 88 | "operation-type": "create-index", 89 | "settings": {{index_settings | default({}) | tojson}} 90 | } 91 | }, 92 | { 93 | "name": "check-cluster-health", 94 | "operation": { 95 | "operation-type": "cluster-health", 96 | "index": "osmgeopoints", 97 | "request-params": { 98 | "wait_for_status": "{{cluster_health | default('green')}}", 99 | "wait_for_no_relocating_shards": "true" 100 | } 101 | } 102 | }, 103 | { 104 | "operation": "index-append", 105 | "warmup-time-period": 120, 106 | "clients": {{bulk_indexing_clients | default(8)}} 107 | }, 108 | { 109 | "name": "refresh-after-index", 110 | "operation": "refresh", 111 | "clients": 1 112 | }, 113 | { 114 | "operation": { 115 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 116 | "max-num-segments": {{max_num_segments}} 117 | {%- endif %} 118 | }, 119 | "clients": 1 120 | }, 121 | { 122 | "name": "refresh-after-force-merge", 123 | "operation": "refresh", 124 | "clients": 1 125 | } 126 | ] 127 | }, 128 | { 129 | "name": "append-fast-with-conflicts", 130 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 131 | "schedule": [ 132 | { 133 | "operation": "delete-index" 134 | }, 135 | { 136 | "operation": { 137 | "operation-type": "create-index", 138 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 139 | "index.refresh_interval": "30s", 140 | "index.number_of_shards": {{number_of_shards | default(6)}}, 141 | "index.translog.flush_threshold_size": "4g" 142 | }{%- endif %} 143 | } 144 | }, 145 | { 146 | "name": "check-cluster-health", 147 | "operation": { 148 | "operation-type": "cluster-health", 149 | "index": "osmgeopoints", 150 | "request-params": { 151 | "wait_for_status": "{{cluster_health | default('green')}}", 152 | "wait_for_no_relocating_shards": "true" 153 | } 154 | } 155 | }, 156 | { 157 | "operation": "index-update", 158 | "warmup-time-period": 120, 159 | "clients": {{bulk_indexing_clients | default(8)}} 160 | }, 161 | { 162 | "name": "refresh-after-index", 163 | "operation": "refresh", 164 | "clients": 1 165 | }, 166 | { 167 | "operation": { 168 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 169 | "max-num-segments": {{max_num_segments}} 170 | {%- endif %} 171 | }, 172 | "clients": 1 173 | }, 174 | { 175 | "name": "refresh-after-force-merge", 176 | "operation": "refresh", 177 | "clients": 1 178 | } 179 | ] 180 | } 181 | -------------------------------------------------------------------------------- /pmc/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": { 8 | "operation-type": "put-settings", 9 | "body": { 10 | "transient": { 11 | "search.default_search_timeout": "{{default_search_timeout | default(-1)}}" 12 | } 13 | } 14 | } 15 | }, 16 | { 17 | "operation": "delete-index" 18 | }, 19 | { 20 | "operation": { 21 | "operation-type": "create-index", 22 | "settings": {{index_settings | default({}) | tojson}} 23 | } 24 | }, 25 | { 26 | "name": "check-cluster-health", 27 | "operation": { 28 | "operation-type": "cluster-health", 29 | "index": "pmc", 30 | "request-params": { 31 | "wait_for_status": "{{cluster_health | default('green')}}", 32 | "wait_for_no_relocating_shards": "true" 33 | } 34 | } 35 | }, 36 | { 37 | "operation": "index-append", 38 | "warmup-time-period": 240, 39 | "clients": {{bulk_indexing_clients | default(8)}} 40 | }, 41 | { 42 | "name": "refresh-after-index", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "force-merge", 48 | "clients": 1 49 | }, 50 | { 51 | "name": "refresh-after-force-merge", 52 | "operation": "refresh", 53 | "clients": 1 54 | }, 55 | { 56 | "operation": "default", 57 | "clients": 1, 58 | "warmup-iterations": 500, 59 | "iterations": 200, 60 | "target-throughput": 20 61 | }, 62 | { 63 | "operation": "term", 64 | "clients": 1, 65 | "warmup-iterations": 500, 66 | "iterations": 200, 67 | "target-throughput": 20 68 | }, 69 | { 70 | "operation": "phrase", 71 | "clients": 1, 72 | "warmup-iterations": 500, 73 | "iterations": 200, 74 | "target-throughput": 20 75 | }, 76 | { 77 | "operation": "articles_monthly_agg_uncached", 78 | "clients": 1, 79 | "warmup-iterations": 500, 80 | "iterations": 200, 81 | "target-throughput": 20 82 | }, 83 | { 84 | "operation": "articles_monthly_agg_cached", 85 | "clients": 1, 86 | "warmup-iterations": 500, 87 | "iterations": 200, 88 | "target-throughput": 20 89 | }, 90 | { 91 | "operation": "scroll", 92 | "clients": 1, 93 | "warmup-iterations": 50, 94 | "iterations": 100, 95 | "target-throughput": 0.5 96 | } 97 | ] 98 | }, 99 | { 100 | "name": "append-no-conflicts-index-only", 101 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. 
Document ids are unique so all index operations are append only.", 102 | "schedule": [ 103 | { 104 | "operation": "delete-index" 105 | }, 106 | { 107 | "operation": { 108 | "operation-type": "create-index", 109 | "settings": {{index_settings | default({}) | tojson}} 110 | } 111 | }, 112 | { 113 | "name": "check-cluster-health", 114 | "operation": { 115 | "operation-type": "cluster-health", 116 | "index": "pmc", 117 | "request-params": { 118 | "wait_for_status": "{{cluster_health | default('green')}}", 119 | "wait_for_no_relocating_shards": "true" 120 | } 121 | } 122 | }, 123 | { 124 | "operation": "index-append", 125 | "warmup-time-period": 240, 126 | "clients": {{bulk_indexing_clients | default(8)}} 127 | }, 128 | { 129 | "name": "refresh-after-index", 130 | "operation": "refresh", 131 | "clients": 1 132 | }, 133 | { 134 | "operation": "force-merge", 135 | "clients": 1 136 | }, 137 | { 138 | "name": "refresh-after-force-merge", 139 | "operation": "refresh", 140 | "clients": 1 141 | } 142 | ] 143 | }, 144 | { 145 | "name": "append-sorted-no-conflicts", 146 | "description": "Indexes the whole document corpus in an index sorted by timestamp field in descending order (most recent first). Document ids are unique so all index operations are append only.", 147 | "schedule": [ 148 | { 149 | "operation": "delete-index" 150 | }, 151 | { 152 | "operation": { 153 | "operation-type": "create-index", 154 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 155 | "index.sort.field": "timestamp", 156 | "index.sort.order": "desc" 157 | }{%- endif %} 158 | } 159 | }, 160 | { 161 | "name": "check-cluster-health", 162 | "operation": { 163 | "operation-type": "cluster-health", 164 | "index": "pmc", 165 | "request-params": { 166 | "wait_for_status": "{{cluster_health | default('green')}}", 167 | "wait_for_no_relocating_shards": "true" 168 | } 169 | } 170 | }, 171 | { 172 | "operation": "index-append", 173 | "warmup-time-period": 240, 174 | "clients": {{bulk_indexing_clients | default(8)}} 175 | }, 176 | { 177 | "name": "refresh-after-index", 178 | "operation": "refresh", 179 | "clients": 1 180 | }, 181 | { 182 | "operation": "force-merge", 183 | "clients": 1 184 | }, 185 | { 186 | "name": "refresh-after-force-merge", 187 | "operation": "refresh", 188 | "clients": 1 189 | } 190 | ] 191 | }, 192 | { 193 | "name": "append-fast-with-conflicts", 194 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 195 | "schedule": [ 196 | { 197 | "operation": "delete-index" 198 | }, 199 | { 200 | "operation": { 201 | "operation-type": "create-index", 202 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 203 | "index.refresh_interval": "30s", 204 | "index.number_of_shards": {{number_of_shards | default(6)}}, 205 | "index.translog.flush_threshold_size": "4g" 206 | }{%- endif %} 207 | } 208 | }, 209 | { 210 | "name": "check-cluster-health", 211 | "operation": { 212 | "operation-type": "cluster-health", 213 | "index": "pmc", 214 | "request-params": { 215 | "wait_for_status": "{{cluster_health | default('green')}}", 216 | "wait_for_no_relocating_shards": "true" 217 | } 218 | } 219 | }, 220 | { 221 | "operation": "index-update", 222 | "warmup-time-period": 240, 223 | "clients": {{bulk_indexing_clients | default(8)}} 224 | }, 225 | { 226 | "name": "refresh-after-index", 227 | "operation": "refresh", 228 | "clients": 1 229 | }, 230 | { 231 | "operation": "force-merge", 232 | "clients": 1 233 | }, 234 | { 235 | "name": "refresh-after-force-merge", 236 | "operation": "refresh", 237 | "clients": 1 238 | } 239 | ] 240 | } 241 | -------------------------------------------------------------------------------- /noaa/_tools/process.py: -------------------------------------------------------------------------------- 1 | #################################################################### 2 | # 3 | # process the csv file into Elasticsearch json documents 4 | # 5 | #################################################################### 6 | 7 | import os 8 | import csv 9 | import json 10 | from datetime import datetime 11 | 12 | stationsFile = 'ghcnd-stations.txt' 13 | countriesFile = 'ghcnd-countries.txt' 14 | statesFile = 'ghcnd-states.txt' 15 | 16 | weatherDataFiles = ['2014-sorted.csv', '2015-sorted.csv', '2016-sorted.csv'] 17 | indexPrefix = 'weather-data' 18 | docType = 'summary' 19 | 20 | def loadStatesFile(statesFile): 21 | statesMap = {} 22 | with open(statesFile, 'r') as file: 23 | csvreader = csv.reader(file, delimiter=' ', quotechar='"') 24 | for row in csvreader: 25 | statesMap[row[0].strip()] = row[1].strip() 26 | return statesMap 27 | 28 | def loadCountriesFile(countriesFile): 29 | countriesMap = {} 30 | with open(countriesFile, 'r') as file: 31 | csvreader = csv.reader(file, delimiter=' ', quotechar='"') 32 | for row in csvreader: 33 | countriesMap[row[0].strip()] = row[1].strip() 34 | return countriesMap 35 | 36 | def loadStationsFile(stationsFile, statesFile, countriesFile): 37 | statesMap = loadStatesFile(statesFile) 38 | countriesMap = loadCountriesFile(countriesFile) 39 | stationsMap = {} 40 | with open(stationsFile, 'r') as file: 41 | for row in file: 42 | try: 43 | station = {} 44 | station['id'] = row[0:11].strip() 45 | countryCode = row[0:2].strip() 46 | if len(countryCode) > 0: 47 | station['country_code'] = countryCode 48 | station['country'] = countriesMap[countryCode] 49 | station['location'] = { 50 | 'lat': float(row[12:20].strip()), 51 | 'lon': float(row[21:30].strip()) 52 | } 53 | station['elevation'] = float(row[31:37].strip()) 54 | if countryCode == 'US': 55 | stateCode = row[38:40].strip() 56 | if len(stateCode) > 0: 57 | station['state_code'] = stateCode 58 | station['state'] = statesMap[stateCode] 59 | station['name'] = row[41:71].strip() 60 | gsn_flag = row[72:75].strip() 61 | 
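# the GSN flag, HCN/CRN flag and WMO id columns of the fixed-width station record are optional, so they are only added when non-empty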
if len(gsn_flag) > 0: 62 | station['gsn_flag'] = gsn_flag 63 | hcn_crn_flag = row[76:78].strip() 64 | if len(hcn_crn_flag) > 0: 65 | station['hcn_crn_flag'] = hcn_crn_flag 66 | wmo_id = row[80:85].strip() 67 | if len(wmo_id) > 0: 68 | station['wmo_id'] = wmo_id 69 | stationsMap[station['id']] = station 70 | except: 71 | print(row) 72 | raise e 73 | return stationsMap 74 | 75 | def processWeatherDoc(currentStationDoc): 76 | if 'TMAX' in currentStationDoc: 77 | currentStationDoc['TMAX'] = float(currentStationDoc['TMAX']) / 10.0 78 | if 'TMIN' in currentStationDoc: 79 | currentStationDoc['TMIN'] = float(currentStationDoc['TMIN']) / 10.0 80 | if 'PRCP' in currentStationDoc: 81 | currentStationDoc['PRCP'] = float(currentStationDoc['PRCP']) / 10.0 82 | if 'AWND' in currentStationDoc: 83 | currentStationDoc['AWND'] = float(currentStationDoc['AWND']) / 10.0 84 | if 'EVAP' in currentStationDoc: 85 | currentStationDoc['EVAP'] = float(currentStationDoc['EVAP']) / 10.0 86 | if 'MDEV' in currentStationDoc: 87 | currentStationDoc['MDEV'] = float(currentStationDoc['MDEV']) / 10.0 88 | if 'MDPR' in currentStationDoc: 89 | currentStationDoc['MDPR'] = float(currentStationDoc['MDPR']) / 10.0 90 | if 'MDTN' in currentStationDoc: 91 | currentStationDoc['MDTN'] = float(currentStationDoc['MDTN']) / 10.0 92 | if 'MDTX' in currentStationDoc: 93 | currentStationDoc['MDTX'] = float(currentStationDoc['MDTX']) / 10.0 94 | if 'MNPN' in currentStationDoc: 95 | currentStationDoc['MNPN'] = float(currentStationDoc['MNPN']) / 10.0 96 | if 'MXPN' in currentStationDoc: 97 | currentStationDoc['MXPN'] = float(currentStationDoc['MXPN']) / 10.0 98 | if 'TAVG' in currentStationDoc: 99 | currentStationDoc['TAVG'] = float(currentStationDoc['TAVG']) / 10.0 100 | if 'THIC' in currentStationDoc: 101 | currentStationDoc['THIC'] = float(currentStationDoc['THIC']) / 10.0 102 | if 'TOBS' in currentStationDoc: 103 | currentStationDoc['TOBS'] = float(currentStationDoc['TOBS']) / 10.0 104 | if 'WESD' in currentStationDoc: 105 | currentStationDoc['WESD'] = float(currentStationDoc['WESD']) / 10.0 106 | if 'WESF' in currentStationDoc: 107 | currentStationDoc['WESF'] = float(currentStationDoc['WESF']) / 10.0 108 | if 'WSF1' in currentStationDoc: 109 | currentStationDoc['WSF1'] = float(currentStationDoc['WSF1']) / 10.0 110 | if 'WSF2' in currentStationDoc: 111 | currentStationDoc['WSF2'] = float(currentStationDoc['WSF2']) / 10.0 112 | if 'WSF5' in currentStationDoc: 113 | currentStationDoc['WSF5'] = float(currentStationDoc['WSF5']) / 10.0 114 | if 'WSFG' in currentStationDoc: 115 | currentStationDoc['WSFG'] = float(currentStationDoc['WSFG']) / 10.0 116 | if 'WSFI' in currentStationDoc: 117 | currentStationDoc['WSFI'] = float(currentStationDoc['WSFI']) / 10.0 118 | if 'WSFM' in currentStationDoc: 119 | currentStationDoc['WSFM'] = float(currentStationDoc['WSFM']) / 10.0 120 | 121 | if 'TMIN' in currentStationDoc and 'TMAX' in currentStationDoc: 122 | if currentStationDoc['TMIN'] > currentStationDoc['TMAX']: 123 | tmp = currentStationDoc['TMIN'] 124 | currentStationDoc['TMIN'] = currentStationDoc['TMAX'] 125 | currentStationDoc['TMAX'] = tmp 126 | currentStationDoc['TRANGE'] = { 127 | "gte" : currentStationDoc['TMIN'], 128 | "lte" : currentStationDoc['TMAX'] 129 | } 130 | if 'MDTN' in currentStationDoc and 'MDTX' in currentStationDoc: 131 | if currentStationDoc['MDTN'] > currentStationDoc['MDTX']: 132 | tmp = currentStationDoc['MDTN'] 133 | currentStationDoc['MDTN'] = currentStationDoc['MDTX'] 134 | currentStationDoc['MDTX'] = tmp 135 | 
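# after ensuring min <= max, store the multiday temperature bounds as a range object so they fit the double_range MDTRANGE field defined in index.json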
currentStationDoc['MDTRANGE'] = { 136 | "gte" : currentStationDoc['MDTN'], 137 | "lte" : currentStationDoc['MDTX'] 138 | } 139 | 140 | indexDoc = { 141 | '_op_type': 'create', 142 | '_index': indexPrefix + '-' + str(currentStationDoc['date'].year), 143 | '_type': docType, 144 | '_id': currentStationDoc['date'].strftime('%Y-%m-%d') + '-' + currentStationDoc['station']['id'], 145 | '_source': currentStationDoc 146 | } 147 | return indexDoc 148 | 149 | def processWeatherFile(weatherDataFile, stationsMap): 150 | with open(weatherDataFile, 'r') as file: 151 | csvreader = csv.reader(file, delimiter=',', quotechar='"') 152 | currentStationDoc = None 153 | stationDocsProcessed = 0 154 | for row in csvreader: 155 | station = stationsMap[row[0]] 156 | date = datetime.strptime(row[1], '%Y%m%d') 157 | elementType = row[2] 158 | elementValue = row[3] 159 | if currentStationDoc == None: 160 | currentStationDoc = { 161 | 'station': station, 162 | 'date': date, 163 | elementType: elementValue 164 | } 165 | elif currentStationDoc['station'] != station or currentStationDoc['date'] != date: 166 | yield processWeatherDoc(currentStationDoc) 167 | stationDocsProcessed = stationDocsProcessed + 1 168 | currentStationDoc = { 169 | 'station': station, 170 | 'date': date, 171 | elementType: elementValue 172 | } 173 | else: 174 | currentStationDoc[elementType] = elementValue 175 | 176 | stationsMap = loadStationsFile(stationsFile, statesFile, countriesFile) 177 | outFile = 'documents.json' 178 | with open(outFile, 'w+') as file: 179 | count = 0 180 | for weatherDataFile in weatherDataFiles: 181 | for doc in processWeatherFile(weatherDataFile, stationsMap): 182 | doc['_source']['date'] = doc['_source']['date'].isoformat() 183 | file.write(json.dumps(doc['_source'])) 184 | file.write('\n') 185 | count = count + 1 186 | print('Wrote ' + str(count) + ' entries') -------------------------------------------------------------------------------- /geonames/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "term", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "term": { 32 | "country_code": "AT" 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "phrase", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "match_phrase": { 43 | "name": "Sankt Georgen" 44 | } 45 | } 46 | } 47 | }, 48 | { 49 | "name": "country_agg_uncached", 50 | "operation-type": "search", 51 | "body": { 52 | "size": 0, 53 | "aggs": { 54 | "country_population": { 55 | "terms": { 56 | "field": "country_code" 57 | }, 58 | "aggs": { 59 | "sum_population": { 60 | "sum": { 61 | "field": "population" 62 | } 63 | } 64 | } 65 | } 66 | } 67 | } 68 | }, 69 | { 70 | "name": "country_agg_cached", 71 | "operation-type": "search", 72 | "cache": true, 
73 | "body": { 74 | "size": 0, 75 | "aggs": { 76 | "country_population": { 77 | "terms": { 78 | "field": "country_code" 79 | }, 80 | "aggs": { 81 | "sum_population": { 82 | "sum": { 83 | "field": "population" 84 | } 85 | } 86 | } 87 | } 88 | } 89 | } 90 | }, 91 | { 92 | "name": "scroll", 93 | "operation-type": "search", 94 | "pages": 25, 95 | "results-per-page": 1000, 96 | "body": { 97 | "query": { 98 | "match_all": {} 99 | } 100 | } 101 | }, 102 | { 103 | "name": "expression", 104 | "operation-type": "search", 105 | "body": { 106 | "query": { 107 | "function_score": { 108 | "query": { 109 | "match_all": {} 110 | }, 111 | "functions": [ 112 | { 113 | "script_score": { 114 | "script": { 115 | "source": "abs(ln(abs(doc['population']) + 1) + doc['location'].lon + doc['location'].lat) * _score", 116 | "lang": "expression" 117 | } 118 | } 119 | } 120 | ] 121 | } 122 | } 123 | } 124 | }, 125 | { 126 | "name": "painless_static", 127 | "operation-type": "search", 128 | "body": { 129 | "query": { 130 | "function_score": { 131 | "query": { 132 | "match_all": {} 133 | }, 134 | "functions": [ 135 | { 136 | "script_score": { 137 | "script": { 138 | "source": "Math.abs(Math.log(Math.abs((int)((List)doc.population).get(0)) + 1) + (double)(doc.location.lon) * (double)(doc.location.lat))/_score", 139 | "lang": "painless" 140 | } 141 | } 142 | } 143 | ] 144 | } 145 | } 146 | } 147 | }, 148 | { 149 | "name": "painless_dynamic", 150 | "operation-type": "search", 151 | "body": { 152 | "query": { 153 | "function_score": { 154 | "query": { 155 | "match_all": {} 156 | }, 157 | "functions": [ 158 | { 159 | "script_score": { 160 | "script": { 161 | "source": "Math.abs(Math.log(Math.abs(doc['population'].value) + 1) + doc['location'].lon * doc['location'].lat)/_score", 162 | "lang": "painless" 163 | } 164 | } 165 | } 166 | ] 167 | } 168 | } 169 | } 170 | }, 171 | { 172 | "name": "decay_geo_gauss_function_score", 173 | "operation-type": "search", 174 | "body": { 175 | "query": { 176 | "function_score": { 177 | "query": { 178 | "match_all": {} 179 | }, 180 | "gauss": { 181 | "location": { 182 | "origin": "52.37, 4.8951", 183 | "scale": "500km", 184 | "offset": "0km", 185 | "decay" : 0.1 186 | } 187 | } 188 | } 189 | } 190 | } 191 | }, 192 | { 193 | "name": "decay_geo_gauss_script_score", 194 | "operation-type": "search", 195 | "body": { 196 | "query": { 197 | "script_score": { 198 | "query": { 199 | "match_all": {} 200 | }, 201 | "script": { 202 | "source": "decayGeoGauss(params.origin, params.scale, params.offset, params.decay, doc['location'].value)", 203 | "params": { 204 | "origin": "52.37, 4.8951", 205 | "scale": "500km", 206 | "offset": "0km", 207 | "decay" : 0.1 208 | } 209 | } 210 | } 211 | } 212 | } 213 | }, 214 | { 215 | "name": "field_value_function_score", 216 | "operation-type": "search", 217 | "body": { 218 | "query": { 219 | "function_score": { 220 | "query": { 221 | "match_all": {} 222 | }, 223 | "field_value_factor": { 224 | "field": "population", 225 | "factor": 1.2, 226 | "modifier": "log2p" 227 | } 228 | } 229 | } 230 | } 231 | }, 232 | { 233 | "name": "field_value_script_score", 234 | "operation-type": "search", 235 | "body": { 236 | "query": { 237 | "script_score": { 238 | "query": { 239 | "match_all": {} 240 | }, 241 | "script": { 242 | "source": "Math.log10(doc['population'].value * 1.2 + 2)" 243 | } 244 | } 245 | } 246 | } 247 | }, 248 | { 249 | "name": "random_function_score", 250 | "operation-type": "search", 251 | "body": { 252 | "query": { 253 | "function_score": { 254 | "query": { 255 | 
"match_all": {} 256 | }, 257 | "random_score": { 258 | "seed": 100, 259 | "field": "_seq_no" 260 | } 261 | } 262 | } 263 | } 264 | }, 265 | { 266 | "name": "random_script_score", 267 | "operation-type": "search", 268 | "body": { 269 | "query": { 270 | "script_score": { 271 | "query": { 272 | "match_all": {} 273 | }, 274 | "script": { 275 | "source": "randomScore(100, '_seq_no')" 276 | } 277 | } 278 | } 279 | } 280 | }, 281 | { 282 | "name": "large_terms", 283 | "operation-type": "search", 284 | "param-source": "pure-terms-query-source" 285 | }, 286 | { 287 | "name": "large_filtered_terms", 288 | "operation-type": "search", 289 | "param-source": "filtered-terms-query-source" 290 | }, 291 | { 292 | "name": "large_prohibited_terms", 293 | "operation-type": "search", 294 | "param-source": "prohibited-terms-query-source" 295 | }, 296 | { 297 | "name": "desc_sort_population", 298 | "operation-type": "search", 299 | "body": { 300 | "query": { 301 | "match_all": {} 302 | }, 303 | "sort" : [ 304 | {"population" : "desc"} 305 | ] 306 | } 307 | }, 308 | { 309 | "name": "asc_sort_population", 310 | "operation-type": "search", 311 | "body": { 312 | "query": { 313 | "match_all": {} 314 | }, 315 | "sort" : [ 316 | {"population" : "asc"} 317 | ] 318 | } 319 | }, 320 | { 321 | "name": "desc_sort_geonameid", 322 | "operation-type": "search", 323 | "body": { 324 | "query": { 325 | "match_all": {} 326 | }, 327 | "sort" : [ 328 | {"geonameid" : "desc"} 329 | ] 330 | } 331 | }, 332 | { 333 | "name": "asc_sort_geonameid", 334 | "operation-type": "search", 335 | "body": { 336 | "query": { 337 | "match_all": {} 338 | }, 339 | "sort" : [ 340 | {"geonameid" : "asc"} 341 | ] 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /geonames/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "geonames", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "index-stats", 47 | "clients": 1, 48 | "warmup-iterations": 500, 49 | "iterations": 1000, 50 | "target-throughput": 90 51 | }, 52 | { 53 | "operation": "node-stats", 54 | "clients": 1, 55 | "warmup-iterations": 100, 56 | "iterations": 1000, 57 | "target-throughput": 90 58 | }, 59 | { 60 | "operation": "default", 61 | "clients": 1, 62 | "warmup-iterations": 500, 63 | "iterations": 1000, 64 | "target-throughput": 50 65 | }, 66 | { 67 | "operation": "term", 68 | "clients": 1, 69 | "warmup-iterations": 500, 70 | "iterations": 1000, 71 | "target-throughput": 200 72 | }, 73 | { 74 | "operation": "phrase", 75 | "clients": 1, 76 | "warmup-iterations": 500, 77 | "iterations": 1000, 78 | "target-throughput": 200 79 | }, 80 | { 81 | "operation": "country_agg_uncached", 82 | "clients": 1, 83 | "warmup-iterations": 200, 84 | "iterations": 100, 85 | "target-throughput": 4 86 | }, 87 | { 88 | "operation": "country_agg_cached", 89 | "clients": 1, 90 | "warmup-iterations": 500, 91 | "iterations": 1000, 92 | "target-throughput": 100 93 | }, 94 | { 95 | "operation": "scroll", 96 | "clients": 1, 97 | "warmup-iterations": 200, 98 | "iterations": 100, 99 | "#COMMENT": "Throughput is considered per request. 
So we issue one scroll request per second which will retrieve 25 pages", 100 | "target-throughput": 0.8 101 | }, 102 | { 103 | "operation": "expression", 104 | "clients": 1, 105 | "warmup-iterations": 200, 106 | "iterations": 100, 107 | "target-throughput": 2 108 | }, 109 | { 110 | "operation": "painless_static", 111 | "clients": 1, 112 | "warmup-iterations": 200, 113 | "iterations": 100, 114 | "target-throughput": 1.5 115 | }, 116 | { 117 | "operation": "painless_dynamic", 118 | "clients": 1, 119 | "warmup-iterations": 200, 120 | "iterations": 100, 121 | "target-throughput": 1.5 122 | }, 123 | { 124 | "operation": "decay_geo_gauss_function_score", 125 | "clients": 1, 126 | "warmup-iterations": 200, 127 | "iterations": 100, 128 | "target-throughput": 1 129 | }, 130 | { 131 | "operation": "decay_geo_gauss_script_score", 132 | "clients": 1, 133 | "warmup-iterations": 200, 134 | "iterations": 100, 135 | "target-throughput": 1 136 | }, 137 | { 138 | "operation": "field_value_function_score", 139 | "clients": 1, 140 | "warmup-iterations": 200, 141 | "iterations": 100, 142 | "target-throughput": 1.5 143 | }, 144 | { 145 | "operation": "field_value_script_score", 146 | "clients": 1, 147 | "warmup-iterations": 200, 148 | "iterations": 100, 149 | "target-throughput": 1.5 150 | }, 151 | { 152 | "operation": "random_function_score", 153 | "clients": 1, 154 | "warmup-iterations": 200, 155 | "iterations": 100, 156 | "target-throughput": 1.5 157 | }, 158 | { 159 | "operation": "random_script_score", 160 | "clients": 1, 161 | "warmup-iterations": 200, 162 | "iterations": 100, 163 | "target-throughput": 1.5 164 | }, 165 | { 166 | "operation": "large_terms", 167 | "clients": 1, 168 | "warmup-iterations": 200, 169 | "iterations": 100, 170 | "target-throughput": 1.5 171 | }, 172 | { 173 | "operation": "large_filtered_terms", 174 | "clients": 1, 175 | "warmup-iterations": 200, 176 | "iterations": 100, 177 | "target-throughput": 1.5 178 | }, 179 | { 180 | "operation": "large_prohibited_terms", 181 | "clients": 1, 182 | "warmup-iterations": 200, 183 | "iterations": 100, 184 | "target-throughput": 1.5 185 | }, 186 | { 187 | "operation": "desc_sort_population", 188 | "clients": 1, 189 | "warmup-iterations": 200, 190 | "iterations": 100, 191 | "target-throughput": 1.5 192 | }, 193 | { 194 | "operation": "asc_sort_population", 195 | "clients": 1, 196 | "warmup-iterations": 200, 197 | "iterations": 100, 198 | "target-throughput": 1.5 199 | }, 200 | { 201 | "operation": "desc_sort_geonameid", 202 | "clients": 1, 203 | "warmup-iterations": 200, 204 | "iterations": 100, 205 | "target-throughput": 6 206 | }, 207 | { 208 | "operation": "asc_sort_geonameid", 209 | "clients": 1, 210 | "warmup-iterations": 200, 211 | "iterations": 100, 212 | "target-throughput": 6 213 | } 214 | ] 215 | }, 216 | { 217 | "name": "append-no-conflicts-index-only", 218 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. 
Document ids are unique so all index operations are append only.", 219 | "schedule": [ 220 | { 221 | "operation": "delete-index" 222 | }, 223 | { 224 | "operation": { 225 | "operation-type": "create-index", 226 | "settings": {{index_settings | default({}) | tojson}} 227 | } 228 | }, 229 | { 230 | "name": "check-cluster-health", 231 | "operation": { 232 | "operation-type": "cluster-health", 233 | "index": "geonames", 234 | "request-params": { 235 | "wait_for_status": "{{cluster_health | default('green')}}", 236 | "wait_for_no_relocating_shards": "true" 237 | } 238 | } 239 | }, 240 | { 241 | "operation": "index-append", 242 | "warmup-time-period": 120, 243 | "clients": {{bulk_indexing_clients | default(8)}} 244 | }, 245 | { 246 | "operation": "force-merge", 247 | "clients": 1 248 | } 249 | ] 250 | }, 251 | { 252 | "name": "append-sorted-no-conflicts", 253 | "description": "Indexes the whole document corpus in an index sorted by country_code field in ascending order. Document ids are unique so all index operations are append only.", 254 | "schedule": [ 255 | { 256 | "operation": "delete-index" 257 | }, 258 | { 259 | "operation": { 260 | "operation-type": "create-index", 261 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 262 | "index.sort.field": ["country_code.raw", "admin1_code.raw"], 263 | "index.sort.order": ["asc", "asc"] 264 | }{%- endif %} 265 | } 266 | }, 267 | { 268 | "name": "check-cluster-health", 269 | "operation": { 270 | "operation-type": "cluster-health", 271 | "index": "geonames", 272 | "request-params": { 273 | "wait_for_status": "{{cluster_health | default('green')}}", 274 | "wait_for_no_relocating_shards": "true" 275 | } 276 | } 277 | }, 278 | { 279 | "operation": "index-append", 280 | "warmup-time-period": 120, 281 | "clients": {{bulk_indexing_clients | default(8)}} 282 | }, 283 | { 284 | "operation": "force-merge", 285 | "clients": 1 286 | } 287 | ] 288 | }, 289 | { 290 | "name": "append-fast-with-conflicts", 291 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 292 | "schedule": [ 293 | { 294 | "operation": "delete-index" 295 | }, 296 | { 297 | "operation": { 298 | "operation-type": "create-index", 299 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 300 | "index.refresh_interval": "30s", 301 | "index.number_of_shards": {{number_of_shards | default(6)}}, 302 | "index.translog.flush_threshold_size": "4g" 303 | }{%- endif %} 304 | } 305 | }, 306 | { 307 | "name": "check-cluster-health", 308 | "operation": { 309 | "operation-type": "cluster-health", 310 | "index": "geonames", 311 | "request-params": { 312 | "wait_for_status": "{{cluster_health | default('green')}}", 313 | "wait_for_no_relocating_shards": "true" 314 | } 315 | } 316 | }, 317 | { 318 | "operation": "index-update", 319 | "warmup-time-period": 45, 320 | "clients": {{bulk_indexing_clients | default(8)}} 321 | }, 322 | { 323 | "operation": "force-merge", 324 | "clients": 1 325 | } 326 | ] 327 | } 328 | -------------------------------------------------------------------------------- /nyc_taxis/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 13 | "index.codec": "best_compression", 14 | "index.refresh_interval": "30s", 15 | "index.translog.flush_threshold_size": "4g" 16 | }{%- endif %} 17 | } 18 | }, 19 | { 20 | "name": "check-cluster-health", 21 | "operation": { 22 | "operation-type": "cluster-health", 23 | "index": "nyc_taxis", 24 | "request-params": { 25 | "wait_for_status": "{{cluster_health | default('green')}}", 26 | "wait_for_no_relocating_shards": "true" 27 | } 28 | } 29 | }, 30 | { 31 | "operation": "index", 32 | "warmup-time-period": 240, 33 | "clients": {{bulk_indexing_clients | default(8)}} 34 | }, 35 | { 36 | "name": "refresh-after-index", 37 | "operation": "refresh", 38 | "clients": 1 39 | }, 40 | { 41 | "operation": "default", 42 | "clients": 1, 43 | "warmup-iterations": 50, 44 | "iterations": 100, 45 | "target-throughput": 3 46 | }, 47 | { 48 | "operation": "range", 49 | "clients": 1, 50 | "warmup-iterations": 50, 51 | "iterations": 100, 52 | "target-throughput": 1 53 | }, 54 | { 55 | "operation": "distance_amount_agg", 56 | "clients": 1, 57 | "warmup-iterations": 50, 58 | "iterations": 100, 59 | "target-throughput": 2 60 | }, 61 | { 62 | "operation": "autohisto_agg", 63 | "clients": 1, 64 | "warmup-iterations": 50, 65 | "iterations": 100, 66 | "target-throughput": 1.5 67 | }, 68 | { 69 | "operation": "date_histogram_agg", 70 | "clients": 1, 71 | "warmup-iterations": 50, 72 | "iterations": 100, 73 | "target-throughput": 1.5 74 | } 75 | ] 76 | }, 77 | { 78 | "name": "append-no-conflicts-index-only", 79 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings 
and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only.", 80 | "schedule": [ 81 | { 82 | "operation": "delete-index" 83 | }, 84 | { 85 | "operation": { 86 | "operation-type": "create-index", 87 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 88 | "index.codec": "best_compression", 89 | "index.refresh_interval": "30s", 90 | "index.translog.flush_threshold_size": "4g" 91 | }{%- endif %} 92 | } 93 | }, 94 | { 95 | "name": "check-cluster-health", 96 | "operation": { 97 | "operation-type": "cluster-health", 98 | "index": "nyc_taxis", 99 | "request-params": { 100 | "wait_for_status": "{{cluster_health | default('green')}}", 101 | "wait_for_no_relocating_shards": "true" 102 | } 103 | } 104 | }, 105 | { 106 | "operation": "index", 107 | "warmup-time-period": 240, 108 | "clients": {{bulk_indexing_clients | default(8)}} 109 | }, 110 | { 111 | "name": "refresh-after-index", 112 | "operation": "refresh", 113 | "clients": 1 114 | } 115 | ] 116 | }, 117 | { 118 | "name": "append-sorted-no-conflicts-index-only", 119 | "description": "Indexes the whole document corpus in an index sorted by pickup_datetime field in descending order (most recent first) and using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only.", 120 | "schedule": [ 121 | { 122 | "operation": "delete-index" 123 | }, 124 | { 125 | "operation": { 126 | "operation-type": "create-index", 127 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 128 | "index.codec": "best_compression", 129 | "index.refresh_interval": "30s", 130 | "index.translog.flush_threshold_size": "4g", 131 | "index.sort.field": "pickup_datetime", 132 | "index.sort.order": "desc" 133 | }{%- endif %} 134 | } 135 | }, 136 | { 137 | "name": "check-cluster-health", 138 | "operation": { 139 | "operation-type": "cluster-health", 140 | "index": "nyc_taxis", 141 | "request-params": { 142 | "wait_for_status": "{{cluster_health | default('green')}}", 143 | "wait_for_no_relocating_shards": "true" 144 | } 145 | } 146 | }, 147 | { 148 | "operation": "index", 149 | "warmup-time-period": 240, 150 | "clients": {{bulk_indexing_clients | default(8)}} 151 | }, 152 | { 153 | "name": "refresh-after-index", 154 | "operation": "refresh", 155 | "clients": 1 156 | } 157 | ] 158 | }, 159 | { 160 | "name": "update", 161 | "schedule": [ 162 | { 163 | "operation": "delete-index" 164 | }, 165 | { 166 | "operation": { 167 | "operation-type": "create-index", 168 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 169 | "index.number_of_shards": {{number_of_shards | default(1)}}, 170 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 171 | "index.store.type": "{{store_type | default('fs')}}" 172 | }{%- endif %} 173 | } 174 | }, 175 | { 176 | "name": "check-cluster-health", 177 | "operation": { 178 | "operation-type": "cluster-health", 179 | "index": "nyc_taxis", 180 | "request-params": { 181 | "wait_for_status": "{{cluster_health | default('green')}}", 182 | "wait_for_no_relocating_shards": "true" 183 | } 184 | } 185 | }, 186 | { 187 | "operation": "update", 188 | "warmup-time-period": 1200, 189 | "clients": {{bulk_indexing_clients | default(8)}} 190 | }, 191 | { 192 | "name": "refresh-after-index", 193 | "operation": "refresh", 194 | "clients": 1 195 | }, 196 
| { 197 | "operation": "force-merge", 198 | "clients": 1 199 | }, 200 | { 201 | "name": "refresh-after-force-merge", 202 | "operation": "refresh", 203 | "clients": 1 204 | } 205 | ] 206 | }, 207 | {% set ml_job_id="benchmark_ml_job" %} 208 | {% set ml_feed_id="benchmark_nyc_taxis_feed" %} 209 | { 210 | "name": "append-ml", 211 | "description": "Indexes the whole document corpus and executes a machine learning job", 212 | "schedule": [ 213 | { 214 | "operation": "delete-index" 215 | }, 216 | { 217 | "operation": { 218 | "operation-type": "create-index", 219 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 220 | "index.codec": "best_compression", 221 | "index.refresh_interval": "30s", 222 | "index.translog.flush_threshold_size": "4g" 223 | }{%- endif %} 224 | } 225 | }, 226 | { 227 | "operation": { 228 | "operation-type": "delete-ml-datafeed", 229 | "datafeed-id": "{{ml_feed_id}}", 230 | "force": true 231 | } 232 | }, 233 | { 234 | "operation": { 235 | "operation-type": "delete-ml-job", 236 | "job-id": "{{ml_job_id}}", 237 | "force": true 238 | } 239 | }, 240 | { 241 | "operation": { 242 | "operation-type": "create-ml-job", 243 | "job-id": "{{ml_job_id}}", 244 | "body": { 245 | "description": "NYC Taxis (count)", 246 | "analysis_config": { 247 | "bucket_span": "1h", 248 | "summary_count_field_name": "doc_count", 249 | "detectors": [ 250 | { 251 | "detector_description": "count", 252 | "function": "count" 253 | } 254 | ] 255 | }, 256 | "data_description": { 257 | "time_field": "pickup_datetime", 258 | "time_format": "epoch_ms" 259 | }, 260 | "model_plot_config": { 261 | "enabled": true 262 | } 263 | } 264 | } 265 | }, 266 | { 267 | "operation": { 268 | "operation-type": "open-ml-job", 269 | "job-id": "{{ml_job_id}}" 270 | } 271 | }, 272 | { 273 | "operation": { 274 | "operation-type": "create-ml-datafeed", 275 | "datafeed-id": "{{ml_feed_id}}", 276 | "body": { 277 | "job_id": "{{ml_job_id}}", 278 | "indices": [ 279 | "nyc_taxis" 280 | ], 281 | "query": { 282 | "match_all": { 283 | "boost": 1 284 | } 285 | }, 286 | "aggregations": { 287 | "buckets": { 288 | "date_histogram": { 289 | "field": "pickup_datetime", 290 | "fixed_interval": "3600000ms", 291 | "offset": 0, 292 | "order": { 293 | "_key": "asc" 294 | }, 295 | "keyed": false, 296 | "min_doc_count": 0 297 | }, 298 | "aggregations": { 299 | "pickup_datetime": { 300 | "max": { 301 | "field": "pickup_datetime" 302 | } 303 | } 304 | } 305 | } 306 | }, 307 | "scroll_size": 1000, 308 | "chunking_config": { 309 | "mode": "manual", 310 | "time_span": "3600000000ms" 311 | } 312 | } 313 | } 314 | }, 315 | { 316 | "name": "check-cluster-health", 317 | "operation": { 318 | "operation-type": "cluster-health", 319 | "index": "nyc_taxis", 320 | "request-params": { 321 | "wait_for_status": "{{cluster_health | default('green')}}", 322 | "wait_for_no_relocating_shards": "true" 323 | } 324 | } 325 | }, 326 | { 327 | "operation": "index", 328 | "warmup-time-period": 240, 329 | "clients": {{bulk_indexing_clients | default(8)}} 330 | }, 331 | { 332 | "name": "refresh-after-index", 333 | "operation": "refresh" 334 | }, 335 | { 336 | "operation": "force-merge" 337 | }, 338 | { 339 | "name": "refresh-after-force-merge", 340 | "operation": "refresh" 341 | }, 342 | { 343 | "operation": { 344 | "operation-type": "start-ml-datafeed", 345 | "datafeed-id": "{{ml_feed_id}}", 346 | "body": { 347 | "end": "now" 348 | } 349 | } 350 | }, 351 | { 352 | "operation": { 353 | "operation-type": "wait-for-ml-lookback", 354 | 
"include-in-reporting": false, 355 | "datafeed-id": "{{ml_feed_id}}" 356 | } 357 | }, 358 | { 359 | "operation": { 360 | "operation-type": "close-ml-job", 361 | "job-id": "{{ml_job_id}}" 362 | } 363 | } 364 | ] 365 | } 366 | --------------------------------------------------------------------------------