├── so ├── files.txt ├── raw_data_prep_script.zip ├── operations │ └── default.json ├── track.json ├── index.json ├── challenges │ └── default.json └── README.md ├── noaa ├── files.txt ├── track.json ├── README.md ├── operations │ └── default.json ├── challenges │ └── default.json ├── index.json └── _tools │ └── process.py ├── pmc ├── files.txt ├── track.py ├── track.json ├── index.json ├── operations │ └── default.json ├── README.md └── challenges │ └── default.json ├── eventdata ├── files.txt ├── operations │ └── default.json ├── track.json ├── challenges │ └── default.json ├── index.json └── README.md ├── geopoint ├── files.txt ├── index.json ├── track.json ├── README.md ├── operations │ └── default.json └── challenges │ └── default.json ├── nested ├── files.txt ├── raw_data_prep_scripts.zip ├── track.json ├── index.json ├── queries.csv ├── operations │ └── default.json ├── README.md ├── challenges │ └── default.json └── track.py ├── nyc_taxis ├── files.txt ├── track.py ├── track.json ├── index.json ├── operations │ └── default.json ├── README.md ├── _tools │ └── parse.py └── challenges │ └── default.json ├── geonames ├── files.txt ├── track.json ├── index.json ├── README.md ├── track.py ├── operations │ └── default.json └── challenges │ └── default.json ├── geopointshape ├── files.txt ├── index.json ├── _tools │ └── parse.py ├── track.json ├── operations │ └── default.json ├── README.md └── challenges │ └── default.json ├── percolator ├── files.txt ├── index.json ├── track.json ├── operations │ └── default.json ├── challenges │ └── default.json └── README.md ├── metricbeat ├── files.txt ├── track.json ├── operations │ └── default.json └── challenges │ └── default.json ├── geoshape ├── files.txt ├── index.json ├── _tools │ └── parse.py ├── README.md ├── operations │ └── default.json ├── track.json └── challenges │ └── default.json ├── http_logs ├── track.py ├── files.txt ├── index.json ├── _tools │ └── unparse.rb ├── README.md ├── track.json └── operations │ └── default.json ├── .gitignore ├── download.sh └── README.md /so/files.txt: -------------------------------------------------------------------------------- 1 | posts.json.bz2 2 | posts-1k.json.bz2 -------------------------------------------------------------------------------- /noaa/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /pmc/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /eventdata/files.txt: -------------------------------------------------------------------------------- 1 | eventdata.json.bz2 2 | eventdata-1k.json.bz2 -------------------------------------------------------------------------------- /geopoint/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /nested/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /nyc_taxis/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 
| documents-1k.json.bz2 -------------------------------------------------------------------------------- /geonames/files.txt: -------------------------------------------------------------------------------- 1 | documents-2.json.bz2 2 | documents-2-1k.json.bz2 -------------------------------------------------------------------------------- /geopointshape/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 -------------------------------------------------------------------------------- /percolator/files.txt: -------------------------------------------------------------------------------- 1 | queries-2.json.bz2 2 | queries-2-1k.json.bz2 -------------------------------------------------------------------------------- /metricbeat/files.txt: -------------------------------------------------------------------------------- 1 | documents.json.bz2 2 | documents-1k.json.bz2 3 | -------------------------------------------------------------------------------- /so/raw_data_prep_script.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlrx/rally-tracks/master/so/raw_data_prep_script.zip -------------------------------------------------------------------------------- /nested/raw_data_prep_scripts.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tlrx/rally-tracks/master/nested/raw_data_prep_scripts.zip -------------------------------------------------------------------------------- /geoshape/files.txt: -------------------------------------------------------------------------------- 1 | linestrings.json.bz2 2 | linestrings-1k.json.bz2 3 | multilinestrings.json.bz2 4 | multilinestrings-1k.json.bz2 5 | polygons.json.bz2 6 | polygons-1k.json.bz2 7 | -------------------------------------------------------------------------------- /so/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | } 7 | -------------------------------------------------------------------------------- /eventdata/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | } 7 | -------------------------------------------------------------------------------- /http_logs/track.py: -------------------------------------------------------------------------------- 1 | def reindex(es, params): 2 | result = es.reindex(body=params.get("body"), request_timeout=params.get("request_timeout")) 3 | return result["total"], "docs" 4 | 5 | def register(registry): 6 | registry.register_runner("reindex", reindex) 7 | -------------------------------------------------------------------------------- /pmc/track.py: -------------------------------------------------------------------------------- 1 | def put_settings(es, params): 2 | es.cluster.put_settings(body=params["body"]) 3 | 4 | 5 | def register(registry): 6 | # register a fallback for older Rally versions 7 | try: 8 | from esrally.driver.runner import PutSettings 9 | except ImportError: 10 | registry.register_runner("put-settings", 
put_settings) 11 | -------------------------------------------------------------------------------- /nyc_taxis/track.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def wait_for_ml_lookback(es, params): 5 | while True: 6 | response = es.xpack.ml.get_datafeed_stats(datafeed_id=params["datafeed-id"]) 7 | if response["datafeeds"][0]["state"] == "stopped": 8 | break 9 | time.sleep(5) 10 | 11 | 12 | def register(registry): 13 | registry.register_runner("wait-for-ml-lookback", wait_for_ml_lookback) 14 | -------------------------------------------------------------------------------- /http_logs/files.txt: -------------------------------------------------------------------------------- 1 | documents-181998.json.bz2 2 | documents-191998.json.bz2 3 | documents-201998.json.bz2 4 | documents-211998.json.bz2 5 | documents-221998.json.bz2 6 | documents-231998.json.bz2 7 | documents-241998.json.bz2 8 | documents-181998-1k.json.bz2 9 | documents-191998-1k.json.bz2 10 | documents-201998-1k.json.bz2 11 | documents-211998-1k.json.bz2 12 | documents-221998-1k.json.bz2 13 | documents-231998-1k.json.bz2 14 | documents-241998-1k.json.bz2 15 | -------------------------------------------------------------------------------- /geoshape/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "shape": { 14 | "type": "geo_shape" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geopoint/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "location": { 14 | "type": "geo_point" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geopointshape/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "location": { 14 | "type": "geo_shape" 15 | } 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /geoshape/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | def to_json(f): 7 | for line in f: 8 | try: 9 | d = {} 10 | d["shape"] = line.strip() 11 | print(json.dumps(d)) 12 | except KeyboardInterrupt: 13 | break 14 | except Exception as e: 15 | print("Skipping malformed entry '%s' because of %s" 
%(line, str(e)), file=sys.stderr) 16 | 17 | if sys.argv[1] == "json": 18 | for file_name in sys.argv[2:]: 19 | with open(file_name) as f: 20 | to_json(f) 21 | else: 22 | raise Exception("Expected 'json' but got %s" %sys.argv[1]) 23 | -------------------------------------------------------------------------------- /percolator/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.queries.cache.enabled": false, 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "dynamic": "strict", 13 | "properties": { 14 | "body": { 15 | "type": "text", 16 | "analyzer": "english" 17 | }, 18 | "query": { 19 | "type": "percolator" 20 | } 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /geopointshape/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | def to_json(f): 7 | for line in f: 8 | try: 9 | point = json.loads(line)["location"] 10 | d = {} 11 | d["location"] = "POINT (" + str(point[0]) + " " + str(point[1]) + ")" 12 | print(json.dumps(d)) 13 | except KeyboardInterrupt: 14 | break 15 | except Exception as e: 16 | print("Skipping malformed entry '%s' because of %s" %(line, str(e)), file=sys.stderr) 17 | 18 | if sys.argv[1] == "json": 19 | for file_name in sys.argv[2:]: 20 | with open(file_name) as f: 21 | to_json(f) 22 | else: 23 | raise Exception("Expected 'json' but got %s" %sys.argv[1]) 24 | -------------------------------------------------------------------------------- /metricbeat/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | { 3 | "version": 2, 4 | "description": "Metricbeat data", 5 | "indices": [ 6 | { 7 | "name": "metricbeat", 8 | "body": "index.json" 9 | } 10 | ], 11 | "corpora": [ 12 | { 13 | "name": "metricbeat", 14 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/metricbeat", 15 | "documents": [ 16 | { 17 | "source-file": "documents.json.bz2", 18 | "document-count": 1079600, 19 | "compressed-bytes":91887122, 20 | "uncompressed-bytes":1249705758 21 | } 22 | ] 23 | } 24 | ], 25 | "operations": [ 26 | {{ rally.collect(parts="operations/*.json") }} 27 | ], 28 | "challenges": [ 29 | {{ rally.collect(parts="challenges/*.json") }} 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /pmc/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Full text benchmark with academic papers from PMC", 6 | "indices": [ 7 | { 8 | "name": "pmc", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "pmc", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/pmc", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 574199, 20 | "compressed-bytes": 5928712141, 21 | "uncompressed-bytes": 23256051757 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | 
{{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /nested/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "StackOverflow Q&A stored as nested docs", 6 | "indices": [ 7 | { 8 | "name": "sonested", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "nested", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/nested", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 11203029, 20 | "compressed-bytes": 695293381, 21 | "uncompressed-bytes": 3637747670 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /noaa/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Global daily weather measurements from NOAA", 6 | "indices": [ 7 | { 8 | "name": "weather-data-2016", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "noaa", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/noaa", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 33659481, 20 | "compressed-bytes": 993302204, 21 | "uncompressed-bytes": 9684262698 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /geopoint/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Point coordinates from PlanetOSM", 6 | "indices": [ 7 | { 8 | "name": "osmgeopoints", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geopoint", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geopoint", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 60844404, 20 | "compressed-bytes": 505295401, 21 | "uncompressed-bytes": 2448564579 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | 34 | -------------------------------------------------------------------------------- /so/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Indexing benchmark using up to questions and answers from StackOverflow", 6 | "indices": [ 7 | { 8 | "name": "so", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "so", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/so", 16 | "documents": [ 17 | { 18 | "source-file": "posts.json.bz2", 19 | "document-count": 36062278, 20 | "compressed-bytes": 
9599137228, 21 | "uncompressed-bytes": 35564808298 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /percolator/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Percolator benchmark based on AOL queries", 6 | "indices": [ 7 | { 8 | "name": "queries", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "percolator", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/percolator", 16 | "documents": [ 17 | { 18 | "source-file": "queries-2.json.bz2", 19 | "document-count": 2000000, 20 | "compressed-bytes": 105192, 21 | "uncompressed-bytes": 110039748 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | 34 | -------------------------------------------------------------------------------- /geopointshape/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Point coordinates from PlanetOSM indexed as geoshapes", 6 | "indices": [ 7 | { 8 | "name": "osmgeoshapes", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geopointshape", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geopointshape", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "document-count": 60844404, 20 | "compressed-bytes": 493367095, 21 | "uncompressed-bytes": 2780550484 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /geonames/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | { 3 | "version": 2, 4 | "description": "POIs from Geonames", 5 | "data-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geonames", 6 | "indices": [ 7 | { 8 | "name": "geonames", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "geonames", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geonames", 16 | "documents": [ 17 | { 18 | "source-file": "documents-2.json.bz2", 19 | "document-count": 11396505, 20 | "compressed-bytes": 264698741, 21 | "uncompressed-bytes": 3547614383 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /nyc_taxis/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Taxi rides in New York in 2015", 6 | "indices": [ 7 | { 8 | "name": "nyc_taxis", 9 | "body": "index.json" 
10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "nyc_taxis", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/nyc_taxis", 16 | "documents": [ 17 | { 18 | "source-file": "documents.json.bz2", 19 | "#COMMENT": "ML benchmarks rely on the fact that the document count stays constant.", 20 | "document-count": 165346692, 21 | "compressed-bytes": 4812721501, 22 | "uncompressed-bytes": 79802445255 23 | } 24 | ] 25 | } 26 | ], 27 | "operations": [ 28 | {{ rally.collect(parts="operations/*.json") }} 29 | ], 30 | "challenges": [ 31 | {{ rally.collect(parts="challenges/*.json") }} 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /eventdata/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "This benchmark indexes HTTP access logs generated based on sample logs from the elastic.co website using the generator available in https://github.com/elastic/rally-eventdata-track", 6 | "indices": [ 7 | { 8 | "name": "eventdata", 9 | "body": "index.json" 10 | } 11 | ], 12 | "corpora": [ 13 | { 14 | "name": "eventdata", 15 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/eventdata", 16 | "documents": [ 17 | { 18 | "source-file": "eventdata.json.bz2", 19 | "document-count": 20000000, 20 | "compressed-bytes": 791796014, 21 | "uncompressed-bytes": 16437108429 22 | } 23 | ] 24 | } 25 | ], 26 | "operations": [ 27 | {{ rally.collect(parts="operations/*.json") }} 28 | ], 29 | "challenges": [ 30 | {{ rally.collect(parts="challenges/*.json") }} 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /so/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "user": { 14 | "type": "keyword" 15 | }, 16 | "creationDate": { 17 | "type": "date" 18 | }, 19 | "title": { 20 | "type": "text" 21 | }, 22 | "questionId": { 23 | "type": "keyword" 24 | }, 25 | "answerId": { 26 | "type": "keyword" 27 | }, 28 | "acceptedAnswerId": { 29 | "type": "keyword" 30 | }, 31 | "tags": { 32 | "type": "keyword" 33 | }, 34 | "body": { 35 | "type": "text" 36 | }, 37 | "type": { 38 | "type": "keyword" 39 | } 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pmc/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "_source": { 9 | "enabled": {{ source_enabled | default(true) | tojson }} 10 | }, 11 | "dynamic": "strict", 12 | "properties": { 13 | "name": { 14 | "type": "keyword" 15 | }, 16 | "journal": { 17 | "type": "text" 18 | }, 19 | "date": { 20 | "type": "text" 21 | }, 22 | "volume": { 23 | "type": "text" 24 | }, 25 | "issue": { 26 | "type": "text" 27 | }, 28 | "accession": { 29 | "type": "keyword" 30 | }, 31 | "timestamp": { 32 | "type": "date",
33 | "format": "yyyy-MM-dd HH:mm:ss" 34 | }, 35 | "pmid": { 36 | "type": "integer" 37 | }, 38 | "body": { 39 | "type": "text" 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /nested/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.store.type": "{{store_type | default('fs')}}", 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "dynamic": "strict", 10 | "_source": { 11 | "enabled": {{ source_enabled | default(true) | tojson }} 12 | }, 13 | "properties": { 14 | "user": { 15 | "type": "keyword" 16 | }, 17 | "creationDate": { 18 | "type": "date" 19 | }, 20 | "title": { 21 | "type": "text" 22 | }, 23 | "qid": { 24 | "type": "keyword" 25 | }, 26 | "tag": { 27 | "type": "keyword" 28 | }, 29 | "answer_count": { 30 | "type": "integer" 31 | }, 32 | "answers": { 33 | "type": "nested", 34 | "properties": { 35 | "user": { 36 | "type": "keyword" 37 | }, 38 | "date": { 39 | "type": "date" 40 | } 41 | } 42 | } 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /nested/queries.csv: -------------------------------------------------------------------------------- 1 | java,2012-04-08T21:15:33.873Z 2 | c#,2012-01-02T13:27:55.631Z 3 | javascript,2011-09-29T09:31:37.345Z 4 | php,2012-04-10T03:16:00.727Z 5 | android,2012-04-08T15:02:52.091Z 6 | jquery,2012-04-03T08:39:17.337Z 7 | python,2012-04-09T04:44:32.264Z 8 | html,2012-04-09T05:11:21.702Z 9 | c++,2012-06-01T22:18:55.219Z 10 | ios,2012-04-09T02:15:44.330Z 11 | mysql,2011-07-16T14:27:53.863Z 12 | css,2012-04-09T17:48:32.247Z 13 | sql,2010-09-27T15:38:50.503Z 14 | asp.net,2012-04-09T06:06:10.029Z 15 | objective-c,2012-04-08T22:03:57.592Z 16 | ruby-on-rails,2012-04-08T19:35:07.412Z 17 | .net,2012-04-08T16:37:16.992Z 18 | iphone,2012-04-10T12:45:17.230Z 19 | c,2012-04-10T01:22:20.110Z 20 | arrays,2012-04-09T15:13:45.728Z 21 | sql-server,2012-04-10T07:36:56.023Z 22 | angularjs,2012-04-10T12:26:25.743Z 23 | ruby,2012-04-08T02:18:56.402Z 24 | json,2012-04-08T07:11:47.490Z 25 | ajax,2012-04-08T08:05:14.004Z 26 | regex,2012-04-08T09:13:29.263Z 27 | xml,2012-04-07T21:37:43.372Z 28 | asp.net-mvc,2012-04-09T21:23:43.302Z 29 | r,2012-03-07T11:59:36.114Z 30 | linux,2012-04-09T12:51:50.530Z 31 | wpf,2012-04-08T11:18:54.110Z 32 | django,2012-04-13T02:18:51.407Z 33 | node.js,2012-04-07T04:50:14.554Z 34 | database,2012-04-10T15:53:29.825Z 35 | xcode,2011-10-09T16:32:45.480Z 36 | -------------------------------------------------------------------------------- /http_logs/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "@timestamp": { 14 | "format": "strict_date_optional_time||epoch_second", 15 | "type": "date" 16 | }, 17 | "clientip": { 18 | "type": "ip" 19 | }, 20 | "message": { 21 | "type": "keyword", 22 | "index": false, 23 | "doc_values": false 24 | }, 25 | "request": { 26 | "type": "text", 27 | "fields": { 28 | "raw": { 29 | 
"ignore_above": 256, 30 | "type": "keyword" 31 | } 32 | } 33 | }, 34 | "status": { 35 | "type": "integer" 36 | }, 37 | "size": { 38 | "type": "integer" 39 | }, 40 | "geoip" : { 41 | "properties" : { 42 | "country_name": { "type": "keyword" }, 43 | "city_name": { "type": "keyword" }, 44 | "location" : { "type" : "geo_point" } 45 | } 46 | } 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /so/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "logs-*", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "operation": "force-merge", 33 | "clients": 1 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /eventdata/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "eventdata", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "operation": "force-merge", 33 | "clients": 1 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /geoshape/README.md: -------------------------------------------------------------------------------- 1 | ## Geoshape track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. 
4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "shape": "LINESTRING(-1.8212114 52.5538901, -1.8205573 52.554324)" 10 | } 11 | ``` 12 | 13 | ### Parameters 14 | 15 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 16 | 17 | * `linestring_bulk_size` (default: 100): The bulk request size for indexing linestrings. 18 | * `multilinestring_bulk_size` (default: 100): The bulk request size for indexing multilinestrings. 19 | * `polygon_bulk_size` (default: 100): The bulk request size for indexing polygons. 20 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 21 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 22 | * `number_of_replicas` (default: 0) 23 | * `number_of_shards` (default: 1) 24 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 25 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 26 | * `cluster_health` (default: "green"): The minimum required cluster health. 27 | 28 | ### License 29 | 30 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 31 | -------------------------------------------------------------------------------- /metricbeat/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(10000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "autohisto_agg", 9 | "operation-type": "search", 10 | "body": { 11 | "size": 0, 12 | "query": { 13 | "range": { 14 | "@timestamp": { 15 | "gte": "23/02/2019", 16 | "lte": "23/02/2019", 17 | "format": "dd/MM/yyyy" 18 | } 19 | } 20 | }, 21 | "aggs": { 22 | "occurrences_over_time": { 23 | "auto_date_histogram": { 24 | "field": "@timestamp", 25 | "buckets": 24 26 | } 27 | } 28 | } 29 | } 30 | }, 31 | { 32 | "name": "date_histogram_agg", 33 | "operation-type": "search", 34 | "body": { 35 | "size": 0, 36 | "query": { 37 | "range": { 38 | "@timestamp": { 39 | "gte": "23/02/2019", 40 | "lte": "23/02/2019", 41 | "format": "dd/MM/yyyy" 42 | } 43 | } 44 | }, 45 | "aggs": { 46 | "occurrences_over_time": { 47 | "date_histogram": { 48 | "field": "@timestamp", 49 | "calendar_interval": "hour" 50 | } 51 | } 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /geopointshape/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "random", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "polygon", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "geo_shape": { 23 | "location": { 24 | "shape": { 25 | "type": "polygon", 26 | 
"coordinates" : [[ 27 | [-0.1, 49.0], 28 | [5.0, 48.0], 29 | [15.0, 49.0], 30 | [14.0, 60.0], 31 | [-0.1, 61.0], 32 | [-0.1, 49.0] 33 | ]] 34 | } 35 | } 36 | } 37 | } 38 | } 39 | }, 40 | { 41 | "name": "bbox", 42 | "operation-type": "search", 43 | "body": { 44 | "query": { 45 | "geo_shape": { 46 | "location": { 47 | "shape": { 48 | "type": "envelope", 49 | "coordinates" : [[-0.1, 61.0], [15.0, 48.0]] 50 | } 51 | } 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /eventdata/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "dynamic": "strict", 9 | "_source": { 10 | "enabled": {{ source_enabled | default(true) | tojson }} 11 | }, 12 | "properties": { 13 | "@timestamp": { "type": "date" }, 14 | "message": { "type": "text", "index": false }, 15 | "agent": { "type": "keyword", "ignore_above": 256 }, 16 | "bytes": { "type": "integer" }, 17 | "clientip": { "type": "ip" }, 18 | "httpversion": { "type": "keyword", "ignore_above": 256 }, 19 | "response": { "type": "short" }, 20 | "verb": { "type": "keyword", "ignore_above": 256 }, 21 | "tags": { "type": "keyword", "ignore_above": 256 }, 22 | "geoip" : { 23 | "properties" : { 24 | "country_name" : { "type": "keyword" }, 25 | "location" : { "type": "geo_point" } 26 | } 27 | }, 28 | "useragent": { 29 | "properties": { 30 | "name": { "type": "keyword", "ignore_above": 256 }, 31 | "os": { "type": "keyword", "ignore_above": 256 }, 32 | "os_name": { "type": "keyword", "ignore_above": 256 } 33 | } 34 | }, 35 | "request": { 36 | "norms": false, 37 | "type": "text", 38 | "fields": { 39 | "keyword": { "ignore_above": 256, "type": "keyword" } 40 | } 41 | }, 42 | "referrer": { 43 | "norms": false, 44 | "type": "text", 45 | "fields": { 46 | "keyword": { "ignore_above": 256, "type": "keyword" } 47 | } 48 | } 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /geopointshape/README.md: -------------------------------------------------------------------------------- 1 | ## Geopoint track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. It contains the same data as the geopoint track but indexes all points as geoshapes. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "location": "POINT (-0.1485188 51.5250666)" 10 | } 11 | ``` 12 | 13 | ### Parameters 14 | 15 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 16 | 17 | * `bulk_size` (default: 5000) 18 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 19 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 20 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. 21 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 22 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. 
See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 23 | * `number_of_replicas` (default: 0) 24 | * `number_of_shards` (default: 5) 25 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 26 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 27 | * `cluster_health` (default: "green"): The minimum required cluster health. 28 | 29 | ### License 30 | 31 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 32 | -------------------------------------------------------------------------------- /geoshape/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append-linestrings", 3 | "operation-type": "bulk", 4 | "bulk-size": {{linestring_bulk_size | default(100)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}}, 6 | "corpora": "linestrings" 7 | }, 8 | { 9 | "name": "index-append-multilinestrings", 10 | "operation-type": "bulk", 11 | "bulk-size": {{multilinestring_bulk_size | default(100)}}, 12 | "ingest-percentage": {{ingest_percentage | default(100)}}, 13 | "corpora": "multilinestrings" 14 | }, 15 | { 16 | "name": "index-append-polygons", 17 | "operation-type": "bulk", 18 | "bulk-size": {{polygon_bulk_size | default(100)}}, 19 | "ingest-percentage": {{ingest_percentage | default(100)}}, 20 | "corpora": "polygons" 21 | }, 22 | { 23 | "name": "polygon", 24 | "operation-type": "search", 25 | "index": "osm*", 26 | "body": { 27 | "query": { 28 | "geo_shape": { 29 | "shape": { 30 | "shape": { 31 | "type": "polygon", 32 | "coordinates" : [[ 33 | [-0.1, 49.0], 34 | [5.0, 48.0], 35 | [15.0, 49.0], 36 | [14.0, 60.0], 37 | [-0.1, 61.0], 38 | [-0.1, 49.0] 39 | ]] 40 | } 41 | } 42 | } 43 | } 44 | } 45 | }, 46 | { 47 | "name": "bbox", 48 | "operation-type": "search", 49 | "index": "osm*", 50 | "body": { 51 | "query": { 52 | "geo_shape": { 53 | "shape": { 54 | "shape": { 55 | "type": "envelope", 56 | "coordinates" : [[-0.1, 61.0], [15.0, 48.0]] 57 | } 58 | } 59 | } 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## https://github.com/github/gitignore/blob/master/Global/OSX.gitignore 2 | 3 | .DS_Store 4 | .AppleDouble 5 | .LSOverride 6 | 7 | # Icon must end with two \r 8 | Icon 9 | 10 | 11 | # Thumbnails 12 | ._* 13 | 14 | # Files that might appear in the root of a volume 15 | .DocumentRevisions-V100 16 | .fseventsd 17 | .Spotlight-V100 18 | .TemporaryItems 19 | .Trashes 20 | .VolumeIcon.icns 21 | 22 | # Directories potentially created on remote AFP share 23 | .AppleDB 24 | .AppleDesktop 25 | Network Trash Folder 26 | Temporary Items 27 | .apdisk 28 | 29 | ## kinda based on https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore 30 | 31 | *.iml 32 | 33 | ## Directory-based project format: 34 | .idea/ 35 | 36 | ## https://github.com/github/gitignore/blob/master/Python.gitignore 37 | 38 | # Byte-compiled / optimized / DLL files 39 | __pycache__/ 40 | *.py[cod] 41 | *$py.class 42 | 43 | # C extensions 44 | *.so 45 | 46 | # Distribution / packaging 47 | .Python 48 | env/ 49 | build/ 50 | develop-eggs/ 51 | dist/ 52 
| downloads/ 53 | eggs/ 54 | .eggs/ 55 | lib/ 56 | lib64/ 57 | parts/ 58 | sdist/ 59 | var/ 60 | *.egg-info/ 61 | .installed.cfg 62 | *.egg 63 | 64 | # PyInstaller 65 | # Usually these files are written by a python script from a template 66 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 67 | *.manifest 68 | *.spec 69 | 70 | # Installer logs 71 | pip-log.txt 72 | pip-delete-this-directory.txt 73 | 74 | # Unit test / coverage reports 75 | htmlcov/ 76 | .tox/ 77 | .coverage 78 | .coverage.* 79 | .cache 80 | nosetests.xml 81 | coverage.xml 82 | *,cover 83 | .hypothesis/ 84 | junit-*.xml 85 | 86 | # Translations 87 | *.mo 88 | *.pot 89 | 90 | # Django stuff: 91 | *.log 92 | 93 | # Sphinx documentation 94 | docs/_build/ 95 | 96 | # PyBuilder 97 | target/ 98 | 99 | #Pickles 100 | *.pk 101 | 102 | # pyenv 103 | .python-version -------------------------------------------------------------------------------- /nested/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "randomized-nested-queries", 9 | "operation-type": "search", 10 | "param-source": "nested-query-source" 11 | }, 12 | { 13 | "name": "randomized-nested-queries-with-inner-hits_default", 14 | "operation-type": "search", 15 | "param-source": "nested-query-source-with-inner-hits", 16 | "size" : 10, 17 | "inner_hits_size" : 3 18 | }, 19 | { 20 | "name": "randomized-nested-queries-with-inner-hits_default_big_size", 21 | "operation-type": "search", 22 | "param-source": "nested-query-source-with-inner-hits", 23 | "size" : 100, 24 | "inner_hits_size" : 100 25 | }, 26 | { 27 | "name": "randomized-term-queries", 28 | "operation-type": "search", 29 | "param-source": "term-query-source" 30 | }, 31 | { 32 | "name": "randomized-sorted-term-queries", 33 | "operation-type": "search", 34 | "param-source": "sorted-term-query-source" 35 | }, 36 | { 37 | "name": "match-all", 38 | "operation-type": "search", 39 | "body": { 40 | "query": { 41 | "match_all": {} 42 | } 43 | } 44 | }, 45 | { 46 | "name": "nested-date-histo", 47 | "operation-type": "search", 48 | "body": { 49 | "size": 0, 50 | "aggs": { 51 | "answers": { 52 | "nested": { 53 | "path": "answers" 54 | }, 55 | "aggs": { 56 | "date_histo": { 57 | "date_histogram": { 58 | "field": "answers.date", 59 | "calendar_interval": "month" 60 | } 61 | } 62 | } 63 | } 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /geoshape/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "Shapes from PlanetOSM", 6 | "indices": [ 7 | { 8 | "name": "osmlinestrings", 9 | "body": "index.json" 10 | }, 11 | { 12 | "name": "osmmultilinestrings", 13 | "body": "index.json" 14 | }, 15 | { 16 | "name": "osmpolygons", 17 | "body": "index.json" 18 | } 19 | ], 20 | "corpora": [ 21 | { 22 | "name": "linestrings", 23 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 24 | "target-index": "osmlinestrings", 25 | "documents": [ 26 | { 27 | "source-file": "linestrings.json.bz2", 28 | "document-count": 20532036, 29 | "compressed-bytes": 3697293598, 30 | "uncompressed-bytes": 12592499821 31 | } 32 | ] 33 | }, 34 | { 35 | "name": 
"multilinestrings", 36 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 37 | "target-index": "osmmultilinestrings", 38 | "documents": [ 39 | { 40 | "source-file": "multilinestrings.json.bz2", 41 | "document-count": 532036, 42 | "compressed-bytes": 1816588880, 43 | "uncompressed-bytes": 5992834062 44 | } 45 | ] 46 | }, 47 | { 48 | "name": "polygons", 49 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/geoshape", 50 | "target-index": "osmpolygons", 51 | "documents": [ 52 | { 53 | "source-file": "polygons.json.bz2", 54 | "document-count": 39459211, 55 | "compressed-bytes": 8835370788, 56 | "uncompressed-bytes": 30178820325 57 | } 58 | ] 59 | } 60 | ], 61 | "operations": [ 62 | {{ rally.collect(parts="operations/*.json") }} 63 | ], 64 | "challenges": [ 65 | {{ rally.collect(parts="challenges/*.json") }} 66 | ] 67 | } 68 | -------------------------------------------------------------------------------- /metricbeat/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "metricbeat", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 0, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "autohisto_agg", 47 | "clients": 1, 48 | "warmup-iterations": 50, 49 | "iterations": 100, 50 | "target-throughput": 2 51 | }, 52 | { 53 | "operation": "date_histogram_agg", 54 | "clients": 1, 55 | "warmup-iterations": 50, 56 | "iterations": 100, 57 | "target-throughput": 2 58 | } 59 | ] 60 | } 61 | 62 | -------------------------------------------------------------------------------- /geopoint/README.md: -------------------------------------------------------------------------------- 1 | ## Geopoint track 2 | 3 | This track is based on [PlanetOSM](http://wiki.openstreetmap.org/wiki/Planet.osm) data. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "location": [ 10 | -0.1485188, 11 | 51.5250666 12 | ] 13 | } 14 | ``` 15 | 16 | ### Parameters 17 | 18 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 19 | 20 | * `bulk_size` (default: 5000) 21 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 22 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 
23 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 24 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 25 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 26 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 27 | * `number_of_replicas` (default: 0) 28 | * `number_of_shards` (default: 5) 29 | * `max_num_segments`: The maximum number of segments to force-merge to. 30 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 31 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 32 | * `cluster_health` (default: "green"): The minimum required cluster health. 33 | 34 | ### License 35 | 36 | Same license as the original data from PlanetOSM: [Open Database License](http://wiki.openstreetmap.org/wiki/Open_Database_License). 37 | -------------------------------------------------------------------------------- /pmc/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(500)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(500)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "term", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "term": { 32 | "body": "physician" 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "phrase", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "match_phrase": { 43 | "body": "newspaper coverage" 44 | } 45 | } 46 | } 47 | }, 48 | { 49 | "name": "articles_monthly_agg_uncached", 50 | "operation-type": "search", 51 | "body": { 52 | "size": 0, 53 | "aggs": { 54 | "articles_over_time": { 55 | "date_histogram": { 56 | "field": "timestamp", 57 | "calendar_interval": "month" 58 | } 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "articles_monthly_agg_cached", 65 | "operation-type": "search", 66 | "cache": true, 67 | "body": { 68 | "size": 0, 69 | "aggs": { 70 | "articles_over_time": { 71 | "date_histogram": { 72 | "field": "timestamp", 73 | "calendar_interval": "month" 74 | } 75 | } 76 | } 77 | } 78 | }, 79 | { 80 | 
"name": "scroll", 81 | "operation-type": "search", 82 | "pages": 25, 83 | "results-per-page": 100, 84 | "body": { 85 | "query": { 86 | "match_all": {} 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /geopoint/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "polygon", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "geo_polygon": { 23 | "location": { 24 | "points": [ 25 | [-0.1, 49.0], 26 | [5.0, 48.0], 27 | [15.0, 49.0], 28 | [14.0, 60.0], 29 | [-0.1, 61.0], 30 | [-0.1, 49.0] 31 | ] 32 | } 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "bbox", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "geo_bounding_box": { 43 | "location": { 44 | "top_left": [-0.1, 61.0], 45 | "bottom_right": [15.0, 48.0] 46 | } 47 | } 48 | } 49 | } 50 | }, 51 | { 52 | "name": "distance", 53 | "operation-type": "search", 54 | "body": { 55 | "query": { 56 | "geo_distance": { 57 | "distance": "200km", 58 | "location": [7.0, 55.0] 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "distanceRange", 65 | "operation-type": "search", 66 | "body": { 67 | "query": { 68 | "match_all": {} 69 | }, 70 | "aggs": { 71 | "geo_distance_range_agg": { 72 | "geo_distance": { 73 | "field": "location", 74 | "origin": "55.0, 7.0", 75 | "unit": "km", 76 | "ranges": [ 77 | { 78 | "from": 200, 79 | "to": 400 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | } 86 | } -------------------------------------------------------------------------------- /nyc_taxis/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.requests.cache.enable": false 6 | }, 7 | "mappings": { 8 | "_source": { 9 | "enabled": {{ source_enabled | default(true) | tojson }} 10 | }, 11 | "properties": { 12 | "surcharge": { 13 | "scaling_factor": 100, 14 | "type": "scaled_float" 15 | }, 16 | "dropoff_datetime": { 17 | "type": "date", 18 | "format": "yyyy-MM-dd HH:mm:ss" 19 | }, 20 | "trip_type": { 21 | "type": "keyword" 22 | }, 23 | "mta_tax": { 24 | "scaling_factor": 100, 25 | "type": "scaled_float" 26 | }, 27 | "rate_code_id": { 28 | "type": "keyword" 29 | }, 30 | "passenger_count": { 31 | "type": "integer" 32 | }, 33 | "pickup_datetime": { 34 | "type": "date", 35 | "format": "yyyy-MM-dd HH:mm:ss" 36 | }, 37 | "tolls_amount": { 38 | "scaling_factor": 100, 39 | "type": "scaled_float" 40 | }, 41 | "tip_amount": { 42 | "scaling_factor": 100, 43 | "type": "scaled_float" 44 | }, 45 | "payment_type": { 46 | "type": "keyword" 47 | }, 48 | "extra": { 49 | "scaling_factor": 100, 50 | "type": "scaled_float" 51 | }, 52 | "vendor_id": { 53 | "type": "keyword" 54 | }, 55 | "store_and_fwd_flag": { 56 | "type": "keyword" 57 | }, 58 | 
"improvement_surcharge": { 59 | "scaling_factor": 100, 60 | "type": "scaled_float" 61 | }, 62 | "fare_amount": { 63 | "scaling_factor": 100, 64 | "type": "scaled_float" 65 | }, 66 | "ehail_fee": { 67 | "scaling_factor": 100, 68 | "type": "scaled_float" 69 | }, 70 | "cab_color": { 71 | "type": "keyword" 72 | }, 73 | "dropoff_location": { 74 | "type": "geo_point" 75 | }, 76 | "vendor_name": { 77 | "type": "text" 78 | }, 79 | "total_amount": { 80 | "scaling_factor": 100, 81 | "type": "scaled_float" 82 | }, 83 | "trip_distance": { 84 | "scaling_factor": 100, 85 | "type": "scaled_float" 86 | }, 87 | "pickup_location": { 88 | "type": "geo_point" 89 | } 90 | }, 91 | "dynamic": "strict" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # fail this script immediately if any command fails with a non-zero exit code 4 | set -e 5 | # Treat unset env variables as an error 6 | set -u 7 | # fail on pipeline errors, e.g. when grepping 8 | set -o pipefail 9 | 10 | readonly ROOT=".rally/benchmarks" 11 | readonly URL="http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora" 12 | 13 | 14 | # see http://stackoverflow.com/a/246128 15 | SOURCE="${BASH_SOURCE[0]}" 16 | while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink 17 | DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 18 | SOURCE="$(readlink "$SOURCE")" 19 | [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located 20 | done 21 | readonly CURR_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" 22 | 23 | # test number of parameters 24 | if [ $# != 1 ] 25 | then 26 | echo "Usage: $0 TRACK_NAME" 27 | exit 1 28 | fi 29 | 30 | readonly TRACK=$1 31 | 32 | TARGETS=( ) 33 | 34 | # clone track descriptions 35 | readonly REPO_TARGET="${ROOT}/tracks/default" 36 | # add to final tar 37 | TARGETS[${#TARGETS[*]}]="${REPO_TARGET}" 38 | 39 | if [ ! -d "${HOME}/${REPO_TARGET}" ] 40 | then 41 | git clone https://github.com/elastic/rally-tracks.git "${HOME}/${REPO_TARGET}" 42 | fi 43 | 44 | # check if the track actually exists 45 | if [ ! -d "${HOME}/${REPO_TARGET}/${TRACK}" ] 46 | then 47 | echo "Track ${TRACK} does not exist in ${HOME}/${REPO_TARGET}." 48 | exit 1 49 | fi 50 | 51 | # download data (unless it exists locally) 52 | readonly FILES=$(cat ${HOME}/${REPO_TARGET}/${TRACK}/files.txt) 53 | for f in ${FILES}; do 54 | TARGET_ROOT="${ROOT}/data/${TRACK}" 55 | TARGET_PATH="${TARGET_ROOT}/${f}" 56 | mkdir -p "${HOME}/${TARGET_ROOT}" 57 | TARGETS[${#TARGETS[*]}]="${TARGET_PATH}" 58 | if [ ! -f "${HOME}/${TARGET_PATH}" ] 59 | then 60 | curl -o "${HOME}/${TARGET_PATH}" "${URL}/${TRACK}/${f}" 61 | fi 62 | done 63 | 64 | readonly ARCHIVE="rally-track-data-${TRACK}.tar" 65 | # ensure everything is relative to the home directory 66 | # exclude the archive itself to prevent spurious warnings. 67 | tar -C ${HOME} --exclude="${ARCHIVE}" -cf "${ARCHIVE}" ${TARGETS[@]} 68 | 69 | echo "Created data for ${TRACK} in ${ARCHIVE}. Next steps:" 70 | echo "" 71 | echo "1. Copy it to the user home directory on the target machine(s)." 72 | echo "2. Extract with tar -xf ${ARCHIVE} (will be extracted to ~/${ROOT})." 
-------------------------------------------------------------------------------- /noaa/README.md: -------------------------------------------------------------------------------- 1 | ## NOAA track 2 | 3 | This track is based on a [daily weather measurement from NOAA](ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/). 4 | 5 | To recreate the document corpus: 6 | 7 | 1. Download the following files: 8 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2014.csv.gz 9 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2015.csv.gz 10 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2016.csv.gz 11 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt 12 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-countries.txt 13 | * ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-states.txt 14 | 2. Decompress measurement files. For example: `gunzip 2016.csv.gz` 15 | 3. Sort the files by station. For example: `sort --field-separator=',' --key=1,2 -o 2016-sorted.csv 2016.csv` 16 | 4. Execute a script like `_tools/process.py` to create json documents. 17 | 5. Make sure that the JSON documents are randomly ordered. (The script orders measurements of the same station next to each other). This can be achieved with `shuf documents.json > documents1.json`. 18 | 6. Compress the documents json file: `bzip2 -9 -c documents1.json > documents.json.bz2` 19 | 20 | ### Example Document 21 | 22 | ```json 23 | { 24 | "date": "2016-01-01T00:00:00", 25 | "TAVG": 22.9, 26 | "station": { 27 | "elevation": 34.0, 28 | "name": "SHARJAH INTER. AIRP", 29 | "country": "United", 30 | "gsn_flag": "GSN", 31 | "location": { 32 | "lat": 25.333, 33 | "lon": 55.517 34 | }, 35 | "country_code": "AE", 36 | "wmo_id": "41196", 37 | "id": "AE000041196" 38 | }, 39 | "TMIN": 15.5 40 | } 41 | ``` 42 | 43 | ### Parameters 44 | 45 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 46 | 47 | * `bulk_size` (default: 5000) 48 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 49 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 50 | * `number_of_replicas` (default: 0) 51 | * `number_of_shards` (default: 1) 52 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 53 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 54 | * `cluster_health` (default: "green"): The minimum required cluster health. 55 | 56 | ### License 57 | 58 | [US Government Work data license](https://www.usa.gov/government-works) 59 | -------------------------------------------------------------------------------- /nested/README.md: -------------------------------------------------------------------------------- 1 | ## Nested track 2 | 3 | This track is based on a [dump of StackOverflow posts](https://ia800500.us.archive.org/22/items/stackexchange/stackoverflow.com-Posts.7z) retrieved as of June 10, 2016. 4 | 5 | Each question and related answers have been assembled into a single JSON doc containing: 6 | 7 | * qid: a unique ID for a question 8 | * title: a free-text field with the question title 9 | * creationDate: The date the questions was asked 10 | * user: The user's screen name and unique ID combined into a single string 11 | * tag: An array of tags describing the technologies. 
12 | * answers: An array of objects, one per answer, with the following fields: 13 | * date: Date of answer 14 | * user: Answerer's screen name and unique ID combined into a single string 15 | 16 | 17 | Data preparation process: 18 | 19 | * Question and answer entries in the original posts.XML were converted to slimmed-down rows in a CSV and enriched with user names from users.xml 20 | * CSV was sorted by first two columns (questionID and answerID) 21 | * The CSV was converted to the JSON file presented here, combining questions and answers into a single JSON doc. 22 | 23 | These scripts are available in the raw_data_prep_scripts.zip file. 24 | 25 | ### Example Document 26 | 27 | ```json 28 | { 29 | "title": "Are these LAMP permissions secure?", 30 | "qid": "10000023", 31 | "answers": [ 32 | { 33 | "date": "2012-04-04T12:56:34.433", 34 | "user": "larsks (147356)" 35 | } 36 | ], 37 | "tag": [ 38 | "linux", 39 | "apache", 40 | "security", 41 | "ubuntu", 42 | "permissions" 43 | ], 44 | "user": "Trent Scott (600873)", 45 | "creationDate": "2012-04-03T19:26:57.033" 46 | } 47 | ``` 48 | 49 | ### Parameters 50 | 51 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 52 | 53 | * `bulk_size` (default: 5000) 54 | * `bulk_indexing_clients` (default: 4): Number of clients that issue bulk indexing requests. 55 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 56 | * `number_of_replicas` (default: 0) 57 | * `number_of_shards` (default: 1) 58 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 59 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 60 | * `cluster_health` (default: "green"): The minimum required cluster health. 61 | 62 | ### License 63 | 64 | We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/) 65 | 66 | -------------------------------------------------------------------------------- /so/README.md: -------------------------------------------------------------------------------- 1 | ## StackOverflow track 2 | 3 | This dataset is derived from a dump of StackOverflow posts downloaded on June 10th 2016 from 4 | https://ia800500.us.archive.org/22/items/stackexchange/stackoverflow.com-Posts.7z 5 | 6 | Each question and answer has been formatted into a JSON document with the following fields: 7 | 8 | questionId: a unique ID for a question 9 | answerId: a unique ID for an answer 10 | acceptedAnswerId: the unique ID of the answer accepted for the question 11 | title: a free-text field with the question title 12 | creationDate: The date the question was asked 13 | user: The user's unique ID 14 | tags: An array of tags describing the technologies. 15 | body: Field containing the text of the question or answer. 16 | type: Type of post. Either 'question' or 'answer' 17 | 18 | Fields that do not have values have been left out. The body has had text extracted and been 19 | formatted to fit into JSON documents. 20 | 21 | Data preparation process: 22 | * Question and answer entries in the original posts.XML were converted to slimmed-down JSON 23 | documents. 24 | * No enrichment was performed. 25 | These scripts are available in the raw_data_prep_script.zip file.
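Purely as an illustration of the shape of this conversion (the scripts in raw_data_prep_script.zip remain the authoritative reference), a minimal Python sketch could look like the one below. The Stack Exchange dump attribute names used here (`PostTypeId`, `ParentId`, `OwnerUserId`, `AcceptedAnswerId`, ...) are assumptions about the raw dump format and are not taken from this repository:

```python
import json
import re
import sys
import xml.etree.ElementTree as ET


def to_docs(path):
    # Stream Posts.xml and emit one slimmed-down JSON document per post,
    # keeping only the fields listed above; empty fields are left out.
    for _, elem in ET.iterparse(path, events=("end",)):
        if elem.tag != "row":
            continue
        a = elem.attrib
        if a.get("PostTypeId") not in ("1", "2"):
            continue  # skip non question/answer post types
        is_question = a.get("PostTypeId") == "1"
        doc = {
            "type": "question" if is_question else "answer",
            "questionId": a.get("Id") if is_question else a.get("ParentId"),
            "answerId": None if is_question else a.get("Id"),
            "acceptedAnswerId": a.get("AcceptedAnswerId"),
            "title": a.get("Title"),
            "creationDate": a.get("CreationDate"),
            "user": a.get("OwnerUserId"),
            # tags arrive as a single string like "<c#><linq>"; turn them into an array
            "tags": re.findall(r"<([^>]+)>", a.get("Tags", "")) or None,
            # the real corpus additionally strips markup from the body text
            "body": a.get("Body"),
        }
        print(json.dumps({k: v for k, v in doc.items() if v}))
        elem.clear()  # keep memory bounded while streaming the full dump


if __name__ == "__main__":
    to_docs(sys.argv[1])
```

It would be run against the extracted dump (for example `python3 convert_posts.py Posts.xml > posts.json`, with `convert_posts.py` being a hypothetical file name) and the result compressed with bzip2 like the other corpora.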
26 | 27 | ### Example Document 28 | 29 | ```json 30 | { 31 | "user": "45", 32 | "tags": ["c#", "linq", ".net-3.5"], 33 | "questionId": "59", 34 | "creationDate": "2008-08-01T13:14:33.797", 35 | "title": "How do I get a distinct, ordered list of names from a DataTable using LINQ?", 36 | "acceptedAnswerId": "43110", 37 | "type": "question", 38 | "body": "Let's say I have a DataTable with a Name column. I want to have a collection of the unique names ordered alphabetically. The following query ignores the order by clause. var names = (from DataRow dr in dataTable.Rows orderby (string)dr[\"Name\"] select (string)dr[\"Name\"]).Distinct(); Why does the orderby not get enforced? " 39 | } 40 | ``` 41 | 42 | ### Parameters 43 | 44 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 45 | 46 | * `bulk_size` (default: 5000) 47 | * `bulk_indexing_clients` (default: 4): Number of clients that issue bulk indexing requests. 48 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 49 | * `number_of_replicas` (default: 0) 50 | * `number_of_shards` (default: 5) 51 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 52 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 53 | * `cluster_health` (default: "green"): The minimum required cluster health. 54 | 55 | ### License 56 | 57 | We use the same license for the data as the original data: [CC-SA-3.0](http://creativecommons.org/licenses/by-sa/3.0/) 58 | -------------------------------------------------------------------------------- /eventdata/README.md: -------------------------------------------------------------------------------- 1 | ## EventData track 2 | 3 | This track is based on 20 million Apache access log entries generated based on statistics from sample 4 | elastic.co access logs using the generator available here: https://github.com/elastic/rally-eventdata-track 5 | 6 | The size of the data file is around 15GB, which gives an average JSON record size of 822 bytes. Mappings have been optimized and some of the fields added through `geoip` and `user-agent` enrichment have been removed to achieve a more compact format. 7 | 8 | The purpose of this track is to provide an efficient way to benchmark indexing of this data type as the generator built into the rally-eventdata-track can be CPU intensive.
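If you want to double-check those figures against a local copy of the corpus, a small sketch along these lines (assuming the `eventdata.json.bz2` corpus file has already been downloaded into the working directory) streams the compressed file once and reports the document count and average record size:

```python
import bz2

# Stream the compressed corpus once and report how many documents it contains
# and the average size of a single JSON record, for comparison with the numbers above.
docs = 0
total_bytes = 0
with bz2.open("eventdata.json.bz2", "rb") as corpus:
    for line in corpus:
        docs += 1
        total_bytes += len(line)

print(f"{docs} documents, {total_bytes / docs:.0f} bytes per record on average")
```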
9 | 10 | ### Example Document 11 | 12 | ```json 13 | { 14 | "agent": "\"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\"", 15 | "useragent": { 16 | "os": "Mac OS X 10.10.2", 17 | "os_name": "Mac OS X", 18 | "name": "Chrome" 19 | }, 20 | "geoip": { 21 | "country_name": "India", 22 | "location": [80.2833, 13.083300000000008] 23 | }, 24 | "clientip": "122.178.238.140", 25 | "referrer": "\"-\"", 26 | "request": "/apple-touch-icon-144x144.png", 27 | "bytes": 0, 28 | "verb": "GET", 29 | "response": 304, 30 | "httpversion": "1.1", 31 | "@timestamp": "2017-07-03T07:51:49.995Z", 32 | "message": "122.178.238.140 - - [2017-07-03T07:51:49.995Z] \"GET /apple-touch-icon-144x144.png HTTP/1.1\" 304 0 \"-\" \"-\" \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36\"" 33 | } 34 | ``` 35 | 36 | ### Parameters 37 | 38 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 39 | 40 | * `bulk_size` (default: 5000) 41 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 42 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 43 | * `number_of_replicas` (default: 0) 44 | * `number_of_shards` (default: 5) 45 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 46 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 47 | * `cluster_health` (default: "green"): The minimum required cluster health. 48 | 49 | ### License 50 | 51 | This is licensed under the Apache License, version 2 ("ALv2"), quoted below. 52 | 53 | Copyright 2015-2018 Elasticsearch https://www.elastic.co 54 | 55 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at 56 | 57 | http://www.apache.org/licenses/LICENSE-2.0 58 | Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
59 | -------------------------------------------------------------------------------- /nyc_taxis/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(10000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(10000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('update')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "range", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "range": { 32 | "total_amount": { 33 | "gte": 5, 34 | "lt": 15 35 | } 36 | } 37 | } 38 | } 39 | }, 40 | { 41 | "name": "distance_amount_agg", 42 | "operation-type": "search", 43 | "body": { 44 | "size": 0, 45 | "aggs": { 46 | "distance_histo": { 47 | "histogram": { 48 | "field": "distance", 49 | "interval": 1 50 | }, 51 | "aggs": { 52 | "total_amount_stats": { 53 | "stats": { 54 | "field": "total_amount" 55 | } 56 | } 57 | } 58 | } 59 | } 60 | } 61 | }, 62 | { 63 | "name": "autohisto_agg", 64 | "operation-type": "search", 65 | "body": { 66 | "size": 0, 67 | "query": { 68 | "range": { 69 | "dropoff_datetime": { 70 | "gte": "01/01/2015", 71 | "lte": "21/01/2015", 72 | "format": "dd/MM/yyyy" 73 | } 74 | } 75 | }, 76 | "aggs": { 77 | "dropoffs_over_time": { 78 | "auto_date_histogram": { 79 | "field": "dropoff_datetime", 80 | "buckets": 20 81 | } 82 | } 83 | } 84 | } 85 | }, 86 | { 87 | "name": "date_histogram_agg", 88 | "operation-type": "search", 89 | "body": { 90 | "size": 0, 91 | "query": { 92 | "range": { 93 | "dropoff_datetime": { 94 | "gte": "01/01/2015", 95 | "lte": "21/01/2015", 96 | "format": "dd/MM/yyyy" 97 | } 98 | } 99 | }, 100 | "aggs": { 101 | "dropoffs_over_time": { 102 | "date_histogram": { 103 | "field": "dropoff_datetime", 104 | "calendar_interval": "day" 105 | } 106 | } 107 | } 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /geonames/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(5)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.store.type": "{{store_type | default('fs')}}", 6 | "index.requests.cache.enable": false 7 | }, 8 | "mappings": { 9 | "dynamic": "strict", 10 | "_source": { 11 | "enabled": {{ source_enabled | default(true) | tojson }} 12 | }, 13 | "properties": { 14 | "elevation": { 15 | "type": "integer" 16 | }, 17 | "name": { 18 | "type": "text", 19 | "fields": { 20 | "raw": { 21 | "type": "keyword" 22 | } 23 | } 24 | }, 25 | "geonameid": { 26 | "type": "long" 27 | }, 28 | "feature_class": { 29 | "type": "text", 30 | "fields": { 31 | "raw": { 32 | "type": "keyword" 33 | } 34 | } 35 | }, 36 | "location": { 37 | "type": "geo_point" 38 | }, 39 | "cc2": { 40 | "type": "text", 41 | "fields": { 42 | "raw": { 43 | "type": "keyword" 44 | } 45 | } 46 | }, 47 | "timezone": { 48 | "type": "text", 49 | "fields": { 50 | "raw": { 51 | "type": "keyword" 52 | } 53 | 
} 54 | }, 55 | "dem": { 56 | "type": "text", 57 | "fields": { 58 | "raw": { 59 | "type": "keyword" 60 | } 61 | } 62 | }, 63 | "country_code": { 64 | "type": "text", 65 | "fielddata": true, 66 | "fields": { 67 | "raw": { 68 | "type": "keyword" 69 | } 70 | } 71 | }, 72 | "admin1_code": { 73 | "type": "text", 74 | "fields": { 75 | "raw": { 76 | "type": "keyword" 77 | } 78 | } 79 | }, 80 | "admin2_code": { 81 | "type": "text", 82 | "fields": { 83 | "raw": { 84 | "type": "keyword" 85 | } 86 | } 87 | }, 88 | "admin3_code": { 89 | "type": "text", 90 | "fields": { 91 | "raw": { 92 | "type": "keyword" 93 | } 94 | } 95 | }, 96 | "admin4_code": { 97 | "type": "text", 98 | "fields": { 99 | "raw": { 100 | "type": "keyword" 101 | } 102 | } 103 | }, 104 | "feature_code": { 105 | "type": "text", 106 | "fields": { 107 | "raw": { 108 | "type": "keyword" 109 | } 110 | } 111 | }, 112 | "alternatenames": { 113 | "type": "text", 114 | "fields": { 115 | "raw": { 116 | "type": "keyword" 117 | } 118 | } 119 | }, 120 | "asciiname": { 121 | "type": "text", 122 | "fields": { 123 | "raw": { 124 | "type": "keyword" 125 | } 126 | } 127 | }, 128 | "population": { 129 | "type": "long" 130 | } 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /http_logs/_tools/unparse.rb: -------------------------------------------------------------------------------- 1 | require "json" 2 | require "time" 3 | 4 | ################ 5 | # 6 | # Reconstructs (un-parses) the existing http_logs corpora (data set). The introduction of ingest node pipelines 7 | # requires the data to be JSON, but un-parsed log lines. This script was used to create the `http_logs_unparsed`, which 8 | # is a mirror copy of "http_logs`, except it is un-parsed AND the timestamp is ISO8601 (not epoch_seconds) 9 | # 10 | # The output of this is is a file with lines of JSON that appear as follows: 11 | # 12 | # {"message" : "30.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /images/info.gif HTTP/1.0\" 200 1251"} 13 | # {"message" : "28.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/images/hm_official.gif HTTP/1.1\" 200 972"} 14 | # {"message" : "17.87.8.0 - - [1998-05-24T15:00:01-05:00] \"GET /french/hosts/cfo/images/cfo/cfophot3.jpg HTTP/1.0\" 200 6695"} 15 | # 16 | # Usage: 17 | # 18 | # rm *.unparse.json 19 | # rm *.bz2 20 | # 21 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-181998.json.bz2 22 | # bunzip2 documents-181998.json.bz2 23 | # 24 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-191998.json.bz2 25 | # bunzip2 documents-191998.json.bz2 26 | # 27 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-201998.json.bz2 28 | # bunzip2 documents-201998.json.bz2 29 | # 30 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-211998.json.bz2 31 | # bunzip2 documents-211998.json.bz2 32 | # 33 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-221998.json.bz2 34 | # bunzip2 documents-221998.json.bz2 35 | # 36 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-231998.json.bz2 37 | # bunzip2 documents-231998.json.bz2 38 | # 39 | # wget http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs/documents-241998.json.bz2 40 | # bunzip2 documents-241998.json.bz2 41 | # 42 | # ruby unparse.rb . 
43 | # 44 | # ############# 45 | 46 | def self.getValue(data,key) 47 | data[key].nil? ? "-" : data[key].to_s 48 | end 49 | 50 | threads = 4 51 | running = 0 52 | Dir.glob(File.join(ARGV[0], "*.json")).select do |file| 53 | File.open(file.gsub('json', 'unparsed.json'), 'w') do |json_file| 54 | while running >= threads 55 | sleep 1 56 | end 57 | running = running + 1 58 | Thread.new do 59 | i = 0; 60 | File.open(file).each do |line| 61 | begin 62 | i += 1; 63 | print "." if i % 10000 == 0 64 | data = JSON.parse(line) 65 | logline = getValue(data,'clientip') + " - - [" + Time.at(data['@timestamp'].to_i).iso8601 + "] \\\"" + getValue(data,'request') + "\\\" " + getValue(data,'status') + " " + getValue(data,'size') 66 | json_log_line = "{\"message\" : \"" + logline + "\"}\n" 67 | #TODO: validate this is proper JSON. ~15 rows (.02%) were post modified to remove an invalid '\' char in the resultant JSON 68 | json_file.write(json_log_line) 69 | rescue => e 70 | puts e 71 | end 72 | end 73 | running = running - 1 74 | end 75 | while running > 0 76 | sleep 1 77 | end 78 | end 79 | end -------------------------------------------------------------------------------- /pmc/README.md: -------------------------------------------------------------------------------- 1 | ## PMC track 2 | 3 | This track contains data retrieved from [PMC](https://www.ncbi.nlm.nih.gov/pmc/tools/ftp/). 4 | 5 | Note that we have filtered the data set so only a subset of the articles is included. 6 | 7 | ### Example Document 8 | 9 | Note that the ``body`` content is actually much longer has been shortened here to increase readability. 10 | 11 | ```json 12 | { 13 | "name": "3_Biotech_2015_Dec_13_5(6)_1007-1019", 14 | "journal": "3 Biotech", 15 | "date": "2015 Dec 13", 16 | "volume": "5(6)", 17 | "issue": "1007-1019", 18 | "accession": "PMC4624133", 19 | "timestamp": "2015-10-30 20:08:11", 20 | "pmid": "", 21 | "body": "\n==== Front\n3 Biotech3 Biotech3 Biotech2190-572X2190-5738Springer ..." 22 | } 23 | ``` 24 | 25 | ### Parameters 26 | 27 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 28 | 29 | * `bulk_size` (default: 500) 30 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 31 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 32 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 33 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 34 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 35 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 
36 | * `number_of_replicas` (default: 0) 37 | * `number_of_shards` (default: 5) 38 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 39 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 40 | * [`default_search_timeout`](https://www.elastic.co/guide/en/elasticsearch/reference/6.0/search.html#global-search-timeout) (default: -1) 41 | * `cluster_health` (default: "green"): The minimum required cluster health. 42 | 43 | ### License 44 | 45 | All articles that are included are licensed as CC-BY (http://creativecommons.org/licenses/by/2.0/) 46 | 47 | This data set is licensed under the same terms. Please refer to http://creativecommons.org/licenses/by/2.0/ for details. 48 | 49 | Attribution hint: 50 | 51 | You can download a full list of the author information for each included document from http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/pmc/attribution.txt.bz2 (size: 52.2MB) 52 | -------------------------------------------------------------------------------- /geonames/README.md: -------------------------------------------------------------------------------- 1 | ## Geonames track 2 | 3 | This track is based on a [geonames](http://www.geonames.org/) dump of the file [allCountries.zip](http://download.geonames.org/export/dump/allCountries.zip) retrieved as of April 27, 2017. 4 | 5 | For further details about the semantics of individual fields, please see the [geonames dump README](http://download.geonames.org/export/dump/readme.txt). 6 | 7 | Modifications: 8 | 9 | * The original CSV data have been converted to JSON. 10 | * We combine the original `longitude` and `latitude` fields to a new `location` field of type [geo_point](https://www.elastic.co/guide/en/elasticsearch/reference/current/geo-point.html). 11 | 12 | ### Example Document 13 | 14 | ```json 15 | { 16 | "geonameid": 2986043, 17 | "name": "Pic de Font Blanca", 18 | "asciiname": "Pic de Font Blanca", 19 | "alternatenames": "Pic de Font Blanca,Pic du Port", 20 | "feature_class": "T", 21 | "feature_code": "PK", 22 | "country_code": "AD", 23 | "admin1_code": "00", 24 | "population": 0, 25 | "dem": "2860", 26 | "timezone": "Europe/Andorra", 27 | "location": [ 28 | 1.53335, 29 | 42.64991 30 | ] 31 | } 32 | ``` 33 | 34 | ### Parameters 35 | 36 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 37 | 38 | * `bulk_size` (default: 5000) 39 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 40 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 41 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 42 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires to run the respective challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 43 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. 
44 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. This requires to run the respective challenge. 45 | * `number_of_replicas` (default: 0) 46 | * `number_of_shards` (default: 5) 47 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 48 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 49 | * `cluster_health` (default: "green"): The minimum required cluster health. 50 | 51 | ### License 52 | 53 | We use the same license for the data as the original data from Geonames: 54 | 55 | ``` 56 | This work is licensed under a Creative Commons Attribution 3.0 License, 57 | see http://creativecommons.org/licenses/by/3.0/ 58 | The Data is provided "as is" without warranty or any representation of accuracy, timeliness or completeness. 59 | ``` 60 | -------------------------------------------------------------------------------- /geoshape/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osm*", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append-linestrings", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-linestrings-index", 33 | "operation": "refresh", 34 | "index": "osmlinestrings", 35 | "clients": 1 36 | }, 37 | { 38 | "name": "force-merge-linestrings", 39 | "operation": "force-merge", 40 | "index": "osmlinestrings", 41 | "clients": 1 42 | }, 43 | { 44 | "operation": "index-append-multilinestrings", 45 | "warmup-time-period": 120, 46 | "clients": {{bulk_indexing_clients | default(8)}} 47 | }, 48 | { 49 | "name": "refresh-after-multilinestrings-index", 50 | "operation": "refresh", 51 | "index": "osmmultilinestrings", 52 | "clients": 1 53 | }, 54 | { 55 | "name": "force-merge-multilinestrings", 56 | "operation": "force-merge", 57 | "index": "osmmultilinestrings", 58 | "clients": 1 59 | }, 60 | { 61 | "operation": "index-append-polygons", 62 | "warmup-time-period": 120, 63 | "clients": {{bulk_indexing_clients | default(8)}} 64 | }, 65 | { 66 | "name": "refresh-after-polygons-index", 67 | "operation": "refresh", 68 | "index": "osmpolygons", 69 | "clients": 1 70 | }, 71 | { 72 | "name": "force-merge-polygons", 73 | "operation": "force-merge", 74 | "index": "osmpolygons", 75 | "clients": 1 76 | }, 77 | { 78 | "name": "refresh-after-all-indices", 79 | "operation": 
"refresh", 80 | "clients": 1 81 | }, 82 | { 83 | "operation": "polygon", 84 | "clients": 1, 85 | "warmup-iterations": 200, 86 | "iterations": 100, 87 | "target-throughput": 0.3 88 | }, 89 | { 90 | "operation": "bbox", 91 | "clients": 1, 92 | "warmup-iterations": 200, 93 | "iterations": 100, 94 | "target-throughput": 0.25 95 | } 96 | ] 97 | } 98 | -------------------------------------------------------------------------------- /nyc_taxis/README.md: -------------------------------------------------------------------------------- 1 | ## NYC taxis track 2 | 3 | This track contains the rides that have been performed in yellow taxis in New York in 2015. It can be downloaded from http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml. 4 | 5 | This has only been tested with the 2015 dump, but this should work with any dump of the yellow taxis, and should be easy to adapt to the green taxis. 6 | 7 | Once downloaded, you can generate the mappings with: 8 | 9 | ``` 10 | python3 _tools/parse.py mappings 11 | ``` 12 | 13 | And the json documents can be generated with: 14 | 15 | ``` 16 | python3 _tools/parse.py json file_name.csv > documents.json 17 | ``` 18 | 19 | Finally the json docs can be compressed with: 20 | 21 | ``` 22 | bzip2 -k documents.json 23 | ``` 24 | 25 | ### Example Document 26 | 27 | ```json 28 | { 29 | "total_amount": 6.3, 30 | "improvement_surcharge": 0.3, 31 | "pickup_location": [ 32 | -73.92259216308594, 33 | 40.7545280456543 34 | ], 35 | "pickup_datetime": "2015-01-01 00:34:42", 36 | "trip_type": "1", 37 | "dropoff_datetime": "2015-01-01 00:38:34", 38 | "rate_code_id": "1", 39 | "tolls_amount": 0.0, 40 | "dropoff_location": [ 41 | -73.91363525390625, 42 | 40.76552200317383 43 | ], 44 | "passenger_count": 1, 45 | "fare_amount": 5.0, 46 | "extra": 0.5, 47 | "trip_distance": 0.88, 48 | "tip_amount": 0.0, 49 | "store_and_fwd_flag": "N", 50 | "payment_type": "2", 51 | "mta_tax": 0.5, 52 | "vendor_id": "2" 53 | } 54 | ``` 55 | 56 | ### Parameters 57 | 58 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 59 | 60 | * `bulk_size` (default: 10000) 61 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 62 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 63 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (A document id is replaced with a document id with a sequentially increasing id), 'random' (A document id is replaced with a document id with a random other id). 64 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. Only used by the `update` challenge. Combining ``conflicts=sequential`` and ``conflict-probability=0`` makes Rally generate index ids by itself, instead of relying on Elasticsearch's `automatic id generation `_. 65 | * `on_conflict` (default: "index"): Whether to use an "index" or an "update" action when simulating an id conflict. Only used by the `update` challenge. 66 | * `recency` (default: 0): A number between 0 and 1 that defines whether to bias towards more recent ids when simulating conflicts. See the [Rally docs](http://esrally.readthedocs.io/en/latest/track.html#bulk) for the full definition of this parameter. Only used by the `update` challenge. 
67 | * `number_of_replicas` (default: 0) 68 | * `number_of_shards` (default: 1) 69 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 70 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 71 | * `cluster_health` (default: "green"): The minimum required cluster health. 72 | 73 | ### License 74 | 75 | According to the [Open Data Law](https://opendata.cityofnewyork.us/open-data-law/) this data is available as public domain. 76 | -------------------------------------------------------------------------------- /percolator/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "percolator_with_content_president_bush", 9 | "operation-type": "search", 10 | "body": { 11 | "query": { 12 | "percolate": { 13 | "field": "query", 14 | "document": { 15 | "body": "president bush" 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | { 22 | "name": "percolator_with_content_saddam_hussein", 23 | "operation-type": "search", 24 | "body": { 25 | "query": { 26 | "percolate": { 27 | "field": "query", 28 | "document": { 29 | "body": "saddam hussein" 30 | } 31 | } 32 | } 33 | } 34 | }, 35 | { 36 | "name": "percolator_with_content_hurricane_katrina", 37 | "operation-type": "search", 38 | "body": { 39 | "query": { 40 | "percolate": { 41 | "field": "query", 42 | "document": { 43 | "body": "hurricane katrina" 44 | } 45 | } 46 | } 47 | } 48 | }, 49 | { 50 | "name": "percolator_with_content_google", 51 | "operation-type": "search", 52 | "body": { 53 | "query": { 54 | "percolate": { 55 | "field": "query", 56 | "document": { 57 | "body": "google" 58 | } 59 | } 60 | } 61 | } 62 | }, 63 | { 64 | "name": "percolator_no_score_with_content_google", 65 | "operation-type": "search", 66 | "body": { 67 | "query": { 68 | "constant_score": { 69 | "filter": { 70 | "percolate": { 71 | "field": "query", 72 | "document": { 73 | "body": "google" 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | }, 81 | { 82 | "name": "percolator_with_highlighting", 83 | "operation-type": "search", 84 | "body": { 85 | "query": { 86 | "percolate": { 87 | "field": "query", 88 | "document": { 89 | "body": "Israeli prime minister Ariel Sharon suffers a massive stroke; he is replaced by acting prime minister Ehud Olmert" 90 | } 91 | } 92 | }, 93 | "highlight": { 94 | "fields": { 95 | "body": {} 96 | } 97 | } 98 | } 99 | }, 100 | { 101 | "name": "percolator_with_content_ignore_me", 102 | "operation-type": "search", 103 | "body": { 104 | "query": { 105 | "percolate": { 106 | "field": "query", 107 | "document": { 108 | "body": "ignore me" 109 | } 110 | } 111 | } 112 | } 113 | }, 114 | { 115 | "name": "percolator_no_score_with_content_ignore_me", 116 | "operation-type": "search", 117 | "body": { 118 | "query": { 119 | "constant_score": { 120 | "filter": { 121 | "percolate": { 122 | "field": "query", 123 | "document": { 124 | "body": "ignore me" 125 | } 126 | } 127 | } 128 | } 129 | } 130 | } 131 | } -------------------------------------------------------------------------------- /nyc_taxis/_tools/parse.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import sys 4 | import re 5 | 6 | types = {} 7 | for f in 
["vendor_id","cab_color","payment_type","trip_type","rate_code_id","store_and_fwd_flag"]: 8 | types[f] = 'keyword' 9 | for f in ["vendor_name"]: 10 | types[f] = 'text' 11 | for f in ["passenger_count"]: 12 | types[f] = 'integer' 13 | for f in ["pickup_location", "dropoff_location"]: 14 | types[f] = 'geo_point' 15 | for f in ["trip_distance", "fare_amount", "surcharge", "mta_tax", "extra", "ehail_fee", "improvement_surcharge", "tip_amount", "tolls_amount", "total_amount"]: 16 | types[f] = 'scaled_float' 17 | for f in ["pickup_datetime", "dropoff_datetime"]: 18 | types[f] = 'date' 19 | 20 | def write_mappings(): 21 | mappings = {} 22 | for (k, v) in types.items(): 23 | mappings[k] = { "type": v } 24 | if v == 'date': 25 | mappings[k]['format'] = "yyyy-MM-dd HH:mm:ss" 26 | elif v == 'scaled_float': 27 | mappings[k]['scaling_factor'] = 100 28 | mappings = { "properties": mappings } 29 | mappings['_all'] = { "enabled": False } 30 | mappings['dynamic'] = 'strict' 31 | mappings = { "type": mappings } 32 | print(json.dumps(mappings, indent=2)) 33 | 34 | def to_geo_point(d, f): 35 | lat_field = f + "_latitude" 36 | lon_field = f + "_longitude" 37 | if lat_field in d and lon_field in d: 38 | longitude = float(d[lon_field]) 39 | latitude = float(d[lat_field]) 40 | if longitude < -180 or longitude > 180 or latitude < -90 or latitude > 90: 41 | raise Exception("Malformed coordinates") 42 | d[f + '_location'] = [float(d[lon_field]), float(d[lat_field])] 43 | del d[lon_field] 44 | del d[lat_field] 45 | 46 | def to_underscore(s): 47 | s = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s) 48 | return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s).lower() 49 | 50 | def to_json(f): 51 | fields = [] 52 | for field in f.readline().strip().split(','): 53 | field = to_underscore(field) 54 | if field.startswith('tpep_') or field.startswith('lpep_'): 55 | field = field[5:] 56 | elif field == 'ratecode_id': 57 | field = 'rate_code_id' 58 | fields.append(field) 59 | for line in f.readlines(): 60 | cols = line.strip().split(',') 61 | if len(cols) < len(fields): 62 | raise Exception("Cannot parse '%s': number of fields does not match '%s'" %(line, ",".join(fields))) 63 | 64 | try: 65 | d = {} 66 | for i in range(len(fields)): 67 | field = fields[i] 68 | value = cols[i] 69 | if value != '': # the way csv says the field does not exist 70 | d[field] = value 71 | 72 | to_geo_point(d, 'pickup') 73 | to_geo_point(d, 'dropoff') 74 | 75 | for (k, v) in d.items(): 76 | if k not in types: 77 | raise Exception("Unknown field '%s'" %k) 78 | t = types[k] 79 | try: 80 | if t == 'integer': 81 | d[k] = int(v) 82 | elif t == 'float': 83 | d[k] = float(v) 84 | except Exception as cause: 85 | raise Exception("Cannot parse (%s,%s)" %(k, v)) from cause 86 | 87 | print(json.dumps(d)) 88 | except KeyboardInterrupt: 89 | break 90 | except Exception as e: 91 | print("Skipping malformed entry '%s' because of %s" %(line, str(e)), file=sys.stderr) 92 | 93 | if sys.argv[1] == "json": 94 | for file_name in sys.argv[2:]: 95 | with open(file_name) as f: 96 | to_json(f) 97 | elif sys.argv[1] == "mappings": 98 | write_mappings() 99 | else: 100 | raise Exception("Expected 'json' or 'mappings' but got %s" %sys.argv[1]) 101 | -------------------------------------------------------------------------------- /percolator/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. 
We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green and we want to ensure that we don't use the query cache. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "queries", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index", 28 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. As this benchmark is rather about search than indexing this is ok.", 29 | "warmup-time-period": 10, 30 | "clients": {{bulk_indexing_clients | default(8)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "percolator_with_content_president_bush", 48 | "clients": 1, 49 | "warmup-iterations": 100, 50 | "iterations": 100, 51 | "target-throughput": 50 52 | }, 53 | { 54 | "operation": "percolator_with_content_saddam_hussein", 55 | "clients": 1, 56 | "warmup-iterations": 100, 57 | "iterations": 100, 58 | "target-throughput": 50 59 | }, 60 | { 61 | "operation": "percolator_with_content_hurricane_katrina", 62 | "clients": 1, 63 | "warmup-iterations": 100, 64 | "iterations": 100, 65 | "target-throughput": 50 66 | }, 67 | { 68 | "operation": "percolator_with_content_google", 69 | "clients": 1, 70 | "warmup-iterations": 100, 71 | "iterations": 100, 72 | "target-throughput": 35 73 | }, 74 | { 75 | "operation": "percolator_no_score_with_content_google", 76 | "clients": 1, 77 | "warmup-iterations": 100, 78 | "iterations": 100, 79 | "target-throughput": 100 80 | }, 81 | { 82 | "operation": "percolator_with_highlighting", 83 | "clients": 1, 84 | "warmup-iterations": 100, 85 | "iterations": 100, 86 | "target-throughput": 50 87 | }, 88 | { 89 | "operation": "percolator_with_content_ignore_me", 90 | "clients": 1, 91 | "warmup-iterations": 10, 92 | "iterations": 100, 93 | "#COMMENT": "Be aware that we specify *target-interval* here! 
This means we issue one query every 12 seconds", 94 | "target-interval": 12 95 | }, 96 | { 97 | "operation": "percolator_no_score_with_content_ignore_me", 98 | "clients": 1, 99 | "warmup-iterations": 100, 100 | "iterations": 100, 101 | "target-throughput": 15 102 | } 103 | ] 104 | } 105 | -------------------------------------------------------------------------------- /geonames/track.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | 4 | 5 | class QueryParamSource: 6 | # We need to stick to the param source API 7 | # noinspection PyUnusedLocal 8 | def __init__(self, track, params, **kwargs): 9 | self._params = params 10 | self.infinite = True 11 | cwd = os.path.dirname(__file__) 12 | # The terms.txt file has been generated with: 13 | # sed -n '13~250p' [path_to_rally_data]/geonames/documents.json | shuf | sed -e "s/.*name\": \"//;s/\",.*$//" > terms.txt 14 | with open(os.path.join(cwd, "terms.txt"), "r") as ins: 15 | self.terms = [line.strip() for line in ins.readlines()] 16 | 17 | # We need to stick to the param source API 18 | # noinspection PyUnusedLocal 19 | def partition(self, partition_index, total_partitions): 20 | return self 21 | 22 | # Deprecated - only there for BWC reasons with Rally < 1.4.0 23 | def size(self): 24 | return 1 25 | 26 | 27 | class PureTermsQueryParamSource(QueryParamSource): 28 | def params(self): 29 | query_terms = list(self.terms) # copy 30 | query_terms.append(str(random.randint(1, 100))) # avoid caching 31 | result = { 32 | "body": { 33 | "query": { 34 | "terms": { 35 | "name.raw": query_terms 36 | } 37 | } 38 | }, 39 | "index": None 40 | } 41 | if "cache" in self._params: 42 | result["cache"] = self._params["cache"] 43 | 44 | return result 45 | 46 | 47 | class FilteredTermsQueryParamSource(QueryParamSource): 48 | def params(self): 49 | query_terms = list(self.terms) # copy 50 | query_terms.append(str(random.randint(1, 1000))) # avoid caching 51 | result = { 52 | "body": { 53 | "query": { 54 | "bool": { 55 | "must": [ 56 | { 57 | "match": { 58 | "feature_class.raw": "T" 59 | } 60 | } 61 | ], 62 | "filter": [ 63 | { 64 | "terms": { 65 | "name.raw": query_terms 66 | } 67 | } 68 | ] 69 | } 70 | } 71 | }, 72 | "index": None 73 | } 74 | if "cache" in self._params: 75 | result["cache"] = self._params["cache"] 76 | 77 | return result 78 | 79 | 80 | class ProhibitedTermsQueryParamSource(QueryParamSource): 81 | def params(self): 82 | query_terms = list(self.terms) # copy 83 | query_terms.append(str(random.randint(1, 1000))) # avoid caching 84 | result = { 85 | "body": { 86 | "query": { 87 | "bool": { 88 | "must": [ 89 | { 90 | "match": { 91 | "feature_class.raw": "A" 92 | } 93 | } 94 | ], 95 | "must_not": [ 96 | { 97 | "terms": { 98 | "name.raw": query_terms 99 | } 100 | } 101 | ] 102 | } 103 | } 104 | }, 105 | "index": None 106 | } 107 | if "cache" in self._params: 108 | result["cache"] = self._params["cache"] 109 | 110 | return result 111 | 112 | 113 | def refresh(es, params): 114 | es.indices.refresh(index=params.get("index", "_all")) 115 | 116 | 117 | def register(registry): 118 | registry.register_param_source("pure-terms-query-source", PureTermsQueryParamSource) 119 | registry.register_param_source("filtered-terms-query-source", FilteredTermsQueryParamSource) 120 | registry.register_param_source("prohibited-terms-query-source", ProhibitedTermsQueryParamSource) 121 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | rally-tracks 2 | ------------ 3 | 4 | This repository contains the default track specifications for the Elasticsearch benchmarking tool [Rally](https://github.com/elastic/rally). 5 | 6 | Tracks are used to describe benchmarks in Rally. 7 | 8 | You should not need to use this repository directly, except if you want to look under the hood or create your own tracks. We have created a [tutorial on how to create your own tracks](https://esrally.readthedocs.io/en/latest/adding_tracks.html). 9 | 10 | Versioning Scheme 11 | ----------------- 12 | 13 | From time to time, setting and mapping formats change in Elasticsearch. As we want to be able to support multiple versions of Elasticsearch, we also need to version track specifications. Therefore, this repository contains multiple branches. The following examples should give you an idea of how the versioning scheme works: 14 | 15 | * master: tracks on this branch are compatible with the latest development version of Elasticsearch 16 | * 5.0.0-alpha2: compatible with the released version 5.0.0-alpha2. 17 | * 2: compatible with all Elasticsearch releases with the major release number 2 (e.g. 2.1, 2.2, 2.2.1) 18 | * 1.7: compatible with all Elasticsearch releases with the major release number 1 and minor release number 7 (e.g. 1.7.0, 1.7.1, 1.7.2) 19 | 20 | As you can see, branches can match exact release numbers but Rally is also lenient in case settings and mapping formats did not change for a few releases. Rally will try to match in the following order: 21 | 22 | 1. major.minor.patch-extension_label (e.g. 5.0.0-alpha5) 23 | 2. major.minor.patch (e.g. 2.3.1) 24 | 3. major.minor (e.g. 2.3) 25 | 4. major (e.g. 2) 26 | 27 | Apart from that, the master branch is always considered to be compatible with the Elasticsearch master branch. 28 | 29 | To specify the version to check against, add `--distribution-version` when running Rally. If it is not specified, Rally assumes that you want to benchmark against the Elasticsearch master version. 30 | 31 | Example: If you want to benchmark Elasticsearch 5.0.0, run the following command: 32 | 33 | ``` 34 | esrally --distribution-version=5.0.0 35 | ``` 36 | 37 | How to Contribute 38 | ----------------- 39 | 40 | If you want to contribute a track, please ensure that it works against the master version of Elasticsearch (i.e. submit PRs against the master branch). We can then check whether it's feasible to backport the track to earlier Elasticsearch versions. 41 | 42 | See all details in the [contributor guidelines](https://github.com/elastic/rally/blob/master/CONTRIBUTING.md). 43 | 44 | Backporting changes 45 | ------------------- 46 | 47 | If you are a contributor with direct commit access to this repository then please backport your changes. This ensures that tracks do not work only for the latest `master` version of Elasticsearch but also for older versions. Apply backports with cherry-picks. Below you can find a walkthrough: 48 | 49 | Assume we've pushed commit `a7e0937` to master and want to backport it. This is a change to the `noaa` track. Let's check what branches are available for backporting: 50 | 51 | ``` 52 | daniel@io:tracks/default ‹master›$ git branch -r 53 | origin/1 54 | origin/2 55 | origin/5 56 | origin/HEAD -> origin/master 57 | origin/master 58 | ``` 59 | 60 | We'll go backwards starting from branch `5`, then branch `2` and finally branch `1`.
After applying a change, we will test whether the track works as is for an older version of Elasticsearch. 61 | 62 | ``` 63 | git checkout 5 64 | git cherry-pick a7e0937 65 | 66 | # test the change now with an Elasticsearch 5.x distribution 67 | esrally --track=noaa --distribution-version=5.4.3 --test-mode 68 | 69 | # push the change 70 | git push origin 5 71 | ``` 72 | 73 | This particular track uses features that are only available in Elasticsearch 5 and later so we will stop here but the process continues until we've reached the earliest branch. 74 | 75 | Sometimes it is necessary to remove individual operations from a track that are not supported by earlier versions. This graceful fallback is a compromise to allow to run a subset of the track on older versions of Elasticsearch too. If this is necessary then it's best to do these changes in a separate commit. Also, don't forget to cherry-pick this separate commit too to even earlier versions if necessary. 76 | 77 | 78 | License 79 | ------- 80 | 81 | There is no single license for this repository. Licenses are chosen per track. They are typically licensed under the same terms as the source data. See the README files of each track for more details. -------------------------------------------------------------------------------- /nested/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nested-search-challenge", 3 | "description": "Indexes the document corpus for an hour using Elasticsearch default settings. After that randomized nested queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "sonested", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "time-period": 3600, 30 | "clients": {{bulk_indexing_clients | default(4)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "randomized-nested-queries", 48 | "clients": 2, 49 | "target-throughput": 20, 50 | "warmup-iterations": 500, 51 | "iterations": 1000 52 | }, 53 | { 54 | "operation": "randomized-term-queries", 55 | "clients": 2, 56 | "target-throughput": 25, 57 | "warmup-iterations": 500, 58 | "iterations": 200 59 | }, 60 | { 61 | "operation": "randomized-sorted-term-queries", 62 | "clients": 2, 63 | "warmup-iterations": 500, 64 | "target-throughput": 16, 65 | "iterations": 200 66 | }, 67 | { 68 | "operation": "match-all", 69 | "clients": 2, 70 | "target-throughput": 5, 71 | "warmup-iterations": 500, 72 | "iterations": 200 73 | }, 74 | { 75 | "operation": "nested-date-histo", 76 | "clients": 2, 77 | "target-throughput": 1, 78 | "warmup-iterations": 100, 79 | "iterations": 200 80 | }, 81 | { 82 | "operation": "randomized-nested-queries-with-inner-hits_default", 83 | "clients": 2, 84 | "target-throughput": 18, 85 | "warmup-iterations": 500, 86 | "iterations": 1000 87 | }, 88 | { 89 | 
"operation": "randomized-nested-queries-with-inner-hits_default_big_size", 90 | "clients": 2, 91 | "target-throughput": 16, 92 | "warmup-iterations": 500, 93 | "iterations": 1000 94 | } 95 | ] 96 | }, 97 | { 98 | "name": "index-only", 99 | "description": "Indexes the document corpus for an hour using Elasticsearch default settings.", 100 | "schedule": [ 101 | { 102 | "operation": "delete-index" 103 | }, 104 | { 105 | "operation": { 106 | "operation-type": "create-index", 107 | "settings": {{index_settings | default({}) | tojson}} 108 | } 109 | }, 110 | { 111 | "name": "check-cluster-health", 112 | "operation": { 113 | "operation-type": "cluster-health", 114 | "index": "sonested", 115 | "request-params": { 116 | "wait_for_status": "{{cluster_health | default('green')}}", 117 | "wait_for_no_relocating_shards": "true" 118 | } 119 | } 120 | }, 121 | { 122 | "operation": "index-append", 123 | "warmup-time-period": 120, 124 | "time-period": 3600, 125 | "clients": {{bulk_indexing_clients | default(4)}} 126 | }, 127 | { 128 | "name": "refresh-after-index", 129 | "operation": "refresh", 130 | "clients": 1 131 | }, 132 | { 133 | "operation": "force-merge", 134 | "clients": 1 135 | }, 136 | { 137 | "name": "refresh-after-force-merge", 138 | "operation": "refresh", 139 | "clients": 1 140 | } 141 | ] 142 | } 143 | -------------------------------------------------------------------------------- /noaa/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "range_field_big_range", 9 | "operation-type": "search", 10 | "body": { 11 | "query": { 12 | "range": { 13 | "TRANGE": { 14 | "gte": 0, 15 | "lte": 30 16 | } 17 | } 18 | } 19 | } 20 | }, 21 | { 22 | "name": "range_field_small_range", 23 | "operation-type": "search", 24 | "body": { 25 | "query": { 26 | "range": { 27 | "TRANGE": { 28 | "gte": -20, 29 | "lte": -10 30 | } 31 | } 32 | } 33 | } 34 | }, 35 | { 36 | "name": "range_field_conjunction_big_range_small_term_query", 37 | "operation-type": "search", 38 | "body": { 39 | "query": { 40 | "bool": { 41 | "must": [ 42 | { 43 | "term": { 44 | "station.country_code": "JA" 45 | } 46 | }, 47 | { 48 | "range": { 49 | "TRANGE": { 50 | "gte": 0, 51 | "lte": 30 52 | } 53 | } 54 | } 55 | ] 56 | } 57 | } 58 | } 59 | }, 60 | { 61 | "name": "range_field_conjunction_small_range_small_term_query", 62 | "operation-type": "search", 63 | "body": { 64 | "query": { 65 | "bool": { 66 | "must": [ 67 | { 68 | "term": { 69 | "station.country_code": "JA" 70 | } 71 | }, 72 | { 73 | "range": { 74 | "TRANGE": { 75 | "gte": -20, 76 | "lte": -10 77 | } 78 | } 79 | } 80 | ] 81 | } 82 | } 83 | } 84 | }, 85 | { 86 | "name": "range_field_conjunction_small_range_big_term_query", 87 | "operation-type": "search", 88 | "body": { 89 | "query": { 90 | "bool": { 91 | "must": [ 92 | { 93 | "term": { 94 | "station.country_code": "US" 95 | } 96 | }, 97 | { 98 | "range": { 99 | "TRANGE": { 100 | "gte": -20, 101 | "lte": -10 102 | } 103 | } 104 | } 105 | ] 106 | } 107 | } 108 | } 109 | }, 110 | { 111 | "name": "range_field_conjunction_big_range_big_term_query", 112 | "operation-type": "search", 113 | "body": { 114 | "query": { 115 | "bool": { 116 | "must": [ 117 | { 118 | "term": { 119 | "station.country_code": "US" 120 | } 121 | }, 122 | { 123 | "range": { 124 | "TRANGE": { 125 | "gte": 0, 126 | "lte": 30 127 | 
} 128 | } 129 | } 130 | ] 131 | } 132 | } 133 | } 134 | }, 135 | { 136 | "name": "range_field_disjunction_small_range_small_term_query", 137 | "operation-type": "search", 138 | "body": { 139 | "query": { 140 | "bool": { 141 | "should": [ 142 | { 143 | "term": { 144 | "station.country_code": "JA" 145 | } 146 | }, 147 | { 148 | "range": { 149 | "TRANGE": { 150 | "gte": -20, 151 | "lte": -10 152 | } 153 | } 154 | } 155 | ] 156 | } 157 | } 158 | } 159 | }, 160 | { 161 | "name": "range_field_disjunction_big_range_small_term_query", 162 | "operation-type": "search", 163 | "body": { 164 | "query": { 165 | "bool": { 166 | "should": [ 167 | { 168 | "term": { 169 | "station.country_code": "JA" 170 | } 171 | }, 172 | { 173 | "range": { 174 | "TRANGE": { 175 | "gte": 0, 176 | "lte": 30 177 | } 178 | } 179 | } 180 | ] 181 | } 182 | } 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /percolator/README.md: -------------------------------------------------------------------------------- 1 | ## Percolator track 2 | 3 | The queries.json.bz2 file contains list of ES queries that has been randomly generated from the AOL query dataset published in 2006. Only specific queries have been selected and the rest of the file contains dummy queries. Only the query attribute is copied from the AOL query dataset, the rest of the attributes are not in this file. 4 | 5 | ### Example Document 6 | 7 | ```json 8 | { 9 | "query": { 10 | "match": { 11 | "body": { 12 | "query": "costa rica hurricanes" 13 | } 14 | } 15 | } 16 | } 17 | ``` 18 | 19 | ### Parameters 20 | 21 | This track allows to overwrite the following parameters with Rally 0.8.0+ using `--track-params`: 22 | 23 | * `bulk_size` (default: 5000) 24 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 25 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 26 | * `number_of_replicas` (default: 0) 27 | * `number_of_shards` (default: 5) 28 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 29 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 30 | * `cluster_health` (default: "green"): The minimum required cluster health. 31 | 32 | ### License 33 | 34 | AOL's original README: 35 | 36 | ``` 37 | 500k User Session Collection 38 | ---------------------------------------------- 39 | This collection is distributed for NON-COMMERCIAL RESEARCH USE ONLY. 40 | Any application of this collection for commercial purposes is STRICTLY PROHIBITED. 41 | 42 | Brief description: 43 | 44 | This collection consists of ~20M web queries collected from ~650k users over three months. 45 | The data is sorted by anonymous user ID and sequentially arranged. 46 | 47 | The goal of this collection is to provide real query log data that is based on real users. It could be used for personalization, query reformulation or other types of search research. 48 | 49 | The data set includes {AnonID, Query, QueryTime, ItemRank, ClickURL}. 50 | AnonID - an anonymous user ID number. 51 | Query - the query issued by the user, case shifted with 52 | most punctuation removed. 53 | QueryTime - the time at which the query was submitted for search. 54 | ItemRank - if the user clicked on a search result, the rank of the 55 | item on which they clicked is listed. 
56 | ClickURL - if the user clicked on a search result, the domain portion of 57 | the URL in the clicked result is listed. 58 | 59 | Each line in the data represents one of two types of events: 60 | 1. A query that was NOT followed by the user clicking on a result item. 61 | 2. A click through on an item in the result list returned from a query. 62 | In the first case (query only) there is data in only the first three columns/fields -- namely AnonID, Query, and QueryTime (see above). 63 | In the second case (click through), there is data in all five columns. For click through events, the query that preceded the click through is included. Note that if a user clicked on more than one result in the list returned from a single query, there will be TWO lines in the data to represent the two events. Also note that if the user requested the next "page" or results for some query, this appears as a subsequent identical query with a later time stamp. 64 | 65 | CAVEAT EMPTOR -- SEXUALLY EXPLICIT DATA! Please be aware that these queries are not filtered to remove any content. Pornography is prevalent on the Web and unfiltered search engine logs contain queries by users who are looking for pornographic material. There are queries in this collection that use SEXUALLY EXPLICIT LANGUAGE. This collection of data is intended for use by mature adults who are not easily offended by the use of pornographic search terms. If you are offended by sexually explicit language you should not read through this data. Also be aware that in some states it may be illegal to expose a minor to this data. Please understand that the data represents REAL WORLD USERS, un-edited and randomly sampled, and that AOL is not the author of this data. 66 | 67 | Basic Collection Statistics 68 | Dates: 69 | 01 March, 2006 - 31 May, 2006 70 | 71 | Normalized queries: 72 | 36,389,567 lines of data 73 | 21,011,340 instances of new queries (w/ or w/o click-through) 74 | 7,887,022 requests for "next page" of results 75 | 19,442,629 user click-through events 76 | 16,946,938 queries w/o user click-through 77 | 10,154,742 unique (normalized) queries 78 | 657,426 unique user ID's 79 | 80 | 81 | Please reference the following publication when using this collection: 82 | 83 | G. Pass, A. Chowdhury, C. Torgeson, "A Picture of Search" The First 84 | International Conference on Scalable Information Systems, Hong Kong, June, 85 | 2006. 86 | 87 | Copyright (2006) AOL 88 | ``` 89 | -------------------------------------------------------------------------------- /http_logs/README.md: -------------------------------------------------------------------------------- 1 | ## HTTP logs track 2 | 3 | This track is based on [Web server logs from the 1998 Football world cup](http://ita.ee.lbl.gov/html/contrib/WorldCup.html). 4 | 5 | Modifications: 6 | 7 | * Applied number-to-IP conversion as suggested in the original readme 8 | * Removed illegal characters in "object_mappings.sort" 9 | * Transformed the source data to a bulk-friendly JSON format (ignoring all entries that 10 | contained unrecognised / problematic characters and invalid IP addresses like "0"; 11 | around 0.001% of the source data was lost due to this approach) 12 | 13 | ### Example Document 14 | 15 | ```json 16 | { 17 | "@timestamp": 898459201, 18 | "clientip": "211.11.9.0", 19 | "request": "GET /english/index.html HTTP/1.0", 20 | "status": 304, 21 | "size": 0 22 | } 23 | ``` 24 | 25 | Alternatively, an `unparsed` set of documents is also provided.
The `unparsed` data set is identical to the standard 26 | data set, except that the timestamp is in ISO 8601 format and all fields are left unparsed in a single `message` field. For example: 27 | 28 | ```json 29 | {"message" : "211.11.9.0 - - [1998-06-21T15:00:01-05:00] \"GET /english/index.html HTTP/1.0\" 304 0"} 30 | ``` 31 | 32 | ### Parameters 33 | 34 | This track allows overriding the following parameters with Rally 0.8.0+ using `--track-params`: 35 | 36 | * `bulk_size` (default: 5000) 37 | * `bulk_indexing_clients` (default: 8): Number of clients that issue bulk indexing requests. 38 | * `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. 39 | * `conflicts` (default: "random"): Type of id conflicts to simulate. Valid values are: 'sequential' (a document id is replaced with a sequentially increasing id) and 'random' (a document id is replaced with a random other id). 40 | * `conflict_probability` (default: 25): A number between 0 and 100 that defines the probability of id conflicts. This requires running the respective challenge. Combining `conflicts=sequential` and `conflict-probability=0` makes Rally generate index ids by itself, instead of relying on Elasticsearch's automatic id generation. 41 | * `number_of_replicas` (default: 0) 42 | * `number_of_shards` (default: 5) 43 | * `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. 44 | * `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 45 | * `cluster_health` (default: "green"): The minimum required cluster health. 46 | * `ingest_pipeline`: Only applicable to `--challenge=append-index-only-with-ingest-pipeline`; selects which ingest 47 | node pipeline to run. Valid options are `'baseline'` (default), `'grok'` and `'geoip'`. For example: `--challenge=append-index-only-with-ingest-pipeline --track-params="ingest_pipeline:'baseline'" ` 48 | 49 | ### License 50 | 51 | Original license text: 52 | 53 | Copyright (C) 1997, 1998, 1999 Hewlett-Packard Company 54 | ALL RIGHTS RESERVED. 55 | 56 | The enclosed software and documentation includes copyrighted works 57 | of Hewlett-Packard Co. For as long as you comply with the following 58 | limitations, you are hereby authorized to (i) use, reproduce, and 59 | modify the software and documentation, and to (ii) distribute the 60 | software and documentation, including modifications, for 61 | non-commercial purposes only. 62 | 63 | 1. The enclosed software and documentation is made available at no 64 | charge in order to advance the general development of 65 | the Internet, the World-Wide Web, and Electronic Commerce. 66 | 67 | 2. You may not delete any copyright notices contained in the 68 | software or documentation. All hard copies, and copies in 69 | source code or object code form, of the software or 70 | documentation (including modifications) must contain at least 71 | one of the copyright notices. 72 | 73 | 3. The enclosed software and documentation has not been subjected 74 | to testing and quality control and is not a Hewlett-Packard Co. 75 | product. At a future time, Hewlett-Packard Co. may or may not 76 | offer a version of the software and documentation as a product. 77 | 78 | 4. THE SOFTWARE AND DOCUMENTATION IS PROVIDED "AS IS".
79 | HEWLETT-PACKARD COMPANY DOES NOT WARRANT THAT THE USE, 80 | REPRODUCTION, MODIFICATION OR DISTRIBUTION OF THE SOFTWARE OR 81 | DOCUMENTATION WILL NOT INFRINGE A THIRD PARTY'S INTELLECTUAL 82 | PROPERTY RIGHTS. HP DOES NOT WARRANT THAT THE SOFTWARE OR 83 | DOCUMENTATION IS ERROR FREE. HP DISCLAIMS ALL WARRANTIES, 84 | EXPRESS AND IMPLIED, WITH REGARD TO THE SOFTWARE AND THE 85 | DOCUMENTATION. HP SPECIFICALLY DISCLAIMS ALL WARRANTIES OF 86 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 87 | 88 | 5. HEWLETT-PACKARD COMPANY WILL NOT IN ANY EVENT BE LIABLE FOR ANY 89 | DIRECT, INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES 90 | (INCLUDING LOST PROFITS) RELATED TO ANY USE, REPRODUCTION, 91 | MODIFICATION, OR DISTRIBUTION OF THE SOFTWARE OR DOCUMENTATION. 92 | -------------------------------------------------------------------------------- /noaa/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green and we want to ensure that we don't use the query cache. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "weather-data-2016", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index", 28 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. 
As this benchmark is rather about search than indexing this is ok.", 29 | "warmup-time-period": 10, 30 | "clients": {{bulk_indexing_clients | default(8)}} 31 | }, 32 | { 33 | "name": "refresh-after-index", 34 | "operation": "refresh", 35 | "clients": 1 36 | }, 37 | { 38 | "operation": "force-merge", 39 | "clients": 1 40 | }, 41 | { 42 | "name": "refresh-after-force-merge", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "range_field_big_range", 48 | "clients": 1, 49 | "warmup-iterations": 100, 50 | "iterations": 500, 51 | "target-throughput": 8 52 | }, 53 | { 54 | "operation": "range_field_small_range", 55 | "clients": 1, 56 | "warmup-iterations": 100, 57 | "iterations": 500, 58 | "target-throughput": 10 59 | }, 60 | { 61 | "operation": "range_field_conjunction_big_range_small_term_query", 62 | "clients": 1, 63 | "warmup-iterations": 100, 64 | "iterations": 500, 65 | "target-throughput": 10 66 | }, 67 | { 68 | "operation": "range_field_conjunction_small_range_small_term_query", 69 | "clients": 1, 70 | "warmup-iterations": 100, 71 | "iterations": 500, 72 | "target-throughput": 10 73 | }, 74 | { 75 | "operation": "range_field_conjunction_small_range_big_term_query", 76 | "clients": 1, 77 | "warmup-iterations": 100, 78 | "iterations": 500, 79 | "target-throughput": 4 80 | }, 81 | { 82 | "operation": "range_field_conjunction_big_range_big_term_query", 83 | "clients": 1, 84 | "warmup-iterations": 100, 85 | "iterations": 500, 86 | "target-throughput": 1 87 | }, 88 | { 89 | "operation": "range_field_disjunction_small_range_small_term_query", 90 | "clients": 1, 91 | "warmup-iterations": 100, 92 | "iterations": 500, 93 | "target-throughput": 10 94 | }, 95 | { 96 | "operation": "range_field_disjunction_big_range_small_term_query", 97 | "clients": 1, 98 | "warmup-iterations": 100, 99 | "iterations": 500, 100 | "target-throughput": 6 101 | } 102 | ] 103 | }, 104 | { 105 | "name": "append-no-conflicts-index-only", 106 | "description": "Indexes the whole document corpus using Elasticsearch default settings.", 107 | "schedule": [ 108 | { 109 | "operation": "delete-index" 110 | }, 111 | { 112 | "operation": { 113 | "operation-type": "create-index", 114 | "settings": {{index_settings | default({}) | tojson}} 115 | } 116 | }, 117 | { 118 | "name": "check-cluster-health", 119 | "operation": { 120 | "operation-type": "cluster-health", 121 | "index": "weather-data-2016", 122 | "request-params": { 123 | "wait_for_status": "{{cluster_health | default('green')}}", 124 | "wait_for_no_relocating_shards": "true" 125 | } 126 | } 127 | }, 128 | { 129 | "operation": "index", 130 | "#COMMENT": "This is an incredibly short warmup time period but it is necessary to get also measurement samples. As this benchmark is rather about search than indexing this is ok.", 131 | "warmup-time-period": 10, 132 | "clients": {{bulk_indexing_clients | default(8)}} 133 | }, 134 | { 135 | "name": "refresh-after-index", 136 | "operation": "refresh", 137 | "clients": 1 138 | }, 139 | { 140 | "operation": "force-merge", 141 | "clients": 1 142 | }, 143 | { 144 | "name": "refresh-after-force-merge", 145 | "operation": "refresh", 146 | "clients": 1 147 | } 148 | ] 149 | } 150 | -------------------------------------------------------------------------------- /geopointshape/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. 
We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osmgeoshapes", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "polygon", 47 | "clients": 1, 48 | "warmup-iterations": 200, 49 | "iterations": 100, 50 | "target-throughput": 2 51 | }, 52 | { 53 | "operation": "bbox", 54 | "clients": 1, 55 | "warmup-iterations": 200, 56 | "iterations": 100, 57 | "target-throughput": 2 58 | } 59 | ] 60 | }, 61 | { 62 | "name": "append-no-conflicts-index-only", 63 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 64 | "schedule": [ 65 | { 66 | "operation": "delete-index" 67 | }, 68 | { 69 | "operation": { 70 | "operation-type": "create-index", 71 | "settings": {{index_settings | default({}) | tojson}} 72 | } 73 | }, 74 | { 75 | "name": "check-cluster-health", 76 | "operation": { 77 | "operation-type": "cluster-health", 78 | "index": "osmgeoshapes", 79 | "request-params": { 80 | "wait_for_status": "{{cluster_health | default('green')}}", 81 | "wait_for_no_relocating_shards": "true" 82 | } 83 | } 84 | }, 85 | { 86 | "operation": "index-append", 87 | "warmup-time-period": 120, 88 | "clients": {{bulk_indexing_clients | default(8)}} 89 | }, 90 | { 91 | "name": "refresh-after-index", 92 | "operation": "refresh", 93 | "clients": 1 94 | }, 95 | { 96 | "operation": "force-merge", 97 | "clients": 1 98 | }, 99 | { 100 | "name": "refresh-after-force-merge", 101 | "operation": "refresh", 102 | "clients": 1 103 | } 104 | ] 105 | }, 106 | { 107 | "name": "append-fast-with-conflicts", 108 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 109 | "schedule": [ 110 | { 111 | "operation": "delete-index" 112 | }, 113 | { 114 | "operation": { 115 | "operation-type": "create-index", 116 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 117 | "index.refresh_interval": "30s", 118 | "index.number_of_shards": {{number_of_shards | default(6)}}, 119 | "index.translog.flush_threshold_size": "4g" 120 | }{%- endif %} 121 | } 122 | }, 123 | { 124 | "name": "check-cluster-health", 125 | "operation": { 126 | "operation-type": "cluster-health", 127 | "index": "osmgeoshapes", 128 | "request-params": { 129 | "wait_for_status": "{{cluster_health | default('green')}}", 130 | "wait_for_no_relocating_shards": "true" 131 | } 132 | } 133 | }, 134 | { 135 | "operation": "index-update", 136 | "warmup-time-period": 120, 137 | "clients": {{bulk_indexing_clients | default(8)}} 138 | }, 139 | { 140 | "name": "refresh-after-index", 141 | "operation": "refresh", 142 | "clients": 1 143 | }, 144 | { 145 | "operation": "force-merge", 146 | "clients": 1 147 | }, 148 | { 149 | "name": "refresh-after-force-merge", 150 | "operation": "refresh", 151 | "clients": 1 152 | } 153 | ] 154 | } 155 | -------------------------------------------------------------------------------- /http_logs/track.json: -------------------------------------------------------------------------------- 1 | {% import "rally.helpers" as rally with context %} 2 | 3 | { 4 | "version": 2, 5 | "description": "HTTP server log data", 6 | "#TODO": "Replace index definitions with a template after setting the track version to 2. Explicit index definitions are not necessary anymore.", 7 | "indices": [ 8 | { 9 | "name": "logs-181998", 10 | "body": "index.json" 11 | }, 12 | { 13 | "name": "logs-191998", 14 | "body": "index.json" 15 | }, 16 | { 17 | "name": "logs-201998", 18 | "body": "index.json" 19 | }, 20 | { 21 | "name": "logs-211998", 22 | "body": "index.json" 23 | }, 24 | { 25 | "name": "logs-221998", 26 | "body": "index.json" 27 | }, 28 | { 29 | "name": "logs-231998", 30 | "body": "index.json" 31 | }, 32 | { 33 | "name": "logs-241998", 34 | "body": "index.json" 35 | }, 36 | { 37 | "name": "reindexed-logs", 38 | "body": "index.json" 39 | } 40 | ], 41 | "corpora": [ 42 | {%- if ingest_pipeline is defined and ingest_pipeline == "grok" %} 43 | { 44 | "name": "http_logs_unparsed", 45 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs", 46 | "documents": [ 47 | { 48 | "target-index": "logs-181998", 49 | "source-file": "documents-181998.unparsed.json.bz2", 50 | "document-count": 2708746, 51 | "compressed-bytes": 13064317, 52 | "uncompressed-bytes": 303920342 53 | }, 54 | { 55 | "target-index": "logs-191998", 56 | "source-file": "documents-191998.unparsed.json.bz2", 57 | "document-count": 9697882, 58 | "compressed-bytes": 47211781, 59 | "uncompressed-bytes": 1088378738 60 | }, 61 | { 62 | "target-index": "logs-201998", 63 | "source-file": "documents-201998.unparsed.json.bz2", 64 | "document-count": 13053463, 65 | "compressed-bytes": 63174979, 66 | "uncompressed-bytes": 1456836090 67 | }, 68 | { 69 | "target-index": "logs-211998", 70 | "source-file": "documents-211998.unparsed.json.bz2", 71 | "document-count": 17647279, 72 | "compressed-bytes": 85607179, 73 | "uncompressed-bytes": 1975990671 74 | }, 75 | { 76 | "target-index": "logs-221998", 77 | "source-file": 
"documents-221998.unparsed.json.bz2", 78 | "document-count": 10716760, 79 | "compressed-bytes": 53190976, 80 | "uncompressed-bytes": 1202551382 81 | }, 82 | { 83 | "target-index": "logs-231998", 84 | "source-file": "documents-231998.unparsed.json.bz2", 85 | "document-count": 11961342, 86 | "compressed-bytes": 60705435, 87 | "uncompressed-bytes": 1334381144 88 | }, 89 | { 90 | "target-index": "logs-241998", 91 | "source-file": "documents-241998.unparsed.json.bz2", 92 | "document-count": 181463624, 93 | "compressed-bytes": 897719968, 94 | "uncompressed-bytes": 20563705716 95 | } 96 | ] 97 | } 98 | {%- else %} 99 | { 100 | "name": "http_logs", 101 | "base-url": "http://benchmarks.elasticsearch.org.s3.amazonaws.com/corpora/http_logs", 102 | "documents": [ 103 | { 104 | "target-index": "logs-181998", 105 | "source-file": "documents-181998.json.bz2", 106 | "document-count": 2708746, 107 | "compressed-bytes": 13815456, 108 | "uncompressed-bytes": 363512754 109 | }, 110 | { 111 | "target-index": "logs-191998", 112 | "source-file": "documents-191998.json.bz2", 113 | "document-count": 9697882, 114 | "compressed-bytes": 49439633, 115 | "uncompressed-bytes": 1301732149 116 | }, 117 | { 118 | "target-index": "logs-201998", 119 | "source-file": "documents-201998.json.bz2", 120 | "document-count": 13053463, 121 | "compressed-bytes": 65623436, 122 | "uncompressed-bytes": 1744012279 123 | }, 124 | { 125 | "target-index": "logs-211998", 126 | "source-file": "documents-211998.json.bz2", 127 | "document-count": 17647279, 128 | "compressed-bytes": 88258230, 129 | "uncompressed-bytes": 2364230815 130 | }, 131 | { 132 | "target-index": "logs-221998", 133 | "source-file": "documents-221998.json.bz2", 134 | "document-count": 10716760, 135 | "compressed-bytes": 54160603, 136 | "uncompressed-bytes": 1438320123 137 | }, 138 | { 139 | "target-index": "logs-231998", 140 | "source-file": "documents-231998.json.bz2", 141 | "document-count": 11961342, 142 | "compressed-bytes": 60927822, 143 | "uncompressed-bytes": 1597530673 144 | }, 145 | { 146 | "target-index": "logs-241998", 147 | "source-file": "documents-241998.json.bz2", 148 | "document-count": 181463624, 149 | "compressed-bytes": 905378242, 150 | "uncompressed-bytes": 24555905444 151 | } 152 | ] 153 | } 154 | {%- endif %} 155 | ], 156 | "operations": [ 157 | {{ rally.collect(parts="operations/*.json") }} 158 | ], 159 | "challenges": [ 160 | {{ rally.collect(parts="challenges/*.json") }} 161 | ] 162 | } 163 | -------------------------------------------------------------------------------- /http_logs/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}}, 6 | "corpora": "http_logs" 7 | }, 8 | { 9 | "name": "index-append-with-ingest-baseline-pipeline", 10 | "operation-type": "bulk", 11 | "bulk-size": {{bulk_size | default(5000)}}, 12 | "ingest-percentage": {{ingest_percentage | default(100)}}, 13 | "pipeline": "http-log-baseline-pipeline", 14 | "corpora": "http_logs" 15 | }, 16 | { 17 | "name": "index-append-with-ingest-grok-pipeline", 18 | "operation-type": "bulk", 19 | "bulk-size": {{bulk_size | default(5000)}}, 20 | "ingest-percentage": {{ingest_percentage | default(100)}}, 21 | "pipeline": "http-log-grok-pipeline", 22 | "corpora": "http_logs_unparsed" 23 | }, 24 | { 25 | "name": "index-append-with-ingest-geoip-pipeline", 26 | "operation-type": 
"bulk", 27 | "bulk-size": {{bulk_size | default(5000)}}, 28 | "ingest-percentage": {{ingest_percentage | default(100)}}, 29 | "pipeline": "http-log-geoip-pipeline", 30 | "corpora": "http_logs" 31 | }, 32 | { 33 | "name": "update", 34 | "operation-type": "bulk", 35 | "bulk-size": {{bulk_size | default(5000)}}, 36 | "ingest-percentage": {{ingest_percentage | default(100)}}, 37 | "conflicts": "{{conflicts | default('random')}}", 38 | "on-conflict": "{{on_conflict | default('update')}}", 39 | "conflict-probability": {{conflict_probability | default(25)}}, 40 | "recency": {{recency | default(0)}}, 41 | "corpora": "http_logs" 42 | }, 43 | { 44 | "name": "default", 45 | "operation-type": "search", 46 | "index": "logs-*", 47 | "body": { 48 | "query": { 49 | "match_all": {} 50 | } 51 | } 52 | }, 53 | { 54 | "name": "term", 55 | "operation-type": "search", 56 | "index": "logs-*", 57 | "body": { 58 | "query": { 59 | "term": { 60 | "request.raw": { 61 | "value": "GET / HTTP/1.0" 62 | } 63 | } 64 | } 65 | } 66 | }, 67 | { 68 | "name": "range", 69 | "operation-type": "search", 70 | "index": "logs-*", 71 | "body": { 72 | "query": { 73 | "range": { 74 | "@timestamp": { 75 | "gte": "now-{{'15-05-1998' | days_ago(now)}}d/d", 76 | "lt": "now/d" 77 | } 78 | } 79 | } 80 | } 81 | }, 82 | { 83 | "name": "hourly_agg", 84 | "operation-type": "search", 85 | "index": "logs-*", 86 | "body": { 87 | "size": 0, 88 | "aggs": { 89 | "by_hour": { 90 | "date_histogram": { 91 | "field": "@timestamp", 92 | "calendar_interval": "hour" 93 | } 94 | } 95 | } 96 | } 97 | }, 98 | { 99 | "name": "scroll", 100 | "operation-type": "search", 101 | "index": "logs-*", 102 | "pages": 25, 103 | "results-per-page": 1000, 104 | "body": { 105 | "query": { 106 | "match_all": {} 107 | } 108 | } 109 | }, 110 | { 111 | "name": "desc_sort_timestamp", 112 | "operation-type": "search", 113 | "index": "logs-*", 114 | "body": { 115 | "query": { 116 | "match_all": {} 117 | }, 118 | "sort" : [ 119 | {"@timestamp" : "desc"} 120 | ] 121 | } 122 | }, 123 | { 124 | "name": "asc_sort_timestamp", 125 | "operation-type": "search", 126 | "index": "logs-*", 127 | "body": { 128 | "query": { 129 | "match_all": {} 130 | }, 131 | "sort" : [ 132 | {"@timestamp" : "asc"} 133 | ] 134 | } 135 | }, 136 | { 137 | "name": "create-http-log-baseline-pipeline", 138 | "operation-type": "put-pipeline", 139 | "id": "http-log-baseline-pipeline", 140 | "body": { 141 | "description": "Process an the documents with a processor that does nothing. Baseline for overhead of pipeline.", 142 | "processors": [ 143 | { 144 | "uppercase": { 145 | "field": "doesnotexist", 146 | "ignore_missing": true 147 | } 148 | } 149 | ] 150 | } 151 | }, 152 | { 153 | "name": "create-http-log-grok-pipeline", 154 | "operation-type": "put-pipeline", 155 | "id": "http-log-grok-pipeline", 156 | "body": { 157 | "description": "Process an http log line with grok. Requires the `unparsed` data set.", 158 | "processors": [ 159 | { 160 | "grok": { 161 | "field": "message", 162 | "patterns": [ 163 | "%{IPORHOST:clientip} %{HTTPDUSER} %{USER} \\[%{TIMESTAMP_ISO8601:@timestamp}\\] \"(?:%{WORD} %{NOTSPACE:request}(?: HTTP/%{NUMBER})?|%{DATA})\" %{NUMBER:status} (?:%{NUMBER:size}|-)" 164 | ] 165 | } 166 | } 167 | ] 168 | } 169 | }, 170 | { 171 | "name": "create-http-log-geoip-pipeline", 172 | "operation-type": "put-pipeline", 173 | "id": "http-log-geoip-pipeline", 174 | "body": { 175 | "description": "Enrich the data with the geo-ip filter. 
Requires --elasticsearch-plugins='ingest-geoip'", 176 | "processors": [ 177 | { 178 | "geoip": { 179 | "field": "clientip", 180 | "properties": [ 181 | "city_name", 182 | "country_name", 183 | "location" 184 | ] 185 | } 186 | } 187 | ] 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /nested/track.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import csv 4 | 5 | 6 | class QueryParamSource: 7 | # We need to stick to the param source API 8 | # noinspection PyUnusedLocal 9 | def __init__(self, track, params, **kwargs): 10 | self._params = params 11 | self.infinite = True 12 | # here we read the queries data file into arrays which we'll then later use randomly. 13 | self.tags = [] 14 | self.dates = [] 15 | # be predictably random. The seed has been chosen by a fair dice roll. ;) 16 | random.seed(4) 17 | cwd = os.path.dirname(__file__) 18 | with open(os.path.join(cwd, "queries.csv"), "r") as ins: 19 | csvreader = csv.reader(ins) 20 | for row in csvreader: 21 | self.tags.append(row[0]) 22 | self.dates.append(row[1]) 23 | 24 | # We need to stick to the param source API 25 | # noinspection PyUnusedLocal 26 | def partition(self, partition_index, total_partitions): 27 | return self 28 | 29 | # Deprecated - only there for BWC reasons with Rally < 1.4.0 30 | def size(self): 31 | return 1 32 | 33 | 34 | class SortedTermQueryParamSource(QueryParamSource): 35 | def params(self): 36 | result = { 37 | "body": { 38 | "query": { 39 | "match": { 40 | "tag": "%s" % random.choice(self.tags) 41 | } 42 | }, 43 | "sort": [ 44 | { 45 | "answers.date": { 46 | "mode": "max", 47 | "order": "desc", 48 | "nested": { 49 | "path": "answers" 50 | } 51 | } 52 | } 53 | ] 54 | }, 55 | "index": None 56 | } 57 | if "cache" in self._params: 58 | result["cache"] = self._params["cache"] 59 | 60 | return result 61 | 62 | 63 | class TermQueryParamSource(QueryParamSource): 64 | def params(self): 65 | result = { 66 | "body": { 67 | "query": { 68 | "match": { 69 | "tag": "%s" % random.choice(self.tags) 70 | } 71 | } 72 | }, 73 | "index": None 74 | } 75 | if "cache" in self._params: 76 | result["cache"] = self._params["cache"] 77 | 78 | return result 79 | 80 | 81 | class NestedQueryParamSource(QueryParamSource): 82 | def params(self): 83 | result = { 84 | "body": { 85 | "query": { 86 | "bool": { 87 | "must": [ 88 | { 89 | "match": { 90 | "tag": "%s" % random.choice(self.tags) 91 | } 92 | }, 93 | { 94 | "nested": { 95 | "path": "answers", 96 | "query": { 97 | "range": { 98 | "answers.date": { 99 | "lte": "%s" % random.choice(self.dates) 100 | } 101 | } 102 | } 103 | } 104 | } 105 | ] 106 | } 107 | } 108 | }, 109 | "index": None 110 | } 111 | if "cache" in self._params: 112 | result["cache"] = self._params["cache"] 113 | 114 | return result 115 | 116 | 117 | class NestedQueryParamSourceWithInnerHits(QueryParamSource): 118 | def params(self): 119 | result = { 120 | "body": { 121 | "query": { 122 | "bool": { 123 | "must": [ 124 | { 125 | "match": { 126 | "tag": "%s" % random.choice(self.tags) 127 | } 128 | }, 129 | { 130 | "nested": { 131 | "path": "answers", 132 | "query": { 133 | "range": { 134 | "answers.date": { 135 | "lte": "%s" % random.choice(self.dates) 136 | } 137 | } 138 | }, 139 | "inner_hits": { 140 | "size": self._params["inner_hits_size"] 141 | } 142 | } 143 | } 144 | ] 145 | } 146 | }, 147 | "size": self._params["size"] 148 | }, 149 | "index": None 150 | } 151 | if "cache" in self._params: 152 | 
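# forward the optional "cache" parameter from the track configuration to the generated search request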
result["cache"] = self._params["cache"] 153 | 154 | return result 155 | 156 | 157 | def register(registry): 158 | registry.register_param_source("nested-query-source", NestedQueryParamSource) 159 | registry.register_param_source("nested-query-source-with-inner-hits", NestedQueryParamSourceWithInnerHits) 160 | registry.register_param_source("term-query-source", TermQueryParamSource) 161 | registry.register_param_source("sorted-term-query-source", SortedTermQueryParamSource) 162 | -------------------------------------------------------------------------------- /noaa/index.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index.number_of_shards": {{number_of_shards | default(1)}}, 4 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 5 | "index.queries.cache.enabled": false, 6 | "index.requests.cache.enable": false, 7 | "index.merge.policy.max_merged_segment": "100GB" 8 | }, 9 | "mappings": { 10 | "dynamic": "strict", 11 | "_source": { 12 | "enabled": {{ source_enabled | default(true) | tojson }} 13 | }, 14 | "properties": { 15 | "AWDR": { 16 | "type": "keyword" 17 | }, 18 | "AWND": { 19 | "type": "float" 20 | }, 21 | "DAPR": { 22 | "type": "keyword" 23 | }, 24 | "DASF": { 25 | "type": "keyword" 26 | }, 27 | "DATN": { 28 | "type": "keyword" 29 | }, 30 | "DATX": { 31 | "type": "keyword" 32 | }, 33 | "DWPR": { 34 | "type": "keyword" 35 | }, 36 | "EVAP": { 37 | "type": "float" 38 | }, 39 | "MDPR": { 40 | "type": "float" 41 | }, 42 | "MDSF": { 43 | "type": "keyword" 44 | }, 45 | "MDTN": { 46 | "type": "float" 47 | }, 48 | "MDTRANGE": { 49 | "type": "double_range" 50 | }, 51 | "MDTX": { 52 | "type": "float" 53 | }, 54 | "MNPN": { 55 | "type": "float" 56 | }, 57 | "MXPN": { 58 | "type": "float" 59 | }, 60 | "PGTM": { 61 | "type": "keyword" 62 | }, 63 | "PRCP": { 64 | "type": "float" 65 | }, 66 | "PSUN": { 67 | "type": "keyword" 68 | }, 69 | "SN31": { 70 | "type": "keyword" 71 | }, 72 | "SN32": { 73 | "type": "keyword" 74 | }, 75 | "SN33": { 76 | "type": "keyword" 77 | }, 78 | "SN35": { 79 | "type": "keyword" 80 | }, 81 | "SN36": { 82 | "type": "keyword" 83 | }, 84 | "SN51": { 85 | "type": "keyword" 86 | }, 87 | "SN52": { 88 | "type": "keyword" 89 | }, 90 | "SN53": { 91 | "type": "keyword" 92 | }, 93 | "SN55": { 94 | "type": "keyword" 95 | }, 96 | "SN56": { 97 | "type": "keyword" 98 | }, 99 | "SN57": { 100 | "type": "keyword" 101 | }, 102 | "SNOW": { 103 | "type": "keyword" 104 | }, 105 | "SNWD": { 106 | "type": "keyword" 107 | }, 108 | "SX31": { 109 | "type": "keyword" 110 | }, 111 | "SX32": { 112 | "type": "keyword" 113 | }, 114 | "SX33": { 115 | "type": "keyword" 116 | }, 117 | "SX35": { 118 | "type": "keyword" 119 | }, 120 | "SX36": { 121 | "type": "keyword" 122 | }, 123 | "SX51": { 124 | "type": "keyword" 125 | }, 126 | "SX52": { 127 | "type": "keyword" 128 | }, 129 | "SX53": { 130 | "type": "keyword" 131 | }, 132 | "SX55": { 133 | "type": "keyword" 134 | }, 135 | "SX56": { 136 | "type": "keyword" 137 | }, 138 | "SX57": { 139 | "type": "keyword" 140 | }, 141 | "TAVG": { 142 | "type": "float" 143 | }, 144 | "THIC": { 145 | "type": "float" 146 | }, 147 | "TMAX": { 148 | "type": "float" 149 | }, 150 | "TMIN": { 151 | "type": "float" 152 | }, 153 | "TOBS": { 154 | "type": "float" 155 | }, 156 | "TRANGE": { 157 | "type": "double_range" 158 | }, 159 | "TSUN": { 160 | "type": "keyword" 161 | }, 162 | "WDF2": { 163 | "type": "keyword" 164 | }, 165 | "WDF5": { 166 | "type": "keyword" 167 | }, 168 | "WDFG": { 169 | "type": 
"keyword" 170 | }, 171 | "WDMV": { 172 | "type": "keyword" 173 | }, 174 | "WESD": { 175 | "type": "float" 176 | }, 177 | "WESF": { 178 | "type": "float" 179 | }, 180 | "WSF2": { 181 | "type": "float" 182 | }, 183 | "WSF5": { 184 | "type": "float" 185 | }, 186 | "WSFG": { 187 | "type": "float" 188 | }, 189 | "WSFI": { 190 | "type": "float" 191 | }, 192 | "WT01": { 193 | "type": "keyword" 194 | }, 195 | "WT02": { 196 | "type": "keyword" 197 | }, 198 | "WT03": { 199 | "type": "keyword" 200 | }, 201 | "WT04": { 202 | "type": "keyword" 203 | }, 204 | "WT05": { 205 | "type": "keyword" 206 | }, 207 | "WT06": { 208 | "type": "keyword" 209 | }, 210 | "WT07": { 211 | "type": "keyword" 212 | }, 213 | "WT08": { 214 | "type": "keyword" 215 | }, 216 | "WT09": { 217 | "type": "keyword" 218 | }, 219 | "WT10": { 220 | "type": "keyword" 221 | }, 222 | "WT11": { 223 | "type": "keyword" 224 | }, 225 | "WT17": { 226 | "type": "keyword" 227 | }, 228 | "WT18": { 229 | "type": "keyword" 230 | }, 231 | "date": { 232 | "type": "date" 233 | }, 234 | "station": { 235 | "properties": { 236 | "country": { 237 | "type": "keyword" 238 | }, 239 | "country_code": { 240 | "type": "keyword" 241 | }, 242 | "elevation": { 243 | "type": "float" 244 | }, 245 | "gsn_flag": { 246 | "type": "keyword" 247 | }, 248 | "hcn_crn_flag": { 249 | "type": "keyword" 250 | }, 251 | "id": { 252 | "type": "keyword" 253 | }, 254 | "location": { 255 | "type": "geo_point" 256 | }, 257 | "name": { 258 | "type": "keyword" 259 | }, 260 | "state": { 261 | "type": "keyword" 262 | }, 263 | "state_code": { 264 | "type": "keyword" 265 | }, 266 | "wmo_id": { 267 | "type": "keyword" 268 | } 269 | } 270 | } 271 | } 272 | } 273 | } 274 | -------------------------------------------------------------------------------- /geopoint/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "osmgeopoints", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": { 38 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 39 | "max-num-segments": {{max_num_segments}} 40 | {%- endif %} 41 | }, 42 | "clients": 1 43 | }, 44 | { 45 | "name": "refresh-after-force-merge", 46 | "operation": "refresh", 47 | "clients": 1 48 | }, 49 | { 50 | "operation": "polygon", 51 | "clients": 1, 52 | "warmup-iterations": 200, 53 | "iterations": 100, 54 | "target-throughput": 2 55 | }, 56 | { 57 | "operation": "bbox", 58 | "clients": 1, 59 | "warmup-iterations": 200, 60 | "iterations": 100, 61 | "target-throughput": 2 62 | }, 63 | { 64 | "operation": "distance", 65 | "clients": 1, 66 | "warmup-iterations": 200, 67 | "iterations": 100, 68 | "target-throughput": 5 69 | }, 70 | { 71 | "operation": "distanceRange", 72 | "clients": 1, 73 | "warmup-iterations": 200, 74 | "iterations": 100, 75 | "target-throughput": 0.6 76 | } 77 | ] 78 | }, 79 | { 80 | "name": "append-no-conflicts-index-only", 81 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.", 82 | "schedule": [ 83 | { 84 | "operation": "delete-index" 85 | }, 86 | { 87 | "operation": { 88 | "operation-type": "create-index", 89 | "settings": {{index_settings | default({}) | tojson}} 90 | } 91 | }, 92 | { 93 | "name": "check-cluster-health", 94 | "operation": { 95 | "operation-type": "cluster-health", 96 | "index": "osmgeopoints", 97 | "request-params": { 98 | "wait_for_status": "{{cluster_health | default('green')}}", 99 | "wait_for_no_relocating_shards": "true" 100 | } 101 | } 102 | }, 103 | { 104 | "operation": "index-append", 105 | "warmup-time-period": 120, 106 | "clients": {{bulk_indexing_clients | default(8)}} 107 | }, 108 | { 109 | "name": "refresh-after-index", 110 | "operation": "refresh", 111 | "clients": 1 112 | }, 113 | { 114 | "operation": { 115 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 116 | "max-num-segments": {{max_num_segments}} 117 | {%- endif %} 118 | }, 119 | "clients": 1 120 | }, 121 | { 122 | "name": "refresh-after-force-merge", 123 | "operation": "refresh", 124 | "clients": 1 125 | } 126 | ] 127 | }, 128 | { 129 | "name": "append-fast-with-conflicts", 130 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 131 | "schedule": [ 132 | { 133 | "operation": "delete-index" 134 | }, 135 | { 136 | "operation": { 137 | "operation-type": "create-index", 138 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 139 | "index.refresh_interval": "30s", 140 | "index.number_of_shards": {{number_of_shards | default(6)}}, 141 | "index.translog.flush_threshold_size": "4g" 142 | }{%- endif %} 143 | } 144 | }, 145 | { 146 | "name": "check-cluster-health", 147 | "operation": { 148 | "operation-type": "cluster-health", 149 | "index": "osmgeopoints", 150 | "request-params": { 151 | "wait_for_status": "{{cluster_health | default('green')}}", 152 | "wait_for_no_relocating_shards": "true" 153 | } 154 | } 155 | }, 156 | { 157 | "operation": "index-update", 158 | "warmup-time-period": 120, 159 | "clients": {{bulk_indexing_clients | default(8)}} 160 | }, 161 | { 162 | "name": "refresh-after-index", 163 | "operation": "refresh", 164 | "clients": 1 165 | }, 166 | { 167 | "operation": { 168 | "operation-type": "force-merge"{%- if max_num_segments is defined %}, 169 | "max-num-segments": {{max_num_segments}} 170 | {%- endif %} 171 | }, 172 | "clients": 1 173 | }, 174 | { 175 | "name": "refresh-after-force-merge", 176 | "operation": "refresh", 177 | "clients": 1 178 | } 179 | ] 180 | } 181 | -------------------------------------------------------------------------------- /pmc/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": { 8 | "operation-type": "put-settings", 9 | "body": { 10 | "transient": { 11 | "search.default_search_timeout": "{{default_search_timeout | default(-1)}}" 12 | } 13 | } 14 | } 15 | }, 16 | { 17 | "operation": "delete-index" 18 | }, 19 | { 20 | "operation": { 21 | "operation-type": "create-index", 22 | "settings": {{index_settings | default({}) | tojson}} 23 | } 24 | }, 25 | { 26 | "name": "check-cluster-health", 27 | "operation": { 28 | "operation-type": "cluster-health", 29 | "index": "pmc", 30 | "request-params": { 31 | "wait_for_status": "{{cluster_health | default('green')}}", 32 | "wait_for_no_relocating_shards": "true" 33 | } 34 | } 35 | }, 36 | { 37 | "operation": "index-append", 38 | "warmup-time-period": 240, 39 | "clients": {{bulk_indexing_clients | default(8)}} 40 | }, 41 | { 42 | "name": "refresh-after-index", 43 | "operation": "refresh", 44 | "clients": 1 45 | }, 46 | { 47 | "operation": "force-merge", 48 | "clients": 1 49 | }, 50 | { 51 | "name": "refresh-after-force-merge", 52 | "operation": "refresh", 53 | "clients": 1 54 | }, 55 | { 56 | "operation": "default", 57 | "clients": 1, 58 | "warmup-iterations": 500, 59 | "iterations": 200, 60 | "target-throughput": 20 61 | }, 62 | { 63 | "operation": "term", 64 | "clients": 1, 65 | "warmup-iterations": 500, 66 | "iterations": 200, 67 | "target-throughput": 20 68 | }, 69 | { 70 | "operation": "phrase", 71 | "clients": 1, 72 | "warmup-iterations": 500, 73 | "iterations": 200, 74 | "target-throughput": 20 75 | }, 76 | { 77 | "operation": "articles_monthly_agg_uncached", 78 | "clients": 1, 79 | "warmup-iterations": 500, 80 | "iterations": 200, 81 | "target-throughput": 20 82 | }, 83 | { 84 | "operation": "articles_monthly_agg_cached", 85 | "clients": 1, 86 | "warmup-iterations": 500, 87 | "iterations": 200, 88 | "target-throughput": 20 89 | }, 90 | { 91 | "operation": "scroll", 92 | "clients": 1, 93 | "warmup-iterations": 50, 94 | "iterations": 100, 95 | "target-throughput": 0.5 96 | } 97 | ] 98 | }, 99 | { 100 | "name": "append-no-conflicts-index-only", 101 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. 
Document ids are unique so all index operations are append only.", 102 | "schedule": [ 103 | { 104 | "operation": "delete-index" 105 | }, 106 | { 107 | "operation": { 108 | "operation-type": "create-index", 109 | "settings": {{index_settings | default({}) | tojson}} 110 | } 111 | }, 112 | { 113 | "name": "check-cluster-health", 114 | "operation": { 115 | "operation-type": "cluster-health", 116 | "index": "pmc", 117 | "request-params": { 118 | "wait_for_status": "{{cluster_health | default('green')}}", 119 | "wait_for_no_relocating_shards": "true" 120 | } 121 | } 122 | }, 123 | { 124 | "operation": "index-append", 125 | "warmup-time-period": 240, 126 | "clients": {{bulk_indexing_clients | default(8)}} 127 | }, 128 | { 129 | "name": "refresh-after-index", 130 | "operation": "refresh", 131 | "clients": 1 132 | }, 133 | { 134 | "operation": "force-merge", 135 | "clients": 1 136 | }, 137 | { 138 | "name": "refresh-after-force-merge", 139 | "operation": "refresh", 140 | "clients": 1 141 | } 142 | ] 143 | }, 144 | { 145 | "name": "append-sorted-no-conflicts", 146 | "description": "Indexes the whole document corpus in an index sorted by timestamp field in descending order (most recent first). Document ids are unique so all index operations are append only.", 147 | "schedule": [ 148 | { 149 | "operation": "delete-index" 150 | }, 151 | { 152 | "operation": { 153 | "operation-type": "create-index", 154 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 155 | "index.sort.field": "timestamp", 156 | "index.sort.order": "desc" 157 | }{%- endif %} 158 | } 159 | }, 160 | { 161 | "name": "check-cluster-health", 162 | "operation": { 163 | "operation-type": "cluster-health", 164 | "index": "pmc", 165 | "request-params": { 166 | "wait_for_status": "{{cluster_health | default('green')}}", 167 | "wait_for_no_relocating_shards": "true" 168 | } 169 | } 170 | }, 171 | { 172 | "operation": "index-append", 173 | "warmup-time-period": 240, 174 | "clients": {{bulk_indexing_clients | default(8)}} 175 | }, 176 | { 177 | "name": "refresh-after-index", 178 | "operation": "refresh", 179 | "clients": 1 180 | }, 181 | { 182 | "operation": "force-merge", 183 | "clients": 1 184 | }, 185 | { 186 | "name": "refresh-after-force-merge", 187 | "operation": "refresh", 188 | "clients": 1 189 | } 190 | ] 191 | }, 192 | { 193 | "name": "append-fast-with-conflicts", 194 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 195 | "schedule": [ 196 | { 197 | "operation": "delete-index" 198 | }, 199 | { 200 | "operation": { 201 | "operation-type": "create-index", 202 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 203 | "index.refresh_interval": "30s", 204 | "index.number_of_shards": {{number_of_shards | default(6)}}, 205 | "index.translog.flush_threshold_size": "4g" 206 | }{%- endif %} 207 | } 208 | }, 209 | { 210 | "name": "check-cluster-health", 211 | "operation": { 212 | "operation-type": "cluster-health", 213 | "index": "pmc", 214 | "request-params": { 215 | "wait_for_status": "{{cluster_health | default('green')}}", 216 | "wait_for_no_relocating_shards": "true" 217 | } 218 | } 219 | }, 220 | { 221 | "operation": "index-update", 222 | "warmup-time-period": 240, 223 | "clients": {{bulk_indexing_clients | default(8)}} 224 | }, 225 | { 226 | "name": "refresh-after-index", 227 | "operation": "refresh", 228 | "clients": 1 229 | }, 230 | { 231 | "operation": "force-merge", 232 | "clients": 1 233 | }, 234 | { 235 | "name": "refresh-after-force-merge", 236 | "operation": "refresh", 237 | "clients": 1 238 | } 239 | ] 240 | } 241 | -------------------------------------------------------------------------------- /noaa/_tools/process.py: -------------------------------------------------------------------------------- 1 | #################################################################### 2 | # 3 | # process the csv file into Elasticsearch json documents 4 | # 5 | #################################################################### 6 | 7 | import os 8 | import csv 9 | import json 10 | from datetime import datetime 11 | 12 | stationsFile = 'ghcnd-stations.txt' 13 | countriesFile = 'ghcnd-countries.txt' 14 | statesFile = 'ghcnd-states.txt' 15 | 16 | weatherDataFiles = ['2014-sorted.csv', '2015-sorted.csv', '2016-sorted.csv'] 17 | indexPrefix = 'weather-data' 18 | docType = 'summary' 19 | 20 | def loadStatesFile(statesFile): 21 | statesMap = {} 22 | with open(statesFile, 'r') as file: 23 | csvreader = csv.reader(file, delimiter=' ', quotechar='"') 24 | for row in csvreader: 25 | statesMap[row[0].strip()] = row[1].strip() 26 | return statesMap 27 | 28 | def loadCountriesFile(countriesFile): 29 | countriesMap = {} 30 | with open(countriesFile, 'r') as file: 31 | csvreader = csv.reader(file, delimiter=' ', quotechar='"') 32 | for row in csvreader: 33 | countriesMap[row[0].strip()] = row[1].strip() 34 | return countriesMap 35 | 36 | def loadStationsFile(stationsFile, statesFile, countriesFile): 37 | statesMap = loadStatesFile(statesFile) 38 | countriesMap = loadCountriesFile(countriesFile) 39 | stationsMap = {} 40 | with open(stationsFile, 'r') as file: 41 | for row in file: 42 | try: 43 | station = {} 44 | station['id'] = row[0:11].strip() 45 | countryCode = row[0:2].strip() 46 | if len(countryCode) > 0: 47 | station['country_code'] = countryCode 48 | station['country'] = countriesMap[countryCode] 49 | station['location'] = { 50 | 'lat': float(row[12:20].strip()), 51 | 'lon': float(row[21:30].strip()) 52 | } 53 | station['elevation'] = float(row[31:37].strip()) 54 | if countryCode == 'US': 55 | stateCode = row[38:40].strip() 56 | if len(stateCode) > 0: 57 | station['state_code'] = stateCode 58 | station['state'] = statesMap[stateCode] 59 | station['name'] = row[41:71].strip() 60 | gsn_flag = row[72:75].strip() 61 | 
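# the GSN flag, HCN/CRN flag and WMO id columns of the fixed-width station record are optional, so they are only added when non-empty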
if len(gsn_flag) > 0: 62 | station['gsn_flag'] = gsn_flag 63 | hcn_crn_flag = row[76:78].strip() 64 | if len(hcn_crn_flag) > 0: 65 | station['hcn_crn_flag'] = hcn_crn_flag 66 | wmo_id = row[80:85].strip() 67 | if len(wmo_id) > 0: 68 | station['wmo_id'] = wmo_id 69 | stationsMap[station['id']] = station 70 | except: 71 | print(row) 72 | raise e 73 | return stationsMap 74 | 75 | def processWeatherDoc(currentStationDoc): 76 | if 'TMAX' in currentStationDoc: 77 | currentStationDoc['TMAX'] = float(currentStationDoc['TMAX']) / 10.0 78 | if 'TMIN' in currentStationDoc: 79 | currentStationDoc['TMIN'] = float(currentStationDoc['TMIN']) / 10.0 80 | if 'PRCP' in currentStationDoc: 81 | currentStationDoc['PRCP'] = float(currentStationDoc['PRCP']) / 10.0 82 | if 'AWND' in currentStationDoc: 83 | currentStationDoc['AWND'] = float(currentStationDoc['AWND']) / 10.0 84 | if 'EVAP' in currentStationDoc: 85 | currentStationDoc['EVAP'] = float(currentStationDoc['EVAP']) / 10.0 86 | if 'MDEV' in currentStationDoc: 87 | currentStationDoc['MDEV'] = float(currentStationDoc['MDEV']) / 10.0 88 | if 'MDPR' in currentStationDoc: 89 | currentStationDoc['MDPR'] = float(currentStationDoc['MDPR']) / 10.0 90 | if 'MDTN' in currentStationDoc: 91 | currentStationDoc['MDTN'] = float(currentStationDoc['MDTN']) / 10.0 92 | if 'MDTX' in currentStationDoc: 93 | currentStationDoc['MDTX'] = float(currentStationDoc['MDTX']) / 10.0 94 | if 'MNPN' in currentStationDoc: 95 | currentStationDoc['MNPN'] = float(currentStationDoc['MNPN']) / 10.0 96 | if 'MXPN' in currentStationDoc: 97 | currentStationDoc['MXPN'] = float(currentStationDoc['MXPN']) / 10.0 98 | if 'TAVG' in currentStationDoc: 99 | currentStationDoc['TAVG'] = float(currentStationDoc['TAVG']) / 10.0 100 | if 'THIC' in currentStationDoc: 101 | currentStationDoc['THIC'] = float(currentStationDoc['THIC']) / 10.0 102 | if 'TOBS' in currentStationDoc: 103 | currentStationDoc['TOBS'] = float(currentStationDoc['TOBS']) / 10.0 104 | if 'WESD' in currentStationDoc: 105 | currentStationDoc['WESD'] = float(currentStationDoc['WESD']) / 10.0 106 | if 'WESF' in currentStationDoc: 107 | currentStationDoc['WESF'] = float(currentStationDoc['WESF']) / 10.0 108 | if 'WSF1' in currentStationDoc: 109 | currentStationDoc['WSF1'] = float(currentStationDoc['WSF1']) / 10.0 110 | if 'WSF2' in currentStationDoc: 111 | currentStationDoc['WSF2'] = float(currentStationDoc['WSF2']) / 10.0 112 | if 'WSF5' in currentStationDoc: 113 | currentStationDoc['WSF5'] = float(currentStationDoc['WSF5']) / 10.0 114 | if 'WSFG' in currentStationDoc: 115 | currentStationDoc['WSFG'] = float(currentStationDoc['WSFG']) / 10.0 116 | if 'WSFI' in currentStationDoc: 117 | currentStationDoc['WSFI'] = float(currentStationDoc['WSFI']) / 10.0 118 | if 'WSFM' in currentStationDoc: 119 | currentStationDoc['WSFM'] = float(currentStationDoc['WSFM']) / 10.0 120 | 121 | if 'TMIN' in currentStationDoc and 'TMAX' in currentStationDoc: 122 | if currentStationDoc['TMIN'] > currentStationDoc['TMAX']: 123 | tmp = currentStationDoc['TMIN'] 124 | currentStationDoc['TMIN'] = currentStationDoc['TMAX'] 125 | currentStationDoc['TMAX'] = tmp 126 | currentStationDoc['TRANGE'] = { 127 | "gte" : currentStationDoc['TMIN'], 128 | "lte" : currentStationDoc['TMAX'] 129 | } 130 | if 'MDTN' in currentStationDoc and 'MDTX' in currentStationDoc: 131 | if currentStationDoc['MDTN'] > currentStationDoc['MDTX']: 132 | tmp = currentStationDoc['MDTN'] 133 | currentStationDoc['MDTN'] = currentStationDoc['MDTX'] 134 | currentStationDoc['MDTX'] = tmp 135 | 
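# after ensuring min <= max, store the multiday temperature bounds as a range object so they fit the double_range MDTRANGE field defined in index.json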
currentStationDoc['MDTRANGE'] = { 136 | "gte" : currentStationDoc['MDTN'], 137 | "lte" : currentStationDoc['MDTX'] 138 | } 139 | 140 | indexDoc = { 141 | '_op_type': 'create', 142 | '_index': indexPrefix + '-' + str(currentStationDoc['date'].year), 143 | '_type': docType, 144 | '_id': currentStationDoc['date'].strftime('%Y-%m-%d') + '-' + currentStationDoc['station']['id'], 145 | '_source': currentStationDoc 146 | } 147 | return indexDoc 148 | 149 | def processWeatherFile(weatherDataFile, stationsMap): 150 | with open(weatherDataFile, 'r') as file: 151 | csvreader = csv.reader(file, delimiter=',', quotechar='"') 152 | currentStationDoc = None 153 | stationDocsProcessed = 0 154 | for row in csvreader: 155 | station = stationsMap[row[0]] 156 | date = datetime.strptime(row[1], '%Y%m%d') 157 | elementType = row[2] 158 | elementValue = row[3] 159 | if currentStationDoc == None: 160 | currentStationDoc = { 161 | 'station': station, 162 | 'date': date, 163 | elementType: elementValue 164 | } 165 | elif currentStationDoc['station'] != station or currentStationDoc['date'] != date: 166 | yield processWeatherDoc(currentStationDoc) 167 | stationDocsProcessed = stationDocsProcessed + 1 168 | currentStationDoc = { 169 | 'station': station, 170 | 'date': date, 171 | elementType: elementValue 172 | } 173 | else: 174 | currentStationDoc[elementType] = elementValue 175 | 176 | stationsMap = loadStationsFile(stationsFile, statesFile, countriesFile) 177 | outFile = 'documents.json' 178 | with open(outFile, 'w+') as file: 179 | count = 0 180 | for weatherDataFile in weatherDataFiles: 181 | for doc in processWeatherFile(weatherDataFile, stationsMap): 182 | doc['_source']['date'] = doc['_source']['date'].isoformat() 183 | file.write(json.dumps(doc['_source'])) 184 | file.write('\n') 185 | count = count + 1 186 | print('Wrote ' + str(count) + ' entries') -------------------------------------------------------------------------------- /geonames/operations/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "index-append", 3 | "operation-type": "bulk", 4 | "bulk-size": {{bulk_size | default(5000)}}, 5 | "ingest-percentage": {{ingest_percentage | default(100)}} 6 | }, 7 | { 8 | "name": "index-update", 9 | "operation-type": "bulk", 10 | "bulk-size": {{bulk_size | default(5000)}}, 11 | "ingest-percentage": {{ingest_percentage | default(100)}}, 12 | "conflicts": "{{conflicts | default('random')}}", 13 | "on-conflict": "{{on_conflict | default('index')}}", 14 | "conflict-probability": {{conflict_probability | default(25)}}, 15 | "recency": {{recency | default(0)}} 16 | }, 17 | { 18 | "name": "default", 19 | "operation-type": "search", 20 | "body": { 21 | "query": { 22 | "match_all": {} 23 | } 24 | } 25 | }, 26 | { 27 | "name": "term", 28 | "operation-type": "search", 29 | "body": { 30 | "query": { 31 | "term": { 32 | "country_code": "AT" 33 | } 34 | } 35 | } 36 | }, 37 | { 38 | "name": "phrase", 39 | "operation-type": "search", 40 | "body": { 41 | "query": { 42 | "match_phrase": { 43 | "name": "Sankt Georgen" 44 | } 45 | } 46 | } 47 | }, 48 | { 49 | "name": "country_agg_uncached", 50 | "operation-type": "search", 51 | "body": { 52 | "size": 0, 53 | "aggs": { 54 | "country_population": { 55 | "terms": { 56 | "field": "country_code" 57 | }, 58 | "aggs": { 59 | "sum_population": { 60 | "sum": { 61 | "field": "population" 62 | } 63 | } 64 | } 65 | } 66 | } 67 | } 68 | }, 69 | { 70 | "name": "country_agg_cached", 71 | "operation-type": "search", 72 | "cache": true, 
73 | "body": { 74 | "size": 0, 75 | "aggs": { 76 | "country_population": { 77 | "terms": { 78 | "field": "country_code" 79 | }, 80 | "aggs": { 81 | "sum_population": { 82 | "sum": { 83 | "field": "population" 84 | } 85 | } 86 | } 87 | } 88 | } 89 | } 90 | }, 91 | { 92 | "name": "scroll", 93 | "operation-type": "search", 94 | "pages": 25, 95 | "results-per-page": 1000, 96 | "body": { 97 | "query": { 98 | "match_all": {} 99 | } 100 | } 101 | }, 102 | { 103 | "name": "expression", 104 | "operation-type": "search", 105 | "body": { 106 | "query": { 107 | "function_score": { 108 | "query": { 109 | "match_all": {} 110 | }, 111 | "functions": [ 112 | { 113 | "script_score": { 114 | "script": { 115 | "source": "abs(ln(abs(doc['population']) + 1) + doc['location'].lon + doc['location'].lat) * _score", 116 | "lang": "expression" 117 | } 118 | } 119 | } 120 | ] 121 | } 122 | } 123 | } 124 | }, 125 | { 126 | "name": "painless_static", 127 | "operation-type": "search", 128 | "body": { 129 | "query": { 130 | "function_score": { 131 | "query": { 132 | "match_all": {} 133 | }, 134 | "functions": [ 135 | { 136 | "script_score": { 137 | "script": { 138 | "source": "Math.abs(Math.log(Math.abs((int)((List)doc.population).get(0)) + 1) + (double)(doc.location.lon) * (double)(doc.location.lat))/_score", 139 | "lang": "painless" 140 | } 141 | } 142 | } 143 | ] 144 | } 145 | } 146 | } 147 | }, 148 | { 149 | "name": "painless_dynamic", 150 | "operation-type": "search", 151 | "body": { 152 | "query": { 153 | "function_score": { 154 | "query": { 155 | "match_all": {} 156 | }, 157 | "functions": [ 158 | { 159 | "script_score": { 160 | "script": { 161 | "source": "Math.abs(Math.log(Math.abs(doc['population'].value) + 1) + doc['location'].lon * doc['location'].lat)/_score", 162 | "lang": "painless" 163 | } 164 | } 165 | } 166 | ] 167 | } 168 | } 169 | } 170 | }, 171 | { 172 | "name": "decay_geo_gauss_function_score", 173 | "operation-type": "search", 174 | "body": { 175 | "query": { 176 | "function_score": { 177 | "query": { 178 | "match_all": {} 179 | }, 180 | "gauss": { 181 | "location": { 182 | "origin": "52.37, 4.8951", 183 | "scale": "500km", 184 | "offset": "0km", 185 | "decay" : 0.1 186 | } 187 | } 188 | } 189 | } 190 | } 191 | }, 192 | { 193 | "name": "decay_geo_gauss_script_score", 194 | "operation-type": "search", 195 | "body": { 196 | "query": { 197 | "script_score": { 198 | "query": { 199 | "match_all": {} 200 | }, 201 | "script": { 202 | "source": "decayGeoGauss(params.origin, params.scale, params.offset, params.decay, doc['location'].value)", 203 | "params": { 204 | "origin": "52.37, 4.8951", 205 | "scale": "500km", 206 | "offset": "0km", 207 | "decay" : 0.1 208 | } 209 | } 210 | } 211 | } 212 | } 213 | }, 214 | { 215 | "name": "field_value_function_score", 216 | "operation-type": "search", 217 | "body": { 218 | "query": { 219 | "function_score": { 220 | "query": { 221 | "match_all": {} 222 | }, 223 | "field_value_factor": { 224 | "field": "population", 225 | "factor": 1.2, 226 | "modifier": "log2p" 227 | } 228 | } 229 | } 230 | } 231 | }, 232 | { 233 | "name": "field_value_script_score", 234 | "operation-type": "search", 235 | "body": { 236 | "query": { 237 | "script_score": { 238 | "query": { 239 | "match_all": {} 240 | }, 241 | "script": { 242 | "source": "Math.log10(doc['population'].value * 1.2 + 2)" 243 | } 244 | } 245 | } 246 | } 247 | }, 248 | { 249 | "name": "random_function_score", 250 | "operation-type": "search", 251 | "body": { 252 | "query": { 253 | "function_score": { 254 | "query": { 255 | 
"match_all": {} 256 | }, 257 | "random_score": { 258 | "seed": 100, 259 | "field": "_seq_no" 260 | } 261 | } 262 | } 263 | } 264 | }, 265 | { 266 | "name": "random_script_score", 267 | "operation-type": "search", 268 | "body": { 269 | "query": { 270 | "script_score": { 271 | "query": { 272 | "match_all": {} 273 | }, 274 | "script": { 275 | "source": "randomScore(100, '_seq_no')" 276 | } 277 | } 278 | } 279 | } 280 | }, 281 | { 282 | "name": "large_terms", 283 | "operation-type": "search", 284 | "param-source": "pure-terms-query-source" 285 | }, 286 | { 287 | "name": "large_filtered_terms", 288 | "operation-type": "search", 289 | "param-source": "filtered-terms-query-source" 290 | }, 291 | { 292 | "name": "large_prohibited_terms", 293 | "operation-type": "search", 294 | "param-source": "prohibited-terms-query-source" 295 | }, 296 | { 297 | "name": "desc_sort_population", 298 | "operation-type": "search", 299 | "body": { 300 | "query": { 301 | "match_all": {} 302 | }, 303 | "sort" : [ 304 | {"population" : "desc"} 305 | ] 306 | } 307 | }, 308 | { 309 | "name": "asc_sort_population", 310 | "operation-type": "search", 311 | "body": { 312 | "query": { 313 | "match_all": {} 314 | }, 315 | "sort" : [ 316 | {"population" : "asc"} 317 | ] 318 | } 319 | }, 320 | { 321 | "name": "desc_sort_geonameid", 322 | "operation-type": "search", 323 | "body": { 324 | "query": { 325 | "match_all": {} 326 | }, 327 | "sort" : [ 328 | {"geonameid" : "desc"} 329 | ] 330 | } 331 | }, 332 | { 333 | "name": "asc_sort_geonameid", 334 | "operation-type": "search", 335 | "body": { 336 | "query": { 337 | "match_all": {} 338 | }, 339 | "sort" : [ 340 | {"geonameid" : "asc"} 341 | ] 342 | } 343 | } 344 | -------------------------------------------------------------------------------- /geonames/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only. 
After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {{index_settings | default({}) | tojson}} 13 | } 14 | }, 15 | { 16 | "name": "check-cluster-health", 17 | "operation": { 18 | "operation-type": "cluster-health", 19 | "index": "geonames", 20 | "request-params": { 21 | "wait_for_status": "{{cluster_health | default('green')}}", 22 | "wait_for_no_relocating_shards": "true" 23 | } 24 | } 25 | }, 26 | { 27 | "operation": "index-append", 28 | "warmup-time-period": 120, 29 | "clients": {{bulk_indexing_clients | default(8)}} 30 | }, 31 | { 32 | "name": "refresh-after-index", 33 | "operation": "refresh", 34 | "clients": 1 35 | }, 36 | { 37 | "operation": "force-merge", 38 | "clients": 1 39 | }, 40 | { 41 | "name": "refresh-after-force-merge", 42 | "operation": "refresh", 43 | "clients": 1 44 | }, 45 | { 46 | "operation": "index-stats", 47 | "clients": 1, 48 | "warmup-iterations": 500, 49 | "iterations": 1000, 50 | "target-throughput": 90 51 | }, 52 | { 53 | "operation": "node-stats", 54 | "clients": 1, 55 | "warmup-iterations": 100, 56 | "iterations": 1000, 57 | "target-throughput": 90 58 | }, 59 | { 60 | "operation": "default", 61 | "clients": 1, 62 | "warmup-iterations": 500, 63 | "iterations": 1000, 64 | "target-throughput": 50 65 | }, 66 | { 67 | "operation": "term", 68 | "clients": 1, 69 | "warmup-iterations": 500, 70 | "iterations": 1000, 71 | "target-throughput": 200 72 | }, 73 | { 74 | "operation": "phrase", 75 | "clients": 1, 76 | "warmup-iterations": 500, 77 | "iterations": 1000, 78 | "target-throughput": 200 79 | }, 80 | { 81 | "operation": "country_agg_uncached", 82 | "clients": 1, 83 | "warmup-iterations": 200, 84 | "iterations": 100, 85 | "target-throughput": 4 86 | }, 87 | { 88 | "operation": "country_agg_cached", 89 | "clients": 1, 90 | "warmup-iterations": 500, 91 | "iterations": 1000, 92 | "target-throughput": 100 93 | }, 94 | { 95 | "operation": "scroll", 96 | "clients": 1, 97 | "warmup-iterations": 200, 98 | "iterations": 100, 99 | "#COMMENT": "Throughput is considered per request. 
So we issue one scroll request per second which will retrieve 25 pages", 100 | "target-throughput": 0.8 101 | }, 102 | { 103 | "operation": "expression", 104 | "clients": 1, 105 | "warmup-iterations": 200, 106 | "iterations": 100, 107 | "target-throughput": 2 108 | }, 109 | { 110 | "operation": "painless_static", 111 | "clients": 1, 112 | "warmup-iterations": 200, 113 | "iterations": 100, 114 | "target-throughput": 1.5 115 | }, 116 | { 117 | "operation": "painless_dynamic", 118 | "clients": 1, 119 | "warmup-iterations": 200, 120 | "iterations": 100, 121 | "target-throughput": 1.5 122 | }, 123 | { 124 | "operation": "decay_geo_gauss_function_score", 125 | "clients": 1, 126 | "warmup-iterations": 200, 127 | "iterations": 100, 128 | "target-throughput": 1 129 | }, 130 | { 131 | "operation": "decay_geo_gauss_script_score", 132 | "clients": 1, 133 | "warmup-iterations": 200, 134 | "iterations": 100, 135 | "target-throughput": 1 136 | }, 137 | { 138 | "operation": "field_value_function_score", 139 | "clients": 1, 140 | "warmup-iterations": 200, 141 | "iterations": 100, 142 | "target-throughput": 1.5 143 | }, 144 | { 145 | "operation": "field_value_script_score", 146 | "clients": 1, 147 | "warmup-iterations": 200, 148 | "iterations": 100, 149 | "target-throughput": 1.5 150 | }, 151 | { 152 | "operation": "random_function_score", 153 | "clients": 1, 154 | "warmup-iterations": 200, 155 | "iterations": 100, 156 | "target-throughput": 1.5 157 | }, 158 | { 159 | "operation": "random_script_score", 160 | "clients": 1, 161 | "warmup-iterations": 200, 162 | "iterations": 100, 163 | "target-throughput": 1.5 164 | }, 165 | { 166 | "operation": "large_terms", 167 | "clients": 1, 168 | "warmup-iterations": 200, 169 | "iterations": 100, 170 | "target-throughput": 1.5 171 | }, 172 | { 173 | "operation": "large_filtered_terms", 174 | "clients": 1, 175 | "warmup-iterations": 200, 176 | "iterations": 100, 177 | "target-throughput": 1.5 178 | }, 179 | { 180 | "operation": "large_prohibited_terms", 181 | "clients": 1, 182 | "warmup-iterations": 200, 183 | "iterations": 100, 184 | "target-throughput": 1.5 185 | }, 186 | { 187 | "operation": "desc_sort_population", 188 | "clients": 1, 189 | "warmup-iterations": 200, 190 | "iterations": 100, 191 | "target-throughput": 1.5 192 | }, 193 | { 194 | "operation": "asc_sort_population", 195 | "clients": 1, 196 | "warmup-iterations": 200, 197 | "iterations": 100, 198 | "target-throughput": 1.5 199 | }, 200 | { 201 | "operation": "desc_sort_geonameid", 202 | "clients": 1, 203 | "warmup-iterations": 200, 204 | "iterations": 100, 205 | "target-throughput": 6 206 | }, 207 | { 208 | "operation": "asc_sort_geonameid", 209 | "clients": 1, 210 | "warmup-iterations": 200, 211 | "iterations": 100, 212 | "target-throughput": 6 213 | } 214 | ] 215 | }, 216 | { 217 | "name": "append-no-conflicts-index-only", 218 | "description": "Indexes the whole document corpus using Elasticsearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Rally will only start the benchmark if the cluster turns green. 
Document ids are unique so all index operations are append only.", 219 | "schedule": [ 220 | { 221 | "operation": "delete-index" 222 | }, 223 | { 224 | "operation": { 225 | "operation-type": "create-index", 226 | "settings": {{index_settings | default({}) | tojson}} 227 | } 228 | }, 229 | { 230 | "name": "check-cluster-health", 231 | "operation": { 232 | "operation-type": "cluster-health", 233 | "index": "geonames", 234 | "request-params": { 235 | "wait_for_status": "{{cluster_health | default('green')}}", 236 | "wait_for_no_relocating_shards": "true" 237 | } 238 | } 239 | }, 240 | { 241 | "operation": "index-append", 242 | "warmup-time-period": 120, 243 | "clients": {{bulk_indexing_clients | default(8)}} 244 | }, 245 | { 246 | "operation": "force-merge", 247 | "clients": 1 248 | } 249 | ] 250 | }, 251 | { 252 | "name": "append-sorted-no-conflicts", 253 | "description": "Indexes the whole document corpus in an index sorted by country_code field in ascending order. Document ids are unique so all index operations are append only.", 254 | "schedule": [ 255 | { 256 | "operation": "delete-index" 257 | }, 258 | { 259 | "operation": { 260 | "operation-type": "create-index", 261 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 262 | "index.sort.field": ["country_code.raw", "admin1_code.raw"], 263 | "index.sort.order": ["asc", "asc"] 264 | }{%- endif %} 265 | } 266 | }, 267 | { 268 | "name": "check-cluster-health", 269 | "operation": { 270 | "operation-type": "cluster-health", 271 | "index": "geonames", 272 | "request-params": { 273 | "wait_for_status": "{{cluster_health | default('green')}}", 274 | "wait_for_no_relocating_shards": "true" 275 | } 276 | } 277 | }, 278 | { 279 | "operation": "index-append", 280 | "warmup-time-period": 120, 281 | "clients": {{bulk_indexing_clients | default(8)}} 282 | }, 283 | { 284 | "operation": "force-merge", 285 | "clients": 1 286 | } 287 | ] 288 | }, 289 | { 290 | "name": "append-fast-with-conflicts", 291 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings. 
Rally will produce duplicate ids in 25% of all documents (not configurable) so we can simulate a scenario with appends most of the time and some updates in between.", 292 | "schedule": [ 293 | { 294 | "operation": "delete-index" 295 | }, 296 | { 297 | "operation": { 298 | "operation-type": "create-index", 299 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 300 | "index.refresh_interval": "30s", 301 | "index.number_of_shards": {{number_of_shards | default(6)}}, 302 | "index.translog.flush_threshold_size": "4g" 303 | }{%- endif %} 304 | } 305 | }, 306 | { 307 | "name": "check-cluster-health", 308 | "operation": { 309 | "operation-type": "cluster-health", 310 | "index": "geonames", 311 | "request-params": { 312 | "wait_for_status": "{{cluster_health | default('green')}}", 313 | "wait_for_no_relocating_shards": "true" 314 | } 315 | } 316 | }, 317 | { 318 | "operation": "index-update", 319 | "warmup-time-period": 45, 320 | "clients": {{bulk_indexing_clients | default(8)}} 321 | }, 322 | { 323 | "operation": "force-merge", 324 | "clients": 1 325 | } 326 | ] 327 | } 328 | -------------------------------------------------------------------------------- /nyc_taxis/challenges/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "append-no-conflicts", 3 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only. After that a couple of queries are run.", 4 | "default": true, 5 | "schedule": [ 6 | { 7 | "operation": "delete-index" 8 | }, 9 | { 10 | "operation": { 11 | "operation-type": "create-index", 12 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 13 | "index.codec": "best_compression", 14 | "index.refresh_interval": "30s", 15 | "index.translog.flush_threshold_size": "4g" 16 | }{%- endif %} 17 | } 18 | }, 19 | { 20 | "name": "check-cluster-health", 21 | "operation": { 22 | "operation-type": "cluster-health", 23 | "index": "nyc_taxis", 24 | "request-params": { 25 | "wait_for_status": "{{cluster_health | default('green')}}", 26 | "wait_for_no_relocating_shards": "true" 27 | } 28 | } 29 | }, 30 | { 31 | "operation": "index", 32 | "warmup-time-period": 240, 33 | "clients": {{bulk_indexing_clients | default(8)}} 34 | }, 35 | { 36 | "name": "refresh-after-index", 37 | "operation": "refresh", 38 | "clients": 1 39 | }, 40 | { 41 | "operation": "default", 42 | "clients": 1, 43 | "warmup-iterations": 50, 44 | "iterations": 100, 45 | "target-throughput": 3 46 | }, 47 | { 48 | "operation": "range", 49 | "clients": 1, 50 | "warmup-iterations": 50, 51 | "iterations": 100, 52 | "target-throughput": 1 53 | }, 54 | { 55 | "operation": "distance_amount_agg", 56 | "clients": 1, 57 | "warmup-iterations": 50, 58 | "iterations": 100, 59 | "target-throughput": 2 60 | }, 61 | { 62 | "operation": "autohisto_agg", 63 | "clients": 1, 64 | "warmup-iterations": 50, 65 | "iterations": 100, 66 | "target-throughput": 1.5 67 | }, 68 | { 69 | "operation": "date_histogram_agg", 70 | "clients": 1, 71 | "warmup-iterations": 50, 72 | "iterations": 100, 73 | "target-throughput": 1.5 74 | } 75 | ] 76 | }, 77 | { 78 | "name": "append-no-conflicts-index-only", 79 | "description": "Indexes the whole document corpus using a setup that will lead to a larger indexing throughput than the default settings 
and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only.", 80 | "schedule": [ 81 | { 82 | "operation": "delete-index" 83 | }, 84 | { 85 | "operation": { 86 | "operation-type": "create-index", 87 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 88 | "index.codec": "best_compression", 89 | "index.refresh_interval": "30s", 90 | "index.translog.flush_threshold_size": "4g" 91 | }{%- endif %} 92 | } 93 | }, 94 | { 95 | "name": "check-cluster-health", 96 | "operation": { 97 | "operation-type": "cluster-health", 98 | "index": "nyc_taxis", 99 | "request-params": { 100 | "wait_for_status": "{{cluster_health | default('green')}}", 101 | "wait_for_no_relocating_shards": "true" 102 | } 103 | } 104 | }, 105 | { 106 | "operation": "index", 107 | "warmup-time-period": 240, 108 | "clients": {{bulk_indexing_clients | default(8)}} 109 | }, 110 | { 111 | "name": "refresh-after-index", 112 | "operation": "refresh", 113 | "clients": 1 114 | } 115 | ] 116 | }, 117 | { 118 | "name": "append-sorted-no-conflicts-index-only", 119 | "description": "Indexes the whole document corpus in an index sorted by pickup_datetime field in descending order (most recent first) and using a setup that will lead to a larger indexing throughput than the default settings and produce a smaller index (higher compression rate). Document ids are unique so all index operations are append only.", 120 | "schedule": [ 121 | { 122 | "operation": "delete-index" 123 | }, 124 | { 125 | "operation": { 126 | "operation-type": "create-index", 127 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 128 | "index.codec": "best_compression", 129 | "index.refresh_interval": "30s", 130 | "index.translog.flush_threshold_size": "4g", 131 | "index.sort.field": "pickup_datetime", 132 | "index.sort.order": "desc" 133 | }{%- endif %} 134 | } 135 | }, 136 | { 137 | "name": "check-cluster-health", 138 | "operation": { 139 | "operation-type": "cluster-health", 140 | "index": "nyc_taxis", 141 | "request-params": { 142 | "wait_for_status": "{{cluster_health | default('green')}}", 143 | "wait_for_no_relocating_shards": "true" 144 | } 145 | } 146 | }, 147 | { 148 | "operation": "index", 149 | "warmup-time-period": 240, 150 | "clients": {{bulk_indexing_clients | default(8)}} 151 | }, 152 | { 153 | "name": "refresh-after-index", 154 | "operation": "refresh", 155 | "clients": 1 156 | } 157 | ] 158 | }, 159 | { 160 | "name": "update", 161 | "schedule": [ 162 | { 163 | "operation": "delete-index" 164 | }, 165 | { 166 | "operation": { 167 | "operation-type": "create-index", 168 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 169 | "index.number_of_shards": {{number_of_shards | default(1)}}, 170 | "index.number_of_replicas": {{number_of_replicas | default(0)}}, 171 | "index.store.type": "{{store_type | default('fs')}}" 172 | }{%- endif %} 173 | } 174 | }, 175 | { 176 | "name": "check-cluster-health", 177 | "operation": { 178 | "operation-type": "cluster-health", 179 | "index": "nyc_taxis", 180 | "request-params": { 181 | "wait_for_status": "{{cluster_health | default('green')}}", 182 | "wait_for_no_relocating_shards": "true" 183 | } 184 | } 185 | }, 186 | { 187 | "operation": "update", 188 | "warmup-time-period": 1200, 189 | "clients": {{bulk_indexing_clients | default(8)}} 190 | }, 191 | { 192 | "name": "refresh-after-index", 193 | "operation": "refresh", 194 | "clients": 1 195 | }, 196 
| { 197 | "operation": "force-merge", 198 | "clients": 1 199 | }, 200 | { 201 | "name": "refresh-after-force-merge", 202 | "operation": "refresh", 203 | "clients": 1 204 | } 205 | ] 206 | }, 207 | {% set ml_job_id="benchmark_ml_job" %} 208 | {% set ml_feed_id="benchmark_nyc_taxis_feed" %} 209 | { 210 | "name": "append-ml", 211 | "description": "Indexes the whole document corpus and executes a machine learning job", 212 | "schedule": [ 213 | { 214 | "operation": "delete-index" 215 | }, 216 | { 217 | "operation": { 218 | "operation-type": "create-index", 219 | "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { 220 | "index.codec": "best_compression", 221 | "index.refresh_interval": "30s", 222 | "index.translog.flush_threshold_size": "4g" 223 | }{%- endif %} 224 | } 225 | }, 226 | { 227 | "operation": { 228 | "operation-type": "delete-ml-datafeed", 229 | "datafeed-id": "{{ml_feed_id}}", 230 | "force": true 231 | } 232 | }, 233 | { 234 | "operation": { 235 | "operation-type": "delete-ml-job", 236 | "job-id": "{{ml_job_id}}", 237 | "force": true 238 | } 239 | }, 240 | { 241 | "operation": { 242 | "operation-type": "create-ml-job", 243 | "job-id": "{{ml_job_id}}", 244 | "body": { 245 | "description": "NYC Taxis (count)", 246 | "analysis_config": { 247 | "bucket_span": "1h", 248 | "summary_count_field_name": "doc_count", 249 | "detectors": [ 250 | { 251 | "detector_description": "count", 252 | "function": "count" 253 | } 254 | ] 255 | }, 256 | "data_description": { 257 | "time_field": "pickup_datetime", 258 | "time_format": "epoch_ms" 259 | }, 260 | "model_plot_config": { 261 | "enabled": true 262 | } 263 | } 264 | } 265 | }, 266 | { 267 | "operation": { 268 | "operation-type": "open-ml-job", 269 | "job-id": "{{ml_job_id}}" 270 | } 271 | }, 272 | { 273 | "operation": { 274 | "operation-type": "create-ml-datafeed", 275 | "datafeed-id": "{{ml_feed_id}}", 276 | "body": { 277 | "job_id": "{{ml_job_id}}", 278 | "indices": [ 279 | "nyc_taxis" 280 | ], 281 | "query": { 282 | "match_all": { 283 | "boost": 1 284 | } 285 | }, 286 | "aggregations": { 287 | "buckets": { 288 | "date_histogram": { 289 | "field": "pickup_datetime", 290 | "fixed_interval": "3600000ms", 291 | "offset": 0, 292 | "order": { 293 | "_key": "asc" 294 | }, 295 | "keyed": false, 296 | "min_doc_count": 0 297 | }, 298 | "aggregations": { 299 | "pickup_datetime": { 300 | "max": { 301 | "field": "pickup_datetime" 302 | } 303 | } 304 | } 305 | } 306 | }, 307 | "scroll_size": 1000, 308 | "chunking_config": { 309 | "mode": "manual", 310 | "time_span": "3600000000ms" 311 | } 312 | } 313 | } 314 | }, 315 | { 316 | "name": "check-cluster-health", 317 | "operation": { 318 | "operation-type": "cluster-health", 319 | "index": "nyc_taxis", 320 | "request-params": { 321 | "wait_for_status": "{{cluster_health | default('green')}}", 322 | "wait_for_no_relocating_shards": "true" 323 | } 324 | } 325 | }, 326 | { 327 | "operation": "index", 328 | "warmup-time-period": 240, 329 | "clients": {{bulk_indexing_clients | default(8)}} 330 | }, 331 | { 332 | "name": "refresh-after-index", 333 | "operation": "refresh" 334 | }, 335 | { 336 | "operation": "force-merge" 337 | }, 338 | { 339 | "name": "refresh-after-force-merge", 340 | "operation": "refresh" 341 | }, 342 | { 343 | "operation": { 344 | "operation-type": "start-ml-datafeed", 345 | "datafeed-id": "{{ml_feed_id}}", 346 | "body": { 347 | "end": "now" 348 | } 349 | } 350 | }, 351 | { 352 | "operation": { 353 | "operation-type": "wait-for-ml-lookback", 354 | 
"include-in-reporting": false, 355 | "datafeed-id": "{{ml_feed_id}}" 356 | } 357 | }, 358 | { 359 | "operation": { 360 | "operation-type": "close-ml-job", 361 | "job-id": "{{ml_job_id}}" 362 | } 363 | } 364 | ] 365 | } 366 | --------------------------------------------------------------------------------