├── elasticsearch_runner
│   ├── __init__.py
│   ├── test
│   │   ├── __init__.py
│   │   ├── test_configuration.py
│   │   └── test_elasticsearch_runner.py
│   ├── configuration.py
│   ├── resources
│   │   ├── embedded_logging.yml
│   │   └── embedded_elasticsearch.yml
│   └── runner.py
├── .gitignore
├── requirements.txt
├── .travis.yml
├── setup.py
└── readme.md

/elasticsearch_runner/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/elasticsearch_runner/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | temp/
2 | *.pyc
3 | .idea
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | PyYAML
2 | elasticsearch
3 | requests
4 | psutil
5 | lxml
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - "2.7"
4 |   - "3.3"
5 |   - "3.4"
6 |   - "3.5"
7 | install: "pip install -r requirements.txt"
8 | script: nosetests
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='elasticsearch-runner',
5 |     version='0.1',
6 |     packages=['elasticsearch_runner.resources', 'elasticsearch_runner', 'elasticsearch_runner.test'],
7 |     url='https://bitbucket.org/comperio/comperio-text-analytics',
8 |     license='For internal use only.',
9 |     author='Andre Lynum',
10 |     author_email='andre.lynum@comperiosearch.com',
11 |     description='Lightweight runner for transient Elasticsearch instances, e.g. for testing.',
12 |     install_requires=['PyYAML', 'elasticsearch', 'requests', 'psutil', 'lxml'],
13 |     # package_data keys must be package names; the bare 'resources' key matched no package,
14 |     # so the bundled config files were left out of built distributions. runner.py needs both files.
15 |     package_data={'elasticsearch_runner.resources': ['embedded_elasticsearch.yml', 'embedded_logging.yml']}
16 | )
--------------------------------------------------------------------------------
/elasticsearch_runner/test/test_configuration.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from StringIO import StringIO
3 | except ImportError:
4 |     from io import StringIO
5 | from unittest import TestCase
6 | 
7 | import yaml
8 | 
9 | from elasticsearch_runner.configuration import generate_config, serialize_config
10 | 
11 | __author__ = 'alynum'
12 | 
13 | 
14 | class TestConfiguration(TestCase):
15 |     def test_generate_config(self):
16 |         self.assertEqual({
17 |             'marvel': {'agent': {'enabled': 'false'}},
18 |             'index': {
19 |                 'number_of_shards': 1,
20 |                 'number_of_replicas': 0,
21 |             },
22 |             'http': {
23 |                 'cors': {
24 |                     'enabled': True
25 |                 }
26 |             },
27 |             'cluster': {'name': 'ba'}},
28 |             generate_config(cluster_name='ba'))
29 | 
30 |     def test_serialize_config(self):
31 |         s = StringIO()
32 |         c = generate_config(cluster_name='ba')
33 |         serialize_config(s, c)
34 |         s.seek(0)
35 | 
36 |         # safe_load is sufficient for the plain dicts produced by serialize_config
37 |         self.assertEqual(c, yaml.safe_load(s))
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | ## Elasticsearch test runner [![Build Status](https://travis-ci.org/comperiosearch/python-elasticsearch-runner.svg)](https://travis-ci.org/comperiosearch/python-elasticsearch-runner)
2 | 
3 | python-elasticsearch-runner provides a standalone Python runner for Elasticsearch. It is intended
4 | for transient and lightweight usage such as small integration tests.
5 | 
6 | The runner takes about 10 seconds to start, so it should be part of at least module-level setup/teardown
7 | in order to keep test run time down.
8 | 
9 | The following code sets up the runner instance at module level with nosetests if placed in `__init__.py`:
10 | 
11 | ```python
12 | from elasticsearch_runner.runner import ElasticsearchRunner
13 | 
14 | es_runner = ElasticsearchRunner()
15 | 
16 | def setup():
17 |     es_runner.install()
18 |     es_runner.run()
19 |     es_runner.wait_for_green()
20 | 
21 | def teardown():
22 |     if es_runner and es_runner.is_running():
23 |         es_runner.stop()
24 | ```
25 | 
26 | The runner instance can then be queried for the port number when connecting:
27 | 
28 | ```python
29 | from elasticsearch import Elasticsearch
30 | 
31 | es = Elasticsearch(hosts=['localhost:%d' % es_runner.es_state.port])
32 | ```
33 | 
34 | ### Some details
35 | Runs with Python 2.7, 3.3, 3.4 and 3.5.
36 | By default, Elasticsearch version 2.1.0 is used, and everything is installed into the HOME/.elasticsearch_runner (most systems) or APPDATA/elasticsearch_runner (Windows) folder.
37 | 
38 | The runner accepts parameters for the Elasticsearch version and the install path.
39 | The install path is where the Elasticsearch software package and data storage will be kept.
40 | 
41 | ```python
42 | es_runner = ElasticsearchRunner(version='1.0.0', install_path='/var/test/')
43 | ```
44 | 
45 | The install path can also be provided through the environment variable `elasticsearch-runner-install-path`; if set, it overrides the `install_path` parameter.
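46 | 
47 | This is handy for pointing every runner in a CI job at a shared, cached install location without touching
48 | test code. A minimal sketch (the variable name matches the one read in `ElasticsearchRunner.__init__`;
49 | the paths are just illustrations):
50 | 
51 | ```python
52 | import os
53 | 
54 | from elasticsearch_runner.runner import ElasticsearchRunner
55 | 
56 | # must be set before the runner is constructed; it is read in __init__
57 | os.environ['elasticsearch-runner-install-path'] = '/var/cache/es-runner'
58 | 
59 | es_runner = ElasticsearchRunner(install_path='/tmp/ignored')  # the environment variable wins
60 | ```
61 | 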
--------------------------------------------------------------------------------
/elasticsearch_runner/configuration.py:
--------------------------------------------------------------------------------
1 | import os
2 | from random import randint
3 | 
4 | import yaml
5 | 
6 | 
7 | def generate_config(cluster_name=None, log_path=None, data_path=None):
8 |     """
9 |     Generates basic Elasticsearch configuration for setting up the runner.
10 | 
11 |     :param cluster_name: Set as cluster.name option.
12 |     :type cluster_name: str|unicode
13 |     :param log_path: Set as path.log option.
14 |     :type log_path: str|unicode
15 |     :param data_path: Set as path.data option.
16 |     :type data_path: str|unicode
17 |     :rtype : dict
18 |     :return: Elasticsearch configuration as dict.
19 |     """
20 |     config = {
21 |         'marvel': {'agent': {'enabled': 'false'}},
22 |         'index': {
23 |             'number_of_shards': 1,
24 |             'number_of_replicas': 0,
25 |         },
26 |         'http': {
27 |             'cors': {
28 |                 'enabled': True
29 |             }
30 |         }
31 |     }
32 | 
33 |     if not cluster_name:
34 |         cluster_name = generate_cluster_name()
35 | 
36 |     config['cluster'] = {'name': cluster_name}
37 | 
38 |     if log_path or data_path:
39 |         path = {}
40 | 
41 |         if log_path:
42 |             path['log'] = log_path
43 | 
44 |         if data_path:
45 |             path['data'] = data_path
46 | 
47 |         config['path'] = path
48 | 
49 |     return config
50 | 
51 | 
52 | def generate_cluster_name(prefix='elasticsearch_runner'):
53 |     """
54 |     Generates a cluster name with a prefix and a random number.
55 | 
56 |     :param prefix: Cluster name prefix.
57 |     :rtype : str|unicode
58 |     :return: cluster name string
59 | 
60 |     TODO make this collision safe
61 |     """
62 |     # zero-pad the random number so the generated name never contains spaces
63 |     cluster_name = '%s_%07d' % (prefix, randint(1, 9999999))
64 | 
65 |     return cluster_name
66 | 
67 | 
68 | def serialize_config(stream, config):
69 |     """
70 |     Serialize Elasticsearch configuration dict to a YAML formatted file.
71 | 
72 |     :param stream: Stream to write the YAML configuration to.
73 |     :param config: Elasticsearch configuration as dict.
74 |     :type config: dict
75 |     :rtype : dict
76 |     :return: The passed configuration dict.
77 |     """
78 |     yaml.dump(config, stream=stream)
79 | 
80 |     return config
81 | 
82 | 
83 | def package_path():
84 |     """
85 |     Returns the path to the root of the package directory.
86 | 
87 |     :rtype : str|unicode
88 |     :return: The root project path as a string.
89 |     """
90 |     self_path = os.path.dirname(os.path.abspath(__file__))
91 | 
92 |     return os.path.abspath(os.path.join(self_path, '..'))
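93 | 
94 | 
95 | if __name__ == '__main__':
96 |     # Minimal demo of the helpers above (not part of the runner's API): build a
97 |     # config for a hypothetical cluster and print the YAML that would be written
98 |     # to elasticsearch.yml.
99 |     import sys
100 | 
101 |     demo_config = generate_config(cluster_name='demo', data_path='/tmp/es-data')
102 |     serialize_config(sys.stdout, demo_config)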
--------------------------------------------------------------------------------
/elasticsearch_runner/resources/embedded_logging.yml:
--------------------------------------------------------------------------------
1 | # you can override this by setting a system property, for example -Des.logger.level=DEBUG
2 | es.logger.level: INFO
3 | rootLogger: ${es.logger.level}, console, file
4 | logger:
5 |   # log action execution errors for easier debugging
6 |   action: DEBUG
7 | 
8 |   # deprecation logging, turn to DEBUG to see them
9 |   deprecation: INFO, deprecation_log_file
10 | 
11 |   # reduce the logging for aws, too much is logged under the default INFO
12 |   com.amazonaws: WARN
13 |   # aws will try to do some sketchy JMX stuff, but it's not needed.
14 |   com.amazonaws.jmx.SdkMBeanRegistrySupport: ERROR
15 |   com.amazonaws.metrics.AwsSdkMetrics: ERROR
16 | 
17 |   org.apache.http: INFO
18 | 
19 |   # gateway
20 |   #gateway: DEBUG
21 |   #index.gateway: DEBUG
22 | 
23 |   # peer shard recovery
24 |   #indices.recovery: DEBUG
25 | 
26 |   # discovery
27 |   #discovery: TRACE
28 | 
29 |   index.search.slowlog: TRACE, index_search_slow_log_file
30 |   index.indexing.slowlog: TRACE, index_indexing_slow_log_file
31 | 
32 | additivity:
33 |   index.search.slowlog: false
34 |   index.indexing.slowlog: false
35 |   deprecation: false
36 | 
37 | appender:
38 |   console:
39 |     type: console
40 |     layout:
41 |       type: consolePattern
42 |       conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n"
43 | 
44 |   file:
45 |     type: dailyRollingFile
46 |     file: ${path.logs}/${cluster.name}.log
47 |     datePattern: "'.'yyyy-MM-dd"
48 |     layout:
49 |       type: pattern
50 |       conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %.10000m%n"
51 | 
52 |   # Use the following log4j-extras RollingFileAppender to enable gzip compression of log files.
53 |   # For more information see https://logging.apache.org/log4j/extras/apidocs/org/apache/log4j/rolling/RollingFileAppender.html
54 |   #file:
55 |     #type: extrasRollingFile
56 |     #file: ${path.logs}/${cluster.name}.log
57 |     #rollingPolicy: timeBased
58 |     #rollingPolicy.FileNamePattern: ${path.logs}/${cluster.name}.log.%d{yyyy-MM-dd}.gz
59 |     #layout:
60 |       #type: pattern
61 |       #conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n"
62 | 
63 |   deprecation_log_file:
64 |     type: dailyRollingFile
65 |     file: ${path.logs}/${cluster.name}_deprecation.log
66 |     datePattern: "'.'yyyy-MM-dd"
67 |     layout:
68 |       type: pattern
69 |       conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n"
70 | 
71 |   index_search_slow_log_file:
72 |     type: dailyRollingFile
73 |     file: ${path.logs}/${cluster.name}_index_search_slowlog.log
74 |     datePattern: "'.'yyyy-MM-dd"
75 |     layout:
76 |       type: pattern
77 |       conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n"
78 | 
79 |   index_indexing_slow_log_file:
80 |     type: dailyRollingFile
81 |     file: ${path.logs}/${cluster.name}_index_indexing_slowlog.log
82 |     datePattern: "'.'yyyy-MM-dd"
83 |     layout:
84 |       type: pattern
85 |       conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n"
86 | 
--------------------------------------------------------------------------------
/elasticsearch_runner/test/test_elasticsearch_runner.py:
--------------------------------------------------------------------------------
1 | import os
2 | import io
3 | from unittest import TestCase
4 | import json
5 | import requests
6 | from elasticsearch_runner.runner import ElasticsearchRunner, process_exists, parse_es_log_header
7 | 
8 | 
9 | class TestElasticsearchRunner(TestCase):
10 |     def __init__(self, methodName='runTest'):
11 |         super(TestElasticsearchRunner, self).__init__(methodName)
12 |         self.runner = None
13 |         self.runner2 = None
14 | 
15 |     def tearDown(self):
16 |         super(TestElasticsearchRunner, self).tearDown()
17 | 
18 |         if self.runner and self.runner.is_running():
19 |             self.runner.stop()
20 | 
21 |         if self.runner2 and self.runner2.is_running():
22 |             self.runner2.stop()
23 | 
24 |     def test_run(self):
25 |         self.runner = ElasticsearchRunner()
26 |         self.runner.install()
27 |         self.runner.run()
28 |         self.runner.wait_for_green()
29 | 
30 |         self.assertTrue(self.runner.is_running())
31 | 
32 |         health_resp = requests.get('http://localhost:%d/_cluster/health' % self.runner.es_state.port)
33 |         self.assertEqual(200, health_resp.status_code)
34 |         health_data = json.loads(health_resp.text)
35 |         self.assertEqual(health_data['status'], 'green')
36 | 
37 |         server_pid = self.runner.es_state.server_pid
38 | 
39 |         self.runner.stop()
40 | 
41 |         self.assertFalse(process_exists(server_pid))
42 |         self.assertFalse(self.runner.is_running())
43 |         self.assertIsNone(self.runner.es_state)
44 | 
45 |     def test_run_multiple(self):
46 |         self.runner = ElasticsearchRunner()
47 |         self.runner.install()
48 |         self.runner.run()
49 |         self.runner.wait_for_green()
50 | 
51 |         self.assertTrue(self.runner.is_running())
52 | 
53 |         self.runner2 = ElasticsearchRunner()
54 |         self.runner2.install()
55 |         self.runner2.run()
56 |         self.runner2.wait_for_green()
57 | 
58 |         self.assertTrue(self.runner2.is_running())
59 | 
60 |         health_resp = requests.get('http://localhost:%d/_cluster/health' % self.runner.es_state.port)
61 |         self.assertEqual(200, health_resp.status_code)
62 |         health_data = json.loads(health_resp.text)
63 |         self.assertEqual(health_data['status'], 'green')
64 | 
65 |         health_resp = requests.get('http://localhost:%d/_cluster/health' % self.runner2.es_state.port)
66 |         self.assertEqual(200,
health_resp.status_code) 67 | health_data = json.loads(health_resp.text) 68 | self.assertEqual(health_data['status'], 'green') 69 | 70 | server_pid = self.runner.es_state.server_pid 71 | 72 | self.runner.stop() 73 | 74 | self.assertFalse(process_exists(server_pid)) 75 | self.assertFalse(self.runner.is_running()) 76 | self.assertIsNone(self.runner.es_state) 77 | 78 | server_pid = self.runner2.es_state.server_pid 79 | 80 | self.runner2.stop() 81 | 82 | self.assertFalse(process_exists(server_pid)) 83 | self.assertFalse(self.runner2.is_running()) 84 | self.assertIsNone(self.runner2.es_state) 85 | 86 | def test_es_wrapper_call(self): 87 | # NB! beware that if the environment variable 'elasticsearch-runner-install-path' is set this test will fail 88 | runner = ElasticsearchRunner(install_path='fakepath') 89 | self.assertEqual(runner._es_wrapper_call('nt'), 90 | [os.path.sep.join(['fakepath', runner.version_folder, 'bin', 'elasticsearch.bat'])]) 91 | self.assertEqual(runner._es_wrapper_call('posix'), 92 | ['/bin/sh', os.path.sep.join(['fakepath', runner.version_folder, 'bin', 'elasticsearch'])]) 93 | 94 | def test_run_version2(self): 95 | es_version = '2.1.0' 96 | self.runner = ElasticsearchRunner(version=es_version) 97 | self.runner.install() 98 | self.runner.run() 99 | self.runner.wait_for_green() 100 | 101 | self.assertTrue(self.runner.is_running()) 102 | 103 | health_resp = requests.get('http://localhost:%d/_cluster/health' % self.runner.es_state.port) 104 | self.assertEqual(200, health_resp.status_code) 105 | health_data = json.loads(health_resp.text) 106 | self.assertEqual(health_data['status'], 'green') 107 | status = requests.get('http://localhost:%d' % self.runner.es_state.port) 108 | status_data = json.loads(status.text) 109 | self.assertEqual(status_data['version']['number'], es_version) 110 | server_pid = self.runner.es_state.server_pid 111 | 112 | self.runner.stop() 113 | 114 | self.assertFalse(process_exists(server_pid)) 115 | self.assertFalse(self.runner.is_running()) 116 | self.assertIsNone(self.runner.es_state) 117 | 118 | def test_run_version15(self): 119 | es_version = '1.5.2' 120 | self.runner = ElasticsearchRunner(version=es_version) 121 | self.runner.install() 122 | self.runner.run() 123 | self.runner.wait_for_green() 124 | 125 | self.assertTrue(self.runner.is_running()) 126 | 127 | health_resp = requests.get('http://localhost:%d/_cluster/health' % self.runner.es_state.port) 128 | self.assertEqual(200, health_resp.status_code) 129 | health_data = json.loads(health_resp.text) 130 | self.assertEqual(health_data['status'], 'green') 131 | status = requests.get('http://localhost:%d' % self.runner.es_state.port) 132 | status_data = json.loads(status.text) 133 | self.assertEqual(status_data['version']['number'], es_version) 134 | server_pid = self.runner.es_state.server_pid 135 | 136 | self.runner.stop() 137 | 138 | self.assertFalse(process_exists(server_pid)) 139 | self.assertFalse(self.runner.is_running()) 140 | self.assertIsNone(self.runner.es_state) 141 | 142 | def test_parse_log_header_esv2_format(self): 143 | testStream = io.StringIO() 144 | testStream.write( 145 | u"[2015-10-08 11:21:02,427][INFO ][node ] [Hero] version[2.0.0-rc1], pid[208], build[4757962/2015-10-01T10:06:08Z]\n") 146 | testStream.write( 147 | u"[2015-10-08 11:21:09,025][INFO ][http ] [Hero] publish_address {127.0.0.1:9200}, bound_addresses {127.0.0.1:9200}, {[::1]:9200}\n") 148 | testStream.write(u"[2015-10-08 11:04:15,784][INFO ][node ] [Hero] started\n") 149 | testStream.seek(0) 150 | server_pid, es_port 
= parse_es_log_header(testStream) 151 | self.assertEqual(server_pid, 208) 152 | self.assertEqual(es_port, 9200) 153 | 154 | def test_parse_log_header_esv1_format(self): 155 | testStream = io.StringIO() 156 | testStream.write( 157 | u"[2015-10-08 11:04:09,252][INFO ][node ] [Astronomer] version[1.7.2], pid[8248], build[e43676b/2015-09-14T09:49:53Z]\n") 158 | testStream.write( 159 | u"[2015-10-08 11:04:15,784][INFO ][http ] [Astronomer] bound_address {inet[/0:0:0:0:0:0:0:0:9200]}, publish_address {inet[/10.0.80.134:9200]}\n") 160 | testStream.write(u"[2015-10-08 11:04:15,784][INFO ][node ] [Astronomer] started\n") 161 | testStream.seek(0) 162 | server_pid, es_port = parse_es_log_header(testStream) 163 | self.assertEqual(server_pid, 8248) 164 | self.assertEqual(es_port, 9200) 165 | -------------------------------------------------------------------------------- /elasticsearch_runner/resources/embedded_elasticsearch.yml: -------------------------------------------------------------------------------- 1 | ##################### Elasticsearch Configuration Example ##################### 2 | 3 | # This file contains an overview of various configuration settings, 4 | # targeted at operations staff. Application developers should 5 | # consult the guide at . 6 | # 7 | # The installation procedure is covered at 8 | # . 9 | # 10 | # Elasticsearch comes with reasonable defaults for most settings, 11 | # so you can try it out without bothering with configuration. 12 | # 13 | # Most of the time, these defaults are just fine for running a production 14 | # cluster. If you're fine-tuning your cluster, or wondering about the 15 | # effect of certain configuration option, please _do ask_ on the 16 | # mailing list or IRC channel [http://elasticsearch.org/community]. 17 | 18 | # Any element in the configuration can be replaced with environment variables 19 | # by placing them in ${...} notation. For example: 20 | # 21 | #node.rack: ${RACK_ENV_VAR} 22 | 23 | # For information on supported formats and syntax for the config file, see 24 | # 25 | 26 | 27 | ################################### Cluster ################################### 28 | 29 | # Cluster name identifies your cluster for auto-discovery. If you're running 30 | # multiple clusters on the same network, make sure you're using unique names. 31 | # 32 | # cluster.name: no-cluster 33 | 34 | 35 | #################################### Node ##################################### 36 | 37 | # Node names are generated dynamically on startup, so you're relieved 38 | # from configuring them manually. You can tie this node to a specific name: 39 | # 40 | node.name: "Embedded Elasticsearch for testing" 41 | 42 | # Every node can be configured to allow or deny being eligible as the master, 43 | # and to allow or deny to store the data. 44 | # 45 | # Allow this node to be eligible as a master node (enabled by default): 46 | # 47 | #node.master: true 48 | # 49 | # Allow this node to store data (enabled by default): 50 | # 51 | #node.data: true 52 | 53 | # You can exploit these settings to design advanced cluster topologies. 54 | # 55 | # 1. You want this node to never become a master node, only to hold data. 56 | # This will be the "workhorse" of your cluster. 57 | # 58 | #node.master: false 59 | #node.data: true 60 | # 61 | # 2. You want this node to only serve as a master: to not store any data and 62 | # to have free resources. This will be the "coordinator" of your cluster. 63 | # 64 | #node.master: true 65 | #node.data: false 66 | # 67 | # 3. 
You want this node to be neither master nor data node, but 68 | # to act as a "search load balancer" (fetching data from nodes, 69 | # aggregating results, etc.) 70 | # 71 | #node.master: false 72 | #node.data: false 73 | 74 | # Use the Cluster Health API [http://localhost:9200/_cluster/health], the 75 | # Node Info API [http://localhost:9200/_nodes] or GUI tools 76 | # such as , 77 | # , 78 | # and 79 | # to inspect the cluster state. 80 | 81 | # A node can have generic attributes associated with it, which can later be used 82 | # for customized shard allocation filtering, or allocation awareness. An attribute 83 | # is a simple key value pair, similar to node.key: value, here is an example: 84 | # 85 | #node.rack: rack314 86 | 87 | # By default, multiple nodes are allowed to start from the same installation location 88 | # to disable it, set the following: 89 | #node.max_local_storage_nodes: 1 90 | 91 | 92 | #################################### Index #################################### 93 | 94 | # You can set a number of options (such as shard/replica options, mapping 95 | # or analyzer definitions, translog settings, ...) for indices globally, 96 | # in this file. 97 | # 98 | # Note, that it makes more sense to configure index settings specifically for 99 | # a certain index, either when creating it or by using the index templates API. 100 | # 101 | # See and 102 | # 103 | # for more information. 104 | 105 | # Set the number of shards (splits) of an index (5 by default): 106 | # 107 | #index.number_of_shards: 5 108 | 109 | # Set the number of replicas (additional copies) of an index (1 by default): 110 | # 111 | #index.number_of_replicas: 1 112 | 113 | # Note, that for development on a local machine, with small indices, it usually 114 | # makes sense to "disable" the distributed features: 115 | # 116 | index.number_of_shards: 1 117 | index.number_of_replicas: 0 118 | 119 | # These settings directly affect the performance of index and search operations 120 | # in your cluster. Assuming you have enough machines to hold shards and 121 | # replicas, the rule of thumb is: 122 | # 123 | # 1. Having more *shards* enhances the _indexing_ performance and allows to 124 | # _distribute_ a big index across machines. 125 | # 2. Having more *replicas* enhances the _search_ performance and improves the 126 | # cluster _availability_. 127 | # 128 | # The "number_of_shards" is a one-time setting for an index. 129 | # 130 | # The "number_of_replicas" can be increased or decreased anytime, 131 | # by using the Index Update Settings API. 132 | # 133 | # Elasticsearch takes care about load balancing, relocating, gathering the 134 | # results from nodes, etc. Experiment with different settings to fine-tune 135 | # your setup. 136 | 137 | # Use the Index Status API () to inspect 138 | # the index status. 139 | 140 | 141 | #################################### Paths #################################### 142 | 143 | # Path to directory containing configuration (this file and logging.yml): 144 | # 145 | #path.conf: /path/to/conf 146 | 147 | # Path to directory where to store index data allocated for this node. 148 | # 149 | #path.data: /path/to/data 150 | # 151 | # Can optionally include more than one location, causing data to be striped across 152 | # the locations (a la RAID 0) on a file level, favouring locations with most free 153 | # space on creation. 
For example: 154 | # 155 | #path.data: /path/to/data1,/path/to/data2 156 | 157 | # Path to temporary files: 158 | # 159 | #path.work: /path/to/work 160 | 161 | # Path to log files: 162 | # 163 | #path.logs: /path/to/logs 164 | 165 | # Path to where plugins are installed: 166 | # 167 | #path.plugins: /path/to/plugins 168 | 169 | 170 | #################################### Plugin ################################### 171 | 172 | # If a plugin listed here is not installed for current node, the node will not start. 173 | # 174 | #plugin.mandatory: mapper-attachments,lang-groovy 175 | 176 | 177 | ################################### Memory #################################### 178 | 179 | # Elasticsearch performs poorly when JVM starts swapping: you should ensure that 180 | # it _never_ swaps. 181 | # 182 | # Set this property to true to lock the memory: 183 | # 184 | #bootstrap.mlockall: true 185 | 186 | # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set 187 | # to the same value, and that the machine has enough memory to allocate 188 | # for Elasticsearch, leaving enough memory for the operating system itself. 189 | # 190 | # You should also make sure that the Elasticsearch process is allowed to lock 191 | # the memory, eg. by using `ulimit -l unlimited`. 192 | 193 | 194 | ############################## Network And HTTP ############################### 195 | 196 | # Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens 197 | # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node 198 | # communication. (the range means that if the port is busy, it will automatically 199 | # try the next port). 200 | 201 | # Set the bind address specifically (IPv4 or IPv6): 202 | # 203 | #network.bind_host: 192.168.0.1 204 | 205 | # Set the address other nodes will use to communicate with this node. If not 206 | # set, it is automatically derived. It must point to an actual IP address. 207 | # 208 | #network.publish_host: 192.168.0.1 209 | 210 | # Set both 'bind_host' and 'publish_host': 211 | # 212 | #network.host: 192.168.0.1 213 | 214 | # Set a custom port for the node to node communication (9300 by default): 215 | # 216 | #transport.tcp.port: 9300 217 | 218 | # Enable compression for all communication between nodes (disabled by default): 219 | # 220 | #transport.tcp.compress: true 221 | 222 | # Set a custom port to listen for HTTP traffic: 223 | # 224 | #http.port: 9200 225 | 226 | # Set a custom allowed content length: 227 | # 228 | #http.max_content_length: 100mb 229 | 230 | # Disable HTTP completely: 231 | # 232 | #http.enabled: false 233 | 234 | 235 | ################################### Gateway ################################### 236 | 237 | # The gateway allows for persisting the cluster state between full cluster 238 | # restarts. Every change to the state (such as adding an index) will be stored 239 | # in the gateway, and when the cluster starts up for the first time, 240 | # it will read its state from the gateway. 241 | 242 | # There are several types of gateway implementations. For more information, see 243 | # . 244 | 245 | # The default gateway type is the "local" gateway (recommended): 246 | # 247 | #gateway.type: local 248 | 249 | # Settings below control how and when to start the initial recovery process on 250 | # a full cluster restart (to reuse as much local data as possible when using shared 251 | # gateway). 
252 | 253 | # Allow recovery process after N nodes in a cluster are up: 254 | # 255 | #gateway.recover_after_nodes: 1 256 | 257 | # Set the timeout to initiate the recovery process, once the N nodes 258 | # from previous setting are up (accepts time value): 259 | # 260 | #gateway.recover_after_time: 5m 261 | 262 | # Set how many nodes are expected in this cluster. Once these N nodes 263 | # are up (and recover_after_nodes is met), begin recovery process immediately 264 | # (without waiting for recover_after_time to expire): 265 | # 266 | #gateway.expected_nodes: 2 267 | 268 | 269 | ############################# Recovery Throttling ############################# 270 | 271 | # These settings allow to control the process of shards allocation between 272 | # nodes during initial recovery, replica allocation, rebalancing, 273 | # or when adding and removing nodes. 274 | 275 | # Set the number of concurrent recoveries happening on a node: 276 | # 277 | # 1. During the initial recovery 278 | # 279 | #cluster.routing.allocation.node_initial_primaries_recoveries: 4 280 | # 281 | # 2. During adding/removing nodes, rebalancing, etc 282 | # 283 | #cluster.routing.allocation.node_concurrent_recoveries: 2 284 | 285 | # Set to throttle throughput when recovering (eg. 100mb, by default 20mb): 286 | # 287 | #indices.recovery.max_bytes_per_sec: 20mb 288 | 289 | # Set to limit the number of open concurrent streams when 290 | # recovering a shard from a peer: 291 | # 292 | #indices.recovery.concurrent_streams: 5 293 | 294 | 295 | ################################## Discovery ################################## 296 | 297 | # Discovery infrastructure ensures nodes can be found within a cluster 298 | # and master node is elected. Multicast discovery is the default. 299 | 300 | # Set to ensure a node sees N other master eligible nodes to be considered 301 | # operational within the cluster. This should be set to a quorum/majority of 302 | # the master-eligible nodes in the cluster. 303 | # 304 | #discovery.zen.minimum_master_nodes: 1 305 | 306 | # Set the time to wait for ping responses from other nodes when discovering. 307 | # Set this option to a higher value on a slow or congested network 308 | # to minimize discovery failures: 309 | # 310 | #discovery.zen.ping.timeout: 3s 311 | 312 | # For more information, see 313 | # 314 | 315 | # Unicast discovery allows to explicitly control which nodes will be used 316 | # to discover the cluster. It can be used when multicast is not present, 317 | # or to restrict the cluster communication-wise. 318 | # 319 | # 1. Disable multicast discovery (enabled by default): 320 | # 321 | #discovery.zen.ping.multicast.enabled: false 322 | # 323 | # 2. Configure an initial list of master nodes in the cluster 324 | # to perform discovery when new nodes (master or data) are started: 325 | # 326 | #discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] 327 | 328 | # EC2 discovery allows to use AWS EC2 API in order to perform discovery. 329 | # 330 | # You have to install the cloud-aws plugin for enabling the EC2 discovery. 331 | # 332 | # For more information, see 333 | # 334 | # 335 | # See 336 | # for a step-by-step tutorial. 337 | 338 | # GCE discovery allows to use Google Compute Engine API in order to perform discovery. 339 | # 340 | # You have to install the cloud-gce plugin for enabling the GCE discovery. 341 | # 342 | # For more information, see . 343 | 344 | # Azure discovery allows to use Azure API in order to perform discovery. 
345 | #
346 | # You have to install the cloud-azure plugin for enabling the Azure discovery.
347 | #
348 | # For more information, see .
349 | 
350 | ################################## Slow Log ##################################
351 | 
352 | # Shard level query and fetch threshold logging.
353 | 
354 | #index.search.slowlog.threshold.query.warn: 10s
355 | #index.search.slowlog.threshold.query.info: 5s
356 | #index.search.slowlog.threshold.query.debug: 2s
357 | #index.search.slowlog.threshold.query.trace: 500ms
358 | 
359 | #index.search.slowlog.threshold.fetch.warn: 1s
360 | #index.search.slowlog.threshold.fetch.info: 800ms
361 | #index.search.slowlog.threshold.fetch.debug: 500ms
362 | #index.search.slowlog.threshold.fetch.trace: 200ms
363 | 
364 | #index.indexing.slowlog.threshold.index.warn: 10s
365 | #index.indexing.slowlog.threshold.index.info: 5s
366 | #index.indexing.slowlog.threshold.index.debug: 2s
367 | #index.indexing.slowlog.threshold.index.trace: 500ms
368 | 
369 | ################################## GC Logging ################################
370 | 
371 | #monitor.jvm.gc.young.warn: 1000ms
372 | #monitor.jvm.gc.young.info: 700ms
373 | #monitor.jvm.gc.young.debug: 400ms
374 | 
375 | #monitor.jvm.gc.old.warn: 10s
376 | #monitor.jvm.gc.old.info: 5s
377 | #monitor.jvm.gc.old.debug: 2s
378 | 
379 | ################################## Security ################################
380 | 
381 | # Uncomment if you want to enable JSONP as a valid return transport on the
382 | # http server. With this enabled, it may pose a security risk, so disabling
383 | # it unless you need it is recommended (it is disabled by default).
384 | #
385 | #http.jsonp.enable: true
--------------------------------------------------------------------------------
/elasticsearch_runner/runner.py:
--------------------------------------------------------------------------------
1 | """
2 | Support for starting, stopping and managing an Elasticsearch instance from within a Python process.
3 | 
4 | Intended for testing and other lightweight purposes with transient data.
5 | 
6 | TODO Faster Elasticsearch startup.
7 | """
8 | from collections import namedtuple
9 | import json
10 | import logging
11 | import os
12 | import re
13 | from shutil import copyfile, rmtree
14 | from tempfile import mkdtemp
15 | from time import sleep, time
16 | from zipfile import ZipFile
17 | from subprocess import Popen
18 | import errno
19 | import sys
20 | 
21 | PY3 = sys.version_info > (3,)
22 | if PY3:
23 |     import urllib.parse
24 | else:
25 |     from urlparse import urlparse
26 | 
27 | from psutil import Process, NoSuchProcess
28 | import requests
29 | 
30 | from elasticsearch_runner.configuration import serialize_config, generate_config, generate_cluster_name, package_path
31 | 
32 | ES_DEFAULT_VERSION = '2.1.0'
33 | 
34 | ES_URLS = {'1.7.2': 'https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-1.7.2.zip',
35 |            '2.0.0': 'https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/zip/elasticsearch/2.0.0/elasticsearch-2.0.0.zip'}
36 | 
37 | ES_DEFAULT_URL_LOCATION = 'https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch'
38 | ES2_DEFAULT_URL_LOCATION = 'https://download.elasticsearch.org/elasticsearch/release/org/elasticsearch/distribution/zip/elasticsearch/'
39 | 
40 | 
41 | def fn_from_url(url):
42 |     """
43 |     Extract the final part of a URL in order to get the filename of a downloaded URL.
44 | 
45 |     :param url: url string
46 |     :type url : str|unicode
47 |     :rtype : str|unicode
48 |     :return: url filename part
49 |     """
50 |     if PY3:
51 |         parse = urllib.parse.urlparse(url)
52 |     else:
53 |         parse = urlparse(url)
54 | 
55 |     return os.path.basename(parse.path)
56 | 
57 | 
58 | def download_file(url, dest_path):
59 |     """
60 |     Download the file pointed to by the URL to the specified path.
61 |     If the file is already present at the path it will not be downloaded and the path to this file
62 |     is returned.
63 | 
64 |     :param url: url string pointing to the file
65 |     :type url : str|unicode
66 |     :param dest_path: path to the location where the file will be stored locally
67 |     :type dest_path : str|unicode
68 |     :rtype : str|unicode
69 |     :return: path to the downloaded file
70 |     """
71 |     if not os.path.exists(dest_path):
72 |         os.makedirs(dest_path)
73 | 
74 |     fn = fn_from_url(url)
75 |     full_fn = os.path.join(dest_path, fn)
76 | 
77 |     if os.path.exists(full_fn):
78 |         logging.info('Archive %s already exists in %s, skipping download ...' % (fn, dest_path))
79 |     else:
80 |         r = requests.get(url, stream=True)
81 |         with open(full_fn, 'wb') as f:
82 |             for chunk in r.iter_content(chunk_size=1024):
83 |                 if chunk:  # filter out keep-alive new chunks
84 |                     f.write(chunk)
85 |                     f.flush()
86 | 
87 |     return full_fn
88 | 
89 | 
90 | def check_java():
91 |     """
92 |     Simple check for Java availability on the local system.
93 | 
94 |     :rtype : bool
95 |     :return: True if Java is available on the command line
96 |     """
97 |     return os.system('java -version') == 0
98 | 
99 | 
100 | def process_exists(pid):
101 |     """
102 |     Check if there is a process with this PID.
103 | 
104 |     :param pid: Process ID
105 |     :type pid: int
106 |     :rtype : bool
107 |     :return: True if the process exists, False otherwise
108 |     """
109 |     if os.name == 'nt':
110 |         # TODO something more solid on windows?
111 |         try:
112 |             return Process(pid).status() == 'running'
113 |         except NoSuchProcess:
114 |             return False
115 |     else:
116 |         try:
117 |             os.kill(pid, 0)
118 |         except OSError:
119 |             return False
120 | 
121 |         return True
122 | 
123 | 
124 | def parse_es_log_header(log_file, limit=200):
125 |     """
126 |     Look at the Elasticsearch log for startup messages containing system information. The log is read until the
127 |     started message is detected or the number of lines read exceeds the limit.
128 |     The log file must be open for reading and at the desired position, i.e. the end, to read incoming log lines.
129 | 
130 |     :param log_file: file instance, open for reading, for the log file at the correct position
131 |     :type log_file: FileIO
132 |     :param limit: max lines to read before returning
133 |     :type limit: int
134 |     :rtype : (int|None, int|None)
135 |     :return: A tuple with the Elasticsearch instance PID and REST endpoint port number, i.e. (pid, port)
136 |     """
137 |     line = log_file.readline()
138 |     server_pid = None
139 |     es_port = None
140 |     count = 0
141 | 
142 |     while count < limit:
143 |         count += 1
144 |         line = line.strip()
145 | 
146 |         if line == '':
147 |             # nothing new in the log yet, wait a little before polling again
148 |             sleep(.1)
149 | 
150 |         m = re.search(r'pid\[(\d+)\]', line)
151 |         if m:
152 |             server_pid = int(m.group(1))
153 | 
154 |         m = re.search(r'\[http.*publish_address.*:(\d+)[\]}|}]', line)
155 |         if m:
156 |             es_port = int(m.group(1))
157 | 
158 |         if re.search('started', line):
159 |             return server_pid, es_port
160 | 
161 |         line = log_file.readline()
162 | 
163 |     logging.warning('Read more than %d lines while parsing Elasticsearch log header. Giving up ...' % limit)
164 | 
165 |     return server_pid, es_port
166 | 
167 | 
168 | # tuple holding information about the current Elasticsearch process
169 | ElasticsearchState = namedtuple('ElasticsearchState', 'server_pid wrapper_pid port config_fn')
170 | 
171 | 
172 | class ElasticsearchRunner:
173 |     """
174 |     Runs a basic single node Elasticsearch instance for testing or other lightweight purposes.
175 |     """
176 | 
177 |     def __init__(self, install_path=None, transient=False, version=None):
178 |         """
179 |         :param version: Elasticsearch version to run. Defaults to 2.1.0.
180 |         :type version: str|unicode
181 |         :param install_path: The path where the Elasticsearch software package and data storage will be kept.
182 |             If no install path is set, it defaults to APPDATA/elasticsearch_runner (Windows) or
183 |             HOME/.elasticsearch_runner (other systems). The install path can also be provided through the
184 |             environment variable 'elasticsearch-runner-install-path'; if set, it overrides this parameter.
185 |         :type install_path: str|unicode
186 |         :param transient: Not implemented.
187 |         :type transient: bool
188 |         """
189 |         if os.getenv('elasticsearch-runner-install-path'):
190 |             install_path = os.getenv('elasticsearch-runner-install-path')
191 | 
192 |         if install_path:
193 |             self.install_path = install_path
194 |         else:
195 |             if os.name == 'nt':
196 |                 self.install_path = os.path.join(os.getenv("APPDATA"), 'elasticsearch_runner', 'embedded-es')
197 |             else:
198 |                 self.install_path = os.path.join(os.getenv("HOME"), '.elasticsearch_runner', 'embedded-es')
199 | 
200 |         if version:
201 |             self.version = version
202 |         else:
203 |             self.version = ES_DEFAULT_VERSION
204 | 
205 |         self.version_folder = "elasticsearch-%s" % self.version
206 |         self.transient = transient
207 |         self.es_state = None
208 |         self.es_config = None
209 | 
210 |         if not check_java():
211 |             logging.error('Java not installed. Elasticsearch won\'t be able to run ...')
212 | 
213 |     def install(self):
214 |         """
215 |         Download and install the Elasticsearch software in the install path. If already downloaded or
216 |         installed, those steps are skipped.
217 | 
218 |         :rtype : ElasticsearchRunner
219 |         :return: The instance called on.
220 |         """
221 |         if self.version in ES_URLS:
222 |             download_url = ES_URLS[self.version]
223 |         else:
224 |             if self.version.startswith('1'):
225 |                 download_url = "%s-%s.zip" % (ES_DEFAULT_URL_LOCATION, self.version)
226 |             else:
227 |                 download_url = "%s%s/elasticsearch-%s.zip" % (ES2_DEFAULT_URL_LOCATION, self.version, self.version)
228 | 
229 |         es_archive_fn = download_file(download_url, self.install_path)
230 | 
231 |         if not os.path.exists(os.path.join(self.install_path, self.version_folder)):
232 |             with ZipFile(es_archive_fn, "r") as z:
233 |                 z.extractall(self.install_path)
234 | 
235 |         # insert basic config file
236 |         copyfile(os.path.join(package_path(), 'elasticsearch_runner', 'resources', 'embedded_elasticsearch.yml'),
237 |                  os.path.join(self.install_path, self.version_folder, 'config', 'elasticsearch.yml'))
238 | 
239 |         return self
240 | 
241 |     def run(self):
242 |         """
243 |         Start the Elasticsearch server. The REST port and PID of the running instance are stored in the
244 |         es_state field.
245 | 
246 |         :rtype : ElasticsearchRunner
247 |         :return: The instance called on.
248 |         """
249 |         if self.is_running():
250 |             logging.warning('Elasticsearch already running ...')
251 |         else:
252 |             # generate and insert Elasticsearch configuration file with transient data and log paths
253 |             cluster_name = generate_cluster_name()
254 |             cluster_path = mkdtemp(prefix='%s-%s-' % (self.version, cluster_name), dir=self.install_path)
255 |             es_data_dir = os.path.join(cluster_path, "data")
256 |             es_config_dir = os.path.join(cluster_path, "config")
257 |             es_log_dir = os.path.join(cluster_path, "log")
258 |             self.es_config = generate_config(cluster_name=cluster_name, data_path=es_data_dir, log_path=es_log_dir)
259 |             config_fn = os.path.join(es_config_dir, 'elasticsearch.yml')
260 | 
261 |             try:
262 |                 os.makedirs(es_log_dir)
263 |                 os.makedirs(es_data_dir)
264 |                 os.makedirs(es_config_dir)
265 |             except OSError as exception:
266 |                 if exception.errno != errno.EEXIST:
267 |                     raise
268 | 
269 |             with open(config_fn, 'w') as f:
270 |                 serialize_config(f, self.es_config)
271 | 
272 |             copyfile(os.path.join(package_path(), 'elasticsearch_runner', 'resources', 'embedded_logging.yml'),
273 |                      os.path.join(es_config_dir, 'logging.yml'))
274 | 
275 |             es_log_fn = os.path.join(es_log_dir, '%s.log' % cluster_name)
276 |             # create the log file if it doesn't exist yet. We need to open it and seek to the end before
277 |             # sniffing out the configuration info from the log.
278 |             open(es_log_fn, 'a').close()
279 | 
280 |             runcall = self._es_wrapper_call(os.name) + ['-Des.path.conf=%s' % es_config_dir,
281 |                                                         '-Des.path.logs=%s' % es_log_dir]
282 |             wrapper_proc = Popen(runcall)
283 | 
284 |             es_log_f = open(es_log_fn, 'r')
285 |             es_log_f.seek(0, 2)
286 | 
287 |             # watch the log for the startup header with PID and port
288 |             server_pid, es_port = parse_es_log_header(es_log_f)
289 | 
290 |             if not server_pid:
291 |                 logging.error('Server PID not detected ... runcall was %s' % runcall)
292 | 
293 |             if not es_port:
294 |                 logging.error('Server http port not detected ...')
295 | 
296 |             self.es_state = ElasticsearchState(wrapper_pid=wrapper_proc.pid,
297 |                                                server_pid=server_pid,
298 |                                                port=es_port,
299 |                                                config_fn=config_fn)
300 | 
301 |         return self
302 | 
303 |     def _es_wrapper_call(self, os_name):
304 |         """
305 |         :param os_name: OS identifier as returned by os.name
306 |         :type os_name: str|unicode
307 |         :rtype : list[str|unicode]
308 |         :return: Command list for launching the Elasticsearch wrapper script.
309 |         """
310 |         if os_name == 'nt':
311 |             es_bin = [os.path.join(self.install_path, self.version_folder, 'bin', 'elasticsearch.bat')]
312 |         else:
313 |             es_bin = ['/bin/sh', os.path.join(self.install_path, self.version_folder, 'bin', 'elasticsearch')]
314 | 
315 |         return es_bin
316 | 
317 |     def stop(self):
318 |         """
319 |         Stop the Elasticsearch server.
320 | 
321 |         :rtype : ElasticsearchRunner
322 |         :return: The instance called on.
323 |         """
324 |         if self.is_running():
325 |             server_proc = Process(self.es_state.server_pid)
326 |             server_proc.terminate()
327 |             server_proc.wait()
328 | 
329 |             if process_exists(self.es_state.server_pid):
330 |                 logging.warning('Failed to stop Elasticsearch server process PID %d ...' % self.es_state.server_pid)
331 | 
332 |             # delete transient directories
333 |             if 'path' in self.es_config:
334 |                 if 'log' in self.es_config['path']:
335 |                     log_path = self.es_config['path']['log']
336 |                     logging.info('Removing transient log path %s ...' % log_path)
337 |                     rmtree(log_path)
338 | 
339 |                 if 'data' in self.es_config['path']:
340 |                     data_path = self.es_config['path']['data']
341 |                     logging.info('Removing transient data path %s ...' % data_path)
342 |                     rmtree(data_path)
343 | 
344 |             # delete temporary config file
345 |             if os.path.exists(self.es_state.config_fn):
346 |                 logging.info('Removing transient configuration file %s ...' % self.es_state.config_fn)
347 |                 os.remove(self.es_state.config_fn)
348 | 
349 |             self.es_state = None
350 |             self.es_config = None
351 |         else:
352 |             logging.warning('Elasticsearch is not running ...')
353 | 
354 |         return self
355 | 
356 |     def is_running(self):
357 |         """
358 |         Checks that the instance has a registered server process and that the process exists.
359 | 
360 |         :rtype : bool
361 |         :return: True if the server is running, False if not.
362 |         """
363 |         state = self.es_state
364 | 
365 |         return state and process_exists(state.server_pid)
366 | 
367 |     def wait_for_green(self, timeout=1.):
368 |         """
369 |         Check if the cluster status is green and wait for it to become green if it's not.
370 |         Run after starting the runner to ensure that the Elasticsearch instance is ready.
371 | 
372 |         :param timeout: The time to wait for a green cluster response in seconds.
373 |         :type timeout: int|long|float
374 |         :rtype : ElasticsearchRunner
375 |         :return: The instance called on.
376 |         """
377 |         if not self.es_state:
378 |             logging.warning('Elasticsearch runner is not started ...')
379 |             return self
380 | 
381 |         if self.es_state.port is None:
382 |             logging.warning('Elasticsearch runner not properly started ...')
383 |             return self
384 | 
385 |         # use wall-clock time for the deadline; time.clock() measures CPU time on
386 |         # Unix and would make the timeout unreliable there
387 |         end_time = time() + timeout
388 |         health_resp = requests.get('http://localhost:%d/_cluster/health' % self.es_state.port)
389 |         health_data = json.loads(health_resp.text)
390 | 
391 |         while health_data['status'] != 'green':
392 |             if time() > end_time:
393 |                 logging.error('Elasticsearch cluster failed to turn green in %f seconds, current status is %s ...' %
394 |                               (timeout, health_data['status']))
395 | 
396 |                 return self
397 | 
398 |             sleep(.1)  # brief pause between health polls
399 |             health_resp = requests.get('http://localhost:%d/_cluster/health' % self.es_state.port)
400 |             health_data = json.loads(health_resp.text)
401 | 
402 |         return self
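403 | 
404 | 
405 | if __name__ == '__main__':
406 |     # Minimal smoke test mirroring the calls exercised in the test suite: install,
407 |     # start, wait for a green cluster, query the health endpoint and shut down.
408 |     # Assumes a local Java installation and network access for the first download.
409 |     runner = ElasticsearchRunner()
410 |     runner.install()
411 |     runner.run()
412 |     runner.wait_for_green(timeout=60)
413 | 
414 |     health = requests.get('http://localhost:%d/_cluster/health' % runner.es_state.port)
415 |     print(health.text)
416 | 
417 |     runner.stop()
--------------------------------------------------------------------------------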