├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── es_pandas ├── __init__.py └── es_pandas.py ├── requirements.txt ├── setup.py └── test.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.xml 3 | build/lib/es_pandas/__init__.py 4 | build/lib/es_pandas/es_pandas.py 5 | *.whl 6 | *.gz 7 | *.txt 8 | es_pandas.egg-info/PKG-INFO 9 | *.pickle 10 | *.pyc 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | services: 4 | - elasticsearch 5 | 6 | python: 7 | - "3.7" 8 | 9 | script: 10 | - curl "localhost:9200" 11 | - python test.py 12 | - curl "localhost:9200/_cat/indices" 13 | - curl "localhost:9200/demo/_search?pretty" 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Frank 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # es_pandas 2 | [![Build Status](https://travis-ci.org/fuyb1992/es_pandas.svg?branch=master)](https://travis-ci.org/fuyb1992/es_pandas) 996.icu [![LICENSE](https://img.shields.io/badge/license-Anti%20996-blue.svg)](https://github.com/996icu/996.ICU/blob/master/LICENSE) [![PyPi version](https://img.shields.io/pypi/v/es_pandas)](https://pypi.org/project/es-pandas/) 3 | [![Downloads](https://pepy.tech/badge/es-pandas/month)](https://pepy.tech/project/es-pandas) 4 | 5 | Read, write and update large scale [pandas](http://pandas.pydata.org/) DataFrame with [ElasticSearch](https://www.elastic.co/). 6 | 7 | 8 | ## Requirements 9 | This package should work on Python3(>=3.4) and ElasticSearch should be version 5.x, 6.x or 7.x. 10 | 11 | Installation 12 | The package is hosted on PyPi and can be installed with pip: 13 | ``` 14 | pip install es_pandas 15 | ``` 16 | #### Deprecation Notice 17 | 18 | Support of ElasticSearch 5.x will be deprecated in a future version. 
19 | 20 | ## Usage 21 | 22 | ``` 23 | import time 24 | 25 | import pandas as pd 26 | 27 | from es_pandas import es_pandas 28 | 29 | 30 | # Information of es cluster 31 | es_host = 'localhost:9200' 32 | index = 'demo' 33 | 34 | # create es_pandas instance 35 | ep = es_pandas(es_host) 36 | 37 | # Example data frame 38 | df = pd.DataFrame({'Num': [x for x in range(100000)]}) 39 | df['Alpha'] = 'Hello' 40 | df['Date'] = pd.Timestamp.now() 41 | 42 | # init template if you want 43 | doc_type = 'demo' 44 | ep.init_es_tmpl(df, doc_type) 45 | 46 | # Example of write data to es, use the template you created 47 | ep.to_es(df, index, doc_type=doc_type, thread_count=2, chunk_size=10000) 48 | 49 | # set use_index=True if you want to use DataFrame index as records' _id 50 | ep.to_es(df, index, doc_type=doc_type, use_index=True, thread_count=2, chunk_size=10000) 51 | 52 | # delete records from es 53 | ep.to_es(df.iloc[5000:], index, doc_type=doc_type, _op_type='delete', thread_count=2, chunk_size=10000) 54 | 55 | # Update doc by doc _id 56 | df.iloc[:1000, 1] = 'Bye' 57 | df.iloc[:1000, 2] = pd.Timestamp.now() 58 | ep.to_es(df.iloc[:1000, 1:], index, doc_type=doc_type, _op_type='update') 59 | 60 | # Example of read data from es 61 | df = ep.to_pandas(index) 62 | print(df.head()) 63 | 64 | # return certain fields in es 65 | heads = ['Num', 'Date'] 66 | df = ep.to_pandas(index, heads=heads) 67 | print(df.head()) 68 | 69 | # set certain columns dtype 70 | dtype = {'Num': 'float', 'Alpha': object} 71 | df = ep.to_pandas(index, dtype=dtype) 72 | print(df.dtypes) 73 | 74 | # infer dtype from es template 75 | df = ep.to_pandas(index, infer_dtype=True) 76 | print(df.dtypes) 77 | 78 | # use query_sql parameter if you want to do query in sql 79 | 80 | # Example of write data to es with pandas.io.json 81 | ep.to_es(df, index, doc_type=doc_type, use_pandas_json=True, thread_count=2, chunk_size=10000) 82 | print('write es doc with pandas.io.json finished') 83 | ``` 84 | 
-------------------------------------------------------------------------------- /es_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | from .es_pandas import es_pandas 2 | 3 | VERSION = (0, 0, 23) 4 | __version__ = VERSION 5 | __versionstr__ = ".".join(map(str, VERSION)) 6 | -------------------------------------------------------------------------------- /es_pandas/es_pandas.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tqdm 3 | import warnings 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | from pandas.io import json 9 | from elasticsearch import Elasticsearch, helpers 10 | 11 | 12 | class es_pandas(object): 13 | """ 14 | Read, write and update large scale pandas DataFrame with Elasticsearch 15 | """ 16 | def __init__(self, *args, **kwargs): 17 | self.es = Elasticsearch(*args, **kwargs) 18 | self.ic = self.es.indices 19 | self.dtype_mapping = {'text': 'category', 'date': 'datetime64[ns]'} 20 | self.id_col = '_id' 21 | self.es_version_str = self.es.info()['version']['number'] 22 | self.es_version = [int(x) for x in re.findall("[0-9]+", self.es_version_str)] 23 | if self.es_version[0] < 6: 24 | warnings.warn('Supporting of ElasticSearch 5.x will by deprecated in future version, ' 25 | 'current es version: %s' % self.es_version_str, category=FutureWarning) 26 | 27 | def to_es(self, df, index, doc_type=None, use_index=False, show_progress=True, 28 | success_threshold=0.9, _op_type='index', use_pandas_json=False, date_format='iso', **kwargs): 29 | """ 30 | :param df: pandas DataFrame data 31 | :param index: full name of es indices 32 | :param doc_type: full name of es template 33 | :param use_index: use DataFrame index as records' _id 34 | :param success_threshold: 35 | :param _op_type: elasticsearch _op_type, default 'index', choices: 'index', 'create', 'update', 'delete' 36 | :param use_pandas_json: default False, if True, use 
pandas.io.json serialize 37 | :param date_format: default iso, only works when use_pandas_json=True 38 | :return: num of the number of data written into es successfully 39 | """ 40 | if self.es_version[0] > 6: 41 | doc_type = None 42 | elif self.es_version[0] > 5: 43 | doc_type = '_doc' 44 | elif not doc_type: 45 | doc_type = index + '_type' 46 | gen = helpers.parallel_bulk(self.es, 47 | (self.rec_to_actions(df, index, doc_type=doc_type, show_progress=show_progress, 48 | use_index=use_index, _op_type=_op_type, 49 | use_pandas_json=use_pandas_json, date_format=date_format)), 50 | **kwargs) 51 | 52 | success_num = np.sum([res[0] for res in gen]) 53 | rec_num = len(df) 54 | fail_num = rec_num - success_num 55 | 56 | if (success_num / rec_num) < success_threshold: 57 | raise Exception('%d records write failed' % fail_num) 58 | 59 | return success_num 60 | 61 | @staticmethod 62 | def get_source(anl, show_progress=False, count=0): 63 | if show_progress: 64 | with tqdm.tqdm(total=count) as bar: 65 | for i in range(count): 66 | mes = next(anl) 67 | yield {'_id': mes['_id'], **mes['_source']} 68 | bar.update() 69 | else: 70 | for mes in anl: 71 | yield {'_id': mes['_id'], **mes['_source']} 72 | 73 | def infer_dtype(self, index, heads): 74 | if self.es_version[0] > 6: 75 | mapping = self.ic.get_mapping(index=index) 76 | else: 77 | # Fix es client unrecongnized parameter 'include_type_name' bug for es 6.x 78 | mapping = self.ic.get_mapping(index=index) 79 | key = [k for k in mapping[index]['mappings'].keys() if k != '_default_'] 80 | if len(key) < 1: 81 | raise Exception('No templates exits: %s' % index) 82 | mapping[index]['mappings']['properties'] = mapping[index]['mappings'][key[0]]['properties'] 83 | dtype = {k: v['type'] for k, v in mapping[index]['mappings']['properties'].items() if k in heads} 84 | dtype = {k: self.dtype_mapping[v] for k, v in dtype.items() if v in self.dtype_mapping} 85 | return dtype 86 | 87 | def to_pandas(self, index, query_rule=None, heads=None, 
dtype=None, infer_dtype=False, show_progress=True, 88 | query_sql=None, **kwargs): 89 | """ 90 | scroll datas from es, and convert to dataframe, the index of dataframe is from es index, 91 | about 2 million records/min 92 | Args: 93 | index: full name of es indices 94 | query_rule: dict, default match_all, elasticsearch query DSL 95 | heads: certain columns get from es fields, None for all fields 96 | dtype: dict like, pandas dtypes for certain columns 97 | infer_dtype: bool, default False, if true, get dtype from es template 98 | show_progress: bool, default True, if true, show progressbar on console 99 | query_sql: string or dict, default None, SQL containing query to filter 100 | Returns: 101 | DataFrame 102 | """ 103 | if heads is None: 104 | heads = [] 105 | if dtype is None: 106 | dtype = dict() 107 | if query_sql: 108 | if isinstance(query_sql, str): 109 | dsl_from_sql = self.es.sql.translate(query=query_sql) 110 | elif isinstance(query_sql, dict): 111 | dsl_from_sql = self.es.sql.translate(query_sql) 112 | else: 113 | raise Exception('Parameter data type error, query_sql should be string or dict type') 114 | if query_rule: 115 | raise Exception('Cannot use query_rule and query_sql at the same time') 116 | else: 117 | query_rule = {'query': dsl_from_sql['query']} 118 | elif not query_rule: 119 | query_rule = {'query': {'match_all': {}}} 120 | count = self.es.count(index=index, body=query_rule)['count'] 121 | if count < 1: 122 | return pd.DataFrame() 123 | query_rule['_source'] = heads 124 | anl = helpers.scan(self.es, query=query_rule, index=index, **kwargs) 125 | df = pd.DataFrame(self.get_source(anl, show_progress=show_progress, count=count)).set_index('_id') 126 | if infer_dtype: 127 | dtype = self.infer_dtype(index, df.columns.values) 128 | if len(dtype): 129 | df = df.astype(dtype) 130 | return df 131 | 132 | @staticmethod 133 | def serialize(row, columns, use_pandas_json, iso_dates): 134 | if use_pandas_json: 135 | return json.dumps(dict(zip(columns, 
row)), iso_dates=iso_dates) 136 | return dict(zip(columns, [None if np.all(pd.isna(r)) else r for r in row])) 137 | 138 | @staticmethod 139 | def gen_action(**kwargs): 140 | return {k: v for k, v in kwargs.items() if v is not None} 141 | 142 | def rec_to_actions(self, df, index, doc_type=None, use_index=False, _op_type='index', use_pandas_json=False, 143 | date_format='iso', show_progress=True): 144 | if show_progress: 145 | bar = tqdm.tqdm(total=df.shape[0]) 146 | else: 147 | bar = BarNothing() 148 | columns = df.columns.tolist() 149 | iso_dates = date_format == 'iso' 150 | if use_index and (_op_type in ['create', 'index']): 151 | for i, row in enumerate(df.itertuples(name=None, index=use_index)): 152 | bar.update() 153 | _id = row[0] 154 | record = self.serialize(row[1:], columns, use_pandas_json, iso_dates) 155 | action = self.gen_action(_op_type=_op_type, _index=index, _type=doc_type, _id=_id, _source=record) 156 | yield action 157 | elif (not use_index) and (_op_type == 'index'): 158 | for i, row in enumerate(df.itertuples(name=None, index=use_index)): 159 | bar.update() 160 | record = self.serialize(row, columns, use_pandas_json, iso_dates) 161 | action = self.gen_action(_op_type=_op_type, _index=index, _type=doc_type, _source=record) 162 | yield action 163 | elif _op_type == 'update': 164 | for i, row in enumerate(df.itertuples(name=None, index=True)): 165 | bar.update() 166 | _id = row[0] 167 | record = self.serialize(row[1:], columns, False, iso_dates) 168 | action = self.gen_action(_op_type=_op_type, _index=index, _type=doc_type, _id=_id, doc=record) 169 | yield action 170 | elif _op_type == 'delete': 171 | for i, _id in enumerate(df.index.values.tolist()): 172 | bar.update() 173 | action = self.gen_action(_op_type=_op_type, _index=index, _type=doc_type, _id=_id) 174 | yield action 175 | else: 176 | raise Exception('[%s] action with %s using index not supported' % (_op_type, '' if use_index else 'not')) 177 | 178 | def init_es_tmpl(self, df, doc_type, 
delete=False, index_patterns=None, **kwargs): 179 | """ 180 | 181 | :param df: pd.DataFrame 182 | :param doc_type: str, name of doc_type 183 | :param delete: bool, if True, delete existed template 184 | :param index_patterns: list, default None, [doc_type*] 185 | :param kwargs: kwargs for template settings, 186 | example: number_of_shards, number_of_replicas, refresh_interval 187 | :return: 188 | """ 189 | tmpl_exits = self.es.indices.exists_template(name=doc_type) 190 | if tmpl_exits and (not delete): 191 | return 192 | if index_patterns is None: 193 | index_patterns = ['%s*' % doc_type] 194 | columns_body = {} 195 | 196 | if isinstance(df, pd.DataFrame): 197 | iter_dict = df.dtypes.to_dict() 198 | elif isinstance(df, dict): 199 | iter_dict = df 200 | else: 201 | raise Exception('init tmpl type is error, only accept DataFrame or dict of head with type mapping') 202 | for key, data_type in iter_dict.items(): 203 | type_str = getattr(data_type, 'name', data_type).lower() 204 | if 'int' in type_str: 205 | columns_body[key] = {'type': 'long'} 206 | elif 'datetime' in type_str: 207 | columns_body[key] = {'type': 'date'} 208 | elif 'float' in type_str: 209 | columns_body[key] = {'type': 'float'} 210 | else: 211 | columns_body[key] = {'type': 'keyword', 'ignore_above': '256'} 212 | if self.es_version[0] > 6: 213 | mappings = {'properties': columns_body} 214 | elif self.es_version[0] > 5: 215 | mappings = {'_doc': {'properties': columns_body}} 216 | else: 217 | mappings = {'_default_': {'properties': columns_body}} 218 | if tmpl_exits and delete: 219 | self.es.indices.delete_template(name=doc_type) 220 | print('Delete and put template: %s' % doc_type) 221 | self.es.indices.put_template(name=doc_type, index_patterns=index_patterns, mappings=mappings, settings=kwargs) 222 | print('New template %s added' % doc_type) 223 | 224 | 225 | class BarNothing(object): 226 | def update(self, arg): 227 | pass 228 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | elasticsearch 3 | tqdm 4 | requests 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from es_pandas import __versionstr__ 2 | import setuptools 3 | 4 | with open('README.md', 'r') as fh: 5 | long_description = fh.read() 6 | 7 | 8 | setuptools.setup( 9 | name='es_pandas', 10 | version=__versionstr__, 11 | author='Frank', 12 | author_email='fu.frank@foxmail.com', 13 | description='Read, write and update large scale pandas DataFrame with ElasticSearch', 14 | long_description=long_description, 15 | long_description_content_type='text/markdown', 16 | url='https://github.com/fuyb1992/es_pandas', 17 | packages=setuptools.find_packages(), 18 | install_requires=open('requirements.txt').read().strip().split('\n'), 19 | classifiers=[ 20 | 'Programming Language :: Python :: 3', 21 | 'License :: OSI Approved :: MIT License', 22 | 'Operating System :: OS Independent', 23 | ], 24 | ) 25 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pytz 3 | 4 | import pandas as pd 5 | 6 | from datetime import datetime 7 | from es_pandas import es_pandas 8 | 9 | 10 | # Information of es cluseter 11 | es_host = 'http://localhost:9200' 12 | index = 'demo' 13 | tz = pytz.timezone('Asia/Shanghai') 14 | 15 | # crete es_pandas instance 16 | ep = es_pandas(es_host) 17 | 18 | # Example data frame 19 | df = pd.DataFrame({'Num': [x for x in range(100000)]}) 20 | df['Alpha'] = 'Hello' 21 | df['Date'] = datetime.now().astimezone(tz) 22 | # add null value 23 | df.iloc[0] = None 24 | 25 | # init template if you want 26 | doc_type = 
'demo' 27 | ep.init_es_tmpl(df, doc_type, delete=True) 28 | 29 | # Example of write data to es 30 | ep.to_es(df, index, doc_type=doc_type, thread_count=2, chunk_size=5000) 31 | print('write es doc without index finished') 32 | 33 | # Example of use DataFrame index as es doc _id 34 | ep.to_es(df, index, doc_type=doc_type, use_index=True, thread_count=2, chunk_size=1000, show_progress=False) 35 | print('write es doc with index finished') 36 | 37 | # waiting for es data writing 38 | time.sleep(5) 39 | 40 | # Delete doc by doc _id 41 | ep.to_es(df.iloc[5000:], index, doc_type=doc_type, _op_type='delete', thread_count=2, chunk_size=1000) 42 | print('delete es doc finished') 43 | 44 | # waiting for es data writing 45 | time.sleep(5) 46 | 47 | # Update doc by doc _id 48 | df.iloc[:1000, 1] = 'Bye' 49 | df.iloc[:1000, 2] = datetime.now().astimezone(tz) 50 | ep.to_es(df.iloc[:1000, 1:], index, doc_type=doc_type, _op_type='update', thread_count=2, chunk_size=1000) 51 | print('update es doc finished') 52 | 53 | # waiting for es data writing 54 | time.sleep(5) 55 | 56 | # get certain fields from es, set certain columns dtype 57 | heads = ['Num', 'Date', 'Alpha'] 58 | dtype = {'Num': 'float', 'Alpha': object} 59 | df = ep.to_pandas(index, heads=heads, dtype=dtype, size=500) 60 | print(df.head()) 61 | print(df.dtypes) 62 | 63 | # infer dtypes from es template 64 | df = ep.to_pandas(index, infer_dtype=True, size=500) 65 | print(df.dtypes) 66 | 67 | # Example of write data to es with pandas.io.json 68 | ep.to_es(df, index, doc_type=doc_type, use_pandas_json=True, thread_count=2, chunk_size=1000) 69 | print('write es doc with pandas.io.json finished') 70 | --------------------------------------------------------------------------------