├── .gitignore
├── .travis.yml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── doc
│   ├── Makefile
│   ├── conf.py
│   ├── index.rst
│   ├── make.bat
│   └── pandasticsearch.rst
├── pandasticsearch
│   ├── __init__.py
│   ├── client.py
│   ├── dataframe.py
│   ├── errors.py
│   ├── operators
│   │   ├── __init__.py
│   │   ├── aggregator.py
│   │   ├── filter.py
│   │   ├── grouper.py
│   │   └── sorter.py
│   ├── queries.py
│   └── types.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── test_cilent.py
    ├── test_dataframe.py
    ├── test_operators.py
    ├── test_queries.py
    └── test_types.py

/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
.eggs/
.idea/
dist/
*.egg-info
build/
.cache/
pandasticsearch/__pycache__/
pandasticsearch/utils/__pycache__/
tests/__pycache__/
tests/utils/__pycache__/
/doc/_build/
/.doc/

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.7"
  - "3.4"
install:
  - pip install -r requirements.txt
script: python setup.py test
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
### 0.4.0

* support filter function against ES version >= 5.0
* support customized terms aggregation: `df.groupby(df.age.terms(limit=10, include=[1, 2, 3]))`
* support aggregation metric alias

### 0.3.0

* support groupby date interval: `df.groupby(df.date.date_interval('1d'))`
* parameter change: `DataFrame.from_es(..., index=...)` to `DataFrame.from_es(url=..., index=...)`

### 0.2.0

* support metric agg: `stats`, `extended_stats`
* support boolean filter: `like`, `rlike`, `startswith`, `notnull`
* display time in `df.show()`

### 0.1.0

* support groupby ranges: `df.groupby(df.age.ranges([10, 12, 14]))`
* support script filter: `df.filter(ScriptFilter('2016 - doc["age"].value > 1995'))`
* support script sort: `df.sort(ScriptSort('doc["age"].value * 2'))`
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (C) 2016 onesuper

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include *.txt
include *.md
recursive-include doc *
global-include pandasticsearch *.py
include requirements.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Pandasticsearch

[![Build Status](https://travis-ci.org/onesuper/pandasticsearch.svg?branch=master)](https://travis-ci.org/onesuper/pandasticsearch) [![PyPI](https://img.shields.io/pypi/v/pandasticsearch.svg)](https://pypi.python.org/pypi/pandasticsearch)


Pandasticsearch is an Elasticsearch client built for data analysis.
It provides table-like access to Elasticsearch documents, similar
to the Python Pandas library and R DataFrames.

To install:

```
pip install pandasticsearch
# if you intend to export Pandas DataFrames
pip install pandasticsearch[pandas]
```

Elasticsearch excels at real-time indexing, search and data analysis.
Pandasticsearch converts the analysis results (e.g. multi-level nested aggregations)
into [Pandas](http://pandas.pydata.org) DataFrame objects for subsequent data analysis.


Check out the API doc: [http://pandasticsearch.readthedocs.io/en/latest/](http://pandasticsearch.readthedocs.io/en/latest/).

## Usage

### DataFrame API

A `DataFrame` object accesses Elasticsearch data through high-level operations.
It is type-safe, easy to use and Pandas-flavored.

```python
# Create a DataFrame object
from pandasticsearch import DataFrame
df = DataFrame.from_es(url='http://localhost:9200', index='people', username='abc', password='abc')

# Print the schema (mapping) of the index
df.print_schema()
# company
# |-- employee
#   |-- name: {'index': 'not_analyzed', 'type': 'string'}
#   |-- age: {'type': 'integer'}
#   |-- gender: {'index': 'not_analyzed', 'type': 'string'}

# Inspect the columns
df.columns
# ['name', 'age', 'gender']

# Refer to a column
df.name
# Column('name')
df['age']
# Column('age')

# Projection
df.filter(df.age < 25).select('name', 'age').collect()
# [Row(age=12,name='Alice'), Row(age=11,name='Bob'), Row(age=13,name='Leo')]

# Print the rows to the console
df.filter(df.age < 25).select('name').show(3)
# +------+
# | name |
# +------+
# | Alice|
# | Bob  |
# | Leo  |
# +------+

# Convert to a Pandas object for subsequent analysis
df[df.gender == 'male'].agg(df.age.avg).to_pandas()
#    avg(age)
# 0        12


# Dump the whole dataset into an in-memory Pandas DataFrame
df.to_pandas()
# ...

# Limit the amount of data retrieved, if the dataset is too large
df.limit(1000).to_pandas()
# ...


# Translate the DataFrame to an ES query (dictionary)
df[df.gender == 'male'].agg(df.age.avg).to_dict()
# {'query': {'filtered': {'filter': {'term': {'gender': 'male'}}}},
#  'aggregations': {'avg(age)': {'avg': {'field': 'age'}}}, 'size': 0}
```
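The query-building calls (`filter`, `select`, `groupby`, `agg`, `sort`, `limit`)
each return a new `DataFrame`, so they can be chained freely. A minimal sketch
against the same `people` index (output elided):

```python
# Filter, group and aggregate in a single query, then export to Pandas
df.filter(df.age < 25).groupby('gender').agg(df.age.avg).to_pandas()
# ...
```
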
### Filter

```python
# Filter by a boolean condition
df.filter(df.age < 13).collect()
# [Row(age=12,gender='female',name='Alice'), Row(age=11,gender='male',name='Bob')]

# Filter by a set of boolean conditions (combined with &)
df.filter((df.age < 13) & (df.gender == 'male')).collect()
# [Row(age=11,gender='male',name='Bob')]

# Filter by a set of boolean conditions (by chaining)
df.filter(df.age < 13).filter(df.gender == 'male').collect()
# [Row(age=11,gender='male',name='Bob')]

# Filter by a wildcard (sql `like`)
df.filter(df.name.like('A*')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a regular expression (sql `rlike`)
df.filter(df.name.rlike('A.l.e')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a prefixed string pattern
df.filter(df.name.startswith('Al')).collect()
# [Row(age=12,gender='female',name='Alice')]

# Filter by a script
df.filter('2016 - doc["age"].value > 1995').collect()
# [Row(age=12,name='Alice'), Row(age=13,name='Leo')]
```


### Aggregation

```python
# Aggregation
df[df.gender == 'male'].agg(df.age.avg).collect()
# [Row(avg(age)=12)]

# Metric alias
df[df.gender == 'male'].agg(df.age.avg.alias('avg_age')).collect()
# [Row(avg_age=12)]

# Groupby only (gives the `doc_count` of each group)
df.groupby('gender').collect()
# [Row(doc_count=1), Row(doc_count=2)]

# Groupby and then aggregate a metric
df.groupby('gender').agg(df.age.max).collect()
# [Row(doc_count=1, max(age)=12), Row(doc_count=2, max(age)=13)]

# Groupby and then aggregate multiple metrics (value_count and max)
df.groupby('gender').agg(df.age.value_count, df.age.max).collect()
# [Row(value_count(age)=1, max(age)=12), Row(value_count(age)=2, max(age)=13)]

# Group by a set of ranges
df.groupby(df.age.ranges([10, 12, 14])).to_pandas()
#                  doc_count
# range(10,12,14)
# 10.0-12.0                2
# 12.0-14.0                1

# Advanced ES aggregations
df.groupby(df.gender).agg(df.age.stats).to_pandas()
df.agg(df.age.extended_stats).to_pandas()
df.agg(df.age.percentiles).to_pandas()
df.groupby(df.date.date_interval('1d')).to_pandas()

# Customized terms aggregation
df.groupby(df.age.terms(size=5, include=[1, 2, 3]))
```

### Sort

```python
# Sort
df.sort(df.age.asc).select('name', 'age').collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]

# Sort by a script
df.sort('doc["age"].value * 2').collect()
# [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]
```
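Sorting also composes with `limit`, which is handy for top-N style queries.
A small sketch against the same index (output elided):

```python
# The 100 youngest people, exported to Pandas
df.sort(df.age.asc).limit(100).to_pandas()
# ...
```
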
## Use with Another Python Client

Pandasticsearch can also be used together with other full-featured Python clients:

* [elasticsearch-py](https://github.com/elastic/elasticsearch-py) (Official)
* [Elasticsearch-SQL](https://github.com/NLPchina/elasticsearch-sql)
* [pyelasticsearch](https://github.com/pyelasticsearch/pyelasticsearch)
* [pyes](https://github.com/aparo/pyes)


### Build query

```python
from pandasticsearch import DataFrame
df = DataFrame.from_es(url='http://localhost:9200', index='recruit')
body = df[df['gender'] == 'male'].agg(df['age'].avg).to_dict()

from elasticsearch import Elasticsearch
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="recruit", body=body)
```

### Parse result

```python
from elasticsearch import Elasticsearch
es = Elasticsearch('http://localhost:9200')
result_dict = es.search(index="recruit", body={"query": {"match_all": {}}})

from pandasticsearch import Select
pandas_df = Select.from_dict(result_dict).to_pandas()
```


## Compatibility

An integer argument `compat` needs to be passed to `from_es` to resolve compatibility issues (default: 2):

### 5.0

```python
df = DataFrame.from_es(url='http://localhost:9200', index='people', doc_type='mapping_name', compat=5)
```

For ES versions below 7.0, a `doc_type` must be given to specify the index mapping (`doc_type` is deprecated as of 7.0).


### 7.0

```python
df = DataFrame.from_es(url='http://localhost:9200', index='people', compat=7)
```


## Related Articles

* [Spark and Elasticsearch for real-time data analysis](https://web.archive.org/web/20150911151523/https://spark-summit.org/2015-east/wp-content/uploads/2015/03/SSE15-35-Leau.pdf)


## LICENSE

MIT
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | PAPER =
8 | BUILDDIR = _build
9 | 
10 | # Internal variables.
11 | PAPEROPT_a4 = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | # the i18n builder cannot share the environment and doctrees with the others
15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
16 | 17 | .PHONY: help 18 | help: 19 | @echo "Please use \`make ' where is one of" 20 | @echo " html to make standalone HTML files" 21 | @echo " dirhtml to make HTML files named index.html in directories" 22 | @echo " singlehtml to make a single large HTML file" 23 | @echo " pickle to make pickle files" 24 | @echo " json to make JSON files" 25 | @echo " htmlhelp to make HTML files and a HTML help project" 26 | @echo " qthelp to make HTML files and a qthelp project" 27 | @echo " applehelp to make an Apple Help Book" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " epub3 to make an epub3" 31 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 32 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 33 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 34 | @echo " text to make text files" 35 | @echo " man to make manual pages" 36 | @echo " texinfo to make Texinfo files" 37 | @echo " info to make Texinfo files and run them through makeinfo" 38 | @echo " gettext to make PO message catalogs" 39 | @echo " changes to make an overview of all changed/added/deprecated items" 40 | @echo " xml to make Docutils-native XML files" 41 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 42 | @echo " linkcheck to check all external links for integrity" 43 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 44 | @echo " coverage to run coverage check of the documentation (if enabled)" 45 | @echo " dummy to check syntax errors of document sources" 46 | 47 | .PHONY: clean 48 | clean: 49 | rm -rf $(BUILDDIR)/* 50 | 51 | .PHONY: html 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | .PHONY: dirhtml 58 | dirhtml: 59 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 60 | @echo 61 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 62 | 63 | .PHONY: singlehtml 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | .PHONY: pickle 70 | pickle: 71 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 72 | @echo 73 | @echo "Build finished; now you can process the pickle files." 74 | 75 | .PHONY: json 76 | json: 77 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 78 | @echo 79 | @echo "Build finished; now you can process the JSON files." 80 | 81 | .PHONY: htmlhelp 82 | htmlhelp: 83 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 84 | @echo 85 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 86 | ".hhp project file in $(BUILDDIR)/htmlhelp." 87 | 88 | .PHONY: qthelp 89 | qthelp: 90 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 91 | @echo 92 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 93 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 94 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pandasticsearch.qhcp" 95 | @echo "To view the help file:" 96 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pandasticsearch.qhc" 97 | 98 | .PHONY: applehelp 99 | applehelp: 100 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 101 | @echo 102 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 103 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 104 | "~/Library/Documentation/Help or install it in your application" \ 105 | "bundle." 106 | 107 | .PHONY: devhelp 108 | devhelp: 109 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 110 | @echo 111 | @echo "Build finished." 112 | @echo "To view the help file:" 113 | @echo "# mkdir -p $$HOME/.local/share/devhelp/pandasticsearch" 114 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pandasticsearch" 115 | @echo "# devhelp" 116 | 117 | .PHONY: epub 118 | epub: 119 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 120 | @echo 121 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 122 | 123 | .PHONY: epub3 124 | epub3: 125 | $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 126 | @echo 127 | @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." 128 | 129 | .PHONY: latex 130 | latex: 131 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 132 | @echo 133 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 134 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 135 | "(use \`make latexpdf' here to do that automatically)." 136 | 137 | .PHONY: latexpdf 138 | latexpdf: 139 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 140 | @echo "Running LaTeX files through pdflatex..." 141 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 142 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 143 | 144 | .PHONY: latexpdfja 145 | latexpdfja: 146 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 147 | @echo "Running LaTeX files through platex and dvipdfmx..." 148 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 149 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 150 | 151 | .PHONY: text 152 | text: 153 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 154 | @echo 155 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 156 | 157 | .PHONY: man 158 | man: 159 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 160 | @echo 161 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 162 | 163 | .PHONY: texinfo 164 | texinfo: 165 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 166 | @echo 167 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 168 | @echo "Run \`make' in that directory to run these through makeinfo" \ 169 | "(use \`make info' here to do that automatically)." 170 | 171 | .PHONY: info 172 | info: 173 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 174 | @echo "Running Texinfo files through makeinfo..." 175 | make -C $(BUILDDIR)/texinfo info 176 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 177 | 178 | .PHONY: gettext 179 | gettext: 180 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 181 | @echo 182 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 183 | 184 | .PHONY: changes 185 | changes: 186 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 187 | @echo 188 | @echo "The overview file is in $(BUILDDIR)/changes." 189 | 190 | .PHONY: linkcheck 191 | linkcheck: 192 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 193 | @echo 194 | @echo "Link check complete; look for any errors in the above output " \ 195 | "or in $(BUILDDIR)/linkcheck/output.txt." 
196 | 197 | .PHONY: doctest 198 | doctest: 199 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 200 | @echo "Testing of doctests in the sources finished, look at the " \ 201 | "results in $(BUILDDIR)/doctest/output.txt." 202 | 203 | .PHONY: coverage 204 | coverage: 205 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 206 | @echo "Testing of coverage in the sources finished, look at the " \ 207 | "results in $(BUILDDIR)/coverage/python.txt." 208 | 209 | .PHONY: xml 210 | xml: 211 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 212 | @echo 213 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 214 | 215 | .PHONY: pseudoxml 216 | pseudoxml: 217 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 218 | @echo 219 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 220 | 221 | .PHONY: dummy 222 | dummy: 223 | $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy 224 | @echo 225 | @echo "Build finished. Dummy builder generates no files." 226 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # pandasticsearch documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Nov 17 15:52:03 2016. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('.')) 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | import sphinx_rtd_theme 28 | 29 | html_theme = "sphinx_rtd_theme" 30 | 31 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | # 35 | # needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx.ext.autodoc', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The suffix(es) of source filenames. 48 | # You can specify multiple suffix as a list of string: 49 | # 50 | # source_suffix = ['.rst', '.md'] 51 | source_suffix = '.rst' 52 | 53 | # The encoding of source files. 54 | # 55 | # source_encoding = 'utf-8-sig' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 61 | project = 'pandasticsearch' 62 | copyright = '2016, onesuper' 63 | author = 'onesuper' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 
70 | version = '0.2.0' 71 | # The full version, including alpha/beta/rc tags. 72 | release = '0.2.0' 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # There are two options for replacing |today|: either, you set today to some 82 | # non-false value, then it is used: 83 | # 84 | # today = '' 85 | # 86 | # Else, today_fmt is used as the format for a strftime call. 87 | # 88 | # today_fmt = '%B %d, %Y' 89 | 90 | # List of patterns, relative to source directory, that match files and 91 | # directories to ignore when looking for source files. 92 | # This patterns also effect to html_static_path and html_extra_path 93 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 94 | 95 | # The reST default role (used for this markup: `text`) to use for all 96 | # documents. 97 | # 98 | # default_role = None 99 | 100 | # If true, '()' will be appended to :func: etc. cross-reference text. 101 | # 102 | # add_function_parentheses = True 103 | 104 | # If true, the current module name will be prepended to all description 105 | # unit titles (such as .. function::). 106 | # 107 | # add_module_names = True 108 | 109 | # If true, sectionauthor and moduleauthor directives will be shown in the 110 | # output. They are ignored by default. 111 | # 112 | # show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | # modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built documents. 121 | # keep_warnings = False 122 | 123 | # If true, `todo` and `todoList` produce output, else they produce nothing. 124 | todo_include_todos = False 125 | 126 | 127 | # -- Options for HTML output ---------------------------------------------- 128 | 129 | # The theme to use for HTML and HTML Help pages. See the documentation for 130 | # a list of builtin themes. 131 | # 132 | #html_theme = 'alabaster' 133 | 134 | # Theme options are theme-specific and customize the look and feel of a theme 135 | # further. For a list of options available for each theme, see the 136 | # documentation. 137 | # 138 | # html_theme_options = {} 139 | 140 | # Add any paths that contain custom themes here, relative to this directory. 141 | # html_theme_path = [] 142 | 143 | # The name for this set of Sphinx documents. 144 | # " v documentation" by default. 145 | # 146 | # html_title = 'pandasticsearch v0.2.0' 147 | 148 | # A shorter title for the navigation bar. Default is the same as html_title. 149 | # 150 | # html_short_title = None 151 | 152 | # The name of an image file (relative to this directory) to place at the top 153 | # of the sidebar. 154 | # 155 | # html_logo = None 156 | 157 | # The name of an image file (relative to this directory) to use as a favicon of 158 | # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 159 | # pixels large. 160 | # 161 | # html_favicon = None 162 | 163 | # Add any paths that contain custom static files (such as style sheets) here, 164 | # relative to this directory. They are copied after the builtin static files, 165 | # so a file named "default.css" will overwrite the builtin "default.css". 
166 | html_static_path = ['_static'] 167 | 168 | # Add any extra paths that contain custom files (such as robots.txt or 169 | # .htaccess) here, relative to this directory. These files are copied 170 | # directly to the root of the documentation. 171 | # 172 | # html_extra_path = [] 173 | 174 | # If not None, a 'Last updated on:' timestamp is inserted at every page 175 | # bottom, using the given strftime format. 176 | # The empty string is equivalent to '%b %d, %Y'. 177 | # 178 | # html_last_updated_fmt = None 179 | 180 | # If true, SmartyPants will be used to convert quotes and dashes to 181 | # typographically correct entities. 182 | # 183 | # html_use_smartypants = True 184 | 185 | # Custom sidebar templates, maps document names to template names. 186 | # 187 | # html_sidebars = {} 188 | 189 | # Additional templates that should be rendered to pages, maps page names to 190 | # template names. 191 | # 192 | # html_additional_pages = {} 193 | 194 | # If false, no module index is generated. 195 | # 196 | # html_domain_indices = True 197 | 198 | # If false, no index is generated. 199 | # 200 | # html_use_index = True 201 | 202 | # If true, the index is split into individual pages for each letter. 203 | # 204 | # html_split_index = False 205 | 206 | # If true, links to the reST sources are added to the pages. 207 | # 208 | # html_show_sourcelink = True 209 | 210 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 211 | # 212 | # html_show_sphinx = True 213 | 214 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 215 | # 216 | # html_show_copyright = True 217 | 218 | # If true, an OpenSearch description file will be output, and all pages will 219 | # contain a tag referring to it. The value of this option must be the 220 | # base URL from which the finished HTML is served. 221 | # 222 | # html_use_opensearch = '' 223 | 224 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 225 | # html_file_suffix = None 226 | 227 | # Language to be used for generating the HTML full-text search index. 228 | # Sphinx supports the following languages: 229 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' 230 | # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' 231 | # 232 | # html_search_language = 'en' 233 | 234 | # A dictionary with options for the search language support, empty by default. 235 | # 'ja' uses this config value. 236 | # 'zh' user can custom change `jieba` dictionary path. 237 | # 238 | # html_search_options = {'type': 'default'} 239 | 240 | # The name of a javascript file (relative to the configuration directory) that 241 | # implements a search results scorer. If empty, the default will be used. 242 | # 243 | # html_search_scorer = 'scorer.js' 244 | 245 | # Output file base name for HTML help builder. 246 | htmlhelp_basename = 'pandasticsearchdoc' 247 | 248 | # -- Options for LaTeX output --------------------------------------------- 249 | 250 | latex_elements = { 251 | # The paper size ('letterpaper' or 'a4paper'). 252 | # 253 | # 'papersize': 'letterpaper', 254 | 255 | # The font size ('10pt', '11pt' or '12pt'). 256 | # 257 | # 'pointsize': '10pt', 258 | 259 | # Additional stuff for the LaTeX preamble. 260 | # 261 | # 'preamble': '', 262 | 263 | # Latex figure (float) alignment 264 | # 265 | # 'figure_align': 'htbp', 266 | } 267 | 268 | # Grouping the document tree into LaTeX files. List of tuples 269 | # (source start file, target name, title, 270 | # author, documentclass [howto, manual, or own class]). 
271 | latex_documents = [ 272 | (master_doc, 'pandasticsearch.tex', 'pandasticsearch Documentation', 273 | 'onesuper', 'manual'), 274 | ] 275 | 276 | # The name of an image file (relative to this directory) to place at the top of 277 | # the title page. 278 | # 279 | # latex_logo = None 280 | 281 | # For "manual" documents, if this is true, then toplevel headings are parts, 282 | # not chapters. 283 | # 284 | # latex_use_parts = False 285 | 286 | # If true, show page references after internal links. 287 | # 288 | # latex_show_pagerefs = False 289 | 290 | # If true, show URL addresses after external links. 291 | # 292 | # latex_show_urls = False 293 | 294 | # Documents to append as an appendix to all manuals. 295 | # 296 | # latex_appendices = [] 297 | 298 | # It false, will not define \strong, \code, itleref, \crossref ... but only 299 | # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added 300 | # packages. 301 | # 302 | # latex_keep_old_macro_names = True 303 | 304 | # If false, no module index is generated. 305 | # 306 | # latex_domain_indices = True 307 | 308 | 309 | # -- Options for manual page output --------------------------------------- 310 | 311 | # One entry per manual page. List of tuples 312 | # (source start file, name, description, authors, manual section). 313 | man_pages = [ 314 | (master_doc, 'pandasticsearch', 'pandasticsearch Documentation', 315 | [author], 1) 316 | ] 317 | 318 | # If true, show URL addresses after external links. 319 | # 320 | # man_show_urls = False 321 | 322 | 323 | # -- Options for Texinfo output ------------------------------------------- 324 | 325 | # Grouping the document tree into Texinfo files. List of tuples 326 | # (source start file, target name, title, author, 327 | # dir menu entry, description, category) 328 | texinfo_documents = [ 329 | (master_doc, 'pandasticsearch', 'pandasticsearch Documentation', 330 | author, 'pandasticsearch', 'One line description of project.', 331 | 'Miscellaneous'), 332 | ] 333 | 334 | # Documents to append as an appendix to all manuals. 335 | # 336 | # texinfo_appendices = [] 337 | 338 | # If false, no module index is generated. 339 | # 340 | # texinfo_domain_indices = True 341 | 342 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 343 | # 344 | # texinfo_show_urls = 'footnote' 345 | 346 | # If true, do not generate a @detailmenu in the "Top" node's menu. 347 | # 348 | # texinfo_no_detailmenu = False 349 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. pandasticsearch documentation master file, created by 2 | sphinx-quickstart on Thu Nov 17 15:52:03 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Pandasticsearch 7 | =========================================== 8 | 9 | Contents: 10 | 11 | .. 
toctree:: 12 | :maxdepth: 2 13 | 14 | pandasticsearch.rst 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | 24 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. epub3 to make an epub3 31 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 32 | echo. text to make text files 33 | echo. man to make manual pages 34 | echo. texinfo to make Texinfo files 35 | echo. gettext to make PO message catalogs 36 | echo. changes to make an overview over all changed/added/deprecated items 37 | echo. xml to make Docutils-native XML files 38 | echo. pseudoxml to make pseudoxml-XML files for display purposes 39 | echo. linkcheck to check all external links for integrity 40 | echo. doctest to run all doctests embedded in the documentation if enabled 41 | echo. coverage to run coverage check of the documentation if enabled 42 | echo. dummy to check syntax errors of document sources 43 | goto end 44 | ) 45 | 46 | if "%1" == "clean" ( 47 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 48 | del /q /s %BUILDDIR%\* 49 | goto end 50 | ) 51 | 52 | 53 | REM Check if sphinx-build is available and fallback to Python version if any 54 | %SPHINXBUILD% 1>NUL 2>NUL 55 | if errorlevel 9009 goto sphinx_python 56 | goto sphinx_ok 57 | 58 | :sphinx_python 59 | 60 | set SPHINXBUILD=python -m sphinx.__init__ 61 | %SPHINXBUILD% 2> nul 62 | if errorlevel 9009 ( 63 | echo. 64 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 65 | echo.installed, then set the SPHINXBUILD environment variable to point 66 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 67 | echo.may add the Sphinx directory to PATH. 68 | echo. 69 | echo.If you don't have Sphinx installed, grab it from 70 | echo.http://sphinx-doc.org/ 71 | exit /b 1 72 | ) 73 | 74 | :sphinx_ok 75 | 76 | 77 | if "%1" == "html" ( 78 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 79 | if errorlevel 1 exit /b 1 80 | echo. 81 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 82 | goto end 83 | ) 84 | 85 | if "%1" == "dirhtml" ( 86 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 87 | if errorlevel 1 exit /b 1 88 | echo. 89 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
90 | goto end 91 | ) 92 | 93 | if "%1" == "singlehtml" ( 94 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 95 | if errorlevel 1 exit /b 1 96 | echo. 97 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 98 | goto end 99 | ) 100 | 101 | if "%1" == "pickle" ( 102 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 103 | if errorlevel 1 exit /b 1 104 | echo. 105 | echo.Build finished; now you can process the pickle files. 106 | goto end 107 | ) 108 | 109 | if "%1" == "json" ( 110 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 111 | if errorlevel 1 exit /b 1 112 | echo. 113 | echo.Build finished; now you can process the JSON files. 114 | goto end 115 | ) 116 | 117 | if "%1" == "htmlhelp" ( 118 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 119 | if errorlevel 1 exit /b 1 120 | echo. 121 | echo.Build finished; now you can run HTML Help Workshop with the ^ 122 | .hhp project file in %BUILDDIR%/htmlhelp. 123 | goto end 124 | ) 125 | 126 | if "%1" == "qthelp" ( 127 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 128 | if errorlevel 1 exit /b 1 129 | echo. 130 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 131 | .qhcp project file in %BUILDDIR%/qthelp, like this: 132 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pandasticsearch.qhcp 133 | echo.To view the help file: 134 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pandasticsearch.ghc 135 | goto end 136 | ) 137 | 138 | if "%1" == "devhelp" ( 139 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 140 | if errorlevel 1 exit /b 1 141 | echo. 142 | echo.Build finished. 143 | goto end 144 | ) 145 | 146 | if "%1" == "epub" ( 147 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 148 | if errorlevel 1 exit /b 1 149 | echo. 150 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 151 | goto end 152 | ) 153 | 154 | if "%1" == "epub3" ( 155 | %SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3 156 | if errorlevel 1 exit /b 1 157 | echo. 158 | echo.Build finished. The epub3 file is in %BUILDDIR%/epub3. 159 | goto end 160 | ) 161 | 162 | if "%1" == "latex" ( 163 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 164 | if errorlevel 1 exit /b 1 165 | echo. 166 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdf" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "latexpdfja" ( 181 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 182 | cd %BUILDDIR%/latex 183 | make all-pdf-ja 184 | cd %~dp0 185 | echo. 186 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 187 | goto end 188 | ) 189 | 190 | if "%1" == "text" ( 191 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 192 | if errorlevel 1 exit /b 1 193 | echo. 194 | echo.Build finished. The text files are in %BUILDDIR%/text. 195 | goto end 196 | ) 197 | 198 | if "%1" == "man" ( 199 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 200 | if errorlevel 1 exit /b 1 201 | echo. 202 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 203 | goto end 204 | ) 205 | 206 | if "%1" == "texinfo" ( 207 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 208 | if errorlevel 1 exit /b 1 209 | echo. 210 | echo.Build finished. 
The Texinfo files are in %BUILDDIR%/texinfo. 211 | goto end 212 | ) 213 | 214 | if "%1" == "gettext" ( 215 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 216 | if errorlevel 1 exit /b 1 217 | echo. 218 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 219 | goto end 220 | ) 221 | 222 | if "%1" == "changes" ( 223 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 224 | if errorlevel 1 exit /b 1 225 | echo. 226 | echo.The overview file is in %BUILDDIR%/changes. 227 | goto end 228 | ) 229 | 230 | if "%1" == "linkcheck" ( 231 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 232 | if errorlevel 1 exit /b 1 233 | echo. 234 | echo.Link check complete; look for any errors in the above output ^ 235 | or in %BUILDDIR%/linkcheck/output.txt. 236 | goto end 237 | ) 238 | 239 | if "%1" == "doctest" ( 240 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 241 | if errorlevel 1 exit /b 1 242 | echo. 243 | echo.Testing of doctests in the sources finished, look at the ^ 244 | results in %BUILDDIR%/doctest/output.txt. 245 | goto end 246 | ) 247 | 248 | if "%1" == "coverage" ( 249 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 250 | if errorlevel 1 exit /b 1 251 | echo. 252 | echo.Testing of coverage in the sources finished, look at the ^ 253 | results in %BUILDDIR%/coverage/python.txt. 254 | goto end 255 | ) 256 | 257 | if "%1" == "xml" ( 258 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 259 | if errorlevel 1 exit /b 1 260 | echo. 261 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 262 | goto end 263 | ) 264 | 265 | if "%1" == "pseudoxml" ( 266 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 267 | if errorlevel 1 exit /b 1 268 | echo. 269 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 270 | goto end 271 | ) 272 | 273 | if "%1" == "dummy" ( 274 | %SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy 275 | if errorlevel 1 exit /b 1 276 | echo. 277 | echo.Build finished. Dummy builder generates no files. 278 | goto end 279 | ) 280 | 281 | :end 282 | -------------------------------------------------------------------------------- /doc/pandasticsearch.rst: -------------------------------------------------------------------------------- 1 | pandasticsearch package 2 | ======================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | pandasticsearch.client module 8 | ----------------------------- 9 | 10 | .. automodule:: pandasticsearch.client 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | pandasticsearch.dataframe module 16 | -------------------------------- 17 | 18 | .. automodule:: pandasticsearch.dataframe 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | pandasticsearch.errors module 24 | ----------------------------- 25 | 26 | .. automodule:: pandasticsearch.errors 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | pandasticsearch.operators module 32 | -------------------------------- 33 | 34 | .. automodule:: pandasticsearch.operators 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | pandasticsearch.queries module 40 | ------------------------------ 41 | 42 | .. automodule:: pandasticsearch.queries 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | pandasticsearch.types module 48 | ---------------------------- 49 | 50 | .. 
automodule:: pandasticsearch.types
51 |     :members:
52 |     :undoc-members:
53 |     :show-inheritance:
54 | 
55 | 
56 | Module contents
57 | ---------------
58 | 
59 | .. automodule:: pandasticsearch
60 |     :members:
61 |     :undoc-members:
62 |     :show-inheritance:
63 | 
--------------------------------------------------------------------------------
/pandasticsearch/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | 
3 | from pandasticsearch.dataframe import DataFrame, Column
4 | from pandasticsearch.client import RestClient
5 | from pandasticsearch.queries import Select, Agg
6 | from pandasticsearch.types import Row
7 | 
8 | col = Column
9 | 
--------------------------------------------------------------------------------
/pandasticsearch/client.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | 
3 | import json
4 | import sys
5 | import base64
6 | import ssl
7 | from six.moves import urllib
8 | 
9 | from pandasticsearch.errors import ServerDefinedException
10 | 
11 | 
12 | class RestClient(object):
13 |     """
14 |     RestClient talks to the Elasticsearch cluster through its native RESTful API.
15 |     """
16 | 
17 |     def __init__(self, host, username=None, password=None, verify_ssl=True):
18 |         """
19 |         Initializes the RESTful client from the keyword arguments.
20 | 
21 |         :param str host: Host URL of a node in the Elasticsearch cluster
22 |         :param str username: Username for authentication (optional)
23 |         :param str password: Password for authentication (optional)
24 |         :param bool verify_ssl: Whether or not to verify the SSL certificate (default: True)
25 |         """
26 |         self.host = host
27 |         self.username = username
28 |         self.password = password
29 |         self.verify_ssl = verify_ssl
30 | 
31 |     def _prepare_url(self, path):
32 |         if self.host.endswith('/'):
33 |             url = self.host + path
34 |         else:
35 |             if path.startswith('/'):
36 |                 url = self.host + path
37 |             else:
38 |                 url = self.host + '/' + path
39 |         return url
40 | 
41 |     def get(self, path, params=None):
42 |         """
43 |         Sends a GET request to Elasticsearch.
44 | 
45 |         :param str path: Path of the resource, e.g. "/index_name/_search"
46 |         :param dict params: Dictionary to be sent in the query string (optional)
47 |         :return: The response as a dictionary.
48 | 
49 |         >>> from pandasticsearch import RestClient
50 |         >>> client = RestClient('http://host:port')
51 |         >>> print(client.get('index_name/_search'))
52 |         """
53 |         try:
54 |             url = self._prepare_url(path)
55 |             username = self.username
56 |             password = self.password
57 |             verify_ssl = self.verify_ssl
58 | 
59 |             if params is not None:
60 |                 url = '{0}?{1}'.format(url, urllib.parse.urlencode(params))
61 | 
62 |             req = urllib.request.Request(url=url)
63 | 
64 |             if username is not None and password is not None:
65 |                 s = '%s:%s' % (username, password)
66 |                 base64creds = base64.b64encode(s.encode('utf-8')).decode('utf-8')
67 |                 req.add_header("Authorization", "Basic %s" % base64creds)
68 | 
69 |             if verify_ssl is False:
70 |                 context = ssl._create_unverified_context()
71 |                 res = urllib.request.urlopen(req, context=context)
72 |             else:
73 |                 res = urllib.request.urlopen(req)
74 | 
75 |             data = res.read().decode("utf-8")
76 |             res.close()
77 |         except urllib.error.HTTPError:
78 |             _, e, _ = sys.exc_info()
79 |             reason = None
80 |             if e.code != 200:
81 |                 try:
82 |                     reason = json.loads(e.read().decode("utf-8"))
83 |                 except (ValueError, AttributeError, KeyError):
84 |                     pass
85 |                 else:
86 |                     reason = reason.get('error', None)
87 | 
88 |             raise ServerDefinedException(reason)
89 |         else:
90 |             return json.loads(data)
91 | 
92 |     def post(self, path, data, params=None):
93 |         """
94 |         Sends a POST request to Elasticsearch.
95 | 
96 |         :param str path: Path of the resource, e.g. "/index_name/_search"
97 |         :param dict data: The JSON data to send in the body of the request
98 |         :param dict params: Dictionary to be sent in the query string (optional)
99 |         :return: The response as a dictionary.
100 | 
101 |         >>> from pandasticsearch import RestClient
102 |         >>> client = RestClient('http://host:port')
103 |         >>> print(client.post(path='index/_search', data={"query":{"match_all":{}}}))
104 |         """
105 |         try:
106 |             url = self._prepare_url(path)
107 |             username = self.username
108 |             password = self.password
109 |             verify_ssl = self.verify_ssl
110 | 
111 |             if params is not None:
112 |                 url = '{0}?{1}'.format(url, urllib.parse.urlencode(params))
113 | 
114 |             req = urllib.request.Request(url=url, data=json.dumps(data).encode('utf-8'),
115 |                                          headers={'Content-Type': 'application/json'})
116 | 
117 |             if username is not None and password is not None:
118 |                 s = '%s:%s' % (username, password)
119 |                 base64creds = base64.b64encode(s.encode('utf-8')).decode('utf-8')
120 |                 req.add_header("Authorization", "Basic %s" % base64creds)
121 | 
122 |             if verify_ssl is False:
123 |                 context = ssl._create_unverified_context()
124 |                 res = urllib.request.urlopen(req, context=context)
125 |             else:
126 |                 res = urllib.request.urlopen(req)
127 | 
128 |             data = res.read().decode("utf-8")
129 |             res.close()
130 |         except urllib.error.HTTPError:
131 |             _, e, _ = sys.exc_info()
132 |             reason = None
133 |             if e.code != 200:
134 |                 try:
135 |                     reason = json.loads(e.read().decode("utf-8"))
136 |                 except (ValueError, AttributeError, KeyError):
137 |                     pass
138 |                 else:
139 |                     reason = reason.get('error', None)
140 | 
141 |             raise ServerDefinedException(reason)
142 |         else:
143 |             return json.loads(data)
144 | 
--------------------------------------------------------------------------------
/pandasticsearch/dataframe.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | 
3 | from pandasticsearch.client import RestClient
4 | from pandasticsearch.queries import Agg, ScrollSelect
5 | from pandasticsearch.operators import *
6 | from pandasticsearch.types import Column, Row
7 | from pandasticsearch.errors import DataFrameException
8 | 
9 | import json
10 | import six
11 | import sys
12 | import copy
13 | 
14 | _unbound_index_err = DataFrameException('DataFrame is not bound to ES index')
15 | 
16 | _count_aggregator = MetricAggregator('_index', 'value_count', alias='count').build()
17 | 
18 | 
19 | class DataFrame(object):
20 |     """
21 |     A :class:`DataFrame` treats index and documents in Elasticsearch as named columns and rows.
22 | 
23 |     >>> from pandasticsearch import DataFrame
24 |     >>> df = DataFrame.from_es(url='http://host:port', index='people')
25 | 
26 |     Customizing the Elasticsearch endpoint:
27 | 
28 |     >>> from pandasticsearch import DataFrame
29 |     >>> from pandasticsearch.client import RestClient
30 |     >>> df = DataFrame(client=RestClient('http://host:port'), index='people')
31 | 
32 |     It can be converted to a Pandas object for subsequent analysis:
33 | 
34 |     >>> df.to_pandas()
35 |     """
36 | 
37 |     def __init__(self, **kwargs):
38 |         self._client = kwargs.get('client', None)
39 |         self._mapping = kwargs.get('mapping', None)
40 |         self._doc_type = kwargs.get('doc_type', None)
41 |         self._index = kwargs.get('index', None)
42 |         self._compat = kwargs.get('compat', 2)
43 |         self._filter = kwargs.get('filter', None)
44 |         self._groupby = kwargs.get('groupby', None)
45 |         self._aggregation = kwargs.get('aggregation', None)
46 |         self._sort = kwargs.get('sort', None)
47 |         self._projection = kwargs.get('projection', None)
48 |         self._limit = kwargs.get('limit', 100)
49 |         self._last_query = None
50 | 
51 |     @property
52 |     def index(self):
53 |         """
54 |         Returns the index name.
55 | 
56 |         :return: string as the name
57 | 
58 |         >>> df.index
59 |         people/children
60 |         """
61 |         if self._index is None:
62 |             return None
63 |         return self._index + '/' + self._doc_type if self._doc_type else self._index
64 | 
65 |     @property
66 |     def columns(self):
67 |         """
68 |         Returns all column names as a list.
69 | 
70 |         :return: column names as a list
71 | 
72 |         >>> df.columns
73 |         ['age', 'name']
74 |         """
75 |         return sorted(self._get_cols(self._mapping)) if self._mapping else None
76 | 
77 |     @property
78 |     def schema(self):
79 |         """
80 |         Returns the schema (mapping) of the index/type as a dictionary.
81 |         """
82 |         return self._mapping
83 | 
84 |     @staticmethod
85 |     def from_es(**kwargs):
86 |         """
87 |         Creates a :class:`DataFrame` object by providing the URL of an Elasticsearch node and the name of the index.
88 | 
89 |         :param str url: URL of the node connected to (default: 'http://localhost:9200')
90 |         :param str index: The name of the index
91 |         :param str doc_type: The type of the document
92 |         :param int compat: The major ES version to be compatible with (default: 2)
93 |         :return: a DataFrame object for accessing the index
94 |         :rtype: DataFrame
95 | 
96 |         >>> from pandasticsearch import DataFrame
97 |         >>> df = DataFrame.from_es(url='http://host:port', index='people')
98 |         """
99 | 
100 |         doc_type = kwargs.get('doc_type', None)
101 |         index = kwargs.get('index', None)
102 |         url = kwargs.get('url', 'http://localhost:9200')
103 |         compat = kwargs.get('compat', 2)
104 |         username = kwargs.get('username', None)
105 |         password = kwargs.get('password', None)
106 |         verify_ssl = kwargs.get('verify_ssl', True)
107 | 
108 |         if index is None:
109 |             raise ValueError('Index name must be specified')
110 | 
111 |         if doc_type is None:
112 |             path = index
113 |         else:
114 |             path = index + '/' + doc_type
115 | 
116 |         client = RestClient(url, username, password, verify_ssl)
117 | 
118 |         mapping = client.get(path)
119 | 
120 |         return DataFrame(client=client, mapping=mapping, index=index, doc_type=doc_type, compat=compat)
121 | 
122 |     def __getattr__(self, name):
123 |         """
124 |         Returns a :class:`types.Column` object denoted by ``name``.
125 |         """
126 |         if name not in self.columns:
127 |             raise AttributeError(
128 |                 "'%s' object has no attribute '%s'" % (self.__class__.__name__, name))
129 |         return Column(name)
130 | 
131 |     def __getitem__(self, item):
132 |         if isinstance(item, six.string_types):
133 |             if item not in self.columns:
134 |                 raise TypeError('Column does not exist: [{0}]'.format(item))
135 |             return Column(item)
136 |         elif isinstance(item, BooleanFilter):
137 |             self._filter = item
138 |             return self
139 |         else:
140 |             raise TypeError('Unsupported expr: [{0}]'.format(item))
141 | 
142 |     def filter(self, condition):
143 |         """
144 |         Filters rows using the given condition.
145 | 
146 |         where() is an alias for filter().
147 | 
148 |         :param condition: a :class:`BooleanFilter` object or a script string
149 | 
150 |         >>> df.filter(df['age'] < 13).collect()
151 |         [Row(age=12,gender='female',name='Alice'), Row(age=11,gender='male',name='Bob')]
152 |         """
153 | 
154 |         if isinstance(condition, six.string_types):
155 |             _filter = ScriptFilter(condition)
156 |         elif isinstance(condition, BooleanFilter):
157 |             _filter = condition
158 |         else:
159 |             raise TypeError('{0} is supposed to be str or BooleanFilter'.format(condition))
160 | 
161 |         # chained filters are treated as AND
162 |         if self._filter is not None:
163 |             _filter = (self._filter & _filter)
164 | 
165 |         return DataFrame(client=self._client,
166 |                          index=self._index,
167 |                          doc_type=self._doc_type,
168 |                          mapping=self._mapping,
169 |                          filter=_filter,
170 |                          groupby=self._groupby,
171 |                          aggregation=self._aggregation,
172 |                          projection=self._projection,
173 |                          sort=self._sort,
174 |                          limit=self._limit,
175 |                          compat=self._compat)
176 | 
177 |     where = filter
178 | 
179 |     def select(self, *cols):
180 |         """
181 |         Projects a set of columns and returns a new :class:`DataFrame`.
182 | 
183 |         :param cols: a list of column names or :class:`Column` objects.
184 | 
185 |         >>> df.filter(df['age'] < 25).select('name', 'age').collect()
186 |         [Row(age=12,name='Alice'), Row(age=11,name='Bob'), Row(age=13,name='Leo')]
187 |         """
188 |         projection = []
189 |         for col in cols:
190 |             if isinstance(col, six.string_types):
191 |                 projection.append(getattr(self, col))
192 |             elif isinstance(col, Column):
193 |                 projection.append(col)
194 |             else:
195 |                 raise TypeError('{0} is supposed to be str or Column'.format(col))
196 |         return DataFrame(client=self._client,
197 |                          index=self._index,
198 |                          doc_type=self._doc_type,
199 |                          mapping=self._mapping,
200 |                          filter=self._filter,
201 |                          groupby=self._groupby,
202 |                          aggregation=self._aggregation,
203 |                          projection=projection,
204 |                          sort=self._sort,
205 |                          limit=self._limit,
206 |                          compat=self._compat)
207 | 
208 |     def limit(self, num):
209 |         """
210 |         Limits the result count to the number specified.
211 |         """
212 |         assert isinstance(num, int)
213 |         assert num >= 1
214 |         return DataFrame(client=self._client,
215 |                          index=self._index,
216 |                          doc_type=self._doc_type,
217 |                          mapping=self._mapping,
218 |                          filter=self._filter,
219 |                          groupby=self._groupby,
220 |                          aggregation=self._aggregation,
221 |                          projection=self._projection,
222 |                          sort=self._sort,
223 |                          limit=num,
224 |                          compat=self._compat)
225 | 
226 |     def groupby(self, *cols):
227 |         """
228 |         Returns a new :class:`DataFrame` object grouped by the specified column(s).
229 | 
230 |         :param cols: a list of column names, :class:`Column` or :class:`Grouper` objects
231 |         """
232 |         columns = []
233 |         if len(cols) == 1 and isinstance(cols[0], Grouper):
234 |             groupby = cols[0].build()
235 |         else:
236 |             for col in cols:
237 |                 if isinstance(col, six.string_types):
238 |                     columns.append(getattr(self, col))
239 |                 elif isinstance(col, Column):
240 |                     columns.append(col)
241 |                 else:
242 |                     raise TypeError('{0} is supposed to be str or Column'.format(col))
243 |             names = [col.field_name() for col in columns]
244 |             groupby = Grouper.from_list(names).build()
245 | 
246 |         return DataFrame(client=self._client,
247 |                          index=self._index,
248 |                          doc_type=self._doc_type,
249 |                          mapping=self._mapping,
250 |                          filter=self._filter,
251 |                          groupby=groupby,
252 |                          aggregation=self._aggregation,
253 |                          projection=self._projection,
254 |                          sort=self._sort,
255 |                          limit=self._limit,
256 |                          compat=self._compat)
257 | 
258 |     def agg(self, *aggs):
259 |         """
260 |         Aggregates on the entire DataFrame without groups.
261 | 
262 |         :param aggs: a list of :class:`Aggregator` objects
263 | 
264 |         >>> df[df['gender'] == 'male'].agg(df['age'].avg).collect()
265 |         [Row(avg(age)=12)]
266 |         """
267 |         aggregation = {}
268 |         for agg in aggs:
269 |             assert isinstance(agg, Aggregator)
270 |             aggregation.update(agg.build())
271 | 
272 |         return DataFrame(client=self._client,
273 |                          index=self._index,
274 |                          doc_type=self._doc_type,
275 |                          mapping=self._mapping,
276 |                          filter=self._filter,
277 |                          groupby=self._groupby,
278 |                          aggregation=aggregation,
279 |                          projection=self._projection,
280 |                          sort=self._sort,
281 |                          limit=self._limit,
282 |                          compat=self._compat)
283 | 
284 |     def sort(self, *cols):
285 |         """
286 |         Returns a new :class:`DataFrame` object sorted by the specified column(s).
287 | 
288 |         :param cols: a list of column names, :class:`Column` or :class:`Sorter` objects.
289 | 
290 |         orderby() is an alias for sort().
291 |
292 | >>> df.sort(df['age'].asc).collect()
293 | [Row(age=11,name='Bob'), Row(age=12,name='Alice'), Row(age=13,name='Leo')]
294 | """
295 | sorts = []
296 | for col in cols:
297 | if isinstance(col, six.string_types):
298 | sorts.append(ScriptSorter(col).build())
299 | elif isinstance(col, Sorter):
300 | sorts.append(col.build())
301 | else:
302 | raise TypeError('{0} is supposed to be str or Sorter'.format(col))
303 |
304 | return DataFrame(client=self._client,
305 | index=self._index,
306 | doc_type=self._doc_type,
307 | mapping=self._mapping,
308 | filter=self._filter,
309 | groupby=self._groupby,
310 | aggregation=self._aggregation,
311 | projection=self._projection,
312 | sort=sorts,
313 | limit=self._limit,
314 | compat=self._compat)
315 |
316 | orderby = sort
317 |
318 | def _execute(self):
319 | if self._client is None:
320 | raise _unbound_index_err
321 |
322 | if self._doc_type is None:
323 | path = self._index + '/_search'
324 | else:
325 | path = self._index + '/' + self._doc_type + '/_search'
326 |
327 | if self._aggregation is None and self._groupby is None:
328 |
329 | def _scroll():
330 | row_counter = 0
331 |
332 | _query = self._build_query()
333 | resp = self._client.post(path, params={"scroll": "10s"}, data=_query)
334 | scroll_id = resp.get("_scroll_id")
335 | try:
336 | while scroll_id and resp["hits"]["hits"]:
337 | if row_counter >= self._limit:
338 | break
339 |
340 | for hit in resp["hits"]["hits"]:
341 |
342 | if row_counter >= self._limit:
343 | break
344 |
345 | row_counter += 1
346 | yield hit
347 |
348 | resp = self._client.post('_search/scroll',
349 | data={"scroll_id": scroll_id, "scroll": "10s"})
350 | scroll_id = resp.get("_scroll_id")
351 |
352 | finally:
353 | # TODO(onesuper): delete the scroll resource when iteration stops early
354 | pass
355 | return ScrollSelect(_scroll)
356 |
357 | else:
358 | res_dict = self._client.post(path, data=self._build_query())
359 | return Agg.from_dict(res_dict)
360 |
361 | def collect(self):
362 | """
363 | Returns all the records as a list of Row.
364 |
365 | :return: list of :class:`Row `
366 |
367 | >>> df.collect()
368 | [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
369 | """
370 | query = self._execute()
371 | return [Row(**v) for v in query.result]
372 |
373 | def to_pandas(self):
374 | """
375 | Export to a Pandas DataFrame object.
376 |
377 | :return: The DataFrame representing the query result
378 |
379 | >>> df[df['gender'] == 'male'].agg(df['age'].avg).to_pandas()
380 | avg(age)
381 | 0 12
382 | """
383 | query = self._execute()
384 | return query.to_pandas()
385 |
386 | def count(self):
387 | """
388 | Returns a new :class:`DataFrame ` with a doc count aggregated for each group.
389 |
390 | >>> df.groupby(df.gender).count().collect()
391 | [Row(doc_count=2), Row(doc_count=1)]
392 | """
393 | df = DataFrame(client=self._client,
394 | index=self._index,
395 | doc_type=self._doc_type,
396 | mapping=self._mapping,
397 | filter=self._filter,
398 | groupby=self._groupby,
399 | aggregation=_count_aggregator,
400 | projection=self._projection,
401 | sort=self._sort,
402 | limit=self._limit,
403 | compat=self._compat)
404 | return df
405 |
406 | def show(self, n=200, truncate=15):
407 | """
408 | Prints the first ``n`` rows to the console.
409 |
410 | :param n: Number of rows to show.
411 | :param truncate: Maximum number of characters displayed per column; longer values are truncated.
412 |
413 | >>> df.filter(df['age'] < 25).select('name').show(3)
414 | +------+
415 | | name |
416 | +------+
417 | | Alice|
418 | | Bob |
419 | | Leo |
420 | +------+
421 | """
422 | assert n > 0
423 |
424 | if self._aggregation:
425 | raise DataFrameException('show() is not allowed for aggregation, use collect() instead')
426 |
427 | query = self._execute()
428 |
429 | if self._projection:
430 | cols = [col.field_name() for col in self._projection]
431 | else:
432 | cols = self.columns
433 |
434 | if cols is None:
435 | raise _unbound_index_err
436 |
437 | sys.stdout.write(query.result_as_tabular(cols, n, truncate))
438 |
439 | def __repr__(self):
440 | if self.columns is None:
441 | return "DataFrame(Unbound)"
442 | return "DataFrame[%s]" % (", ".join("%s" % c for c in self.columns))
443 |
444 | def print_debug(self):
445 | """
446 | Posts the query to the Elasticsearch server and prints the returned result.
447 | """
448 | if self._client is None:
449 | raise _unbound_index_err
450 | sys.stdout.write(json.dumps(self._client.post(data=self._build_query()), indent=4))
451 |
452 | def to_dict(self):
453 | """
454 | Converts the current :class:`DataFrame ` object to an Elasticsearch search dictionary.
455 |
456 | :return: a dictionary which obeys the Elasticsearch RESTful protocol
457 | """
458 | return self._build_query()
459 |
460 | def print_schema(self):
461 | """
462 | Prints out the schema in the tree format.
463 |
464 | >>> df.print_schema()
465 | index_name
466 | |-- type_name
467 | |-- experience : {'type': 'integer'}
468 | |-- id : {'type': 'string'}
469 | |-- mobile : {'index': 'not_analyzed', 'type': 'string'}
470 | |-- regions : {'index': 'not_analyzed', 'type': 'string'}
471 | """
472 | if self._index is None:
473 | return
474 |
475 | sys.stdout.write('{0}\n'.format(self._index))
476 | index_name = list(self._mapping.keys())[0]
477 | if self._compat >= 7:
478 | json_obj = self._mapping[index_name]["mappings"]["properties"]
479 | sys.stdout.write(self.resolve_schema(json_obj))
480 | else:
481 | if self._doc_type is not None:
482 | json_obj = self._mapping[index_name]["mappings"][self._doc_type]["properties"]
483 | sys.stdout.write(self.resolve_schema(json_obj))
484 | else:
485 | raise DataFrameException('Please specify doc_type for ES version under 7')
486 |
487 | def resolve_schema(self, json_prop, res_schema="", depth=1):
488 | for field in json_prop:
489 | if "properties" in json_prop[field]:
490 | res_schema += "{}|--{}:\n".format(' ' * depth, field)
491 | res_schema = self.resolve_schema(json_prop[field]["properties"],
492 | res_schema, depth=depth+1)
493 | else:
494 | res_schema += "{}|--{}: {}\n".format(' ' * depth, field, json_prop[field])
495 | return res_schema
496 |
497 | def _build_query(self):
498 | query = dict()
499 |
500 | query['size'] = 20 # batch size for scroll search
501 |
502 | if self._groupby and not self._aggregation:
503 | query['aggregations'] = self._groupby
504 | query['size'] = 0
505 |
506 | if self._aggregation:
507 | if self._groupby is None:
508 | query['aggregations'] = self._aggregation
509 | query['size'] = 0
510 |
511 | else:
512 | agg = copy.deepcopy(self._groupby)
513 | # insert the aggregator into the inner-most grouper
514 | inner_most = agg
515 | while True:
516 | key = list(inner_most.keys())[0]
517 | if 'aggregations' in inner_most[key]:
518 | inner_most = inner_most[key]['aggregations']
519 | else:
520 | break
521 | key = list(inner_most.keys())[0]
522 | inner_most[key]['aggregations'] = self._aggregation
523 | query['aggregations'] =
agg 524 | query['size'] = 0 525 | 526 | if self._filter: 527 | assert isinstance(self._filter, BooleanFilter) 528 | if self._compat >= 5: 529 | query['query'] = {'bool': {'filter': self._filter.build()}} 530 | else: 531 | query['query'] = {'filtered': {'filter': self._filter.build()}} 532 | 533 | if self._projection: 534 | query['_source'] = {"includes": [col.field_name() for col in self._projection], "excludes": []} 535 | 536 | if self._sort: 537 | query['sort'] = self._sort 538 | self._last_query = query 539 | return query 540 | 541 | def _get_cols(self, mapping): 542 | cols = self._get_mappings(mapping) 543 | 544 | if len(cols) == 0: 545 | raise DataFrameException('0 columns found in mapping') 546 | return cols 547 | 548 | @classmethod 549 | def resolve_mappings(cls, json_map): 550 | prop = [] 551 | for field in json_map: 552 | nested_props = [] 553 | if "properties" in json_map[field]: 554 | nested_props = cls.resolve_mappings(json_map[field]["properties"]) 555 | if len(nested_props) == 0: 556 | prop.append(field) 557 | else: 558 | for nested_prop in nested_props: 559 | prop.append("{}.{}".format(field, nested_prop)) 560 | return prop 561 | 562 | def _get_mappings(self, json_map): 563 | index_name = list(self._mapping.keys())[0] 564 | 565 | if self._compat >= 7: 566 | return DataFrame.resolve_mappings(json_map[index_name]["mappings"]["properties"]) 567 | else: 568 | if self._doc_type is not None: 569 | return DataFrame.resolve_mappings(json_map[index_name]["mappings"][self._doc_type]["properties"]) 570 | else: 571 | raise DataFrameException('Please specify doc_type for ES version under 7') 572 | -------------------------------------------------------------------------------- /pandasticsearch/errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | 4 | class PandasticSearchException(RuntimeError): 5 | def __init__(self, msg): 6 | super(PandasticSearchException, self).__init__(msg) 7 | 8 | 9 | class NoSuchDependencyException(PandasticSearchException): 10 | pass 11 | 12 | 13 | class ServerDefinedException(PandasticSearchException): 14 | pass 15 | 16 | 17 | class ParseResultException(PandasticSearchException): 18 | pass 19 | 20 | 21 | class DataFrameException(PandasticSearchException): 22 | pass 23 | -------------------------------------------------------------------------------- /pandasticsearch/operators/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | from pandasticsearch.operators.aggregator import * 4 | from pandasticsearch.operators.grouper import * 5 | from pandasticsearch.operators.filter import * 6 | from pandasticsearch.operators.sorter import * 7 | -------------------------------------------------------------------------------- /pandasticsearch/operators/aggregator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | _metric_aggs = ('avg', 'min', 'max', 'cardinality', 'value_count', 'sum', 4 | 'percentiles', 'percentile_ranks', 'stats', 'extended_stats') 5 | 6 | 7 | class Aggregator(object): 8 | def __init__(self, field): 9 | self._field = field 10 | 11 | def build(self): 12 | pass 13 | 14 | 15 | class MetricAggregator(Aggregator): 16 | def __init__(self, field, agg_type, alias=None, params=None): 17 | super(MetricAggregator, self).__init__(field) 18 | self._agg_type = agg_type 19 | self._alias = alias 20 | self._params = params 21 | 22 | def alias(self, alias): 
23 | self._alias = alias
24 | return self
25 |
26 | def build(self):
27 | if self._agg_type not in _metric_aggs:
28 | raise Exception('Unsupported metric aggregator: {0}'.format(self._agg_type))
29 |
30 | if self._alias is None:
31 | name = '{0}({1})'.format(self._agg_type, self._field)
32 | else:
33 | name = self._alias
34 |
35 | agg_field = dict()
36 | agg_field['field'] = self._field
37 | if self._params is not None:
38 | agg_field.update(self._params)
39 | return {name: {self._agg_type: agg_field}}
40 |
41 |
--------------------------------------------------------------------------------
/pandasticsearch/operators/filter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 |
4 | # ES filter builders for boolean conditions
5 | class BooleanFilter(object):
6 | def __init__(self, *args):
7 | self._filter = None
8 |
9 | def __and__(self, x):
10 | # flatten into an existing AND node when possible
11 | if isinstance(self, AndFilter):
12 | self.subtree['must'].append(x.build())
13 | return self
14 | elif isinstance(x, AndFilter):
15 | x.subtree['must'].append(self.subtree)
16 | return x
17 | return AndFilter(self, x)
18 |
19 | def __or__(self, x):
20 | # flatten into an existing OR node when possible
21 | if isinstance(self, OrFilter):
22 | self.subtree['should'].append(x.build())
23 | return self
24 | elif isinstance(x, OrFilter):
25 | x.subtree['should'].append(self.subtree)
26 | return x
27 | return OrFilter(self, x)
28 |
29 | def __invert__(self):
30 | return NotFilter(self)
31 |
32 | @property
33 | def subtree(self):
34 | if 'bool' in self._filter:
35 | return self._filter['bool']
36 | else:
37 | return self._filter
38 |
39 | def build(self):
40 | return self._filter
41 |
42 |
43 | # Binary operators
44 | class AndFilter(BooleanFilter):
45 | def __init__(self, *args):
46 | assert all(isinstance(x, BooleanFilter) for x in args)
47 | super(AndFilter, self).__init__()
48 | self._filter = {'bool': {'must': [x.build() for x in args]}}
49 |
50 |
51 | class OrFilter(BooleanFilter):
52 | def __init__(self, *args):
53 | assert all(isinstance(x, BooleanFilter) for x in args)
54 | super(OrFilter, self).__init__()
55 | self._filter = {'bool': {'should': [x.build() for x in args]}}
56 |
57 |
58 | class NotFilter(BooleanFilter):
59 | def __init__(self, x):
60 | assert isinstance(x, BooleanFilter)
61 | super(NotFilter, self).__init__()
62 | self._filter = {'bool': {'must_not': x.build()}}
63 |
64 |
65 | # Leaf boolean filters
66 | class GreaterEqual(BooleanFilter):
67 | def __init__(self, field, value):
68 | super(GreaterEqual, self).__init__()
69 | self._filter = {'range': {field: {'gte': value}}}
70 |
71 |
72 | class Greater(BooleanFilter):
73 | def __init__(self, field, value):
74 | super(Greater, self).__init__()
75 | self._filter = {'range': {field: {'gt': value}}}
76 |
77 |
78 | class LessEqual(BooleanFilter):
79 | def __init__(self, field, value):
80 | super(LessEqual, self).__init__()
81 | self._filter = {'range': {field: {'lte': value}}}
82 |
83 |
84 | class Less(BooleanFilter):
85 | def __init__(self, field, value):
86 | super(Less, self).__init__()
87 | self._filter = {'range': {field: {'lt': value}}}
88 |
89 |
90 | class Equal(BooleanFilter):
91 | def __init__(self, field, value):
92 | super(Equal, self).__init__()
93 | self._filter = {'term': {field: value}}
94 |
95 |
96 | class IsIn(BooleanFilter):
97 | def __init__(self, field, value):
98 | super(IsIn, self).__init__()
99 | assert isinstance(value, list)
100 | self._filter = {'terms': {field: value}}
101 |
102 |
103 | class Like(BooleanFilter):
104 | def __init__(self,
field, value): 105 | super(Like, self).__init__() 106 | self._filter = {'wildcard': {field: value}} 107 | 108 | 109 | class Rlike(BooleanFilter): 110 | def __init__(self, field, value): 111 | super(Rlike, self).__init__() 112 | self._filter = {'regexp': {field: value}} 113 | 114 | 115 | class Startswith(BooleanFilter): 116 | def __init__(self, field, value): 117 | super(Startswith, self).__init__() 118 | self._filter = {'prefix': {field: value}} 119 | 120 | 121 | class IsNull(BooleanFilter): 122 | """ 123 | .. _Find documents missing indexed values 124 | https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-exists-query.html 125 | """ 126 | def __init__(self, field): 127 | super(IsNull, self).__init__() 128 | self._filter = {'bool': {'must_not': {'exists': {'field': field}}}} 129 | 130 | 131 | class NotNull(BooleanFilter): 132 | def __init__(self, field): 133 | super(NotNull, self).__init__() 134 | self._filter = {'exists': {'field': field}} 135 | 136 | 137 | class ScriptFilter(BooleanFilter): 138 | def __init__(self, inline, lang=None, params=None): 139 | super(ScriptFilter, self).__init__() 140 | script = {'inline': inline} 141 | if lang is not None: 142 | script['lang'] = lang 143 | if params is not None: 144 | script['params'] = params 145 | self._filter = {'script': {'script': script}} -------------------------------------------------------------------------------- /pandasticsearch/operators/grouper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | 4 | class Grouper(object): 5 | def __init__(self, field, size=20, inner=None, include=None, exclude=None): 6 | self._field = field 7 | self._size = size 8 | self._inner = inner 9 | self._include = include 10 | self._exclude = exclude 11 | 12 | @staticmethod 13 | def from_list(l): 14 | if len(l) == 1: 15 | return Grouper(l[0]) 16 | return Grouper(l[0], inner=Grouper.from_list(l[1:])) 17 | 18 | def build(self): 19 | terms = {'field': self._field, 'size': self._size} 20 | if self._exclude is not None: 21 | assert isinstance(self._exclude, list) 22 | terms['exclude'] = self._exclude 23 | if self._include is not None: 24 | assert isinstance(self._include, list) 25 | terms['include'] = self._include 26 | 27 | agg = {"terms": terms} 28 | 29 | if self._inner is not None: 30 | agg["aggregations"] = self._inner.build() 31 | 32 | return {self._field: agg} 33 | 34 | 35 | class RangeGrouper(Grouper): 36 | def __init__(self, field, range_list): 37 | assert isinstance(range_list, list) 38 | super(RangeGrouper, self).__init__(field) 39 | self._field = field 40 | self._range_list = range_list 41 | 42 | def build(self): 43 | ranges = [] 44 | starts = self._range_list[:-1] 45 | ends = self._range_list[1:] 46 | for start, end in zip(starts, ends): 47 | ranges.append({'from': start, 'to': end}) 48 | 49 | name = 'range(' + ','.join([str(x) for x in self._range_list]) + ')' 50 | return {name: {'range': {'field': self._field, 'ranges': ranges}}} 51 | 52 | 53 | class DateGrouper(Grouper): 54 | def __init__(self, field, interval, format): 55 | super(DateGrouper, self).__init__(field) 56 | self._field = field 57 | self._interval = interval 58 | self._format = format 59 | 60 | def build(self): 61 | name = 'date({0},{1})'.format(self._field, self._interval) 62 | return {name: {'date_histogram': { 63 | 'field': self._field, 64 | 'interval': self._interval, 65 | 'format': self._format, 66 | }}} 67 | -------------------------------------------------------------------------------- 
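The groupers above translate one-for-one into Elasticsearch aggregation clauses, and `Grouper.from_list` chains plain `terms` groupers into nested aggregations. A minimal sketch of the dictionaries they build (the field names `gender`, `city`, `age` and `date` are hypothetical):

```python
# Sketch of the JSON the groupers above emit; field names are hypothetical.
from pandasticsearch.operators import Grouper, RangeGrouper, DateGrouper

# Nested terms aggregation: group by 'gender', then by 'city' within each bucket.
print(Grouper.from_list(['gender', 'city']).build())
# {'gender': {'terms': {'field': 'gender', 'size': 20},
#             'aggregations': {'city': {'terms': {'field': 'city', 'size': 20}}}}}

# Bucket ages into [10, 12) and [12, 14).
print(RangeGrouper('age', [10, 12, 14]).build())
# {'range(10,12,14)': {'range': {'field': 'age',
#                                'ranges': [{'from': 10, 'to': 12},
#                                           {'from': 12, 'to': 14}]}}}

# Daily date histogram.
print(DateGrouper('date', '1d', 'yyyy/MM/dd').build())
# {'date(date,1d)': {'date_histogram': {'field': 'date', 'interval': '1d',
#                                       'format': 'yyyy/MM/dd'}}}
```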
/pandasticsearch/operators/sorter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 |
4 | _sort_mode = ('min', 'max', 'sum', 'avg', 'median')
5 |
6 |
7 | class Sorter(object):
8 | def __init__(self, field, order='desc', mode=None):
9 | self._field = field
10 | self._order = order
11 | self._mode = mode
12 |
13 | def build(self):
14 | sort = {}
15 | if self._mode is not None:
16 | if self._mode not in _sort_mode:
17 | raise Exception('Unsupported sort mode: {0}'.format(self._mode))
18 | sort['mode'] = self._mode
19 | sort['order'] = self._order
20 | return {self._field: sort}
21 |
22 |
23 | class ScriptSorter(object):
24 | def __init__(self, script, order='desc', type='number', params=None):
25 | self._order = order
26 | self._script = script
27 | self._params = params
28 | self._type = type
29 |
30 | def build(self):
31 | script = {'script': self._script, 'type': self._type, 'order': self._order}
32 | if self._params:
33 | script['params'] = self._params
34 | return {'_script': script}
35 |
36 |
37 |
--------------------------------------------------------------------------------
/pandasticsearch/queries.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | try:
4 | from collections.abc import MutableSequence
5 | except ImportError:
6 | from collections import MutableSequence
7 | import json
8 | import six
9 |
10 | from pandasticsearch.errors import NoSuchDependencyException
11 |
12 |
13 | class Query(MutableSequence):
14 | def __init__(self):
15 | super(Query, self).__init__()
16 | self._values = None
17 | self._result_dict = {}
18 | self._took_millis = 0
19 |
20 | def explain_result(self, result=None):
21 | if isinstance(result, dict):
22 | self._result_dict = result
23 | self._took_millis = self._result_dict['took']
24 |
25 | def to_pandas(self):
26 | """
27 | Export the current query result to a Pandas DataFrame object.
28 | """ 29 | raise NotImplementedError('implemented in subclass') 30 | 31 | def print_json(self): 32 | indented_json = json.dumps(self._result_dict, sort_keys=True, separators=(',', ': '), indent=4, 33 | ensure_ascii=False) 34 | print(indented_json) 35 | 36 | @property 37 | def result(self): 38 | return self._values 39 | 40 | @property 41 | def millis_taken(self): 42 | return self._took_millis 43 | 44 | @property 45 | def json(self): 46 | """ 47 | Gets the original JSON representation returned by Elasticsearch REST API 48 | :return: The JSON string indicating the query result 49 | :rtype: string 50 | """ 51 | return json.dumps(self._result_dict) 52 | 53 | def insert(self, index, value): 54 | self._values.insert(index, value) 55 | 56 | def append(self, value): 57 | self._values.append(value) 58 | 59 | def __str__(self): 60 | return str(self._values) 61 | 62 | def __len__(self): 63 | return len(self._values) 64 | 65 | def __delitem__(self, index): 66 | del self._values[index] 67 | 68 | def __setitem__(self, index, value): 69 | self._values[index] = value 70 | 71 | def __getitem__(self, index): 72 | return self._values[index] 73 | 74 | 75 | class Select(Query): 76 | def __init__(self): 77 | super(Select, self).__init__() 78 | 79 | def resolve_fields(self, row): 80 | fields = {} 81 | for field in row: 82 | nested_fields = {} 83 | if isinstance(row[field], dict): 84 | nested_fields = self.resolve_fields(row[field]) 85 | for n_field, val in nested_fields.items(): 86 | fields["{}.{}".format(field, n_field)] = val 87 | else: 88 | fields[field] = row[field] 89 | return fields 90 | 91 | def hit_to_row(self, hit): 92 | row = {} 93 | for k in hit.keys(): 94 | if k == '_source': 95 | solved_fields = self.resolve_fields(hit['_source']) 96 | row.update(solved_fields) 97 | elif k.startswith('_'): 98 | row[k] = hit[k] 99 | return row 100 | 101 | def explain_result(self, result=None): 102 | super(Select, self).explain_result(result) 103 | self._values = [self.hit_to_row(hit) 104 | for hit in self._result_dict['hits']['hits']] 105 | 106 | def to_pandas(self): 107 | try: 108 | import pandas 109 | except ImportError: 110 | raise NoSuchDependencyException('this method requires pandas library') 111 | if self._values: 112 | df = pandas.DataFrame(data=self._values) 113 | return df 114 | 115 | @staticmethod 116 | def from_dict(d): 117 | query = Select() 118 | query.explain_result(d) 119 | return query 120 | 121 | @classmethod 122 | def _stringfy_value(cls, value): 123 | b = six.StringIO() 124 | if value: 125 | b.write(repr(value)) 126 | else: 127 | b.write('(NULL)') 128 | return b.getvalue() 129 | 130 | def result_as_tabular(self, cols, n, truncate=20): 131 | b = six.StringIO() 132 | widths = [] 133 | tavnit = '|' 134 | separator = '+' 135 | 136 | cached_result = [kv for kv in self.result[:n]] 137 | for col in cols: 138 | maxlen = len(col) 139 | for kv in cached_result: 140 | if col in kv: 141 | s = Select._stringfy_value(kv[col]) 142 | else: 143 | s = '(NULL)' 144 | if len(s) > maxlen: 145 | maxlen = len(s) 146 | widths.append(min(maxlen, truncate)) 147 | 148 | for w in widths: 149 | tavnit += ' %-' + '%ss |' % (w,) 150 | separator += '-' * w + '--+' 151 | 152 | b.write(separator + '\n') 153 | b.write(tavnit % tuple(cols) + '\n') 154 | b.write(separator + '\n') 155 | for kv in cached_result: 156 | row = [] 157 | for col in cols: 158 | if col in kv: 159 | s = Select._stringfy_value(kv[col]) 160 | if len(s) > truncate: 161 | s = s[:truncate - 3] + '...' 
162 | else: 163 | s = '(NULL)' 164 | row.append(s) 165 | b.write(tavnit % tuple(row) + '\n') 166 | b.write(separator + '\n') 167 | return b.getvalue() 168 | 169 | 170 | class ScrollSelect(Select): 171 | """ 172 | millis_taken/json not supported for ScrollSelect 173 | """ 174 | def __init__(self, hits_generator): 175 | super(ScrollSelect, self).__init__() 176 | self.hits_generator = hits_generator 177 | 178 | @property 179 | def result(self): 180 | return [r for r in self.row_generator()] 181 | 182 | def __str__(self): 183 | return str(self.result) 184 | 185 | def __len__(self): 186 | return len(self.result) 187 | 188 | def row_generator(self): 189 | for hit in self.hits_generator(): 190 | yield self.hit_to_row(hit) 191 | 192 | def to_pandas(self): 193 | try: 194 | import pandas 195 | except ImportError: 196 | raise NoSuchDependencyException('this method requires pandas library') 197 | 198 | df = pandas.DataFrame(self.row_generator()) 199 | return df 200 | 201 | 202 | class Agg(Query): 203 | def __init__(self): 204 | super(Agg, self).__init__() 205 | self._index_names = None 206 | self._indexes = None 207 | 208 | def explain_result(self, result=None): 209 | super(Agg, self).explain_result(result) 210 | tuples = list(Agg._process_agg(self._result_dict['aggregations'])) 211 | assert len(tuples) > 0 212 | self._index_names = list(tuples[0][0]) 213 | self._values = [] 214 | self._indexes = [] 215 | for t in tuples: 216 | _, index, row = t 217 | self._values.append(row) 218 | if len(index) > 0: 219 | self._indexes.append(index) 220 | 221 | @property 222 | def index(self): 223 | return self._indexes 224 | 225 | def to_pandas(self): 226 | try: 227 | import pandas 228 | except ImportError: 229 | raise NoSuchDependencyException('this method requires pandas library') 230 | if self._values is not None: 231 | if len(self._indexes) > 0: 232 | index = pandas.MultiIndex.from_tuples(self._indexes, names=self._index_names) 233 | df = pandas.DataFrame(data=self._values, index=index) 234 | else: 235 | df = pandas.DataFrame(data=self._values) 236 | return df 237 | 238 | @classmethod 239 | def _process_agg(cls, bucket, indexes=(), names=()): 240 | """ 241 | Recursively extract agg values 242 | :param bucket: a bucket contains either sub-buckets or a bunch of aggregated values 243 | :return: a list of tuples: (index_name, index_tuple, row) 244 | """ 245 | # for each agg, yield a row 246 | row = {} 247 | for k, v in bucket.items(): 248 | if isinstance(v, dict): 249 | if 'buckets' in v: 250 | for sub_bucket in v['buckets']: 251 | 252 | if 'key_as_string' in sub_bucket: 253 | key = sub_bucket['key_as_string'] 254 | else: 255 | key = sub_bucket['key'] 256 | for x in Agg._process_agg(sub_bucket, 257 | indexes + (key,), 258 | names + (k,)): 259 | yield x 260 | elif 'value' in v: 261 | row[k] = v['value'] 262 | elif 'values' in v: # percentiles 263 | row = v['values'] 264 | else: 265 | row.update(v) # stats 266 | else: 267 | if k == 'doc_count': # count docs 268 | row['doc_count'] = v 269 | 270 | if len(row) > 0: 271 | yield (names, indexes, row) 272 | 273 | @staticmethod 274 | def from_dict(d): 275 | agg = Agg() 276 | agg.explain_result(d) 277 | return agg 278 | -------------------------------------------------------------------------------- /pandasticsearch/types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | from pandasticsearch.operators import * 4 | import six 5 | 6 | 7 | class Column(object): 8 | def __init__(self, field): 9 | self._field = 
field
10 |
11 | def field_name(self):
12 | return self._field
13 |
14 | def __eq__(self, other):
15 | return Equal(field=self._field, value=other)
16 |
17 | def __ne__(self, other):
18 | return ~Equal(field=self._field, value=other)
19 |
20 | def __gt__(self, other):
21 | return Greater(field=self._field, value=other)
22 |
23 | def __lt__(self, other):
24 | return Less(field=self._field, value=other)
25 |
26 | def __ge__(self, other):
27 | return GreaterEqual(field=self._field, value=other)
28 |
29 | def __le__(self, other):
30 | return LessEqual(field=self._field, value=other)
31 |
32 | def isin(self, values):
33 | """
34 | Returns a :class:`BooleanFilter `
35 |
36 | :param values: A list of values to filter terms
37 | :return: :class:`BooleanFilter `
38 |
39 | >>> df.filter(df.gender.isin(['male', 'female']))
40 | """
41 | return IsIn(field=self._field, value=values)
42 |
43 | def like(self, wildcard):
44 | """
45 | Returns a :class:`BooleanFilter `
46 |
47 | :param str wildcard: The wildcard to filter the column with.
48 | :return: :class:`BooleanFilter `
49 |
50 | >>> df.filter(df.name.like('A*'))
51 | """
52 | return Like(field=self._field, value=wildcard)
53 |
54 | def rlike(self, regexp):
55 | """
56 | Returns a :class:`BooleanFilter `
57 |
58 | :param str regexp: The regular expression to filter the column with.
59 | :return: :class:`BooleanFilter `
60 |
61 | >>> df.filter(df.name.rlike('A.l.e'))
62 | """
63 | return Rlike(field=self._field, value=regexp)
64 |
65 | def startswith(self, substr):
66 | """
67 | Returns a :class:`BooleanFilter `
68 |
69 | :param str substr: The substring to filter the column with.
70 | :return: :class:`BooleanFilter `
71 |
72 | >>> df.filter(df.name.startswith('Al'))
73 | """
74 | return Startswith(field=self._field, value=substr)
75 |
76 | def ranges(self, values):
77 | """
78 | Returns a :class:`Grouper `
79 |
80 | :param values: A list of numeric values
81 | :return: :class:`Grouper `
82 |
83 | >>> df.groupby(df.age.ranges([10,12,14]))
84 | """
85 | return RangeGrouper(field=self._field, range_list=values)
86 |
87 | def date_interval(self, interval, format='yyyy/MM/dd HH:mm:ss'):
88 | """
89 | Returns a :class:`Grouper `
90 |
91 | :param interval: A string indicating the date interval
92 | :param format: Date format string
93 | :return: :class:`Grouper `
94 |
95 | >>> df.groupby(df.date.date_interval('1d'))
96 | """
97 | return DateGrouper(field=self._field, interval=interval, format=format)
98 |
99 | def terms(self, limit=20, include=None, exclude=None):
100 | """
101 | Returns a :class:`Grouper `
102 |
103 | :param limit: limit the number of terms to be aggregated (default 20)
104 | :param include: the exact terms to be included
105 | :param exclude: the exact terms to be excluded
106 |
107 | :return: :class:`Grouper `
108 |
109 | >>> df.groupby(df.age.terms(limit=10, include=[1, 2, 3]))
110 | """
111 | return Grouper(field=self._field, size=limit, include=include, exclude=exclude)
112 |
113 | @property
114 | def isnull(self):
115 | """
116 | :class:`BooleanFilter ` to indicate a null column value
117 |
118 | :return: :class:`BooleanFilter `
119 | """
120 | return IsNull(field=self._field)
121 |
122 | @property
123 | def notnull(self):
124 | """
125 | :class:`BooleanFilter ` to indicate a non-null column value
126 |
127 | :return: :class:`BooleanFilter `
128 | """
129 | return NotNull(field=self._field)
130 |
131 | @property
132 | def desc(self, mode=None):
133 | """
134 | Descending :class:`Sorter `
135 |
136 | :return: :class:`Sorter `
137 |
138 | >>>
df.orderby(df.age.desc)
139 | """
140 | return Sorter(self._field, mode=mode)
141 |
142 | @property
143 | def asc(self, mode=None):
144 | """
145 | Ascending :class:`Sorter `
146 |
147 | :return: :class:`Sorter `
148 |
149 | >>> df.orderby(df.age.asc)
150 | """
151 | return Sorter(self._field, order='asc', mode=mode)
152 |
153 | @property
154 | def max(self):
155 | """
156 | Max aggregator
157 |
158 | :return: :class:`Aggregator `
159 |
160 | >>> df.groupby(df.gender).agg(df.age.max)
161 | """
162 | return MetricAggregator(self._field, 'max')
163 |
164 | @property
165 | def min(self):
166 | """
167 | Min aggregator
168 |
169 | :return: :class:`Aggregator `
170 |
171 | >>> df.groupby(df.gender).agg(df.age.min)
172 | """
173 | return MetricAggregator(self._field, 'min')
174 |
175 | @property
176 | def avg(self):
177 | """
178 | Avg aggregator
179 |
180 | :return: :class:`Aggregator `
181 |
182 | >>> df.groupby(df.gender).agg(df.age.avg)
183 | """
184 | return MetricAggregator(self._field, 'avg')
185 |
186 | @property
187 | def sum(self):
188 | """
189 | Sum aggregator
190 |
191 | :return: :class:`Aggregator `
192 |
193 | >>> df.groupby(df.gender).agg(df.age.sum)
194 | """
195 | return MetricAggregator(self._field, 'sum')
196 |
197 | @property
198 | def value_count(self):
199 | """
200 | Value count aggregator
201 |
202 | :return: :class:`Aggregator `
203 |
204 | >>> df.groupby(df.gender).agg(df.age.value_count)
205 | """
206 | return MetricAggregator(self._field, 'value_count')
207 |
208 | count = value_count
209 |
210 | @property
211 | def cardinality(self):
212 | """
213 | Distinct count (cardinality) aggregator
214 |
215 | :return: :class:`Aggregator `
216 |
217 | >>> df.groupby(df.gender).agg(df.age.cardinality)
218 | >>> df.groupby(df.gender).agg(df.age.distinct_count)
219 | """
220 | return MetricAggregator(self._field, 'cardinality')
221 |
222 | distinct_count = cardinality
223 |
224 | @property
225 | def percentiles(self):
226 | """
227 | Percentile aggregator
228 |
229 | :return: :class:`Aggregator `
230 |
231 | >>> df.groupby(df.gender).agg(df.age.percentiles)
232 | """
233 | return MetricAggregator(self._field, 'percentiles')
234 |
235 | @property
236 | def percentile_ranks(self):
237 | """
238 | Percentile ranks aggregator
239 |
240 | :return: :class:`Aggregator `
241 |
242 | >>> df.groupby(df.gender).agg(df.age.percentile_ranks)
243 | """
244 | return MetricAggregator(self._field, 'percentile_ranks')
245 |
246 | @property
247 | def stats(self):
248 | """
249 | Stats aggregator
250 |
251 | :return: :class:`Aggregator `
252 |
253 | >>> df.groupby(df.gender).agg(df.age.stats)
254 | """
255 | return MetricAggregator(self._field, 'stats')
256 |
257 | @property
258 | def extended_stats(self):
259 | """
260 | Extended stats aggregator
261 |
262 | :return: :class:`Aggregator `
263 |
264 | >>> df.groupby(df.gender).agg(df.age.extended_stats)
265 | """
266 | return MetricAggregator(self._field, 'extended_stats')
267 |
268 |
269 | class Row(tuple):
270 | """
271 | The built-in :class:`DataFrame ` row type, for accessing results before they are converted into a Pandas DataFrame.
272 | The fields are sorted by name.
273 | 274 | >>> row = Row(name="Alice", age=12) 275 | >>> row 276 | Row(age=12, name='Alice') 277 | >>> row['name'], row['age'] 278 | ('Alice', 12) 279 | >>> row.name, row.age 280 | ('Alice', 12) 281 | >>> 'name' in row 282 | True 283 | >>> 'wrong_key' in row 284 | """ 285 | 286 | def __new__(cls, **kwargs): 287 | names = sorted(kwargs.keys()) 288 | row = tuple.__new__(cls, [kwargs[n] for n in names]) 289 | row._fields = names 290 | return row 291 | 292 | def __getitem__(self, name): 293 | try: 294 | idx = self._fields.index(name) 295 | return super(Row, self).__getitem__(idx) 296 | except IndexError: 297 | raise KeyError(name) 298 | except ValueError: 299 | raise ValueError(name) 300 | 301 | def __contains__(self, name): 302 | return name in self._fields 303 | 304 | def __repr__(self): 305 | return 'Row(' + ','.join( 306 | ['{0}={1}'.format(k, Row._stringfy(v)) for k, v in zip(self._fields, tuple(self))]) + ')' 307 | 308 | @classmethod 309 | def _stringfy(cls, v): 310 | b = six.StringIO() 311 | b.write(repr(v)) 312 | return b.getvalue() 313 | 314 | def as_dict(self): 315 | return dict((x, y) for x, y in zip(self._fields, self)) 316 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six>=1.9.0 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open('requirements.txt') as f: 4 | required = f.read().splitlines() 5 | 6 | extras_require = { 7 | "pandas": ["pandas"], 8 | } 9 | 10 | setup( 11 | name='pandasticsearch', 12 | version='0.5.3', 13 | author='onesuper', 14 | author_email='onesuperclark@gmail.com', 15 | packages=['pandasticsearch', 'pandasticsearch.operators'], 16 | url='http://pypi.python.org/pypi/pandasticsearch/', 17 | license='MIT', 18 | description='A Pandastic Elasticsearch client for data analyzing.', 19 | install_requires=required, 20 | extras_require=extras_require, 21 | test_suite='nose.collector', 22 | tests_require=['nose', 'mock'], 23 | ) 24 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/onesuper/pandasticsearch/f99a01a1dc0dc57dacefd0280598055922372418/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_cilent.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import unittest 3 | from mock import patch, Mock 4 | 5 | from pandasticsearch.client import RestClient 6 | 7 | 8 | class TestClients(unittest.TestCase): 9 | @patch('pandasticsearch.client.urllib.request.urlopen') 10 | def test_rest_client_returns_results(self, mock_urlopen): 11 | response = Mock() 12 | response.read.return_value = """{"hits" : {"hits": [{"_source": {}}] }}""".encode("utf-8") 13 | mock_urlopen.return_value = response 14 | 15 | client = RestClient("http://localhost:9200") 16 | 17 | json = client.post('/test/_search', data="xxxx") 18 | 19 | print(json) 20 | self.assertIsNotNone(json) 21 | self.assertEqual(json, {"hits": {"hits": [{"_source": {}}]}}) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- 
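test_cilent.py mocks `urlopen` so that `RestClient.post` returns a canned response, and the DataFrame tests below drive the whole client through the same trick. One behaviour those tests rely on indirectly is how `Select` (queries.py) flattens hits into rows; a small sketch with a hypothetical response:

```python
# Sketch: how Select flattens a raw search response into rows.
# The response document below is hypothetical.
from pandasticsearch.queries import Select

resp = {
    'took': 3,
    'hits': {'hits': [
        {'_id': '1', '_score': 1.0,
         '_source': {'name': 'Alice', 'geo': {'city': 'NY', 'zip': '10001'}}},
    ]},
}
select = Select.from_dict(resp)
print(select.result)
# [{'_id': '1', '_score': 1.0, 'name': 'Alice', 'geo.city': 'NY', 'geo.zip': '10001'}]
# Nested '_source' objects are flattened into dotted column names, and the
# underscore-prefixed metadata fields are carried along unchanged.
```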
/tests/test_dataframe.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import unittest 3 | from mock import patch, Mock 4 | import json 5 | from pandasticsearch.dataframe import DataFrame, Column 6 | from pandasticsearch.operators import * 7 | 8 | 9 | @patch('pandasticsearch.client.urllib.request.urlopen') 10 | def create_df_from_es(mock_urlopen): 11 | response = Mock() 12 | dic = { 13 | "index": { 14 | "mappings": { 15 | "doc_type": { 16 | "properties": { 17 | "a": { 18 | "type": "integer" 19 | }, 20 | "b": { 21 | "type": "integer" 22 | }, 23 | "c": {"properties": { 24 | "d": {"type": "keyword", "ignore_above": 1024}, 25 | "e": {"type": "keyword", "ignore_above": 1024} 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | } 33 | response.read.return_value = json.dumps(dic).encode("utf-8") 34 | mock_urlopen.return_value = response 35 | return DataFrame.from_es(url="http://localhost:9200", index='index', doc_type='doc_type') 36 | 37 | 38 | @patch('pandasticsearch.client.urllib.request.urlopen') 39 | def create_df_from_es_after_removal_mapping(mock_urlopen): 40 | """ 41 | https://www.elastic.co/guide/en/elasticsearch/reference/current/removal-of-types.html 42 | """ 43 | response = Mock() 44 | dic = { 45 | "index": { 46 | "mappings": { 47 | "properties": { 48 | "a": { 49 | "type": "integer" 50 | }, 51 | "b": { 52 | "type": "integer" 53 | }, 54 | "c": {"properties": { 55 | "d": {"type": "keyword", "ignore_above": 1024}, 56 | "e": {"type": "keyword", "ignore_above": 1024} 57 | } 58 | } 59 | } 60 | } 61 | } 62 | } 63 | response.read.return_value = json.dumps(dic).encode("utf-8") 64 | mock_urlopen.return_value = response 65 | return DataFrame.from_es(url="http://localhost:9200", index='index', compat=7) 66 | 67 | 68 | class TestDataFrame(unittest.TestCase): 69 | def test_getitem(self): 70 | df = create_df_from_es() 71 | self.assertTrue(isinstance(df['a'], Column)) 72 | self.assertTrue(isinstance(df['b'], Column)) 73 | 74 | expr = df['a'] > 2 75 | self.assertTrue(isinstance(expr, BooleanFilter)) 76 | self.assertTrue(isinstance(df[expr], DataFrame)) 77 | self.assertEqual(df[expr]._filter.build(), {'range': {'a': {'gt': 2}}}) 78 | 79 | def test_getattr(self): 80 | df = create_df_from_es() 81 | self.assertTrue(isinstance(df.a, Column)) 82 | self.assertTrue(isinstance(df.b, Column)) 83 | 84 | def test_index(self): 85 | df = create_df_from_es() 86 | self.assertEqual(df.index, 'index/doc_type') 87 | 88 | def test_index_after_removal_mapping(self): 89 | df = create_df_from_es_after_removal_mapping() 90 | self.assertEqual(df.index, 'index') 91 | 92 | def test_columns(self): 93 | df = create_df_from_es() 94 | self.assertEqual(df.columns, ['a', 'b', 'c.d', 'c.e']) 95 | 96 | def test_print_schema(self): 97 | df = create_df_from_es() 98 | df.print_schema() 99 | 100 | def test_columns_after_removal_mapping(self): 101 | df = create_df_from_es_after_removal_mapping() 102 | self.assertEqual(df.columns, ['a', 'b', 'c.d', 'c.e']) 103 | 104 | def test_print_schema_after_removal_mapping(self): 105 | df = create_df_from_es_after_removal_mapping() 106 | df.print_schema() 107 | 108 | def test_init(self): 109 | df = create_df_from_es() 110 | self.assertEqual(df.to_dict(), {'size': 20}) 111 | 112 | def test_filter(self): 113 | df = create_df_from_es() 114 | 115 | self.assertEqual((df.filter(df['a'] > 2)).to_dict(), 116 | {'query': {'filtered': {'filter': {'range': {'a': {'gt': 2}}}}}, 117 | 'size': 20}) 118 | 119 | self.assertEqual((df.filter((df['a'] > 2) & 
(df.b == 1))).to_dict(), 120 | {'query': {'filtered': {'filter': {'bool': {'must': [ 121 | {'range': {'a': {'gt': 2}}}, 122 | {'term': {'b': 1}}]}}}}, 123 | 'size': 20}) 124 | 125 | self.assertEqual((df.filter(df['a'] > 2).filter(df.b == 1)).to_dict(), 126 | {'query': {'filtered': {'filter': {'bool': {'must': [ 127 | {'range': {'a': {'gt': 2}}}, 128 | {'term': {'b': 1}}]}}}}, 129 | 'size': 20}) 130 | 131 | self.assertEqual(df.where(Greater('a', 2)).to_dict(), 132 | {'query': {'filtered': {'filter': {'range': {'a': {'gt': 2}}}}}, 133 | 'size': 20}) 134 | 135 | self.assertEqual(df.filter('2016 - doc["age"].value > 1995').to_dict(), 136 | {'query': {'filtered': { 137 | 'filter': {'script': {'script': {'inline': '2016 - doc["age"].value > 1995'}}}}}, 138 | 'size': 20}) 139 | 140 | def test_groupby(self): 141 | df = create_df_from_es() 142 | self.assertEqual((df.groupby(df.a)).to_dict(), 143 | {'aggregations': {'a': {'terms': {'field': 'a', 'size': 20}}}, 'size': 0}) 144 | 145 | self.assertEqual((df.groupby(df['a'], df['b'])).to_dict(), 146 | { 147 | 'aggregations': { 148 | 'a': { 149 | 'aggregations': { 150 | 'b': { 151 | 'terms': {'field': 'b', 'size': 20}} 152 | }, 153 | 'terms': {'field': 'a', 'size': 20}}}, 154 | 'size': 0}) 155 | 156 | def test_agg(self): 157 | df = create_df_from_es() 158 | self.assertEqual((df.agg(MetricAggregator('a', 'avg'))).to_dict(), 159 | {'aggregations': {'avg(a)': {'avg': {'field': 'a'}}}, 'size': 0}) 160 | 161 | def test_sort(self): 162 | df = create_df_from_es() 163 | self.assertEqual((df.sort(df['a'].asc)).to_dict(), 164 | {'sort': [{'a': {'order': 'asc'}}], 'size': 20}) 165 | 166 | self.assertEqual((df.sort(Sorter('a'), Sorter('b'))).to_dict(), 167 | {'sort': [{'a': {'order': 'desc'}}, 168 | {'b': {'order': 'desc'}}], 'size': 20}) 169 | 170 | self.assertEqual((df.sort('doc["age"].value * 2')).to_dict(), 171 | {'sort': [{'_script': { 172 | 'order': 'desc', 173 | 'script': 'doc["age"].value * 2', 174 | 'type': 'number' 175 | }}], 'size': 20}) 176 | 177 | def test_select(self): 178 | df = create_df_from_es() 179 | self.assertEqual(df.select('a').to_dict(), 180 | {'_source': {'excludes': [], 'includes': ['a']}, 'size': 20}) 181 | 182 | self.assertEqual(df.select(df['a'], df['b']).to_dict(), 183 | {'_source': {'excludes': [], 'includes': ['a', 'b']}, 'size': 20}) 184 | 185 | def test_limit(self): 186 | df = create_df_from_es() 187 | self.assertEqual(df.limit(199).to_dict(), {'size': 20}) 188 | 189 | def test_complex(self): 190 | df = create_df_from_es() 191 | 192 | df2 = df.filter(df['a'] > 2) 193 | df3 = df2.select('a').limit(30) 194 | 195 | print(df3.to_dict()) 196 | self.assertEqual(df3.to_dict(), 197 | {'_source': {'excludes': [], 'includes': ['a']}, 198 | 'query': {'filtered': {'filter': {'range': {'a': {'gt': 2}}}}}, 199 | 'size': 20}) 200 | 201 | df4 = df3.groupby('b') 202 | df5 = df4.agg(MetricAggregator('a', 'avg')) 203 | 204 | print(df5.to_dict()) 205 | 206 | self.assertEqual(df5.to_dict(), 207 | {'_source': {'excludes': [], 'includes': ['a']}, 208 | 'aggregations': { 209 | 'b': { 210 | 'terms': {'field': 'b', 'size': 20}, 211 | 'aggregations': { 212 | 'avg(a)': {'avg': {'field': 'a'}}}} 213 | }, 214 | 'query': {'filtered': {'filter': {'range': {'a': {'gt': 2}}}}}, 215 | 'size': 0}) 216 | 217 | df6 = df5.sort(Sorter('a')) 218 | 219 | print(df6.to_dict()) 220 | self.assertEqual(df6.to_dict(), 221 | {'_source': {'excludes': [], 'includes': ['a']}, 222 | 'aggregations': { 223 | 'b': { 224 | 'terms': {'field': 'b', 'size': 20}, 225 | 'aggregations': { 226 | 
'avg(a)': {'avg': {'field': 'a'}}}} 227 | }, 228 | 'query': {'filtered': {'filter': {'range': {'a': {'gt': 2}}}}}, 229 | 'sort': [{'a': {'order': 'desc'}}], 230 | 'size': 0}) 231 | 232 | def test_complex_agg(self): 233 | df = create_df_from_es() 234 | df2 = df.groupby(df.b, df.a) 235 | 236 | df3 = df2.agg(MetricAggregator('a', 'avg')) 237 | 238 | self.assertEqual(df3.to_dict(), 239 | { 240 | 'size': 0, 241 | 'aggregations': { 242 | 'b': { 243 | 'terms': {'field': 'b', 'size': 20}, 244 | 'aggregations': { 245 | 'a': { 246 | 'terms': {'field': 'a', 'size': 20}, 247 | 'aggregations': { 248 | 'avg(a)': {'avg': {'field': 'a'}}}} 249 | }}}}) 250 | 251 | 252 | if __name__ == '__main__': 253 | unittest.main() 254 | -------------------------------------------------------------------------------- /tests/test_operators.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import unittest 3 | 4 | from pandasticsearch.operators import * 5 | 6 | 7 | class TestOperators(unittest.TestCase): 8 | def test_metric_agg(self): 9 | self.assertEqual(MetricAggregator('x', 'avg').build(), {'avg(x)': {'avg': {'field': 'x'}}}) 10 | self.assertEqual(MetricAggregator('x', 'max').build(), {'max(x)': {'max': {'field': 'x'}}}) 11 | self.assertEqual(MetricAggregator('x', 'max').alias('max_x').build(), {'max_x': {'max': {'field': 'x'}}}) 12 | 13 | def test_grouper(self): 14 | exp = Grouper('a', size=100, include=['x', 'y']) 15 | self.assertEqual(exp.build(), 16 | {'a': {'terms': {'field': 'a', 'size': 100, 'include': ['x', 'y']}}}) 17 | 18 | def test_nested_grouper(self): 19 | exp = Grouper('a', inner=Grouper('b', inner=Grouper('c'))) 20 | self.assertEqual(exp.build(), 21 | { 22 | 'a': { 23 | 'terms': {'field': 'a', 'size': 20}, 24 | 'aggregations': { 25 | 'b': { 26 | 'terms': {'field': 'b', 'size': 20}, 27 | 'aggregations': { 28 | 'c': {'terms': {'field': 'c', 'size': 20}}}}}}}) 29 | 30 | def test_range_grouper(self): 31 | exp = RangeGrouper('a', [1, 3, 6]) 32 | self.assertEqual(exp.build(), { 33 | 'range(1,3,6)': {'range': {'ranges': [{'to': 3, 'from': 1}, {'to': 6, 'from': 3}], 'field': 'a'}}}) 34 | 35 | def test_date_grouper(self): 36 | exp = DateGrouper('a', '1d', 'm') 37 | self.assertEqual(exp.build(), { 38 | 'date(a,1d)': {'date_histogram': {'interval': '1d', 'field': 'a', 'format': 'm'}}}) 39 | 40 | def test_sorter(self): 41 | self.assertEqual(Sorter('x').build(), {'x': {'order': 'desc'}}) 42 | self.assertEqual(Sorter('x', mode='avg').build(), {'x': {'order': 'desc', 'mode': 'avg'}}) 43 | 44 | self.assertEqual(ScriptSorter('doc["field_name"].value * factor', params={'factor': 1.1}).build(), 45 | {'_script': {'type': 'number', 46 | 'order': 'desc', 47 | 'script': 'doc["field_name"].value * factor', 48 | "params": { 49 | "factor": 1.1 50 | }}}) 51 | 52 | def test_leaf_boolean_filter(self): 53 | self.assertEqual(GreaterEqual('a', 2).build(), {"range": {"a": {"gte": 2}}}) 54 | self.assertEqual(LessEqual('a', 2).build(), {"range": {"a": {"lte": 2}}}) 55 | self.assertEqual(Less('a', 2).build(), {"range": {"a": {"lt": 2}}}) 56 | self.assertEqual(Equal('a', 2).build(), {"term": {"a": 2}}) 57 | exp = Equal('a', 2) 58 | self.assertEqual((~exp).build()['bool'], {"must_not": {"term": {"a": 2}}}) 59 | self.assertEqual(Greater('a', 2).build(), {"range": {"a": {"gt": 2}}}) 60 | self.assertEqual(IsIn('a', [1, 2, 3]).build(), {'terms': {'a': [1, 2, 3]}}) 61 | self.assertEqual(Like('a', 'a*b').build(), {'wildcard': {'a': 'a*b'}}) 62 | self.assertEqual(Rlike('a', 
'a*b').build(), {'regexp': {'a': 'a*b'}}) 63 | self.assertEqual(Startswith('a', 'jj').build(), {'prefix': {'a': 'jj'}}) 64 | self.assertEqual(IsNull('a').build(), (~NotNull('a')).build()) 65 | self.assertEqual(NotNull('a').build(), {'exists': {'field': 'a'}}) 66 | self.assertEqual(ScriptFilter('doc["num1"].value > params.param1', params={'param1': 5}).build(), 67 | {'script': { 68 | 'script': { 69 | 'inline': 'doc["num1"].value > params.param1', 70 | 'params': {'param1': 5}}}}) 71 | 72 | def test_and_filter1(self): 73 | exp = GreaterEqual('a', 2) & Less('b', 3) 74 | self.assertEqual( 75 | exp.build(), 76 | { 77 | 'bool': { 78 | 'must': [ 79 | {'range': {'a': {'gte': 2}}}, 80 | {'range': {'b': {'lt': 3}}} 81 | ] 82 | } 83 | }) 84 | 85 | def test_and_filter2(self): 86 | exp = GreaterEqual('a', 2) & Less('b', 3) & Equal('c', 4) 87 | self.assertEqual( 88 | exp.build(), 89 | { 90 | 'bool': { 91 | 'must': [ 92 | {'range': {'a': {'gte': 2}}}, 93 | {'range': {'b': {'lt': 3}}}, 94 | {'term': {'c': 4}} 95 | ] 96 | } 97 | }) 98 | 99 | def test_and_filter2a(self): 100 | exp = GreaterEqual('a', 2) & Less('b', 3) & (Equal('c', 4) & Equal('d', 5)) 101 | self.assertEqual( 102 | exp.build(), 103 | { 104 | 'bool': { 105 | 'must': [ 106 | {'range': {'a': {'gte': 2}}}, 107 | {'range': {'b': {'lt': 3}}}, 108 | {'bool': {'must': [ 109 | {'term': {'c': 4}}, 110 | {'term': {'d': 5}}, 111 | ]}}, 112 | ] 113 | } 114 | }) 115 | 116 | def test_and_filter3(self): 117 | exp = GreaterEqual('a', 2) & (Less('b', 3) & Equal('c', 4)) 118 | self.assertEqual( 119 | exp.build(), 120 | { 121 | 'bool': { 122 | 'must': [ 123 | {'range': {'b': {'lt': 3}}}, 124 | {'term': {'c': 4}}, 125 | {'range': {'a': {'gte': 2}}} 126 | ] 127 | } 128 | }) 129 | 130 | def test_or_filter1(self): 131 | exp = GreaterEqual('a', 2) | Less('b', 3) 132 | self.assertEqual( 133 | exp.build(), 134 | { 135 | 'bool': { 136 | 'should': [ 137 | {'range': {'a': {'gte': 2}}}, 138 | {'range': {'b': {'lt': 3}}} 139 | ] 140 | } 141 | }) 142 | 143 | def test_or_filter2(self): 144 | exp = GreaterEqual('a', 2) | Less('b', 3) | Equal('c', 4) 145 | self.assertEqual( 146 | exp.build(), 147 | { 148 | 'bool': { 149 | 'should': [ 150 | {'range': {'a': {'gte': 2}}}, 151 | {'range': {'b': {'lt': 3}}}, 152 | {'term': {'c': 4}} 153 | ] 154 | } 155 | }) 156 | 157 | def test_or_filter2a(self): 158 | exp = GreaterEqual('a', 2) | Less('b', 3) | (Equal('c', 4) & Equal('d', 5)) 159 | self.assertEqual( 160 | exp.build(), 161 | { 162 | 'bool': { 163 | 'should': [ 164 | {'range': {'a': {'gte': 2}}}, 165 | {'range': {'b': {'lt': 3}}}, 166 | {'bool': {'must': [ 167 | {'term': {'c': 4}}, 168 | {'term': {'d': 5}}, 169 | ]}}, 170 | ] 171 | } 172 | }) 173 | 174 | def test_or_filter3(self): 175 | exp = GreaterEqual('a', 2) | (Less('b', 3) | Equal('c', 4)) 176 | self.assertEqual( 177 | exp.build(), 178 | { 179 | 'bool': { 180 | 'should': [ 181 | {'range': {'b': {'lt': 3}}}, 182 | {'term': {'c': 4}}, 183 | {'range': {'a': {'gte': 2}}} 184 | ] 185 | } 186 | }) 187 | 188 | def test_not_filter(self): 189 | exp = ~GreaterEqual('a', 2) 190 | self.assertEqual( 191 | exp.build(), 192 | { 193 | 'bool': { 194 | 'must_not': {'range': {'a': {'gte': 2}}} 195 | } 196 | }) 197 | 198 | def test_not_not_filter(self): 199 | exp = ~~GreaterEqual('a', 2) 200 | 201 | self.assertEqual( 202 | exp.build(), 203 | { 204 | 'bool': { 205 | 'must_not': { 206 | 'bool': { 207 | 'must_not': {'range': {'a': {'gte': 2}}} 208 | } 209 | } 210 | } 211 | }) 212 | 213 | def test_not_and_filter(self): 214 | exp = ~(GreaterEqual('a', 
2) & Less('b', 3)) 215 | self.assertEqual( 216 | exp.build(), 217 | { 218 | 'bool': { 219 | 'must_not': { 220 | 'bool': { 221 | 'must': [ 222 | {'range': {'a': {'gte': 2}}}, 223 | {'range': {'b': {'lt': 3}}} 224 | ] 225 | } 226 | } 227 | } 228 | }) 229 | 230 | def test_and_or_filter(self): 231 | exp = GreaterEqual('a', 2) & (Less('b', 3) | Equal('c', 4)) 232 | self.assertEqual( 233 | exp.build(), 234 | { 235 | 'bool': { 236 | 'must': [ 237 | {'range': {'a': {'gte': 2}}}, 238 | { 239 | 'bool': { 240 | 'should': [ 241 | {'range': {'b': {'lt': 3}}}, 242 | {'term': {'c': 4}} 243 | ] 244 | } 245 | } 246 | ] 247 | } 248 | }) 249 | 250 | def test_and_not_or_filter(self): 251 | exp = GreaterEqual('a', 2) & ~(Less('b', 3) | Equal('c', 4)) 252 | self.assertEqual( 253 | exp.build(), 254 | { 255 | 'bool': { 256 | 'must': [ 257 | {'range': {'a': {'gte': 2}}}, 258 | { 259 | 'bool': { 260 | 'must_not': { 261 | 'bool': { 262 | 'should': [ 263 | {'range': {'b': {'lt': 3}}}, 264 | {'term': {'c': 4}} 265 | ] 266 | } 267 | 268 | } 269 | } 270 | } 271 | ] 272 | } 273 | }) 274 | 275 | 276 | if __name__ == '__main__': 277 | unittest.main() 278 | -------------------------------------------------------------------------------- /tests/test_queries.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import unittest 3 | 4 | from pandasticsearch.queries import Select, Agg, ScrollSelect 5 | 6 | 7 | def create_hits(): 8 | return { 9 | 'hits': { 10 | 'hits': [ 11 | {'_source': {'a': 1, 'b': 1}}, 12 | {'_source': {'a': 2, 'b': 2}}, 13 | {'_source': {'a': 3, 'b': 3}}, 14 | ] 15 | }, 16 | 'took': 1 17 | } 18 | 19 | 20 | def mock_hits_generator(): 21 | yield {'_source': {'a': 1, 'b': 1}} 22 | yield {'_source': {'a': 2, 'b': 2}} 23 | yield {'_source': {'a': 3, 'b': 3}} 24 | 25 | 26 | class TestQueries(unittest.TestCase): 27 | def test_select_explain_result(self): 28 | select = Select() 29 | select._result_dict = create_hits() 30 | select.explain_result() 31 | print(select) 32 | print(repr(select)) 33 | 34 | self.assertIsNotNone(select.result) 35 | self.assertEqual(len(select), 3) 36 | 37 | def test_select_from_dict(self): 38 | select = Select.from_dict(create_hits()) 39 | print(select) 40 | print(repr(select)) 41 | 42 | self.assertIsNotNone(select.result) 43 | self.assertEqual(len(select), 3) 44 | 45 | def test_scroll_select(self): 46 | select = ScrollSelect(mock_hits_generator) 47 | print(select) 48 | print(repr(select)) 49 | self.assertEquals(select.result[:2], select.result[:2]) 50 | self.assertEqual(len(select), 3) 51 | 52 | def test_select_result(self): 53 | select = Select.from_dict(create_hits()) 54 | print(select.result) 55 | 56 | self.assertIsNotNone(select.result) 57 | self.assertEqual(len(select.result[0]), 2) 58 | 59 | def test_agg_buckets(self): 60 | agg = Agg() 61 | agg._result_dict = { 62 | 'took': 1, 63 | 'aggregations': { 64 | 'agg_key': { 65 | 'buckets': [ 66 | { 67 | 'key': 'a', 68 | 'f1': {'value': 100}, 69 | 'f2': {'value': 1}, 70 | "doc_count": 12 71 | }, 72 | { 73 | 'key': 'b', 74 | 'f1': {'value': 200}, 75 | 'f2': {'value': 2}, 76 | "doc_count": 13 77 | }, 78 | ] 79 | } 80 | } 81 | } 82 | 83 | agg.explain_result() 84 | print(agg.result) 85 | self.assertEqual(agg.result, [{'f1': 100, 'f2': 1, 'doc_count': 12}, 86 | {'f1': 200, 'f2': 2, 'doc_count': 13}]) 87 | self.assertEqual(agg.index, [('a',), 88 | ('b',)]) 89 | 90 | def test_agg_date_histogram(self): 91 | agg = Agg() 92 | agg._result_dict = { 93 | 'took': 3, 94 | 'aggregations': { 95 | 
'my_date_histogram': { 96 | 'buckets': [ 97 | { 98 | 'doc_count': 1, 99 | 'key': 1480392360000, 100 | 'key_as_string': '2016-11-29T04:06:00.000Z', 101 | 'f1': {'value': 0.1}, 102 | }, 103 | { 104 | 'doc_count': 1, 105 | 'key': 1480392420000, 106 | 'key_as_string': '2016-11-29T04:07:00.000Z', 107 | 'f1': {'value': 0.2}, 108 | }, 109 | { 110 | 'doc_count': 1, 111 | 'key': 1480392480000, 112 | 'key_as_string': '2016-11-29T04:08:00.000Z', 113 | 'f1': {'value': 0.3}, 114 | }, 115 | { 116 | 'doc_count': 1, 117 | 'key': 1480392540000, 118 | 'key_as_string': '2016-11-29T04:09:00.000Z', 119 | 'f1': {'value': 0.4}, 120 | }, 121 | { 122 | 'doc_count': 1, 123 | 'key': 1480392600000, 124 | 'key_as_string': '2016-11-29T04:10:00.000Z', 125 | 'f1': {'value': 0.5}, 126 | } 127 | ] 128 | } 129 | } 130 | } 131 | agg.explain_result() 132 | print(agg.result) 133 | self.assertEqual(agg.result, [{'f1': 0.1, 'doc_count': 1}, 134 | {'f1': 0.2, 'doc_count': 1}, 135 | {'f1': 0.3, 'doc_count': 1}, 136 | {'f1': 0.4, 'doc_count': 1}, 137 | {'f1': 0.5, 'doc_count': 1}]) 138 | self.assertEqual(agg.index, [('2016-11-29T04:06:00.000Z',), 139 | ('2016-11-29T04:07:00.000Z',), 140 | ('2016-11-29T04:08:00.000Z',), 141 | ('2016-11-29T04:09:00.000Z',), 142 | ('2016-11-29T04:10:00.000Z',)]) 143 | 144 | def test_agg_nested_buckets(self): 145 | agg = Agg() 146 | agg._result_dict = { 147 | 'took': 1, 148 | 'aggregations': { 149 | 'agg_key1': { 150 | 'buckets': [ 151 | { 152 | 'key': 'a', 153 | 'agg_key2': { 154 | 'buckets': [ 155 | { 156 | 'key': 'x', 157 | 'f1': {'value': 100}, 158 | 'f2': {'value': 1}, 159 | "doc_count": 11 160 | }, 161 | { 162 | 'key': 'y', 163 | 'f1': {'value': 200}, 164 | 'f2': {'value': 2}, 165 | "doc_count": 12 166 | }, 167 | ] 168 | } 169 | }, 170 | { 171 | 'key': 'b', 172 | 'agg_key2': { 173 | 'buckets': [ 174 | { 175 | 'key': 'x', 176 | 'f1': {'value': 300}, 177 | 'f2': {'value': 3}, 178 | "doc_count": 13 179 | }, 180 | { 181 | 'key': 'y', 182 | 'f1': {'value': 400}, 183 | 'f2': {'value': 4}, 184 | "doc_count": 14 185 | }, 186 | ] 187 | } 188 | } 189 | ] 190 | } 191 | } 192 | } 193 | 194 | agg.explain_result() 195 | print(agg.result) 196 | 197 | self.assertEqual(agg.result, 198 | [{'f1': 100, 'f2': 1, 'doc_count': 11}, 199 | {'f1': 200, 'f2': 2, 'doc_count': 12}, 200 | {'f1': 300, 'f2': 3, 'doc_count': 13}, 201 | {'f1': 400, 'f2': 4, 'doc_count': 14}]) 202 | self.assertEqual(agg.index, [('a', 'x'), 203 | ('a', 'y'), 204 | ('b', 'x'), 205 | ('b', 'y')]) 206 | 207 | 208 | if __name__ == '__main__': 209 | unittest.main() 210 | -------------------------------------------------------------------------------- /tests/test_types.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import unittest 3 | 4 | from pandasticsearch.operators import * 5 | from pandasticsearch.types import Row, Column 6 | 7 | 8 | class TestSchema(unittest.TestCase): 9 | def test_row(self): 10 | row = Row(a=1, b='你好,世界') 11 | print(repr(row)) 12 | 13 | self.assertEqual(row['a'], 1) 14 | self.assertEqual(row['b'], '你好,世界') 15 | self.assertEqual(row.as_dict(), {'a': 1, 'b': '你好,世界'}) 16 | 17 | def test_column(self): 18 | col = Column('b') 19 | self._assert_equal_filter(col > 2, Greater('b', 2)) 20 | self._assert_equal_filter(col >= 2, GreaterEqual('b', 2)) 21 | self._assert_equal_filter(col < 2, Less('b', 2)) 22 | self._assert_equal_filter(col <= 2, LessEqual('b', 2)) 23 | self._assert_equal_filter(col == 2, Equal('b', 2)) 24 | self._assert_equal_filter(col != 2, ~Equal('b', 2)) 
25 | self._assert_equal_filter(col.isin([1, 2, 3]), IsIn('b', [1, 2, 3])) 26 | self._assert_equal_filter(col.like('a*b'), Like('b', 'a*b')) 27 | self._assert_equal_filter(col.rlike('a*b'), Rlike('b', 'a*b')) 28 | self._assert_equal_filter(col.startswith('jj'), Startswith('b', 'jj')) 29 | self._assert_equal_filter(col.isnull, IsNull('b')) 30 | self._assert_equal_filter(col.notnull, NotNull('b')) 31 | 32 | def _assert_equal_filter(self, x, y): 33 | self.assertTrue(x, BooleanFilter) 34 | self.assertTrue(y, BooleanFilter) 35 | self.assertEqual(x.build(), y.build()) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | --------------------------------------------------------------------------------
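The operator tests above pin down how `BooleanFilter` objects compose under `&`, `|` and `~`; one final sketch of a compound filter (the field names are hypothetical):

```python
# Sketch: composing filters with &, | and ~, as exercised by the tests above.
# Field names are hypothetical.
from pandasticsearch.operators import GreaterEqual, Less, Equal

compound = (GreaterEqual('age', 10) & Less('age', 20)) | Equal('gender', 'male')
print(compound.build())
# {'bool': {'should': [{'bool': {'must': [{'range': {'age': {'gte': 10}}},
#                                         {'range': {'age': {'lt': 20}}}]}},
#                      {'term': {'gender': 'male'}}]}}

# DataFrame._build_query then wraps this as
# {'query': {'bool': {'filter': compound.build()}}} for compat >= 5,
# or {'query': {'filtered': {'filter': ...}}} for older clusters.
```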