├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST
├── MANIFEST.in
├── README.md
├── README.rst
├── __init__.py
├── invalid_raw_data.json
├── jsonpyes
├── jsonpyes.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   ├── requires.txt
│   └── top_level.txt
├── jsonpyes.py
├── jsonpyes_contrib
│   ├── __init__.py
│   └── utils.py
├── raw_data.json
├── requirements.txt
├── setup.cfg
├── setup.py
└── static
    ├── algo.png
    ├── jsonpyes_data_processing_diagram.png
    ├── snapshot100.jpg
    ├── snapshot101.jpg
    ├── snapshot102.jpg
    ├── snapshot103.jpg
    ├── snapshot104.jpg
    ├── snapshot105.jpg
    ├── snapshot106.jpg
    ├── snapshot132.png
    ├── snapshot133.png
    ├── snapshot135.png
    ├── snapshot139.png
    ├── snapshot235.png
    ├── snapshot236.png
    ├── snapshot237.png
    ├── snapshot98.jpg
    └── snapshot99.jpg
/.gitignore: -------------------------------------------------------------------------------- 1 | install.txt 2 | # Ignore temp big raw JSON data file 3 | raw_big_data.json 4 | # Ignore .directory created by the file manager Dolphin 5 | .directory 6 | # For csv.gz data 7 | *.csv.gz 8 | # For aria2 downloading temporary files 9 | *.aria2 10 | # For url(s) which were captured by pandas from HTML pages 11 | *.url 12 | # For ruby tool "dap" packages 13 | # Ignore rvm files 14 | .ruby-version 15 | .ruby-gemset 16 | 17 | # Ignore geoip data file 18 | data/geoip.dat 19 | 20 | data/ 21 | env_python27_nervey/ 22 | *.swp 23 | *.swo 24 | *.swq 25 | # Byte-compiled / optimized / DLL files 26 | __pycache__/ 27 | *.py[cod] 28 | *~ 29 | .idea/ 30 | .idea/* 31 | # C extensions 32 | *.so 33 | # Distribution / packaging 34 | .Python 35 | env/ 36 | bin/ 37 | build/ 38 | develop-eggs/ 39 | dist/ 40 | eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | # uncomment the line to enable pip uninstallation 47 | #*.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | # Unit test / coverage reports 54 | htmlcov/ 55 | .tox/ 56 | .coverage 57 | 
.cache 58 | nosetests.xml 59 | coverage.xml 60 | # Translations 61 | *.mo 62 | # Mr Developer 63 | .mr.developer.cfg 64 | .project 65 | .pydevproject 66 | # Rope 67 | .ropeproject 68 | # Django stuff: 69 | *.log 70 | *.pot 71 | # Sphinx documentation 72 | docs/_build/ 73 | # setup.py install (package cache, no need) 74 | .eggs 75 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Example 2 | #language: ruby 3 | #rvm: 4 | # - "1.8.7" 5 | # - "1.9.2" 6 | # - "1.9.3" 7 | # - rbx 8 | ## uncomment this line if your project needs to run something other than : 9 | ## script: bundle exec rspec spec 10 | # 11 | # 12 | language: python 13 | python: 14 | # give up 2.6 support 15 | # - "2.6" 16 | - "2.7" 17 | #- "3.2" 18 | #- "3.3" 19 | - "3.4" 20 | - "3.5" 21 | - "3.6" 22 | #- "3.5-dev" # 3.5 development branch 23 | #- "nightly" # currently points to 3.6-dev 24 | # command to install dependencies 25 | #install: "pip install -r requirements.txt" 26 | install: 27 | - pip install -r requirements.txt 28 | - pip install coveralls 29 | # commands to run tests, including coverage for coveralls 30 | # (kept as one script key: duplicate YAML keys would make the later one override the earlier) 31 | # more info at https://github.com/coagulant/coveralls-python 32 | script: 33 | - nosetests 34 | - coverage run --source=jsonpyes setup.py test 35 | 36 | after_success: 37 | coveralls 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | __________________ 2 | / ____/_ __/ ____/ 3 | / /_ / / / __/ 4 | / __/ / / / /___ 5 | /_/ /_/ /_____/ 6 | 7 | FREER THAN EVER PUBLIC LICENSE 8 | 9 | Version 1, March 2016 10 | 11 | Copyright (C) 2016 Alexander Liu 12 | 13 | If there exists a most free license, this will be freer than it. 14 | Everyone is permitted to copy, distribute or modify anything under this license. 
15 | 16 | FREER THAN EVER PUBLIC LICENSE 17 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 18 | 19 | 0. You are freer than ever to copy, distribute and modify anything under this license. 20 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | LICENSE 3 | README.md 4 | jsonpyes.py 5 | setup.cfg 6 | setup.py 7 | contrib/__init__.py 8 | contrib/utils.py 9 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst README.md LICENSE jsonpyes.py jsonpyes 2 | # file GENERATED by distutils, do NOT edit 3 | #jsonpyes.py 4 | #setup.cfg 5 | #setup.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | json-py-es 2 | ========== 3 | 4 | [![Downloads](https://pepy.tech/badge/jsonpyes)](https://pepy.tech/project/jsonpyes) 5 | [![Build Status](https://travis-ci.org/xros/jsonpyes.svg?branch=master)](https://travis-ci.org/xros/jsonpyes) 6 | [![GitHub release](https://img.shields.io/github/release/xros/jsonpyes.svg)](https://github.com/xros/jsonpyes/releases) 7 | [![GitHub license](https://img.shields.io/github/license/xros/jsonpyes.svg)](https://github.com/xros/jsonpyes/blob/master/LICENSE) 8 | 9 | Alexander Liu 10 | 11 | * To import raw JSON data files to ElasticSearch in one line of commands 12 | 13 | ![jsonpyes diagram](static/jsonpyes_data_processing_diagram.png) 14 | 15 | Very fast -- 4 to 10 times faster when processing big data. 16 | 17 | 18 | ### Installation 19 | 20 | ```pip install jsonpyes``` 21 | 22 | > **Notice**: Before using `pip` to install `jsonpyes`, you first need to have `python-pip` installed on your system. 
( Supports Python 2.7, 3.4, 3.5, 3.6 ) 22 | 23 | 24 | 25 | jsonpyes 26 | -------- 27 | 28 | ![user interface](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot236.png) 29 | 30 | #### Instructions: 31 | There are 3 processes for importing raw JSON data to ElasticSearch 32 | 1. Only validating raw JSON data 33 | 2. Without validating, just import data to ElasticSearch 34 | 3. After validating successfully, then import data to ElasticSearch 35 | 36 | A valid JSON file here refers to a file with one JSON document per line 37 | 38 | File valid_data.json and its content: 39 | 40 | {"key1": "valueA", "key2": {"sub_key1": "value2A", "sub_key2": ["Good", "Morning"]}} 41 | {"key1": "valueB", "key2": {"sub_key1": "value2B", "sub_key2": ["Good", "Afternoon"]}} 42 | ... 43 | {"key1": "valueC", "key2": {"sub_key1": "value2C", "sub_key2": ["Good", "Evening"]}} 44 | 45 | 46 | 47 | Functions included 48 | ------------------ 49 | 50 | ##### 1. Validating JSON format data 51 | 52 | ```jsonpyes --data raw_data.json --check``` 53 | 54 | If the JSON data file is valid: 55 | 56 | ![json valid](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot98.jpg) 57 | 58 | If the JSON data file is invalid: 59 | 60 | ![json invalid](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot99.jpg) 61 | 62 | ##### 2. Only importing without validating 63 | 64 | ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2``` 65 | 66 | Notice: If the raw JSON data file is invalid, `jsonpyes` will not import it. 67 | 68 | Or enable multiple threads: ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2 --thread 8``` 69 | 70 | ![no threads](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot102.jpg) 71 | 72 | ```jsonpyes``` supports multiple threads when importing data to Elasticsearch 73 | 74 | 75 | ##### Multi-threads comparison 76 | 77 | 1. 
No multi-threads 78 | 79 | ![benchmarks](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot237.png) 80 | 81 | 2. With 8 threads, `jsonpyes` cuts the file into pieces, then distributes them to workers evenly 82 | 83 | ![use helpers.bulk API with multi-threads](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot235.png) 84 | 85 | > As you can see, the two containers have the same docs loaded; with **_--thread 8_** it can be several times faster, usually 5 to 10 times faster. 86 | > That really depends on your computer/server resources. 87 | This was tested on a 4GB RAM / 2.4 GHz Intel i5 Linux x64 laptop system. 88 | 89 | And it works. 90 | 91 | ![it works](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot105.jpg) 92 | 93 | ##### 3. Both validating and importing 94 | 95 | ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex1 --type mytype1 --check``` 96 | 97 | ![validating and importing](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot135.png) 98 | 99 | And it works. 100 | 101 | ![the results](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot101.jpg) 102 | 103 | 104 | Reference 105 | --------- 106 | * Algorithm handwriting 107 | 108 | ![handwriting](https://raw.githubusercontent.com/xros/jsonpyes/master/static/algo.png) 109 | 110 | ##### Happy hacking! 111 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | json-py-es 2 | ========== 3 | 4 | |Downloads| |Build Status| |GitHub release| |GitHub license| 5 | 6 | Alexander Liu 7 | 8 | - To import raw JSON data files to ElasticSearch in one line of 9 | commands 10 | 11 | .. figure:: static/jsonpyes_data_processing_diagram.png 12 | :alt: jsonpyes diagram 13 | 14 | jsonpyes diagram 15 | 16 | Very fast -- 4 to 10 times faster when processing big data. 
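Much of that speed comes from batching documents through the Elasticsearch bulk API instead of indexing them one at a time. The sketch below shows the per-line action dicts jsonpyes builds before handing them to ``elasticsearch.helpers.bulk`` (the standalone helper name ``build_bulk_actions`` is illustrative, not part of jsonpyes):

```python
import json

def build_bulk_actions(lines, index, doc_type):
    """Turn JSON-lines records into Elasticsearch bulk actions.

    Mirrors the action dict layout jsonpyes builds for
    elasticsearch.helpers.bulk(); unparseable lines are skipped,
    much as the threaded worker skips them after logging a warning.
    """
    actions = []
    for line in lines:
        try:
            source = json.loads(line)
        except ValueError:
            continue  # skip invalid rows
        actions.append({
            "_index": index,
            "_type": doc_type,
            "_source": source,
        })
    return actions

rows = [
    '{"key1": "valueA", "key2": {"sub_key1": "value2A"}}',
    'not json at all',
    '{"key1": "valueB", "key2": {"sub_key1": "value2B"}}',
]
actions = build_bulk_actions(rows, index="myindex1", doc_type="mytype1")
```

In the tool itself, such batches are flushed with ``helpers.bulk(es, actions)`` once they grow large enough, which is what amortizes the per-request overhead.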
17 | 18 | Installation 19 | ~~~~~~~~~~~~ 20 | 21 | ``pip install jsonpyes`` 22 | 23 | **Notice**: Before using ``pip`` to install ``jsonpyes``, you first 24 | need to have ``python-pip`` installed on your system. ( Supports 25 | Python 2.7, 3.4, 3.5, 3.6 ) 26 | 27 | jsonpyes 28 | -------- 29 | 30 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot236.png 31 | :alt: user interface 32 | 33 | user interface 34 | 35 | Instructions: 36 | ^^^^^^^^^^^^^ 37 | 38 | :: 39 | 40 | There are 3 processes for importing raw JSON data to ElasticSearch 41 | 1. Only validating raw JSON data 42 | 2. Without validating, just import data to ElasticSearch 43 | 3. After validating successfully, then import data to ElasticSearch 44 | 45 | A valid JSON file here refers to a file with one JSON document per line 46 | 47 | File valid_data.json and its content: 48 | 49 | {"key1": "valueA", "key2": {"sub_key1": "value2A", "sub_key2": ["Good", "Morning"]}} 50 | {"key1": "valueB", "key2": {"sub_key1": "value2B", "sub_key2": ["Good", "Afternoon"]}} 51 | ... 52 | {"key1": "valueC", "key2": {"sub_key1": "value2C", "sub_key2": ["Good", "Evening"]}} 53 | 54 | Functions included 55 | ------------------ 56 | 57 | 1. Validating JSON format data 58 | '''''''''''''''''''''''''''''' 59 | 60 | ``jsonpyes --data raw_data.json --check`` 61 | 62 | If the JSON data file is valid: 63 | 64 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot98.jpg 65 | :alt: json valid 66 | 67 | json valid 68 | 69 | If the JSON data file is invalid: 70 | 71 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot99.jpg 72 | :alt: json invalid 73 | 74 | json invalid 75 | 76 | 2. 
Only importing without validating 77 | '''''''''''''''''''''''''''''''''''' 78 | 79 | ``jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2`` 80 | 81 | Notice: If the raw JSON data file is invalid, ``jsonpyes`` will not 82 | import it. 83 | 84 | Or enable multiple threads 85 | ``jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2 --thread 8`` 86 | 87 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot102.jpg 88 | :alt: no threads 89 | 90 | no threads 91 | 92 | ``jsonpyes`` supports multiple threads when importing data to Elasticsearch 93 | 94 | Multi-threads comparison 95 | '''''''''''''''''''''''' 96 | 97 | 1. No multi-threads 98 | 99 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot237.png 100 | :alt: benchmarks 101 | 102 | benchmarks 103 | 104 | 2. With 8 threads, ``jsonpyes`` cuts the file into pieces, then 105 | distributes them to workers evenly 106 | 107 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot235.png 108 | :alt: use helpers.bulk API with multi-threads 109 | 110 | use helpers.bulk API with multi-threads 111 | 112 | As you can see, the two containers have the same docs loaded; with 113 | ***--thread 8*** it can be several times faster, usually 5 to 10 114 | times faster. That really depends on your computer/server resources. 115 | This was tested on a 4GB RAM / 2.4 GHz Intel i5 Linux x64 laptop 116 | system. 117 | 118 | And it works. 119 | 120 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot105.jpg 121 | :alt: it works 122 | 123 | it works 124 | 125 | 3. Both validating and importing 126 | '''''''''''''''''''''''''''''''' 127 | 128 | ``jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex1 --type mytype1 --check`` 129 | 130 | .. 
figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot135.png 131 | :alt: validating and importing 132 | 133 | validating and importing 134 | 135 | And it works. 136 | 137 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot101.jpg 138 | :alt: the results 139 | 140 | the results 141 | 142 | Reference 143 | --------- 144 | 145 | - Algorithm handwriting 146 | 147 | .. figure:: https://raw.githubusercontent.com/xros/jsonpyes/master/static/algo.png 148 | :alt: handwriting 149 | 150 | handwriting 151 | 152 | Happy hacking! 153 | '''''''''''''' 154 | 155 | .. |Downloads| image:: https://pepy.tech/badge/jsonpyes 156 | :target: https://pepy.tech/project/jsonpyes 157 | .. |Build Status| image:: https://travis-ci.org/xros/jsonpyes.svg?branch=master 158 | :target: https://travis-ci.org/xros/jsonpyes 159 | .. |GitHub release| image:: https://img.shields.io/github/release/xros/jsonpyes.svg 160 | :target: https://github.com/xros/jsonpyes/releases 161 | .. 
|GitHub license| image:: https://img.shields.io/github/license/xros/jsonpyes.svg 162 | :target: https://github.com/xros/jsonpyes/blob/master/LICENSE 163 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from contrib.utils import count_file_lines 2 | __all__ = ['contrib'] 3 | -------------------------------------------------------------------------------- /invalid_raw_data.json: -------------------------------------------------------------------------------- 1 | {"ip":"74.63.161.124","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"EOF","error_component":"banner"} 2 | {"ip":"65.36.165.102","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 MORS.ad.safesecureweb.com Microsoft ESMTP MAIL Service, Version: 6.0.3790.4675 ready at Fri, 16 Oct 2015 13:57:02 -0400 \r\n","ehlo":"250-MORS.ad.safesecureweb.com Hello [141.212.122.128]\r\n250-TURN\r\n250-SIZE 2097152\r\n250-ETRN\r\n250-PIPELINING\r\n250-DSN\r\n250-ENHANCEDSTATUSCODES\r\n250-8bitmime\r\n250-BINARYMIME\r\n250-CHUNKING\r\n250-VRFY\r\n250 OK\r\n","starttls":"554 5.7.3 Unable to initialize security subsystem\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 3 | {"ip":"23.91.113.228","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 poppy.arvixe.com ESMTP\r\n","ehlo":"250-poppy.arvixe.com\r\n250-SIZE 20480000\r\n250 AUTH LOGIN\r\n","starttls":"502 STARTTLS NOT ALLOWED.\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 4 | {"ip":"204.3.67.57","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 center-transforming-relationships.com ESMTP \r\n","ehlo":"250-center-transforming-relationships.com \r\n250-PIPELINING\r\n250-SIZE 36700160\r\n250 8BITMIME\r\n","starttls":"502 unimplemented (#5.5.1)\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 5 | 
{"ip":"45.43.231.247","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"read tcp 45.43.231.247:25: connection reset by peer","error_component":"banner"} 6 | {"ip":"204.73.44.226","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 Augusoft Internal Email Server At Your Service\r\n","ehlo":"250-mrsshepherd.com [141.212.122.128], this server offers 4 extensions\r\n250-AUTH LOGIN\r\n250-SIZE 20480000\r\n250-HELP\r\n250 AUTH=LOGIN\r\n","starttls":"503 Bad sequence of commands\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 7 | {"ip":"200.46.210.154","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"read tcp 200.46.210.154:25: connection reset by peer","error_component":"banner"} 8 | {"ip":"50.28.80.160","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220-raptor.ws9000.net ESMTP Exim 4.85 #2 Fri, 16 Oct 2015 14:00:13 -0400 \r\n220-We do not authorize the use of this system to transport unsolicited, \r\n220 and/or bulk e-mail.\r\n","ehlo":"250-raptor.ws9000.net Hello researchscan383.eecs.umich.edu [141.212.122.128]\r\n250-SIZE 52428800\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go ahead\r\n","tls":{"client_hello":{"random":"tF27lvgudpkDoEsorUp0saOIgVPfjc7ZbJFCmAZxbHQ="}}},"error":"remote error: handshake failure","error_component":"starttls"} 9 | {"ip":"50.93.209.78","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220-charybdis.restechservices.net ESMTP Exim 4.85 #2 Fri, 16 Oct 2015 13:00:12 -0500 \r\n220-We do not authorize the use of this system to transport unsolicited, \r\n220 and/or bulk e-mail.\r\n","ehlo":"250-charybdis.restechservices.net Hello researchscan383.eecs.umich.edu [141.212.122.128]\r\n250-SIZE 52428800\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go 
ahead\r\n","tls":{"client_hello":{"random":"FT0jMqzr1KkF/eurB1K2xdgfNLw0K8+9P6HwMtrSaWw="}}},"error":"remote error: handshake failure","error_component":"starttls"} 10 | "ip":"66.96.180.160","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 ESMTP Fri, 16 Oct 2015 14:00:13 -0400: UCE strictly prohibited\r\n","ehlo":"250-bosauthsmtp10.yourhostingaccount.com Hello eecs.umich.edu [141.212.122.128]\r\n250-SIZE 34603008\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go ahead\r\n","tls":{"client_hello":{"random":"SoAhgklxKMZefAJywXW8FjEg+815O8NpsTwx2sQxvtQ="}}},"error":"remote error: handshake failure","error_component":"starttls"} 11 | -------------------------------------------------------------------------------- /jsonpyes: -------------------------------------------------------------------------------- 1 | jsonpyes.py -------------------------------------------------------------------------------- /jsonpyes.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: jsonpyes 3 | Version: 1.4.2 4 | Summary: A Tool to Import JSON raw data to ElasticSearch in one line of commands 5 | Home-page: https://github.com/xros/jsonpyes 6 | Author: Alexander Liu 7 | Author-email: alex@nervey.com 8 | License: FTE V1 9 | Download-URL: https://github.com/xros/jsonpyes/archive/1.4.2.zip 10 | Description: json-py-es 11 | ========== 12 | 13 | [![Downloads](https://pepy.tech/badge/jsonpyes)](https://pepy.tech/project/jsonpyes) 14 | [![Build Status](https://travis-ci.org/xros/jsonpyes.svg?branch=master)](https://travis-ci.org/xros/jsonpyes) 15 | [![GitHub release](https://img.shields.io/github/release/xros/jsonpyes.svg)](https://github.com/xros/jsonpyes/releases) 16 | [![GitHub license](https://img.shields.io/github/license/xros/jsonpyes.svg)](https://github.com/xros/jsonpyes/blob/master/LICENSE) 17 | 18 | Alexander Liu 19 | 20 | * To 
import raw JSON data files to ElasticSearch in one line of commands 21 | 22 | ![jsonpyes diagram](static/jsonpyes_data_processing_diagram.png) 23 | 24 | Very fast -- 4 to 10 times faster when processing big data. 25 | 26 | 27 | ### Installation 28 | 29 | ```pip install jsonpyes``` 30 | 31 | > **Notice**: Before using `pip` to install `jsonpyes`, firstly you need to install `python-pip` on your system. ( Supports Python2.7, 3,4, 3.5, 3.6 ) 32 | 33 | 34 | jsonpyes 35 | -------- 36 | 37 | ![user interface](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot236.png) 38 | 39 | #### Instructions: 40 | There are 3 proccesses of importing raw JSON data to ElasticSearch 41 | 1. Only validating raw JSON data 42 | 2. Without validating ,just import data to ElasticSearch 43 | 3. After validating successfully, then import data to ElasticSearch 44 | 45 | A valid JSON file here refers to a JSON file stacked with many lines of data 46 | 47 | file valid_data.json and its content 48 | 49 | {"key1": "valueA", "key2": {"sub_key1": "value2A", "sub_key2": ["Good", "Morning"]}} 50 | {"key1": "valueB", "key2": {"sub_key1": "value2B", "sub_key2": ["Good", "Afternoon"]}} 51 | ... 52 | {"key1": "valueC", "key2": {"sub_key1": "value2C", "sub_key2": ["Good", "Evening"]}} 53 | 54 | 55 | 56 | Functions included 57 | ------------------ 58 | 59 | ##### 1. Validating JSON format data 60 | 61 | ```jsonpyes --data raw_data.json --check``` 62 | 63 | If the json data file is valid: 64 | 65 | ![json valid](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot98.jpg) 66 | 67 | If the json data file is invalid: 68 | 69 | ![json invalid](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot99.jpg) 70 | 71 | ##### 2. Only importing without validating 72 | 73 | ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2``` 74 | 75 | Notice: If the raw JSON data file is invalid, `jsonpyes` will not import it. 
76 | 77 | Or enable multi-threads ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex2 --type mytype2 --thread 8``` 78 | 79 | ![no threads](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot102.jpg) 80 | 81 | ```jsonpyes``` supports multi-threads when importing data to elasticsearch 82 | 83 | 84 | ##### Multi-threads comparison 85 | 86 | 1. No multi-threads 87 | 88 | ![benchmarks](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot237.png) 89 | 90 | 2. With 8 threads and `jsonpyes` cuts files into pieces, then destributes to workers fairly 91 | 92 | ![use helpers.bulk API with multi-threads](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot235.png) 93 | 94 | > As you can see these two containers have same docs loaded, if we use **_--thread 8_** it could be several times faster, usually 5 to 10 times faster. 95 | > That really depends on your computer/server resources. 96 | This was tested on a 4GB RAM / 2.4Ghz intel i5 Linux x64 laptop system. 97 | 98 | And it works. 99 | 100 | ![it works](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot105.jpg) 101 | 102 | ##### 3. Both validating and importing 103 | 104 | ```jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex1 --type mytype1 --check``` 105 | 106 | ![validating and importing](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot135.png) 107 | 108 | And it works. 109 | 110 | ![the results](https://raw.githubusercontent.com/xros/jsonpyes/master/static/snapshot101.jpg) 111 | 112 | 113 | Reference 114 | --------- 115 | * Algorithm handwritting 116 | 117 | ![handwritting](https://raw.githubusercontent.com/xros/jsonpyes/master/static/algo.png) 118 | 119 | ##### Happy hacking! 
120 | 121 | Keywords: elasticsearch,json,json2es,jsonpyes 122 | Platform: Unix 123 | Platform: Linux 124 | Platform: OSX 125 | Platform: Android 126 | Platform: Windows 127 | -------------------------------------------------------------------------------- /jsonpyes.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | MANIFEST.in 3 | README.md 4 | README.rst 5 | jsonpyes 6 | jsonpyes.py 7 | setup.cfg 8 | setup.py 9 | jsonpyes.egg-info/PKG-INFO 10 | jsonpyes.egg-info/SOURCES.txt 11 | jsonpyes.egg-info/dependency_links.txt 12 | jsonpyes.egg-info/requires.txt 13 | jsonpyes.egg-info/top_level.txt 14 | jsonpyes_contrib/__init__.py 15 | jsonpyes_contrib/utils.py -------------------------------------------------------------------------------- /jsonpyes.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /jsonpyes.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | simplejson 3 | -------------------------------------------------------------------------------- /jsonpyes.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | jsonpyes_contrib 2 | -------------------------------------------------------------------------------- /jsonpyes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # encoding: utf-8 3 | """ 4 | __________________ 5 | / ____/_ __/ ____/ 6 | / /_ / / / __/ 7 | / __/ / / / /___ 8 | /_/ /_/ /_____/ 9 | 10 | FREER THAN EVER PUBLIC LICENSE 11 | 12 | Version 1, March 2016 13 | 14 | Copyright (C) 2016 Alexander Liu 15 | 16 | If there exists a most free license, this will be freer than it. 
17 | Everyone is permitted to copy, distribute or modify anything under this license. 18 | 19 | FREER THAN EVER PUBLIC LICENSE 20 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 21 | 22 | 0. You are freer than ever to copy, distribute and modify anything under this license. 23 | """ 24 | 25 | 26 | from elasticsearch import Elasticsearch 27 | from elasticsearch import helpers 28 | import sys 29 | import subprocess 30 | import threading 31 | import linecache 32 | import jsonpyes_contrib 33 | from jsonpyes_contrib.utils import count_file_lines as c_file_lines 34 | import logging 35 | import time 36 | 37 | try: 38 | import simplejson as json 39 | except ImportError: 40 | import json 41 | 42 | __author__ = "Alexander Liu" 43 | 44 | 45 | version = jsonpyes_contrib.__version__ 46 | 47 | 48 | """ 49 | Instructions: 50 | There are 3 processes for importing raw JSON data to ElasticSearch 51 | 1. Only validating raw JSON data 52 | 2. Without validating, just import data to ElasticSearch 53 | 3. After validating successfully, then import data to ElasticSearch 54 | """ 55 | 56 | es = Elasticsearch(['http://localhost:9200'], verify_certs=True) 57 | 58 | def show_version(): 59 | print(version) 60 | 61 | 62 | def show_help(): 63 | print(""" 64 | _______ ____ _ __ ______ __ ___________ 65 | / / ___// __ \/ | / / / __ \ \/ / / ____/ ___/ 66 | __ / /\__ \/ / / / |/ /_____/ /_/ /\ /_____/ __/ \__ \\ 67 | / /_/ /___/ / /_/ / /| /_____/ ____/ / /_____/ /___ ___/ / 68 | \____//____/\____/_/ |_/ /_/ /_/ /_____//____/ 69 | 70 | Import raw JSON to ElasticSearch in one line of commands 71 | -- Alexander Liu 72 | 73 | """ 74 | + 75 | 76 | version 77 | 78 | + 79 | """ 80 | 81 | 82 | Options include: 83 | 84 | --data : The JSON data file 85 | --check : Check whether the file is valid raw JSON for ElasticSearch 86 | --bulk : ElasticSearch bulk API address 87 | --index : Index name 88 | --type : Index type 89 | --import : Import raw JSON data to ES. 
This process does "--check" and data importing 90 | --thread : Number of threads, default 1. More threads make importing and checking faster 91 | --version : Prints the version number 92 | --help : Display this help 93 | 94 | Notice: 95 | 96 | It's recommended to use multiple threads when importing data, because it's much faster. 97 | 98 | 99 | Examples: 100 | 101 | 1) Only check 102 | > $~ jsonpyes --data raw_data.json --check 103 | > All raw JSON data valid! 104 | 105 | 2) Only import without checking 106 | > $~ jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex1 --type mytype1 107 | > Successfully data imported! 108 | 109 | 3) Import after checking successfully with 8 threads 110 | > $~ jsonpyes --data raw_data.json --bulk http://localhost:9200 --import --index myindex1 --type mytype1 --check --thread 8 111 | > All raw JSON data valid! 112 | > Successfully data imported! 113 | 114 | 115 | """) 116 | 117 | def validate_json_data(json_file=""): 118 | """ 119 | To validate whether the JSON data file is fully a JSON file without any format invalidation 120 | """ 121 | if str(json_file)=="": 122 | raise ValueError("No JSON file was input\n") 123 | else: 124 | try: 125 | f = open(json_file, 'r') 126 | except IOError as e: 127 | raise IOError('Can not open the file "%s" with error \n%s\n' % (json_file, str(e))) 128 | else: 129 | f.close() 130 | with open(json_file, 'r') as f: 131 | for line in f: 132 | # try to load each line of JSON data and convert it into a Python object 133 | try: 134 | one_dict = json.loads(line) 135 | except Exception as e: 136 | print("JSON data not valid with error \n %s \n" % (str(e))) 137 | return False 138 | else: 139 | pass 140 | # assume all JSON valid 141 | return True 142 | 143 | 144 | def worker_import_to_es_for_threading(data='a_raw_file.json', start_line=0, stop_line=0, es=es, index="", doc_type=""): 145 | # NOTICE: the 'start_line' and 'stop_line' are both included. 
'stop_line' cannot be omitted. 146 | actions = [] 147 | try_times = 0 148 | es = es 149 | # Using linecache to read big data in RAM 150 | for i in range(start_line, stop_line + 1): 151 | # Enhancement: this version of jsonpyes uses `elasticsearch.helpers.bulk`. Thanks to a suggestion from piterjoin 152 | row = linecache.getline(data, i) 153 | try: 154 | action = { 155 | "_index": index, 156 | "_type": doc_type, 157 | "_source": json.loads(row) 158 | } 159 | except Exception as e: 160 | logging.warning(str(e)) 161 | continue 162 | 163 | actions.append(action) 164 | 165 | # https://elasticsearch-py.readthedocs.org/en/master/helpers.html?highlight=helpers#module-elasticsearch.helpers 166 | # In some cases, the size of 1000 docs can be around 10 MB, and they are stored in RAM 167 | 168 | if len(actions) >= 5000: 169 | # try several times if ES rejects, is busy or down 170 | while try_times < 5: 171 | try: 172 | # a single chunk is assumed to be at most around 200 MB 173 | helpers.bulk(es, actions) 174 | try_times = 0 175 | break 176 | except Exception as e: 177 | try_times = try_times + 1 178 | logging.warning("Can not send a group of actions(docs) to ElasticSearch using helpers.bulk, with error: " + str(e)) 179 | # wait for ElasticSearch to respond 180 | time.sleep(5) 181 | if try_times >= 5: 182 | msg = "After trying " + str(try_times) + \ 183 | " times. 
It still can not send a group of actions(docs) to ElasticSearch using parallel_bulk, with error: " + str(e) 184 | logging.error(msg) 185 | try_times = 0 186 | 187 | # delete previous docs 188 | del actions[0:len(actions)] 189 | 190 | # clear all the caches out of the for-loop every time -- loop 191 | linecache.clearcache() 192 | 193 | # if we have leftovers, finish them 194 | if len(actions) > 0: 195 | try: 196 | helpers.bulk(es, actions) 197 | except Exception as e: 198 | logging.warning("Can not send a group of actions(docs) to ElasticSearch using parallel_bulk, with error: " + str(e)) 199 | # delete previous docs 200 | del actions[0:len(actions)] 201 | 202 | # terminate this job 203 | return 204 | 205 | def new_return_start_stop_for_multi_thread_in_list(lines=0, thread_amount=1): 206 | """Return a list 207 | Return lines to read for each thread equally 208 | for example. 37 lines, 4 threads 209 | 210 | [ 211 | {"start": 1, "stop": 10}, 212 | {"start": 11, "stop": 20}, 213 | {"start": 21, "stop": 30}, 214 | {"start": 31, "stop": 37} 215 | ] 216 | 217 | # start from line 1, ends at line 37 (includes line 37 in data file) 218 | 219 | """ 220 | 221 | 222 | # lets assume if there were 17 lines and 4 threads, 223 | # thread (1)(2)(3) can have 5 job tasks maximumly. 
thread (4) only has 2 job tasks 224 | # 225 | # their job list: 226 | # iter 0 iter 1 iter 2 iter 3 227 | # thread 1 thread 2 thread 3 thread 4 228 | # line/job num 1,2,3,4,5 6,7,8,9,10 11,12,13,14,15 16,17 229 | # 230 | # iter means iteration 231 | # 232 | 233 | start_stop_line_list = [] 234 | each_has = lines / thread_amount 235 | # last_remains = lines - (thread_amount * each_has) 236 | last_remains = lines % thread_amount # 17 % 4 -> 1 237 | 238 | for t in range(thread_amount): 239 | start_stop_line_list.append( 240 | { 241 | "start": each_has * t + 1, 242 | "stop": each_has * ( t + 1) 243 | } 244 | ) 245 | if last_remains > 0: 246 | start_stop_line_list[-1] = { 247 | "start": each_has * (thread_amount - 1) + 1, 248 | "stop": lines 249 | } 250 | 251 | return start_stop_line_list 252 | 253 | 254 | 255 | 256 | 257 | class StoppableThread(threading.Thread): 258 | """Thread class with a stop() method. The thread itself has to check 259 | regularly for the stopped() condition.""" 260 | 261 | def __init__(self, *args, **kwargs): 262 | super(StoppableThread, self).__init__() 263 | self._stop = threading.Event() 264 | 265 | def stop(self): 266 | self._stop.set() 267 | 268 | def stopped(self): 269 | return self._stop.isSet() 270 | 271 | 272 | 273 | class Jsonpyes(object): 274 | """Re-edit this in the future maybe. 
275 | Pending; not needed for now 276 | """ 277 | 278 | def __init__(self): 279 | pass 280 | 281 | def count_file_lines(json_file=""): 282 | return c_file_lines(json_file) 283 | 284 | def validate_json_data(json_file=""): 285 | """ 286 | Validate that every line of the JSON data file is itself valid JSON (newline-delimited JSON) 287 | """ 288 | if str(json_file)=="": 289 | raise ValueError("No JSON file was input\n") 290 | else: 291 | try: 292 | f = open(json_file, 'r') 293 | except IOError as e: 294 | raise IOError('Cannot open the file "%s" with error \n%s\n' % (json_file, str(e))) 295 | else: 296 | f.close() 297 | with open(json_file, 'r') as f: 298 | for line in f: 299 | # try to load each line of JSON data and convert it into a Python object 300 | try: 301 | one_dict = json.loads(line) 302 | except Exception as e: 303 | print("JSON data not valid with error \n %s \n" % (str(e))) 304 | return False 305 | else: 306 | pass 307 | # every JSON line is valid 308 | return True 309 | 310 | # TODO add multi-thread support 311 | def importWithOutChecking(self): 312 | pass 313 | 314 | 315 | def importAfterChecking(self): 316 | pass 317 | 318 | 319 | def run(): 320 | """Parse the command line options and dispatch the jobs 321 | """ 322 | 323 | if len(sys.argv) == 1: 324 | show_help() 325 | return 326 | else: 327 | # logic set 328 | process_jobs = [] 329 | 330 | for i in range(len(sys.argv)): 331 | if sys.argv[i].startswith("--"): 332 | try: 333 | option = sys.argv[i][2:] 334 | except: 335 | show_help() 336 | return 337 | 338 | # show version 339 | if option == "version": 340 | show_version() 341 | return 342 | 343 | # show some help 344 | elif option == "help": 345 | show_help() 346 | return 347 | 348 | # get the raw data 349 | elif option == "data": 350 | # Add info to jobs 351 | process_jobs.append( 352 | {"data": sys.argv[i+1]} 353 | ) 354 | 355 | # get the bulk URL 356 | elif option == "bulk": 357 | # Add info to jobs 358 | process_jobs.append( 359 | {"bulk": sys.argv[i+1]} 360 | ) 361 | 362 | # get the bulk index 363 | 
elif option == "index": 364 | # Add info to jobs 365 | process_jobs.append( 366 | {"index": sys.argv[i+1]} 367 | ) 368 | 369 | # get the bulk type 370 | elif option == "type": 371 | # Add info to jobs 372 | process_jobs.append( 373 | {"type": sys.argv[i+1]} 374 | ) 375 | 376 | 377 | # check raw JSON 378 | elif option == "check": 379 | # Add info to jobs 380 | process_jobs.append( 381 | "check" 382 | ) 383 | 384 | 385 | # check if bulk API is valid 386 | elif option == "import": 387 | # Add info to jobs 388 | process_jobs.append( 389 | "import" 390 | ) 391 | 392 | # add multi-threads support 393 | elif option == "thread": 394 | # Add info to jobs 395 | process_jobs.append( 396 | {"thread_amount": sys.argv[i+1]} 397 | ) 398 | process_jobs.append( 399 | "thread" 400 | ) 401 | 402 | 403 | data = "" 404 | bulk = "" 405 | index = "" 406 | doc_type = "" 407 | thread_amount = 1 408 | # Get info from process_jobs 409 | # fix the syntax bug after upgrading support from Python2.7 to Python3.6, thanks to @tdracz 410 | for job in process_jobs: 411 | if type(job) == dict: 412 | if 'data' in job: 413 | data = job['data'] 414 | if 'bulk' in job: 415 | bulk = job['bulk'] 416 | if 'index' in job: 417 | index = job['index'] 418 | if 'type' in job: 419 | doc_type = job['type'] 420 | if 'thread' in job: 421 | thread_amount = int(job['thread_amount']) 422 | 423 | 424 | #### 1) Only check not importing 425 | if ("check" in process_jobs) and ("import" not in process_jobs) : 426 | # check JSON 427 | flag = validate_json_data(json_file=data) 428 | if flag == True: 429 | print("All raw JSON data valid!") 430 | return 431 | 432 | # Process the jobs in process_jobs 433 | # 2) Only import without checking 434 | #### 2.1) import, check , no multi-threads 435 | if ("check" in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs): 436 | 437 | # check JSON 438 | flag = validate_json_data(json_file=data) 439 | if flag == True: 440 | print("All raw JSON data valid!") 441 | 
442 | es = Elasticsearch([bulk], verify_certs=True) 443 | # read JSON data 444 | with open(data, 'r') as f: 445 | for line in f: 446 | es.index(index=index, doc_type=doc_type, 447 | #id=2, 448 | body=json.loads(line) 449 | ) 450 | 451 | print("Data imported successfully!") 452 | return 453 | 454 | 455 | #### 2.2) import, no check, no multi-threads 456 | if ("check" not in process_jobs) and ("import" in process_jobs) and ("thread" not in process_jobs): 457 | es = Elasticsearch([bulk], verify_certs=True) 458 | # read JSON data 459 | with open(data, 'r') as f: 460 | for line in f: 461 | es.index(index=index, doc_type=doc_type, 462 | #id=2, 463 | body=json.loads(line) 464 | ) 465 | 466 | print("Data imported successfully!") 467 | return 468 | 469 | 470 | #### 2.3) import, no check, multi-threads 471 | if ("import" in process_jobs) and ("check" not in process_jobs) and ("thread" in process_jobs): 472 | 473 | 474 | 475 | # check file lines 476 | lines = c_file_lines(json_file=data) 477 | # if the file has fewer than 1024 lines, use a single thread no matter how many were requested 478 | if lines < 1024: 479 | #if lines < 4: # Only for debugging 480 | es = Elasticsearch([bulk], verify_certs=True) 481 | # read JSON data 482 | with open(data, 'r') as f: 483 | for line in f: 484 | es.index(index=index, doc_type=doc_type, 485 | #id=2, 486 | body=json.loads(line) 487 | ) 488 | else: 489 | # calculate how many lines each thread reads 490 | start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount) 491 | 492 | threads = [] 493 | for i in start_stop_line_list: 494 | #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) 495 | t = threading.Thread(target=worker_import_to_es_for_threading, 496 | args=(data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type, ) 497 | ) 498 | threads.append(t) 499 | t.start() 500 | # no join() inside the loop: that would block on each thread in turn and run them one at a time 501 | 502 | 503 | # wait for all threads; stop on interrupt 504 | try: 505 | while len(threading.enumerate()) > 1: 506 | time.sleep(0.1)  # avoid a busy-wait burning CPU 507 | print("Data imported successfully!") 508 | return 509 | except KeyboardInterrupt: 510 | # for i in threads: 511 | # i.stop() 512 | print("Data importing interrupted!") 513 | exit(0) 514 | return 515 | 516 | print("Data imported successfully!") 517 | return 518 | 519 | 520 | #### 2.4) import, check, multi-threads 521 | 522 | if ("import" in process_jobs) and ("check" in process_jobs) and ("thread" in process_jobs): 523 | 524 | # check JSON 525 | flag = validate_json_data(json_file=data) 526 | if flag == True: 527 | print("All raw JSON data valid!") 528 | 529 | 530 | 531 | # check file lines 532 | lines = c_file_lines(json_file=data) 533 | # if the file has fewer than 1024 lines, use a single thread no matter how many were requested 534 | if lines < 1024: 535 | #if lines < 4: # Only for debugging 536 | es = Elasticsearch([bulk], verify_certs=True) 537 | # read JSON data 538 | with open(data, 'r') as f: 539 | for line in f: 540 | es.index(index=index, doc_type=doc_type, 541 | #id=2, 542 | body=json.loads(line) 543 | ) 544 | print("Data imported successfully!") 545 | exit(0) 546 | return 547 | else: 548 | # calculate how many lines each thread reads 549 | start_stop_line_list = new_return_start_stop_for_multi_thread_in_list(lines=lines, thread_amount=thread_amount) 550 | 551 | threads = [] 552 | for i in start_stop_line_list: 553 | #t = StoppableThread(target=worker_import_to_es_for_threading, args=(data, i['start'], i['stop'])) 554 | t = threading.Thread(target=worker_import_to_es_for_threading, 555 | args=(data, i['start'], i['stop'], Elasticsearch([bulk], verify_certs=True), index, doc_type, ) 556 | ) 557 | threads.append(t) 558 | t.start() 559 | # no join() inside the loop: that would block on each thread in turn and run them one at a time 560 | 561 | 562 | # wait for all threads; stop on interrupt 563 | try: 564 | # threading.enumerate() always includes the main thread, hence "> 1" 565 | while len(threading.enumerate()) > 1: 566 | time.sleep(0.1)  # avoid a busy-wait burning CPU 567 | print("Data imported successfully!") 568 | 
exit(0) 569 | return 570 | except KeyboardInterrupt: 571 | print(len(threading.enumerate())) 572 | # for i in threads: 573 | # i.stop() 574 | print("Data importing interrupted!") 575 | exit(0) 576 | return 577 | 578 | 579 | else: 580 | show_help() 581 | return 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | if __name__ == "__main__": 590 | run() 591 | exit(0) 592 | -------------------------------------------------------------------------------- /jsonpyes_contrib/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | # version number like 1.2.3a0 or 1.2.3, must have at least 2 parts, like 1.2 4 | __version__ = '1.4.2' 5 | -------------------------------------------------------------------------------- /jsonpyes_contrib/utils.py: -------------------------------------------------------------------------------- 1 | def count_file_lines(json_file=""): 2 | """ 3 | # TODO 4 | Read lines before using multi-threads to do data importing I/O jobs 5 | """ 6 | 7 | num_lines = 0 8 | with open(json_file, 'r') as f: 9 | for line in f: 10 | num_lines += 1 11 | return num_lines 12 | 13 | -------------------------------------------------------------------------------- /raw_data.json: -------------------------------------------------------------------------------- 1 | {"ip":"74.63.161.124","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"EOF","error_component":"banner"} 2 | {"ip":"65.36.165.102","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 MORS.ad.safesecureweb.com Microsoft ESMTP MAIL Service, Version: 6.0.3790.4675 ready at Fri, 16 Oct 2015 13:57:02 -0400 \r\n","ehlo":"250-MORS.ad.safesecureweb.com Hello [141.212.122.128]\r\n250-TURN\r\n250-SIZE 2097152\r\n250-ETRN\r\n250-PIPELINING\r\n250-DSN\r\n250-ENHANCEDSTATUSCODES\r\n250-8bitmime\r\n250-BINARYMIME\r\n250-CHUNKING\r\n250-VRFY\r\n250 OK\r\n","starttls":"554 5.7.3 Unable to initialize security subsystem\r\n"},"error":"Bad return code for 
STARTTLS","error_component":"starttls"} 3 | {"ip":"23.91.113.228","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 poppy.arvixe.com ESMTP\r\n","ehlo":"250-poppy.arvixe.com\r\n250-SIZE 20480000\r\n250 AUTH LOGIN\r\n","starttls":"502 STARTTLS NOT ALLOWED.\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 4 | {"ip":"204.3.67.57","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 center-transforming-relationships.com ESMTP \r\n","ehlo":"250-center-transforming-relationships.com \r\n250-PIPELINING\r\n250-SIZE 36700160\r\n250 8BITMIME\r\n","starttls":"502 unimplemented (#5.5.1)\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 5 | {"ip":"45.43.231.247","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"read tcp 45.43.231.247:25: connection reset by peer","error_component":"banner"} 6 | {"ip":"204.73.44.226","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 Augusoft Internal Email Server At Your Service\r\n","ehlo":"250-mrsshepherd.com [141.212.122.128], this server offers 4 extensions\r\n250-AUTH LOGIN\r\n250-SIZE 20480000\r\n250-HELP\r\n250 AUTH=LOGIN\r\n","starttls":"503 Bad sequence of commands\r\n"},"error":"Bad return code for STARTTLS","error_component":"starttls"} 7 | {"ip":"200.46.210.154","timestamp":"2015-10-16T14:00:13-04:00","data":{},"error":"read tcp 200.46.210.154:25: connection reset by peer","error_component":"banner"} 8 | {"ip":"50.28.80.160","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220-raptor.ws9000.net ESMTP Exim 4.85 #2 Fri, 16 Oct 2015 14:00:13 -0400 \r\n220-We do not authorize the use of this system to transport unsolicited, \r\n220 and/or bulk e-mail.\r\n","ehlo":"250-raptor.ws9000.net Hello researchscan383.eecs.umich.edu [141.212.122.128]\r\n250-SIZE 52428800\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go 
ahead\r\n","tls":{"client_hello":{"random":"tF27lvgudpkDoEsorUp0saOIgVPfjc7ZbJFCmAZxbHQ="}}},"error":"remote error: handshake failure","error_component":"starttls"} 9 | {"ip":"50.93.209.78","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220-charybdis.restechservices.net ESMTP Exim 4.85 #2 Fri, 16 Oct 2015 13:00:12 -0500 \r\n220-We do not authorize the use of this system to transport unsolicited, \r\n220 and/or bulk e-mail.\r\n","ehlo":"250-charybdis.restechservices.net Hello researchscan383.eecs.umich.edu [141.212.122.128]\r\n250-SIZE 52428800\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go ahead\r\n","tls":{"client_hello":{"random":"FT0jMqzr1KkF/eurB1K2xdgfNLw0K8+9P6HwMtrSaWw="}}},"error":"remote error: handshake failure","error_component":"starttls"} 10 | {"ip":"66.96.180.160","timestamp":"2015-10-16T14:00:13-04:00","data":{"banner":"220 ESMTP Fri, 16 Oct 2015 14:00:13 -0400: UCE strictly prohibited\r\n","ehlo":"250-bosauthsmtp10.yourhostingaccount.com Hello eecs.umich.edu [141.212.122.128]\r\n250-SIZE 34603008\r\n250-8BITMIME\r\n250-PIPELINING\r\n250-AUTH PLAIN LOGIN\r\n250-STARTTLS\r\n250 HELP\r\n","starttls":"220 TLS go ahead\r\n","tls":{"client_hello":{"random":"SoAhgklxKMZefAJywXW8FjEg+815O8NpsTwx2sQxvtQ="}}},"error":"remote error: handshake failure","error_component":"starttls"} 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # use setuptools' setup(): distutils' setup() does not understand options such as long_description_content_type 3 | from setuptools import setup, find_packages 4 | 5 | import codecs 6 | 7 | # Try to build the doc by converting README.md to reStructuredText with pandoc, if it is installed 8 | import re 9 | import platform 10 | import subprocess 11 | import os 12 | import sys 13 | 14 | CUR_DIR_PATH = os.path.dirname(os.path.abspath(__file__)) 15 | 16 | 17 | # py_version = platform.python_version_tuple() 18 | 19 | convert_the_doc_command = "pandoc --from=markdown --to=rst --output=" + os.path.join(CUR_DIR_PATH, "README.rst") + " " + os.path.join(CUR_DIR_PATH, "README.md") 20 | 21 | 22 | # Trying to build the doc 23 | try: 24 | os.popen(convert_the_doc_command) 25 | except Exception as e: 26 | pass 27 | 28 | # subprocess has check_output since python 2.7.0+ 29 | # if py_version <= (2, 6, 9): 30 | # try: 31 | # result = os.popen(convert_the_doc_command) 32 | # except Exception, e: 33 | # pass 34 | # else: 35 | # try: 36 | # result = subprocess.check_output(convert_the_doc_command , shell=True) 37 | # except Exception, e: 38 | # pass 39 | 40 | 41 | def read_file(filename, encoding='utf8'): 42 | """Read unicode from given file.""" 43 | with codecs.open(filename, encoding=encoding) as fd: 44 | return fd.read() 45 | 46 | here = os.path.abspath(os.path.dirname(__file__)) 47 | 48 | # read version number (and other metadata) from package init 49 | init_fn = os.path.join(here, 'jsonpyes_contrib', '__init__.py') 50 | meta = dict(re.findall(r"""__([a-z]+)__ = '([^']+)""", read_file(init_fn))) 51 | 52 | # Get the long description from the README file 53 | readme = read_file(os.path.join(here, 'README.rst')) 54 | readme_md = read_file(os.path.join(here, 'README.md')) 55 | # changes = read_file(os.path.join(here, 'CHANGES.rst')) 56 | version = meta['version'] 57 | 58 | 59 | setup(name='jsonpyes', 60 | version=version, 61 | author="Alexander Liu", 62 | author_email='alex@nervey.com', 63 | license="FTE V1", 64 | description="A Tool to Import JSON 
raw data to ElasticSearch in one line of commands", 65 | long_description=readme_md, 66 | long_description_content_type='text/markdown', 67 | platforms=["Unix","Linux","OSX","Android","Windows"], 68 | url="https://github.com/xros/jsonpyes", 69 | # download_url="https://github.com/xros/jsonpyes/tarball/" + version, 70 | # download_url="https://github.com/xros/jsonpyes/archive/master.zip", 71 | download_url="https://github.com/xros/jsonpyes/archive/" + version + ".zip", 72 | #py_modules=['elasticsearch','jsonpyes','simplejson'], 73 | install_requires=[ 74 | 'elasticsearch', 75 | 'simplejson', 76 | ], 77 | keywords=["elasticsearch","json","json2es","jsonpyes"], 78 | classifiers=[], 79 | # Make this script executable in command line 80 | #scripts=['jsonpyes.py'], 81 | scripts=['jsonpyes'], 82 | #packages=['contrib', ], 83 | packages=find_packages(), 84 | include_package_data=True, 85 | ) 86 | -------------------------------------------------------------------------------- /static/algo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/algo.png -------------------------------------------------------------------------------- /static/jsonpyes_data_processing_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/jsonpyes_data_processing_diagram.png -------------------------------------------------------------------------------- /static/snapshot100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot100.jpg -------------------------------------------------------------------------------- /static/snapshot101.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot101.jpg -------------------------------------------------------------------------------- /static/snapshot102.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot102.jpg -------------------------------------------------------------------------------- /static/snapshot103.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot103.jpg -------------------------------------------------------------------------------- /static/snapshot104.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot104.jpg -------------------------------------------------------------------------------- /static/snapshot105.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot105.jpg -------------------------------------------------------------------------------- /static/snapshot106.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot106.jpg -------------------------------------------------------------------------------- /static/snapshot132.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot132.png 
-------------------------------------------------------------------------------- /static/snapshot133.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot133.png -------------------------------------------------------------------------------- /static/snapshot135.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot135.png -------------------------------------------------------------------------------- /static/snapshot139.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot139.png -------------------------------------------------------------------------------- /static/snapshot235.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot235.png -------------------------------------------------------------------------------- /static/snapshot236.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot236.png -------------------------------------------------------------------------------- /static/snapshot237.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot237.png -------------------------------------------------------------------------------- /static/snapshot98.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot98.jpg -------------------------------------------------------------------------------- /static/snapshot99.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xros/jsonpyes/a174ec35cea218a4f77b013af9796a977163c884/static/snapshot99.jpg --------------------------------------------------------------------------------
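
Appendix: the core technique in `jsonpyes.py` above — split a newline-delimited JSON file into per-thread line ranges with floor division, then accumulate index actions and flush them in capped batches with retries — can be sketched without the `elasticsearch` dependency. This is a minimal sketch under stated assumptions: `split_lines`, `batched_send`, `_flush`, and the injectable `send` callable are illustrative names of my own, not part of jsonpyes; the real worker sends each batch via `elasticsearch.helpers.bulk` and sleeps between retries.

```python
import json


def split_lines(total_lines, threads):
    """Floor-division split of 1-based line numbers; the remainder goes to the last worker."""
    each = total_lines // threads
    jobs = [{"start": each * t + 1, "stop": each * (t + 1)} for t in range(threads)]
    if total_lines % threads:
        # fold the leftover lines into the last thread's range
        jobs[-1] = {"start": each * (threads - 1) + 1, "stop": total_lines}
    return jobs


def _flush(actions, send, retries):
    """Try to send one batch up to `retries` times; return how many docs went through."""
    for _ in range(retries):
        try:
            send(actions)
            return len(actions)
        except Exception:
            continue  # the real worker logs the error and waits before retrying
    return 0


def batched_send(rows, send, batch_size=5000, retries=5):
    """Parse rows as JSON docs and hand them to `send` in batches of `batch_size`."""
    actions, sent = [], 0
    for row in rows:
        try:
            doc = json.loads(row)
        except ValueError:
            continue  # skip malformed lines, as the worker does
        actions.append({"_source": doc})
        if len(actions) >= batch_size:
            sent += _flush(list(actions), send, retries)
            actions = []
    if actions:  # leftovers smaller than one full batch
        sent += _flush(actions, send, retries)
    return sent
```

With 37 lines and 4 threads, `split_lines(37, 4)` yields ranges of 9, 9, 9, and 10 lines; a real `send` would be a closure over an `Elasticsearch` client calling `helpers.bulk`.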