├── .gitignore
├── LICENSE
├── README.md
├── build.gradle
├── circle.yml
├── gradle-local-dependency-rules.json
├── gradle.properties
├── gradle
    └── wrapper
    │   ├── gradle-wrapper.jar
    │   └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── settings.gradle
└── src
    ├── main
        ├── java
        │   └── uy
        │   │   └── kohesive
        │   │       └── elasticsearch
        │   │           └── dataimport
        │   │               └── udf
        │   │                   └── Udfs.java
        ├── kotlin
        │   └── uy
        │   │   └── kohesive
        │   │       └── elasticsearch
        │   │           └── dataimport
        │   │               ├── AlgoliaDataImportHandler.kt
        │   │               ├── AlgoliaStateManager.kt
        │   │               ├── App.kt
        │   │               ├── Config.kt
        │   │               ├── DataImportHandler.kt
        │   │               ├── EsDataImportHandler.kt
        │   │               ├── Exceptions.kt
        │   │               ├── MicroEsClient.kt
        │   │               ├── State.kt
        │   │               ├── Udfs.kt
        │   │               └── Util.kt
        └── resources
        │   └── logback.xml
    └── test
        ├── kotlin
            └── uy
            │   └── kohesive
            │       └── elasticsearch
            │           └── dataimport
            │               └── ManualTestOfDataImport.kt
        └── resources
            ├── manual-mappings.json
            └── test.sql


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.ipr
3 | *.iml
4 | *.iws
5 | build/
6 | out/
7 | classes/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2016-2018 Jayson Minard (jayson.minard@gmail.com) and Kohesive
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![GitHub release](https://img.shields.io/github/release/kohesive/elasticsearch-data-import-handler.svg)](https://github.com/kohesive/elasticsearch-data-import-handler/releases)  [![CircleCI branch](https://img.shields.io/circleci/project/kohesive/elasticsearch-data-import-handler/master.svg)](https://circleci.com/gh/kohesive/elasticsearch-data-import-handler/tree/master) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](https://github.com/kohesive/elasticsearch-data-import-handler/blob/master/LICENSE)  [![ES](https://img.shields.io/badge/ES-5.x-orange.svg)](https://github.com/elastic/elasticsearch) [![ES](https://img.shields.io/badge/ES-6.x-orange.svg)](https://github.com/elastic/elasticsearch) 
  2 | 
  3 | 
  4 | # Elasticsearch Data Import Handler
  5 | # Elasticsearch/Algolia Data Import Handler
  6 | 
  7 | A data import handler for Elasticsearch/Algolia
  8 | 
  9 | * Simple
 10 | * Powerful
 11 | * Use SQL statements that can span multiple databases, text files, and ElasticSearch/Algolia indexes
 12 | * Process full load and incremental updates
 13 | * Output columnar and structured JSON to ElasticSearch/Algolia
 14 | 
 15 | Running is simple.  With Java 8 installed, [download a release](https://github.com/kohesive/elasticsearch-data-import-handler/releases) and then run it:
 16 | 
 17 | ```
 18 | kohesive-es-dih <configFile.conf>
 19 | ```
 20 | 
 21 | The config file is in [HOCON format](https://github.com/typesafehub/config/blob/master/HOCON.md) which is a relaxed 
 22 | version of JSON and allows [multi-line-string](https://github.com/typesafehub/config/blob/master/HOCON.md#multi-line-strings) 
 23 | which is very useful when writing larger SQL statements.
 24 | 
 25 | The configuration file follows this format:
 26 | 
 27 | ```hocon
 28 | {
 29 |   "sources": {
 30 |     "elasticsearch": [ ],
 31 |     "jdbc": [ ],
 32 |     "filesystem": [ ]
 33 |   },
 34 |   "prepStatements": [ ],
 35 |   "importSteps": [ ]
 36 | }
 37 | ```
 38 | 
 39 | First you should provide 1 or more `sources`.  Each source becomes a temporary table in a unified catalog of tables from 
 40 | which you can query and join together.
 41 | 
 42 | Optionally, you can include preparatory steps in `prepStatements` which are usually additional  temporary tables 
 43 | that may be shared by the later import steps.  The `prepStatements` execute in order and **do not** do any date range 
 44 | substitution.
 45 | 
 46 | Lastly you specify `importSteps` which are queries that use any of the `sources` and temporary tables created in 
 47 | `prepStatements`; where the results of each SQL query is pushed into an Elasticsearch index.  
 48 | 
 49 | **Tip:** The SQL used is anything available to [Apache Spark SQL](https://docs.databricks.com/spark/latest/spark-sql/index.html)
 50 | 
 51 | ### Let's start with an example, 
 52 | ...of loading 1 table from MySQL into Elasticsearch:
 53 | 
 54 | ```hocon
 55 | {
 56 |   "sources": {
 57 |     "jdbc": [
 58 |       {
 59 |         "jdbcUrl": "jdbc:mysql://localhost/test?useSSL=false",
 60 |         "driverClass": "com.mysql.jdbc.Driver",
 61 |         "defaultSchema": "test",
 62 |         "auth": {
 63 |           "username": "myusername",
 64 |           "password": "mypass"
 65 |         },
 66 |         "driverJars": [],  # MySQL and Postgres JARS are included automatically, this property can be omitted completely
 67 |         "tables": [
 68 |           {
 69 |             "sparkTable": "Users",
 70 |             "sourceTable": "UserEntities"
 71 |           }
 72 |         ]
 73 |       }
 74 |     ]
 75 |   },
 76 |   "importSteps": [
 77 |     {
 78 |       "description": "Data loaders for base data sets",
 79 |       "targetElasticsearch": {
 80 |         "nodes": [
 81 |           "localhost:9200"
 82 |         ],
 83 |         "settings": {
 84 |           "es.index.auto.create": true
 85 |         }
 86 |       },
 87 |       "statements": [
 88 |         {
 89 |           "id": "Q4499_1233",
 90 |           "description": "Load User data into ES",
 91 |           "indexName": "aa-test-user",
 92 |           "indexType": "user",
 93 |           "newIndexSettingsFile": "./test-index-settings-and-mappings.json",  # optional, otherwise template or infered mappings are applied
 94 |           "settings": {
 95 |             "es.mapping.id": "guid"
 96 |           },
 97 |           "sqlQuery": """
 98 |                 SELECT guid, first_name, last_name, organization 
 99 |                   FROM Users
100 |                  WHERE dtUpdated BETWEEN '{lastRun}' and '{thisRun}'
101 |           """
102 |         }
103 |       ]
104 |     }
105 |   ]
106 | }
107 | ```
108 | 
109 | You will see that the JDBC source is provided, which must include the JDBC driver for the database, connection information,
110 | and a mapping from the original `sourceTable` database table to the temporary table `sparkTable` that will be used in 
111 | later SQL queries.  The name `sparkTable` is used because this system runs an embedded Apache Spark, and is creating Spark SQL
112 | tables from the configuration.  
113 | 
114 | Since this process runs in Apache Spark, there might be additional options you wish to set when 
115 | the data is loaded.  For advanced users who know what these are, you can add `settings` map at the the `jdbc` connection
116 | level to apply to all tables within that connection, or at the per-`tables` level of the configuration to apply to only
117 | one table.  
118 | 
119 | The `importSteps` are a collection of target Elasticsearch clusters and one or more SQL statements for each.  Each statement
120 | must have a unique `id` so that state can be tracked, changing or removing the `id` will result in a full data load running
121 | for a given query.  
122 | 
123 | Notice that the SQL statement includes the use of the `{lastRun}` and `{thisRun}` macros.  These will substitute the current
124 | date/time into the SQL as a SQL Date formated string.  The granularity is SECONDS, and the local time zone of the data 
125 | import processor is used.  Also, be sure to put the date macros inside quotes.
126 | 
127 | (_Since version `0.9.0-ALPHA`_) SQL statements can also be provided in a file, using `sqlFile` instead of `sqlQuery`, 
128 | where the file path is relative to the configuration file.
129 | 
130 | The `indexType` field is the target type within the Elasticsearch `indexName` for the documents.  You can use either the 
131 | literal type name, or include a macro of `{fieldName}` where `fieldName` is one of the fields in the SQL result set.  
132 | 
133 | The `newIndexSettingsFile` is optional and allows a settings+mappings JSON file to be applied when creating a new index.
134 | Otherwise any index templates or implied mappings will be used on index creation.  The `es.index.auto.create` flag must
135 | be active (by default it is), otherwise this file is not used.
136 | 
137 | The `settings` object for Elasticsearch are important, and the most common basic settings you may wish to set (at either 
138 | the connection or statement level) are:
139 | 
140 | 
141 | |Setting|Description|
142 | |-------|-----------|
143 | |es.index.auto.create|Whether to auto-create the target index if it doesn't already exist, default is `true`|
144 | |es.mapping.id|Which field in the SQL results should be used as the ID for the document (if absent, autogenerated ID's are used)|
145 | |es.ingest.pipeline|Which ingest pipeline should be used to pre-process incoming records into Elasticsearch|
146 | 
147 | **Tip:** For advanced use cases, please see documentation for [Elasticsearch-Spark settings](https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html).
148 | 
149 | ### Let's try a more complex example,
150 | ...of joining a csv text file to the SQL statement
151 | 
152 | ```hocon
153 | {
154 |   "sources": {
155 |     "jdbc": [
156 |       {
157 |          # ... same as previous example
158 |       }
159 |     ],
160 |      "filesystem": [
161 |           {
162 |             "directory": "/data/sources/raw",
163 |             "tables": [
164 |               {
165 |                 "sparkTable": "UserEmotions",
166 |                 "format": "csv",
167 |                 "filespecs": [
168 |                   "test.csv"
169 |                 ],
170 |                 "settings": {
171 |                   "header": true,
172 |                   "inferSchema": true
173 |                 }
174 |               }
175 |             ]
176 |           }
177 |         ]
178 |   },
179 |   "importSteps": [
180 |     {
181 |       "description": "Data loaders for base data sets",
182 |       "targetElasticsearch": {
183 |         # ... same as previous example
184 |       },
185 |       "statements": [
186 |         {
187 |           "id": "Q4499_1233",
188 |           "description": "Load User data with merged Emotions into ES",
189 |           "indexName": "aa-test-user",
190 |           "indexType": "user",
191 |           "settings": {
192 |             "es.mapping.id": "guid"
193 |           },
194 |           "sqlQuery": """
195 |                 SELECT u.guid, u.first_name, u.last_name, u.organization, ue.emotion 
196 |                   FROM Users AS u LEFT OUTER JOIN UserEmotions AS ue ON (u.guid = ue.guid)
197 |                  WHERE u.dtUpdated BETWEEN '{lastRun}' and '{thisRun}'
198 |           """
199 |         }
200 |       ]
201 |     }
202 |   ]
203 | }
204 | ```
205 | 
206 | We have changed the configuration adding the file source.  Here the `directory` must exist and then `filespec` is a list
207 | of specific filenames, or wildcards.  For example `["test1.csv", "test2.csv"]` or `["test*.csv"]` are both valid.  Here again
208 | you may see `settings` maps appearing at the filesystem directory level or for each table.  
209 | 
210 | The format may be `csv`, `json`, tab delimited, or any other import file format supported by the default Apache Spark 
211 | distribution.  Some settings you might find useful for `csv` include:
212 | 
213 | |Setting|Description|
214 | |-------|-----------|
215 | |header|Whether a header line is present or not (true/false)|
216 | |delimiter|What delimiter between the fields, default `,`|
217 | |quote|If fields are quoted, what character is used for quoting, default `"`|
218 | |escape|If escaping of characters is needed, what character is used, default `\`|
219 | |charset|If the file is not `UTF-8`, what charset is it?|
220 | |inferSchema|The system can infer datatypes by scanning all the data first, then loading in a second pass (true/false)|
221 | |nullValue|This string can replace any null values, otherwise they are truly null|
222 | |dateFormat|specifies a date format for recognizing and parsing date fields (follows [SimpleDateFormat](https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html))|
223 | 
224 | **Tip:** For other use cases, read all [CSV Options](https://github.com/databricks/spark-csv)
225 | 
226 | ### And if you want an example with an Elasticsearch source
227 | ...here is the additional part of the configuration:
228 | 
229 | ```hocon
230 | {
231 |   "sources": {
232 |     "elasticsearch": [
233 |       {
234 |         "nodes": [
235 |           "localhost:9200"
236 |         ],
237 |         "tables": [
238 |           {
239 |             "sparkTable": "SearchDocuments",
240 |             "indexName": "document-contents-2017",
241 |             "indexType": "rawdoc",
242 |             "esQuery": {
243 |               "match_all": {}
244 |             }
245 |           }
246 |         ]
247 |       }
248 |     ],
249 |     # ...
250 | }
251 | ```
252 | 
253 | Now I have the Elasticsearch index acting as a table.  Like before, `settings` maybe applied at the connection or `tables` level
254 | if you have special needs.  And please note that the `esQuery` contains the first level of query filtering, and if absent 
255 | defaults to a `match_all` query.  You can write the query as above, or you can also include the top level `"query": { ... }"` 
256 | element surrounding the query.  
257 | 
258 | ### Let's go for it, here's a bigger example
259 | ...showing an import that generates nested JSON and also a preparatory step creating temporary tables.
260 | 
261 | We will use the same source tables as the previous examples and imagine that we now have `Orgs` and `OrgMembers` tables.
262 | We will create a final list of `Users` with their organizations nested inside each.  We also will update the users if
263 | any of the related tables change.  This might cause an explosion of updates, so be careful with your planning about how
264 | you use the `{lastRun}` and `{thisRun}` values.
265 | 
266 | ```hocon
267 | {
268 |   "sources": {
269 |     # ... same as above but we have added `Orgs` and `OrgMembers`
270 |   },
271 |   "prepStatements": [
272 |     {
273 |       "description": "Create views of only the active users",
274 |       "sqlQuery":  """
275 |             CREATE TEMPORARY VIEW ActiveUsers AS
276 |                 SELECT * FROM Users WHERE isActive = 1
277 |       """
278 |     },
279 |     {
280 |       "description": "Create views of only the active orgs",
281 |       "sqlQuery":  """
282 |             CREATE TEMPORARY VIEW ActiveOrgs AS
283 |                 SELECT * FROM Orgs WHERE isActive = 1
284 |       """
285 |     }
286 |   ],
287 |   "importSteps": [
288 |     {
289 |       "description": "Data loaders for base data sets",
290 |       "targetElasticsearch": {
291 |         "nodes": [
292 |           "localhost:9200"
293 |         ],
294 |         "settings": {
295 |           "es.index.auto.create": true
296 |         }
297 |       },
298 |       "statements": [
299 |         {
300 |           "id": "X9A90Z_1",
301 |           "description": "Load denormalized User + Org Memberships into ES",
302 |           "indexName": "aa-test-user",
303 |           "indexType": "{docType}",
304 |           "settings": {
305 |             "es.mapping.id": "guid"
306 |           },
307 |           "sqlFile": "structured-user-load.sql"
308 |         }
309 |       ]
310 |     }
311 |   ]
312 | }
313 | ```
314 | 
315 | and the `structured-user-load.sql` file placed in the same directory:
316 | 
317 | ```sql
318 | WITH orgMembers AS
319 | (
320 |    SELECT our.guid AS roleOrgGuid, our.userGuid AS roleUserGuid, our.orgUserRoleType AS roleType, our.dtUpdated AS dtRoleUpdated,
321 |           oe.displayName AS orgDisplayName, oe.dtUpdated AS dtOrgUpdated
322 |      FROM ActiveOrgs AS oe JOIN OrgMembers AS our ON (our.orgGuid = oe.guid)
323 | ),
324 | userWithOrg AS (
325 |    SELECT ue.guid, struct(ue.*) AS user, struct(om.*) AS orgMembership
326 |      FROM ActiveUsers AS ue LEFT OUTER JOIN orgMembers AS om ON (om.roleUserGuid = ue.guid)
327 | ),
328 | modifiedUserData AS (
329 |     SELECT guid, first(user) as user, collect_list(orgMembership) AS orgMemberships
330 |              FROM userWithOrg AS ue
331 |             WHERE user.dtUpdated between "{lastRun}" AND "{thisRun}" OR
332 |                   orgMembership.dtRoleUpdated between  "{lastRun}" AND "{thisRun}" OR
333 |                   orgMembership.dtOrgUpdated between  "{lastRun}" AND "{thisRun}"
334 |          GROUP BY guid
335 | ),
336 | usersWithEmotions AS (
337 |     SELECT mu.*, em.emotion FROM modifiedUserData AS mu LEFT OUTER JOIN UserEmotions AS em ON (mu.guid = em.guid)
338 | )
339 | SELECT user.accountType as docType, user.guid,
340 |        user.identity, user.displayName, user.contactEmail, user.avatarUrl, user.gravatarEmail, user.blurb,
341 |        user.location, user.defaultTraitPrivacyType, user.companyName, user.isActive, user.isHeadless,
342 |        emotion, user.dtCreated, user.dtUpdated, orgMemberships FROM usersWithEmotions
343 | ```
344 | 
345 | Ok, that was a bit much.  But here is some research you can do.  In the [SQL Functions API Reference](https://spark.apache.org/docs/2.1.0/api/java/org/apache/spark/sql/functions.html) 
346 | you will find the `collect_list` and `struct` functions.  They are used above to create structured results, like nested JSON.  The `struct()`
347 | function is combining columns into an object, and the `collect_list` is aggregating the objects into an array.  
348 | 
349 | We are also using the `guid` from the result set as the Elasticsearch document `_id`, and we have a literal field coming back
350 | in the documents that we are using via the `indexType` setting as a macro `{docType}` pointing at that field from the result set.
351 | This is handy if different documents in the results will have different types. 
352 | 
353 | So at the end we have a result that looks like:
354 | 
355 | ```hocon
356 | {
357 |   "docType": "...",
358 |   "guid": "...",
359 |   "identity": "...",
360 |   # ...
361 |   "orgMemberships": [
362 |      {
363 |          "roleOrgGuid": "...",
364 |          "orgDisplayName": "...",
365 |          # ...
366 |      },
367 |      {
368 |          "roleOrgGuid": "...",
369 |          "orgDisplayName": "...",
370 |          # ...
371 |      }
372 |   ]
373 | }
374 | ```
375 | 
376 | ### Algolia
377 | 
378 | To define Algolia index as an import step target, configure the `targetAlgolia` inside your import step config like this:
379 |  
380 | ```
381 | "importSteps": [
382 |     {
383 |       "description": "Load products table into Algolia index",
384 |       "targetAlgolia": {
385 |         "applicationId": "YOURAPPID",
386 |         "apiKey": "yourapikey"
387 |       },
388 |       ...
389 | ```
390 | 
391 | The statement configuration is almost identical to the one of ElasticSearch: use `index` field to specify the Algolia index, and 
392 | the `idField` to specify the row that would be used as Algolia index's `objectID` (optional):
393 |  
394 | ```
395 | "statements": [
396 |   {
397 |     "id": "Import-Products-Table,
398 |     "description": "Load products data into Algolia index",
399 |     "idField": "product_id",
400 |     "indexName": "ProductsIndex",
401 |     "sqlQuery": """
402 |       SELECT id AS product_id, product_name, manufacturer_name FROM Products 
403 |       JOIN Manufacturers ON Products.manufacturer_id = Manufacturers.id
404 |       WHERE product_modify_date BETWEEN '{lastRun}' AND '{thisRun}'
405 |     """
406 |   }
407 | ]
408 | ```
409 | 
410 | Algolia target also supports "Delete" statements, which must return a column with a name defined by `idField` (see above),
411 | containing the `objectID`s to be deleted from Algolia index. To define such a statement, specify the `"action": "delete"` in its definition, e.g.:
412 | 
413 | ```
414 | "statements": [
415 |   {
416 |     "id": "Clear-Products-Table,
417 |     "description": "Delete previously removed products from Algolia index",
418 |     "idField": "product_id",
419 |     "indexName": "ProductsIndex",
420 |     "action": "delete",
421 |     "sqlQuery": """
422 |       SELECT product_id FROM DeletedProducts
423 |       WHERE product_delete_date BETWEEN '{lastRun}' AND '{thisRun}'
424 |     """
425 |   }
426 | ]
427 | ```
428 | 
429 | ### SQL Reference:
430 | 
431 | The data import handler uses Spark SQL, and you can read the [Spark SQL Reference](https://docs.databricks.com/spark/latest/spark-sql/index.html) 
432 | for full details on the supported SQL.  
433 | 
434 | A list of [SQL Functions](https://spark.apache.org/docs/2.1.0/api/java/org/apache/spark/sql/functions.html) 
435 | is available in raw API docs.  (_TODO: find better reference_)
436 | 
437 | The Data Import Handler also defines some UDF functions for use within SQL:
438 | 
439 | |Function|Description|Since&nbsp;Version|
440 | |-------|-----------|--------------|
441 | |stripHtml(string)|Removes all HTML tags and returns only the text (including unescaping of HTML Entities)|`0.6.0-ALPHA`|
442 | |unescapeHtmlEntites(string)|Unescapes HTML entities found in the text|`0.6.0-ALPHA`|
443 | |fluffly(string)|A silly function that prepends the word "fluffly" to the text, used as a *test* function to mark values as being changed by processing|`0.6.0-ALPHA`|
444 | 
445 | ### Auth and HTTPS for Elasticsearch
446 | 
447 | (_Since version `0.8.0-ALPHA`_)
448 | 
449 | For basic AUTH with Elasticsearch you can add the following to the source or target Elasticsearch definitions:
450 | 
451 | ```hocon
452 |     "basicAuth": {
453 |         "username": "elastic",
454 |         "password": "changeme"
455 |     }
456 | ```
457 | 
458 | And for SSL, enable it within the Elasticsearch definition as well:
459 | 
460 | ```hocon
461 |    "enableSsl": true
462 | ```
463 | 
464 | Here is a full example, when connection to Elastic Cloud instance:
465 | 
466 | ```hocon
467 | "targetElasticsearch": {
468 |     "nodes": [
469 |       "123myinstance456.us-east-1.aws.found.io"
470 |     ],
471 |     "basicAuth": {
472 |         "username": "elastic",
473 |         "password": "changeme"
474 |     },
475 |     "port": 9243,
476 |     "enableSsl": true,
477 |     "settings": {
478 |       "es.index.auto.create": true,
479 |       "es.nodes.wan.only": true
480 |     }
481 | }
482 | ```
483 | 
484 | Note the use of the `es.nodes.wan.only` setting to use the external host names for the cluster members, and not internal AWS addresses.
485 | 
486 | ### State Management and History:
487 | 
488 | State for the `lastRun` value is per-statement and stored in the target Elasticsearch cluster for that statement.  An index
489 | will be created called `.kohesive-dih-state-v2` which stores the last run state, a lock for current running statements, and
490 | a log of all previous runs (success and failures, along with row count processed by the statement query).  
491 | 
492 | You should inspect this log (index `.kohesive-dih-state-v2` type `log`)if you wish to monitor the results of runs.
493 | 
494 | ### Parallelism
495 | 
496 | By default, the data import handler is running locally with `Processors-1` parallelism.  You can set the following 
497 | top-level configuration setting to change the parallelism:
498 | 
499 | ```
500 | {
501 |     "sparkMaster": "local[N]",
502 |     "sources": { 
503 |         # ... 
504 |     } 
505 | }
506 | ```
507 | 
508 | Where `N` is the number of partitions you wish to run.  Since this is the Spark master setting, some people might try
509 | connecting to a Spark cluster using the setting.  It just might work! 
510 | 
511 | ### Memory Issues
512 | 
513 | If you run out of memory you can set the Java VM parameters via the `KOHESIVE_ES_DIH_OPTS` environment variable before
514 | running the `kohesive-es-dih` script.  For example, to set it to 2G: `-Xmx2g`   
515 | 
516 | For Spark driver and executor memory settings (_Since version `0.9.0-ALPHA`_), you can add Spark Configuration settings 
517 | in `sparkConfig` map, for example:
518 | 
519 | ```
520 | {
521 |     "sparkMaster": "local[4]",
522 |     "sparkConfig": {
523 |        "spark.driver.memory": "300mb",
524 |        "spark.executor.memory": "512mb"
525 |     },
526 |     "sources": { 
527 |         # ... 
528 |     } 
529 | }
530 | ```
531 | 
532 | Note that `spark.executor.memory` has a minimum allowed value that you cannot go below, and you will receive an error if
533 | it is set to low or the JVM does not have enough memory.
534 | 
535 | [Other Spark memory configuration settings](https://spark.apache.org/docs/latest/configuration.html#memory-management) such 
536 | as usage of off-heap space can be defined as mentioned above in the Spark Configuration settings.
537 | 
538 | *Query Caching:*
539 | 
540 | Prior to version 0.10.0-ALPHA all queries were cached into memory which could cause an overflow.  Now by default they
541 | are not cached.  For each statement (preperatory or import statements) you can specify these properties to control caching,
542 | otherwise no caching is performed.
543 | 
544 | ```
545 |    "cache": true,
546 |    "persist": "MEMORY_AND_DISK_SER"
547 | ```
548 | 
549 | Where the persist setting is one of the storage levels defined [in the RDD persistence guide](https://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence), and
550 | the default is `MEMORY_ONLY` which might overflow memory if not enough JVM or off-heap space is permitted.
551 | 
552 | Note that caching helps with preparatory steps that are then used later, or to speed up the two step process
553 | of running the main import queries followed by counting the number of rows that the query caused to be imported.  But
554 | it is not required.
555 | 
556 | 
557 | ### NUMBER and DECIMAL Types Being Disallowed
558 | 
559 | If you have errors such as:
560 | 
561 | > Decimal types are not supported by Elasticsearch
562 | 
563 | This is an issue explained further by the Elasticsearch team in [ES-Hadoop #842](https://github.com/elastic/elasticsearch-hadoop/issues/842).
564 | 
565 | Basically, they want to avoid precision loss during the SQL stage and instead let Elasticsearch use knowledge of the actual mappings later to do the conversion.
566 | So the recommendation is that you cast the type to a `String` type of decimal, or a `Integer` type if not and then let the conversion
567 | happen at the moment of indexing.
568 | 
569 | This would look something like:
570 | 
571 | ```
572 | SELECT id, name, cast(somethingDecimal as String) AS somethingDecimal, dtCreated, dtUpdated
573 |    FROM myTable
574 | ```
575 | 
576 | ### TODOs
577 | 
578 | **See:** [Issues](https://github.com/kohesive/elasticsearch-data-import-handler/issues) for TODO, feature requests, ideas and issues.
579 | 
580 | ## Special Thanks
581 | 
582 | ![YourKit logo](https://www.yourkit.com/images/yklogo.png)
583 | 
584 | YourKit supports open source projects with its full-featured Java Profiler.
585 | YourKit, LLC is the creator of [YourKit Java Profiler](https://www.yourkit.com/java/profiler/)
586 | and [YourKit .NET Profiler](https://www.yourkit.com/.net/profiler/),
587 | innovative and intelligent tools for profiling Java and .NET applications.
588 | 
589 | 
590 | 
591 | 


--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
 1 | plugins {
 2 |     id 'nebula.kotlin' version '1.1.61' apply false
 3 |     id 'nebula.resolution-rules' version '5.1.1' apply false
 4 | }
 5 | 
 6 | apply plugin: 'nebula.kotlin'
 7 | apply plugin: 'nebula.resolution-rules'
 8 | apply plugin: 'java'
 9 | apply plugin: 'idea'
10 | 
11 | apply plugin: 'application'
12 | 
13 | mainClassName = 'uy.kohesive.elasticsearch.dataimport.App'
14 | applicationName = 'kohesive-es-dih'
15 | 
16 | sourceCompatibility = JavaVersion.VERSION_1_8
17 | targetCompatibility = JavaVersion.VERSION_1_8
18 | 
19 | dependencies {
20 |     compile group: 'org.jetbrains.kotlin', name: 'kotlin-stdlib-jre8', version: version_kotlin
21 |     compile group: 'org.jetbrains.kotlin', name: 'kotlin-reflect', version: version_kotlin
22 | 
23 |     compile group: 'com.fasterxml.jackson.module', name: 'jackson-module-kotlin', version: version_jackson_kotlin_module
24 |     compile group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jdk8', version: version_jackson_compatible_with_spark
25 |     compile group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jsr310', version: version_jackson_compatible_with_spark
26 | 
27 |     compile group: 'com.typesafe', name: 'config', version: version_typesafe_config
28 | 
29 |     compile group: 'org.elasticsearch', name: "elasticsearch-spark-20_${version_scala}", version: version_elasticsearch
30 |     compile group: 'com.squareup.okhttp3', name: 'okhttp', version: version_okhttp
31 | 
32 |     compile group: 'org.apache.spark', name: "spark-sql_${version_scala}", version: version_spark
33 | 
34 |     compile group: 'org.jsoup', name: 'jsoup', version: version_jsoup
35 | 
36 |     compile (group: 'com.algolia', name: 'algoliasearch', version: version_algolia) {
37 |         exclude group: 'com.fasterxml.jackson.module'
38 |         exclude group: 'com.fasterxml.jackson.datatype'
39 |         exclude group: 'com.fasterxml.jackson.core'
40 |     }
41 | 
42 |     resolutionRules group: 'com.netflix.nebula', name: 'gradle-resolution-rules', version: version_nebula_resolution
43 |     resolutionRules files("${rootDir}/gradle-local-dependency-rules.json")
44 | 
45 |     compile group: 'org.slf4j', name: 'slf4j-api', version: version_slf4j
46 |     runtime group: 'ch.qos.logback', name: 'logback-classic', version: version_logback
47 | 
48 |     compile group: 'mysql', name: 'mysql-connector-java', version: version_jdbc_mysql
49 |     compile group: 'org.postgresql', name: 'postgresql', version: version_jdbc_postresql
50 | 
51 |     testCompile group: 'junit', name: 'junit', version: version_junit
52 |     testCompile  group: 'org.jetbrains.kotlin', name: 'kotlin-test-junit'
53 | 
54 | }
55 | 
56 | nebulaResolutionRules {
57 |     optional = ['slf4j-bridge']
58 | }
59 | 
60 | repositories {
61 |     mavenCentral()
62 | }
63 | 
64 | compileKotlin {
65 |     kotlinOptions {
66 |         jvmTarget = 1.8
67 |     }
68 | }
69 | 
70 | task wrapper(type: Wrapper) {
71 |     gradleVersion = version_gradle
72 | }
73 | 
74 | tasks.withType(Tar){
75 |     compression = Compression.GZIP
76 | }
77 | 
78 | idea {
79 |     module {
80 |         downloadJavadoc = true
81 |         downloadSources = true
82 |     }
83 | }


--------------------------------------------------------------------------------
/circle.yml:
--------------------------------------------------------------------------------
 1 | # Java Gradle CircleCI 2.0 configuration file
 2 | #
 3 | # Check https://circleci.com/docs/2.0/language-java/ for more details
 4 | #
 5 | version: 2
 6 | jobs:
 7 |   build:
 8 |     docker:
 9 |       # specify the version you desire here
10 |       - image: circleci/openjdk:8-jdk
11 | 
12 |       # Specify service dependencies here if necessary
13 |       # CircleCI maintains a library of pre-built images
14 |       # documented at https://circleci.com/docs/2.0/circleci-images/
15 |       # - image: circleci/postgres:9.4
16 | 
17 |     working_directory: ~/repo
18 | 
19 |     environment:
20 |       # Customize the JVM maximum heap limit
21 |       JVM_OPTS: -Xmx3200m
22 |       TERM: dumb
23 | 
24 |     steps:
25 |       - checkout
26 | 
27 |       # Download and cache dependencies
28 |       - restore_cache:
29 |           keys:
30 |           - v1-dependencies-{{ checksum "build.gradle" }}
31 |           # fallback to using the latest cache if no exact match is found
32 |           - v1-dependencies-
33 | 
34 |       - run: gradle dependencies
35 | 
36 |       - save_cache:
37 |           paths:
38 |             - ~/.gradle
39 |           key: v1-dependencies-{{ checksum "build.gradle" }}
40 | 
41 |       # run tests!
42 |       - run: ./gradlew clean build test


--------------------------------------------------------------------------------
/gradle-local-dependency-rules.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "exclude": [
 3 |     {
 4 |       "module": "org.slf4j:slf4j-log4j12",
 5 |       "reason": "This does the reverse of what we want, routes to log4j, we want to route to slf4j",
 6 |       "author": "Jayson Minard <jayson.minard@gmail.com>",
 7 |       "date": "2017-02-10T20:00:00.000Z"
 8 |     }
 9 |   ],
10 |   "replace": [],
11 |   "align": [],
12 |   "deny": [],
13 |   "reject": []
14 | }
15 | 


--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
 1 | group = uy.kohesive.elasticsearch
 2 | version = 1.0.0-ES-5.6.x-BETA-01
 3 | 
 4 | version_gradle=4.5.1
 5 | version_kotlin=1.1.61
 6 | 
 7 | version_nebula_resolution=0.52.0
 8 | version_jackson_compatible_with_spark=2.6.7
 9 | version_jackson_kotlin_module=2.6.7
10 | version_spark=2.2.1
11 | 
12 | version_typesafe_config=1.3.1
13 | 
14 | version_elasticsearch=5.6.7
15 | version_okhttp=3.10.0
16 | version_scala=2.11
17 | 
18 | version_jsoup=1.10.2
19 | 
20 | version_slf4j=1.7.+
21 | version_logback=1.2.+
22 | version_junit=4.12
23 | 
24 | version_jdbc_mysql=5.1.45
25 | version_jdbc_postresql=42.2.1
26 | 
27 | version_algolia=2.15.6
28 | 


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohesive/elasticsearch-data-import-handler/029412ffef2169a34a80ce7b516e019687238c0b/gradle/wrapper/gradle-wrapper.jar


--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Sun Feb 25 15:56:41 CST 2018
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.5.1-bin.zip
7 | 


--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env sh
  2 | 
  3 | ##############################################################################
  4 | ##
  5 | ##  Gradle start up script for UN*X
  6 | ##
  7 | ##############################################################################
  8 | 
  9 | # Attempt to set APP_HOME
 10 | # Resolve links: $0 may be a link
 11 | PRG="$0"
 12 | # Need this for relative symlinks.
 13 | while [ -h "$PRG" ] ; do
 14 |     ls=`ls -ld "$PRG"`
 15 |     link=`expr "$ls" : '.*-> \(.*\)$'`
 16 |     if expr "$link" : '/.*' > /dev/null; then
 17 |         PRG="$link"
 18 |     else
 19 |         PRG=`dirname "$PRG"`"/$link"
 20 |     fi
 21 | done
 22 | SAVED="`pwd`"
 23 | cd "`dirname \"$PRG\"`/" >/dev/null
 24 | APP_HOME="`pwd -P`"
 25 | cd "$SAVED" >/dev/null
 26 | 
 27 | APP_NAME="Gradle"
 28 | APP_BASE_NAME=`basename "$0"`
 29 | 
 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
 31 | DEFAULT_JVM_OPTS=""
 32 | 
 33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
 34 | MAX_FD="maximum"
 35 | 
 36 | warn ( ) {
 37 |     echo "$*"
 38 | }
 39 | 
 40 | die ( ) {
 41 |     echo
 42 |     echo "$*"
 43 |     echo
 44 |     exit 1
 45 | }
 46 | 
 47 | # OS specific support (must be 'true' or 'false').
 48 | cygwin=false
 49 | msys=false
 50 | darwin=false
 51 | nonstop=false
 52 | case "`uname`" in
 53 |   CYGWIN* )
 54 |     cygwin=true
 55 |     ;;
 56 |   Darwin* )
 57 |     darwin=true
 58 |     ;;
 59 |   MINGW* )
 60 |     msys=true
 61 |     ;;
 62 |   NONSTOP* )
 63 |     nonstop=true
 64 |     ;;
 65 | esac
 66 | 
 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
 68 | 
 69 | # Determine the Java command to use to start the JVM.
 70 | if [ -n "$JAVA_HOME" ] ; then
 71 |     if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
 72 |         # IBM's JDK on AIX uses strange locations for the executables
 73 |         JAVACMD="$JAVA_HOME/jre/sh/java"
 74 |     else
 75 |         JAVACMD="$JAVA_HOME/bin/java"
 76 |     fi
 77 |     if [ ! -x "$JAVACMD" ] ; then
 78 |         die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
 79 | 
 80 | Please set the JAVA_HOME variable in your environment to match the
 81 | location of your Java installation."
 82 |     fi
 83 | else
 84 |     JAVACMD="java"
 85 |     which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
 86 | 
 87 | Please set the JAVA_HOME variable in your environment to match the
 88 | location of your Java installation."
 89 | fi
 90 | 
 91 | # Increase the maximum file descriptors if we can.
 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
 93 |     MAX_FD_LIMIT=`ulimit -H -n`
 94 |     if [ $? -eq 0 ] ; then
 95 |         if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
 96 |             MAX_FD="$MAX_FD_LIMIT"
 97 |         fi
 98 |         ulimit -n $MAX_FD
 99 |         if [ $? -ne 0 ] ; then
100 |             warn "Could not set maximum file descriptor limit: $MAX_FD"
101 |         fi
102 |     else
103 |         warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 |     fi
105 | fi
106 | 
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 |     GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 | 
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 |     APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 |     CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 |     JAVACMD=`cygpath --unix "$JAVACMD"`
117 | 
118 |     # We build the pattern for arguments to be converted via cygpath
119 |     ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 |     SEP=""
121 |     for dir in $ROOTDIRSRAW ; do
122 |         ROOTDIRS="$ROOTDIRS$SEP$dir"
123 |         SEP="|"
124 |     done
125 |     OURCYGPATTERN="(^($ROOTDIRS))"
126 |     # Add a user-defined pattern to the cygpath arguments
127 |     if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 |         OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 |     fi
130 |     # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 |     i=0
132 |     for arg in "$@" ; do
133 |         CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 |         CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
135 | 
136 |         if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
137 |             eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 |         else
139 |             eval `echo args$i`="\"$arg\""
140 |         fi
141 |         i=$((i+1))
142 |     done
143 |     case $i in
144 |         (0) set -- ;;
145 |         (1) set -- "$args0" ;;
146 |         (2) set -- "$args0" "$args1" ;;
147 |         (3) set -- "$args0" "$args1" "$args2" ;;
148 |         (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 |         (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 |         (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 |         (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 |         (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 |         (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 |     esac
155 | fi
156 | 
157 | # Escape application args
158 | save ( ) {
159 |     for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 |     echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 | 
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 | 
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 |   cd "$(dirname "$0")"
170 | fi
171 | 
172 | exec "$JAVACMD" "$@"
173 | 


--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
 1 | @if "%DEBUG%" == "" @echo off
 2 | @rem ##########################################################################
 3 | @rem
 4 | @rem  Gradle startup script for Windows
 5 | @rem
 6 | @rem ##########################################################################
 7 | 
 8 | @rem Set local scope for the variables with windows NT shell
 9 | if "%OS%"=="Windows_NT" setlocal
10 | 
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 | 
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | 
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 | 
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 | 
61 | set CMD_LINE_ARGS=%*
62 | 
63 | :execute
64 | @rem Setup the command line
65 | 
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 | 
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 | 
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 | 
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if  not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 | 
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 | 
84 | :omega
85 | 


--------------------------------------------------------------------------------
/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'kohesive-elasticsearch-data-import-handler'


--------------------------------------------------------------------------------
/src/main/java/uy/kohesive/elasticsearch/dataimport/udf/Udfs.java:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport.udf;
 2 | 
 3 | import org.apache.spark.sql.SparkSession;
 4 | import org.apache.spark.sql.api.java.UDF1;
 5 | import org.apache.spark.sql.api.java.UDF2;
 6 | import org.apache.spark.sql.types.DataTypes;
 7 | 
 8 | public class Udfs {
 9 | 
10 |     public static <T extends String, RT extends String> void registerStringToStringUdf(final SparkSession spark, final String name, final UDF1<T, RT> f) {
11 |         spark.udf().register(name, f, DataTypes.StringType);
12 |     }
13 | 
14 |     public static <T1 extends Object, T2 extends Object, RT extends Object> void registerAnyAnyToTimestampUdf(final SparkSession spark, final String name, final UDF2<T1, T2, RT> f) {
15 |         spark.udf().register(name, f, DataTypes.TimestampType);
16 |     }
17 | 
18 |     public static <T1 extends String, T2 extends String, RT extends String> void registerStringStringToStringUdf(final SparkSession spark, final String name, final UDF2<T1, T2, RT> f) {
19 |         spark.udf().register(name, f, DataTypes.StringType);
20 |     }
21 | 
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/AlgoliaDataImportHandler.kt:
--------------------------------------------------------------------------------
  1 | package uy.kohesive.elasticsearch.dataimport
  2 | 
  3 | import com.algolia.search.APIClient
  4 | import com.algolia.search.ApacheAPIClientBuilder
  5 | import com.fasterxml.jackson.module.kotlin.readValue
  6 | import org.apache.spark.TaskContext
  7 | import org.apache.spark.sql.Dataset
  8 | import org.apache.spark.sql.Row
  9 | import org.apache.spark.sql.types.StructType
 10 | import org.elasticsearch.hadoop.cfg.PropertiesSettings
 11 | import org.elasticsearch.hadoop.serialization.json.JacksonJsonGenerator
 12 | import org.elasticsearch.hadoop.util.FastByteArrayOutputStream
 13 | import org.elasticsearch.spark.cfg.SparkSettingsManager
 14 | import org.elasticsearch.spark.sql.DataFrameValueWriter
 15 | import scala.Tuple2
 16 | import scala.collection.Iterator
 17 | import scala.runtime.AbstractFunction2
 18 | import java.io.File
 19 | import java.io.Serializable
 20 | 
 21 | class AlgoliaDataImportHandler(
 22 |     override val statement: DataImportStatement,
 23 |     override val configRelativeDir: File,
 24 | 
 25 |     targetAlgolia: AlgoliaTargetConnection
 26 | ) : StatementDataImportHandler, Serializable {
 27 | 
 28 |     val options = mapOf(
 29 |         "algolia.write.applicationid" to targetAlgolia.applicationId,
 30 |         "algolia.write.apikey"        to targetAlgolia.apiKey,
 31 |         "algolia.write.idfield"       to statement.idField,
 32 |         "algolia.write.index"         to statement.indexName
 33 |     )
 34 | 
 35 |     override fun prepareIndex() {
 36 |         // TODO: implement
 37 |     }
 38 | 
 39 |     override fun import(dataSet: Dataset<Row>): Long {
 40 |         AlgoliaSparkTaskRunner.runInSpark(dataSet, options, statement.getAction())
 41 |         return dataSet.count()
 42 |     }
 43 | }
 44 | 
 45 | object AlgoliaSparkTaskRunner {
 46 | 
 47 |     fun runInSpark(ds: Dataset<Row>, cfg: Map<String, String?>, action: StatementAction) {
 48 |         val sparkCtx = ds.sqlContext().sparkContext()
 49 |         val sparkCfg = SparkSettingsManager().load(sparkCtx.conf)
 50 | 
 51 |         val algoliaCfg = PropertiesSettings().load(sparkCfg.save())
 52 |         algoliaCfg.merge(cfg)
 53 | 
 54 |         val rdd = ds.toDF().rdd()
 55 | 
 56 |         val serializedSettings = algoliaCfg.save()
 57 |         val schema = ds.schema()
 58 | 
 59 |         sparkCtx.runJob<Row, Long>(
 60 |             rdd,
 61 |             object : AbstractFunction2<TaskContext, Iterator<Row>, Long>(), Serializable {
 62 |                 override fun apply(taskContext: TaskContext, data: Iterator<Row>): Long {
 63 |                     val task = when (action) {
 64 |                         StatementAction.Index  -> AlgoliaDataFrameWriter(schema, serializedSettings)
 65 |                         StatementAction.Delete -> AlgoliaObjectsDeleteTask(schema, serializedSettings)
 66 |                     }
 67 |                     task.write(taskContext, data)
 68 |                     return 0L
 69 |                 }
 70 |             },
 71 |             scala.reflect.`ClassTag$`.`MODULE$`.apply<Long>(Long::class.java)
 72 |         )
 73 |     }
 74 | 
 75 | }
 76 | 
 77 | abstract class AlgoliaDataFrameBufferedTask(val schema: StructType, serializedSettings: String) {
 78 | 
 79 |     companion object {
 80 |         val DefaultBulkSize = 50
 81 |     }
 82 | 
 83 |     protected val settings = PropertiesSettings().load(serializedSettings)
 84 | 
 85 |     protected val bulkSize = settings.getProperty("algolia.write.bulkSize")?.toInt() ?: DefaultBulkSize
 86 | 
 87 |     protected val idField: String? = settings.getProperty("algolia.write.idfield")
 88 | 
 89 |     protected val algoliaClient: APIClient = ApacheAPIClientBuilder(
 90 |         settings.getProperty("algolia.write.applicationid"),
 91 |         settings.getProperty("algolia.write.apikey")
 92 |     ).setObjectMapper(JSON).build()
 93 | 
 94 |     protected val targetIndex = algoliaClient.initIndex(settings.getProperty("algolia.write.index"), Map::class.java)
 95 | 
 96 |     protected val buffer = ArrayList<String>()
 97 | 
 98 |     private fun flush() {
 99 |         val objectsToWrite: List<Map<String, Any?>> = buffer.map { rowStr ->
100 |             JSON.readValue<Map<String, Any>>(rowStr).let { map ->
101 |                 if (idField != null) {
102 |                     map + ("objectID" to map[idField])
103 |                 } else {
104 |                     map
105 |                 }
106 |             }
107 |         }
108 |         if (objectsToWrite.isNotEmpty()) {
109 |             flush(objectsToWrite)
110 |             buffer.clear()
111 |         }
112 |     }
113 | 
114 |     abstract fun flush(objects: List<Map<String, Any?>>)
115 | 
116 |     fun write(taskContext: TaskContext, data: Iterator<Row>) {
117 |         fun tryFlush() {
118 |             if (buffer.size >= bulkSize) {
119 |                 flush()
120 |             }
121 |         }
122 | 
123 |         while (data.hasNext()) {
124 |             val row       = data.next()
125 |             val out       = FastByteArrayOutputStream()
126 |             val generator = JacksonJsonGenerator(out)
127 | 
128 |             generator.use { generator ->
129 |                 DataFrameValueWriter().write(Tuple2(row, schema), generator)
130 |             }
131 | 
132 |             buffer.add(out.toString())
133 |             tryFlush()
134 |         }
135 | 
136 |         flush()
137 |     }
138 | 
139 | }
140 | 
141 | class AlgoliaDataFrameWriter(schema: StructType, serializedSettings: String) : AlgoliaDataFrameBufferedTask(schema, serializedSettings) {
142 | 
143 |     override fun flush(objects: List<Map<String, Any?>>) {
144 |         targetIndex.addObjects(objects)
145 |     }
146 | 
147 | }
148 | 
149 | class AlgoliaObjectsDeleteTask(schema: StructType, serializedSettings: String) : AlgoliaDataFrameBufferedTask(schema, serializedSettings) {
150 | 
151 |     override fun flush(objects: List<Map<String, Any?>>) {
152 |         if (idField == null) {
153 |             throw IllegalStateException("Delete statement must have `idField` defined")
154 |         }
155 |         targetIndex.deleteObjects(objects.map { it[idField]?.toString() }.filterNotNull())
156 |     }
157 | 
158 | }
159 | 
160 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/AlgoliaStateManager.kt:
--------------------------------------------------------------------------------
  1 | package uy.kohesive.elasticsearch.dataimport
  2 | 
  3 | import com.algolia.search.APIClient
  4 | import com.algolia.search.ApacheAPIClientBuilder
  5 | import com.algolia.search.Index
  6 | import com.algolia.search.objects.IndexSettings
  7 | import com.algolia.search.objects.Query
  8 | import java.time.Instant
  9 | 
 10 | data class AlgoliaState(
 11 |     val targetIndex: String,
 12 |     val statementId: String,
 13 |     val status: String,
 14 |     val lastRunId: String,
 15 |     val lastErrorMsg: String?,
 16 |     val lastRunDate: Long,
 17 |     val lastRowCount: Long
 18 | )
 19 | 
 20 | data class AlgoliaLog(
 21 |     val targetIndex: String,
 22 |     val statementId: String,
 23 |     val runId: String,
 24 |     val status: String,
 25 |     val errorMsg: String?,
 26 |     val runDate: Long,
 27 |     val rowCount: Long
 28 | )
 29 | 
 30 | data class AlgoliaLock(
 31 |     val targetIndex: String,
 32 |     val statementId: String,
 33 |     val runId: String,
 34 |     val lockDate: Long
 35 | )
 36 | 
 37 | // TODO: this 'locking' mechanism is not safe, Algolia doesn't allow atomic updates
 38 | class AlgoliaStateManager(applicationId: String, apiKey: String) : StateManager {
 39 | 
 40 |     companion object {
 41 |         val StateIndex = "kohesive-dih-state"
 42 |         val LogIndex   = "kohesive-dih-log"
 43 |         val LockIndex  = "kohesive-dih-lock"
 44 |     }
 45 | 
 46 |     private lateinit var stateIndex: Index<AlgoliaState>
 47 |     private lateinit var logIndex: Index<AlgoliaLog>
 48 |     private lateinit var lockIndex: Index<AlgoliaLock>
 49 | 
 50 |     val algoliaClient: APIClient = ApacheAPIClientBuilder(applicationId, apiKey).setObjectMapper(JSON).build()
 51 | 
 52 |     override fun init() {
 53 |         fun <T> initIndex(indexName: String, clazz: Class<T>, settings: IndexSettings.() -> Unit): Index<T> {
 54 |             return algoliaClient.listIndices().firstOrNull { it.name == indexName }?.let {
 55 |                 algoliaClient.initIndex(indexName, clazz)
 56 |             } ?: kotlin.run {
 57 |                 algoliaClient.initIndex(indexName, clazz).apply {
 58 |                     this.settings = IndexSettings().apply {
 59 |                         settings()
 60 |                     }
 61 |                 }
 62 |             }
 63 |         }
 64 | 
 65 |         stateIndex = initIndex(StateIndex, AlgoliaState::class.java) {
 66 |             searchableAttributes          = listOf("targetIndex", "statementId", "status", "lastRunId", "lastErrorMsg")
 67 |             numericAttributesForFiltering = listOf("lastRunDate", "lastRowCount")
 68 |         }
 69 |         logIndex = initIndex(LogIndex, AlgoliaLog::class.java) {
 70 |             searchableAttributes          = listOf("targetIndex", "statementId", "runId", "status", "errorMsg")
 71 |             numericAttributesForFiltering = listOf("runDate", "rowCount")
 72 |         }
 73 |         lockIndex = initIndex(LockIndex, AlgoliaLock::class.java) {
 74 |             searchableAttributes          = listOf("targetIndex", "statementId", "runId")
 75 |             numericAttributesForFiltering = listOf("lockDate")
 76 |         }
 77 | 
 78 |         ttlKillOldLocks()
 79 |     }
 80 | 
 81 |     override fun lockStatement(runId: String, statement: DataImportStatement): Boolean {
 82 |         if (lockIndex.getObject(statement.stateKey()).isPresent) {
 83 |             ttlKillOldLocks()
 84 |             return pingLockStatement(runId, statement)
 85 |         } else {
 86 |             lockIndex.addObject(statement.stateKey(), AlgoliaLock(
 87 |                 targetIndex = statement.indexName,
 88 |                 statementId = statement.id,
 89 |                 runId       = runId,
 90 |                 lockDate    = now()
 91 |             ))
 92 |             return true
 93 |         }
 94 |     }
 95 | 
 96 |     override fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean {
 97 |         return lockIndex.getObject(statement.stateKey()).map { lock ->
 98 |             if (lock.runId == runId) {
 99 |                 lockIndex.partialUpdateObject(statement.stateKey(), lock.copy(
100 |                     lockDate = now()
101 |                 ))
102 |                 true
103 |             } else {
104 |                 throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} -- it is held by ${lock.runId} since ${lock.lockDate}")
105 |             }
106 |         }.orElse(false)
107 |     }
108 | 
109 |     private fun now() = Instant.now().toEpochMilli()
110 | 
111 |     private fun ttlKillOldLocks() {
112 |         try {
113 |             lockIndex.deleteByQuery(Query().setNumericFilters(listOf("lockDate < ${ now() - (1000 * 60 * 15) }")))
114 |         } catch (t: Throwable) {
115 |             throw DataImportException("State manager failed, TTL delete query for locks failed", t)
116 |         }
117 |     }
118 | 
119 |     override fun unlockStatement(runId: String, statement: DataImportStatement) {
120 |         if (pingLockStatement(runId, statement)) {
121 |             try {
122 |                 lockIndex.deleteObject(statement.stateKey())
123 |             } catch (t: Throwable) {
124 |                 throw DataImportException("State manager failed, cannot delete lock for ${statement.stateKey()}", t)
125 |             }
126 |         }
127 |     }
128 | 
129 |     override fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String?) {
130 |         stateIndex.addObject(statement.stateKey(), AlgoliaState(
131 |             targetIndex  = statement.indexName,
132 |             statementId  = statement.id,
133 |             lastErrorMsg = errMsg,
134 |             lastRowCount = lastRowCount,
135 |             lastRunDate  = lastRunStart.toEpochMilli(),
136 |             lastRunId    = runId,
137 |             status       = status
138 |         ))
139 |     }
140 | 
141 |     override fun readStateForStatement(runId: String, statement: DataImportStatement): Instant? {
142 |         return stateIndex.getObject(statement.stateKey()).map { Instant.ofEpochMilli(it.lastRunDate) }.orElse(null)
143 |     }
144 | 
145 |     override fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String?) {
146 |         logIndex.addObject("${statement.stateKey()}_run_${runId}", AlgoliaLog(
147 |             targetIndex = statement.indexName,
148 |             status      = status,
149 |             statementId = statement.id,
150 |             runId       = runId,
151 |             errorMsg    = errMsg,
152 |             rowCount    = rowCount,
153 |             runDate     = lastRunStart.toEpochMilli()
154 |         ))
155 |     }
156 | }


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/App.kt:
--------------------------------------------------------------------------------
  1 | package uy.kohesive.elasticsearch.dataimport
  2 | 
  3 | import com.fasterxml.jackson.databind.ObjectMapper
  4 | import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
  5 | import com.fasterxml.jackson.module.kotlin.readValue
  6 | import com.typesafe.config.ConfigFactory
  7 | import com.typesafe.config.ConfigRenderOptions
  8 | import com.typesafe.config.ConfigResolveOptions
  9 | import org.apache.spark.sql.AnalysisException
 10 | import org.apache.spark.sql.SparkSession
 11 | import org.apache.spark.sql.catalyst.parser.ParseException
 12 | import org.apache.spark.storage.StorageLevel
 13 | import java.io.File
 14 | import java.io.InputStream
 15 | import java.io.InputStreamReader
 16 | import java.net.URL
 17 | import java.net.URLClassLoader
 18 | import java.sql.Timestamp
 19 | import java.time.Instant
 20 | import java.time.LocalDateTime
 21 | import java.time.ZoneOffset
 22 | import java.time.temporal.ChronoUnit
 23 | import java.util.*
 24 | 
 25 | class App {
 26 |     companion object {
 27 |         @JvmStatic fun main(args: Array<String>) {
 28 |             println("Kohesive - Elasticsearch Data Import Utility")
 29 |             val configFile = args.takeIf { it.isNotEmpty() }?.let { File(it[0]).normalize().absoluteFile }
 30 | 
 31 |             if (configFile == null || !configFile.exists()) {
 32 |                 println("  ERROR:  configFile must be specified and must exist")
 33 |                 configFile?.let { println("${it.absolutePath} does not exist") }
 34 |                 println("  usage:  App <configFile.json>")
 35 |                 println()
 36 |                 System.exit(-1)
 37 |             }
 38 | 
 39 |             try {
 40 |                 App().run(configFile!!.inputStream(), configFile.parentFile)
 41 |             } catch (ex: Throwable) {
 42 |                 System.err.println("Data import failed due to:")
 43 |                 System.err.println(ex.message)
 44 |                 System.err.println()
 45 |                 System.err.println("Debug stack trace:")
 46 |                 ex.printStackTrace()
 47 |                 System.exit(-1)
 48 |             }
 49 |         }
 50 |     }
 51 | 
 52 |     enum class ImportTarget {
 53 |         ElasticSearch,
 54 |         Algolia
 55 |     }
 56 | 
 57 |     fun run(configInput: InputStream, configRelativeDir: File) {
 58 |         val hoconCfg = ConfigFactory.parseReader(InputStreamReader(configInput)).resolve(ConfigResolveOptions.noSystem())
 59 |         val jsonCfg  = hoconCfg.root().render(ConfigRenderOptions.concise().setJson(true))
 60 |         val cfg      = jacksonObjectMapper().readValue<DataImportHandlerConfig>(jsonCfg)
 61 | 
 62 |         val uniqueId = UUID.randomUUID().toString()
 63 | 
 64 |         val sparkMaster = cfg.sparkMaster ?: "local[${(Runtime.getRuntime().availableProcessors() - 1).coerceAtLeast(1)}]"
 65 | 
 66 |         fun fileRelativeToConfig(filename: String): File {
 67 |             return configRelativeDir.resolve(filename).canonicalFile
 68 |         }
 69 | 
 70 |         println("Connecting to target ES/Algolia to check state...")
 71 |         val stateMap: Map<String, StateManager> = cfg.importSteps.map { importStep ->
 72 |             val mgr = if (importStep.targetElasticsearch != null) {
 73 |                 ElasticSearchStateManager(
 74 |                     nodes     = importStep.targetElasticsearch.nodes,
 75 |                     port      = importStep.targetElasticsearch.port ?: 9200,
 76 |                     enableSsl = importStep.targetElasticsearch.enableSsl ?: false,
 77 |                     auth      = importStep.targetElasticsearch.basicAuth
 78 |                 )
 79 |             } else if (importStep.targetAlgolia != null) {
 80 |                 AlgoliaStateManager(
 81 |                     applicationId = importStep.targetAlgolia.applicationId,
 82 |                     apiKey        = importStep.targetAlgolia.apiKey
 83 |                 )
 84 |             } else {
 85 |                 throw IllegalStateException(importStep.description + " import step neither declares ES nor Algolia target")
 86 |             }
 87 | 
 88 |             mgr.init()
 89 | 
 90 |             importStep.statements.map { statement ->
 91 |                 statement.id to mgr
 92 |             }
 93 |         }.flatten().toMap()
 94 | 
 95 |         val NOSTATE = LocalDateTime.of(1900, 1, 1, 0, 0, 0, 0).atZone(ZoneOffset.UTC).toInstant()
 96 | 
 97 |         fun DataImportStatement.validate(importTarget: ImportTarget) {
 98 |             // do a little validation of the ..
 99 |             if (newIndexSettingsFile != null) {
100 |                 val checkFile = fileRelativeToConfig(newIndexSettingsFile)
101 |                 if (!checkFile.exists()) {
102 |                     throw IllegalStateException("The statement '${id}' new-index mapping file must exist: $checkFile")
103 |                 }
104 |             }
105 |             if (importTarget == ImportTarget.ElasticSearch && indexType == null && type == null) {
106 |                 throw IllegalArgumentException("The statement '${id}' is missing `indexType`")
107 |             }
108 |             if (type != null && indexType == null) {
109 |                 System.err.println("     Statement configuration parameter `type` is deprecated, use `indexType`")
110 |             }
111 |             if (sqlQuery == null && sqlFile == null) {
112 |                 throw IllegalArgumentException("The statement '${id}' is missing one of `sqlQuery` or `sqlFile`")
113 |             }
114 |             if (sqlQuery != null && sqlFile != null) {
115 |                 throw IllegalArgumentException("The statement '${id}' should have only one of `sqlQuery` or `sqlFile`")
116 |             }
117 |             if (sqlFile != null && !fileRelativeToConfig(sqlFile).exists()) {
118 |                 throw IllegalArgumentException("The statement '${id}' `sqlFile` must exist")
119 |             }
120 |         }
121 |         
122 |         val lastRuns: Map<String, Instant> = cfg.importSteps.map { importStep ->
123 |             val importTarget = if (importStep.targetElasticsearch != null) ImportTarget.ElasticSearch else ImportTarget.Algolia
124 |             importStep.statements.map { statement ->
125 |                 val lastState = stateMap.get(statement.id)!!.readStateForStatement(uniqueId, statement)?.truncatedTo(ChronoUnit.SECONDS) ?: NOSTATE
126 |                 println("  Statement ${statement.id} - ${statement.description}")
127 |                 println("     LAST RUN: ${if (lastState == NOSTATE) "never" else lastState.toIsoString()}")
128 | 
129 |                 statement.validate(importTarget)
130 |                 statement.id to lastState
131 |             }
132 |         }.flatten().toMap()
133 | 
134 |         cfg.prepStatements?.forEach { statement ->
135 |             if (statement.sqlQuery == null && statement.sqlFile == null) {
136 |                 throw IllegalArgumentException("A prepStatement is missing one of `sqlQuery` or `sqlFile`")
137 |             }
138 |             if (statement.sqlQuery != null && statement.sqlFile != null) {
139 |                 throw IllegalArgumentException("A prepStatement should have only one of `sqlQuery` or `sqlFile`")
140 |             }
141 |             if (statement.sqlFile != null && !fileRelativeToConfig(statement.sqlFile).exists()) {
142 |                 throw IllegalArgumentException("A prepStatement `sqlFile` must exist:  ${statement.sqlFile}")
143 |             }
144 |         }
145 | 
146 |         val thisRunDate = Instant.now().truncatedTo(ChronoUnit.SECONDS)
147 | 
148 |         val oldClassLoader = Thread.currentThread().contextClassLoader
149 |         val extraClasspath = cfg.sources.jdbc?.map { it.driverJars }?.filterNotNull()?.flatten()?.map { URL("file://${File(it).normalize().absolutePath}") }?.toTypedArray() ?: emptyArray()
150 |         val newClassLoader = URLClassLoader(extraClasspath, oldClassLoader)
151 | 
152 |         Thread.currentThread().contextClassLoader = newClassLoader
153 |         try {
154 |             SparkSession.builder()
155 |                     .appName("esDataImport-${uniqueId}")
156 |                     .config("spark.ui.enabled", false)
157 |                     .apply {
158 |                         cfg.sparkConfig?.forEach {
159 |                             config(it.key,  it.value)
160 |                         }
161 |                     }
162 |                     .master(sparkMaster).getOrCreate().use { spark ->
163 | 
164 |                 // add extra UDF functions
165 |                 DataImportHandlerUdfs.registerSparkUdfs(spark)
166 | 
167 |                 // setup FILE inputs
168 |                 println()
169 |                 cfg.sources.filesystem?.forEach { filesystem ->
170 |                     println("Mounting filesystem ${filesystem.directory}")
171 |                     val dir = File(filesystem.directory).normalize()
172 |                     if (!dir.exists()) {
173 |                         throw DataImportException("Invalid filesystem directory: ${dir} does not exist")
174 |                     }
175 |                     filesystem.tables.forEach { table ->
176 |                         println("Create table ${table.sparkTable} from filespec ${table.filespecs.joinToString()}")
177 |                         val options = mutableMapOf<String, String>()
178 |                         filesystem.settings?.let { options.putAll(it) }
179 |                         table.settings?.let { options.putAll(it) }
180 |                         val fileSpecs = table.filespecs.map { "${dir.absolutePath}${File.separatorChar}${it}" }.toTypedArray()
181 |                         spark.read().format(table.format)
182 |                                 .options(options)
183 |                                 .load(*fileSpecs)
184 |                                 .createOrReplaceTempView(table.sparkTable)
185 |                     }
186 |                 }
187 | 
188 |                 // setup JDBC inputs
189 |                 println()
190 |                 cfg.sources.jdbc?.forEach { jdbc ->
191 |                     println("Mounting JDBC source ${jdbc.jdbcUrl}")
192 |                     jdbc.tables.forEach { table ->
193 |                         println("Creating table ${table.sparkTable} from JDBC ${table.sourceTable}")
194 |                         val sourceTable = table.sourceTable.takeIf { '.' in it } ?: if (jdbc.defaultSchema.isNullOrBlank()) table.sourceTable else "${jdbc.defaultSchema}.${table.sourceTable}"
195 |                         val options = mutableMapOf("url" to jdbc.jdbcUrl,
196 |                                 "driver" to jdbc.driverClass,
197 |                                 "user" to jdbc.auth.username,
198 |                                 "password" to jdbc.auth.password,
199 |                                 "dbtable" to sourceTable)
200 |                         jdbc.settings?.let { options.putAll(it) }
201 |                         table.settings?.let { options.putAll(it) }
202 |                         spark.read().format("jdbc")
203 |                                 .options(options)
204 |                                 .load()
205 |                                 .createOrReplaceTempView(table.sparkTable)
206 |                     }
207 |                 }
208 | 
209 |                 // setup ES inputs
210 |                 println()
211 |                 cfg.sources.elasticsearch?.forEach { es ->
212 |                     println("Mounting Elasticsearch source ${es.nodes.joinToString()}")
213 |                     es.tables.forEach { table ->
214 |                         val indexType = table.indexType ?: table.type
215 |                         println("Creating table ${table.sparkTable} from Elasticsearch index ${table.indexName}/${indexType}")
216 |                         if (indexType == null) {
217 |                             throw IllegalArgumentException("  Source configuration is missing parameter `indexType`")
218 |                         }
219 |                         if (table.type != null) {
220 |                             System.err.println("  Source configuration parameter `type` is deprecated, use `indexType`")
221 |                         }
222 |                         val options = mutableMapOf("es.nodes" to es.nodes.joinToString(","))
223 |                         es.basicAuth?.let {
224 |                             options.put("es.net.http.auth.user", it.username)
225 |                             options.put("es.net.http.auth.pass", it.password)
226 |                         }
227 |                         es.port?.let {
228 |                             options.put("es.port", it.toString())
229 |                         }
230 |                         es.enableSsl?.let {
231 |                             options.put("es.net.ssl", it.toString())
232 |                         }
233 | 
234 |                         es.settings?.let { options.putAll(it) }
235 |                         table.settings?.let { options.putAll(it) }
236 | 
237 |                         if (table.esQuery != null) {
238 |                             if (table.esQuery is Map<*, *>) {
239 |                                 @Suppress("UNCHECKED_CAST")
240 |                                 val root = if (table.esQuery.containsKey("query")) table.esQuery else mapOf("query" to table.esQuery)
241 |                                 options.put("es.query", ObjectMapper().writeValueAsString(root).replace('\n', ' '))
242 |                             } else {
243 |                                 options.put("es.query", table.esQuery.toString())
244 |                             }
245 |                         } else {
246 |                             // defaults to match_all
247 |                         }
248 | 
249 |                         spark.read().format("org.elasticsearch.spark.sql")
250 |                                 .options(options)
251 |                                 .load(indexSpec(table.indexName, indexType))
252 |                                 .createOrReplaceTempView(table.sparkTable)
253 |                     }
254 |                 }
255 | 
256 |                 // run prep-queries
257 |                 println()
258 |                 cfg.prepStatements?.forEach { statement ->
259 |                     try {
260 |                         println("\nRunning prep-statement:\n${statement.description.replaceIndent("  ")}")
261 |                         val rawQuery = statement.sqlQuery ?: fileRelativeToConfig(statement.sqlFile!!).readText()
262 |                         spark.sql(rawQuery).let {
263 |                             if (statement.cache ?: false) {
264 |                                 val storeLevel = statement.persist?.let { StorageLevel.fromString(it) }
265 |                                 if (storeLevel != null) {
266 |                                     it.persist(storeLevel)
267 |                                 } else {
268 |                                     it.cache()
269 |                                 }
270 |                             } else {
271 |                                 it
272 |                             }
273 |                         }
274 |                     } catch (ex: Throwable) {
275 |                         val msg = ex.toNiceMessage()
276 |                         throw DataImportException("Prep Statement: ${statement.description}\n$msg", ex)
277 |                     }
278 |                     println()
279 |                 }
280 | 
281 |                 fun Importer.getDataImportHandler(statement: DataImportStatement): StatementDataImportHandler {
282 |                     return if (targetElasticsearch != null) {
283 |                         EsDataImportHandler(statement, configRelativeDir, targetElasticsearch)
284 |                     } else if (targetAlgolia != null) {
285 |                         AlgoliaDataImportHandler(statement, configRelativeDir, targetAlgolia)
286 |                     } else {
287 |                         throw IllegalStateException(description + " import step neither declares ES nor Algolia target")
288 |                     }
289 |                 }
290 | 
291 |                 // run importers
292 |                 println()
293 |                 cfg.importSteps.forEach { import ->
294 |                     println("\nRunning importer:\n${import.description.replaceIndent("  ")}")
295 |                     import.statements.forEach { statement ->
296 |                         val stateMgr = stateMap.get(statement.id)!!
297 |                         val lastRun  = lastRuns.get(statement.id)!!
298 | 
299 |                         // SQL times will be local time zone, so much match the server
300 |                         val sqlMinDate = Timestamp.from(lastRun).toString()
301 |                         val sqlMaxDate = Timestamp.from(thisRunDate).toString()
302 | 
303 |                         val dateMsg = if (lastRun == NOSTATE) {
304 |                             "range NEVER to '$sqlMaxDate'"
305 |                         } else {
306 |                             "range '$sqlMinDate' to '$sqlMaxDate'"
307 |                         }
308 | 
309 |                         println("\n    Execute statement:  ($dateMsg)\n${statement.description.replaceIndent("        ")}")
310 | 
311 |                         if (!stateMgr.lockStatement(uniqueId, statement)) {
312 |                             System.err.println("        Cannot acquire lock for statement ${statement.id}")
313 |                         } else {
314 |                             try {
315 |                                 val importHandler = import.getDataImportHandler(statement)
316 |                                 importHandler.prepareIndex()
317 | 
318 |                                 val rawQuery       = statement.sqlQuery ?: fileRelativeToConfig(statement.sqlFile!!).readText()
319 |                                 val subDataInQuery = rawQuery.replace("{lastRun}", sqlMinDate).replace("{thisRun}", sqlMaxDate)
320 |                                 val sqlResults     = try {
321 |                                     spark.sql(subDataInQuery).let {
322 |                                         if (statement.cache ?: false) {
323 |                                             val storeLevel = statement.persist?.let { StorageLevel.fromString(it) }
324 |                                             if (storeLevel != null) {
325 |                                                 it.persist(storeLevel)
326 |                                             } else {
327 |                                                 it.cache()
328 |                                             }
329 |                                         } else {
330 |                                             it
331 |                                         }
332 |                                     }
333 |                                 } catch (ex: Throwable) {
334 |                                     val msg = ex.toNiceMessage()
335 |                                     throw DataImportException(msg, ex)
336 |                                 }
337 | 
338 |                                 val rowCount = importHandler.import(sqlResults)
339 |                                 println("        Rows processed: $rowCount")
340 | 
341 |                                 stateMgr.writeStateForStatement(uniqueId, statement, thisRunDate, "success", rowCount, null)
342 |                                 stateMgr.logStatement(uniqueId, statement, thisRunDate, "success", rowCount, null)
343 |                             } catch (ex: Throwable) {
344 |                                 val msg = ex.message ?: "unknown failure"
345 |                                 stateMgr.writeStateForStatement(uniqueId, statement, lastRun, "error", 0, msg)
346 |                                 stateMgr.logStatement(uniqueId, statement, thisRunDate, "error", 0, msg)
347 |                                 // System.err.println("\nProcess FAILED:  \n$msg\n")
348 |                                 throw ex
349 |                             } finally {
350 |                                 stateMgr.unlockStatement(uniqueId, statement)
351 |                             }
352 |                         }
353 |                     }
354 |                 }
355 | 
356 |                 println("\nShutting down...")
357 |             }
358 |             println("\nDONE.")
359 |         } finally {
360 |             Thread.currentThread().contextClassLoader = oldClassLoader
361 |         }
362 |     }
363 | }
364 | 
365 | fun Throwable.toNiceMessage(): String = when (this) {
366 |     is AnalysisException -> """Error:(${this.line().takeIf { it.isDefined }?.toString() ?: "?"},${this.startPosition().takeIf { it.isDefined }?.toString() ?: "?"}) ${this.message}"""
367 |     is ParseException -> """Error:(${this.line().takeIf { it.isDefined }?.toString() ?: "?"},${this.startPosition().takeIf { it.isDefined }?.toString() ?: "?"}) ${this.message}"""
368 |     else -> this.message ?: "unknown error"
369 | }
370 | 
371 | 
372 | fun indexSpec(indexName: String, type: String?): String {
373 |     return "${indexName}/${type?.trim() ?: ""}"
374 | }
375 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Config.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | 
 4 | data class DataImportHandlerConfig(val sparkMaster: String? = null,
 5 |                                    val sources: Connections,
 6 |                                    val prepStatements: List<PrepStatement>? = null,
 7 |                                    val importSteps: List<Importer>,
 8 |                                    val sparkConfig: Map<String, String>? = null)
 9 | 
10 | data class AuthInfo(val username: String, val password: String)
11 | 
12 | data class Connections(val elasticsearch: List<EsConnection>?,
13 |                        val jdbc: List<JdbcConnection>? = null,
14 |                        val filesystem: List<FileDir>? = null)
15 | 
16 | data class EsConnection(val nodes: List<String>,
17 |                         val basicAuth: AuthInfo? = null,
18 |                         val port: Int? = 9200,
19 |                         val enableSsl: Boolean? = false,
20 |                         val tables: List<EsSource>,
21 |                         val settings: Map<String, String>? = null) {
22 | }
23 | 
24 | data class JdbcConnection(val jdbcUrl: String,
25 |                           val driverClass: String,
26 |                           val defaultSchema: String,
27 |                           val auth: AuthInfo,
28 |                           val driverJars: List<String>? = null,
29 |                           val tables: List<JdbcSource>,
30 |                           val settings: Map<String, String>? = null)
31 | 
32 | data class FileDir(val directory: String,
33 |                    val tables: List<FileSource>,
34 |                    val settings: Map<String, String>? = null)
35 | 
36 | data class JdbcSource(val sparkTable: String,
37 |                       val sourceTable: String,
38 |                       val settings: Map<String, String>? = null)
39 | 
40 | data class FileSource(val sparkTable: String, val format: String, val filespecs: List<String>,
41 |                       val settings: Map<String, String>? = null)
42 | 
43 | data class EsSource(val sparkTable: String, val indexName: String, val type: String?, val indexType: String?, val esQuery: Any? = null,
44 |                     val settings: Map<String, String>? = null)
45 | 
46 | data class PrepStatement(val description: String, val sqlQuery: String?, val sqlFile: String?, val cache: Boolean? = null, val persist: String? = "MEMORY_ONLY")
47 | 
48 | data class Importer(val description: String,
49 |                     val targetElasticsearch: EsTargetConnection?,
50 |                     val targetAlgolia: AlgoliaTargetConnection?,
51 |                     val statements: List<DataImportStatement>)
52 | 
53 | data class AlgoliaTargetConnection(val applicationId: String, val apiKey: String)
54 | 
55 | data class EsTargetConnection(val nodes: List<String>,
56 |                               val basicAuth: AuthInfo? = null,
57 |                               val port: Int? = 9200,
58 |                               val enableSsl: Boolean? = false,
59 |                               val settings: Map<String, String>? = null)
60 | 
61 | data class DataImportStatement(val id: String,
62 |                                val description: String,
63 |                                val indexName: String,
64 |                                val indexType: String?,
65 |                                val type: String?,
66 |                                val action: String?,
67 |                                val idField: String?,
68 |                                val newIndexSettingsFile: String?,
69 |                                val sqlQuery: String?,
70 |                                val sqlFile: String?,
71 |                                val cache: Boolean? = null,
72 |                                val persist: String? = "MEMORY_ONLY",
73 |                                val settings: Map<String, String>? = null) {
74 | 
75 |     fun getAction(): StatementAction = if (action?.toLowerCase() == "delete") {
76 |         StatementAction.Delete
77 |     } else {
78 |         StatementAction.Index
79 |     }
80 | 
81 | }
82 | 
83 | enum class StatementAction {
84 |     Index,
85 |     Delete
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/DataImportHandler.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | import org.apache.spark.sql.Dataset
 4 | import org.apache.spark.sql.Row
 5 | import java.io.File
 6 | 
 7 | interface StatementDataImportHandler {
 8 | 
 9 |     val configRelativeDir: File
10 | 
11 |     val statement: DataImportStatement
12 | 
13 |     fun prepareIndex()
14 | 
15 |     /**
16 |      * Returns processed rows count.
17 |      */
18 |     fun import(dataSet: Dataset<Row>): Long
19 | 
20 |     fun fileRelativeToConfig(filename: String): File = configRelativeDir.resolve(filename).canonicalFile
21 | 
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/EsDataImportHandler.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | import org.apache.spark.sql.Dataset
 4 | import org.apache.spark.sql.Row
 5 | import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL
 6 | import java.io.File
 7 | 
 8 | class EsDataImportHandler(
 9 |     override val statement: DataImportStatement,
10 |     override val configRelativeDir: File,
11 | 
12 |     targetElasticsearch: EsTargetConnection
13 | ) : StatementDataImportHandler {
14 | 
15 |     val options = mutableMapOf("es.nodes" to targetElasticsearch.nodes.joinToString(","))
16 | 
17 |     init {
18 |         // look for table create setting
19 |         targetElasticsearch.basicAuth?.let {
20 |             options.put("es.net.http.auth.user", it.username)
21 |             options.put("es.net.http.auth.pass", it.password)
22 |         }
23 |         targetElasticsearch.port?.let { port ->
24 |             options.put("es.port", port.toString())
25 |         }
26 |         targetElasticsearch.enableSsl?.let { enableSsl ->
27 |             options.put("es.net.ssl", enableSsl.toString())
28 |         }
29 |         targetElasticsearch.settings?.let { options.putAll(it) }
30 |         statement.settings?.let { options.putAll(it) }
31 |     }
32 | 
33 |     val esClient = MicroEsClient(targetElasticsearch.nodes,
34 |         targetElasticsearch.port ?: 9200,
35 |         targetElasticsearch.enableSsl ?: false,
36 |         targetElasticsearch.basicAuth
37 |     )
38 | 
39 |     override fun import(dataSet: Dataset<Row>): Long {
40 |         JavaEsSparkSQL.saveToEs(dataSet, indexSpec(statement.indexName, statement.indexType ?: statement.type), options)
41 |         return dataSet.count()
42 |     }
43 | 
44 |     override fun prepareIndex() {
45 |         val autocreate: Boolean = options.getOrDefault("es.index.auto.create", "true").toBoolean()
46 | 
47 |         val indexExists = esClient.checkIndexExists(statement.indexName)
48 |         if (!autocreate) {
49 |             if (!indexExists) {
50 |                 throw IllegalStateException("Index auto-create setting 'es.index.auto.create' is false and index ${statement.indexName} does not exist.")
51 |             }
52 |         } else {
53 |             if (!indexExists) {
54 |                 println("        Index ${statement.indexName} does not exist, auto creating")
55 |                 if (statement.newIndexSettingsFile != null) {
56 |                     val checkFile = fileRelativeToConfig(statement.newIndexSettingsFile)
57 |                     println("        Creating ${statement.indexName} with settings/mapping file: $checkFile")
58 |                     val response = esClient.createIndex(statement.indexName, checkFile.readText())
59 |                     if (!response.isSuccess) {
60 |                         throw IllegalStateException("Could not create index ${statement.indexName} with settings/mapping file: $checkFile, due to:\n${response.responseJson}")
61 |                     }
62 |                 }  else {
63 |                     println("        Index will be created without settings/mappings file, will use index templates or dynamic mappings")
64 |                 }
65 |             }
66 |         }
67 |     }
68 | }


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Exceptions.kt:
--------------------------------------------------------------------------------
1 | package uy.kohesive.elasticsearch.dataimport
2 | 
3 | class DataImportException(msg: String, cause: Throwable? = null) : Exception(msg, cause)
4 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/MicroEsClient.kt:
--------------------------------------------------------------------------------
  1 | package uy.kohesive.elasticsearch.dataimport
  2 | 
  3 | import com.fasterxml.jackson.module.kotlin.readValue
  4 | import okhttp3.*
  5 | 
  6 | /**
  7 |  * TODO:  Change to use RestClient from ES / Spark integration
  8 |  */
  9 | 
 10 | class MicroEsClient(nodes: List<String>, port: Int = 9200, enableSsl: Boolean = false, auth: AuthInfo? = null) {
 11 |     val http = OkHttpClient().newBuilder().apply {
 12 |         auth?.let { addInterceptor(BasicAuthInterceptor(it)) }
 13 |     }.build()
 14 |     val protocol = if (enableSsl) "https" else "http"
 15 |     val host = nodes.first()
 16 |     val hostWithPort = if (':' in host.substringAfter('@', host)) host else "${host}:${port}"
 17 |     val url = "${protocol}://${hostWithPort}"
 18 | 
 19 |     fun String.fixRestAppendage(): String {
 20 |         if (this.startsWith("?")) return this
 21 |         if (this.startsWith("/")) return this
 22 |         return "/" + this
 23 |     }
 24 | 
 25 |     fun makeIndexTypeUrl(index: String, type: String) = "$url/$index/$type"
 26 |     fun makeIndexTypeUrl(index: String, type: String, restOfUrl: String) = "$url/$index/$type${restOfUrl.fixRestAppendage()}"
 27 |     fun makeIndexTypeIdUrl(index: String, type: String, id: String, restOfUrl: String) = "$url/$index/$type/$id${restOfUrl.fixRestAppendage()}"
 28 | 
 29 |     private fun OkHttpClient.get(url: String): CallResponse {
 30 |         val request = Request.Builder().url(url).build()
 31 |         val response = http.newCall(request).execute()
 32 |         return CallResponse(response.code(), response.use { it.body()?.string() ?: "" })
 33 |     }
 34 | 
 35 |     private fun OkHttpClient.delete(url: String): CallResponse {
 36 |         val request = Request.Builder().url(url).delete().build()
 37 |         val response = http.newCall(request).execute()
 38 |         return CallResponse(response.code(), response.use { it.body()?.string() ?: "" })
 39 |     }
 40 | 
 41 |     private fun OkHttpClient.delete(url: String, jsonBody: String): CallResponse {
 42 |         val jsonMediaType = MediaType.parse("application/json; charset=utf-8")
 43 |         val body = RequestBody.create(jsonMediaType, jsonBody)
 44 |         val request = Request.Builder().url(url).delete(body).build()
 45 |         val response = http.newCall(request).execute()
 46 |         return CallResponse(response.code(), response.use { it.body()?.string() ?: "" })
 47 |     }
 48 | 
 49 |     private fun OkHttpClient.post(url: String, jsonBody: String): CallResponse {
 50 |         val jsonMediaType = MediaType.parse("application/json; charset=utf-8")
 51 |         val body = RequestBody.create(jsonMediaType, jsonBody)
 52 |         val request = Request.Builder().url(url).post(body).build()
 53 |         val response = http.newCall(request).execute()
 54 |         return CallResponse(response.code(), response.use { it.body()?.string() ?: "" })
 55 |     }
 56 | 
 57 |     private fun OkHttpClient.put(url: String, jsonBody: String): CallResponse {
 58 |         val jsonMediaType = MediaType.parse("application/json; charset=utf-8")
 59 |         val body = RequestBody.create(jsonMediaType, jsonBody)
 60 |         val request = Request.Builder().url(url).put(body).build()
 61 |         val response = http.newCall(request).execute()
 62 |         return CallResponse(response.code(), response.use { it.body()?.string() ?: "" })
 63 |     }
 64 | 
 65 |     fun indexTypePOST(indexName: String, indexType: String, restOfUrl: String, postJson: String): CallResponse {
 66 |         return http.post(makeIndexTypeUrl(indexName, indexType, restOfUrl), postJson)
 67 |     }
 68 | 
 69 |     fun indexTypeIdPOST(indexName: String, indexType: String, id: String, restOfUrl: String, postJson: String): CallResponse {
 70 |         return http.post(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl), postJson)
 71 |     }
 72 | 
 73 |     fun indexTypeDELETE(indexName: String, indexType: String, restOfUrl: String): CallResponse {
 74 |         return http.delete(makeIndexTypeUrl(indexName, indexType, restOfUrl))
 75 |     }
 76 | 
 77 |     fun indexTypeIdDELETE(indexName: String, indexType: String, id: String, restOfUrl: String): CallResponse {
 78 |         return http.delete(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl))
 79 |     }
 80 | 
 81 |     fun indexTypeGET(indexName: String, indexType: String): CallResponse {
 82 |         return http.get(makeIndexTypeUrl(indexName, indexType))
 83 |     }
 84 | 
 85 |     fun indexTypeGET(indexName: String, indexType: String, restOfUrl: String): CallResponse {
 86 |         return http.get(makeIndexTypeUrl(indexName, indexType, restOfUrl))
 87 |     }
 88 | 
 89 |     fun indexTypeIdGET(indexName: String, indexType: String, id: String): CallResponse {
 90 |         return http.get(makeIndexTypeUrl(indexName, indexType, id))
 91 |     }
 92 | 
 93 |     fun indexTypeIdGET(indexName: String, indexType: String, id: String, restOfUrl: String): CallResponse {
 94 |         return http.get(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl))
 95 |     }
 96 | 
 97 |     fun createIndex(indexName: String, settingsJson: String): CallResponse {
 98 |         return http.put("${url}/${indexName}", settingsJson)
 99 |     }
100 | 
101 |     fun waitForIndexGreen(indexName: String) {
102 |         val response = http.get("${url}/_cluster/health/${indexName}?wait_for_status=green&timeout=10s")
103 |         if (!response.isSuccess) throw DataImportException("State manager failed, cannot check state index status")
104 |         val state = JSON.readTree(response.responseJson)
105 |         if (state.get("timed_out").asBoolean()) throw DataImportException("State manager failed, timeout waiting on state index to be 'green'")
106 |         if (state.get("status").asText() != "green") throw DataImportException("State manager failed, state index must be 'green' but was '${state.get("status")}'")
107 |     }
108 | 
109 |     fun checkIndexExists(indexName: String): Boolean {
110 |         return http.get("${url}/${indexName}").isSuccess
111 |     }
112 | 
113 |     inline fun <reified T : Any> mapFromSource(response: String): T = JSON.readTree(response).get("_source").traverse().let { JSON.readValue<T>(it) }!!
114 | 
115 |     data class CallResponse(val code: Int, val responseJson: String) {
116 |         val isSuccess: Boolean get() = code in 200..299
117 |     }
118 | }
119 | 
120 | class BasicAuthInterceptor(val authInfo: AuthInfo) : Interceptor {
121 |     override fun intercept(chain: Interceptor.Chain): Response {
122 |         val request = chain.request()
123 |         val requestWithAuth = request.newBuilder().header("Authorization", Credentials.basic(authInfo.username, authInfo.password)).build()
124 |         return chain.proceed(requestWithAuth)
125 |     }
126 | }


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/State.kt:
--------------------------------------------------------------------------------
  1 | package uy.kohesive.elasticsearch.dataimport
  2 | 
  3 | import java.time.Instant
  4 | 
  5 | interface StateManager {
  6 |     fun init()
  7 |     fun lockStatement(runId: String, statement: DataImportStatement): Boolean
  8 |     fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean
  9 |     fun unlockStatement(runId: String, statement: DataImportStatement)
 10 |     fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String? = null)
 11 |     fun readStateForStatement(runId: String, statement: DataImportStatement): Instant?
 12 |     fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String? = null)
 13 | }
 14 | 
 15 | fun DataImportStatement.stateKey(): String = this.indexName + "-" + this.id
 16 | 
 17 | // TODO: better state management
 18 | // This is NOT using the ES client because we do not want conflicts with Spark dependencies
 19 | class ElasticSearchStateManager(val nodes: List<String>, val port: Int = 9200, val enableSsl: Boolean = false, val auth: AuthInfo?) : StateManager {
 20 |     val STATE_INDEX = ".kohesive-dih-state-v2"
 21 |     val esClient = MicroEsClient(nodes, port, enableSsl, auth)
 22 | 
 23 |     override fun init() {
 24 |         if (esClient.checkIndexExists(STATE_INDEX)) {
 25 |             esClient.waitForIndexGreen(STATE_INDEX)
 26 |         } else {
 27 |             val response = esClient.createIndex(STATE_INDEX, """
 28 |                {
 29 |                   "settings": {
 30 |                       "number_of_shards": 1,
 31 |                       "number_of_replicas": "0"
 32 |                   },
 33 |                   "mappings": {
 34 |                      "state": {
 35 |                          "properties": {
 36 |                              "targetIndex": { "type": "keyword" },
 37 |                              "statementId": { "type": "keyword" },
 38 |                              "lastRunDate": { "type": "date" },
 39 |                              "status": { "type": "keyword" },
 40 |                              "lastRunId": { "type": "keyword" },
 41 |                              "lastErrorMsg": { "type": "text" },
 42 |                              "lastRowCount": { "type": "long" }
 43 |                          }
 44 |                      },
 45 |                      "log": {
 46 |                         "properties": {
 47 |                              "targetIndex": { "type": "keyword" },
 48 |                              "statementId": { "type": "keyword" },
 49 |                              "runId": { "type": "keyword" },
 50 |                              "runDate": { "type": "date" },
 51 |                              "status": { "type": "keyword" },
 52 |                              "errorMsg": { "type": "text" },
 53 |                              "rowCount": { "type": "long" }
 54 |                         }
 55 |                      },
 56 |                      "lock": {
 57 |                         "properties": {
 58 |                              "targetIndex": { "type": "keyword" },
 59 |                              "statementId": { "type": "keyword" },
 60 |                              "runId": { "type": "keyword" },
 61 |                              "lockDate": { "type": "date" }
 62 |                         }
 63 |                      }
 64 |                   }
 65 |                }
 66 |             """)
 67 | 
 68 |             if (response.isSuccess) {
 69 |                 esClient.waitForIndexGreen(STATE_INDEX)
 70 |             } else {
 71 |                 throw DataImportException("State manager failed, cannot create state index\n${response.responseJson}")
 72 |             }
 73 |         }
 74 | 
 75 |         ttlKillOldLocks()
 76 |     }
 77 | 
 78 |     data class Lock(val runId: String, val targetIndex: String, val statementId: String, val lockDate: Instant)
 79 | 
 80 |     private fun ttlKillOldLocks() {
 81 |         val response = esClient.indexTypePOST(STATE_INDEX, "lock", "/_delete_by_query?refresh",
 82 |                 """
 83 |                        { "query": { "range": { "lockDate": { "lt": "now-15m" } } } }
 84 |                     """)
 85 |         if (!response.isSuccess) throw DataImportException("State manager failed, TTL delete query for locks failed\n${response.responseJson}")
 86 |     }
 87 | 
 88 |     override fun lockStatement(runId: String, statement: DataImportStatement): Boolean {
 89 |         val response = esClient.indexTypeIdPOST(STATE_INDEX, "lock", statement.stateKey(), "?op_type=create",
 90 |                 JSON.writeValueAsString(Lock(runId, statement.indexName, statement.id, Instant.now())))
 91 | 
 92 |         if (!response.isSuccess) {
 93 |             ttlKillOldLocks()
 94 |             return pingLockStatement(runId, statement)
 95 |         }
 96 |         return true
 97 |     }
 98 | 
 99 |     override fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean {
100 |         val response = esClient.indexTypeIdGET(STATE_INDEX, "lock", statement.stateKey())
101 |         if (response.isSuccess) {
102 |             val lock = esClient.mapFromSource<Lock>(response.responseJson)
103 |             if (lock.runId == runId) {
104 |                 // TODO: we did a get, so have the version, change to a update with version ID
105 |                 val updResponse = esClient.indexTypePOST(STATE_INDEX, "lock", "/_update_by_query?refresh", """
106 |                     {
107 |                         "script": {
108 |                             "inline": "ctx._source.lockDate = Instant.ofEpochMilli(${Instant.now().toEpochMilli()}L)",
109 |                             "lang": "painless"
110 |                         },
111 |                         "query": {
112 |                             "bool": {
113 |                                "must": [
114 |                                   { "term": { "runId": "$runId" } },
115 |                                   { "term": { "targetIndex": "${statement.indexName}" } },
116 |                                   { "term": { "statementId": "${statement.id}" } }
117 |                                ]
118 |                             }
119 |                         }
120 |                     }
121 |                 """)
122 |                 if (!updResponse.isSuccess) {
123 |                     throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} - had conflict on pinging of lock\n${updResponse.responseJson}")
124 |                 }
125 |             } else {
126 |                 throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} -- it is held by ${lock.runId} since ${lock.lockDate.toIsoString()}\n${response.responseJson}")
127 |             }
128 |         }
129 |         return true
130 |     }
131 | 
132 |     override fun unlockStatement(runId: String, statement: DataImportStatement) {
133 |         if (pingLockStatement(runId, statement)) {
134 |             val response =  esClient.indexTypeIdDELETE(STATE_INDEX, "lock", statement.stateKey(), "?refresh")
135 |             if (!response.isSuccess) {
136 |                 throw DataImportException("State manager failed, cannot delete lock for ${statement.stateKey()}\n${response.responseJson}")
137 |             }
138 |         }
139 |     }
140 | 
141 |     data class State(val targetIndex: String, val statementId: String, val lastRunDate: Instant, val status: String, val lastRunId: String, val lastErrorMesasge: String?, val lastRowCount: Long)
142 | 
143 |     data class StateLog(val targetIndex: String, val statementId: String, val runId: String, val runDate: Instant, val status: String, val errorMsg: String?, val rowCount: Long)
144 | 
145 |     override fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String?) {
146 |         val response = esClient.indexTypeIdPOST(STATE_INDEX, "state", statement.stateKey(), "?refresh",
147 |                 JSON.writeValueAsString(State(statement.indexName, statement.id, lastRunStart, status, runId, errMsg, lastRowCount)))
148 |         if (!response.isSuccess) {
149 |             throw DataImportException("State manager failed, cannot update state for ${statement.stateKey()}\n${response.responseJson}")
150 |         }
151 |     }
152 | 
153 |     override fun readStateForStatement(runId: String, statement: DataImportStatement): Instant? {
154 |         val response = esClient.indexTypeIdGET(STATE_INDEX, "state", statement.stateKey())
155 |         if (response.isSuccess) {
156 |             val state = esClient.mapFromSource<State>(response.responseJson)
157 |             return state.lastRunDate
158 |         } else {
159 |             return null
160 |         }
161 |     }
162 | 
163 |     override fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String?) {
164 |         val response = esClient.indexTypeIdPOST(STATE_INDEX, "log", "${statement.stateKey()}_run_${runId}", "?refresh",
165 |                 JSON.writeValueAsString(StateLog(statement.indexName, statement.id, runId, lastRunStart, status, errMsg, rowCount)))
166 |         if (!response.isSuccess) {
167 |             throw DataImportException("State manager failed, cannot log state for ${statement.stateKey()}\n${response.responseJson}")
168 |         }
169 |     }
170 | }
171 | 


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Udfs.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | import org.apache.spark.sql.SparkSession
 4 | import org.jsoup.Jsoup
 5 | import org.jsoup.parser.Parser
 6 | import org.jsoup.safety.Whitelist
 7 | import uy.kohesive.elasticsearch.dataimport.udf.Udfs
 8 | import java.sql.Date
 9 | import java.sql.Timestamp
10 | 
11 | object DataImportHandlerUdfs {
12 |     fun registerSparkUdfs(spark: SparkSession) {
13 |         Udfs.registerStringToStringUdf(spark, "fluffly", fluffly)
14 |         Udfs.registerStringToStringUdf(spark, "stripHtml", stripHtmlCompletely)
15 |         Udfs.registerStringToStringUdf(spark, "normalizeQuotes", normalizeQuotes)
16 |         Udfs.registerStringToStringUdf(spark, "unescapeHtmlEntites", unescapeHtmlEntities)
17 |         Udfs.registerAnyAnyToTimestampUdf(spark, "combineDateTime", combineDateTime)
18 |     }
19 | 
20 |     @JvmStatic val whiteListMap = mapOf(
21 |             "none" to Whitelist.none(),
22 |             "basic" to Whitelist.basic(),
23 |             "basicwithimages" to Whitelist.basicWithImages(),
24 |             "relaxed" to Whitelist.relaxed(),
25 |             "simpletext" to Whitelist.simpleText(),
26 |             "simple" to Whitelist.simpleText()
27 |     )
28 | 
29 |     @JvmStatic val fluffly = fun (v: String): String = "fluffly " + v
30 | 
31 |     @JvmStatic val stripHtmlCompletely = fun (v: String): String {
32 |         return Jsoup.parseBodyFragment(v).text()
33 |     }
34 | 
35 |     @JvmStatic val normalizeQuotes = fun (v: String): String {
36 |         return v.replace("\\'", "'").replace("''", "\"")
37 |     }
38 | 
39 |     @JvmStatic val unescapeHtmlEntities = fun (v: String): String {
40 |         return Parser.unescapeEntities(v, false)
41 |     }
42 | 
43 |     @JvmStatic val combineDateTime = fun (date: Date?, time: Timestamp?): Timestamp? {
44 |         // https://stackoverflow.com/questions/26649530/merge-date-and-time-into-timestamp
45 |         if (date == null || time == null) {
46 |             return null
47 |         }
48 | 
49 | //        val dd = (date.time / 86400000L * 86400000L) - date.timezoneOffset * 60000
50 | //        val tt = time.time - time.time / 86400000L * 86400000L
51 | //        return Timestamp(dd + tt)
52 | 
53 |         return Timestamp(date.year, date.month, date.date, time.hours, time.minutes, time.seconds, 0)
54 |     }
55 | 
56 | }


--------------------------------------------------------------------------------
/src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Util.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | import com.fasterxml.jackson.databind.DeserializationFeature
 4 | import com.fasterxml.jackson.databind.SerializationFeature
 5 | import com.fasterxml.jackson.datatype.jdk8.Jdk8Module
 6 | import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
 7 | import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
 8 | import java.time.format.DateTimeFormatter
 9 | import java.time.temporal.Temporal
10 | 
11 | fun isoDateFormat(): DateTimeFormatter = DateTimeFormatter.ISO_INSTANT
12 | fun Temporal.toIsoString(): String = isoDateFormat().format(this)
13 | 
14 | val JSON = jacksonObjectMapper().registerModules(JavaTimeModule(), Jdk8Module()).apply {
15 |     configure(SerializationFeature.WRITE_DATE_TIMESTAMPS_AS_NANOSECONDS, false)
16 |     configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, true)
17 |     configure(DeserializationFeature.READ_DATE_TIMESTAMPS_AS_NANOSECONDS, false)
18 | }


--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
 1 | <configuration>
 2 | 
 3 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
 4 |         <!-- encoders are assigned the type
 5 |              ch.qos.logback.classic.encoder.PatternLayoutEncoder by default -->
 6 |         <encoder>
 7 |             <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
 8 |         </encoder>
 9 |     </appender>
10 | 
11 |     <logger name="uy.kohesive" level="INFO"/>
12 | 
13 |     <root level="ERROR">
14 |         <appender-ref ref="STDOUT" />
15 |     </root>
16 | </configuration>


--------------------------------------------------------------------------------
/src/test/kotlin/uy/kohesive/elasticsearch/dataimport/ManualTestOfDataImport.kt:
--------------------------------------------------------------------------------
 1 | package uy.kohesive.elasticsearch.dataimport
 2 | 
 3 | import java.io.ByteArrayInputStream
 4 | import java.io.File
 5 | 
 6 | class ManualTestOfDataImport {
 7 |     // TODO: This test requires Elasticsearch to be available, it is difficult to Run ES and Spark together due to conflicting dependencies
 8 |     companion object {
 9 |         @JvmStatic fun main(args: Array<String>) {
10 |             val confFile = File("./src/test/resources/manual-test.conf")
11 |             App().run(confFile.inputStream(), confFile.parentFile)
12 |         }
13 |     }
14 | }
15 | 


--------------------------------------------------------------------------------
/src/test/resources/manual-mappings.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "settings": {
  3 |     "index": {
  4 |       "analysis": {
  5 |         "filter": {
  6 |           "de_stop_filter": {
  7 |             "type": "stop",
  8 |             "stopwords": "_german_"
  9 |           },
 10 |           "de_stemmer_filter": {
 11 |             "type": "stemmer",
 12 |             "langauge": "minimal_german",
 13 |             "respect_keywords": true
 14 |           },
 15 |           "unique_same_position_filter": {
 16 |             "type": "unique",
 17 |             "only_on_same_position": true
 18 |           },
 19 |           "de_decomp_filter": {
 20 |             "type": "decompound",
 21 |             "language": "de",
 22 |             "respect_keywords": true
 23 |           },
 24 |           "de_fst_decomp_filter": {
 25 |             "type": "fst_decompound",
 26 |             "language": "de",
 27 |             "respect_keywords": true
 28 |           },
 29 |           "de_baseform_filter": {
 30 |             "type": "baseform",
 31 |             "language": "de",
 32 |             "respect_keywords": true
 33 |           },
 34 |           "de_lemmatize_filter": {
 35 |             "type": "lemmatize",
 36 |             "language": "de",
 37 |             "respect_keywords": true
 38 |           },
 39 |           "de_synonym_lowercase_filter": {
 40 |             "type": "synonym",
 41 |             "synonyms_path": "analysis/de-synonyms-lowercase-minimal.txt",
 42 |             "respect_keywords": true
 43 |           },
 44 |           "de_synonym_casesensitive_filter": {
 45 |             "type": "synonym",
 46 |             "synonyms_path": "analysis/de-synonyms-case-sensitive-minimal.txt",
 47 |             "respect_keywords": true
 48 |           },
 49 |           "haystack_ngram_filter": {
 50 |             "type": "nGram",
 51 |             "min_gram": "3",
 52 |             "max_gram": "15"
 53 |           },
 54 |           "haystack_edge_ngram_filter": {
 55 |             "type": "edge_ngram",
 56 |             "min_gram": "3",
 57 |             "max_gram": "15"
 58 |           }
 59 |         },
 60 |         "analyzer": {
 61 |           "de_decomp_query_analysis": {
 62 |             "type": "custom",
 63 |             "tokenizer": "icu_tokenizer",
 64 |             "filter": [
 65 |               "keyword_repeat",
 66 |               "de_synonym_casesensitive_filter",
 67 |               "icu_normalizer",
 68 |               "de_stop_filter",
 69 |               "de_synonym_lowercase_filter",
 70 |               "de_lemmatize_filter",
 71 |               "icu_folding",
 72 |               "unique_same_position_filter"
 73 |             ]
 74 |           },
 75 |           "de_decomp_index_analysis": {
 76 |             "type": "custom",
 77 |             "tokenizer": "icu_tokenizer",
 78 |             "filter": [
 79 |               "keyword_repeat",
 80 |               "de_synonym_casesensitive_filter",
 81 |               "icu_normalizer",
 82 |               "de_stop_filter",
 83 |               "de_synonym_lowercase_filter",
 84 |               "de_decomp_filter",
 85 |               "de_lemmatize_filter",
 86 |               "icu_folding",
 87 |               "unique_same_position_filter"
 88 |             ]
 89 |           },
 90 |           "de_decomp_nostop_query_analysis": {
 91 |             "type": "custom",
 92 |             "tokenizer": "icu_tokenizer",
 93 |             "filter": [
 94 |               "keyword_repeat",
 95 |               "de_synonym_casesensitive_filter",
 96 |               "icu_normalizer",
 97 |               "de_synonym_lowercase_filter",
 98 |               "de_lemmatize_filter",
 99 |               "icu_folding",
100 |               "unique_same_position_filter"
101 |             ]
102 |           },
103 |           "de_decomp_nostop_index_analysis": {
104 |             "type": "custom",
105 |             "tokenizer": "icu_tokenizer",
106 |             "filter": [
107 |               "keyword_repeat",
108 |               "de_synonym_casesensitive_filter",
109 |               "icu_normalizer",
110 |               "de_synonym_lowercase_filter",
111 |               "de_decomp_filter",
112 |               "de_lemmatize_filter",
113 |               "icu_folding",
114 |               "unique_same_position_filter"
115 |             ]
116 |           },
117 |           "de_synonym_analysis": {
118 |             "type": "custom",
119 |             "tokenizer": "icu_tokenizer",
120 |             "filter": [
121 |               "keyword_repeat",
122 |               "de_synonym_casesensitive_filter",
123 |               "icu_normalizer",
124 |               "de_stop_filter",
125 |               "de_synonym_lowercase_filter",
126 |               "de_lemmatize_filter",
127 |               "icu_folding",
128 |               "de_stemmer_filter",
129 |               "unique_same_position_filter"
130 |             ]
131 |           },
132 |           "de_synonym_nostop_analysis": {
133 |             "type": "custom",
134 |             "tokenizer": "icu_tokenizer",
135 |             "filter": [
136 |               "keyword_repeat",
137 |               "de_synonym_casesensitive_filter",
138 |               "icu_normalizer",
139 |               "de_synonym_lowercase_filter",
140 |               "de_lemmatize_filter",
141 |               "icu_folding",
142 |               "de_stemmer_filter",
143 |               "unique_same_position_filter"
144 |             ]
145 |           },
146 |           "de_base_analysis": {
147 |             "type": "custom",
148 |             "tokenizer": "icu_tokenizer",
149 |             "filter": [
150 |               "icu_normalizer",
151 |               "de_stop_filter",
152 |               "icu_folding",
153 |               "de_stemmer_filter"
154 |             ]
155 |           },
156 |           "de_base_nostop_analysis": {
157 |             "type": "custom",
158 |             "tokenizer": "icu_tokenizer",
159 |             "filter": [
160 |               "icu_normalizer",
161 |               "icu_folding",
162 |               "de_stemmer_filter"
163 |             ]
164 |           },
165 |           "ngram_analysis": {
166 |             "type": "custom",
167 |             "filter": [
168 |               "icu_normalizer",
169 |               "haystack_ngram_filter"
170 |             ],
171 |             "tokenizer": "icu_tokenizer"
172 |           },
173 |           "edge_ngram_analysis": {
174 |             "type": "custom",
175 |             "filter": [
176 |               "icu_normalizer",
177 |               "haystack_edge_ngram_filter"
178 |             ],
179 |             "tokenizer": "icu_tokenizer"
180 |           }
181 |         }
182 |       },
183 |       "number_of_shards": "5",
184 |       "number_of_replicas": "0"
185 |     }
186 |   },
187 |   "mappings": {
188 |     "modelresult": {
189 |       "_all": {
190 |         "enabled": false
191 |       },
192 |       "properties": {
193 |         "content_auto": {
194 |           "type": "text",
195 |           "analyzer": "ngram_analysis"
196 |         },
197 |         "heading": {
198 |           "type": "text",
199 |           "analyzer": "ngram_analysis",
200 |           "fields": {
201 |             "raw": {
202 |               "type": "text",
203 |               "analyzer": "german"
204 |             },
205 |             "de_base": {
206 |               "type": "text",
207 |               "analyzer": "de_base_analysis"
208 |             },
209 |             "de_syn": {
210 |               "type": "text",
211 |               "analyzer": "de_synonym_analysis"
212 |             },
213 |             "de_decomp": {
214 |               "type": "text",
215 |               "analyzer": "de_decomp_index_analysis",
216 |               "search_analyzer": "de_decomp_query_analysis"
217 |             },
218 |             "ngram": {
219 |               "type": "text",
220 |               "analyzer": "ngram_analysis"
221 |             }
222 |           }
223 |         },
224 |         "date": {
225 |           "type": "date",
226 |           "format": "date_optional_time||epoch_millis"
227 |         },
228 |         "django_ct": {
229 |           "type": "keyword",
230 |           "include_in_all": false
231 |         },
232 |         "django_id": {
233 |           "type": "keyword",
234 |           "include_in_all": false
235 |         },
236 |         "id": {
237 |           "type": "keyword"
238 |         },
239 |         "offer": {
240 |           "type": "boolean"
241 |         },
242 |         "staffname": {
243 |           "type": "text",
244 |           "copy_to": "spelldata",
245 |           "fields": {
246 |             "raw": {
247 |               "type": "text",
248 |               "analyzer": "german"
249 |             },
250 |             "de_base": {
251 |               "type": "text",
252 |               "analyzer": "de_base_analysis"
253 |             },
254 |             "de_base_nostop": {
255 |               "type": "text",
256 |               "analyzer": "de_base_nostop_analysis"
257 |             },
258 |             "de_syn": {
259 |               "type": "text",
260 |               "analyzer": "de_synonym_analysis"
261 |             },
262 |             "de_syn_nostop": {
263 |               "type": "text",
264 |               "analyzer": "de_synonym_nostop_analysis"
265 |             },
266 |             "de_decomp": {
267 |               "type": "text",
268 |               "analyzer": "de_decomp_index_analysis",
269 |               "search_analyzer": "de_decomp_query_analysis",
270 |               "term_vector": "with_positions_offsets"
271 |             },
272 |             "de_decomp_nostop": {
273 |               "type": "text",
274 |               "analyzer": "de_decomp_nostop_index_analysis",
275 |               "search_analyzer": "de_decomp_nostop_query_analysis",
276 |               "term_vector": "with_positions_offsets"
277 |             },
278 |             "ngram": {
279 |               "type": "text",
280 |               "analyzer": "ngram_analysis"
281 |             },
282 |             "engram": {
283 |               "type": "text",
284 |               "analyzer": "edge_ngram_analysis"
285 |             }
286 |           }
287 |         },
288 |         "text": {
289 |           "type": "text",
290 |           "copy_to": "spelldata",
291 |           "fields": {
292 |             "raw": {
293 |               "type": "text",
294 |               "analyzer": "german",
295 |               "term_vector": "with_positions_offsets"
296 |             },
297 |             "de_base": {
298 |               "type": "text",
299 |               "analyzer": "de_base_analysis",
300 |               "term_vector": "with_positions_offsets"
301 |             },
302 |             "de_base_nostop": {
303 |               "type": "text",
304 |               "analyzer": "de_base_nostop_analysis",
305 |               "term_vector": "with_positions_offsets"
306 |             },
307 |             "de_syn_nostop": {
308 |               "type": "text",
309 |               "analyzer": "de_synonym_nostop_analysis",
310 |               "term_vector": "with_positions_offsets"
311 |             },
312 |             "de_syn": {
313 |               "type": "text",
314 |               "analyzer": "de_synonym_analysis",
315 |               "term_vector": "with_positions_offsets"
316 |             },
317 |             "de_decomp": {
318 |               "type": "text",
319 |               "analyzer": "de_decomp_index_analysis",
320 |               "search_analyzer": "de_decomp_query_analysis",
321 |               "term_vector": "with_positions_offsets"
322 |             },
323 |             "de_decomp_nostop": {
324 |               "type": "text",
325 |               "analyzer": "de_decomp_nostop_index_analysis",
326 |               "search_analyzer": "de_decomp_nostop_query_analysis",
327 |               "term_vector": "with_positions_offsets"
328 |             },
329 |             "ngram": {
330 |               "type": "text",
331 |               "analyzer": "ngram_analysis"
332 |             },
333 |             "engram": {
334 |               "type": "text",
335 |               "analyzer": "edge_ngram_analysis"
336 |             }
337 |           },
338 |           "term_vector": "with_positions_offsets"
339 |         },
340 |         "title": {
341 |           "type": "text",
342 |           "copy_to": "spelldata",
343 |           "fields": {
344 |             "raw": {
345 |               "type": "text",
346 |               "analyzer": "german",
347 |               "term_vector": "with_positions_offsets"
348 |             },
349 |             "de_base": {
350 |               "type": "text",
351 |               "analyzer": "de_base_analysis",
352 |               "term_vector": "with_positions_offsets"
353 |             },
354 |             "de_base_nostop": {
355 |               "type": "text",
356 |               "analyzer": "de_base_nostop_analysis",
357 |               "term_vector": "with_positions_offsets"
358 |             },
359 |             "de_syn_nostop": {
360 |               "type": "text",
361 |               "analyzer": "de_synonym_nostop_analysis",
362 |               "term_vector": "with_positions_offsets"
363 |             },
364 |             "de_syn": {
365 |               "type": "text",
366 |               "analyzer": "de_synonym_analysis",
367 |               "term_vector": "with_positions_offsets"
368 |             },
369 |             "de_decomp": {
370 |               "type": "text",
371 |               "analyzer": "de_decomp_index_analysis",
372 |               "search_analyzer": "de_decomp_query_analysis",
373 |               "term_vector": "with_positions_offsets"
374 |             },
375 |             "de_decomp_nostop": {
376 |               "type": "text",
377 |               "analyzer": "de_decomp_nostop_index_analysis",
378 |               "search_analyzer": "de_decomp_nostop_query_analysis",
379 |               "term_vector": "with_positions_offsets"
380 |             },
381 |             "ngram": {
382 |               "type": "text",
383 |               "analyzer": "ngram_analysis"
384 |             },
385 |             "engram": {
386 |               "type": "text",
387 |               "analyzer": "edge_ngram_analysis"
388 |             }
389 |           },
390 |           "term_vector": "with_positions_offsets"
391 |         },
392 |         "spelldata": {
393 |           "type": "text",
394 |           "fields": {
395 |             "raw": {
396 |               "type": "text",
397 |               "analyzer": "german",
398 |               "term_vector": "with_positions_offsets"
399 |             },
400 |             "de_base": {
401 |               "type": "text",
402 |               "analyzer": "de_base_analysis",
403 |               "term_vector": "with_positions_offsets"
404 |             },
405 |             "de_base_nostop": {
406 |               "type": "text",
407 |               "analyzer": "de_base_nostop_analysis",
408 |               "term_vector": "with_positions_offsets"
409 |             },
410 |             "de_syn_nostop": {
411 |               "type": "text",
412 |               "analyzer": "de_synonym_nostop_analysis",
413 |               "term_vector": "with_positions_offsets"
414 |             },
415 |             "de_syn": {
416 |               "type": "text",
417 |               "analyzer": "de_synonym_analysis",
418 |               "term_vector": "with_positions_offsets"
419 |             },
420 |             "de_decomp": {
421 |               "type": "text",
422 |               "analyzer": "de_decomp_index_analysis",
423 |               "search_analyzer": "de_decomp_query_analysis",
424 |               "term_vector": "with_positions_offsets"
425 |             },
426 |             "de_decomp_nostop": {
427 |               "type": "text",
428 |               "analyzer": "de_decomp_nostop_index_analysis",
429 |               "search_analyzer": "de_decomp_nostop_query_analysis",
430 |               "term_vector": "with_positions_offsets"
431 |             },
432 |             "ngram": {
433 |               "type": "text",
434 |               "analyzer": "ngram_analysis"
435 |             },
436 |             "engram": {
437 |               "type": "text",
438 |               "analyzer": "edge_ngram_analysis"
439 |             }
440 |           },
441 |           "term_vector": "with_positions_offsets"
442 |         }
443 |       }
444 |     }
445 |   }
446 | }


--------------------------------------------------------------------------------
/src/test/resources/test.sql:
--------------------------------------------------------------------------------
 1 | WITH orgMembers AS
 2 | (
 3 |   SELECT our.guid AS roleOrgGuid, our.userGuid AS roleUserGuid, our.orgUserRoleType AS roleType, our.dtUpdated AS dtRoleUpdated,
 4 |   oe.displayName AS orgDisplayName, oe.dtUpdated AS dtOrgUpdated
 5 |   FROM OrgEntity AS oe JOIN OrgUserRole AS our ON (our.orgGuid = oe.guid)
 6 | ),
 7 | userWithOrg AS (
 8 |   SELECT ue.guid, struct(ue.*) AS user, struct(om.*) AS orgMembership
 9 |   FROM UserEntity AS ue LEFT OUTER JOIN orgMembers AS om ON (om.roleUserGuid = ue.guid)
10 | ),
11 | modifiedUsers AS (
12 |   SELECT guid, first(user) as user, collect_list(orgMembership) AS orgMemberships
13 |     FROM userWithOrg AS ue
14 |    WHERE user.dtUpdated between "{lastRun}" AND "{thisRun}" OR
15 |          orgMembership.dtRoleUpdated between  "{lastRun}" AND "{thisRun}" OR
16 |          orgMembership.dtOrgUpdated between  "{lastRun}" AND "{thisRun}"
17 |    GROUP BY guid
18 | ),
19 | usersWithEmotions AS (
20 |   SELECT mu.*, em.emotion FROM modifiedUsers AS mu LEFT OUTER JOIN UserEmotions AS em ON (mu.guid = em.guid)
21 | )
22 | SELECT 'things' as type,
23 |   user.guid, user.identity, user.displayName, user.contactEmail, user.avatarUrl, user.gravatarEmail, user.blurb,
24 |   user.location, user.defaultTraitPrivacyType, user.companyName, user.isActive, user.isHeadless,
25 |   emotion, user.dtCreated, user.dtUpdated, orgMemberships FROM usersWithEmotions


--------------------------------------------------------------------------------