├── .gitignore ├── LICENSE ├── README.md ├── build.gradle ├── circle.yml ├── gradle-local-dependency-rules.json ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main ├── java │ └── uy │ │ └── kohesive │ │ └── elasticsearch │ │ └── dataimport │ │ └── udf │ │ └── Udfs.java ├── kotlin │ └── uy │ │ └── kohesive │ │ └── elasticsearch │ │ └── dataimport │ │ ├── AlgoliaDataImportHandler.kt │ │ ├── AlgoliaStateManager.kt │ │ ├── App.kt │ │ ├── Config.kt │ │ ├── DataImportHandler.kt │ │ ├── EsDataImportHandler.kt │ │ ├── Exceptions.kt │ │ ├── MicroEsClient.kt │ │ ├── State.kt │ │ ├── Udfs.kt │ │ └── Util.kt └── resources │ └── logback.xml └── test ├── kotlin └── uy │ └── kohesive │ └── elasticsearch │ └── dataimport │ └── ManualTestOfDataImport.kt └── resources ├── manual-mappings.json └── test.sql /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.ipr 3 | *.iml 4 | *.iws 5 | build/ 6 | out/ 7 | classes/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2016-2018 Jayson Minard (jayson.minard@gmail.com) and Kohesive 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub release](https://img.shields.io/github/release/kohesive/elasticsearch-data-import-handler.svg)](https://github.com/kohesive/elasticsearch-data-import-handler/releases) [![CircleCI branch](https://img.shields.io/circleci/project/kohesive/elasticsearch-data-import-handler/master.svg)](https://circleci.com/gh/kohesive/elasticsearch-data-import-handler/tree/master) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](https://github.com/kohesive/elasticsearch-data-import-handler/blob/master/LICENSE) [![ES](https://img.shields.io/badge/ES-5.x-orange.svg)](https://github.com/elastic/elasticsearch) [![ES](https://img.shields.io/badge/ES-6.x-orange.svg)](https://github.com/elastic/elasticsearch) 2 | 3 | 4 | # Elasticsearch Data Import Handler 5 | # Elasticsearch/Algolia Data Import Handler 6 | 7 | A data import handler for Elasticsearch/Algolia 8 | 9 | * Simple 10 | * Powerful 11 | * Use SQL statements that can span multiple databases, text files, and ElasticSearch/Algolia indexes 12 | * Process full load and incremental updates 13 | * Output columnar and structured JSON to ElasticSearch/Algolia 14 | 15 | Running is simple. 
With Java 8 installed, [download a release](https://github.com/kohesive/elasticsearch-data-import-handler/releases) and then run it: 16 | 17 | ``` 18 | kohesive-es-dih 19 | ``` 20 | 21 | The config file is in [HOCON format](https://github.com/typesafehub/config/blob/master/HOCON.md) which is a relaxed 22 | version of JSON and allows [multi-line-string](https://github.com/typesafehub/config/blob/master/HOCON.md#multi-line-strings) 23 | which is very useful when writing larger SQL statements. 24 | 25 | The configuration file follows this format: 26 | 27 | ```hocon 28 | { 29 | "sources": { 30 | "elasticsearch": [ ], 31 | "jdbc": [ ], 32 | "filesystem": [ ] 33 | }, 34 | "prepStatements": [ ], 35 | "importSteps": [ ] 36 | } 37 | ``` 38 | 39 | First you should provide 1 or more `sources`. Each source becomes a temporary table in a unified catalog of tables from 40 | which you can query and join together. 41 | 42 | Optionally, you can include preparatory steps in `prepStatements` which are usually additional temporary tables 43 | that may be shared by the later import steps. The `prepStatements` execute in order and **do not** do any date range 44 | substitution. 45 | 46 | Lastly you specify `importSteps` which are queries that use any of the `sources` and temporary tables created in 47 | `prepStatements`; where the results of each SQL query is pushed into an Elasticsearch index. 48 | 49 | **Tip:** The SQL used is anything available to [Apache Spark SQL](https://docs.databricks.com/spark/latest/spark-sql/index.html) 50 | 51 | ### Let's start with an example, 52 | ...of loading 1 table from MySQL into Elasticsearch: 53 | 54 | ```hocon 55 | { 56 | "sources": { 57 | "jdbc": [ 58 | { 59 | "jdbcUrl": "jdbc:mysql://localhost/test?useSSL=false", 60 | "driverClass": "com.mysql.jdbc.Driver", 61 | "defaultSchema": "test", 62 | "auth": { 63 | "username": "myusername", 64 | "password": "mypass" 65 | }, 66 | "driverJars": [], # MySQL and Postgres JARS are included automatically, this property can be omitted completely 67 | "tables": [ 68 | { 69 | "sparkTable": "Users", 70 | "sourceTable": "UserEntities" 71 | } 72 | ] 73 | } 74 | ] 75 | }, 76 | "importSteps": [ 77 | { 78 | "description": "Data loaders for base data sets", 79 | "targetElasticsearch": { 80 | "nodes": [ 81 | "localhost:9200" 82 | ], 83 | "settings": { 84 | "es.index.auto.create": true 85 | } 86 | }, 87 | "statements": [ 88 | { 89 | "id": "Q4499_1233", 90 | "description": "Load User data into ES", 91 | "indexName": "aa-test-user", 92 | "indexType": "user", 93 | "newIndexSettingsFile": "./test-index-settings-and-mappings.json", # optional, otherwise template or infered mappings are applied 94 | "settings": { 95 | "es.mapping.id": "guid" 96 | }, 97 | "sqlQuery": """ 98 | SELECT guid, first_name, last_name, organization 99 | FROM Users 100 | WHERE dtUpdated BETWEEN '{lastRun}' and '{thisRun}' 101 | """ 102 | } 103 | ] 104 | } 105 | ] 106 | } 107 | ``` 108 | 109 | You will see that the JDBC source is provided, which must include the JDBC driver for the database, connection information, 110 | and a mapping from the original `sourceTable` database table to the temporary table `sparkTable` that will be used in 111 | later SQL queries. The name `sparkTable` is used because this system runs an embedded Apache Spark, and is creating Spark SQL 112 | tables from the configuration. 113 | 114 | Since this process runs in Apache Spark, there might be additional options you wish to set when 115 | the data is loaded. 
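For instance, here is a minimal sketch of such a `settings` map; the keys shown are standard Spark JDBC reader options used purely as an illustration and are not options defined by this project:

```hocon
"settings": {
  # rows fetched per round trip from the JDBC driver
  "fetchsize": "1000",
  # split the read into 4 parallel partitions over a numeric column
  "numPartitions": "4",
  "partitionColumn": "id",
  "lowerBound": "1",
  "upperBound": "1000000"
}
```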
For advanced users who know what these are, you can add `settings` map at the the `jdbc` connection 116 | level to apply to all tables within that connection, or at the per-`tables` level of the configuration to apply to only 117 | one table. 118 | 119 | The `importSteps` are a collection of target Elasticsearch clusters and one or more SQL statements for each. Each statement 120 | must have a unique `id` so that state can be tracked, changing or removing the `id` will result in a full data load running 121 | for a given query. 122 | 123 | Notice that the SQL statement includes the use of the `{lastRun}` and `{thisRun}` macros. These will substitute the current 124 | date/time into the SQL as a SQL Date formated string. The granularity is SECONDS, and the local time zone of the data 125 | import processor is used. Also, be sure to put the date macros inside quotes. 126 | 127 | (_Since version `0.9.0-ALPHA`_) SQL statements can also be provided in a file, using `sqlFile` instead of `sqlQuery`, 128 | where the file path is relative to the configuration file. 129 | 130 | The `indexType` field is the target type within the Elasticsearch `indexName` for the documents. You can use either the 131 | literal type name, or include a macro of `{fieldName}` where `fieldName` is one of the fields in the SQL result set. 132 | 133 | The `newIndexSettingsFile` is optional and allows a settings+mappings JSON file to be applied when creating a new index. 134 | Otherwise any index templates or implied mappings will be used on index creation. The `es.index.auto.create` flag must 135 | be active (by default it is), otherwise this file is not used. 136 | 137 | The `settings` object for Elasticsearch are important, and the most common basic settings you may wish to set (at either 138 | the connection or statement level) are: 139 | 140 | 141 | |Setting|Description| 142 | |-------|-----------| 143 | |es.index.auto.create|Whether to auto-create the target index if it doesn't already exist, default is `true`| 144 | |es.mapping.id|Which field in the SQL results should be used as the ID for the document (if absent, autogenerated ID's are used)| 145 | |es.ingest.pipeline|Which ingest pipeline should be used to pre-process incoming records into Elasticsearch| 146 | 147 | **Tip:** For advanced use cases, please see documentation for [Elasticsearch-Spark settings](https://www.elastic.co/guide/en/elasticsearch/hadoop/current/configuration.html). 148 | 149 | ### Let's try a more complex example, 150 | ...of joining a csv text file to the SQL statement 151 | 152 | ```hocon 153 | { 154 | "sources": { 155 | "jdbc": [ 156 | { 157 | # ... same as previous example 158 | } 159 | ], 160 | "filesystem": [ 161 | { 162 | "directory": "/data/sources/raw", 163 | "tables": [ 164 | { 165 | "sparkTable": "UserEmotions", 166 | "format": "csv", 167 | "filespecs": [ 168 | "test.csv" 169 | ], 170 | "settings": { 171 | "header": true, 172 | "inferSchema": true 173 | } 174 | } 175 | ] 176 | } 177 | ] 178 | }, 179 | "importSteps": [ 180 | { 181 | "description": "Data loaders for base data sets", 182 | "targetElasticsearch": { 183 | # ... 
same as previous example 184 | }, 185 | "statements": [ 186 | { 187 | "id": "Q4499_1233", 188 | "description": "Load User data with merged Emotions into ES", 189 | "indexName": "aa-test-user", 190 | "indexType": "user", 191 | "settings": { 192 | "es.mapping.id": "guid" 193 | }, 194 | "sqlQuery": """ 195 | SELECT u.guid, u.first_name, u.last_name, u.organization, ue.emotion 196 | FROM Users AS u LEFT OUTER JOIN UserEmotions AS ue ON (u.guid = ue.guid) 197 | WHERE u.dtUpdated BETWEEN '{lastRun}' and '{thisRun}' 198 | """ 199 | } 200 | ] 201 | } 202 | ] 203 | } 204 | ``` 205 | 206 | We have changed the configuration adding the file source. Here the `directory` must exist and then `filespec` is a list 207 | of specific filenames, or wildcards. For example `["test1.csv", "test2.csv"]` or `["test*.csv"]` are both valid. Here again 208 | you may see `settings` maps appearing at the filesystem directory level or for each table. 209 | 210 | The format may be `csv`, `json`, tab delimited, or any other import file format supported by the default Apache Spark 211 | distribution. Some settings you might find useful for `csv` include: 212 | 213 | |Setting|Description| 214 | |-------|-----------| 215 | |header|Whether a header line is present or not (true/false)| 216 | |delimiter|What delimiter between the fields, default `,`| 217 | |quote|If fields are quoted, what character is used for quoting, default `"`| 218 | |escape|If escaping of characters is needed, what character is used, default `\`| 219 | |charset|If the file is not `UTF-8`, what charset is it?| 220 | |inferSchema|The system can infer datatypes by scanning all the data first, then loading in a second pass (true/false)| 221 | |nullValue|This string can replace any null values, otherwise they are truly null| 222 | |dateFormat|specifies a date format for recognizing and parsing date fields (follows [SimpleDateFormat](https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html))| 223 | 224 | **Tip:** For other use cases, read all [CSV Options](https://github.com/databricks/spark-csv) 225 | 226 | ### And if you want an example with an Elasticsearch source 227 | ...here is the additional part of the configuration: 228 | 229 | ```hocon 230 | { 231 | "sources": { 232 | "elasticsearch": [ 233 | { 234 | "nodes": [ 235 | "localhost:9200" 236 | ], 237 | "tables": [ 238 | { 239 | "sparkTable": "SearchDocuments", 240 | "indexName": "document-contents-2017", 241 | "indexType": "rawdoc", 242 | "esQuery": { 243 | "match_all": {} 244 | } 245 | } 246 | ] 247 | } 248 | ], 249 | # ... 250 | } 251 | ``` 252 | 253 | Now I have the Elasticsearch index acting as a table. Like before, `settings` maybe applied at the connection or `tables` level 254 | if you have special needs. And please note that the `esQuery` contains the first level of query filtering, and if absent 255 | defaults to a `match_all` query. You can write the query as above, or you can also include the top level `"query": { ... }"` 256 | element surrounding the query. 257 | 258 | ### Let's go for it, here's a bigger example 259 | ...showing an import that generates nested JSON and also a preparatory step creating temporary tables. 260 | 261 | We will use the same source tables as the previous examples and imagine that we now have `Orgs` and `OrgMembers` tables. 262 | We will create a final list of `Users` with their organizations nested inside each. We also will update the users if 263 | any of the related tables change. 
This might cause an explosion of updates, so be careful with your planning about how 264 | you use the `{lastRun}` and `{thisRun}` values. 265 | 266 | ```hocon 267 | { 268 | "sources": { 269 | # ... same as above but we have added `Orgs` and `OrgMembers` 270 | }, 271 | "prepStatements": [ 272 | { 273 | "description": "Create views of only the active users", 274 | "sqlQuery": """ 275 | CREATE TEMPORARY VIEW ActiveUsers AS 276 | SELECT * FROM Users WHERE isActive = 1 277 | """ 278 | }, 279 | { 280 | "description": "Create views of only the active orgs", 281 | "sqlQuery": """ 282 | CREATE TEMPORARY VIEW ActiveOrgs AS 283 | SELECT * FROM Orgs WHERE isActive = 1 284 | """ 285 | } 286 | ], 287 | "importSteps": [ 288 | { 289 | "description": "Data loaders for base data sets", 290 | "targetElasticsearch": { 291 | "nodes": [ 292 | "localhost:9200" 293 | ], 294 | "settings": { 295 | "es.index.auto.create": true 296 | } 297 | }, 298 | "statements": [ 299 | { 300 | "id": "X9A90Z_1", 301 | "description": "Load denormalized User + Org Memberships into ES", 302 | "indexName": "aa-test-user", 303 | "indexType": "{docType}", 304 | "settings": { 305 | "es.mapping.id": "guid" 306 | }, 307 | "sqlFile": "structured-user-load.sql" 308 | } 309 | ] 310 | } 311 | ] 312 | } 313 | ``` 314 | 315 | and the `structured-user-load.sql` file placed in the same directory: 316 | 317 | ```sql 318 | WITH orgMembers AS 319 | ( 320 | SELECT our.guid AS roleOrgGuid, our.userGuid AS roleUserGuid, our.orgUserRoleType AS roleType, our.dtUpdated AS dtRoleUpdated, 321 | oe.displayName AS orgDisplayName, oe.dtUpdated AS dtOrgUpdated 322 | FROM ActiveOrgs AS oe JOIN OrgMembers AS our ON (our.orgGuid = oe.guid) 323 | ), 324 | userWithOrg AS ( 325 | SELECT ue.guid, struct(ue.*) AS user, struct(om.*) AS orgMembership 326 | FROM ActiveUsers AS ue LEFT OUTER JOIN orgMembers AS om ON (om.roleUserGuid = ue.guid) 327 | ), 328 | modifiedUserData AS ( 329 | SELECT guid, first(user) as user, collect_list(orgMembership) AS orgMemberships 330 | FROM userWithOrg AS ue 331 | WHERE user.dtUpdated between "{lastRun}" AND "{thisRun}" OR 332 | orgMembership.dtRoleUpdated between "{lastRun}" AND "{thisRun}" OR 333 | orgMembership.dtOrgUpdated between "{lastRun}" AND "{thisRun}" 334 | GROUP BY guid 335 | ), 336 | usersWithEmotions AS ( 337 | SELECT mu.*, em.emotion FROM modifiedUserData AS mu LEFT OUTER JOIN UserEmotions AS em ON (mu.guid = em.guid) 338 | ) 339 | SELECT user.accountType as docType, user.guid, 340 | user.identity, user.displayName, user.contactEmail, user.avatarUrl, user.gravatarEmail, user.blurb, 341 | user.location, user.defaultTraitPrivacyType, user.companyName, user.isActive, user.isHeadless, 342 | emotion, user.dtCreated, user.dtUpdated, orgMemberships FROM usersWithEmotions 343 | ``` 344 | 345 | Ok, that was a bit much. But here is some research you can do. In the [SQL Functions API Reference](https://spark.apache.org/docs/2.1.0/api/java/org/apache/spark/sql/functions.html) 346 | you will find the `collect_list` and `struct` functions. They are used above to create structured results, like nested JSON. The `struct()` 347 | function is combining columns into an object, and the `collect_list` is aggregating the objects into an array. 348 | 349 | We are also using the `guid` from the result set as the Elasticsearch document `_id`, and we have a literal field coming back 350 | in the documents that we are using via the `indexType` setting as a macro `{docType}` pointing at that field from the result set. 
351 | This is handy if different documents in the results will have different types. 352 | 353 | So at the end we have a result that looks like: 354 | 355 | ```hocon 356 | { 357 | "docType": "...", 358 | "guid": "...", 359 | "identity": "...", 360 | # ... 361 | "orgMemberships": [ 362 | { 363 | "roleOrgGuid": "...", 364 | "orgDisplayName": "...", 365 | # ... 366 | }, 367 | { 368 | "roleOrgGuid": "...", 369 | "orgDisplayName": "...", 370 | # ... 371 | } 372 | ] 373 | } 374 | ``` 375 | 376 | ### Algolia 377 | 378 | To define Algolia index as an import step target, configure the `targetAlgolia` inside your import step config like this: 379 | 380 | ``` 381 | "importSteps": [ 382 | { 383 | "description": "Load products table into Algolia index", 384 | "targetAlgolia": { 385 | "applicationId": "YOURAPPID", 386 | "apiKey": "yourapikey" 387 | }, 388 | ... 389 | ``` 390 | 391 | The statement configuration is almost identical to the one of ElasticSearch: use `index` field to specify the Algolia index, and 392 | the `idField` to specify the row that would be used as Algolia index's `objectID` (optional): 393 | 394 | ``` 395 | "statements": [ 396 | { 397 | "id": "Import-Products-Table, 398 | "description": "Load products data into Algolia index", 399 | "idField": "product_id", 400 | "indexName": "ProductsIndex", 401 | "sqlQuery": """ 402 | SELECT id AS product_id, product_name, manufacturer_name FROM Products 403 | JOIN Manufacturers ON Products.manufacturer_id = Manufacturers.id 404 | WHERE product_modify_date BETWEEN '{lastRun}' AND '{thisRun}' 405 | """ 406 | } 407 | ] 408 | ``` 409 | 410 | Algolia target also supports "Delete" statements, which must return a column with a name defined by `idField` (see above), 411 | containing the `objectID`s to be deleted from Algolia index. To define such a statement, specify the `"action": "delete"` in its definition, e.g.: 412 | 413 | ``` 414 | "statements": [ 415 | { 416 | "id": "Clear-Products-Table, 417 | "description": "Delete previously removed products from Algolia index", 418 | "idField": "product_id", 419 | "indexName": "ProductsIndex", 420 | "action": "delete", 421 | "sqlQuery": """ 422 | SELECT product_id FROM DeletedProducts 423 | WHERE product_delete_date BETWEEN '{lastRun}' AND '{thisRun}' 424 | """ 425 | } 426 | ] 427 | ``` 428 | 429 | ### SQL Reference: 430 | 431 | The data import handler uses Spark SQL, and you can read the [Spark SQL Reference](https://docs.databricks.com/spark/latest/spark-sql/index.html) 432 | for full details on the supported SQL. 433 | 434 | A list of [SQL Functions](https://spark.apache.org/docs/2.1.0/api/java/org/apache/spark/sql/functions.html) 435 | is available in raw API docs. 
(_TODO: find better reference_) 436 | 437 | The Data Import Handler also defines some UDF functions for use within SQL: 438 | 439 | |Function|Description|Since Version| 440 | |-------|-----------|--------------| 441 | |stripHtml(string)|Removes all HTML tags and returns only the text (including unescaping of HTML Entities)|`0.6.0-ALPHA`| 442 | |unescapeHtmlEntites(string)|Unescapes HTML entities found in the text|`0.6.0-ALPHA`| 443 | |fluffly(string)|A silly function that prepends the word "fluffly" to the text, used as a *test* function to mark values as being changed by processing|`0.6.0-ALPHA`| 444 | 445 | ### Auth and HTTPS for Elasticsearch 446 | 447 | (_Since version `0.8.0-ALPHA`_) 448 | 449 | For basic AUTH with Elasticsearch you can add the following to the source or target Elasticsearch definitions: 450 | 451 | ```hocon 452 | "basicAuth": { 453 | "username": "elastic", 454 | "password": "changeme" 455 | } 456 | ``` 457 | 458 | And for SSL, enable it within the Elasticsearch definition as well: 459 | 460 | ```hocon 461 | "enableSsl": true 462 | ``` 463 | 464 | Here is a full example, when connection to Elastic Cloud instance: 465 | 466 | ```hocon 467 | "targetElasticsearch": { 468 | "nodes": [ 469 | "123myinstance456.us-east-1.aws.found.io" 470 | ], 471 | "basicAuth": { 472 | "username": "elastic", 473 | "password": "changeme" 474 | }, 475 | "port": 9243, 476 | "enableSsl": true, 477 | "settings": { 478 | "es.index.auto.create": true, 479 | "es.nodes.wan.only": true 480 | } 481 | } 482 | ``` 483 | 484 | Note the use of the `es.nodes.wan.only` setting to use the external host names for the cluster members, and not internal AWS addresses. 485 | 486 | ### State Management and History: 487 | 488 | State for the `lastRun` value is per-statement and stored in the target Elasticsearch cluster for that statement. An index 489 | will be created called `.kohesive-dih-state-v2` which stores the last run state, a lock for current running statements, and 490 | a log of all previous runs (success and failures, along with row count processed by the statement query). 491 | 492 | You should inspect this log (index `.kohesive-dih-state-v2` type `log`)if you wish to monitor the results of runs. 493 | 494 | ### Parallelism 495 | 496 | By default, the data import handler is running locally with `Processors-1` parallelism. You can set the following 497 | top-level configuration setting to change the parallelism: 498 | 499 | ``` 500 | { 501 | "sparkMaster": "local[N]", 502 | "sources": { 503 | # ... 504 | } 505 | } 506 | ``` 507 | 508 | Where `N` is the number of partitions you wish to run. Since this is the Spark master setting, some people might try 509 | connecting to a Spark cluster using the setting. It just might work! 510 | 511 | ### Memory Issues 512 | 513 | If you run out of memory you can set the Java VM parameters via the `KOHESIVE_ES_DIH_OPTS` environment variable before 514 | running the `kohesive-es-dih` script. For example, to set it to 2G: `-Xmx2g` 515 | 516 | For Spark driver and executor memory settings (_Since version `0.9.0-ALPHA`_), you can add Spark Configuration settings 517 | in `sparkConfig` map, for example: 518 | 519 | ``` 520 | { 521 | "sparkMaster": "local[4]", 522 | "sparkConfig": { 523 | "spark.driver.memory": "300mb", 524 | "spark.executor.memory": "512mb" 525 | }, 526 | "sources": { 527 | # ... 
528 | } 529 | } 530 | ``` 531 | 532 | Note that `spark.executor.memory` has a minimum allowed value that you cannot go below, and you will receive an error if 533 | it is set to low or the JVM does not have enough memory. 534 | 535 | [Other Spark memory configuration settings](https://spark.apache.org/docs/latest/configuration.html#memory-management) such 536 | as usage of off-heap space can be defined as mentioned above in the Spark Configuration settings. 537 | 538 | *Query Caching:* 539 | 540 | Prior to version 0.10.0-ALPHA all queries were cached into memory which could cause an overflow. Now by default they 541 | are not cached. For each statement (preperatory or import statements) you can specify these properties to control caching, 542 | otherwise no caching is performed. 543 | 544 | ``` 545 | "cache": true, 546 | "persist": "MEMORY_AND_DISK_SER" 547 | ``` 548 | 549 | Where the persist setting is one of the storage levels defined [in the RDD persistence guide](https://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence), and 550 | the default is `MEMORY_ONLY` which might overflow memory if not enough JVM or off-heap space is permitted. 551 | 552 | Note that caching helps with preparatory steps that are then used later, or to speed up the two step process 553 | of running the main import queries followed by counting the number of rows that the query caused to be imported. But 554 | it is not required. 555 | 556 | 557 | ### NUMBER and DECIMAL Types Being Disallowed 558 | 559 | If you have errors such as: 560 | 561 | > Decimal types are not supported by Elasticsearch 562 | 563 | This is an issue explained further by the Elasticsearch team in [ES-Hadoop #842](https://github.com/elastic/elasticsearch-hadoop/issues/842). 564 | 565 | Basically, they want to avoid precision loss during the SQL stage and instead let Elasticsearch use knowledge of the actual mappings later to do the conversion. 566 | So the recommendation is that you cast the type to a `String` type of decimal, or a `Integer` type if not and then let the conversion 567 | happen at the moment of indexing. 568 | 569 | This would look something like: 570 | 571 | ``` 572 | SELECT id, name, cast(somethingDecimal as String) AS somethingDecimal, dtCreated, dtUpdated 573 | FROM myTable 574 | ``` 575 | 576 | ### TODOs 577 | 578 | **See:** [Issues](https://github.com/kohesive/elasticsearch-data-import-handler/issues) for TODO, feature requests, ideas and issues. 579 | 580 | ## Special Thanks 581 | 582 | ![YourKit logo](https://www.yourkit.com/images/yklogo.png) 583 | 584 | YourKit supports open source projects with its full-featured Java Profiler. 585 | YourKit, LLC is the creator of [YourKit Java Profiler](https://www.yourkit.com/java/profiler/) 586 | and [YourKit .NET Profiler](https://www.yourkit.com/.net/profiler/), 587 | innovative and intelligent tools for profiling Java and .NET applications. 
588 | 589 | 590 | 591 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | plugins { 2 | id 'nebula.kotlin' version '1.1.61' apply false 3 | id 'nebula.resolution-rules' version '5.1.1' apply false 4 | } 5 | 6 | apply plugin: 'nebula.kotlin' 7 | apply plugin: 'nebula.resolution-rules' 8 | apply plugin: 'java' 9 | apply plugin: 'idea' 10 | 11 | apply plugin: 'application' 12 | 13 | mainClassName = 'uy.kohesive.elasticsearch.dataimport.App' 14 | applicationName = 'kohesive-es-dih' 15 | 16 | sourceCompatibility = JavaVersion.VERSION_1_8 17 | targetCompatibility = JavaVersion.VERSION_1_8 18 | 19 | dependencies { 20 | compile group: 'org.jetbrains.kotlin', name: 'kotlin-stdlib-jre8', version: version_kotlin 21 | compile group: 'org.jetbrains.kotlin', name: 'kotlin-reflect', version: version_kotlin 22 | 23 | compile group: 'com.fasterxml.jackson.module', name: 'jackson-module-kotlin', version: version_jackson_kotlin_module 24 | compile group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jdk8', version: version_jackson_compatible_with_spark 25 | compile group: 'com.fasterxml.jackson.datatype', name: 'jackson-datatype-jsr310', version: version_jackson_compatible_with_spark 26 | 27 | compile group: 'com.typesafe', name: 'config', version: version_typesafe_config 28 | 29 | compile group: 'org.elasticsearch', name: "elasticsearch-spark-20_${version_scala}", version: version_elasticsearch 30 | compile group: 'com.squareup.okhttp3', name: 'okhttp', version: version_okhttp 31 | 32 | compile group: 'org.apache.spark', name: "spark-sql_${version_scala}", version: version_spark 33 | 34 | compile group: 'org.jsoup', name: 'jsoup', version: version_jsoup 35 | 36 | compile (group: 'com.algolia', name: 'algoliasearch', version: version_algolia) { 37 | exclude group: 'com.fasterxml.jackson.module' 38 | exclude group: 'com.fasterxml.jackson.datatype' 39 | exclude group: 'com.fasterxml.jackson.core' 40 | } 41 | 42 | resolutionRules group: 'com.netflix.nebula', name: 'gradle-resolution-rules', version: version_nebula_resolution 43 | resolutionRules files("${rootDir}/gradle-local-dependency-rules.json") 44 | 45 | compile group: 'org.slf4j', name: 'slf4j-api', version: version_slf4j 46 | runtime group: 'ch.qos.logback', name: 'logback-classic', version: version_logback 47 | 48 | compile group: 'mysql', name: 'mysql-connector-java', version: version_jdbc_mysql 49 | compile group: 'org.postgresql', name: 'postgresql', version: version_jdbc_postresql 50 | 51 | testCompile group: 'junit', name: 'junit', version: version_junit 52 | testCompile group: 'org.jetbrains.kotlin', name: 'kotlin-test-junit' 53 | 54 | } 55 | 56 | nebulaResolutionRules { 57 | optional = ['slf4j-bridge'] 58 | } 59 | 60 | repositories { 61 | mavenCentral() 62 | } 63 | 64 | compileKotlin { 65 | kotlinOptions { 66 | jvmTarget = 1.8 67 | } 68 | } 69 | 70 | task wrapper(type: Wrapper) { 71 | gradleVersion = version_gradle 72 | } 73 | 74 | tasks.withType(Tar){ 75 | compression = Compression.GZIP 76 | } 77 | 78 | idea { 79 | module { 80 | downloadJavadoc = true 81 | downloadSources = true 82 | } 83 | } -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | # Java Gradle CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-java/ for more details 4 | # 5 | 
version: 2 6 | jobs: 7 | build: 8 | docker: 9 | # specify the version you desire here 10 | - image: circleci/openjdk:8-jdk 11 | 12 | # Specify service dependencies here if necessary 13 | # CircleCI maintains a library of pre-built images 14 | # documented at https://circleci.com/docs/2.0/circleci-images/ 15 | # - image: circleci/postgres:9.4 16 | 17 | working_directory: ~/repo 18 | 19 | environment: 20 | # Customize the JVM maximum heap limit 21 | JVM_OPTS: -Xmx3200m 22 | TERM: dumb 23 | 24 | steps: 25 | - checkout 26 | 27 | # Download and cache dependencies 28 | - restore_cache: 29 | keys: 30 | - v1-dependencies-{{ checksum "build.gradle" }} 31 | # fallback to using the latest cache if no exact match is found 32 | - v1-dependencies- 33 | 34 | - run: gradle dependencies 35 | 36 | - save_cache: 37 | paths: 38 | - ~/.gradle 39 | key: v1-dependencies-{{ checksum "build.gradle" }} 40 | 41 | # run tests! 42 | - run: ./gradlew clean build test -------------------------------------------------------------------------------- /gradle-local-dependency-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "exclude": [ 3 | { 4 | "module": "org.slf4j:slf4j-log4j12", 5 | "reason": "This does the reverse of what we want, routes to log4j, we want to route to slf4j", 6 | "author": "Jayson Minard ", 7 | "date": "2017-02-10T20:00:00.000Z" 8 | } 9 | ], 10 | "replace": [], 11 | "align": [], 12 | "deny": [], 13 | "reject": [] 14 | } 15 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | group = uy.kohesive.elasticsearch 2 | version = 1.0.0-ES-5.6.x-BETA-01 3 | 4 | version_gradle=4.5.1 5 | version_kotlin=1.1.61 6 | 7 | version_nebula_resolution=0.52.0 8 | version_jackson_compatible_with_spark=2.6.7 9 | version_jackson_kotlin_module=2.6.7 10 | version_spark=2.2.1 11 | 12 | version_typesafe_config=1.3.1 13 | 14 | version_elasticsearch=5.6.7 15 | version_okhttp=3.10.0 16 | version_scala=2.11 17 | 18 | version_jsoup=1.10.2 19 | 20 | version_slf4j=1.7.+ 21 | version_logback=1.2.+ 22 | version_junit=4.12 23 | 24 | version_jdbc_mysql=5.1.45 25 | version_jdbc_postresql=42.2.1 26 | 27 | version_algolia=2.15.6 28 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kohesive/elasticsearch-data-import-handler/029412ffef2169a34a80ce7b516e019687238c0b/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sun Feb 25 15:56:41 CST 2018 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.5.1-bin.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt 
to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save ( ) { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | 
if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'kohesive-elasticsearch-data-import-handler' -------------------------------------------------------------------------------- /src/main/java/uy/kohesive/elasticsearch/dataimport/udf/Udfs.java: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport.udf; 2 | 3 | import org.apache.spark.sql.SparkSession; 4 | import org.apache.spark.sql.api.java.UDF1; 5 | import org.apache.spark.sql.api.java.UDF2; 6 | import org.apache.spark.sql.types.DataTypes; 7 | 8 | public class Udfs { 9 | 10 | public static void registerStringToStringUdf(final SparkSession spark, final String name, final UDF1 f) { 11 | spark.udf().register(name, f, DataTypes.StringType); 12 | } 13 | 14 | public static void registerAnyAnyToTimestampUdf(final SparkSession spark, final String name, final UDF2 f) { 15 | spark.udf().register(name, f, DataTypes.TimestampType); 16 | } 17 | 18 | public static void registerStringStringToStringUdf(final SparkSession spark, final String name, final UDF2 f) { 19 | spark.udf().register(name, f, DataTypes.StringType); 20 | } 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/AlgoliaDataImportHandler.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import com.algolia.search.APIClient 4 | import com.algolia.search.ApacheAPIClientBuilder 5 | import com.fasterxml.jackson.module.kotlin.readValue 6 | import org.apache.spark.TaskContext 7 | import org.apache.spark.sql.Dataset 8 | import org.apache.spark.sql.Row 9 | import org.apache.spark.sql.types.StructType 10 | import org.elasticsearch.hadoop.cfg.PropertiesSettings 11 | import org.elasticsearch.hadoop.serialization.json.JacksonJsonGenerator 12 | import org.elasticsearch.hadoop.util.FastByteArrayOutputStream 13 | import org.elasticsearch.spark.cfg.SparkSettingsManager 14 | import org.elasticsearch.spark.sql.DataFrameValueWriter 15 | import scala.Tuple2 16 | import scala.collection.Iterator 17 | import scala.runtime.AbstractFunction2 18 | import java.io.File 19 | import java.io.Serializable 20 | 21 | class AlgoliaDataImportHandler( 22 | override val statement: DataImportStatement, 23 | override val configRelativeDir: File, 24 | 25 | targetAlgolia: AlgoliaTargetConnection 26 | ) : StatementDataImportHandler, Serializable { 27 | 28 | val options = mapOf( 29 | "algolia.write.applicationid" to targetAlgolia.applicationId, 30 | "algolia.write.apikey" to targetAlgolia.apiKey, 31 | "algolia.write.idfield" to statement.idField, 32 | "algolia.write.index" to statement.indexName 33 | ) 34 | 35 | override fun prepareIndex() { 36 | // TODO: implement 37 | } 38 | 39 | override fun import(dataSet: Dataset): Long { 40 | AlgoliaSparkTaskRunner.runInSpark(dataSet, options, statement.getAction()) 41 | return dataSet.count() 42 | } 43 | } 44 | 45 | object AlgoliaSparkTaskRunner { 46 | 47 | fun runInSpark(ds: Dataset, cfg: Map, action: StatementAction) { 48 | val sparkCtx = ds.sqlContext().sparkContext() 49 | val sparkCfg = SparkSettingsManager().load(sparkCtx.conf) 50 
| 51 | val algoliaCfg = PropertiesSettings().load(sparkCfg.save()) 52 | algoliaCfg.merge(cfg) 53 | 54 | val rdd = ds.toDF().rdd() 55 | 56 | val serializedSettings = algoliaCfg.save() 57 | val schema = ds.schema() 58 | 59 | sparkCtx.runJob( 60 | rdd, 61 | object : AbstractFunction2, Long>(), Serializable { 62 | override fun apply(taskContext: TaskContext, data: Iterator): Long { 63 | val task = when (action) { 64 | StatementAction.Index -> AlgoliaDataFrameWriter(schema, serializedSettings) 65 | StatementAction.Delete -> AlgoliaObjectsDeleteTask(schema, serializedSettings) 66 | } 67 | task.write(taskContext, data) 68 | return 0L 69 | } 70 | }, 71 | scala.reflect.`ClassTag$`.`MODULE$`.apply(Long::class.java) 72 | ) 73 | } 74 | 75 | } 76 | 77 | abstract class AlgoliaDataFrameBufferedTask(val schema: StructType, serializedSettings: String) { 78 | 79 | companion object { 80 | val DefaultBulkSize = 50 81 | } 82 | 83 | protected val settings = PropertiesSettings().load(serializedSettings) 84 | 85 | protected val bulkSize = settings.getProperty("algolia.write.bulkSize")?.toInt() ?: DefaultBulkSize 86 | 87 | protected val idField: String? = settings.getProperty("algolia.write.idfield") 88 | 89 | protected val algoliaClient: APIClient = ApacheAPIClientBuilder( 90 | settings.getProperty("algolia.write.applicationid"), 91 | settings.getProperty("algolia.write.apikey") 92 | ).setObjectMapper(JSON).build() 93 | 94 | protected val targetIndex = algoliaClient.initIndex(settings.getProperty("algolia.write.index"), Map::class.java) 95 | 96 | protected val buffer = ArrayList() 97 | 98 | private fun flush() { 99 | val objectsToWrite: List> = buffer.map { rowStr -> 100 | JSON.readValue>(rowStr).let { map -> 101 | if (idField != null) { 102 | map + ("objectID" to map[idField]) 103 | } else { 104 | map 105 | } 106 | } 107 | } 108 | if (objectsToWrite.isNotEmpty()) { 109 | flush(objectsToWrite) 110 | buffer.clear() 111 | } 112 | } 113 | 114 | abstract fun flush(objects: List>) 115 | 116 | fun write(taskContext: TaskContext, data: Iterator) { 117 | fun tryFlush() { 118 | if (buffer.size >= bulkSize) { 119 | flush() 120 | } 121 | } 122 | 123 | while (data.hasNext()) { 124 | val row = data.next() 125 | val out = FastByteArrayOutputStream() 126 | val generator = JacksonJsonGenerator(out) 127 | 128 | generator.use { generator -> 129 | DataFrameValueWriter().write(Tuple2(row, schema), generator) 130 | } 131 | 132 | buffer.add(out.toString()) 133 | tryFlush() 134 | } 135 | 136 | flush() 137 | } 138 | 139 | } 140 | 141 | class AlgoliaDataFrameWriter(schema: StructType, serializedSettings: String) : AlgoliaDataFrameBufferedTask(schema, serializedSettings) { 142 | 143 | override fun flush(objects: List>) { 144 | targetIndex.addObjects(objects) 145 | } 146 | 147 | } 148 | 149 | class AlgoliaObjectsDeleteTask(schema: StructType, serializedSettings: String) : AlgoliaDataFrameBufferedTask(schema, serializedSettings) { 150 | 151 | override fun flush(objects: List>) { 152 | if (idField == null) { 153 | throw IllegalStateException("Delete statement must have `idField` defined") 154 | } 155 | targetIndex.deleteObjects(objects.map { it[idField]?.toString() }.filterNotNull()) 156 | } 157 | 158 | } 159 | 160 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/AlgoliaStateManager.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import 
com.algolia.search.APIClient 4 | import com.algolia.search.ApacheAPIClientBuilder 5 | import com.algolia.search.Index 6 | import com.algolia.search.objects.IndexSettings 7 | import com.algolia.search.objects.Query 8 | import java.time.Instant 9 | 10 | data class AlgoliaState( 11 | val targetIndex: String, 12 | val statementId: String, 13 | val status: String, 14 | val lastRunId: String, 15 | val lastErrorMsg: String?, 16 | val lastRunDate: Long, 17 | val lastRowCount: Long 18 | ) 19 | 20 | data class AlgoliaLog( 21 | val targetIndex: String, 22 | val statementId: String, 23 | val runId: String, 24 | val status: String, 25 | val errorMsg: String?, 26 | val runDate: Long, 27 | val rowCount: Long 28 | ) 29 | 30 | data class AlgoliaLock( 31 | val targetIndex: String, 32 | val statementId: String, 33 | val runId: String, 34 | val lockDate: Long 35 | ) 36 | 37 | // TODO: this 'locking' mechanism is not safe, Algolia doesn't allow atomic updates 38 | class AlgoliaStateManager(applicationId: String, apiKey: String) : StateManager { 39 | 40 | companion object { 41 | val StateIndex = "kohesive-dih-state" 42 | val LogIndex = "kohesive-dih-log" 43 | val LockIndex = "kohesive-dih-lock" 44 | } 45 | 46 | private lateinit var stateIndex: Index 47 | private lateinit var logIndex: Index 48 | private lateinit var lockIndex: Index 49 | 50 | val algoliaClient: APIClient = ApacheAPIClientBuilder(applicationId, apiKey).setObjectMapper(JSON).build() 51 | 52 | override fun init() { 53 | fun initIndex(indexName: String, clazz: Class, settings: IndexSettings.() -> Unit): Index { 54 | return algoliaClient.listIndices().firstOrNull { it.name == indexName }?.let { 55 | algoliaClient.initIndex(indexName, clazz) 56 | } ?: kotlin.run { 57 | algoliaClient.initIndex(indexName, clazz).apply { 58 | this.settings = IndexSettings().apply { 59 | settings() 60 | } 61 | } 62 | } 63 | } 64 | 65 | stateIndex = initIndex(StateIndex, AlgoliaState::class.java) { 66 | searchableAttributes = listOf("targetIndex", "statementId", "status", "lastRunId", "lastErrorMsg") 67 | numericAttributesForFiltering = listOf("lastRunDate", "lastRowCount") 68 | } 69 | logIndex = initIndex(LogIndex, AlgoliaLog::class.java) { 70 | searchableAttributes = listOf("targetIndex", "statementId", "runId", "status", "errorMsg") 71 | numericAttributesForFiltering = listOf("runDate", "rowCount") 72 | } 73 | lockIndex = initIndex(LockIndex, AlgoliaLock::class.java) { 74 | searchableAttributes = listOf("targetIndex", "statementId", "runId") 75 | numericAttributesForFiltering = listOf("lockDate") 76 | } 77 | 78 | ttlKillOldLocks() 79 | } 80 | 81 | override fun lockStatement(runId: String, statement: DataImportStatement): Boolean { 82 | if (lockIndex.getObject(statement.stateKey()).isPresent) { 83 | ttlKillOldLocks() 84 | return pingLockStatement(runId, statement) 85 | } else { 86 | lockIndex.addObject(statement.stateKey(), AlgoliaLock( 87 | targetIndex = statement.indexName, 88 | statementId = statement.id, 89 | runId = runId, 90 | lockDate = now() 91 | )) 92 | return true 93 | } 94 | } 95 | 96 | override fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean { 97 | return lockIndex.getObject(statement.stateKey()).map { lock -> 98 | if (lock.runId == runId) { 99 | lockIndex.partialUpdateObject(statement.stateKey(), lock.copy( 100 | lockDate = now() 101 | )) 102 | true 103 | } else { 104 | throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} -- it is held by ${lock.runId} since ${lock.lockDate}") 105 | } 
106 | }.orElse(false) 107 | } 108 | 109 | private fun now() = Instant.now().toEpochMilli() 110 | 111 | private fun ttlKillOldLocks() { 112 | try { 113 | lockIndex.deleteByQuery(Query().setNumericFilters(listOf("lockDate < ${ now() - (1000 * 60 * 15) }"))) 114 | } catch (t: Throwable) { 115 | throw DataImportException("State manager failed, TTL delete query for locks failed", t) 116 | } 117 | } 118 | 119 | override fun unlockStatement(runId: String, statement: DataImportStatement) { 120 | if (pingLockStatement(runId, statement)) { 121 | try { 122 | lockIndex.deleteObject(statement.stateKey()) 123 | } catch (t: Throwable) { 124 | throw DataImportException("State manager failed, cannot delete lock for ${statement.stateKey()}", t) 125 | } 126 | } 127 | } 128 | 129 | override fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String?) { 130 | stateIndex.addObject(statement.stateKey(), AlgoliaState( 131 | targetIndex = statement.indexName, 132 | statementId = statement.id, 133 | lastErrorMsg = errMsg, 134 | lastRowCount = lastRowCount, 135 | lastRunDate = lastRunStart.toEpochMilli(), 136 | lastRunId = runId, 137 | status = status 138 | )) 139 | } 140 | 141 | override fun readStateForStatement(runId: String, statement: DataImportStatement): Instant? { 142 | return stateIndex.getObject(statement.stateKey()).map { Instant.ofEpochMilli(it.lastRunDate) }.orElse(null) 143 | } 144 | 145 | override fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String?) { 146 | logIndex.addObject("${statement.stateKey()}_run_${runId}", AlgoliaLog( 147 | targetIndex = statement.indexName, 148 | status = status, 149 | statementId = statement.id, 150 | runId = runId, 151 | errorMsg = errMsg, 152 | rowCount = rowCount, 153 | runDate = lastRunStart.toEpochMilli() 154 | )) 155 | } 156 | } -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/App.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper 4 | import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper 5 | import com.fasterxml.jackson.module.kotlin.readValue 6 | import com.typesafe.config.ConfigFactory 7 | import com.typesafe.config.ConfigRenderOptions 8 | import com.typesafe.config.ConfigResolveOptions 9 | import org.apache.spark.sql.AnalysisException 10 | import org.apache.spark.sql.SparkSession 11 | import org.apache.spark.sql.catalyst.parser.ParseException 12 | import org.apache.spark.storage.StorageLevel 13 | import java.io.File 14 | import java.io.InputStream 15 | import java.io.InputStreamReader 16 | import java.net.URL 17 | import java.net.URLClassLoader 18 | import java.sql.Timestamp 19 | import java.time.Instant 20 | import java.time.LocalDateTime 21 | import java.time.ZoneOffset 22 | import java.time.temporal.ChronoUnit 23 | import java.util.* 24 | 25 | class App { 26 | companion object { 27 | @JvmStatic fun main(args: Array) { 28 | println("Kohesive - Elasticsearch Data Import Utility") 29 | val configFile = args.takeIf { it.isNotEmpty() }?.let { File(it[0]).normalize().absoluteFile } 30 | 31 | if (configFile == null || !configFile.exists()) { 32 | println(" ERROR: configFile must be specified and must exist") 33 | configFile?.let { println("${it.absolutePath} does not 
exist") } 34 | println(" usage: App ") 35 | println() 36 | System.exit(-1) 37 | } 38 | 39 | try { 40 | App().run(configFile!!.inputStream(), configFile.parentFile) 41 | } catch (ex: Throwable) { 42 | System.err.println("Data import failed due to:") 43 | System.err.println(ex.message) 44 | System.err.println() 45 | System.err.println("Debug stack trace:") 46 | ex.printStackTrace() 47 | System.exit(-1) 48 | } 49 | } 50 | } 51 | 52 | enum class ImportTarget { 53 | ElasticSearch, 54 | Algolia 55 | } 56 | 57 | fun run(configInput: InputStream, configRelativeDir: File) { 58 | val hoconCfg = ConfigFactory.parseReader(InputStreamReader(configInput)).resolve(ConfigResolveOptions.noSystem()) 59 | val jsonCfg = hoconCfg.root().render(ConfigRenderOptions.concise().setJson(true)) 60 | val cfg = jacksonObjectMapper().readValue(jsonCfg) 61 | 62 | val uniqueId = UUID.randomUUID().toString() 63 | 64 | val sparkMaster = cfg.sparkMaster ?: "local[${(Runtime.getRuntime().availableProcessors() - 1).coerceAtLeast(1)}]" 65 | 66 | fun fileRelativeToConfig(filename: String): File { 67 | return configRelativeDir.resolve(filename).canonicalFile 68 | } 69 | 70 | println("Connecting to target ES/Algolia to check state...") 71 | val stateMap: Map = cfg.importSteps.map { importStep -> 72 | val mgr = if (importStep.targetElasticsearch != null) { 73 | ElasticSearchStateManager( 74 | nodes = importStep.targetElasticsearch.nodes, 75 | port = importStep.targetElasticsearch.port ?: 9200, 76 | enableSsl = importStep.targetElasticsearch.enableSsl ?: false, 77 | auth = importStep.targetElasticsearch.basicAuth 78 | ) 79 | } else if (importStep.targetAlgolia != null) { 80 | AlgoliaStateManager( 81 | applicationId = importStep.targetAlgolia.applicationId, 82 | apiKey = importStep.targetAlgolia.apiKey 83 | ) 84 | } else { 85 | throw IllegalStateException(importStep.description + " import step neither declares ES nor Algolia target") 86 | } 87 | 88 | mgr.init() 89 | 90 | importStep.statements.map { statement -> 91 | statement.id to mgr 92 | } 93 | }.flatten().toMap() 94 | 95 | val NOSTATE = LocalDateTime.of(1900, 1, 1, 0, 0, 0, 0).atZone(ZoneOffset.UTC).toInstant() 96 | 97 | fun DataImportStatement.validate(importTarget: ImportTarget) { 98 | // do a little validation of the .. 
99 | if (newIndexSettingsFile != null) { 100 | val checkFile = fileRelativeToConfig(newIndexSettingsFile) 101 | if (!checkFile.exists()) { 102 | throw IllegalStateException("The statement '${id}' new-index mapping file must exist: $checkFile") 103 | } 104 | } 105 | if (importTarget == ImportTarget.ElasticSearch && indexType == null && type == null) { 106 | throw IllegalArgumentException("The statement '${id}' is missing `indexType`") 107 | } 108 | if (type != null && indexType == null) { 109 | System.err.println(" Statement configuration parameter `type` is deprecated, use `indexType`") 110 | } 111 | if (sqlQuery == null && sqlFile == null) { 112 | throw IllegalArgumentException("The statement '${id}' is missing one of `sqlQuery` or `sqlFile`") 113 | } 114 | if (sqlQuery != null && sqlFile != null) { 115 | throw IllegalArgumentException("The statement '${id}' should have only one of `sqlQuery` or `sqlFile`") 116 | } 117 | if (sqlFile != null && !fileRelativeToConfig(sqlFile).exists()) { 118 | throw IllegalArgumentException("The statement '${id}' `sqlFile` must exist") 119 | } 120 | } 121 | 122 | val lastRuns: Map = cfg.importSteps.map { importStep -> 123 | val importTarget = if (importStep.targetElasticsearch != null) ImportTarget.ElasticSearch else ImportTarget.Algolia 124 | importStep.statements.map { statement -> 125 | val lastState = stateMap.get(statement.id)!!.readStateForStatement(uniqueId, statement)?.truncatedTo(ChronoUnit.SECONDS) ?: NOSTATE 126 | println(" Statement ${statement.id} - ${statement.description}") 127 | println(" LAST RUN: ${if (lastState == NOSTATE) "never" else lastState.toIsoString()}") 128 | 129 | statement.validate(importTarget) 130 | statement.id to lastState 131 | } 132 | }.flatten().toMap() 133 | 134 | cfg.prepStatements?.forEach { statement -> 135 | if (statement.sqlQuery == null && statement.sqlFile == null) { 136 | throw IllegalArgumentException("A prepStatement is missing one of `sqlQuery` or `sqlFile`") 137 | } 138 | if (statement.sqlQuery != null && statement.sqlFile != null) { 139 | throw IllegalArgumentException("A prepStatement should have only one of `sqlQuery` or `sqlFile`") 140 | } 141 | if (statement.sqlFile != null && !fileRelativeToConfig(statement.sqlFile).exists()) { 142 | throw IllegalArgumentException("A prepStatement `sqlFile` must exist: ${statement.sqlFile}") 143 | } 144 | } 145 | 146 | val thisRunDate = Instant.now().truncatedTo(ChronoUnit.SECONDS) 147 | 148 | val oldClassLoader = Thread.currentThread().contextClassLoader 149 | val extraClasspath = cfg.sources.jdbc?.map { it.driverJars }?.filterNotNull()?.flatten()?.map { URL("file://${File(it).normalize().absolutePath}") }?.toTypedArray() ?: emptyArray() 150 | val newClassLoader = URLClassLoader(extraClasspath, oldClassLoader) 151 | 152 | Thread.currentThread().contextClassLoader = newClassLoader 153 | try { 154 | SparkSession.builder() 155 | .appName("esDataImport-${uniqueId}") 156 | .config("spark.ui.enabled", false) 157 | .apply { 158 | cfg.sparkConfig?.forEach { 159 | config(it.key, it.value) 160 | } 161 | } 162 | .master(sparkMaster).getOrCreate().use { spark -> 163 | 164 | // add extra UDF functions 165 | DataImportHandlerUdfs.registerSparkUdfs(spark) 166 | 167 | // setup FILE inputs 168 | println() 169 | cfg.sources.filesystem?.forEach { filesystem -> 170 | println("Mounting filesystem ${filesystem.directory}") 171 | val dir = File(filesystem.directory).normalize() 172 | if (!dir.exists()) { 173 | throw DataImportException("Invalid filesystem directory: ${dir} does not exist") 
174 | } 175 | filesystem.tables.forEach { table -> 176 | println("Create table ${table.sparkTable} from filespec ${table.filespecs.joinToString()}") 177 | val options = mutableMapOf() 178 | filesystem.settings?.let { options.putAll(it) } 179 | table.settings?.let { options.putAll(it) } 180 | val fileSpecs = table.filespecs.map { "${dir.absolutePath}${File.separatorChar}${it}" }.toTypedArray() 181 | spark.read().format(table.format) 182 | .options(options) 183 | .load(*fileSpecs) 184 | .createOrReplaceTempView(table.sparkTable) 185 | } 186 | } 187 | 188 | // setup JDBC inputs 189 | println() 190 | cfg.sources.jdbc?.forEach { jdbc -> 191 | println("Mounting JDBC source ${jdbc.jdbcUrl}") 192 | jdbc.tables.forEach { table -> 193 | println("Creating table ${table.sparkTable} from JDBC ${table.sourceTable}") 194 | val sourceTable = table.sourceTable.takeIf { '.' in it } ?: if (jdbc.defaultSchema.isNullOrBlank()) table.sourceTable else "${jdbc.defaultSchema}.${table.sourceTable}" 195 | val options = mutableMapOf("url" to jdbc.jdbcUrl, 196 | "driver" to jdbc.driverClass, 197 | "user" to jdbc.auth.username, 198 | "password" to jdbc.auth.password, 199 | "dbtable" to sourceTable) 200 | jdbc.settings?.let { options.putAll(it) } 201 | table.settings?.let { options.putAll(it) } 202 | spark.read().format("jdbc") 203 | .options(options) 204 | .load() 205 | .createOrReplaceTempView(table.sparkTable) 206 | } 207 | } 208 | 209 | // setup ES inputs 210 | println() 211 | cfg.sources.elasticsearch?.forEach { es -> 212 | println("Mounting Elasticsearch source ${es.nodes.joinToString()}") 213 | es.tables.forEach { table -> 214 | val indexType = table.indexType ?: table.type 215 | println("Creating table ${table.sparkTable} from Elasticsearch index ${table.indexName}/${indexType}") 216 | if (indexType == null) { 217 | throw IllegalArgumentException(" Source configuration is missing parameter `indexType`") 218 | } 219 | if (table.type != null) { 220 | System.err.println(" Source configuration parameter `type` is deprecated, use `indexType`") 221 | } 222 | val options = mutableMapOf("es.nodes" to es.nodes.joinToString(",")) 223 | es.basicAuth?.let { 224 | options.put("es.net.http.auth.user", it.username) 225 | options.put("es.net.http.auth.pass", it.password) 226 | } 227 | es.port?.let { 228 | options.put("es.port", it.toString()) 229 | } 230 | es.enableSsl?.let { 231 | options.put("es.net.ssl", it.toString()) 232 | } 233 | 234 | es.settings?.let { options.putAll(it) } 235 | table.settings?.let { options.putAll(it) } 236 | 237 | if (table.esQuery != null) { 238 | if (table.esQuery is Map<*, *>) { 239 | @Suppress("UNCHECKED_CAST") 240 | val root = if (table.esQuery.containsKey("query")) table.esQuery else mapOf("query" to table.esQuery) 241 | options.put("es.query", ObjectMapper().writeValueAsString(root).replace('\n', ' ')) 242 | } else { 243 | options.put("es.query", table.esQuery.toString()) 244 | } 245 | } else { 246 | // defaults to match_all 247 | } 248 | 249 | spark.read().format("org.elasticsearch.spark.sql") 250 | .options(options) 251 | .load(indexSpec(table.indexName, indexType)) 252 | .createOrReplaceTempView(table.sparkTable) 253 | } 254 | } 255 | 256 | // run prep-queries 257 | println() 258 | cfg.prepStatements?.forEach { statement -> 259 | try { 260 | println("\nRunning prep-statement:\n${statement.description.replaceIndent(" ")}") 261 | val rawQuery = statement.sqlQuery ?: fileRelativeToConfig(statement.sqlFile!!).readText() 262 | spark.sql(rawQuery).let { 263 | if (statement.cache ?: false) { 264 | 
val storeLevel = statement.persist?.let { StorageLevel.fromString(it) } 265 | if (storeLevel != null) { 266 | it.persist(storeLevel) 267 | } else { 268 | it.cache() 269 | } 270 | } else { 271 | it 272 | } 273 | } 274 | } catch (ex: Throwable) { 275 | val msg = ex.toNiceMessage() 276 | throw DataImportException("Prep Statement: ${statement.description}\n$msg", ex) 277 | } 278 | println() 279 | } 280 | 281 | fun Importer.getDataImportHandler(statement: DataImportStatement): StatementDataImportHandler { 282 | return if (targetElasticsearch != null) { 283 | EsDataImportHandler(statement, configRelativeDir, targetElasticsearch) 284 | } else if (targetAlgolia != null) { 285 | AlgoliaDataImportHandler(statement, configRelativeDir, targetAlgolia) 286 | } else { 287 | throw IllegalStateException(description + " import step neither declares ES nor Algolia target") 288 | } 289 | } 290 | 291 | // run importers 292 | println() 293 | cfg.importSteps.forEach { import -> 294 | println("\nRunning importer:\n${import.description.replaceIndent(" ")}") 295 | import.statements.forEach { statement -> 296 | val stateMgr = stateMap.get(statement.id)!! 297 | val lastRun = lastRuns.get(statement.id)!! 298 | 299 | // SQL times will be local time zone, so much match the server 300 | val sqlMinDate = Timestamp.from(lastRun).toString() 301 | val sqlMaxDate = Timestamp.from(thisRunDate).toString() 302 | 303 | val dateMsg = if (lastRun == NOSTATE) { 304 | "range NEVER to '$sqlMaxDate'" 305 | } else { 306 | "range '$sqlMinDate' to '$sqlMaxDate'" 307 | } 308 | 309 | println("\n Execute statement: ($dateMsg)\n${statement.description.replaceIndent(" ")}") 310 | 311 | if (!stateMgr.lockStatement(uniqueId, statement)) { 312 | System.err.println(" Cannot acquire lock for statement ${statement.id}") 313 | } else { 314 | try { 315 | val importHandler = import.getDataImportHandler(statement) 316 | importHandler.prepareIndex() 317 | 318 | val rawQuery = statement.sqlQuery ?: fileRelativeToConfig(statement.sqlFile!!).readText() 319 | val subDataInQuery = rawQuery.replace("{lastRun}", sqlMinDate).replace("{thisRun}", sqlMaxDate) 320 | val sqlResults = try { 321 | spark.sql(subDataInQuery).let { 322 | if (statement.cache ?: false) { 323 | val storeLevel = statement.persist?.let { StorageLevel.fromString(it) } 324 | if (storeLevel != null) { 325 | it.persist(storeLevel) 326 | } else { 327 | it.cache() 328 | } 329 | } else { 330 | it 331 | } 332 | } 333 | } catch (ex: Throwable) { 334 | val msg = ex.toNiceMessage() 335 | throw DataImportException(msg, ex) 336 | } 337 | 338 | val rowCount = importHandler.import(sqlResults) 339 | println(" Rows processed: $rowCount") 340 | 341 | stateMgr.writeStateForStatement(uniqueId, statement, thisRunDate, "success", rowCount, null) 342 | stateMgr.logStatement(uniqueId, statement, thisRunDate, "success", rowCount, null) 343 | } catch (ex: Throwable) { 344 | val msg = ex.message ?: "unknown failure" 345 | stateMgr.writeStateForStatement(uniqueId, statement, lastRun, "error", 0, msg) 346 | stateMgr.logStatement(uniqueId, statement, thisRunDate, "error", 0, msg) 347 | // System.err.println("\nProcess FAILED: \n$msg\n") 348 | throw ex 349 | } finally { 350 | stateMgr.unlockStatement(uniqueId, statement) 351 | } 352 | } 353 | } 354 | } 355 | 356 | println("\nShutting down...") 357 | } 358 | println("\nDONE.") 359 | } finally { 360 | Thread.currentThread().contextClassLoader = oldClassLoader 361 | } 362 | } 363 | } 364 | 365 | fun Throwable.toNiceMessage(): String = when (this) { 366 | is AnalysisException 
-> """Error:(${this.line().takeIf { it.isDefined }?.toString() ?: "?"},${this.startPosition().takeIf { it.isDefined }?.toString() ?: "?"}) ${this.message}""" 367 | is ParseException -> """Error:(${this.line().takeIf { it.isDefined }?.toString() ?: "?"},${this.startPosition().takeIf { it.isDefined }?.toString() ?: "?"}) ${this.message}""" 368 | else -> this.message ?: "unknown error" 369 | } 370 | 371 | 372 | fun indexSpec(indexName: String, type: String?): String { 373 | return "${indexName}/${type?.trim() ?: ""}" 374 | } 375 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Config.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | 4 | data class DataImportHandlerConfig(val sparkMaster: String? = null, 5 | val sources: Connections, 6 | val prepStatements: List? = null, 7 | val importSteps: List, 8 | val sparkConfig: Map? = null) 9 | 10 | data class AuthInfo(val username: String, val password: String) 11 | 12 | data class Connections(val elasticsearch: List?, 13 | val jdbc: List? = null, 14 | val filesystem: List? = null) 15 | 16 | data class EsConnection(val nodes: List, 17 | val basicAuth: AuthInfo? = null, 18 | val port: Int? = 9200, 19 | val enableSsl: Boolean? = false, 20 | val tables: List, 21 | val settings: Map? = null) { 22 | } 23 | 24 | data class JdbcConnection(val jdbcUrl: String, 25 | val driverClass: String, 26 | val defaultSchema: String, 27 | val auth: AuthInfo, 28 | val driverJars: List? = null, 29 | val tables: List, 30 | val settings: Map? = null) 31 | 32 | data class FileDir(val directory: String, 33 | val tables: List, 34 | val settings: Map? = null) 35 | 36 | data class JdbcSource(val sparkTable: String, 37 | val sourceTable: String, 38 | val settings: Map? = null) 39 | 40 | data class FileSource(val sparkTable: String, val format: String, val filespecs: List, 41 | val settings: Map? = null) 42 | 43 | data class EsSource(val sparkTable: String, val indexName: String, val type: String?, val indexType: String?, val esQuery: Any? = null, 44 | val settings: Map? = null) 45 | 46 | data class PrepStatement(val description: String, val sqlQuery: String?, val sqlFile: String?, val cache: Boolean? = null, val persist: String? = "MEMORY_ONLY") 47 | 48 | data class Importer(val description: String, 49 | val targetElasticsearch: EsTargetConnection?, 50 | val targetAlgolia: AlgoliaTargetConnection?, 51 | val statements: List) 52 | 53 | data class AlgoliaTargetConnection(val applicationId: String, val apiKey: String) 54 | 55 | data class EsTargetConnection(val nodes: List, 56 | val basicAuth: AuthInfo? = null, 57 | val port: Int? = 9200, 58 | val enableSsl: Boolean? = false, 59 | val settings: Map? = null) 60 | 61 | data class DataImportStatement(val id: String, 62 | val description: String, 63 | val indexName: String, 64 | val indexType: String?, 65 | val type: String?, 66 | val action: String?, 67 | val idField: String?, 68 | val newIndexSettingsFile: String?, 69 | val sqlQuery: String?, 70 | val sqlFile: String?, 71 | val cache: Boolean? = null, 72 | val persist: String? = "MEMORY_ONLY", 73 | val settings: Map? 
= null) { 74 | 75 | fun getAction(): StatementAction = if (action?.toLowerCase() == "delete") { 76 | StatementAction.Delete 77 | } else { 78 | StatementAction.Index 79 | } 80 | 81 | } 82 | 83 | enum class StatementAction { 84 | Index, 85 | Delete 86 | } 87 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/DataImportHandler.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import org.apache.spark.sql.Dataset 4 | import org.apache.spark.sql.Row 5 | import java.io.File 6 | 7 | interface StatementDataImportHandler { 8 | 9 | val configRelativeDir: File 10 | 11 | val statement: DataImportStatement 12 | 13 | fun prepareIndex() 14 | 15 | /** 16 | * Returns processed rows count. 17 | */ 18 | fun import(dataSet: Dataset): Long 19 | 20 | fun fileRelativeToConfig(filename: String): File = configRelativeDir.resolve(filename).canonicalFile 21 | 22 | } 23 | 24 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/EsDataImportHandler.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import org.apache.spark.sql.Dataset 4 | import org.apache.spark.sql.Row 5 | import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL 6 | import java.io.File 7 | 8 | class EsDataImportHandler( 9 | override val statement: DataImportStatement, 10 | override val configRelativeDir: File, 11 | 12 | targetElasticsearch: EsTargetConnection 13 | ) : StatementDataImportHandler { 14 | 15 | val options = mutableMapOf("es.nodes" to targetElasticsearch.nodes.joinToString(",")) 16 | 17 | init { 18 | // look for table create setting 19 | targetElasticsearch.basicAuth?.let { 20 | options.put("es.net.http.auth.user", it.username) 21 | options.put("es.net.http.auth.pass", it.password) 22 | } 23 | targetElasticsearch.port?.let { port -> 24 | options.put("es.port", port.toString()) 25 | } 26 | targetElasticsearch.enableSsl?.let { enableSsl -> 27 | options.put("es.net.ssl", enableSsl.toString()) 28 | } 29 | targetElasticsearch.settings?.let { options.putAll(it) } 30 | statement.settings?.let { options.putAll(it) } 31 | } 32 | 33 | val esClient = MicroEsClient(targetElasticsearch.nodes, 34 | targetElasticsearch.port ?: 9200, 35 | targetElasticsearch.enableSsl ?: false, 36 | targetElasticsearch.basicAuth 37 | ) 38 | 39 | override fun import(dataSet: Dataset): Long { 40 | JavaEsSparkSQL.saveToEs(dataSet, indexSpec(statement.indexName, statement.indexType ?: statement.type), options) 41 | return dataSet.count() 42 | } 43 | 44 | override fun prepareIndex() { 45 | val autocreate: Boolean = options.getOrDefault("es.index.auto.create", "true").toBoolean() 46 | 47 | val indexExists = esClient.checkIndexExists(statement.indexName) 48 | if (!autocreate) { 49 | if (!indexExists) { 50 | throw IllegalStateException("Index auto-create setting 'es.index.auto.create' is false and index ${statement.indexName} does not exist.") 51 | } 52 | } else { 53 | if (!indexExists) { 54 | println(" Index ${statement.indexName} does not exist, auto creating") 55 | if (statement.newIndexSettingsFile != null) { 56 | val checkFile = fileRelativeToConfig(statement.newIndexSettingsFile) 57 | println(" Creating ${statement.indexName} with settings/mapping file: $checkFile") 58 | val response = esClient.createIndex(statement.indexName, 
checkFile.readText()) 59 | if (!response.isSuccess) { 60 | throw IllegalStateException("Could not create index ${statement.indexName} with settings/mapping file: $checkFile, due to:\n${response.responseJson}") 61 | } 62 | } else { 63 | println(" Index will be created without settings/mappings file, will use index templates or dynamic mappings") 64 | } 65 | } 66 | } 67 | } 68 | } -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Exceptions.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | class DataImportException(msg: String, cause: Throwable? = null) : Exception(msg, cause) 4 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/MicroEsClient.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import com.fasterxml.jackson.module.kotlin.readValue 4 | import okhttp3.* 5 | 6 | /** 7 | * TODO: Change to use RestClient from ES / Spark integration 8 | */ 9 | 10 | class MicroEsClient(nodes: List, port: Int = 9200, enableSsl: Boolean = false, auth: AuthInfo? = null) { 11 | val http = OkHttpClient().newBuilder().apply { 12 | auth?.let { addInterceptor(BasicAuthInterceptor(it)) } 13 | }.build() 14 | val protocol = if (enableSsl) "https" else "http" 15 | val host = nodes.first() 16 | val hostWithPort = if (':' in host.substringAfter('@', host)) host else "${host}:${port}" 17 | val url = "${protocol}://${hostWithPort}" 18 | 19 | fun String.fixRestAppendage(): String { 20 | if (this.startsWith("?")) return this 21 | if (this.startsWith("/")) return this 22 | return "/" + this 23 | } 24 | 25 | fun makeIndexTypeUrl(index: String, type: String) = "$url/$index/$type" 26 | fun makeIndexTypeUrl(index: String, type: String, restOfUrl: String) = "$url/$index/$type${restOfUrl.fixRestAppendage()}" 27 | fun makeIndexTypeIdUrl(index: String, type: String, id: String, restOfUrl: String) = "$url/$index/$type/$id${restOfUrl.fixRestAppendage()}" 28 | 29 | private fun OkHttpClient.get(url: String): CallResponse { 30 | val request = Request.Builder().url(url).build() 31 | val response = http.newCall(request).execute() 32 | return CallResponse(response.code(), response.use { it.body()?.string() ?: "" }) 33 | } 34 | 35 | private fun OkHttpClient.delete(url: String): CallResponse { 36 | val request = Request.Builder().url(url).delete().build() 37 | val response = http.newCall(request).execute() 38 | return CallResponse(response.code(), response.use { it.body()?.string() ?: "" }) 39 | } 40 | 41 | private fun OkHttpClient.delete(url: String, jsonBody: String): CallResponse { 42 | val jsonMediaType = MediaType.parse("application/json; charset=utf-8") 43 | val body = RequestBody.create(jsonMediaType, jsonBody) 44 | val request = Request.Builder().url(url).delete(body).build() 45 | val response = http.newCall(request).execute() 46 | return CallResponse(response.code(), response.use { it.body()?.string() ?: "" }) 47 | } 48 | 49 | private fun OkHttpClient.post(url: String, jsonBody: String): CallResponse { 50 | val jsonMediaType = MediaType.parse("application/json; charset=utf-8") 51 | val body = RequestBody.create(jsonMediaType, jsonBody) 52 | val request = Request.Builder().url(url).post(body).build() 53 | val response = http.newCall(request).execute() 54 | return 
CallResponse(response.code(), response.use { it.body()?.string() ?: "" }) 55 | } 56 | 57 | private fun OkHttpClient.put(url: String, jsonBody: String): CallResponse { 58 | val jsonMediaType = MediaType.parse("application/json; charset=utf-8") 59 | val body = RequestBody.create(jsonMediaType, jsonBody) 60 | val request = Request.Builder().url(url).put(body).build() 61 | val response = http.newCall(request).execute() 62 | return CallResponse(response.code(), response.use { it.body()?.string() ?: "" }) 63 | } 64 | 65 | fun indexTypePOST(indexName: String, indexType: String, restOfUrl: String, postJson: String): CallResponse { 66 | return http.post(makeIndexTypeUrl(indexName, indexType, restOfUrl), postJson) 67 | } 68 | 69 | fun indexTypeIdPOST(indexName: String, indexType: String, id: String, restOfUrl: String, postJson: String): CallResponse { 70 | return http.post(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl), postJson) 71 | } 72 | 73 | fun indexTypeDELETE(indexName: String, indexType: String, restOfUrl: String): CallResponse { 74 | return http.delete(makeIndexTypeUrl(indexName, indexType, restOfUrl)) 75 | } 76 | 77 | fun indexTypeIdDELETE(indexName: String, indexType: String, id: String, restOfUrl: String): CallResponse { 78 | return http.delete(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl)) 79 | } 80 | 81 | fun indexTypeGET(indexName: String, indexType: String): CallResponse { 82 | return http.get(makeIndexTypeUrl(indexName, indexType)) 83 | } 84 | 85 | fun indexTypeGET(indexName: String, indexType: String, restOfUrl: String): CallResponse { 86 | return http.get(makeIndexTypeUrl(indexName, indexType, restOfUrl)) 87 | } 88 | 89 | fun indexTypeIdGET(indexName: String, indexType: String, id: String): CallResponse { 90 | return http.get(makeIndexTypeUrl(indexName, indexType, id)) 91 | } 92 | 93 | fun indexTypeIdGET(indexName: String, indexType: String, id: String, restOfUrl: String): CallResponse { 94 | return http.get(makeIndexTypeIdUrl(indexName, indexType, id, restOfUrl)) 95 | } 96 | 97 | fun createIndex(indexName: String, settingsJson: String): CallResponse { 98 | return http.put("${url}/${indexName}", settingsJson) 99 | } 100 | 101 | fun waitForIndexGreen(indexName: String) { 102 | val response = http.get("${url}/_cluster/health/${indexName}?wait_for_status=green&timeout=10s") 103 | if (!response.isSuccess) throw DataImportException("State manager failed, cannot check state index status") 104 | val state = JSON.readTree(response.responseJson) 105 | if (state.get("timed_out").asBoolean()) throw DataImportException("State manager failed, timeout waiting on state index to be 'green'") 106 | if (state.get("status").asText() != "green") throw DataImportException("State manager failed, state index must be 'green' but was '${state.get("status")}'") 107 | } 108 | 109 | fun checkIndexExists(indexName: String): Boolean { 110 | return http.get("${url}/${indexName}").isSuccess 111 | } 112 | 113 | inline fun mapFromSource(response: String): T = JSON.readTree(response).get("_source").traverse().let { JSON.readValue(it) }!! 
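    // note: mapFromSource (above) deserializes the "_source" field of an Elasticsearch GET response,
    // while CallResponse (below) simply pairs the HTTP status code with the raw response body (2xx == success)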
114 | 115 | data class CallResponse(val code: Int, val responseJson: String) { 116 | val isSuccess: Boolean get() = code in 200..299 117 | } 118 | } 119 | 120 | class BasicAuthInterceptor(val authInfo: AuthInfo) : Interceptor { 121 | override fun intercept(chain: Interceptor.Chain): Response { 122 | val request = chain.request() 123 | val requestWithAuth = request.newBuilder().header("Authorization", Credentials.basic(authInfo.username, authInfo.password)).build() 124 | return chain.proceed(requestWithAuth) 125 | } 126 | } -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/State.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import java.time.Instant 4 | 5 | interface StateManager { 6 | fun init() 7 | fun lockStatement(runId: String, statement: DataImportStatement): Boolean 8 | fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean 9 | fun unlockStatement(runId: String, statement: DataImportStatement) 10 | fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String? = null) 11 | fun readStateForStatement(runId: String, statement: DataImportStatement): Instant? 12 | fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String? = null) 13 | } 14 | 15 | fun DataImportStatement.stateKey(): String = this.indexName + "-" + this.id 16 | 17 | // TODO: better state management 18 | // This is NOT using the ES client because we do not want conflicts with Spark dependencies 19 | class ElasticSearchStateManager(val nodes: List, val port: Int = 9200, val enableSsl: Boolean = false, val auth: AuthInfo?) 
: StateManager { 20 | val STATE_INDEX = ".kohesive-dih-state-v2" 21 | val esClient = MicroEsClient(nodes, port, enableSsl, auth) 22 | 23 | override fun init() { 24 | if (esClient.checkIndexExists(STATE_INDEX)) { 25 | esClient.waitForIndexGreen(STATE_INDEX) 26 | } else { 27 | val response = esClient.createIndex(STATE_INDEX, """ 28 | { 29 | "settings": { 30 | "number_of_shards": 1, 31 | "number_of_replicas": "0" 32 | }, 33 | "mappings": { 34 | "state": { 35 | "properties": { 36 | "targetIndex": { "type": "keyword" }, 37 | "statementId": { "type": "keyword" }, 38 | "lastRunDate": { "type": "date" }, 39 | "status": { "type": "keyword" }, 40 | "lastRunId": { "type": "keyword" }, 41 | "lastErrorMsg": { "type": "text" }, 42 | "lastRowCount": { "type": "long" } 43 | } 44 | }, 45 | "log": { 46 | "properties": { 47 | "targetIndex": { "type": "keyword" }, 48 | "statementId": { "type": "keyword" }, 49 | "runId": { "type": "keyword" }, 50 | "runDate": { "type": "date" }, 51 | "status": { "type": "keyword" }, 52 | "errorMsg": { "type": "text" }, 53 | "rowCount": { "type": "long" } 54 | } 55 | }, 56 | "lock": { 57 | "properties": { 58 | "targetIndex": { "type": "keyword" }, 59 | "statementId": { "type": "keyword" }, 60 | "runId": { "type": "keyword" }, 61 | "lockDate": { "type": "date" } 62 | } 63 | } 64 | } 65 | } 66 | """) 67 | 68 | if (response.isSuccess) { 69 | esClient.waitForIndexGreen(STATE_INDEX) 70 | } else { 71 | throw DataImportException("State manager failed, cannot create state index\n${response.responseJson}") 72 | } 73 | } 74 | 75 | ttlKillOldLocks() 76 | } 77 | 78 | data class Lock(val runId: String, val targetIndex: String, val statementId: String, val lockDate: Instant) 79 | 80 | private fun ttlKillOldLocks() { 81 | val response = esClient.indexTypePOST(STATE_INDEX, "lock", "/_delete_by_query?refresh", 82 | """ 83 | { "query": { "range": { "lockDate": { "lt": "now-15m" } } } } 84 | """) 85 | if (!response.isSuccess) throw DataImportException("State manager failed, TTL delete query for locks failed\n${response.responseJson}") 86 | } 87 | 88 | override fun lockStatement(runId: String, statement: DataImportStatement): Boolean { 89 | val response = esClient.indexTypeIdPOST(STATE_INDEX, "lock", statement.stateKey(), "?op_type=create", 90 | JSON.writeValueAsString(Lock(runId, statement.indexName, statement.id, Instant.now()))) 91 | 92 | if (!response.isSuccess) { 93 | ttlKillOldLocks() 94 | return pingLockStatement(runId, statement) 95 | } 96 | return true 97 | } 98 | 99 | override fun pingLockStatement(runId: String, statement: DataImportStatement): Boolean { 100 | val response = esClient.indexTypeIdGET(STATE_INDEX, "lock", statement.stateKey()) 101 | if (response.isSuccess) { 102 | val lock = esClient.mapFromSource(response.responseJson) 103 | if (lock.runId == runId) { 104 | // TODO: we did a get, so have the version, change to a update with version ID 105 | val updResponse = esClient.indexTypePOST(STATE_INDEX, "lock", "/_update_by_query?refresh", """ 106 | { 107 | "script": { 108 | "inline": "ctx._source.lockDate = Instant.ofEpochMilli(${Instant.now().toEpochMilli()}L)", 109 | "lang": "painless" 110 | }, 111 | "query": { 112 | "bool": { 113 | "must": [ 114 | { "term": { "runId": "$runId" } }, 115 | { "term": { "targetIndex": "${statement.indexName}" } }, 116 | { "term": { "statementId": "${statement.id}" } } 117 | ] 118 | } 119 | } 120 | } 121 | """) 122 | if (!updResponse.isSuccess) { 123 | throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} - 
had conflict on pinging of lock\n${updResponse.responseJson}") 124 | } 125 | } else { 126 | throw DataImportException("State manager failed, cannot acquire lock for ${statement.stateKey()} -- it is held by ${lock.runId} since ${lock.lockDate.toIsoString()}\n${response.responseJson}") 127 | } 128 | } 129 | return true 130 | } 131 | 132 | override fun unlockStatement(runId: String, statement: DataImportStatement) { 133 | if (pingLockStatement(runId, statement)) { 134 | val response = esClient.indexTypeIdDELETE(STATE_INDEX, "lock", statement.stateKey(), "?refresh") 135 | if (!response.isSuccess) { 136 | throw DataImportException("State manager failed, cannot delete lock for ${statement.stateKey()}\n${response.responseJson}") 137 | } 138 | } 139 | } 140 | 141 | data class State(val targetIndex: String, val statementId: String, val lastRunDate: Instant, val status: String, val lastRunId: String, val lastErrorMesasge: String?, val lastRowCount: Long) 142 | 143 | data class StateLog(val targetIndex: String, val statementId: String, val runId: String, val runDate: Instant, val status: String, val errorMsg: String?, val rowCount: Long) 144 | 145 | override fun writeStateForStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, lastRowCount: Long, errMsg: String?) { 146 | val response = esClient.indexTypeIdPOST(STATE_INDEX, "state", statement.stateKey(), "?refresh", 147 | JSON.writeValueAsString(State(statement.indexName, statement.id, lastRunStart, status, runId, errMsg, lastRowCount))) 148 | if (!response.isSuccess) { 149 | throw DataImportException("State manager failed, cannot update state for ${statement.stateKey()}\n${response.responseJson}") 150 | } 151 | } 152 | 153 | override fun readStateForStatement(runId: String, statement: DataImportStatement): Instant? { 154 | val response = esClient.indexTypeIdGET(STATE_INDEX, "state", statement.stateKey()) 155 | if (response.isSuccess) { 156 | val state = esClient.mapFromSource(response.responseJson) 157 | return state.lastRunDate 158 | } else { 159 | return null 160 | } 161 | } 162 | 163 | override fun logStatement(runId: String, statement: DataImportStatement, lastRunStart: Instant, status: String, rowCount: Long, errMsg: String?) 
{ 164 | val response = esClient.indexTypeIdPOST(STATE_INDEX, "log", "${statement.stateKey()}_run_${runId}", "?refresh", 165 | JSON.writeValueAsString(StateLog(statement.indexName, statement.id, runId, lastRunStart, status, errMsg, rowCount))) 166 | if (!response.isSuccess) { 167 | throw DataImportException("State manager failed, cannot log state for ${statement.stateKey()}\n${response.responseJson}") 168 | } 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Udfs.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.jsoup.Jsoup 5 | import org.jsoup.parser.Parser 6 | import org.jsoup.safety.Whitelist 7 | import uy.kohesive.elasticsearch.dataimport.udf.Udfs 8 | import java.sql.Date 9 | import java.sql.Timestamp 10 | 11 | object DataImportHandlerUdfs { 12 | fun registerSparkUdfs(spark: SparkSession) { 13 | Udfs.registerStringToStringUdf(spark, "fluffly", fluffly) 14 | Udfs.registerStringToStringUdf(spark, "stripHtml", stripHtmlCompletely) 15 | Udfs.registerStringToStringUdf(spark, "normalizeQuotes", normalizeQuotes) 16 | Udfs.registerStringToStringUdf(spark, "unescapeHtmlEntites", unescapeHtmlEntities) 17 | Udfs.registerAnyAnyToTimestampUdf(spark, "combineDateTime", combineDateTime) 18 | } 19 | 20 | @JvmStatic val whiteListMap = mapOf( 21 | "none" to Whitelist.none(), 22 | "basic" to Whitelist.basic(), 23 | "basicwithimages" to Whitelist.basicWithImages(), 24 | "relaxed" to Whitelist.relaxed(), 25 | "simpletext" to Whitelist.simpleText(), 26 | "simple" to Whitelist.simpleText() 27 | ) 28 | 29 | @JvmStatic val fluffly = fun (v: String): String = "fluffly " + v 30 | 31 | @JvmStatic val stripHtmlCompletely = fun (v: String): String { 32 | return Jsoup.parseBodyFragment(v).text() 33 | } 34 | 35 | @JvmStatic val normalizeQuotes = fun (v: String): String { 36 | return v.replace("\\'", "'").replace("''", "\"") 37 | } 38 | 39 | @JvmStatic val unescapeHtmlEntities = fun (v: String): String { 40 | return Parser.unescapeEntities(v, false) 41 | } 42 | 43 | @JvmStatic val combineDateTime = fun (date: Date?, time: Timestamp?): Timestamp? 
{ 44 | // https://stackoverflow.com/questions/26649530/merge-date-and-time-into-timestamp 45 | if (date == null || time == null) { 46 | return null 47 | } 48 | 49 | // val dd = (date.time / 86400000L * 86400000L) - date.timezoneOffset * 60000 50 | // val tt = time.time - time.time / 86400000L * 86400000L 51 | // return Timestamp(dd + tt) 52 | 53 | return Timestamp(date.year, date.month, date.date, time.hours, time.minutes, time.seconds, 0) 54 | } 55 | 56 | } -------------------------------------------------------------------------------- /src/main/kotlin/uy/kohesive/elasticsearch/dataimport/Util.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import com.fasterxml.jackson.databind.DeserializationFeature 4 | import com.fasterxml.jackson.databind.SerializationFeature 5 | import com.fasterxml.jackson.datatype.jdk8.Jdk8Module 6 | import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule 7 | import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper 8 | import java.time.format.DateTimeFormatter 9 | import java.time.temporal.Temporal 10 | 11 | fun isoDateFormat(): DateTimeFormatter = DateTimeFormatter.ISO_INSTANT 12 | fun Temporal.toIsoString(): String = isoDateFormat().format(this) 13 | 14 | val JSON = jacksonObjectMapper().registerModules(JavaTimeModule(), Jdk8Module()).apply { 15 | configure(SerializationFeature.WRITE_DATE_TIMESTAMPS_AS_NANOSECONDS, false) 16 | configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, true) 17 | configure(DeserializationFeature.READ_DATE_TIMESTAMPS_AS_NANOSECONDS, false) 18 | } -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/test/kotlin/uy/kohesive/elasticsearch/dataimport/ManualTestOfDataImport.kt: -------------------------------------------------------------------------------- 1 | package uy.kohesive.elasticsearch.dataimport 2 | 3 | import java.io.ByteArrayInputStream 4 | import java.io.File 5 | 6 | class ManualTestOfDataImport { 7 | // TODO: This test requires Elasticsearch to be available, it is difficult to Run ES and Spark together due to conflicting dependencies 8 | companion object { 9 | @JvmStatic fun main(args: Array) { 10 | val confFile = File("./src/test/resources/manual-test.conf") 11 | App().run(confFile.inputStream(), confFile.parentFile) 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/manual-mappings.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index": { 4 | "analysis": { 5 | "filter": { 6 | "de_stop_filter": { 7 | "type": "stop", 8 | "stopwords": "_german_" 9 | }, 10 | "de_stemmer_filter": { 11 | "type": "stemmer", 12 | "langauge": "minimal_german", 13 | "respect_keywords": true 14 | }, 15 | "unique_same_position_filter": { 16 | "type": "unique", 17 | "only_on_same_position": true 18 | }, 19 | "de_decomp_filter": { 20 | "type": "decompound", 21 | "language": "de", 22 | "respect_keywords": true 23 | }, 24 | "de_fst_decomp_filter": { 25 | "type": "fst_decompound", 26 | "language": "de", 27 | "respect_keywords": true 28 | }, 29 | "de_baseform_filter": 
{ 30 | "type": "baseform", 31 | "language": "de", 32 | "respect_keywords": true 33 | }, 34 | "de_lemmatize_filter": { 35 | "type": "lemmatize", 36 | "language": "de", 37 | "respect_keywords": true 38 | }, 39 | "de_synonym_lowercase_filter": { 40 | "type": "synonym", 41 | "synonyms_path": "analysis/de-synonyms-lowercase-minimal.txt", 42 | "respect_keywords": true 43 | }, 44 | "de_synonym_casesensitive_filter": { 45 | "type": "synonym", 46 | "synonyms_path": "analysis/de-synonyms-case-sensitive-minimal.txt", 47 | "respect_keywords": true 48 | }, 49 | "haystack_ngram_filter": { 50 | "type": "nGram", 51 | "min_gram": "3", 52 | "max_gram": "15" 53 | }, 54 | "haystack_edge_ngram_filter": { 55 | "type": "edge_ngram", 56 | "min_gram": "3", 57 | "max_gram": "15" 58 | } 59 | }, 60 | "analyzer": { 61 | "de_decomp_query_analysis": { 62 | "type": "custom", 63 | "tokenizer": "icu_tokenizer", 64 | "filter": [ 65 | "keyword_repeat", 66 | "de_synonym_casesensitive_filter", 67 | "icu_normalizer", 68 | "de_stop_filter", 69 | "de_synonym_lowercase_filter", 70 | "de_lemmatize_filter", 71 | "icu_folding", 72 | "unique_same_position_filter" 73 | ] 74 | }, 75 | "de_decomp_index_analysis": { 76 | "type": "custom", 77 | "tokenizer": "icu_tokenizer", 78 | "filter": [ 79 | "keyword_repeat", 80 | "de_synonym_casesensitive_filter", 81 | "icu_normalizer", 82 | "de_stop_filter", 83 | "de_synonym_lowercase_filter", 84 | "de_decomp_filter", 85 | "de_lemmatize_filter", 86 | "icu_folding", 87 | "unique_same_position_filter" 88 | ] 89 | }, 90 | "de_decomp_nostop_query_analysis": { 91 | "type": "custom", 92 | "tokenizer": "icu_tokenizer", 93 | "filter": [ 94 | "keyword_repeat", 95 | "de_synonym_casesensitive_filter", 96 | "icu_normalizer", 97 | "de_synonym_lowercase_filter", 98 | "de_lemmatize_filter", 99 | "icu_folding", 100 | "unique_same_position_filter" 101 | ] 102 | }, 103 | "de_decomp_nostop_index_analysis": { 104 | "type": "custom", 105 | "tokenizer": "icu_tokenizer", 106 | "filter": [ 107 | "keyword_repeat", 108 | "de_synonym_casesensitive_filter", 109 | "icu_normalizer", 110 | "de_synonym_lowercase_filter", 111 | "de_decomp_filter", 112 | "de_lemmatize_filter", 113 | "icu_folding", 114 | "unique_same_position_filter" 115 | ] 116 | }, 117 | "de_synonym_analysis": { 118 | "type": "custom", 119 | "tokenizer": "icu_tokenizer", 120 | "filter": [ 121 | "keyword_repeat", 122 | "de_synonym_casesensitive_filter", 123 | "icu_normalizer", 124 | "de_stop_filter", 125 | "de_synonym_lowercase_filter", 126 | "de_lemmatize_filter", 127 | "icu_folding", 128 | "de_stemmer_filter", 129 | "unique_same_position_filter" 130 | ] 131 | }, 132 | "de_synonym_nostop_analysis": { 133 | "type": "custom", 134 | "tokenizer": "icu_tokenizer", 135 | "filter": [ 136 | "keyword_repeat", 137 | "de_synonym_casesensitive_filter", 138 | "icu_normalizer", 139 | "de_synonym_lowercase_filter", 140 | "de_lemmatize_filter", 141 | "icu_folding", 142 | "de_stemmer_filter", 143 | "unique_same_position_filter" 144 | ] 145 | }, 146 | "de_base_analysis": { 147 | "type": "custom", 148 | "tokenizer": "icu_tokenizer", 149 | "filter": [ 150 | "icu_normalizer", 151 | "de_stop_filter", 152 | "icu_folding", 153 | "de_stemmer_filter" 154 | ] 155 | }, 156 | "de_base_nostop_analysis": { 157 | "type": "custom", 158 | "tokenizer": "icu_tokenizer", 159 | "filter": [ 160 | "icu_normalizer", 161 | "icu_folding", 162 | "de_stemmer_filter" 163 | ] 164 | }, 165 | "ngram_analysis": { 166 | "type": "custom", 167 | "filter": [ 168 | "icu_normalizer", 169 | "haystack_ngram_filter" 170 | 
], 171 | "tokenizer": "icu_tokenizer" 172 | }, 173 | "edge_ngram_analysis": { 174 | "type": "custom", 175 | "filter": [ 176 | "icu_normalizer", 177 | "haystack_edge_ngram_filter" 178 | ], 179 | "tokenizer": "icu_tokenizer" 180 | } 181 | } 182 | }, 183 | "number_of_shards": "5", 184 | "number_of_replicas": "0" 185 | } 186 | }, 187 | "mappings": { 188 | "modelresult": { 189 | "_all": { 190 | "enabled": false 191 | }, 192 | "properties": { 193 | "content_auto": { 194 | "type": "text", 195 | "analyzer": "ngram_analysis" 196 | }, 197 | "heading": { 198 | "type": "text", 199 | "analyzer": "ngram_analysis", 200 | "fields": { 201 | "raw": { 202 | "type": "text", 203 | "analyzer": "german" 204 | }, 205 | "de_base": { 206 | "type": "text", 207 | "analyzer": "de_base_analysis" 208 | }, 209 | "de_syn": { 210 | "type": "text", 211 | "analyzer": "de_synonym_analysis" 212 | }, 213 | "de_decomp": { 214 | "type": "text", 215 | "analyzer": "de_decomp_index_analysis", 216 | "search_analyzer": "de_decomp_query_analysis" 217 | }, 218 | "ngram": { 219 | "type": "text", 220 | "analyzer": "ngram_analysis" 221 | } 222 | } 223 | }, 224 | "date": { 225 | "type": "date", 226 | "format": "date_optional_time||epoch_millis" 227 | }, 228 | "django_ct": { 229 | "type": "keyword", 230 | "include_in_all": false 231 | }, 232 | "django_id": { 233 | "type": "keyword", 234 | "include_in_all": false 235 | }, 236 | "id": { 237 | "type": "keyword" 238 | }, 239 | "offer": { 240 | "type": "boolean" 241 | }, 242 | "staffname": { 243 | "type": "text", 244 | "copy_to": "spelldata", 245 | "fields": { 246 | "raw": { 247 | "type": "text", 248 | "analyzer": "german" 249 | }, 250 | "de_base": { 251 | "type": "text", 252 | "analyzer": "de_base_analysis" 253 | }, 254 | "de_base_nostop": { 255 | "type": "text", 256 | "analyzer": "de_base_nostop_analysis" 257 | }, 258 | "de_syn": { 259 | "type": "text", 260 | "analyzer": "de_synonym_analysis" 261 | }, 262 | "de_syn_nostop": { 263 | "type": "text", 264 | "analyzer": "de_synonym_nostop_analysis" 265 | }, 266 | "de_decomp": { 267 | "type": "text", 268 | "analyzer": "de_decomp_index_analysis", 269 | "search_analyzer": "de_decomp_query_analysis", 270 | "term_vector": "with_positions_offsets" 271 | }, 272 | "de_decomp_nostop": { 273 | "type": "text", 274 | "analyzer": "de_decomp_nostop_index_analysis", 275 | "search_analyzer": "de_decomp_nostop_query_analysis", 276 | "term_vector": "with_positions_offsets" 277 | }, 278 | "ngram": { 279 | "type": "text", 280 | "analyzer": "ngram_analysis" 281 | }, 282 | "engram": { 283 | "type": "text", 284 | "analyzer": "edge_ngram_analysis" 285 | } 286 | } 287 | }, 288 | "text": { 289 | "type": "text", 290 | "copy_to": "spelldata", 291 | "fields": { 292 | "raw": { 293 | "type": "text", 294 | "analyzer": "german", 295 | "term_vector": "with_positions_offsets" 296 | }, 297 | "de_base": { 298 | "type": "text", 299 | "analyzer": "de_base_analysis", 300 | "term_vector": "with_positions_offsets" 301 | }, 302 | "de_base_nostop": { 303 | "type": "text", 304 | "analyzer": "de_base_nostop_analysis", 305 | "term_vector": "with_positions_offsets" 306 | }, 307 | "de_syn_nostop": { 308 | "type": "text", 309 | "analyzer": "de_synonym_nostop_analysis", 310 | "term_vector": "with_positions_offsets" 311 | }, 312 | "de_syn": { 313 | "type": "text", 314 | "analyzer": "de_synonym_analysis", 315 | "term_vector": "with_positions_offsets" 316 | }, 317 | "de_decomp": { 318 | "type": "text", 319 | "analyzer": "de_decomp_index_analysis", 320 | "search_analyzer": "de_decomp_query_analysis", 
321 | "term_vector": "with_positions_offsets" 322 | }, 323 | "de_decomp_nostop": { 324 | "type": "text", 325 | "analyzer": "de_decomp_nostop_index_analysis", 326 | "search_analyzer": "de_decomp_nostop_query_analysis", 327 | "term_vector": "with_positions_offsets" 328 | }, 329 | "ngram": { 330 | "type": "text", 331 | "analyzer": "ngram_analysis" 332 | }, 333 | "engram": { 334 | "type": "text", 335 | "analyzer": "edge_ngram_analysis" 336 | } 337 | }, 338 | "term_vector": "with_positions_offsets" 339 | }, 340 | "title": { 341 | "type": "text", 342 | "copy_to": "spelldata", 343 | "fields": { 344 | "raw": { 345 | "type": "text", 346 | "analyzer": "german", 347 | "term_vector": "with_positions_offsets" 348 | }, 349 | "de_base": { 350 | "type": "text", 351 | "analyzer": "de_base_analysis", 352 | "term_vector": "with_positions_offsets" 353 | }, 354 | "de_base_nostop": { 355 | "type": "text", 356 | "analyzer": "de_base_nostop_analysis", 357 | "term_vector": "with_positions_offsets" 358 | }, 359 | "de_syn_nostop": { 360 | "type": "text", 361 | "analyzer": "de_synonym_nostop_analysis", 362 | "term_vector": "with_positions_offsets" 363 | }, 364 | "de_syn": { 365 | "type": "text", 366 | "analyzer": "de_synonym_analysis", 367 | "term_vector": "with_positions_offsets" 368 | }, 369 | "de_decomp": { 370 | "type": "text", 371 | "analyzer": "de_decomp_index_analysis", 372 | "search_analyzer": "de_decomp_query_analysis", 373 | "term_vector": "with_positions_offsets" 374 | }, 375 | "de_decomp_nostop": { 376 | "type": "text", 377 | "analyzer": "de_decomp_nostop_index_analysis", 378 | "search_analyzer": "de_decomp_nostop_query_analysis", 379 | "term_vector": "with_positions_offsets" 380 | }, 381 | "ngram": { 382 | "type": "text", 383 | "analyzer": "ngram_analysis" 384 | }, 385 | "engram": { 386 | "type": "text", 387 | "analyzer": "edge_ngram_analysis" 388 | } 389 | }, 390 | "term_vector": "with_positions_offsets" 391 | }, 392 | "spelldata": { 393 | "type": "text", 394 | "fields": { 395 | "raw": { 396 | "type": "text", 397 | "analyzer": "german", 398 | "term_vector": "with_positions_offsets" 399 | }, 400 | "de_base": { 401 | "type": "text", 402 | "analyzer": "de_base_analysis", 403 | "term_vector": "with_positions_offsets" 404 | }, 405 | "de_base_nostop": { 406 | "type": "text", 407 | "analyzer": "de_base_nostop_analysis", 408 | "term_vector": "with_positions_offsets" 409 | }, 410 | "de_syn_nostop": { 411 | "type": "text", 412 | "analyzer": "de_synonym_nostop_analysis", 413 | "term_vector": "with_positions_offsets" 414 | }, 415 | "de_syn": { 416 | "type": "text", 417 | "analyzer": "de_synonym_analysis", 418 | "term_vector": "with_positions_offsets" 419 | }, 420 | "de_decomp": { 421 | "type": "text", 422 | "analyzer": "de_decomp_index_analysis", 423 | "search_analyzer": "de_decomp_query_analysis", 424 | "term_vector": "with_positions_offsets" 425 | }, 426 | "de_decomp_nostop": { 427 | "type": "text", 428 | "analyzer": "de_decomp_nostop_index_analysis", 429 | "search_analyzer": "de_decomp_nostop_query_analysis", 430 | "term_vector": "with_positions_offsets" 431 | }, 432 | "ngram": { 433 | "type": "text", 434 | "analyzer": "ngram_analysis" 435 | }, 436 | "engram": { 437 | "type": "text", 438 | "analyzer": "edge_ngram_analysis" 439 | } 440 | }, 441 | "term_vector": "with_positions_offsets" 442 | } 443 | } 444 | } 445 | } 446 | } -------------------------------------------------------------------------------- /src/test/resources/test.sql: -------------------------------------------------------------------------------- 
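-- note: the {lastRun} and {thisRun} placeholders below are substituted by the import handler
-- with the previous and current run timestamps before the statement is executed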
1 | WITH orgMembers AS 2 | ( 3 | SELECT our.guid AS roleOrgGuid, our.userGuid AS roleUserGuid, our.orgUserRoleType AS roleType, our.dtUpdated AS dtRoleUpdated, 4 | oe.displayName AS orgDisplayName, oe.dtUpdated AS dtOrgUpdated 5 | FROM OrgEntity AS oe JOIN OrgUserRole AS our ON (our.orgGuid = oe.guid) 6 | ), 7 | userWithOrg AS ( 8 | SELECT ue.guid, struct(ue.*) AS user, struct(om.*) AS orgMembership 9 | FROM UserEntity AS ue LEFT OUTER JOIN orgMembers AS om ON (om.roleUserGuid = ue.guid) 10 | ), 11 | modifiedUsers AS ( 12 | SELECT guid, first(user) as user, collect_list(orgMembership) AS orgMemberships 13 | FROM userWithOrg AS ue 14 | WHERE user.dtUpdated between "{lastRun}" AND "{thisRun}" OR 15 | orgMembership.dtRoleUpdated between "{lastRun}" AND "{thisRun}" OR 16 | orgMembership.dtOrgUpdated between "{lastRun}" AND "{thisRun}" 17 | GROUP BY guid 18 | ), 19 | usersWithEmotions AS ( 20 | SELECT mu.*, em.emotion FROM modifiedUsers AS mu LEFT OUTER JOIN UserEmotions AS em ON (mu.guid = em.guid) 21 | ) 22 | SELECT 'things' as type, 23 | user.guid, user.identity, user.displayName, user.contactEmail, user.avatarUrl, user.gravatarEmail, user.blurb, 24 | user.location, user.defaultTraitPrivacyType, user.companyName, user.isActive, user.isHeadless, 25 | emotion, user.dtCreated, user.dtUpdated, orgMemberships FROM usersWithEmotions --------------------------------------------------------------------------------