├── .project ├── README.md ├── apache-real-time ├── flume.conf ├── morphline.conf ├── schema.xml └── solrconfig.xml ├── email-search ├── email-schema.xml ├── flume.config ├── indexer-config.xml ├── ks-indexer-email-morphlines.conf ├── morphlines.conf └── schema.xml └── ocr ├── IdmpExtraction.scala ├── indexer-config.xml ├── morphlines.conf └── schema.xml /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | cloudera-search 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cloudera-search 2 | =============== 3 | This repository has all you need to set up Cloudera Search in Environment 4 | 5 | The directories are broken down by Use Cases 6 | -------------------------------------------------------------------------------- /apache-real-time/flume.conf: -------------------------------------------------------------------------------- 1 | # Please paste flume.conf here. Example: 2 | 3 | # Sources, channels, and sinks are defined per 4 | # agent name, in this case 'tier1'. 5 | tier1.sources = source1 6 | tier1.channels = channel1 7 | tier1.sinks = sink1 8 | 9 | # For each source, channel, and sink, set 10 | # standard properties. 
11 | 12 | 13 | 14 | # Syslog Source Configuration 15 | tier1.sources.source1.type = syslogtcp 16 | # the hostname that Flume Syslog source will be running on 17 | tier1.sources.source1.host = localhost 18 | # the port that Flume Syslog source will listen on 19 | tier1.sources.source1.port = 5040 20 | tier1.sources.source1.channels = channel1 21 | 22 | 23 | # Solr Sink configuration 24 | tier1.sinks.sink1.type = org.apache.flume.sink.solr.morphline.MorphlineSolrSink 25 | tier1.sinks.sink1.morphlineFile = /tmp/morphline.conf 26 | tier1.sinks.sink1.channel = channel1 27 | 28 | 29 | 30 | # Kafka Channel Configuration 31 | tier1.channels.channel1.type = org.apache.flume.channel.kafka.KafkaChannel 32 | tier1.channels.channel1.capacity = 10000 33 | tier1.channels.channel1.transactionCapacity = 1000 34 | tier1.channels.channel1.brokerList = kafkaf-2:9092,kafkaf-3:9092 35 | tier1.channels.channel1.topic = channel1 36 | tier1.channels.channel1.zookeeperConnect = kafkaf-1:2181 37 | tier1.channels.channel1.groupId = flume2 38 | -------------------------------------------------------------------------------- /apache-real-time/morphline.conf: -------------------------------------------------------------------------------- 1 | # Specify server locations in a SOLR_LOCATOR variable; 2 | # used later in variable substitutions 3 | # Change the zkHost to point to your own Zookeeper quorum 4 | SOLR_LOCATOR : { 5 | # Name of solr collection 6 | collection : accessCollection 7 | # ZooKeeper ensemble 8 | zkHost : "localhost:2181/solr" 9 | } 10 | 11 | # Specify an array of one or more morphlines, each of which defines an ETL 12 | # transformation chain. A morphline consists of one or more (potentially 13 | # nested) commands. A morphline is a way to consume records (e.g. 
Flume events, 14 | # HDFS files or blocks), turn them into a stream of records, and pipe the stream 15 | # of records through a set of easily configurable transformations on it's way to 16 | # Solr (or a MapReduceIndexerTool RecordWriter that feeds via a Reducer into Solr). 17 | morphlines : [ 18 | { 19 | # Name used to identify a morphline. E.g. used if there are multiple morphlines in a 20 | # morphline config file 21 | id : morphline1 22 | # Import all morphline commands in these java packages and their subpackages. 23 | # Other commands that may be present on the classpath are not visible to this morphline. 24 | importCommands : ["org.kitesdk.**", "org.apache.solr.**"] 25 | commands : [ 26 | { 27 | ## Read the email stream and break it up into individual messages. 28 | ## The beginning of a message is marked by regex clause below 29 | ## The reason we use this command is that one event can have multiple 30 | ## messages 31 | readCSV { 32 | separator: " " 33 | columns: [client_ip,C1,C2,time,dummy1,request,code,bytes,referer,user_agent,C3] 34 | ignoreFirstLine : false 35 | quoteChar : "\"" 36 | commentPrefix : "" 37 | trim : true 38 | charset : UTF-8 39 | } 40 | } 41 | { 42 | split { 43 | inputField : request 44 | outputFields : [method, url, protocol] 45 | separator : " " 46 | isRegex : false 47 | #separator : """\s*,\s*""" 48 | # #isRegex : true 49 | addEmptyStrings : false 50 | trim : true 51 | } 52 | } 53 | { 54 | split { 55 | inputField : url 56 | outputFields : ["", app, subapp] 57 | separator : "\/" 58 | isRegex : false 59 | #separator : """\s*,\s*""" 60 | # #isRegex : true 61 | addEmptyStrings : false 62 | trim : true 63 | } 64 | } 65 | { 66 | userAgent { 67 | inputField : user_agent 68 | outputFields : { 69 | user_agent_family : "@{ua_family}" 70 | user_agent_major : "@{ua_major}" 71 | device_family : "@{device_family}" 72 | os_family : "@{os_family}" 73 | os_major : "@{os_major}" 74 | } 75 | } 76 | } 77 | { 78 | #Extract GEO information 79 | geoIP { 80 
| inputField : client_ip 81 | database : "/tmp/GeoLite2-City.mmdb" 82 | } 83 | } 84 | { 85 | 86 | # extract parts of the geolocation info from the Jackson JsonNode Java 87 | # # object contained in the _attachment_body field and store the parts in 88 | # # the given record output fields: 89 | extractJsonPaths { 90 | flatten : false 91 | paths : { 92 | country_code : /country/iso_code 93 | country_name : /country/names/en 94 | region_code : /continent/code 95 | #"/subdivisions[]/names/en" : "/subdivisions[]/names/en" 96 | #"/subdivisions[]/iso_code" : "/subdivisions[]/iso_code" 97 | city : /city/names/en 98 | #/postal/code : /postal/code 99 | latitude : /location/latitude 100 | longitude : /location/longitude 101 | #/location/latitude_longitude : /location/latitude_longitude 102 | #/location/longitude_latitude : /location/longitude_latitude 103 | } 104 | } 105 | } 106 | #{logInfo { format : "BODY : {}", args : ["@{}"] } } 107 | # add Unique ID, in case our message_id field from above is not present 108 | { 109 | generateUUID { 110 | field:id 111 | } 112 | } 113 | 114 | # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format 115 | { 116 | # 21/Nov/2014:22:08:27 117 | convertTimestamp { 118 | field : time 119 | inputFormats : ["[dd/MMM/yyyy:HH:mm:ss", "EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] 120 | inputTimezone : America/Los_Angeles 121 | outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'" 122 | outputTimezone : UTC 123 | } 124 | } 125 | 126 | # Consume the output record of the previous command and pipe another 127 | # record downstream. 128 | # 129 | # This command sanitizes record fields that are unknown to Solr schema.xml 130 | # by deleting them. 
Recall that Solr throws an exception on any attempt to 131 | # load a document that contains a field that isn't specified in schema.xml 132 | { 133 | sanitizeUnknownSolrFields { 134 | # Location from which to fetch Solr schema 135 | solrLocator : ${SOLR_LOCATOR} 136 | } 137 | } 138 | 139 | # load the record into a SolrServer or MapReduce SolrOutputFormat. 140 | { 141 | loadSolr { 142 | solrLocator : ${SOLR_LOCATOR} 143 | } 144 | } 145 | ] 146 | } 147 | ] 148 | 149 | 150 | -------------------------------------------------------------------------------- /apache-real-time/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 47 | 48 | 49 | 65 | 66 | 67 | 101 | 102 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 149 | id 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 184 | 185 | 191 | 192 | 195 | 196 | 197 | 198 | 205 | 206 | 210 | 211 | 212 | 213 | 214 | 215 | 230 | 231 | 237 | 238 | 239 | 240 | 241 | 242 | 252 | 253 | 254 | 255 | 256 | 257 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 314 | 315 | 316 | 327 | 328 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 348 | 349 | 350 | 351 | 352 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 370 | 371 | 372 | 373 | 376 | 380 | 385 | 386 | 387 | 388 | 391 | 392 | 393 | 394 | 395 | 396 | 401 | 402 | 403 | 404 | 407 | 408 | 409 | 410 | 411 | 420 | 421 | 422 | 423 | 426 | 430 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 468 | 469 | 470 | 471 | 472 | 474 | 475 | 476 | 
477 | 478 | 479 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 499 | 500 | 504 | 505 | 506 | 509 | 510 | 513 | 514 | 515 | 516 | 527 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 593 | 594 | 595 | 606 | 607 | 608 | 609 | 610 | 611 | 615 | 617 | 618 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 
878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 974 | 979 | 980 | 981 | -------------------------------------------------------------------------------- /email-search/email-schema.xml: -------------------------------------------------------------------------------- 1 | 29 | 30 | 31 | 47 | 48 | 49 | 83 | 84 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 117 | id 118 | 119 | 120 | 126 | 127 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 158 | 159 | 163 | 164 | 165 | 166 | 167 | 168 | 183 | 184 | 190 | 191 | 192 | 193 | 194 | 195 | 205 | 206 | 207 | 208 | 209 | 210 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 267 | 268 | 269 | 280 | 281 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 301 | 302 | 303 | 304 | 305 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 347 | 348 | 353 | 354 | 355 | 356 | 359 | 363 | 368 | 369 | 370 | 371 | 374 | 375 | 376 | 377 | 378 | 379 | 384 | 385 | 386 | 387 | 390 | 391 | 392 | 393 | 394 | 403 | 404 | 405 | 406 | 409 | 413 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 440 | 441 | 442 
| 443 | 444 | 445 | 446 | 447 | 448 | 449 | 451 | 452 | 453 | 454 | 455 | 457 | 458 | 459 | 460 | 461 | 462 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 482 | 483 | 487 | 488 | 489 | 492 | 493 | 496 | 497 | 498 | 499 | 510 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 576 | 577 | 578 | 589 | 590 | 591 | 592 | 593 | 594 | 598 | 600 | 601 | 612 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 
847 | 848 | 849 | 850 | 851 | 852 | 853 | 859 | 860 | 861 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1011 | 1016 | 1017 | 1018 | -------------------------------------------------------------------------------- /email-search/flume.config: -------------------------------------------------------------------------------- 1 | # Please paste flume.conf here. Example: 2 | 3 | # Sources, channels, and sinks are defined per 4 | # agent name, in this case 'tier1'. 5 | tier1.sources=emailsSrc 6 | tier1.channels=emailsChannel solrChannel 7 | tier1.sinks=emailsSink solrSink 8 | 9 | 10 | # For each source, channel, and sink, set 11 | # standard properties. 
12 | #tier1.sources.emailsSrc.command=tail -F /tmp/emails.txt 13 | #tier1.sources.emailsSrc.type=exec 14 | #tier1.sources.emailsSrc.command=cat /tmp/emails/emails_sample.txt 15 | tier1.sources.emailsSrc.type=spooldir 16 | tier1.sources.emailsSrc.spoolDir =/tmp/emails/spool 17 | tier1.sources.emailsSrc.deserializer = org.apache.flume.sink.solr.morphline.BlobDeserializer$Builder 18 | tier1.sources.emailsSrc.deletePolicy=immediate 19 | tier1.sources.emailsSrc.channels=emailsChannel solrChannel 20 | tier1.sources.emailsSrc.selector.type=replicating 21 | 22 | 23 | tier1.channels.emailsChannel.type=memory 24 | tier1.channels.solrChannel.type=memory 25 | 26 | 27 | # HDFS Sink 28 | tier1.sinks.emailsSink.type=hdfs 29 | tier1.sinks.emailsSink.hdfs.path=/user/flume 30 | tier1.sinks.emailsSink.hdfs.filePrefix=email_log 31 | tier1.sinks.emailsSink.hdfs.rollSize=102400000 32 | tier1.sinks.emailsSink.hdfs.batchSize=10000 33 | tier1.sinks.emailsSink.hdfs.rollCount=0 34 | tier1.sinks.emailsSink.hdfs.minBlockReplicas=1 35 | tier1.sinks.emailsSink.hdfs.txnEventMax=10000 36 | tier1.sinks.emailsSink.hdfs.callTimeout=1000000 37 | tier1.sinks.emailsSink.channel=emailsChannel 38 | 39 | # SOLR Sink 40 | tier1.sinks.solrSink.type=org.apache.flume.sink.solr.morphline.MorphlineSolrSink 41 | tier1.sinks.solrSink.channel=solrChannel 42 | tier1.sinks.solrSink.morphlineFile=/tmp/morphline.conf 43 | 44 | 45 | # Other properties are specific to each type of 46 | # source, channel, or sink. In this case, we 47 | # specify the capacity of the memory channel. 
48 | tier1.channels.emailsChannel.capacity=100000000 49 | tier1.channels.emailsChannel.transactionCapacity=10000000 50 | 51 | tier1.channels.solrChannel.capacity=100000000 52 | tier1.channels.solrChannel.transactionCapacity=10000000 53 | 54 | -------------------------------------------------------------------------------- /email-search/indexer-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /email-search/ks-indexer-email-morphlines.conf: -------------------------------------------------------------------------------- 1 | SOLR_LOCATOR : { 2 | # Name of solr collection 3 | collection : collection1 4 | 5 | # ZooKeeper ensemble 6 | zkHost : "$ZK_HOST" 7 | } 8 | 9 | 10 | morphlines : [ 11 | { 12 | id : morphline1 13 | importCommands : ["com.cloudera.**", "com.ngdata.**"] 14 | 15 | commands : [ 16 | { 17 | extractHBaseCells { 18 | mappings : [ 19 | { 20 | inputColumn : "messages:*" 21 | outputField : "message" 22 | type : string 23 | source : value 24 | } 25 | ] 26 | } 27 | } 28 | ## Break up the email text into SOLR fields 29 | { 30 | if { 31 | conditions: [ 32 | { 33 | not{ 34 | grok { 35 | expressions : { 36 | message: """(?s)(.*?)(Message-ID: <)(?(.+?))(>.*?)(Date: )(?(.+?))( \(.+?)(From: )(?(.*?))((To: )(?(.+?)))?(Subject: )(?(.*?))((Cc: )(?(.*)))?(Mime.+?)((Bcc: )(?(.*)))?(X-From: )(?(.*?))(X-To: )(?(.*?))(X-cc: )(?(.*?))(X-bcc: )(?(.*?))(X-Folder: )(?(.*?))(X-Origin: )(?(.*?))(X-FileName: )(?(.*?))(\n)(?(.*))""" 37 | } 38 | extract: inplace 39 | findSubstrings: false 40 | addEmptyStrings: false 41 | numRequiredMatches: all 42 | } 43 | } 44 | } 45 | ] 46 | then:[ 47 | { logInfo { format : "found no grok match: {}", args : ["@{}"] } } 48 | { dropRecord {} } 49 | ] 50 | } 51 | } 52 | 53 | # add Unique ID, in case our message_id field from above is not present 54 | { 55 | generateUUID { 56 | 
field:message_id 57 | } 58 | } 59 | 60 | # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format 61 | { 62 | convertTimestamp { 63 | field : date 64 | inputFormats : ["EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] 65 | inputTimezone : America/Los_Angeles 66 | outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'" 67 | outputTimezone : UTC 68 | } 69 | } 70 | 71 | # Consume the output record of the previous command and pipe another 72 | # record downstream. 73 | # 74 | # This command sanitizes record fields that are unknown to Solr schema.xml 75 | # by deleting them. Recall that Solr throws an exception on any attempt to 76 | # load a document that contains a field that isn't specified in schema.xml 77 | { 78 | sanitizeUnknownSolrFields { 79 | # Location from which to fetch Solr schema 80 | solrLocator : ${SOLR_LOCATOR} 81 | } 82 | } 83 | ] 84 | } 85 | ] 86 | -------------------------------------------------------------------------------- /email-search/morphlines.conf: -------------------------------------------------------------------------------- 1 | # Specify server locations in a SOLR_LOCATOR variable; 2 | # used later in variable substitutions 3 | # Change the zkHost to point to your own Zookeeper quorum 4 | SOLR_LOCATOR : { 5 | # Name of solr collection 6 | collection : email_collection 7 | # ZooKeeper ensemble 8 | zkHost : "clust2:2181/solr" 9 | } 10 | 11 | # Specify an array of one or more morphlines, each of which defines an ETL 12 | # transformation chain. A morphline consists of one or more (potentially 13 | # nested) commands. A morphline is a way to consume records (e.g. Flume events, 14 | # HDFS files or blocks), turn them into a stream of records, and pipe the stream 15 | # of records through a set of easily configurable transformations on it's way to 16 | # Solr (or a MapReduceIndexerTool RecordWriter that feeds via a Reducer into Solr). 
17 | morphlines : [ 18 | { 19 | # Name used to identify a morphline. E.g. used if there are multiple morphlines in a 20 | # morphline config file 21 | id : morphline1 22 | # Import all morphline commands in these java packages and their subpackages. 23 | # Other commands that may be present on the classpath are not visible to this morphline. 24 | importCommands : ["com.cloudera.**", "org.apache.solr.**"] 25 | commands : [ 26 | { 27 | ## Read the email stream and break it up into individual messages. 28 | ## The beginning of a message is marked by regex clause below 29 | ## The reason we use this command is that one event can have multiple 30 | ## messages 31 | readMultiLine { 32 | regex : "Message-ID:.*" 33 | what : next 34 | charset : UTF-8 35 | } 36 | } 37 | ## Break up the email text into SOLR fields 38 | { 39 | if { 40 | conditions: [ 41 | { 42 | not{ 43 | grok { 44 | expressions : { 45 | message: """(?s)(.*?)(Message-ID: <)(?(.+?))(>.*?)(Date: )(?(.+?))( \(.+?)(From: )(?(.*?))((To: )(?(.+?)))?(Subject: )(?(.*?))((Cc: )(?(.*)))?(Mime.+?)((Bcc: )(?(.*)))?(X-From: )(?(.*?))(X-To: )(?(.*?))(X-cc: )(?(.*?))(X-bcc: )(?(.*?))(X-Folder: )(?(.*?))(X-Origin: )(?(.*?))(X-FileName: )(?(.*?))(\n)(?(.*))""" 46 | } 47 | extract: inplace 48 | findSubstrings: false 49 | addEmptyStrings: false 50 | numRequiredMatches: all 51 | } 52 | } 53 | } 54 | ] 55 | then:[ 56 | { logInfo { format : "found no grok match: {}", args : ["@{}"] } } 57 | { dropRecord {} } 58 | ] 59 | } 60 | } 61 | 62 | # add Unique ID, in case our message_id field from above is not present 63 | { 64 | generateUUID { 65 | field:message_id 66 | } 67 | } 68 | 69 | # convert the timestamp field to "yyyy-MM-dd'T'HH:mm:ss.SSSZ" format 70 | { 71 | convertTimestamp { 72 | field : date 73 | inputFormats : ["EEE, d MMM yyyy HH:mm:ss Z", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd"] 74 | inputTimezone : America/Los_Angeles 75 | outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'" 76 | outputTimezone : UTC 
77 | } 78 | } 79 | 80 | # Consume the output record of the previous command and pipe another 81 | # record downstream. 82 | # 83 | # This command sanitizes record fields that are unknown to Solr schema.xml 84 | # by deleting them. Recall that Solr throws an exception on any attempt to 85 | # load a document that contains a field that isn't specified in schema.xml 86 | { 87 | sanitizeUnknownSolrFields { 88 | # Location from which to fetch Solr schema 89 | solrLocator : ${SOLR_LOCATOR} 90 | } 91 | } 92 | 93 | # load the record into a SolrServer or MapReduce SolrOutputFormat. 94 | { 95 | loadSolr { 96 | solrLocator : ${SOLR_LOCATOR} 97 | } 98 | } 99 | ] 100 | } 101 | ] 102 | -------------------------------------------------------------------------------- /email-search/schema.xml: -------------------------------------------------------------------------------- 1 | 2 | 30 | 31 | 32 | 48 | 49 | 50 | 84 | 85 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 117 | message_id 118 | 119 | 120 | 126 | 127 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 157 | 158 | 162 | 163 | 164 | 165 | 166 | 167 | 182 | 183 | 189 | 190 | 191 | 192 | 193 | 194 | 204 | 205 | 206 | 207 | 208 | 209 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 266 | 267 | 268 | 279 | 280 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 300 | 301 | 302 | 303 | 304 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 346 | 347 | 352 | 353 | 354 | 355 | 358 | 362 | 367 | 368 | 369 | 370 | 373 | 374 | 375 | 376 | 377 | 378 | 383 | 384 | 385 | 386 | 389 | 390 | 391 | 392 | 393 | 402 | 403 | 404 | 405 | 408 | 412 | 417 | 418 | 419 | 
420 | 421 | 422 | 423 | 424 | 425 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 450 | 451 | 452 | 453 | 454 | 456 | 457 | 458 | 459 | 460 | 461 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 481 | 482 | 486 | 487 | 488 | 491 | 492 | 495 | 496 | 497 | 498 | 509 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 575 | 576 | 577 | 588 | 589 | 590 | 591 | 592 | 593 | 597 | 599 | 600 | 611 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 
829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 858 | 859 | 860 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1010 | 1015 | 1016 | 1017 | -------------------------------------------------------------------------------- /ocr/IdmpExtraction.scala: -------------------------------------------------------------------------------- 1 | / ** 2 | * @author Jeff Shmain 3 | * @author Vartika Singh 4 | * / 5 | 6 | package com.cloudera.sa.OCR 7 | 8 | import org.bytedeco.javacpp.lept._; 9 | import org.bytedeco.javacpp.tesseract._; 10 | import java.io.ByteArrayOutputStream; 11 | import java.nio.ByteBuffer; 12 | import java.nio.ByteBuffer 13 | import java.awt.Image 14 | import java.awt.image.RenderedImage; 15 | import java.io.File; 16 | import scala.collection.mutable.StringBuilder 17 | import collection.JavaConversions._ 18 | import java.io.IOException; 19 | import java.util.List; 20 | import javax.imageio.ImageIO; 21 | import org.ghost4j.analyzer.AnalysisItem; 22 | import org.ghost4j.analyzer.FontAnalyzer; 23 | import org.ghost4j.document.PDFDocument; 24 | import org.ghost4j.renderer.SimpleRenderer; 25 | import org.bytedeco.javacpp._; 26 | import java.io._ 27 | import org.apache.spark.{Logging, SerializableWritable, SparkConf, 
SparkContext} 28 | import org.apache.hadoop.hbase.HBaseConfiguration 29 | import org.apache.hadoop.hbase.client.Connection 30 | import org.apache.hadoop.hbase.client.ConnectionFactory 31 | import org.apache.hadoop.hbase.TableName 32 | import org.apache.hadoop.hbase.client.Put 33 | import org.apache.hadoop.hbase.client.Get 34 | import org.apache.hadoop.hbase.util.Bytes 35 | 36 | object IdmpExtraction { 37 | def main(args: Array[String]) { 38 | 39 | 40 | val conf = new SparkConf().setAppName("IDMP Processor") 41 | val sc = new SparkContext(conf) 42 | 43 | /** Read in PDFs into the RDD */ 44 | val files = sc.binaryFiles ("hdfs://nameservice1/data/raw") 45 | files.map(convertFunc(_)).count 46 | } 47 | 48 | 49 | /** Populate the HBase table 50 | * @param fileName This corresponds to the rowID in HBase 51 | * @param lines The parsed output. 52 | * dataformat: binaryPDF:PortableDataStream 53 | */ 54 | def populateHbase ( 55 | fileName:String, 56 | lines: String, 57 | pdf:org.apache.spark.input.PortableDataStream) : Unit = 58 | { 59 | /** Configure and open a HBase connection */ 60 | val conf = HBaseConfiguration.create() 61 | val conn= ConnectionFactory.createConnection( conf ); 62 | val mddsTbl = conn.getTable( TableName.valueOf( "mdds" )); 63 | val cf = "info" 64 | val put = new Put( Bytes.toBytes( fileName )) 65 | 66 | /** 67 | * Extract Fields here using Regexes 68 | * Create Put objects and send to hbase 69 | */ 70 | val aAndCP = """(?s)(?m).*\d\d\d\d\d-\d\d\d\d(.*)\nRe: (\w\d\d\d\d\d\d).*""".r 71 | val approvedP = """(?s)(?m).*(You may, therefore, market the device, subject to the general controls provisions of the Act).*""".r 72 | 73 | lines match { 74 | case 75 | aAndCP( addr, casenum ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "submitter_info" ), Bytes.toBytes( addr ) ).add( Bytes.toBytes( cf ), Bytes.toBytes( "case_num" ), Bytes.toBytes( casenum )) 76 | case _ => println( "did not match a regex" ) 77 | } 78 | 79 | lines match { 80 | case 81 | approvedP( 
approved ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "approved" ), Bytes.toBytes( "yes" )) 82 | case _ => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "approved" ), Bytes.toBytes( "no" )) 83 | } 84 | 85 | lines.split("\n").foreach { 86 | 87 | val regNumRegex = """Regulation Number:\s+(.+)""".r 88 | val regNameRegex = """Regulation Name:\s+(.+)""".r 89 | val regClassRegex = """Regulatory Class:\s+(.+)""".r 90 | val productCodeRegex = """Product Code:\s+(.+)""".r 91 | val datedRegex = """Dated:\s+(\w{3,10}\s+\d{1,2},\s+\d{4}).*""".r 92 | val receivedRegex = """Received:\s+(\w{3,10}\s+\d{1,2},\s+\d{4}).*""".r 93 | val deviceNameRegex = """Trade/Device Name:\s+(.+)""".r 94 | 95 | _ match { 96 | case regNumRegex( regNum ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "reg_num" ), Bytes.toBytes( regNum )) 97 | case regNameRegex(regName) => put.add(Bytes.toBytes( cf ), Bytes.toBytes( "reg_name" ), Bytes.toBytes( regName )) 98 | case regClassRegex( regClass ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "reg_class" ), Bytes.toBytes( regClass )) 99 | case productCodeRegex( productCode ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "product_code" ), Bytes.toBytes( productCode )) 100 | case datedRegex( dated ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "dated" ), Bytes.toBytes( dated )) 101 | case receivedRegex( received ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "received" ), Bytes.toBytes( received )) 102 | case deviceNameRegex( deviceName ) => put.add( Bytes.toBytes( cf ), Bytes.toBytes( "device_name" ), Bytes.toBytes( deviceName )) 103 | 104 | case _ => print( "" ) 105 | } 106 | } 107 | put.add( Bytes.toBytes( cf ), Bytes.toBytes( "text" ), Bytes.toBytes( lines )) 108 | val pdfBytes = pdf.toArray.clone 109 | put.add(Bytes.toBytes( "obj" ), Bytes.toBytes( "pdf" ), pdfBytes ) 110 | 111 | mddsTbl.put( put ) 112 | mddsTbl.close 113 | conn.close 114 | } 115 | 116 | /** Method to convert a PDF document to images and hence OCR 117 | * @param PDF File(s) 
/** Renders a PDF document to one 300-dpi image per page, OCRs each page
  * with Tesseract, and hands the extracted text plus the original PDF
  * bytes to populateHbase for storage.
  *
  * @param file pair of (file path, PortableDataStream holding the PDF bytes),
  *             as produced by SparkContext.binaryFiles
  */
def convertFunc (
    file: (String, org.apache.spark.input.PortableDataStream)
  ) : Unit =
{
  /** Render the PDF into a list of images with 300 dpi resolution.
    * One image per PDF page; a PDF document may have multiple pages.
    */
  val document: PDFDocument = new PDFDocument( )
  document.load( file._2.open )
  file._2.close
  val renderer: SimpleRenderer = new SimpleRenderer( )
  renderer.setResolution( 300 )
  val images: List[Image] = renderer.render( document )

  /** Iterate through the image list and extract OCR using the Tesseract API.
    * The API handle is created and Init-ed once and reused for every page,
    * instead of re-loading the language data per page as before; try/finally
    * guarantees the native PIX and API resources are released even when a
    * page fails to OCR.
    */
  val text = new StringBuilder
  val api: TessBaseAPI = new TessBaseAPI( )
  // We assume the documents are in English here, hence "eng"
  api.Init( null, "eng" )
  try {
    images.foreach { image =>
      val imageByteStream = new ByteArrayOutputStream( )
      try {
        ImageIO.write( image.asInstanceOf[RenderedImage], "png", imageByteStream )
        // Materialize the PNG bytes exactly once; the original called
        // toByteArray() twice and wrapped both copies in ByteBuffers just
        // to recover the same array and its length.
        val png = imageByteStream.toByteArray( )
        val pix: PIX = pixReadMem( png, png.length )
        try {
          api.SetImage( pix )
          text.append( api.GetUTF8Text( ).getString( ) )
        } finally {
          pixDestroy( pix )
        }
      } finally {
        imageByteStream.close
      }
    }
  } finally {
    api.End
  }

  /** Write the generated data into HBase */
  populateHbase( file._1, text.toString( ), file._2 )
}
ensemble 6 | zkHost : "$ZK_HOST" 7 | } 8 | 9 | morphlines : [ 10 | { 11 | id : morphline 12 | importCommands : ["org.kitesdk.**", "com.ngdata.**"] 13 | 14 | commands : [ 15 | { 16 | extractHBaseCells { 17 | mappings : [ 18 | { 19 | inputColumn : "info:submitter_info" 20 | outputField : "submitter_info" 21 | type : string 22 | source : value 23 | } 24 | { 25 | inputColumn : "info:case_num" 26 | outputField : "case_num" 27 | type : string 28 | source : value 29 | } 30 | { 31 | inputColumn : "info:device_name" 32 | outputField : "device_name" 33 | type : string 34 | source : value 35 | } 36 | { 37 | inputColumn : "info:reg_num" 38 | outputField : "reg_num" 39 | type : string 40 | source : value 41 | } 42 | { 43 | inputColumn : "info:reg_name" 44 | outputField : "reg_name" 45 | type : string 46 | source : value 47 | } 48 | { 49 | inputColumn : "info:reg_class" 50 | outputField : "reg_class" 51 | type : string 52 | source : value 53 | } 54 | { 55 | inputColumn : "info:product_code" 56 | outputField : "product_code" 57 | type : string 58 | source : value 59 | } 60 | { 61 | inputColumn : "info:dated" 62 | outputField : "dated" 63 | type : string 64 | source : value 65 | } 66 | { 67 | inputColumn : "info:received" 68 | outputField : "received" 69 | type : string 70 | source : value 71 | } 72 | { 73 | inputColumn : "info:approved" 74 | outputField : "approved" 75 | type : string 76 | source : value 77 | } 78 | { 79 | inputColumn : "info:text" 80 | outputField : "text" 81 | type : string 82 | source : value 83 | } 84 | ] 85 | } 86 | } 87 | 88 | { 89 | convertTimestamp { 90 | field : dated 91 | inputFormats : ["MMMM d, yyyy", "yyyy-MM-dd","MMMM d,yyyy"] 92 | inputTimezone : America/Los_Angeles 93 | outputFormat : "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'" 94 | outputTimezone : UTC 95 | } 96 | } 97 | { 98 | convertTimestamp { 99 | field : received 100 | inputFormats : ["MMMM d, yyyy", "yyyy-MM-dd","MMMM d,yyyy"] 101 | inputTimezone : America/Los_Angeles 102 | outputFormat : 
"yyyy-MM-dd'T'HH:mm:ss.SSS'Z'" 103 | outputTimezone : UTC 104 | } 105 | } 106 | # Consume the output record of the previous command and pipe another 107 | # record downstream. 108 | # 109 | # This command sanitizes record fields that are unknown to Solr schema.xml 110 | # by deleting them. Recall that Solr throws an exception on any attempt to 111 | # load a document that contains a field that isn't specified in schema.xml 112 | { 113 | sanitizeUnknownSolrFields { 114 | # Location from which to fetch Solr schema 115 | solrLocator : ${SOLR_LOCATOR} 116 | } 117 | } 118 | # { logDebug { format : "output record: {}", args : ["@{}"] } } 119 | ] 120 | } 121 | ] 122 | --------------------------------------------------------------------------------