├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── data ├── 20newsgroups.json ├── Twitter140sample.txt ├── emails.json ├── import_eventserver.py ├── sentimentanalysis.json └── stopwords.json ├── engine.json ├── project ├── assembly.sbt └── build.properties ├── src └── main │ └── scala │ ├── DataSource.scala │ ├── Engine.scala │ ├── Evaluation.scala │ ├── LRAlgorithm.scala │ ├── NBAlgorithm.scala │ ├── Preparator.scala │ └── Serving.scala └── template.json /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | target/ 4 | project/project/ 5 | project/target/ 6 | manifest.json 7 | best.json 8 | pio.log 9 | pio.sbt -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Classification Engine 2 | 3 | Look at the following 4 | [tutorial](https://predictionio.apache.org/demo/textclassification/) 5 | for a Quick Start guide and implementation details. 6 | 7 | # Release Information 8 | 9 | ## Version 6.0 10 | 11 | - Use Apache Lucene as tokenizer 12 | - Add stopwords filter 13 | - Rename Scala package name 14 | - Update SBT version 15 | 16 | ## Version 5.0 **First Apache Version** 17 | 18 | - Major changes to namespace to reflect donation to the Apache Software Foundation. 19 | - Build changes to support modified Apache build mechanism 20 | 21 | ## Version 4.0 22 | 23 | Re-structure and design preparator and algo. less memory usage and run time is faster. 24 | Move BIDMach, VW & SPPMI algo changes to `bidmach` branch temporarily. 
25 | 26 | ## Version 3.1 27 | 28 | Fix DataSource to read "content", "e-mail", and use label "spam" for tutorial data. 29 | Fix engine.json for default algorithm setting. 30 | 31 | 32 | ## Version 2.2 33 | 34 | Modified PreparedData to use MLLib hashing and tf-idf implementations. 35 | 36 | ## Version 2.1 37 | 38 | Fixed dot product implementation in the predict methods to work with batch predict method for evaluation. 39 | 40 | ## Version 2.0 41 | 42 | Included three different data sets: e-mail spam, 20 newsgroups, and the rotten tomatoes sentiment analysis set. Includes Multinomial Logistic Regression algorithm for text classification. 43 | 44 | ## Version 1.2 45 | 46 | Fixed import script bug occurring with Python 2. 47 | 48 | ## Version 1.1 Changes 49 | 50 | Changed data import Python script to pull straight from the [20 newsgroups](http://qwone.com/~jason/20Newsgroups/) page. 51 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "org.example.textclassification" 2 | 3 | scalaVersion := "2.11.8" 4 | libraryDependencies ++= Seq( 5 | "org.apache.predictionio" %% "apache-predictionio-core" % "0.12.0-incubating" % "provided", 6 | "org.apache.spark" %% "spark-core" % "2.1.1" % "provided", 7 | "org.apache.spark" %% "spark-mllib" % "2.1.1" % "provided", 8 | "org.apache.lucene" % "lucene-core" % "6.5.1") 9 | -------------------------------------------------------------------------------- /data/import_eventserver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Import sample data for classification engine 3 | """ 4 | 5 | import predictionio 6 | import argparse 7 | 8 | def import_events(client, file): 9 | f = open(file, 'r') 10 | count = 0 11 | print("Importing data...") 12 | for line in f: 13 | data = line.rstrip('\r\n').split(",") 14 | plan = data[0] 15 | #Not strictly CSV, after the first
comma, no longer delimiting 16 | text = ",".join(data[1:]) 17 | client.create_event( 18 | event="$set", 19 | entity_type="user", 20 | entity_id=str(count), # use the count num as user ID 21 | properties= { 22 | "text" : text, 23 | "category" : plan, 24 | "label" : int(plan) 25 | } 26 | ) 27 | count += 1 28 | f.close() 29 | print("%s events are imported." % count) 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser( 33 | description="Import sample data for classification engine") 34 | parser.add_argument('--access_key', default='invald_access_key') 35 | parser.add_argument('--url', default="http://localhost:7070") 36 | parser.add_argument('--file', default="./data/Twitter140sample.txt") 37 | 38 | args = parser.parse_args() 39 | print(args) 40 | 41 | client = predictionio.EventClient( 42 | access_key=args.access_key, 43 | url=args.url, 44 | threads=5, 45 | qsize=500) 46 | import_events(client, args.file) 47 | -------------------------------------------------------------------------------- /data/stopwords.json: -------------------------------------------------------------------------------- 1 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 1, "entityType": "resource", "properties": {"word": "co"}} 2 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 2, "entityType": "resource", "properties": {"word": "become"}} 3 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 3, "entityType": "resource", "properties": {"word": "him"}} 4 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 4, "entityType": "resource", "properties": {"word": "part"}} 5 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 5, "entityType": "resource", "properties": {"word": "anything"}} 6 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 6, "entityType": "resource", "properties": {"word": 
"somehow"}} 7 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 7, "entityType": "resource", "properties": {"word": "therefore"}} 8 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 8, "entityType": "resource", "properties": {"word": "himself"}} 9 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 9, "entityType": "resource", "properties": {"word": "which"}} 10 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 10, "entityType": "resource", "properties": {"word": "un"}} 11 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 11, "entityType": "resource", "properties": {"word": "without"}} 12 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 12, "entityType": "resource", "properties": {"word": "its"}} 13 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 13, "entityType": "resource", "properties": {"word": "see"}} 14 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 14, "entityType": "resource", "properties": {"word": "three"}} 15 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 15, "entityType": "resource", "properties": {"word": "thick"}} 16 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 16, "entityType": "resource", "properties": {"word": "hereupon"}} 17 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.452+0000", "entityId": 17, "entityType": "resource", "properties": {"word": "anyone"}} 18 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 18, "entityType": "resource", "properties": {"word": "moreover"}} 19 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 19, "entityType": "resource", "properties": {"word": "whereafter"}} 20 | {"event": "stopwords", "eventTime": 
"2015-06-08T17:01:37.453+0000", "entityId": 20, "entityType": "resource", "properties": {"word": "yourselves"}} 21 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 21, "entityType": "resource", "properties": {"word": "thereby"}} 22 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 22, "entityType": "resource", "properties": {"word": "anywhere"}} 23 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 23, "entityType": "resource", "properties": {"word": "too"}} 24 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 24, "entityType": "resource", "properties": {"word": "then"}} 25 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 25, "entityType": "resource", "properties": {"word": "whoever"}} 26 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 26, "entityType": "resource", "properties": {"word": "throughout"}} 27 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 27, "entityType": "resource", "properties": {"word": "under"}} 28 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 28, "entityType": "resource", "properties": {"word": "nevertheless"}} 29 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 29, "entityType": "resource", "properties": {"word": "sometime"}} 30 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 30, "entityType": "resource", "properties": {"word": "some"}} 31 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 31, "entityType": "resource", "properties": {"word": "hereafter"}} 32 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 32, "entityType": "resource", "properties": {"word": "nothing"}} 33 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 
33, "entityType": "resource", "properties": {"word": "yours"}} 34 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 34, "entityType": "resource", "properties": {"word": "own"}} 35 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 35, "entityType": "resource", "properties": {"word": "empty"}} 36 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 36, "entityType": "resource", "properties": {"word": "from"}} 37 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 37, "entityType": "resource", "properties": {"word": "eight"}} 38 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 38, "entityType": "resource", "properties": {"word": "per"}} 39 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 39, "entityType": "resource", "properties": {"word": "found"}} 40 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 40, "entityType": "resource", "properties": {"word": "it"}} 41 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 41, "entityType": "resource", "properties": {"word": "hereby"}} 42 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 42, "entityType": "resource", "properties": {"word": "con"}} 43 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 43, "entityType": "resource", "properties": {"word": "itself"}} 44 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 44, "entityType": "resource", "properties": {"word": "rather"}} 45 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 45, "entityType": "resource", "properties": {"word": "six"}} 46 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 46, "entityType": "resource", "properties": {"word": "eg"}} 47 | {"event": 
"stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 47, "entityType": "resource", "properties": {"word": "together"}} 48 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 48, "entityType": "resource", "properties": {"word": "may"}} 49 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 49, "entityType": "resource", "properties": {"word": "four"}} 50 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 50, "entityType": "resource", "properties": {"word": "move"}} 51 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.453+0000", "entityId": 51, "entityType": "resource", "properties": {"word": "seeming"}} 52 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 52, "entityType": "resource", "properties": {"word": "couldnt"}} 53 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 53, "entityType": "resource", "properties": {"word": "or"}} 54 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 54, "entityType": "resource", "properties": {"word": "twenty"}} 55 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 55, "entityType": "resource", "properties": {"word": "much"}} 56 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 56, "entityType": "resource", "properties": {"word": "onto"}} 57 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 57, "entityType": "resource", "properties": {"word": "with"}} 58 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 58, "entityType": "resource", "properties": {"word": "as"}} 59 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 59, "entityType": "resource", "properties": {"word": "along"}} 60 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 60, 
"entityType": "resource", "properties": {"word": "my"}} 61 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 61, "entityType": "resource", "properties": {"word": "due"}} 62 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 62, "entityType": "resource", "properties": {"word": "fifteen"}} 63 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 63, "entityType": "resource", "properties": {"word": "ourselves"}} 64 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 64, "entityType": "resource", "properties": {"word": "through"}} 65 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 65, "entityType": "resource", "properties": {"word": "meanwhile"}} 66 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 66, "entityType": "resource", "properties": {"word": "inc"}} 67 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 67, "entityType": "resource", "properties": {"word": "though"}} 68 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 68, "entityType": "resource", "properties": {"word": "another"}} 69 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 69, "entityType": "resource", "properties": {"word": "your"}} 70 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 70, "entityType": "resource", "properties": {"word": "sincere"}} 71 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 71, "entityType": "resource", "properties": {"word": "whom"}} 72 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 72, "entityType": "resource", "properties": {"word": "beforehand"}} 73 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 73, "entityType": "resource", "properties": {"word": "seemed"}} 74 
| {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 74, "entityType": "resource", "properties": {"word": "first"}} 75 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 75, "entityType": "resource", "properties": {"word": "a"}} 76 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 76, "entityType": "resource", "properties": {"word": "elsewhere"}} 77 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 77, "entityType": "resource", "properties": {"word": "i"}} 78 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 78, "entityType": "resource", "properties": {"word": "nobody"}} 79 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 79, "entityType": "resource", "properties": {"word": "almost"}} 80 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 80, "entityType": "resource", "properties": {"word": "go"}} 81 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 81, "entityType": "resource", "properties": {"word": "out"}} 82 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 82, "entityType": "resource", "properties": {"word": "next"}} 83 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 83, "entityType": "resource", "properties": {"word": "up"}} 84 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 84, "entityType": "resource", "properties": {"word": "me"}} 85 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.454+0000", "entityId": 85, "entityType": "resource", "properties": {"word": "however"}} 86 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 86, "entityType": "resource", "properties": {"word": "made"}} 87 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 87, 
"entityType": "resource", "properties": {"word": "wherever"}} 88 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 88, "entityType": "resource", "properties": {"word": "done"}} 89 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 89, "entityType": "resource", "properties": {"word": "besides"}} 90 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 90, "entityType": "resource", "properties": {"word": "hence"}} 91 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 91, "entityType": "resource", "properties": {"word": "well"}} 92 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 92, "entityType": "resource", "properties": {"word": "anyhow"}} 93 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 93, "entityType": "resource", "properties": {"word": "whose"}} 94 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 94, "entityType": "resource", "properties": {"word": "became"}} 95 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 95, "entityType": "resource", "properties": {"word": "being"}} 96 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 96, "entityType": "resource", "properties": {"word": "why"}} 97 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 97, "entityType": "resource", "properties": {"word": "every"}} 98 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 98, "entityType": "resource", "properties": {"word": "this"}} 99 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 99, "entityType": "resource", "properties": {"word": "last"}} 100 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 100, "entityType": "resource", "properties": {"word": "off"}} 101 | {"event": 
"stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 101, "entityType": "resource", "properties": {"word": "herself"}} 102 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 102, "entityType": "resource", "properties": {"word": "you"}} 103 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 103, "entityType": "resource", "properties": {"word": "between"}} 104 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 104, "entityType": "resource", "properties": {"word": "beyond"}} 105 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 105, "entityType": "resource", "properties": {"word": "these"}} 106 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 106, "entityType": "resource", "properties": {"word": "within"}} 107 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 107, "entityType": "resource", "properties": {"word": "thence"}} 108 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 108, "entityType": "resource", "properties": {"word": "always"}} 109 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 109, "entityType": "resource", "properties": {"word": "will"}} 110 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 110, "entityType": "resource", "properties": {"word": "mostly"}} 111 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 111, "entityType": "resource", "properties": {"word": "of"}} 112 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 112, "entityType": "resource", "properties": {"word": "whence"}} 113 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 113, "entityType": "resource", "properties": {"word": "give"}} 114 | {"event": "stopwords", "eventTime": 
"2015-06-08T17:01:37.455+0000", "entityId": 114, "entityType": "resource", "properties": {"word": "than"}} 115 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 115, "entityType": "resource", "properties": {"word": "cant"}} 116 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 116, "entityType": "resource", "properties": {"word": "now"}} 117 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 117, "entityType": "resource", "properties": {"word": "side"}} 118 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.455+0000", "entityId": 118, "entityType": "resource", "properties": {"word": "something"}} 119 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 119, "entityType": "resource", "properties": {"word": "otherwise"}} 120 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 120, "entityType": "resource", "properties": {"word": "either"}} 121 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 121, "entityType": "resource", "properties": {"word": "toward"}} 122 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 122, "entityType": "resource", "properties": {"word": "except"}} 123 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 123, "entityType": "resource", "properties": {"word": "thru"}} 124 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 124, "entityType": "resource", "properties": {"word": "ever"}} 125 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 125, "entityType": "resource", "properties": {"word": "somewhere"}} 126 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 126, "entityType": "resource", "properties": {"word": "were"}} 127 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", 
"entityId": 127, "entityType": "resource", "properties": {"word": "whereas"}} 128 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 128, "entityType": "resource", "properties": {"word": "top"}} 129 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 129, "entityType": "resource", "properties": {"word": "what"}} 130 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 130, "entityType": "resource", "properties": {"word": "ltd"}} 131 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 131, "entityType": "resource", "properties": {"word": "any"}} 132 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 132, "entityType": "resource", "properties": {"word": "system"}} 133 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 133, "entityType": "resource", "properties": {"word": "whereby"}} 134 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 134, "entityType": "resource", "properties": {"word": "becomes"}} 135 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 135, "entityType": "resource", "properties": {"word": "for"}} 136 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 136, "entityType": "resource", "properties": {"word": "over"}} 137 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 137, "entityType": "resource", "properties": {"word": "also"}} 138 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 138, "entityType": "resource", "properties": {"word": "until"}} 139 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 139, "entityType": "resource", "properties": {"word": "many"}} 140 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 140, "entityType": "resource", 
"properties": {"word": "further"}} 141 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 141, "entityType": "resource", "properties": {"word": "already"}} 142 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 142, "entityType": "resource", "properties": {"word": "eleven"}} 143 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 143, "entityType": "resource", "properties": {"word": "might"}} 144 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 144, "entityType": "resource", "properties": {"word": "can"}} 145 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 145, "entityType": "resource", "properties": {"word": "former"}} 146 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 146, "entityType": "resource", "properties": {"word": "same"}} 147 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 147, "entityType": "resource", "properties": {"word": "twelve"}} 148 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 148, "entityType": "resource", "properties": {"word": "everywhere"}} 149 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 149, "entityType": "resource", "properties": {"word": "becoming"}} 150 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 150, "entityType": "resource", "properties": {"word": "full"}} 151 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 151, "entityType": "resource", "properties": {"word": "still"}} 152 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 152, "entityType": "resource", "properties": {"word": "therein"}} 153 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.456+0000", "entityId": 153, "entityType": "resource", "properties": {"word": "very"}} 154 
| {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 154, "entityType": "resource", "properties": {"word": "behind"}} 155 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 155, "entityType": "resource", "properties": {"word": "anyway"}} 156 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 156, "entityType": "resource", "properties": {"word": "both"}} 157 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 157, "entityType": "resource", "properties": {"word": "thereafter"}} 158 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 158, "entityType": "resource", "properties": {"word": "he"}} 159 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 159, "entityType": "resource", "properties": {"word": "in"}} 160 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 160, "entityType": "resource", "properties": {"word": "re"}} 161 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 161, "entityType": "resource", "properties": {"word": "everyone"}} 162 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 162, "entityType": "resource", "properties": {"word": "mill"}} 163 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 163, "entityType": "resource", "properties": {"word": "perhaps"}} 164 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 164, "entityType": "resource", "properties": {"word": "whether"}} 165 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 165, "entityType": "resource", "properties": {"word": "be"}} 166 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 166, "entityType": "resource", "properties": {"word": "yourself"}} 167 | {"event": "stopwords", "eventTime": 
"2015-06-08T17:01:37.457+0000", "entityId": 167, "entityType": "resource", "properties": {"word": "while"}} 168 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 168, "entityType": "resource", "properties": {"word": "keep"}} 169 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 169, "entityType": "resource", "properties": {"word": "find"}} 170 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 170, "entityType": "resource", "properties": {"word": "latterly"}} 171 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 171, "entityType": "resource", "properties": {"word": "hers"}} 172 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 172, "entityType": "resource", "properties": {"word": "amongst"}} 173 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 173, "entityType": "resource", "properties": {"word": "please"}} 174 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 174, "entityType": "resource", "properties": {"word": "towards"}} 175 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 175, "entityType": "resource", "properties": {"word": "alone"}} 176 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 176, "entityType": "resource", "properties": {"word": "have"}} 177 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 177, "entityType": "resource", "properties": {"word": "when"}} 178 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 178, "entityType": "resource", "properties": {"word": "whereupon"}} 179 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 179, "entityType": "resource", "properties": {"word": "mine"}} 180 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 
180, "entityType": "resource", "properties": {"word": "thereupon"}} 181 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 181, "entityType": "resource", "properties": {"word": "across"}} 182 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 182, "entityType": "resource", "properties": {"word": "had"}} 183 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 183, "entityType": "resource", "properties": {"word": "herein"}} 184 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 184, "entityType": "resource", "properties": {"word": "ie"}} 185 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 185, "entityType": "resource", "properties": {"word": "other"}} 186 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 186, "entityType": "resource", "properties": {"word": "would"}} 187 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.457+0000", "entityId": 187, "entityType": "resource", "properties": {"word": "at"}} 188 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 188, "entityType": "resource", "properties": {"word": "although"}} 189 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 189, "entityType": "resource", "properties": {"word": "myself"}} 190 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 190, "entityType": "resource", "properties": {"word": "our"}} 191 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 191, "entityType": "resource", "properties": {"word": "each"}} 192 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 192, "entityType": "resource", "properties": {"word": "third"}} 193 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 193, "entityType": "resource", "properties": 
{"word": "once"}} 194 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 194, "entityType": "resource", "properties": {"word": "again"}} 195 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 195, "entityType": "resource", "properties": {"word": "afterwards"}} 196 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 196, "entityType": "resource", "properties": {"word": "beside"}} 197 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 197, "entityType": "resource", "properties": {"word": "fify"}} 198 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 198, "entityType": "resource", "properties": {"word": "ours"}} 199 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 199, "entityType": "resource", "properties": {"word": "an"}} 200 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 200, "entityType": "resource", "properties": {"word": "five"}} 201 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 201, "entityType": "resource", "properties": {"word": "less"}} 202 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 202, "entityType": "resource", "properties": {"word": "down"}} 203 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 203, "entityType": "resource", "properties": {"word": "namely"}} 204 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 204, "entityType": "resource", "properties": {"word": "everything"}} 205 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 205, "entityType": "resource", "properties": {"word": "has"}} 206 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 206, "entityType": "resource", "properties": {"word": "here"}} 207 | {"event": "stopwords", 
"eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 207, "entityType": "resource", "properties": {"word": "so"}} 208 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 208, "entityType": "resource", "properties": {"word": "de"}} 209 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 209, "entityType": "resource", "properties": {"word": "them"}} 210 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 210, "entityType": "resource", "properties": {"word": "formerly"}} 211 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 211, "entityType": "resource", "properties": {"word": "cannot"}} 212 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 212, "entityType": "resource", "properties": {"word": "the"}} 213 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 213, "entityType": "resource", "properties": {"word": "must"}} 214 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 214, "entityType": "resource", "properties": {"word": "their"}} 215 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 215, "entityType": "resource", "properties": {"word": "into"}} 216 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 216, "entityType": "resource", "properties": {"word": "others"}} 217 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 217, "entityType": "resource", "properties": {"word": "two"}} 218 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 218, "entityType": "resource", "properties": {"word": "before"}} 219 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 219, "entityType": "resource", "properties": {"word": "serious"}} 220 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", 
"entityId": 220, "entityType": "resource", "properties": {"word": "was"}} 221 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.458+0000", "entityId": 221, "entityType": "resource", "properties": {"word": "thin"}} 222 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 222, "entityType": "resource", "properties": {"word": "upon"}} 223 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 223, "entityType": "resource", "properties": {"word": "via"}} 224 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 224, "entityType": "resource", "properties": {"word": "seems"}} 225 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 225, "entityType": "resource", "properties": {"word": "those"}} 226 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 226, "entityType": "resource", "properties": {"word": "below"}} 227 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 227, "entityType": "resource", "properties": {"word": "least"}} 228 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 228, "entityType": "resource", "properties": {"word": "thus"}} 229 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 229, "entityType": "resource", "properties": {"word": "whither"}} 230 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 230, "entityType": "resource", "properties": {"word": "among"}} 231 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 231, "entityType": "resource", "properties": {"word": "after"}} 232 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 232, "entityType": "resource", "properties": {"word": "whenever"}} 233 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 233, "entityType": "resource", 
"properties": {"word": "above"}} 234 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 234, "entityType": "resource", "properties": {"word": "bottom"}} 235 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 235, "entityType": "resource", "properties": {"word": "detail"}} 236 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 236, "entityType": "resource", "properties": {"word": "there"}} 237 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 237, "entityType": "resource", "properties": {"word": "nine"}} 238 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 238, "entityType": "resource", "properties": {"word": "all"}} 239 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 239, "entityType": "resource", "properties": {"word": "whole"}} 240 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 240, "entityType": "resource", "properties": {"word": "interest"}} 241 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 241, "entityType": "resource", "properties": {"word": "us"}} 242 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 242, "entityType": "resource", "properties": {"word": "forty"}} 243 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 243, "entityType": "resource", "properties": {"word": "to"}} 244 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 244, "entityType": "resource", "properties": {"word": "etc"}} 245 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 245, "entityType": "resource", "properties": {"word": "few"}} 246 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 246, "entityType": "resource", "properties": {"word": "she"}} 247 | {"event": 
"stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 247, "entityType": "resource", "properties": {"word": "else"}} 248 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 248, "entityType": "resource", "properties": {"word": "her"}} 249 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 249, "entityType": "resource", "properties": {"word": "themselves"}} 250 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 250, "entityType": "resource", "properties": {"word": "am"}} 251 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 251, "entityType": "resource", "properties": {"word": "neither"}} 252 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 252, "entityType": "resource", "properties": {"word": "even"}} 253 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 253, "entityType": "resource", "properties": {"word": "describe"}} 254 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.459+0000", "entityId": 254, "entityType": "resource", "properties": {"word": "more"}} 255 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 255, "entityType": "resource", "properties": {"word": "we"}} 256 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 256, "entityType": "resource", "properties": {"word": "against"}} 257 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 257, "entityType": "resource", "properties": {"word": "latter"}} 258 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 258, "entityType": "resource", "properties": {"word": "never"}} 259 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 259, "entityType": "resource", "properties": {"word": "nor"}} 260 | {"event": "stopwords", "eventTime": 
"2015-06-08T17:01:37.460+0000", "entityId": 260, "entityType": "resource", "properties": {"word": "during"}} 261 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 261, "entityType": "resource", "properties": {"word": "fill"}} 262 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 262, "entityType": "resource", "properties": {"word": "whatever"}} 263 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 263, "entityType": "resource", "properties": {"word": "amoungst"}} 264 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 264, "entityType": "resource", "properties": {"word": "around"}} 265 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 265, "entityType": "resource", "properties": {"word": "how"}} 266 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 266, "entityType": "resource", "properties": {"word": "who"}} 267 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 267, "entityType": "resource", "properties": {"word": "often"}} 268 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 268, "entityType": "resource", "properties": {"word": "one"}} 269 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 269, "entityType": "resource", "properties": {"word": "are"}} 270 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 270, "entityType": "resource", "properties": {"word": "since"}} 271 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 271, "entityType": "resource", "properties": {"word": "fire"}} 272 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 272, "entityType": "resource", "properties": {"word": "get"}} 273 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 273, 
"entityType": "resource", "properties": {"word": "and"}} 274 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 274, "entityType": "resource", "properties": {"word": "been"}} 275 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 275, "entityType": "resource", "properties": {"word": "about"}} 276 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 276, "entityType": "resource", "properties": {"word": "nowhere"}} 277 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 277, "entityType": "resource", "properties": {"word": "several"}} 278 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 278, "entityType": "resource", "properties": {"word": "amount"}} 279 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 279, "entityType": "resource", "properties": {"word": "show"}} 280 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 280, "entityType": "resource", "properties": {"word": "where"}} 281 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 281, "entityType": "resource", "properties": {"word": "cry"}} 282 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 282, "entityType": "resource", "properties": {"word": "hundred"}} 283 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 283, "entityType": "resource", "properties": {"word": "is"}} 284 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 284, "entityType": "resource", "properties": {"word": "but"}} 285 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 285, "entityType": "resource", "properties": {"word": "indeed"}} 286 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 286, "entityType": "resource", "properties": {"word": 
"only"}} 287 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 287, "entityType": "resource", "properties": {"word": "name"}} 288 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 288, "entityType": "resource", "properties": {"word": "call"}} 289 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.460+0000", "entityId": 289, "entityType": "resource", "properties": {"word": "because"}} 290 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 290, "entityType": "resource", "properties": {"word": "sometimes"}} 291 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 291, "entityType": "resource", "properties": {"word": "such"}} 292 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 292, "entityType": "resource", "properties": {"word": "his"}} 293 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 293, "entityType": "resource", "properties": {"word": "enough"}} 294 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 294, "entityType": "resource", "properties": {"word": "ten"}} 295 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 295, "entityType": "resource", "properties": {"word": "most"}} 296 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 296, "entityType": "resource", "properties": {"word": "wherein"}} 297 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 297, "entityType": "resource", "properties": {"word": "yet"}} 298 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 298, "entityType": "resource", "properties": {"word": "no"}} 299 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 299, "entityType": "resource", "properties": {"word": "by"}} 300 | {"event": "stopwords", "eventTime": 
"2015-06-08T17:01:37.461+0000", "entityId": 300, "entityType": "resource", "properties": {"word": "not"}} 301 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 301, "entityType": "resource", "properties": {"word": "on"}} 302 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 302, "entityType": "resource", "properties": {"word": "should"}} 303 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 303, "entityType": "resource", "properties": {"word": "seem"}} 304 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 304, "entityType": "resource", "properties": {"word": "bill"}} 305 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 305, "entityType": "resource", "properties": {"word": "back"}} 306 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 306, "entityType": "resource", "properties": {"word": "none"}} 307 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 307, "entityType": "resource", "properties": {"word": "noone"}} 308 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 308, "entityType": "resource", "properties": {"word": "that"}} 309 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 309, "entityType": "resource", "properties": {"word": "sixty"}} 310 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 310, "entityType": "resource", "properties": {"word": "could"}} 311 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 311, "entityType": "resource", "properties": {"word": "someone"}} 312 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 312, "entityType": "resource", "properties": {"word": "hasnt"}} 313 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 313, 
"entityType": "resource", "properties": {"word": "put"}} 314 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 314, "entityType": "resource", "properties": {"word": "if"}} 315 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 315, "entityType": "resource", "properties": {"word": "take"}} 316 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 316, "entityType": "resource", "properties": {"word": "do"}} 317 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 317, "entityType": "resource", "properties": {"word": "they"}} 318 | {"event": "stopwords", "eventTime": "2015-06-08T17:01:37.461+0000", "entityId": 318, "entityType": "resource", "properties": {"word": "front"}} 319 | -------------------------------------------------------------------------------- /engine.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "default", 3 | "description": "Default settings", 4 | "engineFactory": "org.example.textclassification.TextClassificationEngine", 5 | "datasource": { 6 | "params": { 7 | "appName": "MyTextApp" 8 | } 9 | }, 10 | "preparator": { 11 | "params": { 12 | "nGram": 1, 13 | "numFeatures": 500, 14 | "SPPMI": false 15 | } 16 | }, 17 | "algorithms": [ 18 | { 19 | "name": "lr", 20 | "params": { 21 | "regParam": 0.00000005 22 | } 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") 2 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.15 2 | -------------------------------------------------------------------------------- 
/src/main/scala/DataSource.scala: -------------------------------------------------------------------------------- 1 | package org.example.textclassification 2 | 3 | import org.apache.predictionio.controller.PDataSource 4 | import org.apache.predictionio.controller.EmptyEvaluationInfo 5 | import org.apache.predictionio.controller.Params 6 | import org.apache.predictionio.controller.SanityCheck 7 | import org.apache.predictionio.data.store.PEventStore 8 | 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.rdd.RDD 11 | 12 | import grizzled.slf4j.Logger 13 | 14 | /** Define Data Source parameters. 15 | * appName is the application name. 16 | * evalK is the number of folds that are to be used for cross validation (optional) 17 | */ 18 | case class DataSourceParams( 19 | appName: String, 20 | evalK: Option[Int] 21 | ) extends Params 22 | 23 | 24 | /** Define your DataSource component. Remember, you must 25 | * implement a readTraining method, and, optionally, a 26 | * readEval method. 27 | */ 28 | class DataSource ( 29 | val dsp : DataSourceParams 30 | ) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, ActualResult] { 31 | 32 | @transient lazy val logger = Logger[this.type] 33 | 34 | /** Helper function used to read event data given a SparkContext. */ 35 | private def readEventData(sc: SparkContext) : RDD[Observation] = { 36 | //Get RDD of Events. 37 | PEventStore.find( 38 | appName = dsp.appName, 39 | entityType = Some("content"), // specify data entity type 40 | eventNames = Some(List("e-mail")) // specify data event name 41 | 42 | // Convert collected RDD of events to an RDD of Observation 43 | // objects. 44 | )(sc).map(e => { 45 | val label : String = e.properties.get[String]("label") 46 | Observation( 47 | if (label == "spam") 1.0 else 0.0, 48 | e.properties.get[String]("text"), 49 | label 50 | ) 51 | }).cache 52 | } 53 | 54 | /** Helper function used to read stop words from event server. 
*/ 55 | private def readStopWords(sc : SparkContext) : Set[String] = { 56 | PEventStore.find( 57 | appName = dsp.appName, 58 | entityType = Some("resource"), 59 | eventNames = Some(List("stopwords")) 60 | 61 | //Convert collected RDD of strings to a string set. 62 | )(sc) 63 | .map(e => e.properties.get[String]("word")) 64 | .collect 65 | .toSet 66 | } 67 | 68 | /** Read in data and stop words from event server 69 | * and store them in a TrainingData instance. 70 | */ 71 | override 72 | def readTraining(sc: SparkContext): TrainingData = { 73 | new TrainingData(readEventData(sc), readStopWords(sc)) 74 | } 75 | 76 | /** Used for evaluation: reads in event data and creates cross-validation folds. */ 77 | override 78 | def readEval(sc: SparkContext): 79 | Seq[(TrainingData, EmptyEvaluationInfo, RDD[(Query, ActualResult)])] = { 80 | // Zip your RDD of events read from the server with indices 81 | // for the purposes of creating our folds. 82 | val data = readEventData(sc).zipWithIndex() 83 | // Create cross validation folds by partitioning indices 84 | // based on their index value modulo the number of folds. 85 | (0 until dsp.evalK.get).map { k => 86 | // Prepare training data for fold. 87 | val train = new TrainingData( 88 | data.filter(_._2 % dsp.evalK.get != k).map(_._1), 89 | readStopWords 90 | ((sc))) 91 | 92 | // Prepare test data for fold. 93 | val test = data.filter(_._2 % dsp.evalK.get == k) 94 | .map(_._1) 95 | .map(e => (Query(e.text), ActualResult(e.category))) 96 | 97 | (train, new EmptyEvaluationInfo, test) 98 | } 99 | } 100 | 101 | } 102 | 103 | /** Observation class serving as a wrapper for both our 104 | * data's class label and document string. 105 | */ 106 | case class Observation( 107 | label: Double, 108 | text: String, 109 | category: String 110 | ) 111 | 112 | /** TrainingData class serving as a wrapper for all 113 | * read in from the Event Server. 
114 | */ 115 | class TrainingData( 116 | val data : RDD[Observation], 117 | val stopWords : Set[String] 118 | ) extends Serializable with SanityCheck { 119 | 120 | /** Sanity check to make sure your data is being fed in correctly. */ 121 | override 122 | def sanityCheck(): Unit = { 123 | try { 124 | val obs : Array[Double] = data.takeSample(false, 5).map(_.label) 125 | 126 | println() 127 | (0 until 5).foreach( 128 | k => println("Observation " + (k + 1) +" label: " + obs(k)) 129 | ) 130 | println() 131 | } catch { 132 | case (e : ArrayIndexOutOfBoundsException) => { 133 | println() 134 | println("Data set is empty, make sure event fields match imported data.") 135 | println() 136 | } 137 | } 138 | 139 | } 140 | 141 | } 142 | -------------------------------------------------------------------------------- /src/main/scala/Engine.scala: -------------------------------------------------------------------------------- 1 | package org.example.textclassification 2 | 3 | import org.apache.predictionio.controller.EngineFactory 4 | import org.apache.predictionio.controller.Engine 5 | 6 | /** Define Query class which serves as a wrapper for 7 | * new text data. 8 | */ 9 | case class Query(text: String) 10 | 11 | /** Define PredictedResult class which serves as a 12 | * wrapper for a predicted class label and the associated 13 | * prediction confidence. 14 | */ 15 | case class PredictedResult( 16 | category: String, 17 | confidence: Double) 18 | 19 | /** Define ActualResult class which serves as a wrapper 20 | * for an observation's true class label. 
*/
case class ActualResult(category: String)

/** Engine factory: wires the DataSource, Preparator, the two algorithms
  * ("nb" Naive Bayes, "lr" logistic regression), and the Serving layer.
  */
object TextClassificationEngine extends EngineFactory {
  def apply() = {
    new Engine(
      classOf[DataSource],
      classOf[Preparator],
      Map(
        "nb" -> classOf[NBAlgorithm],
        "lr" -> classOf[LRAlgorithm]
      ),
      classOf[Serving])
  }
}

// ---------------------------------------------------------------- /src/main/scala/Evaluation.scala
package org.example.textclassification

import org.apache.predictionio.controller.AverageMetric
import org.apache.predictionio.controller.Evaluation
import org.apache.predictionio.controller.EmptyEvaluationInfo
import org.apache.predictionio.controller.EngineParamsGenerator
import org.apache.predictionio.controller.EngineParams

/** Accuracy metric for evaluating the supervised learning model:
  * 1.0 for a correct prediction, 0.0 otherwise (averaged by the framework).
  */
case class Accuracy()
  extends AverageMetric[EmptyEvaluationInfo, Query, PredictedResult, ActualResult] {

  override def calculate(
    query: Query,
    predicted: PredictedResult,
    actual: ActualResult
  ): Double = if (predicted.category == actual.category) 1.0 else 0.0
}

/** Evaluation object pairing the engine with the accuracy metric above. */
object AccuracyEvaluation extends Evaluation {
  engineMetric = (
    TextClassificationEngine(),
    Accuracy()
  )
}

/** Engine parameters explored during the evaluation procedure. */
object EngineParamsList extends EngineParamsGenerator {

  // Shared data-source and preparator parameters.
  private[this] val baseEP = EngineParams(
    dataSourceParams = DataSourceParams(appName = "MyTextApp", evalK = Some(3)),
    preparatorParams = PreparatorParams(nGram = 2, numFeatures = 500)
  )

  // Algorithm-parameter candidates to score for accuracy.
  engineParamsList = Seq(
    baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(0.25)))),
    baseEP.copy(algorithmParamsList = Seq(("nb", NBAlgorithmParams(1.0)))),
    baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(0.5)))),
    baseEP.copy(algorithmParamsList = Seq(("lr", LRAlgorithmParams(1.25))))
  )
}

// ---------------------------------------------------------------- /src/main/scala/LRAlgorithm.scala
package org.example.textclassification

import org.apache.predictionio.controller.P2LAlgorithm
import org.apache.predictionio.controller.Params
import org.apache.spark.SparkContext
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.expressions.UserDefinedFunction
import grizzled.slf4j.Logger
import org.apache.spark.mllib.regression.LabeledPoint

/** Logistic-regression hyperparameters: L2 regularization strength. */
case class LRAlgorithmParams(regParam: Double) extends Params

class LRAlgorithm(val ap: LRAlgorithmParams)
  extends P2LAlgorithm[PreparedData, LRModel, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  /** One-vs-rest training: fit one binary logistic regression per class label. */
  override def train(sc: SparkContext, pd: PreparedData): LRModel = {

    // SQLContext is needed to build a DataFrame from the RDD.
    val sql: SQLContext = new SQLContext(sc)
    import sql.implicits._

    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setThreshold(0.5)
      .setRegParam(ap.regParam)

    val labels: Seq[Double] = pd.categoryMap.keys.toSeq

    // Convert mllib LabeledPoints to the ml flavor, lift into a DataFrame, and
    // add one binary indicator column per class label (multiclass -> binary).
    val data = labels.foldLeft(pd.transformedData.map { case LabeledPoint(label, feature) =>
      new org.apache.spark.ml.feature.LabeledPoint(label, feature.asML)
    }.toDF)(
      (data: DataFrame, label: Double) => {
        val f: UserDefinedFunction = functions.udf((e: Double) => if (e == label) 1.0 else 0.0)
        data.withColumn(label.toInt.toString, f(data("label")))
      }
    )

    // Fit a binary model per class; keep (label, (coefficients, intercept)).
    val lrModels: Seq[(Double, LREstimate)] = labels.map { label =>
      val lab = label.toInt.toString
      val fit = lr.setLabelCol(lab).fit(
        data.select(lab, "features")
      )
      (label, LREstimate(fit.coefficients.toArray, fit.intercept))
    }

    new LRModel(
      tfIdf = pd.tfIdf,
      categoryMap = pd.categoryMap,
      lrModels = lrModels
    )
  }

  override def predict(model: LRModel, query: Query): PredictedResult = {
    model.predict(query.text)
  }
}

/** Per-class binary estimate: feature coefficients and intercept term. */
case class LREstimate(
  coefficients: Array[Double],
  intercept: Double
)

class LRModel(
  val tfIdf: TFIDFModel,
  val categoryMap: Map[Double, String],
  val lrModels: Seq[(Double, LREstimate)]) extends Serializable {

  /** Vector inner product used for prediction.
*/
  private def innerProduct(x: Array[Double], y: Array[Double]): Double = {
    x.zip(y).map(e => e._1 * e._2).sum
  }

  /** Prediction rule: evaluate each class's binary model and return the
    * class with the highest positive-class probability.
    *
    * Logistic regression binary formula for positive probability.
    * According to MLlib documentation, class labeled 0 is used as pivot.
    * Thus we use:
    *   log(p1/p0) = log(p1/(1 - p1)) = b0 + xTb =: z
    *   p1 = exp(z) * (1 - p1)
    *   p1 * (1 + exp(z)) = exp(z)
    *   p1 = exp(z)/(1 + exp(z))
    */
  def predict(text: String): PredictedResult = {
    val x: Array[Double] = tfIdf.transform(text).toArray

    val pred = lrModels.map { e =>
      val z = scala.math.exp(innerProduct(e._2.coefficients, x) + e._2.intercept)
      (e._1, z / (1 + z))
    }.maxBy(_._2)

    PredictedResult(categoryMap(pred._1), pred._2)
  }

  override def toString = s"LR model"
}

// ---------------------------------------------------------------- /src/main/scala/NBAlgorithm.scala
package org.example.textclassification

import org.apache.predictionio.controller.P2LAlgorithm
import org.apache.predictionio.controller.Params
import org.apache.spark.SparkContext
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.classification.NaiveBayesModel
import org.apache.spark.mllib.linalg.Vector

import scala.math._

/** Naive Bayes hyperparameters: the additive-smoothing constant lambda. */
case class NBAlgorithmParams(lambda: Double) extends Params

/** Supervised algorithm component backed by MLlib's NaiveBayes. */
class NBAlgorithm(
  val ap: NBAlgorithmParams
) extends P2LAlgorithm[PreparedData, NBModel, Query, PredictedResult] {

  /** Fit a Naive Bayes model on the prepared (tf-idf) training data. */
  override def train(sc: SparkContext, pd: PreparedData): NBModel = {
    val nb: NaiveBayesModel = NaiveBayes.train(pd.transformedData, ap.lambda)

    new NBModel(
      tfIdf = pd.tfIdf,
      categoryMap = pd.categoryMap,
      nb = nb)
  }

  /** Prediction method for the trained model. */
  override def predict(model: NBModel, query: Query): PredictedResult = {
    model.predict(query.text)
  }
}

class NBModel(
  val tfIdf: TFIDFModel,
  val categoryMap: Map[Double, String],
  val nb: NaiveBayesModel
) extends Serializable {

  private def innerProduct(x: Array[Double], y: Array[Double]): Double = {
    x.zip(y).map(e => e._1 * e._2).sum
  }

  /** Normalize so the entries sum to 1.
    * NOTE(review): assumes a non-zero sum — holds for the exp() scores below,
    * which are strictly positive.
    */
  val normalize = (u: Array[Double]) => {
    val uSum = u.sum
    u.map(e => e / uSum)
  }

  // (log class prior pi, log conditional-probability vector theta) per class.
  private val scoreArray = nb.pi.zip(nb.theta)

  /** Given a document string, return the vector of class-membership
    * probabilities: a max-shifted softmax over per-class log scores.
    */
  private def getScores(doc: String): Array[Double] = {
    // Vectorize query once.
    val x: Array[Double] = tfIdf.transform(doc).toArray

    // Per-class unnormalized log posterior: theta . x + pi.
    val z = scoreArray.map(e => innerProduct(e._2, x) + e._1)

    // FIX: hoist loop-invariant work — the original recomputed z.max (a full
    // scan) and x.toArray for every element inside the map. Subtracting the
    // max keeps exp() numerically stable, exactly as before.
    val zMax = z.max
    normalize(z.map(zk => exp(zk - zMax)))
  }

  /** Predict: highest-probability class, with that probability as confidence. */
  def predict(doc: String): PredictedResult = {
    val x: Array[Double] = getScores(doc)
    val y: (Double, Double) = (nb.labels zip x).maxBy(_._2)
    PredictedResult(categoryMap.getOrElse(y._1, ""), y._2)
  }
}

// ---------------------------------------------------------------- /src/main/scala/Preparator.scala
package org.example.textclassification

import org.apache.predictionio.controller.PPreparator
import org.apache.predictionio.controller.Params

import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.{IDF, IDFModel, HashingTF}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute

import java.io.StringReader

import scala.collection.mutable

/** Preparator parameters. Recall that for our data
  * representation we are only required to input the n-gram window
  * components.
*/
case class PreparatorParams(
  nGram: Int,
  numFeatures: Int = 15000
) extends Params

/** Preparator: turns raw TrainingData into tf-idf LabeledPoints plus a
  * label-to-category map.
  */
class Preparator(pp: PreparatorParams)
  extends PPreparator[TrainingData, PreparedData] {

  override def prepare(sc: SparkContext, td: TrainingData): PreparedData = {

    val tfHasher = new TFHasher(pp.numFeatures, pp.nGram, td.stopWords)

    // Hash each observation's text into a TF vector and fit an IDF model.
    val idf: IDFModel = new IDF().fit(td.data.map(e => tfHasher.hashTF(e.text)))

    val tfIdfModel = new TFIDFModel(
      hasher = tfHasher,
      idf = idf
    )

    // RDD[Observation] -> RDD[(label, text)] -> RDD[LabeledPoint].
    val doc: RDD[(Double, String)] = td.data.map(obs => (obs.label, obs.text))
    val transformedData: RDD[LabeledPoint] = tfIdfModel.transform(doc)

    // Map each numeric label back to its category name.
    val categoryMap = td.data.map(obs => (obs.label, obs.category)).collectAsMap.toMap

    new PreparedData(
      tfIdf = tfIdfModel,
      transformedData = transformedData,
      categoryMap = categoryMap
    )
  }
}

/** Hashes tokenized, stop-word-filtered n-grams into fixed-size TF vectors. */
class TFHasher(
  val numFeatures: Int,
  val nGram: Int,
  val stopWords: Set[String]
) extends Serializable {

  private val hasher = new HashingTF(numFeatures = numFeatures)

  /** Tokenize text with Lucene's StandardAnalyzer. */
  def tokenize(content: String): Seq[String] = {
    val analyzer = new StandardAnalyzer()
    val tStream = analyzer.tokenStream("contents", new StringReader(content))
    val term = tStream.addAttribute(classOf[CharTermAttribute])
    val result = mutable.ArrayBuffer.empty[String]
    tStream.reset()
    try {
      // FIX: dropped the unused local (termValue) the original computed here.
      while (tStream.incrementToken()) {
        result += term.toString
      }
      tStream.end()
    } finally {
      // FIX: the original never ended or closed the TokenStream, nor closed
      // the analyzer, leaking analyzer resources on every call. Lucene's
      // documented workflow is reset / incrementToken* / end / close.
      tStream.close()
      analyzer.close()
    }
    result
  }

  /** Hashing function: text -> term-frequency vector over n-grams. */
  def hashTF(text: String): Vector = {
    // NOTE(review): n-gram tokens are concatenated with no separator
    // (mkString with no argument), so e.g. ("a","bc") and ("ab","c") hash
    // identically — confirm this is intended before changing it.
    val newList: Array[String] = tokenize(text)
      .filterNot(stopWords.contains(_))
      .sliding(nGram)
      .map(_.mkString)
      .toArray

    hasher.transform(newList)
  }
}

class TFIDFModel(
  val hasher: TFHasher,
  val idf: IDFModel
) extends Serializable {

  /** Transform text to a tf-idf vector. */
  def transform(text: String): Vector = {
    idf.transform(hasher.hashTF(text))
  }

  /** Transform an RDD of (label, text) to an RDD of LabeledPoint. */
  def transform(doc: RDD[(Double, String)]): RDD[LabeledPoint] = {
    doc.map { case (label, text) => LabeledPoint(label, transform(text)) }
  }
}

/** Output of the Preparator, consumed by both algorithms. */
class PreparedData(
  val tfIdf: TFIDFModel,
  val transformedData: RDD[LabeledPoint],
  val categoryMap: Map[Double, String]
) extends Serializable

// ---------------------------------------------------------------- /src/main/scala/Serving.scala
package org.example.textclassification

import org.apache.predictionio.controller.LServing

/** Serving layer: of all algorithms' predictions, return the most confident. */
class Serving
  extends LServing[Query, PredictedResult] {

  override def serve(
      query: Query,
      predictedResults: Seq[PredictedResult]): PredictedResult = {
    predictedResults.maxBy(e => e.confidence)
  }
}

// ---------------------------------------------------------------- /template.json
{"pio": {"version": { "min": "0.10.0-incubating" }}}