├── .gitignore
├── NOTICE.txt
├── README.md
├── build.sbt
├── data
    ├── handmade-event-description.txt
    ├── integration-test-expected.txt
    ├── sample-handmade-data.txt
    └── sample_movielens_data.txt
├── docs
    └── universal_recommender.md
├── engine.json
├── engine.json.minimum
├── event-names-test-engine.json
├── examples
    ├── import_handmade.py
    ├── import_movielens_eventserver.py
    ├── integration-test
    ├── multi-query-handmade.sh
    ├── multi-query-movielens.sh
    ├── pop-test-query.sh
    ├── single-query-eventNames.sh
    └── single-query-handmade.sh
├── handmade-engine.json
├── pop-engine.json
├── project
    ├── assembly.sbt
    └── pio-build.sbt
├── src
    └── main
    │   └── scala
    │       ├── DataSource.scala
    │       ├── Engine.scala
    │       ├── PopModel.scala
    │       ├── Preparator.scala
    │       ├── Serving.scala
    │       ├── URAlgorithm.scala
    │       ├── URModel.scala
    │       ├── esClient.scala
    │       └── package.scala
└── template.json


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | *.log
 3 | 
 4 | # sbt specific
 5 | .cache/
 6 | .history/
 7 | .lib/
 8 | dist/*
 9 | target/
10 | lib_managed/
11 | src_managed/
12 | project/boot/
13 | project/plugins/project/
14 | 
15 | # Scala-IDE specific
16 | .scala_dependencies
17 | .worksheet
18 | 
19 | # Mac specific
20 | .DS_Store
21 | 
22 | # IntelliJ IDEA
23 | *.iml
24 | .idea
25 | 
26 | #PredictionIO specific
27 | manifest.json
28 | target/
29 | pio.log
30 | /pio.sbt
31 | 


--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
 1 | ==============================================================
 2 |  ActionML LLC
 3 |  Copyright 2015 and onwards ActionML LLC
 4 | ==============================================================
 5 | 
 6 | This product includes software developed by
 7 | ActionML (http://actionml.com/).
 8 | 
 9 | It includes software from other Apache Software Foundation projects,
10 | including, but not limited to:
11 |   - Elasticsearch (Apache 2 license)
12 |   - Apache Hadoop
13 |   - Apache Spark
14 |   - Apache Spark
15 |   - Apache Mahout
16 | 
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Universal Recommendation Template
  2 | 
  3 | The Universal Recommender (UR) is a Cooccurrence type that creates correlators from several user actions, events, or profile information and performs the recommendations query with a Search Engine. It also supports item properties for filtering and boosting recommendations. This allows users to make use of any part of their user's clickstream or even profile and context information in making recommendations. TBD: several forms of popularity type backfill and content-based correlators for content based recommendations. Also filters on property date ranges. With these additions it will more closely live up to the name "Universal"
  4 | 
  5 | ##Quick Start
  6 | Check the prerequisites below before setup, it will inform choices made.
  7 | 
  8 |  1. [Install the PredictionIO framework](https://docs.prediction.io/install/) **be sure to choose HBase and Elasticsearch** for storage. This template requires Elasticsearch.
  9 |  2. Make sure the PIO console and services are running, check with `pio status`
 10 |  3. [Install this template](https://docs.prediction.io/start/download/) with `pio template get PredictionIO/template-scala-parallel-universal-recommendation`
 11 | 
 12 | ###Import Sample Data
 13 | 
 14 | 1. Create a new app name, change `appName` in `engine.json`
 15 | 2. Run `pio app new **your-new-app-name**`
 16 | 4. Import sample events by running `python examples/import_handmade.py --access_key **your-access-key**` where the key can be retrieved with `pio app list`
 17 | 3. The engine.json file in the root directory of your new UR template is set up for the data you just imported (make sure to create a new one for your data) Edit this file and change the `appName` parameter to match what you called the app in step #2
 18 | 5. Perform `pio build`, `pio train`, and `pio deploy`
 19 | 6. To execute some sample queries run `./examples/single-query-handmade.sh`
 20 | 
 21 | ##Important Notes for the Impatient
 22 | 
 23 |  - The Universal Recommender v0.2.0+ requires PredictionIO v0.9.5+
 24 |  - When sending events through the SDK, REST API, or importing it is required that all usage/preference events are named in the engine.json and **there must be data for the first named event** otherwise there will be **no model created** and errors will occur during training.
 25 |  - When sending usage events it is required that the entityType is "user" and targetEntityType is "item". The type of the item is inferred from the event names, which must be one of the eventNames in the engine.json.
 26 |  - **Elasticsearch**: The UR **requires Eleasticsearch** since it performs the last step in the algorithm. It will store the model created at `pio train` time.
 27 |  - **EventStore**: The EventServer may use another DB than HBase but has been most heavily tested with HBase.
 28 | 
 29 | ##What is a Universal Recommender
 30 | 
 31 | The Universal Recommender (UR) will accept a range of data, auto correlate it, and allow for very flexible queries. The UR is different from most recommenders in these ways:
 32 | 
 33 | * It takes a single very strong "primary" event type&mdash;one that clearly reflects a user's preference&mdash;and correlates any number of other "secondary" event types. user profile data, and user context data to the primary event. This has the effect of using virtually anything we know about the user to recommend the items attached to the primary event. Much of a user’s clickstream can be used to make recommendations. If a user has no history of the primary action (purchase for instance) but does have history of the secondary data, personalized recommendations for purchases can still be made. With user purchase history the recommendations become better. This is very important because it means better recommendations for more users than typical recommenders.
 34 | * It can boost and filter based on events or item metadata/properties. This means it can give personalized recommendations that are biased toward “sci-fi” and filtered to only include “promoted” items when the business rules call for this.
 35 | * It can use a user's context to make recommendations even when the user is new. If usage data has been gathered for other users for referring URL, device type, or location, for instance, there may be a correlation between this data and items preferred. The UR can detect this if it exists and recommend based on this context, even to new users. We call this "micro-segmented" recommendations since they are not personal but group users based on limited contextual information. These will not be as good as when more behavioral information is know about the user but may be better than simply returning popular items.
 36 | * It includes a fallback to some form of item popularity when there is no other information known about the user. Backfill types include popular, trending, and hot. Backfill can be boosted or filtered by item metadata just as any recommendation.
 37 | * All of the above can be mixed into a single query for blended results and so the query can be tuned to a great many applications without special data or separate models.
 38 | * Real-time user history is used in all recommendations. Even anonymous users will get recommendations if they have recorded preference history and a user-id. There is no  requirement to "retrain" the model to make this happen. The rule of thumb is to retrain based on frequency of adding new items. So for breaking-news articles you may want to retrain frequently but for ecom once a day would be fine. In either case real-time user behavior affects recommendations.
 39 | 
 40 | ###Typical Uses:
 41 | * **Personalized Recommendations**
 42 | * **Similar Item Recommendations**: "people who liked this also like these"
 43 | * **Shopping Cart Recommendations**:  more generally item-set recommendations. This can be applied to wishlists, watchlists, likes, any set of items that may go together.
 44 | * **Popular Items**: These can even be the primary form of recommendation if desired for some applications since several forms are supported. By default if a user has no recommendations popular items will backfill to achieve the number required.
 45 | * **Hybrid Collaborative Filtering and Content-based Recommendations**: since item properties can boost or filter recommendations and can often also be treated as secondary user preference data a smooth blend of usage and content can be achieved.
 46 | 
 47 | ##Configuration, Events, and Queries
 48 | 
 49 | ###Primary and Secondary Data
 50 | 
 51 | **There must be a "primary" event/action recorded for some number of users**. This action defines the type of item returned in recommendations and is the measure by which all secondary data is measured. More technically speaking all secondary data is tested for correlation to the primary event. Secondary data can be anything that you may think of as giving some insight into the user. If something in the secondary data has no correlation to the primary event it will have no effect on recommendations. For instance in an ecom setting you may want "buy" as a primary event. There may be many (but none is also fine) secondary events like (user-id, device-preference, device-id). This can be thought of as a user's device preference and recorded at all logins. If this doesn't correlate to items bought it will not effect recommendations.
 52 | 
 53 | ###Biases
 54 | 
 55 | These take the form of boosts and filters where a neutral bias is 1.0. The importance of some part of the query may be boosted by a positive non-zero float. If the bias is < 0 it is considered a filter&mdash;meaning no recommendation is made that lacks the filter value(s). One example of a filter is where it may make sense to show only "electronics" recommendations when the user is viewing an electronics product. Biases are often applied to a list of data, for instance the user is looking at a video page with a cast of actors. The "cast" list is metadata attached to items and a query can show "people who liked this, also liked these" type recommendations but also include the current cast boosted by 0.5. This can be seen as showing similar item recommendations but using the cast members in a way that will not overpower the similar items (since by default they have a neutral 1.0 boost). The result would be similar items favoring ones with similar cast members.
 56 | 
 57 | ###Dates
 58 | 
 59 | Dates can be used to filter recommendations in one of two ways, where the data range is attached to items or is specified in the query:
 60 | 
 61 |  1. The date range can be attached to every item and checked against the current date. The current date can be in the query or defaults to the current prediction server date. This mode requires that all items have a upper and lower date attached to them as a property. It is designed to be something like an "available after" and "expired after". The default check against server date is triggered when the expireDateName and availableDateName are both specified but no date is passed in with the query. **Note**: Both dates must be attached to items or they will not be recommended. To have one-sided filter make the available date some time far in the past and/or the expire date some time far in the future.
 62 |  2. A "dateRange" can be specified in the query and the recommended items will have a date that lies between the range dates.
 63 | 
 64 | ###Engine.json
 65 | 
 66 | This file allows the user to describe and set parameters that control the engine operations. Many values have defaults so the following can be seen as the minimum for an ecom app with only one "buy" event. Reasonable defaults are used so try this first and add tunings or new event types and item property fields as you become more familiar.
 67 | 
 68 | ####Simple Default Values
 69 |     {
 70 | 	  "comment":" This config file uses default settings for all but the required values see README.md for docs",
 71 | 	  "id": "default",
 72 | 	  "description": "Default settings",
 73 | 	  "engineFactory": "org.template.RecommendationEngine",
 74 | 	  "datasource": {
 75 | 	    "params" : {
 76 | 	      "name": "sample-handmade-data.txt",
 77 | 	      "appName": "handmade",
 78 | 	      "eventNames": ["purchase", "view"]
 79 | 	    }
 80 | 	  },
 81 | 	  "sparkConf": {
 82 | 	    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
 83 | 	    "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
 84 | 	    "spark.kryo.referenceTracking": "false",
 85 | 	    "spark.kryoserializer.buffer.mb": "300",
 86 | 	    "spark.kryoserializer.buffer": "300m",
 87 | 	    "spark.executor.memory": "4g",
 88 | 	    "es.index.auto.create": "true"
 89 | 	  },
 90 | 	  "algorithms": [
 91 | 	    {
 92 | 	      "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames",
 93 | 	      "name": "ur",
 94 | 	      "params": {
 95 | 	        "appName": "handmade",
 96 | 	        "indexName": "urindex",
 97 | 	        "typeName": "items",
 98 | 	        "comment": "must have data for the first event or the model will not build, other events are optional",
 99 | 	        "eventNames": ["purchase", "view"]
100 | 	      }
101 | 	    }
102 | 	  ]
103 | 	}
104 | 
105 | 
106 | 
107 | ####Complete Parameter Set
108 | 
109 | A full list of tuning and config parameters is below. See the field description for specific meaning. Some of the parameters work as defaults values for every query and can be overridden or added to in the query.
110 | 
111 | **Note:** It is strongly advised that you try the default/simple settings first before changing them. The possible exception is adding secondary events in the `eventNames` array.
112 | 
113 |     {
114 |       "id": "default",
115 |       "description": "Default settings",
116 |       "comment": "replace this with your JVM package prefix, like org.apache",
117 |       "engineFactory": "org.template.RecommendationEngine",
118 |       "datasource": {
119 |         "params" : {
120 |           "name": "some-data",
121 |           "appName": "URApp1",
122 |           "eventNames": ["buy", "view"]
123 |         }
124 |       },
125 |       “comment”: “This is for Mahout and Elasticsearch, the values are minimums and should not be removed”,
126 |       "sparkConf": {
127 |         "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
128 |         "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
129 |         "spark.kryo.referenceTracking": "false",
130 |         "spark.kryoserializer.buffer.mb": "200",
131 |         "spark.executor.memory": "4g",
132 |         "es.index.auto.create": "true"
133 |       },
134 |       "algorithms": [
135 |         {
136 |           "name": "ur",
137 |           "params": {
138 |             "appName": "URApp1",
139 |             "indexName": "urindex",
140 |             "typeName": "items",
141 |             "eventNames": ["buy", "view"],
142 |             "blacklistEvents": ["buy", "view"],
143 |             "maxEventsPerEventType": 100,
144 |             "maxCorrelatorsPerEventType": 50,
145 |             "maxQueryEvents": 500,
146 |             "num": 20,
147 |             "seed": 3,
148 |             "recsModel": "all",
149 | 			"backfillField": {
150 |   				"backfillType": "popular",
151 |   				"eventnames": ["buy", "view"],
152 |   				"duration": 259200
153 |   			},
154 |             "expireDateName": "expireDateFieldName",
155 |             "availableDateName": "availableDateFieldName",
156 |             "dateName": "dateFieldName",
157 |             "userbias": -maxFloat..maxFloat,
158 |             "itembias": -maxFloat..maxFloat,
159 |             "returnSelf": true | false,
160 |             “fields”: [
161 |               {
162 |                 “name”: ”fieldname”,
163 |                 “values”: [“fieldValue1”, ...],
164 |                 “bias”: -maxFloat..maxFloat,
165 |               },...
166 |             ]
167 |           }
168 |         }
169 |       ]
170 |     }
171 | 
172 | The “params” section controls most of the features of the UR. Possible values are:
173 | 
174 | * **appName**: required string describing the app using the engine. Must be the same as is seen with `pio app list`
175 | * **indexName**: required string describing the index for all correlators, something like "urindex". The Elasticsearch URI for its REST interface is `http:/**elasticsearch-machine**/indexName/typeName/...` You can access ES through its REST interface here.
176 | * **typeName**: required string describing the type in Elasticsearch terminology, something like "items". This has no important meaning but must be part of the Elasticsearch URI for queries.
177 | * **eventNames**: required array of string identifiers describing action events recorded for users, things like “purchase”, “watch”, “add-to-cart”, even “location”, or “device” can be considered actions and used in recommendations. The first action is to be considered the primary action because it **must** exist in the data and is considered the strongest indication of user preference for items, the others are secondary for cooccurrence and cross-cooccurrence calculations. The secondary actions/events may or may not have target entity ids that correspond to the items to be recommended, so they are allowed to be things like category-ids, device-ids, location-ids... For example: a category-pref event would have a category-id as the target entity id but a view would have an item-id as the target entity id (see Events below). Both work fine as long as all usage events are tied to users.
178 | * **maxEventsPerEventType** optional (use with great care), default = 500. Amount of usage history to keep use in model calculation.
179 | * **maxCorrelatorsPerEventType**: optional (use with great care), default = 50. An integer that controls how many of the strongest correlators are created for every event type named in `eventNames`.
180 | * **maxQueryEvents**: optional (use with great care), default = 100. An integer specifying the number of most recent primary actions used to make recommendations for an individual. More implies some will be less recent actions. Theoretically using the right number will capture the user’s current interests.
181 | * **num**: optional, default = 20. An integer telling the engine the maximum number of recommendations to return per query but less may be returned if the query produces less results or post recommendations filters like blacklists remove some.
182 | * **blacklistEvents**: optional, default = the primary action. An array of strings corresponding to the actions taken on items, which will cause them to be removed from recommendations. These will have the same values as some user actions - so “purchase” might be best for an ecom application since there is often little need to recommend something the user has already bought. If this is not specified then the primary event is assumed. To blacklist no event, specify an empty array. Note that not all actions are taken on the same items being recommended. For instance every time a user goes to a category page this could be recorded as a category preference so if this event is used in a blacklist it will have no effect, the category and item ids should never match. If you want to filter certain categories, use a field filter and specify all categories allowed.
183 | * **fields**: optional, default = none. An array of default field based query boosts and filters applied to every query. The name = type or field name for metadata stored in the EventStore with $set and $unset events. Values = and array of one or more values to use in any query. The values will be looked for in the field name. Bias will either boost the importance of this part of the query or use it as a filter. Positive biases are boosts any negative number will filter out any results that do not contain the values in the field name.
184 | * **userBias**: optional (use with great care), default = none. Amount to favor user history in creating recommendations, 1 is neutral, and negative number means to use as a filter so the user history must be used in recommendations, any positive number greater than one will boost the importance of user history in recommendations.
185 | * **itemBias**: optional (use with great care), default = none. Same as userbias but applied to similar items to the item supplied in the query.
186 | * **expireDateName**: optional, name of the item properties field that contains the date the item expires or is unavailable to recommend.
187 | * **availableDateName**: optional, name of the item properties field that contains the date the item is available to recommend.
188 | * **dateName**: optional, a date or timestamp used in a `dateRange` recommendations filter.
189 | * **returnSelf**: optional, default = false. Boolean asking to include the item that was part of the query (if there was one) as part of the results. The default is false and this is by far the most common use so this is seldom required.
190 | * **recsModel**: optional, default = "all", which means  collaborative filtering with popular items returned when no other recommendations can be made. Otherwise: "all", "collabFiltering", "backfill". If only "backfill" is specified then the train will create only some backfill type like popular. If only "collabFiltering" then no backfill will be included when there are no other recommendations.
191 | * **backfillField**: optional (use with great care), default: backfillType = popular, eventNames = only the first/primary event in `eventNames`, corresponding to the primary action, duration = 259200, which is the number of seconds in a 3 days. The primary/first event used for recommendations is always attached to items you wish to recommend, the other events are not necessarily attached to the same items. If events like "category-preference" are used then popular categories will be calculated and this will have no effect for backfill. Possible backfillTypes are "popular", "trending", and "hot", which correspond to the number of events in the duration, the average event velocity or the average event acceleration over the time indicated. This is calculated for every event and is used to rank them and so can be used with biasing metadata so you can get, for instance, hot items in some category. **Note**: when using "hot" the algorithm divides the events into three periods and since event tend to be cyclical by day, 3 days will produce results mostly free of daily effects for all types. Making this time period smaller may cause odd effects from time of day the algorithm is executed. Popular is not split and trending splits the events in two. So choose the duration accordingly.
192 | * **seed**: Set this if you want repeatable downsampling for some offline tests. This can be ignored and shouldn't be set in production.
193 | 
194 | ###Queries
195 | 
196 | ####Simple Personalized Query
197 | 
198 | 	{
199 | 	  “user”: “xyz”
200 | 	}
201 | 
202 | This gets all default values from the engine.json and uses only action correlators for the types specified there.
203 | 
204 | ####Simple Similar Items Query
205 | 
206 | 	{
207 | 	  “item”: “53454543513”
208 | 	}
209 | 
210 | This returns items that are similar to the query item, and blacklist and backfill are defaulted to what is in the engine.json
211 | 
212 | ####Full Query Parameters
213 | 
214 | Query fields determine what data is used to match when returning recommendations. Some fields have default values in engine.json and so may never be needed in individual queries. On the other hand all values from engine.json may be overridden or added to in an individual query. The only requirement is that there must be a user or item in every query.
215 | 
216 |     {
217 |       “user”: “xyz”,
218 |       “userBias”: -maxFloat..maxFloat,
219 |       “item”: “53454543513”,
220 |       “itemBias”: -maxFloat..maxFloat,
221 |       “num”: 4,
222 |       "fields”: [
223 |         {
224 |           “name”: ”fieldname”,
225 |           “values”: [“fieldValue1”, ...],
226 |           “bias”: -maxFloat..maxFloat
227 |         }, ...
228 |       ]
229 |       "dateRange": {
230 |         "name": "dateFieldName",
231 |         "beforeDate": "2015-09-15T11:28:45.114-07:00",
232 |         "afterDate": "2015-08-15T11:28:45.114-07:00"
233 |       },
234 |       "currentDate": "2015-08-15T11:28:45.114-07:00",
235 |       “blacklistItems”: [“itemId1”, “itemId2”, ...],
236 |       "returnSelf": true | false,
237 |     }
238 | 
239 | * **user**: optional, contains a unique id for the user. This may be a user not in the **training**: data, so a new or anonymous user who has an anonymous id. All user history captured in near real-time can be used to influence recommendations, there is no need to retrain to enable this.
240 | * **userBias**: optional (use with great care), the amount to favor the user's history in making recommendations. The user may be anonymous as long as the id is unique from any authenticated user. This tells the recommender to return recommendations based on the user’s event history. Used for personalized recommendations. Overrides and bias in engine.json.
241 | * **item**: optional, contains the unique item identifier
242 | * **itemBias**: optional (use with great care), the amount to favor similar items in making recommendations. This tells the recommender to return items similar to this the item specified. Use for “people who liked this also liked these”. Overrides any bias in engine.json
243 | * **fields**: optional, array of fields values and biases to use in this query. The name = type or field name for metadata stored in the EventStore with $set and $unset events. Values = an array on one or more values to use in this query. The values will be looked for in the field name. Bias will either boost the importance of this part of the query or use it as a filter. Positive biases are boosts any negative number will filter out any results that do not contain the values in the field name.
244 | * **num**: optional max number of recommendations to return. There is no guarantee that this number will be returned for every query. Adding backfill in the engine.json will make it much more likely to return this number of recommendations.
245 | * **blacklistItems**: optional. Unlike the engine.json, which specifies event types this part of the query specifies individual items to remove from returned recommendations. It can be used to remove duplicates when items are already shown in a specific context. This is called anti-flood in recommender use.
246 | * **dateRange** optional, default is not range filter. One of the bound can be omitted but not both. Values for the `beforeDate` and `afterDate` are strings in ISO 8601 format. A date range is ignored if **currentDate** is also specified in the query.
247 | * **currentDate** optional, must be specified if used. Overrides the **dateRange** is both are in the query.
248 | * **returnSelf**: optional boolean asking to include the item that was part of the query (if there was one) as part of the results. Defaults to false.
249 | 
250 | All query params are optional, the only rule is that there must be an item or user specified. Defaults are either noted or taken from algorithm values, which themselves may have defaults. This allows very simple queries for the simple, most used cases.
251 | 
252 | The query returns personalized recommendations, similar items, or a mix including backfill. The query itself determines this by supplying item, user or both. Some examples are:
253 | 
254 | ###Contextual Personalized
255 | 
256 | 	{
257 | 	  “user”: “xyz”,
258 | 	  “fields”: [
259 | 	    {
260 | 	      “name”: “categories”
261 | 	      “values”: [“series”, “mini-series”],
262 | 	      “bias”: -1 // filter out all except ‘series’ or ‘mini-series’
263 | 	    },{
264 | 	      “name”: “genre”,
265 | 	      “values”: [“sci-fi”, “detective”],
266 | 	      “bias”: 1.02 // boost/favor recommendations with the `genre’ = `sci-fi` or ‘detective’
267 | 	    }
268 | 	  ]
269 | 	}
270 | 
271 | This returns items based on user "xyz" history filtered by categories and boosted to favor more genre specific items. The values for fields have been attached to items with $set events where the “name” corresponds to a doc field and the “values” correspond to the contents of the field. The “bias” is used to indicate a filter or a boost. For Solr or Elasticsearch the boost is sent as-is to the engine and it’s meaning is determined by the engine (Lucene in either case). As always the blacklist and backfill use the defaults in engine.json.
272 | 
273 | ###Date ranges as query filters
274 | When the a date is stored in the items properties it can be used in a date range query. This is most often used by the app server since it may know what the range is, while a client query may only know the current date and so use the "Current Date" filter below.
275 | 
276 |     {
277 | 	  “user”: “xyz”,
278 | 	  “fields”: [
279 | 	    {
280 | 	      “name”: “categories”,
281 | 	      “values”: [“series”, “mini-series”],
282 | 	      “bias”: -1 // filter out all except ‘series’ or ‘mini-series’
283 | 	    },{
284 | 	      “name”: “genre”,
285 | 	      “values”: [“sci-fi”, “detective”],
286 | 	      “bias”: 1.02 // boost/favor recommendations with the `genre’ = `sci-fi` or ‘detective’
287 | 	    }
288 | 	  ],
289 |       "dateRange": {
290 |         "name": "availabledate",
291 |         "before": "2015-08-15T11:28:45.114-07:00",
292 |         "after": "2015-08-20T11:28:45.114-07:00"
293 |       }
294 | 	}
295 | 
296 | 
297 | Items are assumed to have a field of the same `name` that has a date associated with it using a `$set` event. The query will return only those recommendations where the date field is in range. Either date bound can be omitted for a on-sided range. The range applies to all returned recommendations, even those for popular items.
298 | 
299 | ###Current Date as a query filter
300 | When setting an available date and expire date on items, the current date can be used as a filter, the UR will check that the current date is before the expire date, and after or equal to the available date. You can use either expire date or available date or both. The names of these item fields is specified in the engine.json.
301 | 
302 |     {
303 | 	  “user”: “xyz”,
304 | 	  “fields”: [
305 | 	    {
306 | 	      “name”: “categories”,
307 | 	      “values”: [“series”, “mini-series”],
308 | 	      “bias”: -1 // filter out all except ‘series’ or ‘mini-series’
309 | 	    },{
310 | 	      “name”: “genre”,
311 | 	      “values”: [“sci-fi”, “detective”],
312 | 	      “bias”: 1.02
313 |         }
314 | 	  ],
315 |       "currentDate": "2015-08-15T11:28:45.114-07:00"
316 | 	}
317 | 
318 | ###Contextual Personalized with Similar Items
319 | 
320 | 	{
321 | 	  “user”: “xyz”,
322 | 	  "userBias": 2, // favor personal recommendations
323 | 	  “item”: “53454543513”, // fallback to contextual recommendations
324 | 	  “fields”: [
325 | 	    {
326 | 	      “name”: “categories”,
327 | 	      “values”: [“series”, “mini-series”],
328 | 	      “bias”: -1 // filter out all except ‘series’ or ‘mini-series’
329 | 	    },{
330 | 	      “name”: “genre”,
331 | 	      “values”: [“sci-fi”, “detective”],
332 | 	      “bias”: 1.02 // boost/favor recommendations with the `genre’ = `sci-fi` or ‘detective’
333 | 	    }
334 | 	  ]
335 | 	}
336 | 
337 | This returns items based on user xyz history or similar to item 53454543513 but favoring user history recommendations. These are filtered by categories and boosted to favor more genre specific items.
338 | 
339 | **Note**: This query should be considered **experimental**. Mixing user history with item similarity is possible but may have unexpected results. If you use this you should realize that user and item recommendations may be quite divergent and so mixing the them in query may produce nonsense. Use this only with the engine.json settings for "userbias" and "itembias" to favor one over the other.
340 | 
341 | 
342 | ###Popular Items
343 | 
344 | 	{
345 | 	}
346 | 
347 | This is a simple way to get popular items. All returned scores will be 0 but the order will be based on relative popularity. Field-based biases for boosts and filters can also be applied.
348 | 
349 | ##Events
350 | The Universal takes in potentially many events. These should be seen as a primary event, which is a very clear indication of a user preference and secondary events that we think may tell us something about user "taste" in some way. The Universal Recommender is built on a distributed Correlation Engine so it will test that these secondary events actually relate to the primary one and those that do not correlate will have little or no effect on recommendations (though they will make it longer to train and get query results). It is recommended that you start with one or two events and increase the number as you see how these events effect results and timing.
351 | 
352 | ###Usage Events
353 | 
354 | Events in PredicitonIO are sent to the EventSever in the following form:
355 | 
356 | 	{
357 | 		"event" : "purchase",
358 | 		"entityType" : "user",
359 | 		"entityId" : "1243617",
360 | 		"targetEntityType" : "item",
361 | 		"targetEntityId" : "iPad",
362 | 		"properties" : {},
363 | 		"eventTime" : "2015-10-05T21:02:49.228Z"
364 | 	}
365 | 
366 | This is what a "purchase" event looks like. Note that a usage event **always** is from a user and has a user id. Also the "targetEntityType" is always "item". The actual target entity is implied by the event name. So to create a "category-preference" event you would send something like this:
367 | 
368 | 	{
369 | 		"event" : "category-preference",
370 | 		"entityType" : "user",
371 | 		"entityId" : "1243617",
372 | 		"targetEntityType" : "item",
373 | 		"targetEntityId" : "electronics",
374 | 		"properties" : {},
375 | 		"eventTime" : "2015-10-05T21:02:49.228Z"
376 | 	}
377 | 
378 | This event would be sent when the user clicked on the "electronics" category or perhaps purchased an item that was in the "electronics" category. Note that the "targetEntityType" is always "item".
379 | 
380 | ###Property Change Events
381 | 
382 | To attach properties to items use a $set event like this:
383 | 
384 | 	{
385 | 		"event" : "$set",
386 | 		"entityType" : "item",
387 | 		"entityId" : "ipad",
388 | 		"properties" : {
389 | 			"category": ["electronics", "mobile-phones"],
390 | 			"expireDate": "2016-10-05T21:02:49.228Z",
391 | 			"availableDate": "2015-10-05T21:02:49.228Z"
392 | 		},
393 | 		"eventTime" : "2015-10-05T21:02:49.228Z"
394 | 	}
395 | 
396 | Unless a property has a special meaning specified in the engine.json, like date values, the property is assumed to be an array of strings, which act as categorical tags. You can add things like "premium" to the "tier" property then later if the user is a subscriber you can set a filter that allows recommendations from `"tier": ["free", "premium"]` where a non subscriber might only get recommendations for `"tier": ["free"]`. These are passed in to the query using the `"fields"` parameter (see Contextual queries above).
397 | 
398 | Using properties is how boosts and filters are applied to recommended items. It may seem odd to treat a category as a filter **and** as a secondary event (category-preference) but the two pieces of data are used in quite different ways. As properties they bias the recommendations, when they are events they add to user data that returns recommendations. In other words as properties they work with boost and filter business rules as secondary usage events they show something about user taste to make recommendations better.
399 | 
400 | 
401 | ##Creating a New Model or Adding Event Types
402 | 
403 | To begin using new data with an engine that has been used with sample data or using different events follow these steps:
404 | 
405 | 1. Create a new app name, backup your old `engine.json` and change `appName` in the new `engine.json`
406 | 2. Run `pio app new **your-new-app-name**`
407 | 3. Make any changes to `engine.json` to specify new event names and config values. Make sure `"eventNames": ["**your-primary-event**", "**a-secondary-event**", "**another-secondary-event**", ...]` contains the exact string used for your events and that the primary one is first in the list.
408 | 4. Import new events or allow enough to accumulate into the EventStore. If you are using sample events from a file run `python examples/**your-python-import-script**.py --access_key **your-access-key**` where the key can be retrieved with `pio app list`
409 | 5. Perform `pio build`, `pio train`, and `pio deploy`
410 | 6. Copy and edit the sample query script to match your new data. For new user ids pick a user that exists in the events, same for metadata `fields`, and items.
411 | 7. Run your edited query script and check the recommendations.
412 | 
413 | ##Tests
414 | **Integration test**: Once PIO and all services are running but before any model is deployed, run `./examples/integration-test` This will print a list of differences in the actual results from the expected results, none means the test passed. Not that the model will remain deployed and will have to be deployed over or killed by pid.
415 | 
416 | **Event name restricted query test**: this is for the feature that allows event names to be specified in the query. It restricts the user history that is used to create recommendations and is primarily for use with the MAP@k cross-validation test. The engine config removes the blacklisting of items so it must be used when doing MAP@k calculations. This test uses the simple sample data. Steps to try the test are:
417 | 
418 | 1. start pio and all services
419 | 2. `pio app new handmade`
420 | 3. `python examples/import_handmade.py --access_key <key-from-app-new>`
421 | 4. `cp engine.json engine.json.orig`
422 | 5. `cp event-names-test=engine.json engine.json`
423 | 5. `pio train`
424 | 6. `pio deploy`
425 | 5. `./examples/single-eventNames-query.sh`
426 | 6. restore engine.json
427 | 7. kill the deployed prediction server
428 | 
429 | **MAP@k**: This tests the predictive power of each usage event/indicator. All eventNames used in queries must be removed from the blacklisted events in the engine.json used for a particular dataset. So if `"eventNames": ["purchase","view"]` is in the engine.json for the dataset, these events must be removed from the blacklist with `"blacklist": []`, which tells the engine to not blacklist items with `eventNames` for a user. Allowing blacklisting will artificially lower MAP@k and so not give the desired result.
430 | 
431 | ## Versions
432 | 
433 | ### v0.2.3
434 | 
435 |  - removed isEmpty calls that were taking an extremely long time to execute, results in considerable speedup. Now the vast majority of `pio train` time is taken up by writing to Elasticsearch. This can be optimized by creating and ES cluster or giving ES lots of memory.
436 |  
437 | ### v0.2.2
438 | 
439 |  - a query with no item or user will get recommendations based on popularity
440 |  - a new integration test has been added
441 |  - a regression bug where some ids were being tokenized by Elasticsearch, leading to incorrect results, was fixed. **NOTE: for users with complex ids containing dashes or spaces this is an important fix.**
442 |  - a dateRange in the query now takes precedence to the item attached expiration and available dates.
443 | 
444 | ### v0.2.1
445 | 
446 |  - date ranges attached to items will be compared to the prediction servers current data if no date is provided in the query.
447 | 
448 | ### v0.2.0
449 | 
450 |  - date range filters implemented
451 |  - hot/trending/popular used for backfill and when no other recommendations are returned by the query
452 |  - filters/bias < 0 caused scores to be altered in v0.1.1 fixed in this version so filters have no effect on scoring.
453 |  - the model is now hot-swapped in Elasticsearch so no downtime should be seen, in fact there is no need to run `pio deploy` to make the new model active.
454 |  - it is now possible to have an engine.json (call it something else) dedicated to recalculating the popularity model. This allows fast updates to popularity without recalculating the collaborative filtering model.
455 |  - Elasticsearch can now be in cluster mode
456 | 
457 | ### v0.1.1
458 | 
459 |  - ids are now exact matches, for v0.1.0 the ids had to be lower case and were subject to tokenizing analysis so using that version is not recommended.
460 | 
461 | ### v0.1.0
462 | 
463 |  - user and item based queries supported
464 |  - multiple usage events supported
465 |  - filters and boosts supported on item properties and on user or item based results.
466 |  - fast writing to Elasticsearch using Spark
467 |  - convention over configuration for queries, defaults make simple/typical queries simple and overrides add greater expressiveness.
468 | 
469 | ### Known issues
470 | 
471 |  - see the github [issues list](https://github.com/PredictionIO/template-scala-parallel-universal-recommendation/issues)
472 | 
473 | ## References
474 | 
475 |  * Other documentation of the algorithm is [here](http://mahout.apache.org/users/algorithms/intro-cooccurrence-spark.html)
476 |  * A free ebook, which talks about the general idea: [Practical Machine Learning](https://www.mapr.com/practical-machine-learning).
477 |  * A slide deck, which talks about mixing actions and other correlator types, including content-based ones: [Creating a Unified Recommender](http://www.slideshare.net/pferrel/unified-recommender-39986309?ref=http://occamsmachete.com/ml/)
478 |  * Two blog posts: What's New in Recommenders: part [#1](http://occamsmachete.com/ml/2014/08/11/mahout-on-spark-whats-new-in-recommenders/) [#2](http://occamsmachete.com/ml/2014/09/09/mahout-on-spark-whats-new-in-recommenders-part-2/)
479 |  * A post describing the log-likelihood ratio: [Surprise and Coincidence](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html) LLR is used to reduce noise in the data while keeping the calculations O(n) complexity.
480 |  
481 | #License
482 | This Software is licensed under the Apache Software Foundation version 2 licence found here: http://www.apache.org/licenses/LICENSE-2.0
483 | 


--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "template-scala-parallel-universal-recommendation"
 2 | 
 3 | version := "0.2.3"
 4 | 
 5 | organization := "io.prediction"
 6 | 
 7 | val mahoutVersion = "0.11.1"
 8 | 
 9 | libraryDependencies ++= Seq(
10 |   "io.prediction"    %% "core" % pioVersion.value % "provided",
11 |   "org.apache.spark" %% "spark-core" % "1.3.0" % "provided",
12 |   "org.apache.spark" %% "spark-mllib" % "1.3.0" % "provided",
13 |   "org.xerial.snappy" % "snappy-java" % "1.1.1.7",
14 |   // Mahout's Spark libs
15 |   "org.apache.mahout" %% "mahout-math-scala" % mahoutVersion,
16 |   "org.apache.mahout" %% "mahout-spark" % mahoutVersion
17 |     exclude("org.apache.spark", "spark-core_2.10"),
18 |   "org.apache.mahout"  % "mahout-math" % mahoutVersion,
19 |   "org.apache.mahout"  % "mahout-hdfs" % mahoutVersion
20 |     exclude("com.thoughtworks.xstream", "xstream")
21 |     exclude("org.apache.hadoop", "hadoop-client"),
22 |   // other external libs
23 |   "com.thoughtworks.xstream" % "xstream" % "1.4.4"
24 |     exclude("xmlpull", "xmlpull"),
25 |   "org.elasticsearch" % "elasticsearch-spark_2.10" % "2.1.0.Beta4"
26 |     exclude("org.apache.spark", "spark-catalyst_2.10")
27 |     exclude("org.apache.spark", "spark-sql_2.10"),
28 |   "org.json4s" %% "json4s-native" % "3.2.11"
29 | )
30 | 
31 | resolvers += Resolver.mavenLocal
32 | 
33 | assemblyMergeStrategy in assembly := {
34 |   case "plugin.properties" => MergeStrategy.discard
35 |   case PathList(ps @ _*) if ps.last endsWith "package-info.class" =>
36 |     MergeStrategy.first
37 |   case x =>
38 |     val oldStrategy = (assemblyMergeStrategy in assembly).value
39 |     oldStrategy(x)
40 | }
41 | 


--------------------------------------------------------------------------------
/data/handmade-event-description.txt:
--------------------------------------------------------------------------------
 1 | Event: purchase entity_id: u1 target_entity_id: iphone
 2 | Event: purchase entity_id: u1 target_entity_id: ipad
 3 | Event: purchase entity_id: u2 target_entity_id: nexus
 4 | Event: purchase entity_id: u2 target_entity_id: galaxy
 5 | Event: purchase entity_id: u3 target_entity_id: surface
 6 | Event: purchase entity_id: u4 target_entity_id: iphone
 7 | Event: purchase entity_id: u4 target_entity_id: galaxy
 8 | Event: view entity_id: u1 target_entity_id: phones
 9 | Event: view entity_id: u1 target_entity_id: mobile_acc
10 | Event: view entity_id: u2 target_entity_id: phones
11 | Event: view entity_id: u2 target_entity_id: tablets
12 | Event: view entity_id: u2 target_entity_id: mobile_acc
13 | Event: view entity_id: u3 target_entity_id: mobile_acc
14 | Event: view entity_id: u4 target_entity_id: phones
15 | Event: view entity_id: u4 target_entity_id: tablets
16 | Event: view entity_id: u4 target_entity_id: soap
17 | Event: view entity_id: u5 target_entity_id: soap
18 | Event: $set entity_id: iphone properties/catagory: phones properties/date: 2015-08-30T12:24:41 properties/expiredate: 2015-09-01T12:24:41
19 | Event: $set entity_id: ipad properties/catagory: tablets properties/availabledate: 2015-08-29T12:24:41 properties/date: 2015-08-31T12:24:41 properties/expiredate: 2015-09-02T12:24:41
20 | Event: $set entity_id: nexus properties/catagory: tablets properties/availabledate: 2015-08-30T12:24:41 properties/date: 2015-09-01T12:24:41 properties/expiredate: 2015-09-03T12:24:41
21 | Event: $set entity_id: galaxy properties/catagory: phones properties/date: 2015-09-02T12:24:41 properties/expiredate: 2015-09-04T12:24:41
22 | Event: $set entity_id: surface properties/catagory: tablets properties/availabledate: 2015-09-01T12:24:41 properties/date: 2015-09-03T12:24:41


--------------------------------------------------------------------------------
/data/integration-test-expected.txt:
--------------------------------------------------------------------------------
 1 | 
 2 | Queries to illustrate many use cases on a small standard dataset and for an automated integration test.
 3 | 
 4 | WARNING: for this to produce the correct result you must:
 5 |      $ python examples/import_handmade.py --access_key <your-app-accesskey>
 6 |   2. Copy handmade-engine.json to engine.json.
 7 |   3. Run 'pio build', 'pio train', and 'pio deploy'
 8 |   4. The queries must be run the same day as the import was done because date filters are part of the test.
 9 | NOTE: due to available and expire dates you should never see the Iphone 5 or Iphone 6.
10 | 
11 | ============ simple user recs ============
12 | 
13 | Recommendations for user: u1
14 | 
15 | {"itemScores":[{"item":"Galaxy","score":0.8880454897880554},{"item":"Nexus","score":0.288095086812973},{"item":"Surface","score":0.05261862277984619}]}
16 | 
17 | Recommendations for user: U 2
18 | 
19 | {"itemScores":[{"item":"Iphone 4","score":0.9535322189331055},{"item":"Ipad-retina","score":0.05624813959002495},{"item":"Surface","score":0.029876574873924255}]}
20 | 
21 | Recommendations for user: u-3
22 | 
23 | {"itemScores":[{"item":"Iphone 4","score":0.18315871059894562},{"item":"Galaxy","score":0.18315871059894562},{"item":"Nexus","score":0.18315871059894562},{"item":"Ipad-retina","score":0.07201286405324936}]}
24 | 
25 | Recommendations for user: u-4
26 | 
27 | {"itemScores":[{"item":"Galaxy","score":0.8880454897880554},{"item":"Nexus","score":0.288095086812973},{"item":"Surface","score":0.05261862277984619}]}
28 | 
29 | Recommendations for user: u5
30 | 
31 | {"itemScores":[{"item":"Iphone 4","score":0.800000011920929},{"item":"Galaxy","score":0.800000011920929},{"item":"Nexus","score":0.0},{"item":"Ipad-retina","score":0.0}]}
32 | 
33 | ============ simple similar item recs ============
34 | 
35 | Recommendations for item: Iphone 4
36 | 
37 | {"itemScores":[{"item":"Galaxy","score":0.6802681088447571},{"item":"Nexus","score":0.3414953947067261},{"item":"Ipad-retina","score":0.06429719924926758},{"item":"Surface","score":0.013116735965013504}]}
38 | 
39 | Recommendations for item: Ipad-retina
40 | 
41 | {"itemScores":[{"item":"Iphone 4","score":0.8768638968467712},{"item":"Galaxy","score":0.8768638968467712},{"item":"Nexus","score":0.23742027580738068},{"item":"Surface","score":0.04049335792660713}]}
42 | 
43 | Recommendations for item: Nexus
44 | 
45 | {"itemScores":[{"item":"Iphone 4","score":1.1343861818313599},{"item":"Galaxy","score":0.5476991534233093},{"item":"Ipad-retina","score":0.04849598929286003},{"item":"Surface","score":0.02529095485806465}]}
46 | 
47 | Recommendations for item: Galaxy
48 | 
49 | {"itemScores":[{"item":"Iphone 4","score":0.7461300492286682},{"item":"Nexus","score":0.23864313960075378},{"item":"Ipad-retina","score":0.08540179580450058},{"item":"Surface","score":0.014605122618377209}]}
50 | 
51 | Recommendations for item: Surface
52 | 
53 | {"itemScores":[{"item":"Iphone 4","score":0.4070388376712799},{"item":"Galaxy","score":0.4070388376712799},{"item":"Nexus","score":0.4070388376712799},{"item":"Ipad-retina","score":0.17534448206424713}]}
54 | 
55 | ============ popular item recs only ============
56 | 
57 | query with no item or user id, ordered by popularity
58 | 
59 | {"itemScores":[{"item":"Iphone 4","score":0.0},{"item":"Galaxy","score":0.0},{"item":"Nexus","score":0.0},{"item":"Ipad-retina","score":0.0}]}
60 | 
61 | Recommendations for non-existant user: xyz, all from popularity
62 | 
63 | {"itemScores":[{"item":"Iphone 4","score":0.0},{"item":"Galaxy","score":0.0},{"item":"Nexus","score":0.0},{"item":"Ipad-retina","score":0.0}]}
64 | 
65 | Recommendations for non-existant item: xyz, all from popularity
66 | 
67 | {"itemScores":[{"item":"Iphone 4","score":0.0},{"item":"Galaxy","score":0.0},{"item":"Nexus","score":0.0},{"item":"Ipad-retina","score":0.0}]}
68 | 
69 | Recommendations for no user no item, all from popularity, Tablets filter
70 | 
71 | {"itemScores":[{"item":"Nexus","score":0.0},{"item":"Ipad-retina","score":0.0},{"item":"Surface","score":0.0}]}
72 | 
73 | Recommendations for no user no item, all from popularity, Tablets boost
74 | 
75 | {"itemScores":[{"item":"Nexus","score":0.9369767904281616},{"item":"Surface","score":0.6666666865348816},{"item":"Ipad-retina","score":0.204568549990654},{"item":"Iphone 4","score":0.0}]}
76 | 
77 | Recommendations for no user no item, all from popularity, Tablets boost, Estados Unidos Mexicanos filter
78 | 
79 | {"itemScores":[{"item":"Ipad-retina","score":0.21918058395385742},{"item":"Iphone 4","score":0.0}]}
80 | 
81 | ============ dateRange filter ============
82 | 
83 | Recommendations for user: u1
84 | 
85 | {"itemScores":[{"item":"Surface","score":0.049329958856105804}]}
86 | 
87 | ============ query with item and user *EXPERIMENTAL* ============
88 | 
89 | Recommendations for no user no item, all from popularity, Tablets boost, Estados Unidos Mexicanos filter
90 | 
91 | {"itemScores":[{"item":"Galaxy","score":1.3019405603408813},{"item":"Nexus","score":0.5091257691383362},{"item":"Surface","score":0.04033438116312027}]}
92 | 


--------------------------------------------------------------------------------
/data/sample-handmade-data.txt:
--------------------------------------------------------------------------------
 1 | u1,purchase,Iphone 6
 2 | u1,purchase,Iphone 5
 3 | u1,purchase,Iphone 4
 4 | u1,purchase,Ipad-retina
 5 | U 2,purchase,Nexus
 6 | U 2,purchase,Galaxy
 7 | u-3,purchase,Surface
 8 | u-4,purchase,Iphone 5
 9 | u-4,purchase,Iphone 4
10 | u-4,purchase,Galaxy
11 | u1,view,Phones
12 | u1,view,Mobile-acc
13 | U 2,view,Phones
14 | U 2,view,Tablets
15 | U 2,view,Mobile-acc
16 | u-3,view,Mobile-acc
17 | u-4,view,Phones
18 | u-4,view,Tablets
19 | u-4,view,Soap
20 | u5,view,Soap
21 | Iphone 6,$set,categories:Phones:Electronics:Apple
22 | Iphone 5,$set,categories:Phones:Electronics:Apple
23 | Iphone 4,$set,categories:Phones:Electronics:Apple
24 | Ipad-retina,$set,categories:Tablets:Electronics:Apple
25 | Nexus,$set,categories:Tablets:Electronics:Google
26 | Galaxy,$set,categories:Phones:Electronics:Samsung
27 | Surface,$set,categories:Tablets:Electronics:Microsoft
28 | Iphone 4,$set,countries:United States:Canada:Estados Unidos Mexicanos
29 | Ipad-retina,$set,countries:United States:Estados Unidos Mexicanos
30 | Nexus,$set,countries:United States:Canada
31 | Galaxy,$set,countries:United States
32 | Surface,$set,countries:United States:Canada
33 | 


--------------------------------------------------------------------------------
/data/sample_movielens_data.txt:
--------------------------------------------------------------------------------
   1 | 0::2::3
   2 | 0::3::1
   3 | 0::5::2
   4 | 0::9::4
   5 | 0::11::1
   6 | 0::12::2
   7 | 0::15::1
   8 | 0::17::1
   9 | 0::19::1
  10 | 0::21::1
  11 | 0::23::1
  12 | 0::26::3
  13 | 0::27::1
  14 | 0::28::1
  15 | 0::29::1
  16 | 0::30::1
  17 | 0::31::1
  18 | 0::34::1
  19 | 0::37::1
  20 | 0::41::2
  21 | 0::44::1
  22 | 0::45::2
  23 | 0::46::1
  24 | 0::47::1
  25 | 0::48::1
  26 | 0::50::1
  27 | 0::51::1
  28 | 0::54::1
  29 | 0::55::1
  30 | 0::59::2
  31 | 0::61::2
  32 | 0::64::1
  33 | 0::67::1
  34 | 0::68::1
  35 | 0::69::1
  36 | 0::71::1
  37 | 0::72::1
  38 | 0::77::2
  39 | 0::79::1
  40 | 0::83::1
  41 | 0::87::1
  42 | 0::89::2
  43 | 0::91::3
  44 | 0::92::4
  45 | 0::94::1
  46 | 0::95::2
  47 | 0::96::1
  48 | 0::98::1
  49 | 0::99::1
  50 | 1::2::2
  51 | 1::3::1
  52 | 1::4::2
  53 | 1::6::1
  54 | 1::9::3
  55 | 1::12::1
  56 | 1::13::1
  57 | 1::14::1
  58 | 1::16::1
  59 | 1::19::1
  60 | 1::21::3
  61 | 1::27::1
  62 | 1::28::3
  63 | 1::33::1
  64 | 1::36::2
  65 | 1::37::1
  66 | 1::40::1
  67 | 1::41::2
  68 | 1::43::1
  69 | 1::44::1
  70 | 1::47::1
  71 | 1::50::1
  72 | 1::54::1
  73 | 1::56::2
  74 | 1::57::1
  75 | 1::58::1
  76 | 1::60::1
  77 | 1::62::4
  78 | 1::63::1
  79 | 1::67::1
  80 | 1::68::4
  81 | 1::70::2
  82 | 1::72::1
  83 | 1::73::1
  84 | 1::74::2
  85 | 1::76::1
  86 | 1::77::3
  87 | 1::78::1
  88 | 1::81::1
  89 | 1::82::1
  90 | 1::85::3
  91 | 1::86::2
  92 | 1::88::2
  93 | 1::91::1
  94 | 1::92::2
  95 | 1::93::1
  96 | 1::94::2
  97 | 1::96::1
  98 | 1::97::1
  99 | 2::4::3
 100 | 2::6::1
 101 | 2::8::5
 102 | 2::9::1
 103 | 2::10::1
 104 | 2::12::3
 105 | 2::13::1
 106 | 2::15::2
 107 | 2::18::2
 108 | 2::19::4
 109 | 2::22::1
 110 | 2::26::1
 111 | 2::28::1
 112 | 2::34::4
 113 | 2::35::1
 114 | 2::37::5
 115 | 2::38::1
 116 | 2::39::5
 117 | 2::40::4
 118 | 2::47::1
 119 | 2::50::1
 120 | 2::52::2
 121 | 2::54::1
 122 | 2::55::1
 123 | 2::57::2
 124 | 2::58::2
 125 | 2::59::1
 126 | 2::61::1
 127 | 2::62::1
 128 | 2::64::1
 129 | 2::65::1
 130 | 2::66::3
 131 | 2::68::1
 132 | 2::71::3
 133 | 2::76::1
 134 | 2::77::1
 135 | 2::78::1
 136 | 2::80::1
 137 | 2::83::5
 138 | 2::85::1
 139 | 2::87::2
 140 | 2::88::1
 141 | 2::89::4
 142 | 2::90::1
 143 | 2::92::4
 144 | 2::93::5
 145 | 3::0::1
 146 | 3::1::1
 147 | 3::2::1
 148 | 3::7::3
 149 | 3::8::3
 150 | 3::9::1
 151 | 3::14::1
 152 | 3::15::1
 153 | 3::16::1
 154 | 3::18::4
 155 | 3::19::1
 156 | 3::24::3
 157 | 3::26::1
 158 | 3::29::3
 159 | 3::33::1
 160 | 3::34::3
 161 | 3::35::1
 162 | 3::36::3
 163 | 3::37::1
 164 | 3::38::2
 165 | 3::43::1
 166 | 3::44::1
 167 | 3::46::1
 168 | 3::47::1
 169 | 3::51::5
 170 | 3::52::3
 171 | 3::56::1
 172 | 3::58::1
 173 | 3::60::3
 174 | 3::62::1
 175 | 3::65::2
 176 | 3::66::1
 177 | 3::67::1
 178 | 3::68::2
 179 | 3::70::1
 180 | 3::72::2
 181 | 3::76::3
 182 | 3::79::3
 183 | 3::80::4
 184 | 3::81::1
 185 | 3::83::1
 186 | 3::84::1
 187 | 3::86::1
 188 | 3::87::2
 189 | 3::88::4
 190 | 3::89::1
 191 | 3::91::1
 192 | 3::94::3
 193 | 4::1::1
 194 | 4::6::1
 195 | 4::8::1
 196 | 4::9::1
 197 | 4::10::1
 198 | 4::11::1
 199 | 4::12::1
 200 | 4::13::1
 201 | 4::14::2
 202 | 4::15::1
 203 | 4::17::1
 204 | 4::20::1
 205 | 4::22::1
 206 | 4::23::1
 207 | 4::24::1
 208 | 4::29::4
 209 | 4::30::1
 210 | 4::31::1
 211 | 4::34::1
 212 | 4::35::1
 213 | 4::36::1
 214 | 4::39::2
 215 | 4::40::3
 216 | 4::41::4
 217 | 4::43::2
 218 | 4::44::1
 219 | 4::45::1
 220 | 4::46::1
 221 | 4::47::1
 222 | 4::49::2
 223 | 4::50::1
 224 | 4::51::1
 225 | 4::52::4
 226 | 4::54::1
 227 | 4::55::1
 228 | 4::60::3
 229 | 4::61::1
 230 | 4::62::4
 231 | 4::63::3
 232 | 4::65::1
 233 | 4::67::2
 234 | 4::69::1
 235 | 4::70::4
 236 | 4::71::1
 237 | 4::73::1
 238 | 4::78::1
 239 | 4::84::1
 240 | 4::85::1
 241 | 4::87::3
 242 | 4::88::3
 243 | 4::89::2
 244 | 4::96::1
 245 | 4::97::1
 246 | 4::98::1
 247 | 4::99::1
 248 | 5::0::1
 249 | 5::1::1
 250 | 5::4::1
 251 | 5::5::1
 252 | 5::8::1
 253 | 5::9::3
 254 | 5::10::2
 255 | 5::13::3
 256 | 5::15::1
 257 | 5::19::1
 258 | 5::20::3
 259 | 5::21::2
 260 | 5::23::3
 261 | 5::27::1
 262 | 5::28::1
 263 | 5::29::1
 264 | 5::31::1
 265 | 5::36::3
 266 | 5::38::2
 267 | 5::39::1
 268 | 5::42::1
 269 | 5::48::3
 270 | 5::49::4
 271 | 5::50::3
 272 | 5::51::1
 273 | 5::52::1
 274 | 5::54::1
 275 | 5::55::5
 276 | 5::56::3
 277 | 5::58::1
 278 | 5::60::1
 279 | 5::61::1
 280 | 5::64::3
 281 | 5::65::2
 282 | 5::68::4
 283 | 5::70::1
 284 | 5::71::1
 285 | 5::72::1
 286 | 5::74::1
 287 | 5::79::1
 288 | 5::81::2
 289 | 5::84::1
 290 | 5::85::1
 291 | 5::86::1
 292 | 5::88::1
 293 | 5::90::4
 294 | 5::91::2
 295 | 5::95::2
 296 | 5::99::1
 297 | 6::0::1
 298 | 6::1::1
 299 | 6::2::3
 300 | 6::5::1
 301 | 6::6::1
 302 | 6::9::1
 303 | 6::10::1
 304 | 6::15::2
 305 | 6::16::2
 306 | 6::17::1
 307 | 6::18::1
 308 | 6::20::1
 309 | 6::21::1
 310 | 6::22::1
 311 | 6::24::1
 312 | 6::25::5
 313 | 6::26::1
 314 | 6::28::1
 315 | 6::30::1
 316 | 6::33::1
 317 | 6::38::1
 318 | 6::39::1
 319 | 6::43::4
 320 | 6::44::1
 321 | 6::45::1
 322 | 6::48::1
 323 | 6::49::1
 324 | 6::50::1
 325 | 6::53::1
 326 | 6::54::1
 327 | 6::55::1
 328 | 6::56::1
 329 | 6::58::4
 330 | 6::59::1
 331 | 6::60::1
 332 | 6::61::3
 333 | 6::63::3
 334 | 6::66::1
 335 | 6::67::3
 336 | 6::68::1
 337 | 6::69::1
 338 | 6::71::2
 339 | 6::73::1
 340 | 6::75::1
 341 | 6::77::1
 342 | 6::79::1
 343 | 6::81::1
 344 | 6::84::1
 345 | 6::85::3
 346 | 6::86::1
 347 | 6::87::1
 348 | 6::88::1
 349 | 6::89::1
 350 | 6::91::2
 351 | 6::94::1
 352 | 6::95::2
 353 | 6::96::1
 354 | 7::1::1
 355 | 7::2::2
 356 | 7::3::1
 357 | 7::4::1
 358 | 7::7::1
 359 | 7::10::1
 360 | 7::11::2
 361 | 7::14::2
 362 | 7::15::1
 363 | 7::16::1
 364 | 7::18::1
 365 | 7::21::1
 366 | 7::22::1
 367 | 7::23::1
 368 | 7::25::5
 369 | 7::26::1
 370 | 7::29::4
 371 | 7::30::1
 372 | 7::31::3
 373 | 7::32::1
 374 | 7::33::1
 375 | 7::35::1
 376 | 7::37::2
 377 | 7::39::3
 378 | 7::40::2
 379 | 7::42::2
 380 | 7::44::1
 381 | 7::45::2
 382 | 7::47::4
 383 | 7::48::1
 384 | 7::49::1
 385 | 7::53::1
 386 | 7::54::1
 387 | 7::55::1
 388 | 7::56::1
 389 | 7::59::1
 390 | 7::61::2
 391 | 7::62::3
 392 | 7::63::2
 393 | 7::66::1
 394 | 7::67::3
 395 | 7::74::1
 396 | 7::75::1
 397 | 7::76::3
 398 | 7::77::1
 399 | 7::81::1
 400 | 7::82::1
 401 | 7::84::2
 402 | 7::85::4
 403 | 7::86::1
 404 | 7::92::2
 405 | 7::96::1
 406 | 7::97::1
 407 | 7::98::1
 408 | 8::0::1
 409 | 8::2::4
 410 | 8::3::2
 411 | 8::4::2
 412 | 8::5::1
 413 | 8::7::1
 414 | 8::9::1
 415 | 8::11::1
 416 | 8::15::1
 417 | 8::18::1
 418 | 8::19::1
 419 | 8::21::1
 420 | 8::29::5
 421 | 8::31::3
 422 | 8::33::1
 423 | 8::35::1
 424 | 8::36::1
 425 | 8::40::2
 426 | 8::44::1
 427 | 8::45::1
 428 | 8::50::1
 429 | 8::51::1
 430 | 8::52::5
 431 | 8::53::5
 432 | 8::54::1
 433 | 8::55::1
 434 | 8::56::1
 435 | 8::58::4
 436 | 8::60::3
 437 | 8::62::4
 438 | 8::64::1
 439 | 8::67::3
 440 | 8::69::1
 441 | 8::71::1
 442 | 8::72::3
 443 | 8::77::3
 444 | 8::78::1
 445 | 8::79::1
 446 | 8::83::1
 447 | 8::85::5
 448 | 8::86::1
 449 | 8::88::1
 450 | 8::90::1
 451 | 8::92::2
 452 | 8::95::4
 453 | 8::96::3
 454 | 8::97::1
 455 | 8::98::1
 456 | 8::99::1
 457 | 9::2::3
 458 | 9::3::1
 459 | 9::4::1
 460 | 9::5::1
 461 | 9::6::1
 462 | 9::7::5
 463 | 9::9::1
 464 | 9::12::1
 465 | 9::14::3
 466 | 9::15::1
 467 | 9::19::1
 468 | 9::21::1
 469 | 9::22::1
 470 | 9::24::1
 471 | 9::25::1
 472 | 9::26::1
 473 | 9::30::3
 474 | 9::32::4
 475 | 9::35::2
 476 | 9::36::2
 477 | 9::37::2
 478 | 9::38::1
 479 | 9::39::1
 480 | 9::43::3
 481 | 9::49::5
 482 | 9::50::3
 483 | 9::53::1
 484 | 9::54::1
 485 | 9::58::1
 486 | 9::59::1
 487 | 9::60::1
 488 | 9::61::1
 489 | 9::63::3
 490 | 9::64::3
 491 | 9::68::1
 492 | 9::69::1
 493 | 9::70::3
 494 | 9::71::1
 495 | 9::73::2
 496 | 9::75::1
 497 | 9::77::2
 498 | 9::81::2
 499 | 9::82::1
 500 | 9::83::1
 501 | 9::84::1
 502 | 9::86::1
 503 | 9::87::4
 504 | 9::88::1
 505 | 9::90::3
 506 | 9::94::2
 507 | 9::95::3
 508 | 9::97::2
 509 | 9::98::1
 510 | 10::0::3
 511 | 10::2::4
 512 | 10::4::3
 513 | 10::7::1
 514 | 10::8::1
 515 | 10::10::1
 516 | 10::13::2
 517 | 10::14::1
 518 | 10::16::2
 519 | 10::17::1
 520 | 10::18::1
 521 | 10::21::1
 522 | 10::22::1
 523 | 10::24::1
 524 | 10::25::3
 525 | 10::28::1
 526 | 10::35::1
 527 | 10::36::1
 528 | 10::37::1
 529 | 10::38::1
 530 | 10::39::1
 531 | 10::40::4
 532 | 10::41::2
 533 | 10::42::3
 534 | 10::43::1
 535 | 10::49::3
 536 | 10::50::1
 537 | 10::51::1
 538 | 10::52::1
 539 | 10::55::2
 540 | 10::56::1
 541 | 10::58::1
 542 | 10::63::1
 543 | 10::66::1
 544 | 10::67::2
 545 | 10::68::1
 546 | 10::75::1
 547 | 10::77::1
 548 | 10::79::1
 549 | 10::86::1
 550 | 10::89::3
 551 | 10::90::1
 552 | 10::97::1
 553 | 10::98::1
 554 | 11::0::1
 555 | 11::6::2
 556 | 11::9::1
 557 | 11::10::1
 558 | 11::11::1
 559 | 11::12::1
 560 | 11::13::4
 561 | 11::16::1
 562 | 11::18::5
 563 | 11::19::4
 564 | 11::20::1
 565 | 11::21::1
 566 | 11::22::1
 567 | 11::23::5
 568 | 11::25::1
 569 | 11::27::5
 570 | 11::30::5
 571 | 11::32::5
 572 | 11::35::3
 573 | 11::36::2
 574 | 11::37::2
 575 | 11::38::4
 576 | 11::39::1
 577 | 11::40::1
 578 | 11::41::1
 579 | 11::43::2
 580 | 11::45::1
 581 | 11::47::1
 582 | 11::48::5
 583 | 11::50::4
 584 | 11::51::3
 585 | 11::59::1
 586 | 11::61::1
 587 | 11::62::1
 588 | 11::64::1
 589 | 11::66::4
 590 | 11::67::1
 591 | 11::69::5
 592 | 11::70::1
 593 | 11::71::3
 594 | 11::72::3
 595 | 11::75::3
 596 | 11::76::1
 597 | 11::77::1
 598 | 11::78::1
 599 | 11::79::5
 600 | 11::80::3
 601 | 11::81::4
 602 | 11::82::1
 603 | 11::86::1
 604 | 11::88::1
 605 | 11::89::1
 606 | 11::90::4
 607 | 11::94::2
 608 | 11::97::3
 609 | 11::99::1
 610 | 12::2::1
 611 | 12::4::1
 612 | 12::6::1
 613 | 12::7::3
 614 | 12::8::1
 615 | 12::14::1
 616 | 12::15::2
 617 | 12::16::4
 618 | 12::17::5
 619 | 12::18::2
 620 | 12::21::1
 621 | 12::22::2
 622 | 12::23::3
 623 | 12::24::1
 624 | 12::25::1
 625 | 12::27::5
 626 | 12::30::2
 627 | 12::31::4
 628 | 12::35::5
 629 | 12::38::1
 630 | 12::41::1
 631 | 12::44::2
 632 | 12::45::1
 633 | 12::50::4
 634 | 12::51::1
 635 | 12::52::1
 636 | 12::53::1
 637 | 12::54::1
 638 | 12::56::2
 639 | 12::57::1
 640 | 12::60::1
 641 | 12::63::1
 642 | 12::64::5
 643 | 12::66::3
 644 | 12::67::1
 645 | 12::70::1
 646 | 12::72::1
 647 | 12::74::1
 648 | 12::75::1
 649 | 12::77::1
 650 | 12::78::1
 651 | 12::79::3
 652 | 12::82::2
 653 | 12::83::1
 654 | 12::84::1
 655 | 12::85::1
 656 | 12::86::1
 657 | 12::87::1
 658 | 12::88::1
 659 | 12::91::3
 660 | 12::92::1
 661 | 12::94::4
 662 | 12::95::2
 663 | 12::96::1
 664 | 12::98::2
 665 | 13::0::1
 666 | 13::3::1
 667 | 13::4::2
 668 | 13::5::1
 669 | 13::6::1
 670 | 13::12::1
 671 | 13::14::2
 672 | 13::15::1
 673 | 13::17::1
 674 | 13::18::3
 675 | 13::20::1
 676 | 13::21::1
 677 | 13::22::1
 678 | 13::26::1
 679 | 13::27::1
 680 | 13::29::3
 681 | 13::31::1
 682 | 13::33::1
 683 | 13::40::2
 684 | 13::43::2
 685 | 13::44::1
 686 | 13::45::1
 687 | 13::49::1
 688 | 13::51::1
 689 | 13::52::2
 690 | 13::53::3
 691 | 13::54::1
 692 | 13::62::1
 693 | 13::63::2
 694 | 13::64::1
 695 | 13::68::1
 696 | 13::71::1
 697 | 13::72::3
 698 | 13::73::1
 699 | 13::74::3
 700 | 13::77::2
 701 | 13::78::1
 702 | 13::79::2
 703 | 13::83::3
 704 | 13::85::1
 705 | 13::86::1
 706 | 13::87::2
 707 | 13::88::2
 708 | 13::90::1
 709 | 13::93::4
 710 | 13::94::1
 711 | 13::98::1
 712 | 13::99::1
 713 | 14::1::1
 714 | 14::3::3
 715 | 14::4::1
 716 | 14::5::1
 717 | 14::6::1
 718 | 14::7::1
 719 | 14::9::1
 720 | 14::10::1
 721 | 14::11::1
 722 | 14::12::1
 723 | 14::13::1
 724 | 14::14::3
 725 | 14::15::1
 726 | 14::16::1
 727 | 14::17::1
 728 | 14::20::1
 729 | 14::21::1
 730 | 14::24::1
 731 | 14::25::2
 732 | 14::27::1
 733 | 14::28::1
 734 | 14::29::5
 735 | 14::31::3
 736 | 14::34::1
 737 | 14::36::1
 738 | 14::37::2
 739 | 14::39::2
 740 | 14::40::1
 741 | 14::44::1
 742 | 14::45::1
 743 | 14::47::3
 744 | 14::48::1
 745 | 14::49::1
 746 | 14::51::1
 747 | 14::52::5
 748 | 14::53::3
 749 | 14::54::1
 750 | 14::55::1
 751 | 14::56::1
 752 | 14::62::4
 753 | 14::63::5
 754 | 14::67::3
 755 | 14::68::1
 756 | 14::69::3
 757 | 14::71::1
 758 | 14::72::4
 759 | 14::73::1
 760 | 14::76::5
 761 | 14::79::1
 762 | 14::82::1
 763 | 14::83::1
 764 | 14::88::1
 765 | 14::93::3
 766 | 14::94::1
 767 | 14::95::2
 768 | 14::96::4
 769 | 14::98::1
 770 | 15::0::1
 771 | 15::1::4
 772 | 15::2::1
 773 | 15::5::2
 774 | 15::6::1
 775 | 15::7::1
 776 | 15::13::1
 777 | 15::14::1
 778 | 15::15::1
 779 | 15::17::2
 780 | 15::19::2
 781 | 15::22::2
 782 | 15::23::2
 783 | 15::25::1
 784 | 15::26::3
 785 | 15::27::1
 786 | 15::28::2
 787 | 15::29::1
 788 | 15::32::1
 789 | 15::33::2
 790 | 15::34::1
 791 | 15::35::2
 792 | 15::36::1
 793 | 15::37::1
 794 | 15::39::1
 795 | 15::42::1
 796 | 15::46::5
 797 | 15::48::2
 798 | 15::50::2
 799 | 15::51::1
 800 | 15::52::1
 801 | 15::58::1
 802 | 15::62::1
 803 | 15::64::3
 804 | 15::65::2
 805 | 15::72::1
 806 | 15::73::1
 807 | 15::74::1
 808 | 15::79::1
 809 | 15::80::1
 810 | 15::81::1
 811 | 15::82::2
 812 | 15::85::1
 813 | 15::87::1
 814 | 15::91::2
 815 | 15::96::1
 816 | 15::97::1
 817 | 15::98::3
 818 | 16::2::1
 819 | 16::5::3
 820 | 16::6::2
 821 | 16::7::1
 822 | 16::9::1
 823 | 16::12::1
 824 | 16::14::1
 825 | 16::15::1
 826 | 16::19::1
 827 | 16::21::2
 828 | 16::29::4
 829 | 16::30::2
 830 | 16::32::1
 831 | 16::34::1
 832 | 16::36::1
 833 | 16::38::1
 834 | 16::46::1
 835 | 16::47::3
 836 | 16::48::1
 837 | 16::49::1
 838 | 16::50::1
 839 | 16::51::5
 840 | 16::54::5
 841 | 16::55::1
 842 | 16::56::2
 843 | 16::57::1
 844 | 16::60::1
 845 | 16::63::2
 846 | 16::65::1
 847 | 16::67::1
 848 | 16::72::1
 849 | 16::74::1
 850 | 16::80::1
 851 | 16::81::1
 852 | 16::82::1
 853 | 16::85::5
 854 | 16::86::1
 855 | 16::90::5
 856 | 16::91::1
 857 | 16::93::1
 858 | 16::94::3
 859 | 16::95::2
 860 | 16::96::3
 861 | 16::98::3
 862 | 16::99::1
 863 | 17::2::1
 864 | 17::3::1
 865 | 17::6::1
 866 | 17::10::4
 867 | 17::11::1
 868 | 17::13::2
 869 | 17::17::5
 870 | 17::19::1
 871 | 17::20::5
 872 | 17::22::4
 873 | 17::28::1
 874 | 17::29::1
 875 | 17::33::1
 876 | 17::34::1
 877 | 17::35::2
 878 | 17::37::1
 879 | 17::38::1
 880 | 17::45::1
 881 | 17::46::5
 882 | 17::47::1
 883 | 17::49::3
 884 | 17::51::1
 885 | 17::55::5
 886 | 17::56::3
 887 | 17::57::1
 888 | 17::58::1
 889 | 17::59::1
 890 | 17::60::1
 891 | 17::63::1
 892 | 17::66::1
 893 | 17::68::4
 894 | 17::69::1
 895 | 17::70::1
 896 | 17::72::1
 897 | 17::73::3
 898 | 17::78::1
 899 | 17::79::1
 900 | 17::82::2
 901 | 17::84::1
 902 | 17::90::5
 903 | 17::91::3
 904 | 17::92::1
 905 | 17::93::1
 906 | 17::94::4
 907 | 17::95::2
 908 | 17::97::1
 909 | 18::1::1
 910 | 18::4::3
 911 | 18::5::2
 912 | 18::6::1
 913 | 18::7::1
 914 | 18::10::1
 915 | 18::11::4
 916 | 18::12::2
 917 | 18::13::1
 918 | 18::15::1
 919 | 18::18::1
 920 | 18::20::1
 921 | 18::21::2
 922 | 18::22::1
 923 | 18::23::2
 924 | 18::25::1
 925 | 18::26::1
 926 | 18::27::1
 927 | 18::28::5
 928 | 18::29::1
 929 | 18::31::1
 930 | 18::32::1
 931 | 18::36::1
 932 | 18::38::5
 933 | 18::39::5
 934 | 18::40::1
 935 | 18::42::1
 936 | 18::43::1
 937 | 18::44::4
 938 | 18::46::1
 939 | 18::47::1
 940 | 18::48::1
 941 | 18::51::2
 942 | 18::55::1
 943 | 18::56::1
 944 | 18::57::1
 945 | 18::62::1
 946 | 18::63::1
 947 | 18::66::3
 948 | 18::67::1
 949 | 18::70::1
 950 | 18::75::1
 951 | 18::76::3
 952 | 18::77::1
 953 | 18::80::3
 954 | 18::81::3
 955 | 18::82::1
 956 | 18::83::5
 957 | 18::84::1
 958 | 18::97::1
 959 | 18::98::1
 960 | 18::99::2
 961 | 19::0::1
 962 | 19::1::1
 963 | 19::2::1
 964 | 19::4::1
 965 | 19::6::2
 966 | 19::11::1
 967 | 19::12::1
 968 | 19::14::1
 969 | 19::23::1
 970 | 19::26::1
 971 | 19::31::1
 972 | 19::32::4
 973 | 19::33::1
 974 | 19::34::1
 975 | 19::37::1
 976 | 19::38::1
 977 | 19::41::1
 978 | 19::43::1
 979 | 19::45::1
 980 | 19::48::1
 981 | 19::49::1
 982 | 19::50::2
 983 | 19::53::2
 984 | 19::54::3
 985 | 19::55::1
 986 | 19::56::2
 987 | 19::58::1
 988 | 19::61::1
 989 | 19::62::1
 990 | 19::63::1
 991 | 19::64::1
 992 | 19::65::1
 993 | 19::69::2
 994 | 19::72::1
 995 | 19::74::3
 996 | 19::76::1
 997 | 19::78::1
 998 | 19::79::1
 999 | 19::81::1
1000 | 19::82::1
1001 | 19::84::1
1002 | 19::86::1
1003 | 19::87::2
1004 | 19::90::4
1005 | 19::93::1
1006 | 19::94::4
1007 | 19::95::2
1008 | 19::96::1
1009 | 19::98::4
1010 | 20::0::1
1011 | 20::1::1
1012 | 20::2::2
1013 | 20::4::2
1014 | 20::6::1
1015 | 20::8::1
1016 | 20::12::1
1017 | 20::21::2
1018 | 20::22::5
1019 | 20::24::2
1020 | 20::25::1
1021 | 20::26::1
1022 | 20::29::2
1023 | 20::30::2
1024 | 20::32::2
1025 | 20::39::1
1026 | 20::40::1
1027 | 20::41::2
1028 | 20::45::2
1029 | 20::48::1
1030 | 20::50::1
1031 | 20::51::3
1032 | 20::53::3
1033 | 20::55::1
1034 | 20::57::2
1035 | 20::60::1
1036 | 20::61::1
1037 | 20::64::1
1038 | 20::66::1
1039 | 20::70::2
1040 | 20::72::1
1041 | 20::73::2
1042 | 20::75::4
1043 | 20::76::1
1044 | 20::77::4
1045 | 20::78::1
1046 | 20::79::1
1047 | 20::84::2
1048 | 20::85::2
1049 | 20::88::3
1050 | 20::89::1
1051 | 20::90::3
1052 | 20::91::1
1053 | 20::92::2
1054 | 20::93::1
1055 | 20::94::4
1056 | 20::97::1
1057 | 21::0::1
1058 | 21::2::4
1059 | 21::3::1
1060 | 21::7::2
1061 | 21::11::1
1062 | 21::12::1
1063 | 21::13::1
1064 | 21::14::3
1065 | 21::17::1
1066 | 21::19::1
1067 | 21::20::1
1068 | 21::21::1
1069 | 21::22::1
1070 | 21::23::1
1071 | 21::24::1
1072 | 21::27::1
1073 | 21::29::5
1074 | 21::30::2
1075 | 21::38::1
1076 | 21::40::2
1077 | 21::43::3
1078 | 21::44::1
1079 | 21::45::1
1080 | 21::46::1
1081 | 21::48::1
1082 | 21::51::1
1083 | 21::53::5
1084 | 21::54::1
1085 | 21::55::1
1086 | 21::56::1
1087 | 21::58::3
1088 | 21::59::3
1089 | 21::64::1
1090 | 21::66::1
1091 | 21::68::1
1092 | 21::71::1
1093 | 21::73::1
1094 | 21::74::4
1095 | 21::80::1
1096 | 21::81::1
1097 | 21::83::1
1098 | 21::84::1
1099 | 21::85::3
1100 | 21::87::4
1101 | 21::89::2
1102 | 21::92::2
1103 | 21::96::3
1104 | 21::99::1
1105 | 22::0::1
1106 | 22::3::2
1107 | 22::5::2
1108 | 22::6::2
1109 | 22::9::1
1110 | 22::10::1
1111 | 22::11::1
1112 | 22::13::1
1113 | 22::14::1
1114 | 22::16::1
1115 | 22::18::3
1116 | 22::19::1
1117 | 22::22::5
1118 | 22::25::1
1119 | 22::26::1
1120 | 22::29::3
1121 | 22::30::5
1122 | 22::32::4
1123 | 22::33::1
1124 | 22::35::1
1125 | 22::36::3
1126 | 22::37::1
1127 | 22::40::1
1128 | 22::41::3
1129 | 22::44::1
1130 | 22::45::2
1131 | 22::48::1
1132 | 22::51::5
1133 | 22::55::1
1134 | 22::56::2
1135 | 22::60::3
1136 | 22::61::1
1137 | 22::62::4
1138 | 22::63::1
1139 | 22::65::1
1140 | 22::66::1
1141 | 22::68::4
1142 | 22::69::4
1143 | 22::70::3
1144 | 22::71::1
1145 | 22::74::5
1146 | 22::75::5
1147 | 22::78::1
1148 | 22::80::3
1149 | 22::81::1
1150 | 22::82::1
1151 | 22::84::1
1152 | 22::86::1
1153 | 22::87::3
1154 | 22::88::5
1155 | 22::90::2
1156 | 22::92::3
1157 | 22::95::2
1158 | 22::96::2
1159 | 22::98::4
1160 | 22::99::1
1161 | 23::0::1
1162 | 23::2::1
1163 | 23::4::1
1164 | 23::6::2
1165 | 23::10::4
1166 | 23::12::1
1167 | 23::13::4
1168 | 23::14::1
1169 | 23::15::1
1170 | 23::18::4
1171 | 23::22::2
1172 | 23::23::4
1173 | 23::24::1
1174 | 23::25::1
1175 | 23::26::1
1176 | 23::27::5
1177 | 23::28::1
1178 | 23::29::1
1179 | 23::30::4
1180 | 23::32::5
1181 | 23::33::2
1182 | 23::36::3
1183 | 23::37::1
1184 | 23::38::1
1185 | 23::39::1
1186 | 23::43::1
1187 | 23::48::5
1188 | 23::49::5
1189 | 23::50::4
1190 | 23::53::1
1191 | 23::55::5
1192 | 23::57::1
1193 | 23::59::1
1194 | 23::60::1
1195 | 23::61::1
1196 | 23::64::4
1197 | 23::65::5
1198 | 23::66::2
1199 | 23::67::1
1200 | 23::68::3
1201 | 23::69::1
1202 | 23::72::1
1203 | 23::73::3
1204 | 23::77::1
1205 | 23::82::2
1206 | 23::83::1
1207 | 23::84::1
1208 | 23::85::1
1209 | 23::87::3
1210 | 23::88::1
1211 | 23::95::2
1212 | 23::97::1
1213 | 24::4::1
1214 | 24::6::3
1215 | 24::7::1
1216 | 24::10::2
1217 | 24::12::1
1218 | 24::15::1
1219 | 24::19::1
1220 | 24::24::1
1221 | 24::27::3
1222 | 24::30::5
1223 | 24::31::1
1224 | 24::32::3
1225 | 24::33::1
1226 | 24::37::1
1227 | 24::39::1
1228 | 24::40::1
1229 | 24::42::1
1230 | 24::43::3
1231 | 24::45::2
1232 | 24::46::1
1233 | 24::47::1
1234 | 24::48::1
1235 | 24::49::1
1236 | 24::50::1
1237 | 24::52::5
1238 | 24::57::1
1239 | 24::59::4
1240 | 24::63::4
1241 | 24::65::1
1242 | 24::66::1
1243 | 24::67::1
1244 | 24::68::3
1245 | 24::69::5
1246 | 24::71::1
1247 | 24::72::4
1248 | 24::77::4
1249 | 24::78::1
1250 | 24::80::1
1251 | 24::82::1
1252 | 24::84::1
1253 | 24::86::1
1254 | 24::87::1
1255 | 24::88::2
1256 | 24::89::1
1257 | 24::90::5
1258 | 24::91::1
1259 | 24::92::1
1260 | 24::94::2
1261 | 24::95::1
1262 | 24::96::5
1263 | 24::98::1
1264 | 24::99::1
1265 | 25::1::3
1266 | 25::2::1
1267 | 25::7::1
1268 | 25::9::1
1269 | 25::12::3
1270 | 25::16::3
1271 | 25::17::1
1272 | 25::18::1
1273 | 25::20::1
1274 | 25::22::1
1275 | 25::23::1
1276 | 25::26::2
1277 | 25::29::1
1278 | 25::30::1
1279 | 25::31::2
1280 | 25::33::4
1281 | 25::34::3
1282 | 25::35::2
1283 | 25::36::1
1284 | 25::37::1
1285 | 25::40::1
1286 | 25::41::1
1287 | 25::43::1
1288 | 25::47::4
1289 | 25::50::1
1290 | 25::51::1
1291 | 25::53::1
1292 | 25::56::1
1293 | 25::58::2
1294 | 25::64::2
1295 | 25::67::2
1296 | 25::68::1
1297 | 25::70::1
1298 | 25::71::4
1299 | 25::73::1
1300 | 25::74::1
1301 | 25::76::1
1302 | 25::79::1
1303 | 25::82::1
1304 | 25::84::2
1305 | 25::85::1
1306 | 25::91::3
1307 | 25::92::1
1308 | 25::94::1
1309 | 25::95::1
1310 | 25::97::2
1311 | 26::0::1
1312 | 26::1::1
1313 | 26::2::1
1314 | 26::3::1
1315 | 26::4::4
1316 | 26::5::2
1317 | 26::6::3
1318 | 26::7::5
1319 | 26::13::3
1320 | 26::14::1
1321 | 26::16::1
1322 | 26::18::3
1323 | 26::20::1
1324 | 26::21::3
1325 | 26::22::5
1326 | 26::23::5
1327 | 26::24::5
1328 | 26::27::1
1329 | 26::31::1
1330 | 26::35::1
1331 | 26::36::4
1332 | 26::40::1
1333 | 26::44::1
1334 | 26::45::2
1335 | 26::47::1
1336 | 26::48::1
1337 | 26::49::3
1338 | 26::50::2
1339 | 26::52::1
1340 | 26::54::4
1341 | 26::55::1
1342 | 26::57::3
1343 | 26::58::1
1344 | 26::61::1
1345 | 26::62::2
1346 | 26::66::1
1347 | 26::68::4
1348 | 26::71::1
1349 | 26::73::4
1350 | 26::76::1
1351 | 26::81::3
1352 | 26::85::1
1353 | 26::86::3
1354 | 26::88::5
1355 | 26::91::1
1356 | 26::94::5
1357 | 26::95::1
1358 | 26::96::1
1359 | 26::97::1
1360 | 27::0::1
1361 | 27::9::1
1362 | 27::10::1
1363 | 27::18::4
1364 | 27::19::3
1365 | 27::20::1
1366 | 27::22::2
1367 | 27::24::2
1368 | 27::25::1
1369 | 27::27::3
1370 | 27::28::1
1371 | 27::29::1
1372 | 27::31::1
1373 | 27::33::3
1374 | 27::40::1
1375 | 27::42::1
1376 | 27::43::1
1377 | 27::44::3
1378 | 27::45::1
1379 | 27::51::3
1380 | 27::52::1
1381 | 27::55::3
1382 | 27::57::1
1383 | 27::59::1
1384 | 27::60::1
1385 | 27::61::1
1386 | 27::64::1
1387 | 27::66::3
1388 | 27::68::1
1389 | 27::70::1
1390 | 27::71::2
1391 | 27::72::1
1392 | 27::75::3
1393 | 27::78::1
1394 | 27::80::3
1395 | 27::82::1
1396 | 27::83::3
1397 | 27::86::1
1398 | 27::87::2
1399 | 27::90::1
1400 | 27::91::1
1401 | 27::92::1
1402 | 27::93::1
1403 | 27::94::2
1404 | 27::95::1
1405 | 27::98::1
1406 | 28::0::3
1407 | 28::1::1
1408 | 28::2::4
1409 | 28::3::1
1410 | 28::6::1
1411 | 28::7::1
1412 | 28::12::5
1413 | 28::13::2
1414 | 28::14::1
1415 | 28::15::1
1416 | 28::17::1
1417 | 28::19::3
1418 | 28::20::1
1419 | 28::23::3
1420 | 28::24::3
1421 | 28::27::1
1422 | 28::29::1
1423 | 28::33::1
1424 | 28::34::1
1425 | 28::36::1
1426 | 28::38::2
1427 | 28::39::2
1428 | 28::44::1
1429 | 28::45::1
1430 | 28::49::4
1431 | 28::50::1
1432 | 28::52::1
1433 | 28::54::1
1434 | 28::56::1
1435 | 28::57::3
1436 | 28::58::1
1437 | 28::59::1
1438 | 28::60::1
1439 | 28::62::3
1440 | 28::63::1
1441 | 28::65::1
1442 | 28::75::1
1443 | 28::78::1
1444 | 28::81::5
1445 | 28::82::4
1446 | 28::83::1
1447 | 28::85::1
1448 | 28::88::2
1449 | 28::89::4
1450 | 28::90::1
1451 | 28::92::5
1452 | 28::94::1
1453 | 28::95::2
1454 | 28::98::1
1455 | 28::99::1
1456 | 29::3::1
1457 | 29::4::1
1458 | 29::5::1
1459 | 29::7::2
1460 | 29::9::1
1461 | 29::10::3
1462 | 29::11::1
1463 | 29::13::3
1464 | 29::14::1
1465 | 29::15::1
1466 | 29::17::3
1467 | 29::19::3
1468 | 29::22::3
1469 | 29::23::4
1470 | 29::25::1
1471 | 29::29::1
1472 | 29::31::1
1473 | 29::32::4
1474 | 29::33::2
1475 | 29::36::2
1476 | 29::38::3
1477 | 29::39::1
1478 | 29::42::1
1479 | 29::46::5
1480 | 29::49::3
1481 | 29::51::2
1482 | 29::59::1
1483 | 29::61::1
1484 | 29::62::1
1485 | 29::67::1
1486 | 29::68::3
1487 | 29::69::1
1488 | 29::70::1
1489 | 29::74::1
1490 | 29::75::1
1491 | 29::79::2
1492 | 29::80::1
1493 | 29::81::2
1494 | 29::83::1
1495 | 29::85::1
1496 | 29::86::1
1497 | 29::90::4
1498 | 29::93::1
1499 | 29::94::4
1500 | 29::97::1
1501 | 29::99::1
1502 | 


--------------------------------------------------------------------------------
/docs/universal_recommender.md:
--------------------------------------------------------------------------------
 1 | # Universal Recommender
 2 | 
 3 | ##Quick Start
 4 | 
 5 |  1. [Install the PredictionIO framework](https://docs.prediction.io/install/) **be sure to choose HBase and Elasticsearch** for storage. This template requires Elasticsearch.
 6 |  2. Make sure the PIO console and services are running, check with `pio status`
 7 |  3. [Install this template](https://docs.prediction.io/start/download/) **be sure to specify this template** with `pio template get PredictionIO/template-scala-parallel-universal-recommendation`
 8 |  
 9 | **To import and experiment with the simple example data**
10 | 
11 | 1. Create a new app name, change `appName` in `engine.json`
12 | 2. Run `pio app new **your-new-app-name**`
13 | 4. Import sample events by running `python data/import_handmade.py --access_key **your-access-key**` where the key can be retrieved with `pio app list`
14 | 3. The engine.json file in the root directory of your new UR template is set up for the data you just imported (make sure to create a new one for your data) Edit this file and change the `appName` parameter to match what you called the app in step #2
15 | 5. Perform `pio build`, `pio train`, and `pio deploy`
16 | 6. To execute some sample queries run `./examples/query-handmade.sh`
17 | 
18 | If there are timeouts, enable the delays that are commented out in the script&mdash;for now. In the production environment the engines will "warm up" with caching and will execute queries much faster. Also all services can be configured or scaled to meet virtually any performance needs.
19 | 
20 | **See the [Github README.md](https://github.com/PredictionIO/template-scala-parallel-universal-recommendation) for further usage instructions**
21 | 
22 | ##What is a Universal Recommender
23 | 
24 | The Universal Recommender (UR) will accept a range of data, auto correlate it, and allow for very flexible queries. The UR is different from most recommenders in these ways:
25 | 
26 | * It takes a single very strong "primary" event type&mdash;one that clearly reflects a user's preference&mdash;and correlates any number of other event types to the primary event. This has the effect of using virtually any user action to recommend the primary action. Much of a user’s clickstream can be used to make recommendations. If a user has no history of the primary action (purchase for instance) but does have history of views, personalized recommendations for purchases can still be made. With user purchase history the recommendations become better. ALS-type recommenders have been used with event weights but except for ratings these often do not result in better performance.
27 | * It can boost and filter based on events or item metadata/properties. This means it can give personalized recs that are biased toward “SciFi” and filtered to only include “Promoted” items when the business rules call for this.
28 | * It can use a user's context to make recommendations even when the user is new. If usage data has been gathered for other users for referring URL, device type, or location, for instance, there may be a correlation between this data and items preferred. The UR can detect this **if** it exists and recommend based on this context, even to new users. We call this "micro-segmented" recommendations since they are not personal but group users based on limited contextual information. These will not be as good as when more is know about the user but may be better than simply returning popular items.
29 | * It includes a fallback to some form of item popularity when there is no other information known about the user (not implemented in v0.1.0).
30 | * All of the above can be mixed into a single query for blended results and so the query can be tuned to a great many applications. Also since only one query is made and boosting is supported, a query can be constructed with several fallbacks. Usage data is most important so boost that high, micro-segemnting data may be better than popularity so boost that lower, and popularity fills in if no other recommendations are available.
31 | 
32 | Other features:
33 | 
34 |  * Makes recommendations based on realtime user history. Even anonymous users will get recommendations if they have recorded preference history and a user-id. There is no hard requirement to retrain the model to make this happen. 
35 |  
36 | TBD:
37 | 
38 |  * Date range filters based on Date properties of items
39 |  * Populatiy type recommendations backfill for returning "trending", or "hot" items when no other recommendations are available from the training data. 
40 |  * Content-based correlators for content-based recommendations
41 | 
42 | ## References
43 | 
44 |  * Other documentation of the algorithm is [here](http://mahout.apache.org/users/algorithms/intro-cooccurrence-spark.html)
45 |  * A free ebook, which talks about the general idea: [Practical Machine Learning](https://www.mapr.com/practical-machine-learning).
46 |  * A slide deck, which talks about mixing actions and other indicator types, including content-based ones: [Creating a Unified Recommender](http://www.slideshare.net/pferrel/unified-recommender-39986309?ref=http://occamsmachete.com/ml/)
47 |  * Two blog posts: What's New in Recommenders: part [#1](http://occamsmachete.com/ml/2014/08/11/mahout-on-spark-whats-new-in-recommenders/) [#2](http://occamsmachete.com/ml/2014/09/09/mahout-on-spark-whats-new-in-recommenders-part-2/)
48 |  * A post describing the log-likelihood ratio: [Surprise and Coincidence](http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html) LLR is used to reduce noise in the data while keeping the calculations O(n) complexity.
49 | 


--------------------------------------------------------------------------------
/engine.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "comment":" This config file uses default settings for all but the required values see README.md for docs",
 3 |   "id": "default",
 4 |   "description": "Default settings",
 5 |   "engineFactory": "org.template.RecommendationEngine",
 6 |   "datasource": {
 7 |     "params" : {
 8 |       "name": "sample-handmade-data.txt",
 9 |       "appName": "handmade",
10 |       "eventNames": ["purchase", "view"]
11 |     }
12 |   },
13 |   "sparkConf": {
14 |     "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
15 |     "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
16 |     "spark.kryo.referenceTracking": "false",
17 |     "spark.kryoserializer.buffer": "300m",
18 |     "spark.executor.memory": "4g",
19 |     "es.index.auto.create": "true"
20 |   },
21 |   "algorithms": [
22 |     {
23 |       "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames",
24 |       "name": "ur",
25 |       "params": {
26 |         "appName": "handmade",
27 |         "indexName": "urindex",
28 |         "typeName": "items",
29 |         "comment": "must have data for the first event or the model will not build, other events are optional",
30 |         "eventNames": ["purchase", "view"]
31 |       }
32 |     }
33 |   ]
34 | }
35 | 
36 | 


--------------------------------------------------------------------------------
/engine.json.minimum:
--------------------------------------------------------------------------------
 1 | {
 2 |   "comment":" This config file uses default settings for all but the required values see README.md for docs",
 3 |   "id": "default",
 4 |   "description": "Default settings",
 5 |   "engineFactory": "org.template.RecommendationEngine",
 6 |   "datasource": {
 7 |     "params" : {
 8 |       "name": "sample-handmade-data.txt",
 9 |       "appName": "handmade",
10 |       "eventNames": ["purchase", "view"]
11 |     }
12 |   },
13 |   "sparkConf": {
14 |     "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
15 |     "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
16 |     "spark.kryo.referenceTracking": "false",
17 |     "spark.kryoserializer.buffer.mb": "300",
18 |     "spark.kryoserializer.buffer": "300m",
19 |     "spark.executor.memory": "4g",
20 |     "es.index.auto.create": "true"
21 |   },
22 |   "algorithms": [
23 |     {
24 |       "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames",
25 |       "name": "ur",
26 |       "params": {
27 |         "appName": "handmade",
28 |         "indexName": "urindex",
29 |         "typeName": "items",
30 |         "comment": "must have data for the first event or the model will not build, other events are optional",
31 |         "eventNames": ["purchase", "view"]
32 |       }
33 |     }
34 |   ]
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/event-names-test-engine.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "comment":" This config file uses default settings for all but the required values see README.md for docs",
 3 |   "id": "default",
 4 |   "description": "Default settings",
 5 |   "engineFactory": "org.template.RecommendationEngine",
 6 |   "datasource": {
 7 |     "params" : {
 8 |       "name": "sample-handmade-data.txt",
 9 |       "appName": "handmade",
10 |       "eventNames": ["purchase", "view"]
11 |     }
12 |   },
13 |   "sparkConf": {
14 |     "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
15 |     "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
16 |     "spark.kryo.referenceTracking": "false",
17 |     "spark.kryoserializer.buffer.mb": "300",
18 |     "spark.kryoserializer.buffer": "300m",
19 |     "spark.executor.memory": "4g",
20 |     "es.index.auto.create": "true"
21 |   },
22 |   "algorithms": [
23 |     {
24 |       "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames",
25 |       "name": "ur",
26 |       "params": {
27 |         "appName": "handmade",
28 |         "indexName": "urindex",
29 |         "typeName": "items",
30 |         "comment": "must have data for the first event or the model will not build, other events are optional",
31 |         "eventNames": ["purchase", "view"],
32 |         "blacklistEvents": []
33 |       }
34 |     }
35 |   ]
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/examples/import_handmade.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Import sample data for recommendation engine
  3 | """
  4 | 
  5 | import predictionio
  6 | import argparse
  7 | import random
  8 | import datetime
  9 | import pytz
 10 | 
 11 | RATE_ACTIONS_DELIMITER = ","
 12 | PROPERTIES_DELIMITER = ":"
 13 | SEED = 1
 14 | 
 15 | 
 16 | def import_events(client, file):
 17 |   f = open(file, 'r')
 18 |   random.seed(SEED)
 19 |   count = 0
 20 |   # year, month, day[, hour[, minute[, second[
 21 |   #event_date = datetime.datetime(2015, 8, 13, 12, 24, 41)
 22 |   event_date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=2.7)
 23 |   date_increment = datetime.timedelta(days=0.8)
 24 |   available_date = event_date + datetime.timedelta(days=-2)
 25 |   expire_date = event_date + datetime.timedelta(days=2)
 26 |   print "Importing data..."
 27 | 
 28 |   for line in f:
 29 |     data = line.rstrip('\r\n').split(RATE_ACTIONS_DELIMITER)
 30 |     # For demonstration purpose action names are taken from input along with secondary actions on
 31 |     # For the UR add some item metadata
 32 | 
 33 |     if (data[1] == "purchase"):
 34 |       client.create_event(
 35 |         event=data[1],
 36 |         entity_type="user",
 37 |         entity_id=data[0],
 38 |         target_entity_type="item",
 39 |         target_entity_id=data[2],
 40 |       )
 41 |       print "Event: " + data[1] + " entity_id: " + data[0] + " target_entity_id: " + data[2]
 42 |     elif (data[1] == "view"):  # assumes other event type is 'view'
 43 |       client.create_event(
 44 |         event=data[1],
 45 |         entity_type="user",
 46 |         entity_id=data[0],
 47 |         target_entity_type="item",  # type of item in this action
 48 |         target_entity_id=data[2],
 49 |       )
 50 |       print "Event: " + data[1] + " entity_id: " + data[0] + " target_entity_id: " + data[2]
 51 |     elif (data[1] == "$set"):  # must be a set event
 52 |       properties = data[2].split(PROPERTIES_DELIMITER)
 53 |       prop_name = properties.pop(0)
 54 |       client.create_event(
 55 |         event=data[1],
 56 |         entity_type="item",
 57 |         entity_id=data[0],
 58 |         properties={prop_name: properties}
 59 |       )
 60 |       print "Event: " + data[1] + " entity_id: " + data[0] + " properties/"+prop_name+": " + str(properties)
 61 |     count += 1
 62 | 
 63 |   items = ['Iphone 6', 'Ipad-retina', 'Nexus', 'Surface', 'Iphone 4', 'Galaxy', 'Iphone 5']
 64 |   print "All items: " + str(items)
 65 |   for item in items:
 66 | 
 67 |     client.create_event(
 68 |       event="$set",
 69 |       entity_type="item",
 70 |       entity_id=item,
 71 |       properties={"expires": expire_date.isoformat(),
 72 |                   "available": available_date.isoformat(), "date": event_date.isoformat()}
 73 |     )
 74 |     print "Event: $set entity_id: " + item + \
 75 |             " properties/availableDate: " + available_date.isoformat() + \
 76 |             " properties/date: " + event_date.isoformat() + \
 77 |             " properties/expireDate: " + expire_date.isoformat()
 78 |     expire_date += date_increment
 79 |     event_date += date_increment
 80 |     available_date += date_increment
 81 |     count += 1
 82 | 
 83 |   f.close()
 84 |   print "%s events are imported." % count
 85 | 
 86 | 
 87 | if __name__ == '__main__':
 88 |   parser = argparse.ArgumentParser(
 89 |     description="Import sample data for recommendation engine")
 90 |   parser.add_argument('--access_key', default='invald_access_key')
 91 |   parser.add_argument('--url', default="http://localhost:7070")
 92 |   parser.add_argument('--file', default="./data/sample-handmade-data.txt")
 93 | 
 94 |   args = parser.parse_args()
 95 |   print args
 96 | 
 97 |   client = predictionio.EventClient(
 98 |     access_key=args.access_key,
 99 |     url=args.url,
100 |     threads=5,
101 |     qsize=500)
102 |   import_events(client, args.file)
103 | 


--------------------------------------------------------------------------------
/examples/import_movielens_eventserver.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Import sample data for recommendation engine
 3 | """
 4 | 
 5 | import predictionio
 6 | import argparse
 7 | import random
 8 | 
 9 | RATE_ACTIONS_DELIMITER = "::"
10 | SEED = 3
11 | 
12 | def import_events(client, file):
13 |   f = open(file, 'r')
14 |   random.seed(SEED)
15 |   count = 0
16 |   print "Importing data..."
17 |   for line in f:
18 |     data = line.rstrip('\r\n').split(RATE_ACTIONS_DELIMITER)
19 |     # For demonstration purpose, randomly mix in some buy events
20 |     # For the UR add some item metadata
21 |     if (random.randint(0, 1) == 1):
22 |       client.create_event(
23 |         event="rate",
24 |         entity_type="user",
25 |         entity_id=data[0],
26 |         target_entity_type="item",
27 |         target_entity_id=data[1],
28 |       )
29 |     else:
30 |       client.create_event(
31 |         event="buy",
32 |         entity_type="user",
33 |         entity_id=data[0],
34 |         target_entity_type="item",
35 |         target_entity_id=data[1],
36 |       )
37 |     if (random.randint(0, 1) == 1):
38 |       client.create_event(
39 |         event="$set",
40 |         entity_type="item",
41 |         entity_id=data[1],
42 |         properties= { "category": ["cat1", "cat5"] }
43 |       )
44 |     else:
45 |       client.create_event(
46 |         event="$set",
47 |         entity_type="item",
48 |         entity_id=data[1],
49 |         properties= { "category": ["cat1", "cat2"] }
50 |       )
51 |     count += 1
52 |   f.close()
53 |   print "%s events are imported." % count
54 | 
55 | if __name__ == '__main__':
56 |   parser = argparse.ArgumentParser(
57 |     description="Import sample data for recommendation engine")
58 |   parser.add_argument('--access_key', default='invald_access_key')
59 |   parser.add_argument('--url', default="http://localhost:7070")
60 |   parser.add_argument('--file', default="./data/sample_movielens_data.txt")
61 | 
62 |   args = parser.parse_args()
63 |   print args
64 | 
65 |   client = predictionio.EventClient(
66 |     access_key=args.access_key,
67 |     url=args.url,
68 |     threads=5,
69 |     qsize=500)
70 |   import_events(client, args.file)
71 | 


--------------------------------------------------------------------------------
/examples/integration-test:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # exit on any error
 4 | set -e
 5 | 
 6 | echo ""
 7 | echo "Integration test for The Universal Recommender."
 8 | echo "If some step fails check that your engine.json file has been restored"
 9 | echo "or look for it in 'user-engine.json'"
10 | echo ""
11 | 
12 | echo "Checking for needed files"
13 | if [ ! -f handmade-engine.json ]; then
14 |     echo "File not found: handmade-engine.json"
15 |     exit 1
16 | fi
17 | 
18 | if [ -f user-engine.json ]; then
19 |     echo "File user-engine.json found, this may be an error so we cannot replace engine.json"
20 |     exit 1
21 | fi
22 | 
23 | if [ ! -f data/integration-test-expected.txt ]; then
24 |     echo "File not found: data/integration-test-expected.txt"
25 |     exit 1
26 | fi
27 | 
28 | echo ""
29 | echo "Checking status, should exit if pio is not running."
30 | pio status
31 | 
32 | echo ""
33 | echo "Checking to see if handmade app exists, should exit if not."
34 | pio app show handmade
35 | 
36 | echo ""
37 | echo "Moving engine.json to user-engine.json"
38 | cp -n engine.json user-engine.json
39 | 
40 | echo ""
41 | echo "Moving handmade-engine.json to engine.json for integration test."
42 | cp handmade-engine.json engine.json
43 | 
44 | echo ""
45 | echo "Deleting handmade app data since the test is date dependent"
46 | pio app data-delete handmade
47 | 
48 | echo ""
49 | echo "Importing data for integration test"
50 | # get the access_key from pio app list
51 | ACCESS_KEY=`pio app show handmade | grep Key | cut -f 7 -d ' '`
52 | echo -n "Access key: "
53 | echo $ACCESS_KEY
54 | python examples/import_handmade.py --access_key $ACCESS_KEY
55 | 
56 | echo ""
57 | echo "Building and delpoying model"
58 | pio build
59 | pio train  -- --driver-memory 2g
60 | echo "Model will remain deployed after this test"
61 | nohup pio deploy > deploy.out &
62 | echo "Waiting 20 seconds for the server to start"
63 | sleep 20
64 | 
65 | echo ""
66 | echo "Running test query."
67 | ./examples/multi-query-handmade.sh > test.out
68 | 
69 | echo ""
70 | echo "Restoring engine.json"
71 | mv user-engine.json engine.json
72 | 
73 | echo ""
74 | echo "Differences between expected and actual results, none is a passing test:"
75 | diff data/integration-test-expected.txt test.out
76 | 
77 | echo ""
78 | echo "Note that the engine is still deployed until killed or this shell exists."
79 | 


--------------------------------------------------------------------------------
/examples/multi-query-handmade.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | echo ""
  4 | echo "Queries to illustrate many use cases on a small standard dataset and for an automated integration test."
  5 | echo ""
  6 | echo "WARNING: for this to produce the correct result you must:"
  7 |  echo"  1. Import data with "
  8 | echo "     $ python examples/import_handmade.py --access_key <your-app-accesskey>"
  9 | echo "  2. Copy handmade-engine.json to engine.json."
 10 | echo "  3. Run 'pio build', 'pio train', and 'pio deploy'"
 11 | echo "  4. The queries must be run the same day as the import was done because date filters are part of the test."
 12 | echo "NOTE: due to available and expire dates you should never see the Iphone 5 or Iphone 6."
 13 | 
 14 | echo ""
 15 | echo "============ simple user recs ============"
 16 | echo ""
 17 | echo "Recommendations for user: u1"
 18 | echo ""
 19 | curl -H "Content-Type: application/json" -d '
 20 | {
 21 |     "user": "u1"
 22 | }' http://localhost:8000/queries.json
 23 | echo ""
 24 | 
 25 | 
 26 | echo ""
 27 | echo "Recommendations for user: U 2"
 28 | echo ""
 29 | curl -H "Content-Type: application/json" -d '
 30 | {
 31 |     "user": "U 2"
 32 | }' http://localhost:8000/queries.json
 33 | echo ""
 34 | 
 35 | 
 36 | echo ""
 37 | echo "Recommendations for user: u-3"
 38 | echo ""
 39 | curl -H "Content-Type: application/json" -d '
 40 | {
 41 |     "user": "u-3"
 42 | }' http://localhost:8000/queries.json
 43 | echo ""
 44 | 
 45 | 
 46 | echo ""
 47 | echo "Recommendations for user: u-4"
 48 | echo ""
 49 | curl -H "Content-Type: application/json" -d '
 50 | {
 51 |     "user": "u1"
 52 | }' http://localhost:8000/queries.json
 53 | echo ""
 54 | 
 55 | 
 56 | echo ""
 57 | echo "Recommendations for user: u5"
 58 | echo ""
 59 | curl -H "Content-Type: application/json" -d '
 60 | {
 61 |     "user": "u5"
 62 | }' http://localhost:8000/queries.json
 63 | echo ""
 64 | 
 65 | echo ""
 66 | echo "============ simple similar item recs ============"
 67 | echo ""
 68 | echo "Recommendations for item: Iphone 4"
 69 | echo ""
 70 | curl -H "Content-Type: application/json" -d '
 71 | {
 72 |     "item": "Iphone 4"
 73 | }' http://localhost:8000/queries.json
 74 | echo ""
 75 | 
 76 | echo ""
 77 | echo "Recommendations for item: Ipad-retina"
 78 | echo ""
 79 | curl -H "Content-Type: application/json" -d '
 80 | {
 81 |     "item": "Ipad-retina"
 82 | }' http://localhost:8000/queries.json
 83 | echo ""
 84 | 
 85 | echo ""
 86 | echo "Recommendations for item: Nexus"
 87 | echo ""
 88 | curl -H "Content-Type: application/json" -d '
 89 | {
 90 |     "item": "Nexus"
 91 | }' http://localhost:8000/queries.json
 92 | echo ""
 93 | 
 94 | echo ""
 95 | echo "Recommendations for item: Galaxy"
 96 | echo ""
 97 | curl -H "Content-Type: application/json" -d '
 98 | {
 99 |     "item": "Galaxy"
100 | }' http://localhost:8000/queries.json
101 | echo ""
102 | 
103 | echo ""
104 | echo "Recommendations for item: Surface"
105 | echo ""
106 | curl -H "Content-Type: application/json" -d '
107 | {
108 |     "item": "Surface"
109 | }' http://localhost:8000/queries.json
110 | echo ""
111 | 
112 | echo ""
113 | echo "============ popular item recs only ============"
114 | echo ""
115 | echo "query with no item or user id, ordered by popularity"
116 | echo ""
117 | curl -H "Content-Type: application/json" -d '
118 | {
119 | }' http://localhost:8000/queries.json
120 | echo ""
121 | 
122 | echo ""
123 | echo "Recommendations for non-existant user: xyz, all from popularity"
124 | echo ""
125 | curl -H "Content-Type: application/json" -d '
126 | {
127 |     "user": "xyz"
128 | }' http://localhost:8000/queries.json
129 | echo ""
130 | 
131 | echo ""
132 | echo "Recommendations for non-existant item: xyz, all from popularity"
133 | echo ""
134 | curl -H "Content-Type: application/json" -d '
135 | {
136 |     "item": "xyz"
137 | }' http://localhost:8000/queries.json
138 | echo ""
139 | 
140 | 
141 | echo ""
142 | echo "Recommendations for no user no item, all from popularity, Tablets filter"
143 | echo ""
144 | curl -H "Content-Type: application/json" -d '
145 | {
146 |     "fields": [{
147 |         "name": "categories",
148 |         "values": ["Tablets"],
149 |         "bias": -1
150 |     }]
151 | }' http://localhost:8000/queries.json
152 | echo ""
153 | 
154 | 
155 | echo ""
156 | echo "Recommendations for no user no item, all from popularity, Tablets boost"
157 | echo ""
158 | curl -H "Content-Type: application/json" -d '
159 | {
160 |     "fields": [{
161 |         "name": "categories",
162 |         "values": ["Tablets"],
163 |         "bias": 1.05
164 |     }]
165 | }' http://localhost:8000/queries.json
166 | echo ""
167 | 
168 | 
169 | echo ""
170 | echo "Recommendations for no user no item, all from popularity, Tablets boost, Estados Unidos Mexicanos filter"
171 | echo ""
172 | curl -H "Content-Type: application/json" -d '
173 | {
174 |     "fields": [{
175 |         "name": "categories",
176 |         "values": ["Tablets"],
177 |         "bias": 1.05
178 |     }, {
179 |         "name": "countries",
180 |         "values": ["Estados Unidos Mexicanos"],
181 |         "bias": -1
182 |     }]
183 | }' http://localhost:8000/queries.json
184 | echo ""
185 | 
186 | 
187 | echo ""
188 | echo "============ dateRange filter ============"
189 | echo ""
190 | if [[ "$OSTYPE" == "linux-gnu" ]]; then
191 |   BEFORE=`date --date="tomorrow" --iso-8601=seconds`
192 |   AFTER=`date --date="1 day ago" --iso-8601=seconds`
193 | else
194 |   BEFORE=`date -v +1d +"%Y-%m-%dT%H:%M:%SZ"`
195 |   AFTER=`date -v -1d +"%Y-%m-%dT%H:%M:%SZ"`
196 | fi
197 | #echo "before: $BEFORE after: $AFTER"
198 | echo "Recommendations for user: u1"
199 | echo ""
200 | curl -H "Content-Type: application/json" -d "
201 | {
202 |     \"user\": \"u1\",
203 |     \"dateRange\": {
204 |         \"name\": \"date\",
205 |         \"before\": \"$BEFORE\",
206 |         \"after\": \"$AFTER\"
207 |     }
208 | }" http://localhost:8000/queries.json
209 | echo ""
210 | 
211 | echo ""
212 | echo "============ query with item and user *EXPERIMENTAL* ============"
213 | # This is experimental, use at your own risk, not well founded in theory
214 | echo ""
215 | echo "Recommendations for no user no item, all from popularity, Tablets boost, Estados Unidos Mexicanos filter"
216 | echo ""
217 | curl -H "Content-Type: application/json" -d '
218 | {
219 |     "user": "u1",
220 |     "item": "Iphone 4"
221 | }' http://localhost:8000/queries.json
222 | echo ""
223 | 
224 | 


--------------------------------------------------------------------------------
/examples/multi-query-movielens.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | echo ""
 3 | echo "Simple personalized query"
 4 | echo ""
 5 | curl -H "Content-Type: application/json" -d '
 6 | {
 7 |     "user": "1",
 8 |     "num": 10
 9 | }' http://localhost:8000/queries.json
10 | echo ""
11 | 
12 | #sleep 2
13 | 
14 | echo ""
15 | echo "Simple similar item query"
16 | echo ""
17 | curl -H "Content-Type: application/json" -d '
18 | {
19 |     "item": "62",
20 |     "num": 15
21 | }' http://localhost:8000/queries.json
22 | echo ""
23 | 
24 | #sleep 2
25 | 
26 | echo ""
27 | echo "Simple personalized query with category boost"
28 | echo ""
29 | curl -H "Content-Type: application/json" -d '
30 | {
31 |     "user": "1",
32 |     "num": 20,
33 |     "fields": [{
34 |         "name": "category",
35 |         "values": ["cat5"],
36 |         "bias": 1.005
37 |     }]
38 | }' http://localhost:8000/queries.json
39 | echo ""
40 | echo ""
41 | 


--------------------------------------------------------------------------------
/examples/pop-test-query.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | echo ""
 4 | echo "Recommendations for popular using default pop model"
 5 | echo ""
 6 | curl -H "Content-Type: application/json" -d '
 7 | {
 8 | }' http://localhost:8000/queries.json
 9 | echo ""
10 | 
11 | 


--------------------------------------------------------------------------------
/examples/single-query-eventNames.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | echo "Recommendations from popular"
 4 | echo ""
 5 | curl -H "Content-Type: application/json" -d '
 6 | {
 7 | }' http://localhost:8000/queries.json
 8 | echo ""
 9 | echo ""
10 | 
11 | echo "Recommendations for user: u1 purchase and view events"
12 | echo ""
13 | curl -H "Content-Type: application/json" -d '
14 | {
15 |     "user": "u1"
16 | }' http://localhost:8000/queries.json
17 | echo ""
18 | echo ""
19 | 
20 | echo "Recommendations for user: u1 from purchase event alone, should have some non-popular based recs"
21 | echo ""
22 | curl -H "Content-Type: application/json" -d '
23 | {
24 |     "user": "u1",
25 |     "eventNames": ["purchase"]
26 | }' http://localhost:8000/queries.json
27 | echo ""
28 | echo ""
29 | 
30 | echo "Recommendations for user: u1 from view event alone, should have some non-popular based recs"
31 | echo ""
32 | curl -H "Content-Type: application/json" -d '
33 | {
34 |     "user": "u1",
35 |     "eventNames": ["view"]
36 | }' http://localhost:8000/queries.json
37 | echo ""
38 | echo ""
39 | 
40 | 


--------------------------------------------------------------------------------
/examples/single-query-handmade.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | echo "Recommendations for user: u1"
 4 | echo ""
 5 | curl -H "Content-Type: application/json" -d '
 6 | {
 7 |     "user": "u1"
 8 | }' http://localhost:8000/queries.json
 9 | echo ""
10 | 
11 | 


--------------------------------------------------------------------------------
/handmade-engine.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "comment":" This config file uses default settings for all but the required values see README.md for docs",
 3 |   "id": "default",
 4 |   "description": "Default settings",
 5 |   "engineFactory": "org.template.RecommendationEngine",
 6 |   "datasource": {
 7 |     "params" : {
 8 |       "name": "sample-handmade-data.txt",
 9 |       "appName": "handmade",
10 |       "eventNames": ["purchase", "view"]
11 |     }
12 |   },
13 |   "sparkConf": {
14 |     "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
15 |     "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
16 |     "spark.kryo.referenceTracking": "false",
17 |     "spark.kryoserializer.buffer.mb": "300",
18 |     "spark.kryoserializer.buffer": "300m",
19 |     "spark.executor.memory": "4g",
20 |     "es.index.auto.create": "true"
21 |   },
22 |   "algorithms": [
23 |     {
24 |       "comment": "simplest setup where all values are default, popularity based backfill, must add eventsNames",
25 |       "name": "ur",
26 |       "params": {
27 |         "appName": "handmade",
28 |         "indexName": "urindex",
29 |         "typeName": "items",
30 |         "comment": "must have data for the first event or the model will not build, other events are optional",
31 |         "eventNames": ["purchase", "view"],
32 |         "availableDateName": "available",
33 |         "expireDateName": "expires",
34 |         "dateName": "date",
35 |         "num": 4
36 |       }
37 |     }
38 |   ]
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/pop-engine.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "id": "popularity-only",
 3 |   "description": "Default settings",
 4 |   "engineFactory": "org.template.RecommendationEngine",
 5 |   "datasource": {
 6 |     "params" : {
 7 |       "name": "sample-handmade-data.txt",
 8 |       "appName": "handmade",
 9 |       "eventNames": ["purchase", "view"]
10 |     }
11 |   },
12 |   "sparkConf": {
13 |     "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
14 |     "spark.kryo.registrator": "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator",
15 |     "spark.kryo.referenceTracking": "false",
16 |     "spark.kryoserializer.buffer.mb": "300",
17 |     "spark.executor.memory": "4g",
18 |     "es.index.auto.create": "true"
19 |   },
20 |   "algorithms": [
21 |     {
22 |       "comment": "setup to only calculate a popularity model for *hot* and add it to the existing model for backfill",
23 |       "name": "ur",
24 |       "params": {
25 |         "appName": "handmade",
26 |         "indexName": "urindex",
27 |         "typeName": "items",
28 |         "eventNames": ["purchase", "view"],
29 |         "comment": "this will overwrite any existing popularity model, default is *popular* so add this to engine.json if you want *hot* to be always refreshed during train",
30 |         "recsModel": "backfill",
31 |         "backfillField": {
32 |           "backfillType": "hot"
33 |         }
34 |       }
35 |     }
36 |   ]
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")
2 | 


--------------------------------------------------------------------------------
/project/pio-build.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("io.prediction" % "pio-build" % "0.9.0")
2 | 


--------------------------------------------------------------------------------
/src/main/scala/DataSource.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright ActionML, LLC under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.template
 19 | 
 20 | import _root_.io.prediction.controller.PDataSource
 21 | import _root_.io.prediction.controller.EmptyEvaluationInfo
 22 | import _root_.io.prediction.controller.EmptyActualResult
 23 | import _root_.io.prediction.controller.Params
 24 | import _root_.io.prediction.data.storage.{PropertyMap, Event}
 25 | import _root_.io.prediction.data.store.PEventStore
 26 | import org.apache.mahout.math.indexeddataset.{BiDictionary, IndexedDataset}
 27 | import org.apache.spark.SparkContext
 28 | import org.apache.spark.rdd.RDD
 29 | import grizzled.slf4j.Logger
 30 | 
 31 | /** Taken from engine.json these are passed in to the DataSource constructor
 32 |   *
 33 |   * @param appName registered name for the app
 34 |   * @param eventNames a list of named events expected. The first is the primary event, the rest are secondary. These
 35 |   *                   will be used to create the primary correlator and cross-cooccurrence secondary correlators.
 36 |   */
 37 | case class DataSourceParams(
 38 |    appName: String,
 39 |    eventNames: List[String]) // IMPORTANT: eventNames must be exactly the same as URAlgorithmParams eventNames
 40 |   extends Params
 41 | 
 42 | /** Read specified events from the PEventStore and creates RDDs for each event. A list of pairs (eventName, eventRDD)
 43 |   * are sent to the Preparator for further processing.
 44 |   * @param dsp parameters taken from engine.json
 45 |   */
 46 | class DataSource(val dsp: DataSourceParams)
 47 |   extends PDataSource[TrainingData,
 48 |       EmptyEvaluationInfo, Query, EmptyActualResult] {
 49 | 
 50 |   @transient lazy val logger = Logger[this.type]
 51 | 
 52 |   /** Reads events from PEventStore and create and RDD for each */
 53 |   override
 54 |   def readTraining(sc: SparkContext): TrainingData = {
 55 | 
 56 |     val eventNames = dsp.eventNames
 57 | 
 58 |     val eventsRDD = PEventStore.find(
 59 |       appName = dsp.appName,
 60 |       entityType = Some("user"),
 61 |       eventNames = Some(eventNames),
 62 |       targetEntityType = Some(Some("item")))(sc)
 63 | 
 64 |     // now separate the events by event name
 65 |     val actionRDDs = eventNames.map { eventName =>
 66 |       val actionRDD = eventsRDD.filter { event =>
 67 | 
 68 |         require(eventNames.contains(event.event), s"Unexpected event ${event} is read.") // is this really needed?
 69 |         require(event.entityId.nonEmpty && event.targetEntityId.get.nonEmpty, "Empty user or item ID")
 70 | 
 71 |         eventName.equals(event.event)
 72 | 
 73 |       }.map { event =>
 74 |         (event.entityId, event.targetEntityId.get)
 75 |       }.cache()
 76 | 
 77 |       (eventName, actionRDD)
 78 |     }
 79 | 
 80 |     // aggregating all $set/$unsets for metadata fields, which are attached to items
 81 |     val fieldsRDD = PEventStore.aggregateProperties(
 82 |       appName= dsp.appName,
 83 |       entityType=  "item")(sc)
 84 | 
 85 |     // Have a list of (actionName, RDD), for each action
 86 |     // todo: some day allow data to be content, which requires rethinking how to use EventStore
 87 |     new TrainingData(actionRDDs, fieldsRDD)
 88 |   }
 89 | }
 90 | 
 91 | /** Low level RDD based representation of the data ready for the Preparator
 92 |   *
 93 |   * @param actions List of Tuples (actionName, actionRDD)qw
 94 |   * @param fieldsRDD RDD of item keyed PropertyMap for item metadata
 95 |   */
 96 | class TrainingData(
 97 |     val actions: List[(String, RDD[(String, String)])],
 98 |     val fieldsRDD: RDD[(String, PropertyMap)])
 99 |   extends Serializable {
100 | 
101 |   override def toString = {
102 |     val a = actions.map { t =>
103 |       s"${t._1} actions: [count:${t._2.count()}] + sample:${t._2.take(2).toList} "
104 |     }.toString()
105 |     val f = s"Item metadata: [count:${fieldsRDD.count}] + sample:${fieldsRDD.take(2).toList} "
106 |     a + f
107 |   }
108 | 
109 | }


--------------------------------------------------------------------------------
/src/main/scala/Engine.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright ActionML, LLC under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.template
19 | 
20 | import java.util.Date
21 | 
22 | import io.prediction.controller.{EngineFactory, Engine}
23 | 
24 | /** This file contains case classes that are used with reflection to specify how query and config
25 |   * JSON is to be parsed. the Query case class, for instance defines the way a JSON query is to be
26 |   * formed. The same for param case classes.
27 |   */
28 | 
29 | /** The Query spec with optional values. The only hard rule is that there must be either a user or
30 |   * an item id. All other values are optional. */
31 | case class Query(
32 |     user: Option[String] = None, // must be a user or item id
33 |     userBias: Option[Float] = None, // default: whatever is in algorithm params or 1
34 |     item: Option[String] = None, // must be a user or item id
35 |     itemBias: Option[Float] = None, // default: whatever is in algorithm params or 1
36 |     fields: Option[List[Field]] = None, // default: whatever is in algorithm params or None
37 |     currentDate: Option[String] = None, // if used will override dateRange filter, currentDate must lie between the item's
38 |     // expireDateName value and availableDateName value, all are ISO 8601 dates
39 |     dateRange: Option[DateRange] = None, // optional before and after filter applied to a date field
40 |     blacklistItems: Option[List[String]] = None, // default: whatever is in algorithm params or None
41 |     returnSelf: Option[Boolean] = None,// means for an item query should the item itself be returned, defaults
42 |                                        // to what is in the algorithm params or false
43 |     num: Option[Int] = None, // default: whatever is in algorithm params, which itself has a default--probably 20
44 |     eventNames: Option[List[String]]) // names used to ID all user actions
45 |   extends Serializable
46 | 
47 | /** Used to specify how Fields are represented in engine.json */
48 | case class Field( // no optional values for fields, whne specified
49 |     name: String, // name of metadata field
50 |     values: Array[String], // fields can have multiple values like tags of a single value as when using hierarchical
51 |     // taxonomies
52 |     bias: Float)// any positive value is a boost, negative is a filter
53 |   extends Serializable
54 | 
55 | /** Used to specify the date range for a query */
56 | case class DateRange(
57 |     name: String, // name of item property for the date comparison
58 |     before: Option[String], // empty strings means no filter
59 |     after: Option[String]) // both empty should be ignored
60 |   extends Serializable
61 | 
62 | /** results of a MMRAlgoritm.predict */
63 | case class PredictedResult(
64 |     itemScores: Array[ItemScore])
65 |   extends Serializable
66 | 
67 | case class ItemScore(
68 |     item: String, // item id
69 |     score: Double )// used to rank, original score returned from teh search engine
70 |   extends Serializable
71 | 
72 | object RecommendationEngine extends EngineFactory {
73 |   def apply() = {
74 |     new Engine(
75 |       classOf[DataSource],
76 |       classOf[Preparator],
77 |       Map("ur" -> classOf[URAlgorithm]), // IMPORTANT: "ur" must be the "name" of the parameter set in engine.json
78 |       classOf[Serving])
79 |   }
80 | }


--------------------------------------------------------------------------------
/src/main/scala/PopModel.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright ActionML, LLC under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.template
 19 | 
 20 | import grizzled.slf4j.Logger
 21 | import io.prediction.data.storage.Event
 22 | import org.apache.spark.SparkContext
 23 | import org.apache.spark.rdd.RDD
 24 | import io.prediction.data.store.PEventStore
 25 | import org.joda.time.format.ISODateTimeFormat
 26 | import org.joda.time.{DateTime, Interval}
 27 | 
 28 | 
 29 | object PopModel {
 30 | 
 31 |   @transient lazy val logger = Logger[this.type]
 32 | 
 33 |   def calc (
 34 |     modelName: Option[String] = None,
 35 |     eventNames: List[String],
 36 |     appName: String,
 37 |     duration: Int = 0,
 38 |     endDateOption: Option[String] = None)(implicit sc: SparkContext): Option[RDD[(String, Float)]] = {
 39 | 
 40 |     // startDate should always be 'now' except in unusual conditions like for testing
 41 |     val endDate = if (endDateOption.isEmpty ) DateTime.now else {
 42 |       try {
 43 |         ISODateTimeFormat.dateTimeParser().parseDateTime(endDateOption.get)
 44 |       } catch {
 45 |         case e: IllegalArgumentException => e
 46 |           logger.warn("Bad endDate for popModel: " + endDateOption.get + " using 'now'")
 47 |           DateTime.now
 48 |       }
 49 |     }
 50 | 
 51 |     // based on type of popularity model return a set of (item-id, ranking-number) for all items
 52 |     modelName match {
 53 |       case Some("popular") => calcPopular(appName, eventNames, new Interval(endDate.minusSeconds(duration), endDate))
 54 |       case Some("trending") => calcTrending(appName, eventNames, new Interval(endDate.minusSeconds(duration), endDate))
 55 |       case Some("hot") => calcHot(appName, eventNames, new Interval(endDate.minusSeconds(duration), endDate))
 56 |       case _ => None // debatable, this is either an error or may need to default to popular, why call popModel otherwise
 57 |     }
 58 |   }
 59 | 
 60 |   /** Creates a rank from the number of named events per item for the duration */
 61 |   def calcPopular(appName: String, eventNames: List[String] = List.empty,
 62 |     interval: Interval)(implicit sc: SparkContext): Option[RDD[(String, Float)]] = {
 63 | 
 64 |     val events = eventsRDD(appName, eventNames, interval)
 65 |     val retval = events.map { e => (e.targetEntityId, e.event) }
 66 |       .groupByKey()
 67 |       .map { case(itemID, itEvents) => (itemID.get, itEvents.size.toFloat)}
 68 |       .reduceByKey (_+_) // make this a double in Elaseticsearch)
 69 |     if (!retval.isEmpty()) Some(retval) else None
 70 |   }
 71 | 
 72 |   /** Creates a rank for each item by dividing the duration in two and counting named events in both buckets
 73 |     * then dividing most recent by less recent. This ranks by change in popularity or velocity of populatiy change.
 74 |     * Interval(start, end) end instant is always greater than or equal to the start instant.
 75 |     */
 76 |   def calcTrending(appName: String, eventNames: List[String] = List.empty,
 77 |     interval: Interval)(implicit sc: SparkContext): Option[RDD[(String, Float)]] = {
 78 | 
 79 |     val olderInterval = new Interval(interval.getStart,
 80 |       interval.getStart().plusMillis((interval.toDurationMillis/2)toInt))
 81 |     val newerInterval = new Interval(interval.getStart().plusMillis((interval.toDurationMillis/2)toInt), interval.getEnd)
 82 | 
 83 |     val intervalMillis = interval.toDurationMillis
 84 |     val olderPopRDD = calcPopular(appName, eventNames, olderInterval)
 85 |     if ( olderPopRDD.nonEmpty) {
 86 |       val newerPopRDD = calcPopular(appName, eventNames, newerInterval)
 87 |       if ( newerPopRDD.nonEmpty ) {
 88 |         val retval = newerPopRDD.get.join(olderPopRDD.get).map { case (item, (newerScore, olderScore)) =>
 89 |           val velocity = (newerScore - olderScore)
 90 |           (item, velocity)
 91 |         }
 92 |         if (!retval.isEmpty()) Some(retval) else None
 93 |       } else None
 94 |     } else None
 95 |   }
 96 | 
 97 |   /** Creates a rank for each item by divding all events per item into three buckets and calculating the change in
 98 |     * velocity over time, in other words the acceleration of popularity change.
 99 |     */
100 |   def calcHot(appName: String, eventNames: List[String] = List.empty,
101 |     interval: Interval)(implicit sc: SparkContext): Option[RDD[(String, Float)]] = {
102 |     val olderInterval = new Interval(interval.getStart,
103 |       interval.getStart().plusMillis((interval.toDurationMillis/3)toInt))
104 |     val middleInterval = new Interval(olderInterval.getEnd,
105 |       olderInterval.getEnd().plusMillis((olderInterval.toDurationMillis)toInt))
106 |     val newerInterval = new Interval(middleInterval.getEnd, interval.getEnd)
107 | 
108 |     val olderPopRDD = calcPopular(appName, eventNames, olderInterval)
109 |     if (olderPopRDD.nonEmpty){ // todo: may want to allow an interval with no events, give them 0 counts
110 |       //val debug = olderPopRDD.get.count()
111 |       val middlePopRDD = calcPopular(appName, eventNames, middleInterval)
112 |       if (middlePopRDD.nonEmpty){
113 |         //val debug = middlePopRDD.get.count()
114 |         val newerPopRDD = calcPopular(appName, eventNames, newerInterval)
115 |         if (newerPopRDD.nonEmpty){
116 |           //val debug = newerPopRDD.get.count()
117 |           val newVelocityRDD = newerPopRDD.get.join(middlePopRDD.get).map { case( item, (newerScore, olderScore)) =>
118 |             val velocity = (newerScore - olderScore)
119 |             (item, velocity)
120 |           }
121 |           val oldVelocityRDD = middlePopRDD.get.join(olderPopRDD.get).map { case( item, (newerScore, olderScore)) =>
122 |             val velocity = (newerScore - olderScore)
123 |             (item, velocity)
124 |           }
125 |           Some( newVelocityRDD.join(oldVelocityRDD).map { case (item, (newVelocity, oldVelocity)) =>
126 |             val acceleration = (newVelocity - oldVelocity)
127 |             (item, acceleration)
128 |           })
129 |         } else None
130 |       } else None
131 |     } else None
132 |   }
133 | 
134 |   def eventsRDD(appName: String, eventNames: List[String], interval: Interval)
135 |     (implicit sc: SparkContext): RDD[Event] = {
136 | 
137 |     PEventStore.find(
138 |       appName = appName,
139 |       startTime = Some(interval.getStart),
140 |       untilTime = Some(interval.getEnd),
141 |       eventNames = Some(eventNames)
142 |     )(sc)
143 |   }
144 | 
145 | }
146 | 


--------------------------------------------------------------------------------
/src/main/scala/Preparator.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright ActionML, LLC under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.template
19 | 
20 | import io.prediction.controller.PPreparator
21 | import io.prediction.data.storage.PropertyMap
22 | import org.apache.mahout.math.indexeddataset.{IndexedDataset, BiDictionary}
23 | import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
24 | import org.apache.spark.SparkContext
25 | import org.apache.spark.rdd.RDD
26 | 
27 | class Preparator
28 |   extends PPreparator[TrainingData, PreparedData] {
29 | 
30 |   /** Create [[org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark]] rdd backed
31 |     * "distributed row matrices" from the input string keyed rdds.
32 |     * @param sc Spark context
33 |     * @param trainingData list of (actionName, actionRDD)
34 |     * @return list of (correlatorName, correlatorIndexedDataset)
35 |     */
36 |   def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = {
37 |     // now that we have all actions in separate RDDs we must merge any user dictionaries and
38 |     // make sure the same user ids map to the correct events
39 |     var userDictionary: Option[BiDictionary] = None
40 | 
41 |     val indexedDatasets = trainingData.actions.map{ case(eventName, eventIDS) =>
42 | 
43 |       // passing in previous row dictionary will use the values if they exist
44 |       // and append any new ids, so after all are constructed we have all user ids in the last dictionary
45 |       val ids = IndexedDatasetSpark(eventIDS, userDictionary)(sc)
46 |       userDictionary = Some(ids.rowIDs)
47 |       (eventName, ids)
48 |     }
49 | 
50 |     // now make sure all matrices have identical row space since this corresponds to all users
51 |     val numUsers = userDictionary.get.size
52 |     val numPrimary = indexedDatasets.head._2.matrix.nrow
53 |     // todo: check to see that there are events in primary event IndexedDataset and abort if not.
54 |     val rowAdjustedIds = indexedDatasets.map { case(eventName, eventIDS) =>
55 |       (eventName, eventIDS.create(eventIDS.matrix, userDictionary.get, eventIDS.columnIDs).newRowCardinality(numUsers))
56 |     }
57 | 
58 |     new PreparedData(rowAdjustedIds, trainingData.fieldsRDD)
59 |   }
60 | 
61 | }
62 | 
63 | class PreparedData(
64 |     val actions: List[(String, IndexedDataset)],
65 |     val fieldsRDD: RDD[(String, PropertyMap)])
66 |   extends Serializable


--------------------------------------------------------------------------------
/src/main/scala/Serving.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright ActionML, LLC under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.template
19 | 
20 | import io.prediction.controller.LServing
21 | 
22 | class Serving
23 |   extends LServing[Query, PredictedResult] {
24 | 
25 |   override
26 |   def serve(query: Query,
27 |     predictedResults: Seq[PredictedResult]): PredictedResult = {
28 |     predictedResults.head
29 |   }
30 | }


--------------------------------------------------------------------------------
/src/main/scala/URAlgorithm.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright ActionML, LLC under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.template
 19 | 
 20 | import java.util
 21 | import io.prediction.controller.P2LAlgorithm
 22 | import io.prediction.controller.Params
 23 | import io.prediction.data
 24 | import io.prediction.data.storage.{PropertyMap, Event}
 25 | import io.prediction.data.store.LEventStore
 26 | import org.apache.mahout.math.cf.SimilarityAnalysis
 27 | import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
 28 | import org.apache.spark.rdd.RDD
 29 | import org.joda.time.DateTime
 30 | import org.json4s
 31 | import org.json4s.JsonAST
 32 | import org.json4s.JsonAST._
 33 | import scala.collection.JavaConverters._
 34 | import scala.collection.immutable
 35 | import scala.concurrent.duration.Duration
 36 | import org.apache.spark.SparkContext
 37 | import org.json4s.JsonDSL._
 38 | import org.json4s.jackson.JsonMethods._
 39 | import scala.collection.convert.wrapAsScala._
 40 | import grizzled.slf4j.Logger
 41 | import org.elasticsearch.spark._
 42 | 
 43 | /** Setting the option in the params case class doesn't work as expected when the param is missing from
 44 |   * engine.json so set these for use in the algorithm when they are not present in the engine.json
 45 |   */
 46 | object defaultURAlgorithmParams {
 47 |   val DefaultMaxEventsPerEventType = 500
 48 |   val DefaultNum = 20
 49 |   val DefaultMaxCorrelatorsPerEventType = 50
 50 |   val DefaultMaxQueryEvents = 100 // default number of user history events to use in recs query
 51 | 
 52 |   val DefaultExpireDateName = "expireDate" // default name for the expire date property of an item
 53 |   val DefaultAvailableDateName = "availableDate" //defualt name for and item's available after date
 54 |   val DefaultDateName = "date" // when using a date range in the query this is the name of the item's date
 55 |   val DefaultRecsModel = "all" // use CF + backfill
 56 |   val DefaultBackfillParams = BackfillField()
 57 |   val DefaultBackfillFieldName = "popRank"
 58 | }
 59 | 
 60 | case class BackfillField(
 61 |   name: String = "popRank",
 62 |   backfillType: String = "popular", // may be 'hot', or 'trending' also
 63 |   eventnames: Option[List[String]] = None, // None means use the algo eventnames list, otherwise a list of events
 64 |   endDate: Option[String] = None, // used only for tests, specifies the start (oldest date) of the popModel's duration
 65 |   duration: Int = 259200) // number of seconds worth of events to use in calculation of backfill
 66 | 
 67 | /** Instantiated from engine.json */
 68 | case class URAlgorithmParams(
 69 |     appName: String, // filled in from engine.json
 70 |     indexName: String, // can optionally be used to specify the elasticsearch index name
 71 |     typeName: String, // can optionally be used to specify the elasticsearch type name
 72 |     recsModel: Option[String] = Some(defaultURAlgorithmParams.DefaultRecsModel), // "all", "collabFiltering", "backfill"
 73 |     eventNames: List[String], // names used to ID all user actions
 74 |     blacklistEvents: Option[List[String]] = None,// None means use the primary event, empty array means no filter
 75 |     // number of events in user-based recs query
 76 |     maxQueryEvents: Option[Int] = Some(defaultURAlgorithmParams.DefaultMaxQueryEvents),
 77 |     maxEventsPerEventType: Option[Int] = Some(defaultURAlgorithmParams.DefaultMaxEventsPerEventType),
 78 |     maxCorrelatorsPerEventType: Option[Int] = Some(defaultURAlgorithmParams.DefaultMaxCorrelatorsPerEventType),
 79 |     num: Option[Int] = Some(defaultURAlgorithmParams.DefaultNum), // default max # of recs requested
 80 |     userBias: Option[Float] = None, // will cause the default search engine boost of 1.0
 81 |     itemBias: Option[Float] = None, // will cause the default search engine boost of 1.0
 82 |     returnSelf: Option[Boolean] = None, // query building logic defaults this to false
 83 |     fields: Option[List[Field]] = None, //defaults to no fields
 84 |     // leave out for default or popular
 85 |     backfillField: Option[BackfillField] = None,
 86 |     // name of date property field for when the item is avalable
 87 |     availableDateName: Option[String] = Some(defaultURAlgorithmParams.DefaultAvailableDateName),
 88 |     // name of date property field for when an item is no longer available
 89 |     expireDateName: Option[String] = Some(defaultURAlgorithmParams.DefaultExpireDateName),
 90 |     // used as the subject of a dateRange in queries, specifies the name of the item property 
 91 |     dateName: Option[String] = Some(defaultURAlgorithmParams.DefaultDateName),
 92 |     seed: Option[Long] = None) // seed is not used presently
 93 |   extends Params //fixed default make it reproducible unless supplied
 94 | 
 95 | /** Creates cooccurrence, cross-cooccurrence and eventually content correlators with
 96 |   * [[org.apache.mahout.math.cf.SimilarityAnalysis]] The analysis part of the recommender is
 97 |   * done here but the algorithm can predict only when the coocurrence data is indexed in a
 98 |   * search engine like Elasticsearch. This is done in URModel.save.
 99 |   *
100 |   * @param ap taken from engine.json to describe limits and event types
101 |   */
102 | class URAlgorithm(val ap: URAlgorithmParams)
103 |   extends P2LAlgorithm[PreparedData, URModel, Query, PredictedResult] {
104 | 
105 |   case class BoostableCorrelators(actionName: String, itemIDs: Seq[String], boost: Option[Float])
106 |   case class FilterCorrelators(actionName: String, itemIDs: Seq[String])
107 | 
108 |   @transient lazy val logger = Logger[this.type]
109 | 
110 |   def train(sc: SparkContext, data: PreparedData): URModel = {
111 | 
112 |     val dateNames = Some(List(ap.dateName.getOrElse(""), ap.availableDateName.getOrElse(""),
113 |       ap.expireDateName.getOrElse(""))) // todo: return None if all are empty?
114 |     val backfillFieldName = ap.backfillField.getOrElse(BackfillField()).name
115 | 
116 |     ap.recsModel.getOrElse(defaultURAlgorithmParams.DefaultRecsModel) match {
117 |       case "all" => calcAll(sc, data, dateNames, backfillFieldName)
118 |       case "collabFiltering" => calcAll(sc, data, dateNames, backfillFieldName, popular = false )
119 |       case "backfill" => calcPop(sc, data, dateNames, backfillFieldName)
120 |       // error, throw an exception
121 |       case _ => throw new IllegalArgumentException("Bad recsModel in engine definition params, possibly a bad json value.")
122 |     }
123 |   }
124 | 
125 |   /** Calculates recs model as well as popularity model */
126 |   def calcAll(
127 |     sc: SparkContext,
128 |     data: PreparedData,
129 |     dateNames: Option[List[String]] = None,
130 |     backfillFieldName: String,
131 |     popular: Boolean = true):
132 |     URModel = {
133 | 
134 |     // No one likes empty training data.
135 |     require(data.actions.take(1).nonEmpty,
136 |       s"Primary action in PreparedData cannot be empty." +
137 |         " Please check if DataSource generates TrainingData" +
138 |         " and Preprator generates PreparedData correctly.")
139 | 
140 |     val backfillParams = ap.backfillField.getOrElse(defaultURAlgorithmParams.DefaultBackfillParams)
141 |     val nonDefaultMappings = Map(backfillParams.name -> "float")
142 |     logger.info("Actions read now creating correlators")
143 |     val cooccurrenceIDSs = SimilarityAnalysis.cooccurrencesIDSs(
144 |       data.actions.map(_._2).toArray,
145 |       randomSeed = ap.seed.getOrElse(System.currentTimeMillis()).toInt,
146 |       maxInterestingItemsPerThing = ap.maxCorrelatorsPerEventType
147 |         .getOrElse(defaultURAlgorithmParams.DefaultMaxCorrelatorsPerEventType),
148 |       maxNumInteractions = ap.maxEventsPerEventType.getOrElse(defaultURAlgorithmParams.DefaultMaxEventsPerEventType))
149 |       .map(_.asInstanceOf[IndexedDatasetSpark]) // strip action names
150 |     val cooccurrenceCorrelators = cooccurrenceIDSs.zip(data.actions.map(_._1)).map(_.swap) //add back the actionNames
151 | 
152 |     val popModel = if (popular) {
153 |       val duration = ap.backfillField.getOrElse(defaultURAlgorithmParams.DefaultBackfillParams).duration
154 |       val backfillEvents = backfillParams.eventnames.getOrElse(List(ap.eventNames.head))
155 |       val start = ap.backfillField.getOrElse(defaultURAlgorithmParams.DefaultBackfillParams).endDate
156 |       PopModel.calc(Some(backfillParams.backfillType), backfillEvents, ap.appName, duration)(sc)
157 |     } else None
158 | 
159 |     val allPropertiesRDD = if (popModel.nonEmpty) {
160 |       data.fieldsRDD.cogroup[Float](popModel.get).map { case (item, pms) =>
161 |         val pm = if (pms._1.nonEmpty && pms._2.nonEmpty) {
162 |           val newPM = pms._1.head.fields + (backfillFieldName -> JDouble(pms._2.head))
163 |           PropertyMap(newPM, pms._1.head.firstUpdated, DateTime.now())
164 |         } else if (pms._2.nonEmpty) PropertyMap(Map(backfillFieldName -> JDouble(pms._2.head)), DateTime.now(), DateTime.now())
165 |         else PropertyMap( Map.empty[String, JValue], DateTime.now, DateTime.now) // some error????
166 |         (item, pm)
167 |       }
168 |     } else data.fieldsRDD
169 | 
170 |     logger.info("Correlators created now putting into URModel")
171 |     new URModel(
172 |       Some(cooccurrenceCorrelators),
173 |       Some(allPropertiesRDD),
174 |       ap.indexName,
175 |       dateNames,
176 |       typeMappings = Some(nonDefaultMappings))
177 |   }
178 | 
179 |   /** This function creates a URModel from an existing index in Elasticsearch + new popularity ranking
180 |     * It is used when you want to re-calc the popularity model between training on useage data. It leaves
181 |     * the part of the model created from usage data alone and only modifies the popularity ranking.
182 |     */
183 |   def calcPop(
184 |     sc: SparkContext,
185 |     data: PreparedData,
186 |     dateNames: Option[List[String]] = None,
187 |     backfillFieldName: String = ""): URModel = {
188 |     
189 |     val backfillParams = ap.backfillField.getOrElse(defaultURAlgorithmParams.DefaultBackfillParams)
190 |     val backfillEvents = backfillParams.eventnames.getOrElse(List(ap.eventNames.head))//default to first/primary event
191 |     val start = ap.backfillField.getOrElse(defaultURAlgorithmParams.DefaultBackfillParams).endDate
192 |     val popModel = PopModel.calc(
193 |       Some(backfillParams.backfillType), 
194 |       backfillEvents, 
195 |       ap.appName, 
196 |       backfillParams.duration, 
197 |       start)(sc)
198 |     val popRDD = if (popModel.nonEmpty) {
199 |       val model = popModel.get.map { case (item, rank)  =>
200 |         val newPM = Map(backfillFieldName -> JDouble(rank))
201 |         (item, PropertyMap(newPM, DateTime.now, DateTime.now))
202 |       }
203 |       Some(model)
204 |     } else None
205 | 
206 |     val propertiesRDD = if (popModel.nonEmpty) {
207 |       val currentMetadata = esClient.getRDD(sc, ap.indexName, ap.typeName)
208 |       if (currentMetadata.nonEmpty) { // may be an empty index so ignore
209 |         Some(popModel.get.cogroup[collection.Map[String, AnyRef]](currentMetadata.get)
210 |         .map { case (item, (ranks, pms)) =>
211 |           if (ranks.nonEmpty) pms.head + (backfillFieldName -> ranks.head)
212 |           else if (pms.nonEmpty) pms.head
213 |           else Map.empty[String, AnyRef] // could happen if only calculating popularity, which may leave out items with
214 |           // no events
215 |         })
216 |       } else None
217 |     } else None
218 | 
219 |     // returns the existing model plus new popularity ranking
220 |     new URModel(
221 |       None,
222 |       None,
223 |       ap.indexName,
224 |       None,
225 |       propertiesRDD = propertiesRDD,
226 |       typeMappings = Some(Map(backfillFieldName -> "float")))
227 |   }
228 | 
229 |   var queryEventNames = List.empty[String] // if passed in with the query overrides the engine.json list--used in MAP@k
230 |   //testing, this only effects which events are used in queries.
231 | 
232 |   /** Return a list of items recommended for a user identified in the query
233 |     * The ES json query looks like this:
234 |     *  {
235 |     *    "size": 20
236 |     *    "query": {
237 |     *      "bool": {
238 |     *        "should": [
239 |     *          {
240 |     *            "terms": {
241 |     *              "rate": ["0", "67", "4"]
242 |     *            }
243 |     *          },
244 |     *          {
245 |     *            "terms": {
246 |     *              "buy": ["0", "32"],
247 |     *              "boost": 2
248 |     *            }
249 |     *          },
250 |     *          { // categorical boosts
251 |     *            "terms": {
252 |     *              "category": ["cat1"],
253 |     *              "boost": 1.05
254 |     *            }
255 |     *          }
256 |     *        ],
257 |     *        "must": [ // categorical filters
258 |     *          {
259 |     *            "terms": {
260 |     *              "category": ["cat1"],
261 |     *              "boost": 0
262 |     *            }
263 |     *          },
264 |     *         {
265 |     *        "must_not": [//blacklisted items
266 |     *          {
267 |     *            "ids": {
268 |     *              "values": ["items-id1", "item-id2", ...]
269 |     *            }
270 |     *          },
271 |     *         {
272 |     *           "constant_score": {// date in query must fall between the expire and avqilable dates of an item
273 |     *             "filter": {
274 |     *               "range": {
275 |     *                 "availabledate": {
276 |     *                   "lte": "2015-08-30T12:24:41-07:00"
277 |     *                 }
278 |     *               }
279 |     *             },
280 |     *             "boost": 0
281 |     *           }
282 |     *         },
283 |     *         {
284 |     *           "constant_score": {// date range filter in query must be between these item property values
285 |     *             "filter": {
286 |     *               "range" : {
287 |     *                 "expiredate" : {
288 |     *                   "gte": "2015-08-15T11:28:45.114-07:00"
289 |     *                   "lt": "2015-08-20T11:28:45.114-07:00"
290 |     *                 }
291 |     *               }
292 |     *             }, "boost": 0
293 |     *           }
294 |     *         },
295 |     *         {
296 |     *           "constant_score": { // this orders popular items for backfill
297 |     *              "filter": {
298 |     *                 "match_all": {}
299 |     *              },
300 |     *              "boost": 0.000001 // must have as least a small number to be boostable
301 |     *           }
302 |     *        }
303 |     *      }
304 |     *    }
305 |     *  }
306 |     *
307 |     * @param model <strong>Ignored!</strong> since the model is already in Elasticsearch
308 |     * @param query contains query spec
309 |     * @todo Need to prune that query to minimum required for data include, for instance no need for the popularity
310 |     *       ranking if no PopModel is being used, same for "must" clause and dates.
311 |     */
312 |   def predict(model: URModel, query: Query): PredictedResult = {
313 |     logger.info(s"Query received, user id: ${query.user}, item id: ${query.item}")
314 | 
315 |     queryEventNames = query.eventNames.getOrElse(ap.eventNames) // eventNames in query take precedence for the query
316 |     // part of their use
317 |     val backfillFieldName = ap.backfillField.getOrElse(BackfillField()).name
318 |     val queryAndBlacklist = buildQuery(ap, query, backfillFieldName)
319 |     val recs = esClient.search(queryAndBlacklist._1, ap.indexName)
320 |     // should have all blacklisted items excluded
321 |     // todo: need to add dithering, mean, sigma, seed required, make a seed that only changes on some fixed time
322 |     // period so the recs ordering stays fixed for that time period.
323 |     recs
324 |   }
325 | 
326 |   /** Build a query from default algorithms params and the query itself taking into account defaults */
327 |   def buildQuery(ap: URAlgorithmParams, query: Query, backfillFieldName: String = ""): (String, List[Event]) = {
328 | 
329 |     try{ // require the minimum of a user or item, if not then return popular if any
330 |       //require( query.item.nonEmpty || query.user.nonEmpty, "Warning: a query must include either a user or item id")
331 | 
332 |       // create a list of all query correlators that can have a bias (boost or filter) attached
333 |       val alluserEvents = getBiasedRecentUserActions(query)
334 | 
335 |       // create a list of all boosted query correlators
336 |       val recentUserHistory = if ( ap.userBias.getOrElse(1f) >= 0f )
337 |         alluserEvents._1.slice(0, ap.maxQueryEvents.getOrElse(defaultURAlgorithmParams.DefaultMaxQueryEvents) - 1)
338 |       else List.empty[BoostableCorrelators]
339 | 
340 |       val similarItems = if ( ap.itemBias.getOrElse(1f) >= 0f )
341 |         getBiasedSimilarItems(query)
342 |       else List.empty[BoostableCorrelators]
343 | 
344 |       val boostedMetadata = getBoostedMetadata(query)
345 | 
346 |       val allBoostedCorrelators = recentUserHistory ++ similarItems ++ boostedMetadata
347 | 
348 |       // create a lsit of all query correlators that are to be used to filter results
349 |       val recentUserHistoryFilter = if ( ap.userBias.getOrElse(1f) < 0f ) {
350 |         // strip any boosts
351 |         alluserEvents._1.map { i =>
352 |           FilterCorrelators(i.actionName, i.itemIDs)
353 |         }.slice(0, ap.maxQueryEvents.getOrElse(defaultURAlgorithmParams.DefaultMaxQueryEvents) - 1)
354 |       } else List.empty[FilterCorrelators]
355 | 
356 |       val similarItemsFilter = if ( ap.itemBias.getOrElse(1f) < 0f ) {
357 |         getBiasedSimilarItems(query).map { i =>
358 |           FilterCorrelators(i.actionName, i.itemIDs)
359 |         }.toList
360 |       } else List.empty[FilterCorrelators]
361 | 
362 |       val filteringMetadata = getFilteringMetadata(query)
363 | 
364 |       val filteringDateRange = getFilteringDateRange(query)
365 | 
366 |       val allFilteringCorrelators = recentUserHistoryFilter ++ similarItemsFilter ++ filteringMetadata
367 | 
368 |       // since users have action history and items have correlators and both correspond to the same "actions" like
369 |       // purchase or view, we'll pass both to the query if the user history or items correlators are empty
370 |       // then metadata or backfill must be relied on to return results.
371 | 
372 |       val numRecs = query.num.getOrElse(ap.num.getOrElse(defaultURAlgorithmParams.DefaultNum))
373 | 
374 |       val shouldFields: Option[List[JValue]] = if (allBoostedCorrelators.isEmpty) None
375 |       else {
376 |         Some(allBoostedCorrelators.map { i =>
377 |           render(("terms" -> (i.actionName -> i.itemIDs) ~ ("boost" -> i.boost)))
378 |         }.toList)
379 |       }
380 |       val popModelSort = List(parse(
381 |         """
382 |           |{
383 |           |  "constant_score": {
384 |           |    "filter": {
385 |           |      "match_all": {}
386 |           |    },
387 |           |    "boost": 0
388 |           |  }
389 |           |}
390 |           |""".stripMargin))
391 | 
392 |       val should: List[JValue] = if (shouldFields.isEmpty) popModelSort else shouldFields.get ::: popModelSort
393 | 
394 | 
395 |       val mustFields: List[JValue] = allFilteringCorrelators.map { i =>
396 |         render(("terms" -> (i.actionName -> i.itemIDs) ~ ("boost" -> 0)))}.toList
397 |       val must: List[JValue] = mustFields ::: filteringDateRange
398 | 
399 |       val mustNotFields: JValue = render(("ids" -> ("values" -> getExcludedItems (alluserEvents._2, query)) ~ ("boost" -> 0)))
400 |       val mustNot: JValue = mustNotFields
401 | 
402 |       val popQuery = if (ap.recsModel.getOrElse("all") == "all" ||
403 |         ap.recsModel.getOrElse("all") == "backfill") {
404 |         Some(List(
405 |           parse( """{"_score": {"order": "desc"}}"""),
406 |           parse(
407 |             s"""
408 |                |{
409 |                |    "${backfillFieldName}": {
410 |                |      "unmapped_type": "double",
411 |                |      "order": "desc"
412 |                |    }
413 |                |}""".stripMargin)))
414 |       } else None
415 | 
416 | 
417 |       val json =
418 |         (
419 |           ("size" -> numRecs) ~
420 |           ("query"->
421 |             ("bool"->
422 |               ("should"-> should) ~
423 |               ("must"-> must) ~
424 |               ("must_not"-> mustNot) ~
425 |               ("minimum_should_match" -> 1))
426 |           ) ~
427 |           ("sort" -> popQuery))
428 |       val j = compact(render(json))
429 |       logger.info(s"Query: \n${j}\n")
430 |       (compact(render(json)), alluserEvents._2)
431 |     } catch {
432 |       case e: IllegalArgumentException =>
433 |         ("", List.empty[Event])
434 |     }
435 |   }
436 | 
437 |   /** Create a list of item ids that the user has interacted with or are not to be included in recommendations */
438 |   def getExcludedItems(userEvents: List[Event], query: Query): List[String] = {
439 | 
440 |     val blacklistedItems = userEvents.filter { event =>
441 |       if (ap.blacklistEvents.nonEmpty) {
442 |         // either a list or an empty list of filtering events so honor them
443 |         if (ap.blacklistEvents.get == List.empty[String]) false // no filtering events so all are allowed
444 |         else ap.blacklistEvents.get.contains(event.event) // if its filtered remove it, else allow
445 |       } else ap.eventNames(0).equals(event.event) // remove the primary event if nothing specified
446 |     }.map (_.targetEntityId.getOrElse("")) ++ query.blacklistItems.getOrElse(List.empty[String])
447 |     .distinct
448 | 
449 |     // Now conditionally add the query item itself
450 |     val includeSelf = query.returnSelf.getOrElse(ap.returnSelf.getOrElse(false))
451 |     val allExcludedItems = if ( !includeSelf && query.item.nonEmpty )
452 |       blacklistedItems :+ query.item.get // add the query item to be excuded
453 |     else
454 |       blacklistedItems
455 |     allExcludedItems.distinct
456 |   }
457 | 
458 |   /** Get similar items for an item, these are already in the action correlators in ES */
459 |   def getBiasedSimilarItems(query: Query): Seq[BoostableCorrelators] = {
460 |     if (query.item.nonEmpty) {
461 |       val m = esClient.getSource(ap.indexName, ap.typeName, query.item.get)
462 | 
463 |       if (m != null) {
464 |         val itemEventBias = query.itemBias.getOrElse(ap.itemBias.getOrElse(1f))
465 |         val itemEventsBoost = if (itemEventBias > 0 && itemEventBias != 1) Some(itemEventBias) else None
466 |         ap.eventNames.map { action =>
467 |           val items = try {
468 |             if (m.containsKey(action) && m.get(action) != null) m.get(action).asInstanceOf[util.ArrayList[String]].toList
469 |             else List.empty[String]
470 |           } catch {
471 |             case cce: ClassCastException =>
472 |               logger.warn(s"Bad value in item ${query.item} corresponding to key: ${action} that was not a List[String]" +
473 |                 " ignored.")
474 |               List.empty[String]
475 |           }
476 |           val rItems = if (items.size <= ap.maxQueryEvents.getOrElse(defaultURAlgorithmParams.DefaultMaxQueryEvents))
477 |             items else items.slice(0, ap.maxQueryEvents.getOrElse(defaultURAlgorithmParams.DefaultMaxQueryEvents) - 1)
478 |           BoostableCorrelators(action, rItems, itemEventsBoost)
479 |         }
480 |       } else List.empty[BoostableCorrelators] // no similar items
481 |     } else List.empty[BoostableCorrelators] // no item specified
482 |   }
483 | 
484 |   /** Get recent events of the user on items to create the recommendations query from */
485 |   def getBiasedRecentUserActions(
486 |     query: Query): (Seq[BoostableCorrelators], List[Event]) = {
487 | 
488 |     val recentEvents = try {
489 |       LEventStore.findByEntity(
490 |         appName = ap.appName,
491 |         // entityType and entityId is specified for fast lookup
492 |         entityType = "user",
493 |         entityId = query.user.get,
494 |         // one query per eventName is not ideal, maybe one query for lots of events then split by eventName
495 |         //eventNames = Some(Seq(action)),// get all and separate later
496 |         eventNames = Some(queryEventNames),// get all and separate later
497 |         targetEntityType = None,
498 |         // limit = Some(maxQueryEvents), // this will get all history then each action can be limited before using in
499 |         // the query
500 |         latest = true,
501 |         // set time limit to avoid super long DB access
502 |         timeout = Duration(200, "millis")
503 |       ).toList
504 |     } catch {
505 |       case e: scala.concurrent.TimeoutException =>
506 |         logger.error(s"Timeout when read recent events." +
507 |           s" Empty list is used. ${e}")
508 |         List.empty[Event]
509 |       case e: NoSuchElementException => // todo: bad form to use an exception to check if there is a user id
510 |         logger.info("No user id for recs, returning similar items for the item specified")
511 |         List.empty[Event]
512 |       case e: Exception => // fatal because of error, an empty query
513 |         logger.error(s"Error when read recent events: ${e}")
514 |         throw e
515 |     }
516 | 
517 |     val userEventBias = query.userBias.getOrElse(ap.userBias.getOrElse(1f))
518 |     val userEventsBoost = if (userEventBias > 0 && userEventBias != 1) Some(userEventBias) else None
519 |     //val rActions = ap.eventNames.map { action =>
520 |     val rActions = queryEventNames.map { action =>
521 |       var items = List[String]()
522 | 
523 |       for ( event <- recentEvents )
524 |         if (event.event == action && items.size <
525 |           ap.maxQueryEvents.getOrElse(defaultURAlgorithmParams.DefaultMaxQueryEvents)) {
526 |           items = event.targetEntityId.get :: items
527 |           // todo: may throw exception and we should ignore the event instead of crashing
528 |         }
529 |       // userBias may be None, which will cause no JSON output for this
530 |       BoostableCorrelators(action, items.distinct, userEventsBoost)
531 |     }
532 |     (rActions, recentEvents)
533 |   }
534 | 
535 |   /** get all metadata fields that potentially have boosts (not filters) */
536 |   def getBoostedMetadata( query: Query ): List[BoostableCorrelators] = {
537 |     val paramsBoostedFields = ap.fields.getOrElse(List.empty[Field]).filter( field => field.bias < 0 ).map { field =>
538 |       BoostableCorrelators(field.name, field.values, Some(field.bias))
539 |     }
540 | 
541 |     val queryBoostedFields = query.fields.getOrElse(List.empty[Field]).filter { field =>
542 |       field.bias >=  0f
543 |     }.map { field =>
544 |       BoostableCorrelators(field.name, field.values, Some(field.bias))
545 |     }
546 | 
547 |     (queryBoostedFields ++ paramsBoostedFields).distinct  // de-dup and favor query fields
548 |   }
549 | 
550 |   /** get all metadata fields that are filters (not boosts) */
551 |   def getFilteringMetadata( query: Query ): List[FilterCorrelators] = {
552 |     val paramsFilterFields = ap.fields.getOrElse(List.empty[Field]).filter( field => field.bias >= 0 ).map { field =>
553 |       FilterCorrelators(field.name, field.values)
554 |     }
555 | 
556 |     val queryFilterFields = query.fields.getOrElse(List.empty[Field]).filter { field =>
557 |       field.bias <  0f
558 |     }.map { field =>
559 |       FilterCorrelators(field.name, field.values)
560 |     }
561 | 
562 |     (queryFilterFields ++ paramsFilterFields).distinct // de-dup and favor query fields
563 |   }
564 | 
565 |   /** get part of query for dates and date ranges */
566 |   def getFilteringDateRange( query: Query ): List[JValue] = {
567 | 
568 |     var json: List[JValue] = List.empty[JValue]
569 |     // currentDate in the query overrides the dateRange in the same query so ignore daterange if both
570 |     val currentDate = query.currentDate.getOrElse(DateTime.now().toDateTimeISO.toString)
571 | 
572 |     if (query.dateRange.nonEmpty &&
573 |       (query.dateRange.get.after.nonEmpty || query.dateRange.get.before.nonEmpty)) {
574 |       val name = query.dateRange.get.name
575 |       val before = query.dateRange.get.before.getOrElse("")
576 |       val after = query.dateRange.get.after.getOrElse("")
577 |       val rangeStart = s"""
578 |         |{
579 |         |  "constant_score": {
580 |         |    "filter": {
581 |         |      "range": {
582 |         |        "${name}": {
583 |         """.stripMargin
584 | 
585 |       val rangeAfter = s"""
586 |         |          "gt": "${after}"
587 |         """.stripMargin
588 | 
589 |       val rangeBefore = s"""
590 |         |          "lt": "${before}"
591 |         """.stripMargin
592 | 
593 |       val rangeEnd = s"""
594 |         |        }
595 |         |      }
596 |         |    },
597 |         |    "boost": 0
598 |         |  }
599 |         |}
600 |         """.stripMargin
601 | 
602 |       var range = rangeStart
603 |       if (!after.isEmpty) {
604 |         range += rangeAfter
605 |         if (!before.isEmpty) range += ","
606 |       }
607 |       if (!before.isEmpty) range += rangeBefore
608 |       range += rangeEnd
609 | 
610 |       json = json :+ parse(range)
611 |     } else if (ap.availableDateName.nonEmpty && ap.expireDateName.nonEmpty) {// use the query date or system date
612 |       val availableDate = ap.availableDateName.get // never None
613 |       val expireDate = ap.expireDateName.get
614 | 
615 |       val available = s"""
616 |         |{
617 |         |  "constant_score": {
618 |         |    "filter": {
619 |         |      "range": {
620 |         |        "${availableDate}": {
621 |         |          "lte": "${currentDate}"
622 |         |        }
623 |         |      }
624 |         |    },
625 |         |    "boost": 0
626 |         |  }
627 |         |}
628 |         """.stripMargin
629 | 
630 |       json = json :+ parse(available)
631 |       val expire = s"""
632 |         |{
633 |         |  "constant_score": {
634 |         |    "filter": {
635 |         |      "range": {
636 |         |        "${expireDate}": {
637 |         |          "gt": "${currentDate}"
638 |         |        }
639 |         |      }
640 |         |    },
641 |         |    "boost": 0
642 |         |  }
643 |         |}
644 |         """.stripMargin
645 |       json = json :+ parse(expire)
646 |     } else {
647 |       logger.info("Misconfigured date information, either your engine.json date settings or your query's dateRange is incorrect.\nIngoring date information for this query.")
648 |     }
649 |     json
650 |   }
651 | 
652 | }


--------------------------------------------------------------------------------
/src/main/scala/URModel.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright ActionML, LLC under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.template
 19 | 
 20 | import java.util.Date
 21 | 
 22 | import grizzled.slf4j.Logger
 23 | 
 24 | import io.prediction.controller.{PersistentModelLoader, PersistentModel}
 25 | import io.prediction.data.storage.PropertyMap
 26 | import org.apache.mahout.math.indexeddataset.IndexedDataset
 27 | import org.apache.spark.rdd.RDD
 28 | import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
 29 | import org.joda.time.DateTime
 30 | import org.json4s.JsonAST.JArray
 31 | import org.json4s._
 32 | import org.template.conversions.IndexedDatasetConversions
 33 | import org.elasticsearch.spark._
 34 | import org.apache.spark.SparkContext
 35 | 
 36 | 
 37 | /** Universal Recommender models to save in ES */
 38 | class URModel(
 39 |     coocurrenceMatrices: Option[List[(String, IndexedDataset)]],
 40 |     fieldsRDD: Option[RDD[(String, PropertyMap)]],
 41 |     indexName: String,
 42 |     dateNames: Option[List[String]] = None,
 43 |     nullModel: Boolean = false,
 44 |     typeMappings: Option[Map[String, String]] = None, // maps fieldname that need type mapping in Elasticsearch
 45 |     propertiesRDD: Option[RDD[collection.Map[String, Any]]] = None)
 46 |     // a little hack to allow a dummy model used to save but not
 47 |     // retrieve (see companion object's apply)
 48 |   extends PersistentModel[URAlgorithmParams] {
 49 |   @transient lazy val logger = Logger[this.type]
 50 | 
 51 |   /** Save all fields to be indexed by Elasticsearch and queried for recs
 52 |     * This will is something like a table with row IDs = item IDs and separate fields for all
 53 |     * cooccurrence and cross-cooccurrence correlators and metadata for each item. Metadata fields are
 54 |     * limited to text term collections so vector types. Scalar values can be used but depend on
 55 |     * Elasticsearch's support. One exception is the Data scalar, which is also supported
 56 |     * @param id
 57 |     * @param params from engine.json, algorithm control params
 58 |     * @param sc The spark constext already created for execution
 59 |     * @return always returns true since most other reasons to not save cause exceptions
 60 |     */
 61 |   def save(id: String, params: URAlgorithmParams, sc: SparkContext): Boolean = {
 62 | 
 63 |     if (nullModel) throw new IllegalStateException("Saving a null model created from loading an old one.")
 64 | 
 65 |     val esIndexURI = s"/${params.indexName}/${params.typeName}"
 66 | 
 67 |     // for ES we need to create the entire index in an rdd of maps, one per item so we'll use
 68 |     // convert cooccurrence matrices into correlators as RDD[(itemID, (actionName, Seq[itemID])]
 69 |     // do they need to be in Elasticsearch format
 70 |     logger.info("Converting cooccurrence matrices into correlators")
 71 |     val correlators = if (coocurrenceMatrices.nonEmpty) coocurrenceMatrices.get.map { case (actionName, dataset) =>
 72 |       dataset.asInstanceOf[IndexedDatasetSpark].toStringMapRDD(actionName).asInstanceOf[RDD[(String, Map[String, Any])]]
 73 |       //} else List.empty[RDD[(String, Map[String, Seq[String]])]] // empty mena only calculating PopModel
 74 |     } else List.empty[RDD[(String, Map[String, Any])]] // empty mena only calculating PopModel
 75 | 
 76 |     // getting action names since they will be ES fields
 77 |     logger.info(s"Getting a list of action name strings")
 78 |     val allActions = coocurrenceMatrices.getOrElse(List.empty[(String, IndexedDatasetSpark)]).map(_._1)
 79 | 
 80 |     logger.info(s"Ready to pass date fields names to closure ${dateNames}")
 81 |     val closureDateNames = dateNames.getOrElse(List.empty[String])
 82 |     // convert the PropertyMap into Map[String, Seq[String]] for ES
 83 |     logger.info("Converting PropertyMap into Elasticsearch style rdd")
 84 |     var properties = List.empty[RDD[(String, Map[String, Any])]]
 85 |     var allPropKeys = List.empty[String]
 86 |     if (fieldsRDD.nonEmpty) {
 87 |       properties = List(fieldsRDD.get.map { case (item, pm) =>
 88 |         var m: Map[String, Any] = Map()
 89 |         for (key <- pm.keySet) {
 90 |           val k = key
 91 |           val v = pm.get[JValue](key)
 92 |           try {
 93 |             // if we get something unexpected, add ignore and add nothing to the map
 94 |             pm.get[JValue](key) match {
 95 |               case JArray(list) => // assumes all lists are string tokens for bias
 96 |                 val l = list.map {
 97 |                   case JString(s) => s
 98 |                   case _ => ""
 99 |                 }
100 |                 m = m + (key -> l)
101 |               case JString(s) => // name for this field is in engine params
102 |                 if (closureDateNames.contains(key)) {
103 |                   // one of the date fields
104 |                   val dateTime = new DateTime(s)
105 |                   val date: java.util.Date = dateTime.toDate()
106 |                   m = m + (key -> date)
107 |                 }
108 |               case JDouble(rank) => // only the ranking double from PopModel should be here
109 |                 m = m + (key -> rank)
110 |               case JInt(someInt) => // not sure what this is but pass it on
111 |                 m = m + (key -> someInt)
112 |             }
113 |           } catch {
114 |             case e: ClassCastException => e
115 |             case e: IllegalArgumentException => e
116 |             case e: MatchError => e
117 |             //got something we didn't expect so ignore it, put nothing in the map
118 |           }
119 |         }
120 |         (item, m)
121 |       })
122 |       allPropKeys = properties.head.flatMap(_._2.keySet).distinct.collect().toList
123 |     }
124 | 
125 | 
126 |     // these need to be indexed with "not_analyzed" and no norms so have to
127 |     // collect all field names before ES index create
128 |     val allFields = (allActions ++ allPropKeys).distinct // shouldn't need distinct but it's fast
129 | 
130 |     if (propertiesRDD.isEmpty) {
131 |       // Elasticsearch takes a Map with all fields, not a tuple
132 |       logger.info("Grouping all correlators into doc + fields for writing to index")
133 |       logger.info(s"Finding non-empty RDDs from a list of ${correlators.length} correlators and " +
134 |         s"${properties.length} properties")
135 |       val esRDDs: List[RDD[(String, Map[String, Any])]] =
136 |         //(correlators ::: properties).filterNot(c => c.isEmpty())// for some reason way too slow
137 |         (correlators ::: properties)
138 |           //c.take(1).length == 0
139 |       if (esRDDs.nonEmpty) {
140 |         val esFields = groupAll(esRDDs).map { case (item, map) =>
141 |           // todo: every map's items must be checked for value type and converted before writing to ES
142 |           val esMap = map + ("id" -> item)
143 |           esMap
144 |         }
145 |         // create a new index then hot-swap the new index by re-aliasing to it then delete old index
146 |         logger.info("New data to index, performing a hot swap of the index.")
147 |         esClient.hotSwap(
148 |           params.indexName,
149 |           params.typeName,
150 |           esFields.asInstanceOf[RDD[scala.collection.Map[String,Any]]],
151 |           allFields,
152 |           typeMappings)
153 |       } else logger.warn("No data to write. May have been caused by a failed or stopped `pio train`, " +
154 |         "try running it again")
155 | 
156 |     } else {
157 |       // this happens when updating only the popularity backfill model but to do a hotSwap we need to dup the
158 |       // entire index
159 | 
160 |       // create a new index then hot-swap the new index by re-aliasing to it then delete old index
161 |       esClient.hotSwap(params.indexName, params.typeName, propertiesRDD.get, allFields,
162 |         typeMappings)
163 |     }
164 |     true
165 |   }
166 |   
167 |   def groupAll( fields: Seq[RDD[(String, (Map[String, Any]))]]): RDD[(String, (Map[String, Any]))] = {
168 |     //if (fields.size > 1 && !fields.head.isEmpty() && !fields(1).isEmpty()) {
169 |     if (fields.size > 1) {
170 |       fields.head.cogroup[Map[String, Any]](groupAll(fields.drop(1))).map { case (key, pairMapSeqs) =>
171 |         // to be safe merge all maps but should only be one per rdd element
172 |         val rdd1Maps = pairMapSeqs._1.foldLeft(Map.empty[String, Any])(_ ++ _)
173 |         val rdd2Maps = pairMapSeqs._2.foldLeft(Map.empty[String, Any])(_ ++ _)
174 |         val fullMap = rdd1Maps ++ rdd2Maps
175 |         (key, fullMap)
176 |       }
177 |     } else fields.head
178 |   }
179 | 
180 |   override def toString = {
181 |     s"URModel in Elasticsearch at index: ${indexName}"
182 |   }
183 | 
184 | 
185 | }
186 | 
187 | object URModel
188 |   extends PersistentModelLoader[URAlgorithmParams, URModel] {
189 |   @transient lazy val logger = Logger[this.type]
190 | 
191 |   /** This is actually only used to read saved values and since they are in Elasticsearch we don't need to read
192 |     * this means we create a null model since it will not be used.
193 |     * todo: we should rejigger the template framework so this is not required.
194 |     * @param id ignored
195 |     * @param params ignored
196 |     * @param sc ignored
197 |     * @return dummy null model
198 |     */
199 |   def apply(id: String, params: URAlgorithmParams, sc: Option[SparkContext]): URModel = {
200 |     // todo: need changes in PIO to remove the need for this
201 |     val urm = new URModel(null, null, null, nullModel =  true)
202 |     logger.info("Created dummy null model")
203 |     urm
204 |   }
205 | 
206 | }
207 | 


--------------------------------------------------------------------------------
/src/main/scala/esClient.scala:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright ActionML, LLC under one or more
  3 |  * contributor license agreements.  See the NOTICE file distributed with
  4 |  * this work for additional information regarding copyright ownership.
  5 |  * ActionML licenses this file to You under the Apache License, Version 2.0
  6 |  * (the "License"); you may not use this file except in compliance with
  7 |  * the License.  You may obtain a copy of the License at
  8 |  *
  9 |  *     http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.template
 19 | 
 20 | import java.util
 21 | 
 22 | import grizzled.slf4j.Logger
 23 | import io.prediction.data.storage.{Storage, StorageClientConfig, elasticsearch}
 24 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.GetRequest
 25 | import org.apache.spark.SparkContext
 26 | import org.apache.spark.rdd.RDD
 27 | import org.elasticsearch.action.admin.indices.alias.get.GetAliasesRequest
 28 | import org.elasticsearch.action.admin.indices.create.CreateIndexRequest
 29 | import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest
 30 | import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest
 31 | import org.elasticsearch.action.admin.indices.refresh.RefreshRequest
 32 | import org.elasticsearch.action.get.GetResponse
 33 | import org.elasticsearch.client.transport.TransportClient
 34 | import org.elasticsearch.common.settings.{Settings, ImmutableSettings}
 35 | import org.joda.time.DateTime
 36 | import org.json4s.jackson.JsonMethods._
 37 | import org.elasticsearch.spark._
 38 | import org.elasticsearch.node.NodeBuilder._
 39 | 
 40 | import scala.collection.immutable
 41 | import scala.collection.parallel.mutable
 42 | 
 43 | /** Elasticsearch notes:
 44 |   * 1) every query clause wil laffect scores unless it has a constant_score and boost: 0
 45 |   * 2) the Spark index writer is fast but must assemble all data for the index before the write occurs
 46 |   * 3) many operations must be followed by a refresh before the action takes effect--sortof like a transaction commit
 47 |   * 4) to use like a DB you must specify that the index of fields are `not_analyzed` so they won't be lowercased,
 48 |   *    stemmed, tokenized, etc. Then the values are literal and must match exactly what is in the query (no analyzer)
 49 |   */
 50 | 
 51 | /** Defines methods to use on Elasticsearch. */
 52 | object esClient {
 53 |   @transient lazy val logger = Logger[this.type]
 54 | 
 55 |   private lazy val client = if (Storage.getConfig("ELASTICSEARCH").nonEmpty)
 56 |       new elasticsearch.StorageClient(Storage.getConfig("ELASTICSEARCH").get).client
 57 |     else
 58 |       throw new IllegalStateException("No Elasticsearch client configuration detected, check your pio-env.sh for" +
 59 |         "proper configuration settings")
 60 | 
 61 |   // wrong way that uses only default settings, which will be a localhost ES sever.
 62 |   //private lazy val client = new elasticsearch.StorageClient(StorageClientConfig()).client
 63 | 
 64 |   /** Delete all data from an instance but do not commit it. Until the "refresh" is done on the index
 65 |     * the changes will not be reflected.
 66 |     * @param indexName will delete all types under this index, types are not used by the UR
 67 |     * @param refresh
 68 |     * @return true if all is well
 69 |     */
 70 |   def deleteIndex(indexName: String, refresh: Boolean = false): Boolean = {
 71 |     //val debug = client.connectedNodes()
 72 |     if (client.admin().indices().exists(new IndicesExistsRequest(indexName)).actionGet().isExists()) {
 73 |       val delete = client.admin().indices().delete(new DeleteIndexRequest(indexName)).actionGet()
 74 |       if (!delete.isAcknowledged) {
 75 |         logger.info(s"Index ${indexName} wasn't deleted, but may have quietly failed.")
 76 |       } else {
 77 |         // now refresh to get it 'committed'
 78 |         // todo: should do this after the new index is created so no index downtime
 79 |         if (refresh) refreshIndex(indexName)
 80 |       }
 81 |       true
 82 |     } else {
 83 |       logger.warn(s"Elasticsearch index: ${indexName} wasn't deleted because it didn't exist. This may be an error.")
 84 |       false
 85 |     }
 86 |   }
 87 | 
 88 |   /** Creates a new empty index in Elasticsearch and initializes mappings for fields that will be used
 89 |     * @param indexName elasticsearch name
 90 |     * @param indexType names the type of index, usually use the item name
 91 |     * @param fieldNames ES field names
 92 |     * @param typeMappings indicates which ES fields are to be not_analyzed without norms
 93 |     * @param refresh should the index be refreshed so the create is committed
 94 |     * @return true if all is well
 95 |     */
 96 |   def createIndex(
 97 |     indexName: String,
 98 |     indexType: String = "items",
 99 |     fieldNames: List[String],
100 |     typeMappings: Option[Map[String, String]] = None,
101 |     refresh: Boolean = false): Boolean = {
102 |     if (!client.admin().indices().exists(new IndicesExistsRequest(indexName)).actionGet().isExists()) {
103 |       var mappings = """
104 |         |{
105 |         |  "properties": {
106 |         """.stripMargin.replace("\n", "")
107 | 
108 |       def mappingsField(t: String) = {
109 |         s"""
110 |         |    : {
111 |         |      "type": "${t}",
112 |         |      "index": "not_analyzed",
113 |         |      "norms" : {
114 |         |        "enabled" : false
115 |         |      }
116 |         |    },
117 |         """.stripMargin.replace("\n", "")
118 |       }
119 | 
120 |       val mappingsTail = """
121 |         |    "id": {
122 |         |      "type": "string",
123 |         |      "index": "not_analyzed",
124 |         |      "norms" : {
125 |         |        "enabled" : false
126 |         |      }
127 |         |    }
128 |         |  }
129 |         |}
130 |       """.stripMargin.replace("\n", "")
131 | 
132 |       fieldNames.foreach { fieldName =>
133 |         if (typeMappings.nonEmpty && typeMappings.get.contains(fieldName))
134 |           mappings += (fieldName + mappingsField(typeMappings.get(fieldName)))
135 |         else // unspecified fields are treated as not_analyzed strings
136 |           mappings += (fieldName + mappingsField("string"))
137 |       }
138 |       mappings += mappingsTail // any other string is not_analyzed
139 | 
140 |       val cir = new CreateIndexRequest(indexName).mapping("items",mappings)
141 |       val create = client.admin().indices().create(cir).actionGet()
142 |       if (!create.isAcknowledged) {
143 |         logger.info(s"Index ${indexName} wasn't created, but may have quietly failed.")
144 |       } else {
145 |         // now refresh to get it 'committed'
146 |         // todo: should do this after the new index is created so no index downtime
147 |         if (refresh) refreshIndex(indexName)
148 |       }
149 |       true
150 |     } else {
151 |       logger.warn(s"Elasticsearch index: ${indexName} wasn't created because it already exists. This may be an error.")
152 |       false
153 |     }
154 |   }
155 | 
156 |   /** Commits any pending changes to the index */
157 |   def refreshIndex(indexName: String): Unit = {
158 |     client.admin().indices().refresh(new RefreshRequest(indexName)).actionGet()
159 |   }
160 | 
161 |   /** Create new index and hot-swap the new after it's indexed and ready to take over, then delete the old */
162 |   def hotSwap(
163 |     alias: String,
164 |     typeName: String = "items",
165 |     indexRDD: RDD[scala.collection.Map[String,Any]],
166 |     fieldNames: List[String],
167 |     typeMappings: Option[Map[String, String]] = None): Unit = {
168 |     // get index for alias, change a char, create new one with new id and index it, swap alias and delete old one
169 |     val aliasMetadata = client.admin().indices().prepareGetAliases(alias).get().getAliases
170 |     val newIndex = alias + "_" + DateTime.now().getMillis.toString
171 |     createIndex(newIndex, typeName, fieldNames, typeMappings)
172 | 
173 |     val newIndexURI = "/" + newIndex + "/" + typeName
174 |     indexRDD.saveToEs(newIndexURI, Map("es.mapping.id" -> "id"))
175 |     //refreshIndex(newIndex)
176 | 
177 |     if (!aliasMetadata.isEmpty
178 |     && aliasMetadata.get(alias) != null
179 |     && aliasMetadata.get(alias).get(0) != null) { // was alias so remove the old one
180 |       //append the DateTime to the alias to create an index name
181 |       val oldIndex = aliasMetadata.get(alias).get(0).getIndexRouting
182 |       client.admin().indices().prepareAliases()
183 |         .removeAlias(oldIndex, alias)
184 |         .addAlias(newIndex, alias)
185 |         .execute().actionGet()
186 |       deleteIndex(oldIndex) // now can safely delete the old one since it's not used
187 |     } else { // todo: could be more than one index with 'alias' so
188 |       // no alias so add one
189 |       //to clean up any indexes that exist with the alias name
190 |       val indices = util.Arrays.asList(client.admin().indices().prepareGetIndex().get().indices()).get(0)
191 |       if (indices.contains(alias)) {
192 |         //refreshIndex(alias)
193 |         deleteIndex(alias) // index named like the new alias so delete it
194 |       }
195 |       // slight downtime, but only for one case of upgrading the UR engine from v0.1.x to v0.2.0+
196 |       client.admin().indices().prepareAliases()
197 |         .addAlias(newIndex, alias)
198 |         .execute().actionGet()
199 |     }
200 |     // clean out any old indexes that were the product of a failed train?
201 |     val indices = util.Arrays.asList(client.admin().indices().prepareGetIndex().get().indices()).get(0)
202 |     indices.map{ index =>
203 |       if (index.contains(alias) && index != newIndex) deleteIndex(index) //clean out any old orphaned indexes
204 |     }
205 | 
206 |   }
207 | 
208 |   /** Performs a search using the JSON query String
209 |     *
210 |     * @param query the JSON query string parable by Elasticsearch
211 |     * @param indexName the index to search
212 |     * @return a [PredictedResults] collection
213 |     */
214 |   def search(query: String, indexName: String): PredictedResult = {
215 |     val sr = client.prepareSearch(indexName).setSource(query).get()
216 | 
217 |     if (!sr.isTimedOut) {
218 |       val recs = sr.getHits.getHits.map( hit => new ItemScore(hit.getId, hit.getScore.toDouble) )
219 |       logger.info(s"Results: ${sr.getHits.getHits.size} retrieved of " +
220 |         s"a possible ${sr.getHits.totalHits()}")
221 |       new PredictedResult(recs)
222 |     } else {
223 |       logger.info(s"No results for query ${parse(query)}")
224 |       new PredictedResult(Array.empty[ItemScore])
225 |     }
226 | 
227 |   }
228 | 
229 |   /** Gets the "source" field of an Elasticsearch document
230 |     *
231 |     * @param indexName index that contains the doc/item
232 |     * @param typeName type name used to construct ES REST URI
233 |     * @param doc for UR the item id
234 |     * @return source [java.util.Map] of field names to any valid field values or null if empty
235 |     */
236 |   def getSource(indexName: String, typeName: String, doc: String): util.Map[String, AnyRef] = {
237 |     client.prepareGet(indexName, typeName, doc)
238 |       .execute()
239 |       .actionGet().getSource
240 |   }
241 | 
242 |   /*
243 |   public Set<String> getIndicesFromAliasName(String aliasName) {
244 | 
245 |     IndicesAdminClient iac = client.admin().indices();
246 |     ImmutableOpenMap<String, List<AliasMetaData>> map = iac.getAliases(new GetAliasesRequest(aliasName))
247 |             .actionGet().getAliases();
248 | 
249 |     final Set<String> allIndices = new HashSet<>();
250 |     map.keysIt().forEachRemaining(allIndices::add);
251 |     return allIndices;
252 | }
253 |    */
254 |   def getIndexName(alias: String): Option[String] = {
255 | 
256 |     val allIndicesMap = client.admin().indices().getAliases(new GetAliasesRequest(alias)).actionGet().getAliases
257 | 
258 |     if (allIndicesMap.size() == 1) { // must be a 1-1 mapping of alias <-> index
259 |       var  indexName: String = ""
260 |       var itr = allIndicesMap.keysIt()
261 |       while ( itr.hasNext )
262 |         indexName = itr.next()
263 |       Some(indexName) // the one index the alias points to
264 |     } else {
265 |       // delete all the indices that are pointed to by the alias, they can't be used
266 |       logger.warn("There is no 1-1 mapping of index to alias so deleting the old indexes that are referenced by the " +
267 |         "alias. This may have been caused by a crashed or stopped `pio train` operation so try running it again.")
268 |       val i = allIndicesMap.keys().toArray.asInstanceOf[Array[String]]
269 |       for ( indexName <- i ){
270 |         deleteIndex(indexName, true)
271 |       }
272 | 
273 |       None // if more than one abort, need to clean up bad aliases
274 |     }
275 |   }
276 | 
277 |   def getRDD(sc: SparkContext, alias: String, typeName: String):
278 |   Option[RDD[(String, collection.Map[String, AnyRef])]] = {
279 |     val index = getIndexName(alias)
280 |     if (index.nonEmpty) { // ensures there is a 1-1 mapping of alias to index
281 |       val indexAsRDD = sc.esRDD(alias + "/" + typeName)
282 |       //val debug = indexAsRDD.count()
283 |       Some(indexAsRDD)
284 |     } else None // error so no index for the alias
285 |   }
286 | }


--------------------------------------------------------------------------------
/src/main/scala/package.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright ActionML, LLC under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *     http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.template
19 | 
20 | import grizzled.slf4j.Logger
21 | import scala.collection.JavaConversions._
22 | import org.apache.mahout.sparkbindings.SparkDistributedContext
23 | import org.apache.mahout.sparkbindings.indexeddataset.IndexedDatasetSpark
24 | import org.apache.mahout.sparkbindings._
25 | import org.apache.spark.rdd.RDD
26 | 
27 | /** Utility conversions for IndexedDatasetSpark */
28 | package object conversions {
29 | 
30 |   implicit class IndexedDatasetConversions(val indexedDataset: IndexedDatasetSpark) {
31 |     def toStringMapRDD(actionName: String): RDD[(String, Map[String, Seq[String]])] = {
32 |       @transient lazy val logger = Logger[this.type]
33 | 
34 |       //val matrix = indexedDataset.matrix.checkpoint()
35 |       val rowIDDictionary = indexedDataset.rowIDs
36 |       implicit val sc = indexedDataset.matrix.context.asInstanceOf[SparkDistributedContext].sc
37 |       val rowIDDictionary_bcast = sc.broadcast(rowIDDictionary)
38 | 
39 |       val columnIDDictionary = indexedDataset.columnIDs
40 |       val columnIDDictionary_bcast = sc.broadcast(columnIDDictionary)
41 | 
42 |       // may want to mapPartition and create bulk updates as a slight optimization
43 |       // creates an RDD of (itemID, Map[correlatorName, list-of-correlator-values])
44 |       indexedDataset.matrix.rdd.map[(String, Map[String, Seq[String]])] { case (rowNum, itemVector) =>
45 | 
46 |         // turn non-zeros into list for sorting
47 |         var itemList = List[(Int, Double)]()
48 |         for (ve <- itemVector.nonZeroes) {
49 |           val v = ve
50 |           itemList = itemList :+(ve.index, ve.get)
51 |         }
52 |         //sort by highest strength value descending(-)
53 |         val vector = itemList.sortBy { elem => -elem._2 }
54 | 
55 |         val itemID = rowIDDictionary_bcast.value.inverse.getOrElse(rowNum, "INVALID_ITEM_ID")
56 |         try {
57 | 
58 |           require(itemID != "INVALID_ITEM_ID", s"Bad row number in  matrix, skipping item ${rowNum}")
59 |           require(vector.nonEmpty, s"No values so skipping item ${rowNum}")
60 | 
61 |           // create a list of element ids
62 |           val values = vector.map { item =>
63 |             columnIDDictionary_bcast.value.inverse.getOrElse(item._1, "") // should always be in the dictionary
64 |           }
65 | 
66 |           (itemID, Map(actionName -> values))
67 | 
68 |         } catch {
69 |           case cce: IllegalArgumentException => //non-fatal, ignore line
70 |             null.asInstanceOf[(String, Map[String, Seq[String]])]
71 |         }
72 | 
73 |       }.filter(_ != null)
74 |     }
75 |   }
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/template.json:
--------------------------------------------------------------------------------
1 | {"pio": {"version": { "min": "0.9.5" }}}
2 | 


--------------------------------------------------------------------------------