├── test
│   ├── __init__.py
│   ├── test_json_to_csv_converter.py
│   ├── test_category_predictor.py
│   ├── test_autopilot.py
│   └── test_weighted_positivity.py
├── review_autopilot
│   ├── __init__.py
│   ├── generate.py
│   └── autopilot.py
├── category_predictor
│   ├── __init__.py
│   ├── predict.py
│   └── category_predictor.py
├── positive_category_words
│   ├── __init__.py
│   ├── simple_global_positivity.py
│   └── weighted_category_positivity.py
├── .gitignore
├── tox.ini
├── .travis.yml
├── setup.py
├── LICENSE.txt
├── README.md
└── json_to_csv_converter.py

/test/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/review_autopilot/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/category_predictor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/positive_category_words/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.swp
3 | *.pyc
4 | *.json
5 | .tox
6 | *.egg-info
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py26,py27
3 | 
4 | [testenv]
5 | deps =
6 |     testify
7 |     mrjob
8 |     unittest2
9 | commands =
10 |     testify -v test
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | os:
3 |   - osx
4 |   - linux
5 | env:
6 |   matrix:
7 |     - TOX_ENV=py26
8 |     - TOX_ENV=py27
9 | install:
10 |   - pip install -e .
11 | script: testify -v test
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Setup for dataset-examples."""
3 | from setuptools import setup, find_packages
4 | 
5 | requires = [
6 |     'mrjob',
7 |     'testify',
8 |     'unittest2',
9 | ]
10 | 
11 | setup(
12 |     name='dataset-examples',
13 |     description='Examples for the Yelp datasets.',
14 |     author='Yelp',
15 |     url='https://github.com/Yelp/dataset-examples',
16 |     packages=find_packages(),
17 |     install_requires=requires,
18 |     tests_require=requires,
19 | )
20 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2011 Yelp
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 | 
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12 | implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
--------------------------------------------------------------------------------
/test/test_json_to_csv_converter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Test the json to csv converter script."""
3 | from unittest2 import TestCase
4 | 
5 | import json_to_csv_converter
6 | 
7 | 
8 | class TestJsonToCsvConverter(TestCase):
9 | 
10 |     """Test the json to csv converter script."""
11 | 
12 |     test_biz = {
13 |         'type': 'business',
14 |         'business_id': 123,
15 |         'hours': {
16 |             'Monday': {
17 |                 'open': "11:30",
18 |                 'close': "21:00",
19 |             },
20 |         },
21 |     }
22 |     test_biz_column_names = frozenset(['type', 'business_id', 'hours.Monday.open', 'hours.Monday.close'])
23 |     test_review = {
24 |         'type': 'review',
25 |         'user_id': 345,
26 |         'votes': {
27 |             'funny': 1,
28 |         },
29 |     }
30 |     test_review_column_names = frozenset(['type', 'user_id', 'votes.funny'])
31 | 
32 |     def test_get_column_names(self):
33 |         """Test that we see the expected column names for the test objects."""
34 |         biz_column_names = set(json_to_csv_converter.get_column_names(self.test_biz))
35 |         self.assertEqual(biz_column_names, self.test_biz_column_names)
36 | 
37 |         review_column_names = set(json_to_csv_converter.get_column_names(self.test_review))
38 |         self.assertEqual(review_column_names, self.test_review_column_names)
39 | 
40 |     def test_get_nested_value(self):
41 |         """Test getting a nested value from a dict given a flat key."""
42 |         # non-nested values
43 |         self.assertEqual(
44 |             json_to_csv_converter.get_nested_value(self.test_review, 'type'),
45 |             'review'
46 |         )
47 |         # nested values
48 |         self.assertEqual(
49 |             json_to_csv_converter.get_nested_value(self.test_biz, 'hours.Monday.open'),
50 |             '11:30'
51 |         )
52 |         # unknown values
53 |         self.assertIsNone(
54 |             json_to_csv_converter.get_nested_value(self.test_review, 'this.is.not.in.the.review'),
55 |         )
56 | 
57 | 
--------------------------------------------------------------------------------
/positive_category_words/simple_global_positivity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
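14 | 
15 | # Output format: (average stars * 100, total count), word - chosen so the
16 | # unix sort command can rank words by positivity (see positivity_reducer).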
17 | 
18 | import re
19 | 
20 | from mrjob.job import MRJob
21 | from mrjob.protocol import JSONValueProtocol
22 | 
23 | MINIMUM_OCCURENCES = 1000
24 | 
25 | def avg_and_total(iterable):
26 |     """Compute the average and the total of a numeric iterable."""
27 |     items = 0
28 |     total = 0.0
29 | 
30 |     for item in iterable:
31 |         total += item
32 |         items += 1
33 | 
34 |     return total / items, total
35 | 
36 | class PositiveWords(MRJob):
37 |     """Find the most positive words in the dataset."""
38 | 
39 |     # The input is the dataset - interpret each line as a single json
40 |     # value (the key will be None)
41 |     INPUT_PROTOCOL = JSONValueProtocol
42 | 
43 |     def review_mapper(self, _, data):
44 |         """Walk over reviews, emitting each word and its rating."""
45 |         if data['type'] != 'review':
46 |             return
47 | 
48 |         # normalize words by lowercasing and dropping non-alpha
49 |         # characters
50 |         norm = lambda word: re.sub('[^a-z]', '', word.lower())
51 |         # only include a word once per-review (which de-emphasizes
52 |         # proper nouns)
53 |         words = set(norm(word) for word in data['text'].split())
54 | 
55 |         for word in words:
56 |             yield word, data['stars']
57 | 
58 |     def positivity_reducer(self, word, ratings):
59 |         """Emit average star rating, word in a format we can easily
60 |         sort with the unix sort command:
61 |         [star average * 100, total count], word.
62 |         """
63 |         avg, total = avg_and_total(ratings)
64 | 
65 |         if total < MINIMUM_OCCURENCES:
66 |             return
67 | 
68 |         yield (int(avg * 100), total), word
69 | 
70 |     def steps(self):
71 |         return [self.mr(),  # Split apart the dataset into multiple
72 |                 # chunks. In regular hadoop-land you could change the
73 |                 # splitter. This is normally < 30 seconds of work.
74 |                 self.mr(self.review_mapper, self.positivity_reducer)]
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     PositiveWords().run()
79 | 
--------------------------------------------------------------------------------
/test/test_category_predictor.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import json
4 | import unittest
5 | from unittest import TestCase
6 | from StringIO import StringIO
7 | 
8 | from category_predictor.category_predictor import CategoryPredictor
9 | 
10 | # These templates can be used to make a json string very easily.
11 | REVIEW_TEMPLATE = '{"type":"review", "stars":3, "text":"%s",\
12 | "business_id":"%s"}\n'
13 | BUSINESS_TEMPLATE = '{"type":"business", "categories":["%s"], \
14 | "business_id":"%s"}\n'
15 | LONG_TEXT = "Hello world" * 101
16 | TEXT = u"Hello"
17 | BIZ_ID = u"Yelp"
18 | CATEGORY = u'Company'
19 | 
20 | 
21 | class TestCategoryPredictor(TestCase):
22 | 
23 |     def test_smoke(self):
24 |         """Does a complete run with mock data"""
25 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
26 |         review = REVIEW_TEMPLATE % (LONG_TEXT, BIZ_ID)
27 |         total_input = business + review
28 |         static_stdin = StringIO(total_input)
29 | 
30 |         job = CategoryPredictor(['-r', 'inline', '--no-conf', '-'])
31 |         job.sandbox(stdin=static_stdin)
32 | 
33 |         results = []
34 |         with job.make_runner() as runner:
35 |             runner.run()
36 |             for line in runner.stream_output():
37 |                 key, value = job.parse_output_line(line)
38 |                 results.append(value)
39 | 
40 |         # Results should be the probability of that category being chosen.
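41 |         # (With only one business in one category in the mock input, the
42 |         # model can only ever pick that category, so it gets probability 1.)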
43 |         result = {CATEGORY: 1}
44 |         self.assertEqual(results[0], result)
45 | 
46 |     def test_review_category(self):
47 |         """Tests that review_category_mapper emits the expected pairs"""
48 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
49 |         review = REVIEW_TEMPLATE % (TEXT, BIZ_ID)
50 |         job = CategoryPredictor()
51 |         review_results = list(job.review_category_mapper(None, json.loads(review)))
52 |         biz_results = list(job.review_category_mapper(None, json.loads(business)))
53 |         self.assertEqual(review_results, [(BIZ_ID, ('review', TEXT))])
54 |         self.assertEqual(biz_results, [(BIZ_ID, ('categories', [CATEGORY]))])
55 | 
56 |     def test_categories_to_reviews(self):
57 |         """Tests that add_categories_to_reviews_reducer emits the expected pairs"""
58 |         category = [('categories', [CATEGORY]), ('review', TEXT)]
59 | 
60 |         job = CategoryPredictor()
61 |         category_results = list(job.add_categories_to_reviews_reducer(BIZ_ID, category))
62 |         result = [('all', {CATEGORY: 1}), (CATEGORY, TEXT)]
63 |         self.assertEqual(category_results, result)
64 | 
65 |     def test_tokenize_reviews(self):
66 |         """Tests that tokenize_reviews_mapper passes 'all' counts through unchanged"""
67 |         review = {CATEGORY: 1}
68 | 
69 |         job = CategoryPredictor()
70 |         token_results = list(job.tokenize_reviews_mapper('all', review))
71 |         result = [('all', {CATEGORY: 1})]
72 |         self.assertEqual(token_results, result)
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     unittest.main()
77 | 
--------------------------------------------------------------------------------
/test/test_autopilot.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import unittest
4 | from unittest import TestCase
5 | from StringIO import StringIO
6 | 
7 | from review_autopilot.autopilot import ReviewAutoPilot
8 | 
9 | # These are used to create stdin string data.
10 | CATEGORY = 'Company'
11 | REVIEW_TEMPLATE = '{"type":"review", "stars":3, "text":"%s",\
12 | "business_id":"%s"}\n'
13 | BUSINESS_TEMPLATE = '{"type":"business", "categories": "%s",\
14 | "business_id":"%s"}\n'
15 | TEXT = 'Hello!'
16 | ID = 128411
17 | BIZ = 'Yelp'
18 | # This is used to pass around dict data, which is slightly different from
19 | # the string data above.
20 | DATA = [
21 |     {'type': 'business', 'business_id': ID, 'data': 'Info here'},
22 |     {'type': 'review', 'business_id': ID, 'text': TEXT}
23 | ]
24 | 
25 | 
26 | class TestReviewAutoPilotCase(TestCase):
27 | 
28 |     def test_business_mapper(self):
29 |         """Tests the individual mappers of ReviewAutoPilot"""
30 |         job = ReviewAutoPilot()
31 |         biz_results = list(job.business_join_mapper(None, DATA[0]))
32 |         review_results = list(job.business_join_mapper(None, DATA[1]))
33 | 
34 |         biz_after_results = [(ID, ('business', DATA[0]))]
35 |         review_after_results = [(ID, ('review', DATA[1]['text']))]
36 | 
37 |         self.assertEqual(biz_results, biz_after_results)
38 |         self.assertEqual(review_results, review_after_results)
39 | 
40 |     def test_smoke(self):
41 |         """Does a complete run locally with a small, static dataset, since a
42 |         full run takes too long."""
43 | 
44 |         # Random data to feed into the markov model.
45 |         # I use long runs of foo to get through the threshold filters.
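46 |         # (autopilot.py only keeps a transition when its pair count is over
47 |         # MINIMUM_PAIR_COUNT = 5 and it makes up more than
48 |         # MINIMUM_FOLLOW_PERCENTAGE of the outgoing transitions.)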
49 |         text = ('foo bar foo baz foo car foo daz ' + ('foo ' * 10) + 'foofoo yelp'
50 |                 'foo yar foo foo bar bar dar')
51 |         single_review = REVIEW_TEMPLATE % (text, BIZ)
52 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ)
53 |         static_stdin = StringIO(single_review + business)
54 | 
55 |         job = ReviewAutoPilot(['-r', 'inline', '--no-conf', '-'])
56 |         job.sandbox(stdin=static_stdin)
57 | 
58 |         results = []
59 |         with job.make_runner() as runner:
60 |             runner.run()
61 |             for line in runner.stream_output():
62 |                 key, value = job.parse_output_line(line)
63 |                 results.append(value)
64 | 
65 |         # Normal output to compare
66 |         result = {'foo': 0.99009900990099009, '': 0.0099009900990099011}
67 |         self.assertEqual(results[0], result)
68 | 
69 |     def test_categories_reducer(self):
70 |         """Tests join_reviews_with_categories_reducer with some static
71 |         data."""
72 |         job = ReviewAutoPilot()
73 |         VALUES = (('business', {'categories': CATEGORY}), ('review', TEXT))
74 |         category_results = list(job.join_reviews_with_categories_reducer(BIZ, VALUES))
75 |         results = [(CATEGORY, TEXT)]
76 |         self.assertEqual(category_results, results)
77 | 
78 |     def test_split_mapper(self):
79 |         """Tests the review_split_mapper in autopilot"""
80 |         job = ReviewAutoPilot()
81 |         TEST_RETURN = (('hello', 'C'), ('', 1))
82 |         self.assertEqual(job.review_split_mapper(CATEGORY, TEXT).next(),
83 |                          TEST_RETURN)
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     unittest.main()
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/Yelp/dataset-examples.svg)](https://travis-ci.org/Yelp/dataset-examples)
2 | 
3 | Yelp's Academic Dataset Examples
4 | ================================
5 | 
6 | We're providing four examples for use with the datasets available at [http://www.yelp.com/dataset_challenge](http://www.yelp.com/dataset_challenge) and
7 | [http://www.yelp.com/academic_dataset](http://www.yelp.com/academic_dataset). They all depend on
8 | [mrjob](https://github.com/Yelp/mrjob) and Python 2.6 or 2.7.
9 | 
10 | To install all dependencies: `$ pip install -e .`
11 | 
12 | To test: `$ tox`
13 | 
14 | Samples
15 | ------------
16 | 
17 | `json_to_csv_converter`: Convert the dataset from json format to csv format.
18 | 
19 | ```bash
20 | $ python json_to_csv_converter.py yelp_academic_dataset.json # Creates yelp_academic_dataset.csv
21 | ```
22 | 
23 | `category_predictor`: Given some text, predict likely categories. For example:
24 | 
25 | ```bash
26 | $ python category_predictor/category_predictor.py yelp_academic_dataset.json > category_predictor.json
27 | $ python category_predictor/predict.py category_predictor.json "bacon donut"
28 | Category: "Food" - 82.66% chance
29 | Category: "Restaurants" - 16.99% chance
30 | Category: "Donuts" - 0.12% chance
31 | Category: "Basque" - 0.02% chance
32 | Category: "Spanish" - 0.02% chance
33 | ```
34 | 
35 | `review_autopilot`: Use a markov chain to finish a review. For
36 | example:
37 | 
38 | ```bash
39 | $ python review_autopilot/generate.py autopilot.json Food 'They have the best'
40 | They have the best coffee is good food was delicious cookies and
41 | a few friends i think they make this
42 | ```
43 | 
44 | `positive_category_words`: See the Yelp engineering blog for
45 | details about this example. In short, it generates positivity
46 | scores for words either globally or per-category.
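47 | 
48 | For example, to rank words globally (assuming, as above, that the dataset
49 | file is named `yelp_academic_dataset.json`; the output format is designed
50 | to sort well with the unix sort command):
51 | 
52 | ```bash
53 | $ python positive_category_words/simple_global_positivity.py yelp_academic_dataset.json | sort
54 | ```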
55 | 
56 | Basic set-up
57 | ------------
58 | 
59 | You can use any of mrjob's runners with these examples, but we'll focus
60 | on the local and EMR runners (if you have access to your own Hadoop
61 | cluster, check out the mrjob docs for instructions on how to set this
62 | up).
63 | 
64 | Local mode couldn't be easier:
65 | 
66 |     # this step will take a VERY long time
67 |     python review_autopilot/autopilot.py yelp_academic_dataset.json > autopilot.json
68 | 
69 |     # this should be instant
70 |     python review_autopilot/generate.py autopilot.json Food 'They have the best'
71 |     > hot dogs ever
72 | 
73 | Waiting a long time is kind of lame, no? Let's try the same thing
74 | using EMR.
75 | 
76 | First off, you'll need an `aws_access_key_id` and an
77 | `aws_secret_access_key`. You can get these from the AWS console
78 | (you'll need to sign up for an AWS developer account and enable S3 /
79 | EMR usage, if you haven't already).
80 | 
81 | Create a simple mrjob.conf file, like this:
82 | 
83 |     runners:
84 |       emr:
85 |         aws_access_key_id: YOUR_ACCESS_KEY
86 |         aws_secret_access_key: YOUR_SECRET_KEY
87 | 
88 | Now that that's done, you can run the autopilot script on EMR.
89 | 
90 |     # WARNING: this will cost you roughly $2 and take 10-20 minutes
91 |     python review_autopilot/autopilot.py --num-ec2-instances 10 --ec2-instance-type c1.medium -v --runner emr yelp_academic_dataset.json
92 | 
93 | 
94 | You can save money (and time) by re-using jobflows and uploading the
95 | dataset to a personal, private S3 bucket - check out the mrjob docs for
96 | instructions on doing this.
97 | 
--------------------------------------------------------------------------------
/test/test_weighted_positivity.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import json
4 | import unittest
5 | from unittest import TestCase
6 | from StringIO import StringIO
7 | import testify as T
8 | 
9 | from positive_category_words.weighted_category_positivity import WeightedPositiveWords
10 | 
11 | 
12 | CATEGORY = u'Company'
13 | REVIEW_TEMPLATE = ('{"type":"review", "stars":3, "text":"%s",'
14 |                    '"business_id":"%s"}\n')
15 | BUSINESS_TEMPLATE = ('{"type":"business", "categories":["%s"], '
16 |                      '"business_id":"%s"}\n')
17 | TEXT = u"Hello world"
18 | BIZ_NAME = u'Qdoba'
19 | 
20 | 
21 | class TestWeightedPositiveWords(TestCase):
22 | 
23 |     def test_smoke(self):
24 |         """Does a full run of weighted positive words"""
25 | 
26 |         # Need 3 mock businesses to test
27 |         business1 = BUSINESS_TEMPLATE % (CATEGORY, "Yelp")
28 |         business2 = BUSINESS_TEMPLATE % (CATEGORY, "Target")
29 |         business3 = BUSINESS_TEMPLATE % (CATEGORY, "Walmart")
30 |         # Need more than 1 review for weighted threshold
31 |         review1 = REVIEW_TEMPLATE % (TEXT, "Yelp")
32 |         review2 = REVIEW_TEMPLATE % (TEXT, "Target")
33 |         review3 = REVIEW_TEMPLATE % (TEXT, "Walmart")
34 | 
35 |         # Need a star-weighted total of at least 50 (MINIMUM_OCCURENCES), so multiply the first review by 20
36 |         total_input = (business1 + business2 + business3
37 |                        + (review1 * 20) + review2 + review3)
38 |         static_stdin = StringIO(total_input)
39 | 
40 |         job = WeightedPositiveWords(['-r', 'inline', '--no-conf', '-'])
41 |         job.sandbox(stdin=static_stdin)
42 | 
43 |         results = []
44 |         with job.make_runner() as runner:
45 |             runner.run()
46 |             for line in runner.stream_output():
47 |                 key, value = job.parse_output_line(line)
48 |                 results.append(value)
49 |         end_result = [[CATEGORY, 66.0, 'hello'], [CATEGORY, 66.0, 'world']]
50 |         self.assertEqual(results, end_result)
51 | 
52 |     def test_review_category(self):
53 |         """Test the review_category_mapper function with a mock input"""
54 | 
55 |         review = REVIEW_TEMPLATE % (TEXT, BIZ_NAME)
56 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_NAME)
57 | 
58 |         job = WeightedPositiveWords()
59 |         review_results = list(job.review_category_mapper(None, json.loads(review)))
60 |         biz_results = list(job.review_category_mapper(None, json.loads(business)))
61 |         review_after_results = [(BIZ_NAME, ('review', (TEXT, 3)))]
62 |         biz_after_results = [(BIZ_NAME, ('categories', [CATEGORY]))]
63 |         self.assertEqual(review_results, review_after_results)
64 |         self.assertEqual(biz_results, biz_after_results)
65 | 
66 | 
67 |     def test_category_join(self):
68 |         """Test the category_join_reducer function with the same results
69 |         as above. These tests isolate which function an error comes from
70 |         if any of the functions in the mrjob change.
71 |         """
72 |         review_or_categories = (('review', (TEXT, 3)), ('categories', [CATEGORY]))
73 | 
74 |         job = WeightedPositiveWords()
75 |         join_results = list(job.category_join_reducer(BIZ_NAME, review_or_categories))
76 |         results = [(CATEGORY, (BIZ_NAME, (TEXT, 3)))]
77 |         self.assertEqual(join_results, results)
78 | 
79 |     def test_review_mapper(self):
80 |         """Test that the review_mapper function produces the correct
81 |         calculated output for a mock input.
82 |         """
83 |         biz_review_positivity = (BIZ_NAME, (TEXT, 3))
84 | 
85 |         job = WeightedPositiveWords()
86 |         review_results = list(job.review_mapper(CATEGORY, biz_review_positivity))
87 |         results = [((CATEGORY, u'world'), (BIZ_NAME, 3)), ((CATEGORY, u'hello'), (BIZ_NAME, 3))]
88 |         T.assert_sorted_equal(review_results, results)
89 | 
90 | if __name__ == '__main__':
91 |     unittest.main()
92 | 
--------------------------------------------------------------------------------
/review_autopilot/generate.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from __future__ import with_statement
16 | 
17 | import random
18 | import sys
19 | 
20 | import autopilot
21 | 
22 | class ReviewMarkovGenerator(object):
23 |     """Generate the remainder of a review, given a category and some
24 |     start text.
25 |     """
26 | 
27 |     @classmethod
28 |     def load_data(cls, input_file):
29 |         """Read the output of the ReviewAutoPilot mrjob, returning a
30 |         transition distribution. The transition distribution is a
31 |         dictionary keyed by category. Each category maps to another
32 |         dictionary keyed by start word, and each start word maps to a
33 |         dictionary giving the probability of transitioning to each
34 |         possible next word.
35 | 
36 |         Here's an example:
37 | 
38 |             category_transitions = {'Food': {'hot': {'dog': 1.0}}}
39 | 
40 |         This means that for the category Food, the word 'hot' has a
41 |         100% probability of being followed by the word 'dog'.
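42 | 
43 |         The special next word '' (the empty string) marks the end of a
44 |         review (it's the token word_pairs in autopilot.py emits last).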
42 | """ 43 | job = autopilot.ReviewAutoPilot() 44 | 45 | category_transitions = {} 46 | 47 | with open(input_file) as src: 48 | for line in src: 49 | (category, start), transitions = job.parse_output_line(line) 50 | 51 | category_transitions.setdefault(category, {})[start] = transitions 52 | 53 | return category_transitions 54 | 55 | @classmethod 56 | def sample(cls, distribution): 57 | """Sample from a dictionary containing a probability 58 | distribution. 59 | """ 60 | guess = random.random() 61 | 62 | for word, prob in distribution.iteritems(): 63 | if guess <= prob: 64 | return word 65 | 66 | guess -= prob 67 | 68 | # random.random() returns a value between 0 and 1. The values 69 | # of distribution are assumed to sum to 1 (since distribution 70 | # is a probability distribution), so random.random() - 71 | # sum(values) == 0. If this is not the case, then distribution 72 | # is not a valid distribution. 73 | assert False, "distribution is not a valid probability distribution!" 74 | 75 | def __init__(self, input_file): 76 | """input_file: the output of the ReviewAutopilot job.""" 77 | self.category_transitions = self.load_data(input_file) 78 | 79 | def complete(self, category, text): 80 | """Complete some text.""" 81 | if category not in self.category_transitions: 82 | raise KeyError('Unknown category (invalid or not enough data): %s' % category) 83 | 84 | words = list(autopilot.words(text)) 85 | 86 | last_word = words[-1] 87 | transitions = self.category_transitions[category] 88 | while True: 89 | next_word = self.sample(transitions[last_word]) 90 | 91 | # the end-of-review token is None, which is JSON null, 92 | # which is coerced to the string "null" (since json 93 | # objects can only have strings as keys) 94 | if next_word == "": 95 | break 96 | 97 | text += ' ' + next_word 98 | last_word = next_word 99 | 100 | return text 101 | 102 | 103 | if __name__ == "__main__": 104 | input_file = sys.argv[1] 105 | category = sys.argv[2] 106 | text = sys.argv[3] 107 | 108 | print ReviewMarkovGenerator(input_file).complete(category, text) 109 | -------------------------------------------------------------------------------- /json_to_csv_converter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Convert the Yelp Dataset Challenge dataset from json format to csv. 3 | 4 | For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge 5 | 6 | """ 7 | import argparse 8 | import collections 9 | import csv 10 | import simplejson as json 11 | 12 | 13 | def read_and_write_file(json_file_path, csv_file_path, column_names): 14 | """Read in the json dataset file and write it out to a csv file, given the column names.""" 15 | with open(csv_file_path, 'wb+') as fout: 16 | csv_file = csv.writer(fout) 17 | csv_file.writerow(list(column_names)) 18 | with open(json_file_path) as fin: 19 | for line in fin: 20 | line_contents = json.loads(line) 21 | csv_file.writerow(get_row(line_contents, column_names)) 22 | 23 | def get_superset_of_column_names_from_file(json_file_path): 24 | """Read in the json dataset file and return the superset of column names.""" 25 | column_names = set() 26 | with open(json_file_path) as fin: 27 | for line in fin: 28 | line_contents = json.loads(line) 29 | column_names.update( 30 | set(get_column_names(line_contents).keys()) 31 | ) 32 | return column_names 33 | 34 | def get_column_names(line_contents, parent_key=''): 35 | """Return a list of flattened key names given a dict. 
36 | 
37 |     Example:
38 | 
39 |         line_contents = {
40 |             'a': {
41 |                 'b': 2,
42 |                 'c': 3,
43 |             },
44 |         }
45 | 
46 |     will return: {'a.b': 2, 'a.c': 3}
47 | 
48 |     The keys will be the column names for the eventual csv file.
49 | 
50 |     """
51 |     column_names = []
52 |     for k, v in line_contents.iteritems():
53 |         column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
54 |         if isinstance(v, collections.MutableMapping):
55 |             column_names.extend(
56 |                 get_column_names(v, column_name).items()
57 |             )
58 |         else:
59 |             column_names.append((column_name, v))
60 |     return dict(column_names)
61 | 
62 | def get_nested_value(d, key):
63 |     """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.
64 | 
65 |     Example:
66 | 
67 |         d = {
68 |             'a': {
69 |                 'b': 2,
70 |                 'c': 3,
71 |             },
72 |         }
73 |         key = 'a.b'
74 | 
75 |     will return: 2
76 | 
77 |     """
78 |     if '.' not in key:
79 |         if key not in d:
80 |             return None
81 |         return d[key]
82 |     base_key, sub_key = key.split('.', 1)
83 |     if base_key not in d:
84 |         return None
85 |     sub_dict = d[base_key]
86 |     return get_nested_value(sub_dict, sub_key)
87 | 
88 | def get_row(line_contents, column_names):
89 |     """Return a csv compatible row given column names and a dict."""
90 |     row = []
91 |     for column_name in column_names:
92 |         line_value = get_nested_value(
93 |             line_contents,
94 |             column_name,
95 |         )
96 |         if isinstance(line_value, unicode):
97 |             row.append('{0}'.format(line_value.encode('utf-8')))
98 |         elif line_value is not None:
99 |             row.append('{0}'.format(line_value))
100 |         else:
101 |             row.append('')
102 |     return row
103 | 
104 | if __name__ == '__main__':
105 |     """Convert a yelp dataset file from json to csv."""
106 | 
107 |     parser = argparse.ArgumentParser(
108 |         description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
109 |     )
110 | 
111 |     parser.add_argument(
112 |         'json_file',
113 |         type=str,
114 |         help='The json file to convert.',
115 |     )
116 | 
117 |     args = parser.parse_args()
118 | 
119 |     json_file = args.json_file
120 |     csv_file = '{0}.csv'.format(json_file.split('.json')[0])
121 | 
122 |     column_names = get_superset_of_column_names_from_file(json_file)
123 |     read_and_write_file(json_file, csv_file, column_names)
124 | 
--------------------------------------------------------------------------------
/category_predictor/predict.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Use the output from the CategoryPredictor MRJob to predict the
16 | category of text. This uses a simple naive-bayes model - see
17 | http://en.wikipedia.org/wiki/Naive_Bayes_classifier for more details.
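18 | 
19 | Usage: python category_predictor/predict.py <category_predictor output file> <some text>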
18 | """ 19 | 20 | from __future__ import with_statement 21 | 22 | import math 23 | import sys 24 | 25 | import category_predictor 26 | 27 | class ReviewCategoryClassifier(object): 28 | """Predict categories for text using a simple naive-bayes classifier.""" 29 | 30 | @classmethod 31 | def load_data(cls, input_file): 32 | """Read the output of the CategoryPredictor mrjob, returning 33 | total category counts (count of # of reviews for each 34 | category), and counts of words for each category. 35 | """ 36 | 37 | job = category_predictor.CategoryPredictor() 38 | 39 | category_counts = None 40 | word_counts = {} 41 | 42 | with open(input_file) as src: 43 | for line in src: 44 | category, counts = job.parse_output_line(line) 45 | 46 | if category == 'all': 47 | category_counts = counts 48 | else: 49 | word_counts[category] = counts 50 | 51 | return category_counts, word_counts 52 | 53 | @classmethod 54 | def normalize_counts(cls, counts): 55 | """Convert a dictionary of counts into a log-probability 56 | distribution. 57 | """ 58 | total = sum(counts.itervalues()) 59 | lg_total = math.log(total) 60 | 61 | return dict((key, math.log(cnt) - lg_total) for key, cnt in counts.iteritems()) 62 | 63 | def __init__(self, input_file): 64 | """input_file: the output of the CategoryPredictor job.""" 65 | category_counts, word_counts = self.load_data(input_file) 66 | 67 | self.word_given_cat_prob = {} 68 | for cat, counts in word_counts.iteritems(): 69 | self.word_given_cat_prob[cat] = self.normalize_counts(counts) 70 | 71 | # filter out categories which have no words 72 | seen_categories = set(word_counts) 73 | seen_category_counts = dict((cat, count) for cat, count in category_counts.iteritems() \ 74 | if cat in seen_categories) 75 | self.category_prob = self.normalize_counts(seen_category_counts) 76 | 77 | def classify(self, text): 78 | """Classify some text using the result of the 79 | CategoryPredictor MRJob. We use a basic naive-bayes model, 80 | eg, argmax_category p(category) * p(words | category) == 81 | p(category) * pi_{i \in words} p(word_i | category). 82 | 83 | p(category) is stored in self.category_prob, p(word | category 84 | is in self.word_given_cat_prob. 85 | """ 86 | # start with prob(category) 87 | lg_scores = self.category_prob.copy() 88 | 89 | # then multiply in the individual word probabilities 90 | # NOTE: we're actually adding here, but that's because our 91 | # distributions are made up of log probabilities, which are 92 | # more accurate for small probabilities. See 93 | # http://en.wikipedia.org/wiki/Log_probability for more 94 | # details. 
100 |         for word in category_predictor.words(text):
101 |             for cat in lg_scores:
102 |                 cat_probs = self.word_given_cat_prob[cat]
103 | 
104 |                 if word in cat_probs:
105 |                     lg_scores[cat] += cat_probs[word]
106 |                 else:
107 |                     lg_scores[cat] += cat_probs['UNK']
108 | 
109 |         # convert scores to a non-log value
110 |         scores = dict((cat, math.exp(score)) for cat, score in lg_scores.iteritems())
111 | 
112 |         # normalize the scores again - this isn't strictly necessary,
113 |         # but it's nice to report probabilities with our guesses
114 |         total = sum(scores.itervalues())
115 |         return dict((cat, prob / total) for cat, prob in scores.iteritems())
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     input_file = sys.argv[1]
120 |     text = sys.argv[2]
121 | 
122 |     guesses = ReviewCategoryClassifier(input_file).classify(text)
123 | 
124 |     best_guesses = sorted(guesses.iteritems(), key=lambda (_, prob): prob, reverse=True)[:5]
125 | 
126 |     for guess, prob in best_guesses:
127 |         print 'Category: "%s" - %.2f%% chance' % (guess, prob * 100)
128 | 
--------------------------------------------------------------------------------
/positive_category_words/weighted_category_positivity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import re
16 | 
17 | from mrjob.job import MRJob
18 | from mrjob.protocol import JSONValueProtocol
19 | 
20 | def avg_and_total(iterable):
21 |     """Compute the average and the total of a numeric iterable."""
22 |     items = 0
23 |     total = 0.0
24 | 
25 |     for item in iterable:
26 |         total += item
27 |         items += 1
28 | 
29 |     return total / items, total
30 | 
31 | # Considerably lower than for the simple global script, since category
32 | # data is much more sparse
33 | MINIMUM_OCCURENCES = 50
34 | 
35 | # Require reviews from AT LEAST this many distinct businesses before
36 | # we include a word (prevents very popular restaurant names from
37 | # showing up in the list)
38 | MINIMUM_BUSINESSES = 3
39 | 
40 | class WeightedPositiveWords(MRJob):
41 |     """Find the most positive words in the dataset."""
42 | 
43 |     # The input is the dataset - interpret each line as a single json
44 |     # value (the key will be None)
45 |     INPUT_PROTOCOL = JSONValueProtocol
46 | 
47 |     def review_category_mapper(self, _, data):
48 |         """Walk over reviews and businesses, emitting business_id-keyed pairs."""
49 |         if data['type'] == 'review':
50 |             yield data['business_id'], ('review', (data['text'], data['stars']))
51 | 
52 |         elif data['type'] == 'business':
53 |             # skip businesses with no categories
54 |             if data['categories']:
55 |                 yield data['business_id'], ('categories', data['categories'])
56 | 
57 |     def category_join_reducer(self, business_id, reviews_or_categories):
58 |         """Take in business_id, ((review text and rating) or category information), emit
59 |         category, (biz_id, (review, rating)).
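60 |         This is a reduce-side join on business_id: a business's reviews
61 |         and its category list meet in the same reducer call.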
60 | """ 61 | categories = None 62 | reviews = [] 63 | 64 | for data_type, data in reviews_or_categories: 65 | if data_type == 'review': 66 | reviews.append(data) 67 | else: 68 | categories = data 69 | 70 | # no categories found, skip this 71 | if not categories: 72 | return 73 | 74 | for category in categories: 75 | for review_positivity in reviews: 76 | yield category, (business_id, review_positivity) 77 | 78 | def review_mapper(self, category, biz_review_positivity): 79 | """Take in category, (biz_id, (review, rating)) and split the 80 | review into individual unique words. Emit 81 | (category, word), (biz_id, rating), which will then be used to 82 | gather info about each category / word pair. 83 | """ 84 | biz_id, (review, positivity) = biz_review_positivity 85 | 86 | # normalize words by lowercasing and dropping non-alpha 87 | # characters 88 | norm = lambda word: re.sub('[^a-z]', '', word.lower()) 89 | # only include a word once per-review (which de-emphasizes 90 | # proper nouns) 91 | words = set(norm(word) for word in review.split()) 92 | 93 | for word in words: 94 | yield (category, word), (biz_id, positivity) 95 | 96 | def positivity_reducer(self, category_word, biz_positivities): 97 | """Read (category, word), (biz_id, positivity), and compute 98 | the average positivity for the category-word pair. Skip words 99 | that don't occur frequently enough or for not enough unique 100 | businesses. 101 | 102 | Emits rating, (category, # reviews with word, word). 103 | """ 104 | 105 | category, word = category_word 106 | 107 | businesses = set() 108 | positivities = [] 109 | for biz_id, positivity in biz_positivities: 110 | businesses.add(biz_id) 111 | positivities.append(positivity) 112 | 113 | # don't include words that only show up for a few businesses 114 | if len(businesses) < MINIMUM_BUSINESSES: 115 | return 116 | 117 | avg, total = avg_and_total(positivities) 118 | 119 | if total < MINIMUM_OCCURENCES: 120 | return 121 | 122 | yield int(avg * 100), (category, total, word) 123 | 124 | def steps(self): 125 | return [ self.mr(self.review_category_mapper, self.category_join_reducer), 126 | self.mr(self.review_mapper, self.positivity_reducer)] 127 | 128 | 129 | if __name__ == "__main__": 130 | WeightedPositiveWords().run() 131 | -------------------------------------------------------------------------------- /review_autopilot/autopilot.py: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Yelp and Contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Gather the data necessary to generate reviews using a simple markov 16 | model (see http://en.wikipedia.org/wiki/Markov_chain for more 17 | details). We gather word-next word counts for each category, 18 | eliminating rare pairs. 19 | """ 20 | 21 | import re 22 | 23 | from mrjob.job import MRJob 24 | from mrjob.protocol import JSONValueProtocol 25 | 26 | # Chance that the review will end after any given word. 
29 | END_OF_REVIEW_RATE = 0.01
30 | 
31 | MINIMUM_PAIR_COUNT = 5
32 | MINIMUM_FOLLOW_PERCENTAGE = 0.01
33 | 
34 | def words(text):
35 |     """An iterator over tokens (words) in text. Replace this with a
36 |     stemmer or other smarter logic.
37 |     """
38 | 
39 |     for word in text.split():
40 |         # normalize words by lowercasing and dropping non-alpha
41 |         # characters
42 |         normed = re.sub('[^a-z]', '', word.lower())
43 | 
44 |         if not normed:
45 |             continue
46 | 
47 |         yield normed
48 | 
49 | def word_pairs(text):
50 |     """Given some text, yield out pairs of words (e.g. bigrams)."""
51 |     last_word = None
52 | 
53 |     for word in words(text):
54 |         if last_word is not None:
55 |             yield last_word, word
56 |         last_word = word
57 | 
58 |     yield last_word, ""
59 | 
60 | class ReviewAutoPilot(MRJob):
61 |     """Very simple markov model for reviews, parameterized on business category."""
62 | 
63 |     INPUT_PROTOCOL = JSONValueProtocol
64 | 
65 |     def business_join_mapper(self, _, data):
66 |         """Walk through reviews and businesses, yielding out the raw
67 |         data.
68 |         """
69 |         if data['type'] == 'business':
70 |             yield data['business_id'], ('business', data)
71 |         elif data['type'] == 'review':
72 |             yield data['business_id'], ('review', data['text'])
73 | 
74 |     def join_reviews_with_categories_reducer(self, business_id, reviews_or_biz):
75 |         """Join reviews with the categories from the associated
76 |         business.
77 |         """
78 |         categories = None
79 |         reviews = []
80 | 
81 |         for data_type, data in reviews_or_biz:
82 |             if data_type == 'business':
83 |                 categories = data['categories']
84 |             else:
85 |                 reviews.append(data)
86 | 
87 |         # don't bother with businesses that have no categories
88 |         if not categories:
89 |             return
90 | 
91 |         for review in reviews:
92 |             yield categories, review
93 | 
94 |     def review_split_mapper(self, categories, review):
95 |         """Split a review into pairs of words and yield out
96 |         (start word, category), (follow word, count), combining
97 |         repeated pairs into a single emission.
98 |         """
99 |         pair_counts = {}
100 | 
101 |         for pair in word_pairs(review):
102 |             pair_counts[pair] = pair_counts.get(pair, 0) + 1
103 | 
104 |         for (start, follow), count in pair_counts.iteritems():
105 |             for category in categories:
106 |                 yield (start, category), (follow, count)
107 | 
108 |     def follow_probs_reducer(self, start_word_category, follow_word_counts):
109 |         """Given a start word and a category, find the distribution
110 |         over next words. When normalized, these counts define the
111 |         transition probabilities for the markov chain.
112 |         """
113 |         start, category = start_word_category
114 |         follow_counts = {}
115 | 
116 |         for follow_word, count in follow_word_counts:
117 |             follow_counts[follow_word] = follow_counts.get(follow_word, 0) + count
118 | 
119 |         total_transitions = float(sum(follow_counts.itervalues()))
120 | 
121 |         include_word = lambda count: count > MINIMUM_PAIR_COUNT and count / total_transitions > MINIMUM_FOLLOW_PERCENTAGE
122 |         thresholded_follow_counts = dict((word, count) for word, count in follow_counts.iteritems() if include_word(count))
123 | 
124 |         # if every transition was filtered out (each one either occurred
125 |         # too few times or made up too small a percentage of outgoing
126 |         # transitions), skip this start word.
127 |         if not thresholded_follow_counts:
128 |             return
129 | 
130 |         # put a small weight on '' (the empty string), which means 'end of review'.
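131 |         # For example, surviving counts {'foo': 22} become
132 |         # {'foo': 22, '': 0.22}, which normalizes to roughly
133 |         # {'foo': 0.990099, '': 0.009901} - the distribution that
134 |         # test_autopilot's smoke test expects.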
135 |         thresholded_follow_counts[''] = thresholded_follow_counts.get('', 0.0)
136 |         thresholded_follow_counts[''] += END_OF_REVIEW_RATE * float(sum(thresholded_follow_counts.itervalues()))
137 | 
138 |         # re-normalize the remaining transition weights.
139 |         new_total = float(sum(thresholded_follow_counts.itervalues()))
140 |         percentages = dict((follow, count / new_total) for follow, count in thresholded_follow_counts.iteritems())
141 | 
142 |         yield (category, start), percentages
143 | 
144 |     def steps(self):
145 |         return [self.mr(mapper=self.business_join_mapper, reducer=self.join_reviews_with_categories_reducer),
146 |                 self.mr(mapper=self.review_split_mapper, reducer=self.follow_probs_reducer)]
147 | 
148 | if __name__ == "__main__":
149 |     ReviewAutoPilot().run()
150 | 
151 | 
--------------------------------------------------------------------------------
/category_predictor/category_predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """An MRJob that constructs the data necessary to predict category
16 | information.
17 | """
18 | 
19 | import re
20 | 
21 | from mrjob.job import MRJob
22 | from mrjob.protocol import JSONValueProtocol
23 | 
24 | # require more than this many occurrences for a word to show up for a
25 | # given category
26 | MINIMUM_OCCURENCES = 100
27 | 
28 | def words(text):
29 |     """An iterator over tokens (words) in text. Replace this with a
30 |     stemmer or other smarter logic.
31 |     """
32 | 
33 |     for word in text.split():
34 |         # normalize words by lowercasing and dropping non-alpha
35 |         # characters
36 |         normed = re.sub('[^a-z]', '', word.lower())
37 | 
38 |         if normed:
39 |             yield normed
40 | 
41 | class CategoryPredictor(MRJob):
42 |     """A very simple category predictor. Trains on review data and
43 |     generates a simple naive-bayes model that can predict the category
44 |     of some text.
45 |     """
46 | 
47 |     # The input is the dataset - interpret each line as a single json
48 |     # value (the key will be None)
49 |     INPUT_PROTOCOL = JSONValueProtocol
50 | 
51 |     def review_category_mapper(self, _, data):
52 |         """Visit reviews and businesses, yielding out (business_id,
53 |         (review or category)).
54 |         """
55 |         if data['type'] == 'review':
56 |             yield data['business_id'], ('review', data['text'])
57 |         elif data['type'] == 'business':
58 |             yield data['business_id'], ('categories', data['categories'])
59 | 
60 |     def add_categories_to_reviews_reducer(self, business_id, reviews_or_categories):
61 |         """Yield out (category, review) for each category-review
62 |         pair. We'll do the actual review tokenizing in the next
63 |         mapper, since you typically have much more map-capacity than
64 |         reduce-capacity.
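65 | 
66 |         For example, a business in categories ['Food'] with two reviews
67 |         yields ('all', {'Food': 2}) once, then ('Food', review) for each
68 |         of the two reviews.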
65 | """ 66 | categories = None 67 | reviews = [] 68 | 69 | for data_type, data in reviews_or_categories: 70 | if data_type == 'review': 71 | reviews.append(data) 72 | else: 73 | categories = data 74 | 75 | # We either didn't find a matching business, or this biz 76 | # doesn't have any categories. In either case, we can drop 77 | # these reviews. 78 | if not categories: 79 | return 80 | 81 | # Yield out review counts in the same format as the 82 | # tokenize_reviews_mapper. We'll special case the 'all' key in 83 | # that method, but afterwards it will be treated the same. 84 | yield 'all', dict((cat, len(reviews)) for cat in categories) 85 | 86 | for category in categories: 87 | for review in reviews: 88 | yield category, review 89 | 90 | def tokenize_reviews_mapper(self, category, review): 91 | """Split reviews into words, yielding out (category, {word: count}) and 92 | ('all', {word: count}). We yield out a dictionary of counts 93 | rather than a single entry per-word to reduce the amount of 94 | i/o between mapper and reducer. 95 | """ 96 | # special case - pass through category counts (which are 97 | # already formatted like the output of this mapper) 98 | if category == 'all': 99 | yield category, review 100 | return 101 | 102 | counts = {} 103 | for word in words(review): 104 | counts[word] = counts.get(word, 0) + 1 105 | 106 | yield category, counts 107 | 108 | def sum_counts(self, category, counts): 109 | """Sum up dictionaries of counts, filter out rare words 110 | (bucketing them into an unknown word bucket), and yield the 111 | counts. 112 | """ 113 | raw_count = {} 114 | 115 | # sum up the individual counts 116 | for word_count in counts: 117 | for word, count in word_count.iteritems(): 118 | raw_count[word] = raw_count.get(word, 0) + count 119 | 120 | # don't filter out low-mass categories 121 | if category == 'all': 122 | yield category, raw_count 123 | return 124 | 125 | # filter out low-count words; assign a very low mass to 126 | # unknown words 127 | filtered_counts = {} 128 | for word, count in raw_count.iteritems(): 129 | if count > MINIMUM_OCCURENCES: 130 | filtered_counts[word] = count 131 | 132 | # don't include categories with every word filtered out 133 | if not filtered_counts: 134 | return 135 | 136 | # Assign a small mass to unknown tokens - check out 137 | # http://en.wikipedia.org/wiki/Laplacian_smoothing for background. 138 | filtered_counts['UNK'] = 0.01 139 | 140 | # emit the result 141 | yield category, filtered_counts 142 | 143 | def steps(self): 144 | return [self.mr(mapper=self.review_category_mapper, 145 | reducer=self.add_categories_to_reviews_reducer), 146 | self.mr(mapper=self.tokenize_reviews_mapper, 147 | reducer=self.sum_counts)] 148 | 149 | 150 | if __name__ == "__main__": 151 | CategoryPredictor().run() 152 | 153 | --------------------------------------------------------------------------------