├── test
│   ├── __init__.py
│   ├── test_json_to_csv_converter.py
│   ├── test_category_predictor.py
│   ├── test_autopilot.py
│   └── test_weighted_positivity.py
├── review_autopilot
│   ├── __init__.py
│   ├── generate.py
│   └── autopilot.py
├── category_predictor
│   ├── __init__.py
│   ├── predict.py
│   └── category_predictor.py
├── positive_category_words
│   ├── __init__.py
│   ├── simple_global_positivity.py
│   └── weighted_category_positivity.py
├── .gitignore
├── tox.ini
├── .travis.yml
├── setup.py
├── LICENSE.txt
├── README.md
└── json_to_csv_converter.py

/test/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/review_autopilot/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/category_predictor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/positive_category_words/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.swp
3 | *.pyc
4 | *.json
5 | .tox
6 | *.egg-info
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [tox]
2 | envlist = py26,py27
3 | 
4 | [testenv]
5 | deps =
6 |     testify
7 |     mrjob
8 |     unittest2
9 | commands =
10 |     testify -v test
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | os:
3 |   - osx
4 |   - linux
5 | env:
6 |   matrix:
7 |     - TOX_ENV=py26
8 |     - TOX_ENV=py27
9 | install:
10 |   - pip install -e .
11 | script: testify -v test
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Setup for dataset-examples."""
3 | from setuptools import setup, find_packages
4 | 
5 | requires = [
6 |     'mrjob',
7 |     'testify',
8 |     'unittest2',
9 | ]
10 | 
11 | setup(
12 |     name='dataset-examples',
13 |     description='Examples for the Yelp datasets.',
14 |     author='Yelp',
15 |     url='https://github.com/Yelp/dataset-examples',
16 |     packages=find_packages(),
17 |     install_requires=requires,
18 |     tests_require=requires,
19 | )
20 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2011 Yelp
2 | 
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 | 
7 |     http://www.apache.org/licenses/LICENSE-2.0
8 | 
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12 | implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | 
--------------------------------------------------------------------------------
/test/test_json_to_csv_converter.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Test the json to csv converter script."""
3 | from unittest2 import TestCase
4 | 
5 | import json_to_csv_converter
6 | 
7 | 
8 | class TestJsonToCsvConverter(TestCase):
9 | 
10 |     """Test the json to csv converter script."""
11 | 
12 |     test_biz = {
13 |         'type': 'business',
14 |         'business_id': 123,
15 |         'hours': {
16 |             'Monday': {
17 |                 'open': "11:30",
18 |                 'close': "21:00",
19 |             },
20 |         },
21 |     }
22 |     test_biz_column_names = frozenset(['type', 'business_id', 'hours.Monday.open', 'hours.Monday.close'])
23 |     test_review = {
24 |         'type': 'review',
25 |         'user_id': 345,
26 |         'votes': {
27 |             'funny': 1,
28 |         },
29 |     }
30 |     test_review_column_names = frozenset(['type', 'user_id', 'votes.funny'])
31 | 
32 |     def test_get_column_names(self):
33 |         """Test that we see the expected column names for the test objects."""
34 |         biz_column_names = set(json_to_csv_converter.get_column_names(self.test_biz))
35 |         self.assertEqual(biz_column_names, self.test_biz_column_names)
36 | 
37 |         review_column_names = set(json_to_csv_converter.get_column_names(self.test_review))
38 |         self.assertEqual(review_column_names, self.test_review_column_names)
39 | 
40 |     def test_get_nested_value(self):
41 |         """Test getting a nested value from a dict given a flat key."""
42 |         # non-nested values
43 |         self.assertEqual(
44 |             json_to_csv_converter.get_nested_value(self.test_review, 'type'),
45 |             'review'
46 |         )
47 |         # nested values
48 |         self.assertEqual(
49 |             json_to_csv_converter.get_nested_value(self.test_biz, 'hours.Monday.open'),
50 |             '11:30'
51 |         )
52 |         # unknown values
53 |         self.assertIsNone(
54 |             json_to_csv_converter.get_nested_value(self.test_review, 'this.is.not.in.the.review'),
55 |         )
56 | 
57 | 
--------------------------------------------------------------------------------
/positive_category_words/simple_global_positivity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
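14 | 
15 | # Output format: (average stars * 100, total count), word - chosen so the
16 | # unix sort command can rank words by positivity (see positivity_reducer).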
17 | 
18 | import re
19 | 
20 | from mrjob.job import MRJob
21 | from mrjob.protocol import JSONValueProtocol
22 | 
23 | MINIMUM_OCCURENCES = 1000
24 | 
25 | def avg_and_total(iterable):
26 |     """Compute the average and the total of a numeric iterable."""
27 |     items = 0
28 |     total = 0.0
29 | 
30 |     for item in iterable:
31 |         total += item
32 |         items += 1
33 | 
34 |     return total / items, total
35 | 
36 | class PositiveWords(MRJob):
37 |     """Find the most positive words in the dataset."""
38 | 
39 |     # The input is the dataset - interpret each line as a single json
40 |     # value (the key will be None)
41 |     INPUT_PROTOCOL = JSONValueProtocol
42 | 
43 |     def review_mapper(self, _, data):
44 |         """Walk over reviews, emitting each word and its rating."""
45 |         if data['type'] != 'review':
46 |             return
47 | 
48 |         # normalize words by lowercasing and dropping non-alpha
49 |         # characters
50 |         norm = lambda word: re.sub('[^a-z]', '', word.lower())
51 |         # only include a word once per-review (which de-emphasizes
52 |         # proper nouns)
53 |         words = set(norm(word) for word in data['text'].split())
54 | 
55 |         for word in words:
56 |             yield word, data['stars']
57 | 
58 |     def positivity_reducer(self, word, ratings):
59 |         """Emit average star rating, word in a format we can easily
60 |         sort with the unix sort command:
61 |         [star average * 100, total count], word.
62 |         """
63 |         avg, total = avg_and_total(ratings)
64 | 
65 |         if total < MINIMUM_OCCURENCES:
66 |             return
67 | 
68 |         yield (int(avg * 100), total), word
69 | 
70 |     def steps(self):
71 |         return [self.mr(),  # Split apart the dataset into multiple
72 |                 # chunks. In regular hadoop-land you could change the
73 |                 # splitter. This is normally < 30 seconds of work.
74 |                 self.mr(self.review_mapper, self.positivity_reducer)]
75 | 
76 | 
77 | if __name__ == "__main__":
78 |     PositiveWords().run()
79 | 
--------------------------------------------------------------------------------
/test/test_category_predictor.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import json
4 | import unittest
5 | from unittest import TestCase
6 | from StringIO import StringIO
7 | 
8 | from category_predictor.category_predictor import CategoryPredictor
9 | 
10 | # These templates can be used to make a json string very easily.
11 | REVIEW_TEMPLATE = '{"type":"review", "stars":3, "text":"%s",\
12 | "business_id":"%s"}\n'
13 | BUSINESS_TEMPLATE = '{"type":"business", "categories":["%s"], \
14 | "business_id":"%s"}\n'
15 | LONG_TEXT = "Hello world" * 101
16 | TEXT = u"Hello"
17 | BIZ_ID = u"Yelp"
18 | CATEGORY = u'Company'
19 | 
20 | 
21 | class TestCategoryPredictor(TestCase):
22 | 
23 |     def test_smoke(self):
24 |         """Does a complete run with mock data"""
25 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
26 |         review = REVIEW_TEMPLATE % (LONG_TEXT, BIZ_ID)
27 |         total_input = business + review
28 |         static_stdin = StringIO(total_input)
29 | 
30 |         job = CategoryPredictor(['-r', 'inline', '--no-conf', '-'])
31 |         job.sandbox(stdin=static_stdin)
32 | 
33 |         results = []
34 |         with job.make_runner() as runner:
35 |             runner.run()
36 |             for line in runner.stream_output():
37 |                 key, value = job.parse_output_line(line)
38 |                 results.append(value)
39 | 
40 |         # Results should be the probability of that category being chosen.
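41 |         # (With only one business in one category in the mock input, the
42 |         # model can only ever pick that category, so it gets probability 1.)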
43 |         result = {CATEGORY: 1}
44 |         self.assertEqual(results[0], result)
45 | 
46 |     def test_review_category(self):
47 |         """Tests that review_category_mapper emits the expected pairs"""
48 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
49 |         review = REVIEW_TEMPLATE % (TEXT, BIZ_ID)
50 |         job = CategoryPredictor()
51 |         review_results = list(job.review_category_mapper(None, json.loads(review)))
52 |         biz_results = list(job.review_category_mapper(None, json.loads(business)))
53 |         self.assertEqual(review_results, [(BIZ_ID, ('review', TEXT))])
54 |         self.assertEqual(biz_results, [(BIZ_ID, ('categories', [CATEGORY]))])
55 | 
56 |     def test_categories_to_reviews(self):
57 |         """Tests that add_categories_to_reviews_reducer emits the expected pairs"""
58 |         category = [('categories', [CATEGORY]), ('review', TEXT)]
59 | 
60 |         job = CategoryPredictor()
61 |         category_results = list(job.add_categories_to_reviews_reducer(BIZ_ID, category))
62 |         result = [('all', {CATEGORY: 1}), (CATEGORY, TEXT)]
63 |         self.assertEqual(category_results, result)
64 | 
65 |     def test_tokenize_reviews(self):
66 |         """Tests that tokenize_reviews_mapper passes 'all' counts through unchanged"""
67 |         review = {CATEGORY: 1}
68 | 
69 |         job = CategoryPredictor()
70 |         token_results = list(job.tokenize_reviews_mapper('all', review))
71 |         result = [('all', {CATEGORY: 1})]
72 |         self.assertEqual(token_results, result)
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     unittest.main()
77 | 
--------------------------------------------------------------------------------
/test/test_autopilot.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import unittest
4 | from unittest import TestCase
5 | from StringIO import StringIO
6 | 
7 | from review_autopilot.autopilot import ReviewAutoPilot
8 | 
9 | # These are used to create stdin string data.
10 | CATEGORY = 'Company'
11 | REVIEW_TEMPLATE = '{"type":"review", "stars":3, "text":"%s",\
12 | "business_id":"%s"}\n'
13 | BUSINESS_TEMPLATE = '{"type":"business", "categories": "%s",\
14 | "business_id":"%s"}\n'
15 | TEXT = 'Hello!'
16 | ID = 128411
17 | BIZ = 'Yelp'
18 | # This is used to pass around dict data, which is slightly different from
19 | # the string data above.
20 | DATA = [
21 |     {'type': 'business', 'business_id': ID, 'data': 'Info here'},
22 |     {'type': 'review', 'business_id': ID, 'text': TEXT}
23 | ]
24 | 
25 | 
26 | class TestReviewAutoPilotCase(TestCase):
27 | 
28 |     def test_business_mapper(self):
29 |         """Tests the individual mappers of ReviewAutoPilot"""
30 |         job = ReviewAutoPilot()
31 |         biz_results = list(job.business_join_mapper(None, DATA[0]))
32 |         review_results = list(job.business_join_mapper(None, DATA[1]))
33 | 
34 |         biz_after_results = [(ID, ('business', DATA[0]))]
35 |         review_after_results = [(ID, ('review', DATA[1]['text']))]
36 | 
37 |         self.assertEqual(biz_results, biz_after_results)
38 |         self.assertEqual(review_results, review_after_results)
39 | 
40 |     def test_smoke(self):
41 |         """Does a complete run locally with a small, static dataset, since a
42 |         full run takes too long."""
43 | 
44 |         # Random data to feed into the markov model.
45 |         # I use long runs of foo to get through the threshold filters.
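46 |         # (autopilot.py only keeps a transition when its pair count is over
47 |         # MINIMUM_PAIR_COUNT = 5 and it makes up more than
48 |         # MINIMUM_FOLLOW_PERCENTAGE of the outgoing transitions.)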
49 |         text = ('foo bar foo baz foo car foo daz ' + ('foo ' * 10) + 'foofoo yelp'
50 |                 'foo yar foo foo bar bar dar')
51 |         single_review = REVIEW_TEMPLATE % (text, BIZ)
52 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ)
53 |         static_stdin = StringIO(single_review + business)
54 | 
55 |         job = ReviewAutoPilot(['-r', 'inline', '--no-conf', '-'])
56 |         job.sandbox(stdin=static_stdin)
57 | 
58 |         results = []
59 |         with job.make_runner() as runner:
60 |             runner.run()
61 |             for line in runner.stream_output():
62 |                 key, value = job.parse_output_line(line)
63 |                 results.append(value)
64 | 
65 |         # Normal output to compare
66 |         result = {'foo': 0.99009900990099009, '': 0.0099009900990099011}
67 |         self.assertEqual(results[0], result)
68 | 
69 |     def test_categories_reducer(self):
70 |         """Tests join_reviews_with_categories_reducer with some static
71 |         data."""
72 |         job = ReviewAutoPilot()
73 |         VALUES = (('business', {'categories': CATEGORY}), ('review', TEXT))
74 |         category_results = list(job.join_reviews_with_categories_reducer(BIZ, VALUES))
75 |         results = [(CATEGORY, TEXT)]
76 |         self.assertEqual(category_results, results)
77 | 
78 |     def test_split_mapper(self):
79 |         """Tests the review_split_mapper in autopilot"""
80 |         job = ReviewAutoPilot()
81 |         TEST_RETURN = (('hello', 'C'), ('', 1))
82 |         self.assertEqual(job.review_split_mapper(CATEGORY, TEXT).next(),
83 |                          TEST_RETURN)
84 | 
85 | 
86 | if __name__ == '__main__':
87 |     unittest.main()
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/Yelp/dataset-examples.svg)](https://travis-ci.org/Yelp/dataset-examples)
2 | 
3 | Yelp's Academic Dataset Examples
4 | ================================
5 | 
6 | We're providing four examples for use with the datasets available at [http://www.yelp.com/dataset_challenge](http://www.yelp.com/dataset_challenge) and
7 | [http://www.yelp.com/academic_dataset](http://www.yelp.com/academic_dataset). They all depend on
8 | [mrjob](https://github.com/Yelp/mrjob) and Python 2.6 or 2.7.
9 | 
10 | To install all dependencies: `$ pip install -e .`
11 | 
12 | To test: `$ tox`
13 | 
14 | Samples
15 | ------------
16 | 
17 | `json_to_csv_converter`: Convert the dataset from json format to csv format.
18 | 
19 | ```bash
20 | $ python json_to_csv_converter.py yelp_academic_dataset.json # Creates yelp_academic_dataset.csv
21 | ```
22 | 
23 | `category_predictor`: Given some text, predict likely categories. For example:
24 | 
25 | ```bash
26 | $ python category_predictor/category_predictor.py yelp_academic_dataset.json > category_predictor.json
27 | $ python category_predictor/predict.py category_predictor.json "bacon donut"
28 | Category: "Food" - 82.66% chance
29 | Category: "Restaurants" - 16.99% chance
30 | Category: "Donuts" - 0.12% chance
31 | Category: "Basque" - 0.02% chance
32 | Category: "Spanish" - 0.02% chance
33 | ```
34 | 
35 | `review_autopilot`: Use a markov chain to finish a review. For
36 | example:
37 | 
38 | ```bash
39 | $ python review_autopilot/generate.py autopilot.json Food 'They have the best'
40 | They have the best coffee is good food was delicious cookies and
41 | a few friends i think they make this
42 | ```
43 | 
44 | `positive_category_words`: See the Yelp engineering blog for
45 | details about this example. In short, it generates positivity
46 | scores for words either globally or per-category.
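47 | 
48 | For example, to rank words globally (assuming, as above, that the dataset
49 | file is named `yelp_academic_dataset.json`; the output format is designed
50 | to sort well with the unix sort command):
51 | 
52 | ```bash
53 | $ python positive_category_words/simple_global_positivity.py yelp_academic_dataset.json | sort
54 | ```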
55 | 
56 | Basic set-up
57 | ------------
58 | 
59 | You can use any of mrjob's runners with these examples, but we'll focus
60 | on the local and EMR runners (if you have access to your own Hadoop
61 | cluster, check out the mrjob docs for instructions on how to set this
62 | up).
63 | 
64 | Local mode couldn't be easier:
65 | 
66 |     # this step will take a VERY long time
67 |     python review_autopilot/autopilot.py yelp_academic_dataset.json > autopilot.json
68 | 
69 |     # this should be instant
70 |     python review_autopilot/generate.py autopilot.json Food 'They have the best'
71 |     > hot dogs ever
72 | 
73 | Waiting a long time is kind of lame, no? Let's try the same thing
74 | using EMR.
75 | 
76 | First off, you'll need an `aws_access_key_id` and an
77 | `aws_secret_access_key`. You can get these from the AWS console
78 | (you'll need to sign up for an AWS developer account and enable S3 /
79 | EMR usage, if you haven't already).
80 | 
81 | Create a simple mrjob.conf file, like this:
82 | 
83 |     runners:
84 |       emr:
85 |         aws_access_key_id: YOUR_ACCESS_KEY
86 |         aws_secret_access_key: YOUR_SECRET_KEY
87 | 
88 | Now that that's done, you can run the autopilot script on EMR.
89 | 
90 |     # WARNING: this will cost you roughly $2 and take 10-20 minutes
91 |     python review_autopilot/autopilot.py --num-ec2-instances 10 --ec2-instance-type c1.medium -v --runner emr yelp_academic_dataset.json
92 | 
93 | 
94 | You can save money (and time) by re-using jobflows and uploading the
95 | dataset to a personal, private S3 bucket - check out the mrjob docs for
96 | instructions on doing this.
97 | 
--------------------------------------------------------------------------------
/test/test_weighted_positivity.py:
--------------------------------------------------------------------------------
1 | from __future__ import with_statement
2 | 
3 | import json
4 | import unittest
5 | from unittest import TestCase
6 | from StringIO import StringIO
7 | import testify as T
8 | 
9 | from positive_category_words.weighted_category_positivity import WeightedPositiveWords
10 | 
11 | 
12 | CATEGORY = u'Company'
13 | REVIEW_TEMPLATE = ('{"type":"review", "stars":3, "text":"%s",'
14 |                    '"business_id":"%s"}\n')
15 | BUSINESS_TEMPLATE = ('{"type":"business", "categories":["%s"], '
16 |                      '"business_id":"%s"}\n')
17 | TEXT = u"Hello world"
18 | BIZ_NAME = u'Qdoba'
19 | 
20 | 
21 | class TestWeightedPositiveWords(TestCase):
22 | 
23 |     def test_smoke(self):
24 |         """Does a full run of weighted positive words"""
25 | 
26 |         # Need 3 mock businesses to test
27 |         business1 = BUSINESS_TEMPLATE % (CATEGORY, "Yelp")
28 |         business2 = BUSINESS_TEMPLATE % (CATEGORY, "Target")
29 |         business3 = BUSINESS_TEMPLATE % (CATEGORY, "Walmart")
30 |         # Need more than 1 review for weighted threshold
31 |         review1 = REVIEW_TEMPLATE % (TEXT, "Yelp")
32 |         review2 = REVIEW_TEMPLATE % (TEXT, "Target")
33 |         review3 = REVIEW_TEMPLATE % (TEXT, "Walmart")
34 | 
35 |         # Need a star-weighted total of at least 50 (MINIMUM_OCCURENCES), so multiply the first review by 20
36 |         total_input = (business1 + business2 + business3
37 |                        + (review1 * 20) + review2 + review3)
38 |         static_stdin = StringIO(total_input)
39 | 
40 |         job = WeightedPositiveWords(['-r', 'inline', '--no-conf', '-'])
41 |         job.sandbox(stdin=static_stdin)
42 | 
43 |         results = []
44 |         with job.make_runner() as runner:
45 |             runner.run()
46 |             for line in runner.stream_output():
47 |                 key, value = job.parse_output_line(line)
48 |                 results.append(value)
49 |         end_result = [[CATEGORY, 66.0, 'hello'], [CATEGORY, 66.0, 'world']]
50 |         self.assertEqual(results, end_result)
51 | 
52 |     def test_review_category(self):
53 |         """Test the review_category_mapper function with a mock input"""
54 | 
55 |         review = REVIEW_TEMPLATE % (TEXT, BIZ_NAME)
56 |         business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_NAME)
57 | 
58 |         job = WeightedPositiveWords()
59 |         review_results = list(job.review_category_mapper(None, json.loads(review)))
60 |         biz_results = list(job.review_category_mapper(None, json.loads(business)))
61 |         review_after_results = [(BIZ_NAME, ('review', (TEXT, 3)))]
62 |         biz_after_results = [(BIZ_NAME, ('categories', [CATEGORY]))]
63 |         self.assertEqual(review_results, review_after_results)
64 |         self.assertEqual(biz_results, biz_after_results)
65 | 
66 | 
67 |     def test_category_join(self):
68 |         """Test the category_join_reducer function with the same results
69 |         as above. These tests isolate which function an error comes from
70 |         if any of the functions in the mrjob change.
71 |         """
72 |         review_or_categories = (('review', (TEXT, 3)), ('categories', [CATEGORY]))
73 | 
74 |         job = WeightedPositiveWords()
75 |         join_results = list(job.category_join_reducer(BIZ_NAME, review_or_categories))
76 |         results = [(CATEGORY, (BIZ_NAME, (TEXT, 3)))]
77 |         self.assertEqual(join_results, results)
78 | 
79 |     def test_review_mapper(self):
80 |         """Test that the review_mapper function produces the correct
81 |         calculated output for a mock input.
82 |         """
83 |         biz_review_positivity = (BIZ_NAME, (TEXT, 3))
84 | 
85 |         job = WeightedPositiveWords()
86 |         review_results = list(job.review_mapper(CATEGORY, biz_review_positivity))
87 |         results = [((CATEGORY, u'world'), (BIZ_NAME, 3)), ((CATEGORY, u'hello'), (BIZ_NAME, 3))]
88 |         T.assert_sorted_equal(review_results, results)
89 | 
90 | if __name__ == '__main__':
91 |     unittest.main()
92 | 
--------------------------------------------------------------------------------
/review_autopilot/generate.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from __future__ import with_statement
16 | 
17 | import random
18 | import sys
19 | 
20 | import autopilot
21 | 
22 | class ReviewMarkovGenerator(object):
23 |     """Generate the remainder of a review, given a category and some
24 |     start text.
25 |     """
26 | 
27 |     @classmethod
28 |     def load_data(cls, input_file):
29 |         """Read the output of the ReviewAutoPilot mrjob, returning a
30 |         transition distribution. The transition distribution is a
31 |         dictionary keyed by category. Each category maps to another
32 |         dictionary keyed by start word, and each start word maps to a
33 |         dictionary giving the probability of transitioning to each
34 |         possible next word.
35 | 
36 |         Here's an example:
37 | 
38 |             category_transitions = {'Food': {'hot': {'dog': 1.0}}}
39 | 
40 |         This means that for the category Food, the word 'hot' has a
41 |         100% probability of being followed by the word 'dog'.
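42 | 
43 |         The special next word '' (the empty string) marks the end of a
44 |         review (it's the token word_pairs in autopilot.py emits last).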
42 | """ 43 | job = autopilot.ReviewAutoPilot() 44 | 45 | category_transitions = {} 46 | 47 | with open(input_file) as src: 48 | for line in src: 49 | (category, start), transitions = job.parse_output_line(line) 50 | 51 | category_transitions.setdefault(category, {})[start] = transitions 52 | 53 | return category_transitions 54 | 55 | @classmethod 56 | def sample(cls, distribution): 57 | """Sample from a dictionary containing a probability 58 | distribution. 59 | """ 60 | guess = random.random() 61 | 62 | for word, prob in distribution.iteritems(): 63 | if guess <= prob: 64 | return word 65 | 66 | guess -= prob 67 | 68 | # random.random() returns a value between 0 and 1. The values 69 | # of distribution are assumed to sum to 1 (since distribution 70 | # is a probability distribution), so random.random() - 71 | # sum(values) == 0. If this is not the case, then distribution 72 | # is not a valid distribution. 73 | assert False, "distribution is not a valid probability distribution!" 74 | 75 | def __init__(self, input_file): 76 | """input_file: the output of the ReviewAutopilot job.""" 77 | self.category_transitions = self.load_data(input_file) 78 | 79 | def complete(self, category, text): 80 | """Complete some text.""" 81 | if category not in self.category_transitions: 82 | raise KeyError('Unknown category (invalid or not enough data): %s' % category) 83 | 84 | words = list(autopilot.words(text)) 85 | 86 | last_word = words[-1] 87 | transitions = self.category_transitions[category] 88 | while True: 89 | next_word = self.sample(transitions[last_word]) 90 | 91 | # the end-of-review token is None, which is JSON null, 92 | # which is coerced to the string "null" (since json 93 | # objects can only have strings as keys) 94 | if next_word == "": 95 | break 96 | 97 | text += ' ' + next_word 98 | last_word = next_word 99 | 100 | return text 101 | 102 | 103 | if __name__ == "__main__": 104 | input_file = sys.argv[1] 105 | category = sys.argv[2] 106 | text = sys.argv[3] 107 | 108 | print ReviewMarkovGenerator(input_file).complete(category, text) 109 | -------------------------------------------------------------------------------- /json_to_csv_converter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Convert the Yelp Dataset Challenge dataset from json format to csv. 3 | 4 | For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge 5 | 6 | """ 7 | import argparse 8 | import collections 9 | import csv 10 | import simplejson as json 11 | 12 | 13 | def read_and_write_file(json_file_path, csv_file_path, column_names): 14 | """Read in the json dataset file and write it out to a csv file, given the column names.""" 15 | with open(csv_file_path, 'wb+') as fout: 16 | csv_file = csv.writer(fout) 17 | csv_file.writerow(list(column_names)) 18 | with open(json_file_path) as fin: 19 | for line in fin: 20 | line_contents = json.loads(line) 21 | csv_file.writerow(get_row(line_contents, column_names)) 22 | 23 | def get_superset_of_column_names_from_file(json_file_path): 24 | """Read in the json dataset file and return the superset of column names.""" 25 | column_names = set() 26 | with open(json_file_path) as fin: 27 | for line in fin: 28 | line_contents = json.loads(line) 29 | column_names.update( 30 | set(get_column_names(line_contents).keys()) 31 | ) 32 | return column_names 33 | 34 | def get_column_names(line_contents, parent_key=''): 35 | """Return a list of flattened key names given a dict. 
36 | 
37 |     Example:
38 | 
39 |         line_contents = {
40 |             'a': {
41 |                 'b': 2,
42 |                 'c': 3,
43 |             },
44 |         }
45 | 
46 |     will return: {'a.b': 2, 'a.c': 3}
47 | 
48 |     The keys will be the column names for the eventual csv file.
49 | 
50 |     """
51 |     column_names = []
52 |     for k, v in line_contents.iteritems():
53 |         column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
54 |         if isinstance(v, collections.MutableMapping):
55 |             column_names.extend(
56 |                 get_column_names(v, column_name).items()
57 |             )
58 |         else:
59 |             column_names.append((column_name, v))
60 |     return dict(column_names)
61 | 
62 | def get_nested_value(d, key):
63 |     """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.
64 | 
65 |     Example:
66 | 
67 |         d = {
68 |             'a': {
69 |                 'b': 2,
70 |                 'c': 3,
71 |             },
72 |         }
73 |         key = 'a.b'
74 | 
75 |     will return: 2
76 | 
77 |     """
78 |     if '.' not in key:
79 |         if key not in d:
80 |             return None
81 |         return d[key]
82 |     base_key, sub_key = key.split('.', 1)
83 |     if base_key not in d:
84 |         return None
85 |     sub_dict = d[base_key]
86 |     return get_nested_value(sub_dict, sub_key)
87 | 
88 | def get_row(line_contents, column_names):
89 |     """Return a csv compatible row given column names and a dict."""
90 |     row = []
91 |     for column_name in column_names:
92 |         line_value = get_nested_value(
93 |             line_contents,
94 |             column_name,
95 |         )
96 |         if isinstance(line_value, unicode):
97 |             row.append('{0}'.format(line_value.encode('utf-8')))
98 |         elif line_value is not None:
99 |             row.append('{0}'.format(line_value))
100 |         else:
101 |             row.append('')
102 |     return row
103 | 
104 | if __name__ == '__main__':
105 |     """Convert a yelp dataset file from json to csv."""
106 | 
107 |     parser = argparse.ArgumentParser(
108 |         description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
109 |     )
110 | 
111 |     parser.add_argument(
112 |         'json_file',
113 |         type=str,
114 |         help='The json file to convert.',
115 |     )
116 | 
117 |     args = parser.parse_args()
118 | 
119 |     json_file = args.json_file
120 |     csv_file = '{0}.csv'.format(json_file.split('.json')[0])
121 | 
122 |     column_names = get_superset_of_column_names_from_file(json_file)
123 |     read_and_write_file(json_file, csv_file, column_names)
124 | 
--------------------------------------------------------------------------------
/category_predictor/predict.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """Use the output from the CategoryPredictor MRJob to predict the
16 | category of text. This uses a simple naive-bayes model - see
17 | http://en.wikipedia.org/wiki/Naive_Bayes_classifier for more details.
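18 | 
19 | Usage: python category_predictor/predict.py <category_predictor output file> <some text>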
18 | """ 19 | 20 | from __future__ import with_statement 21 | 22 | import math 23 | import sys 24 | 25 | import category_predictor 26 | 27 | class ReviewCategoryClassifier(object): 28 | """Predict categories for text using a simple naive-bayes classifier.""" 29 | 30 | @classmethod 31 | def load_data(cls, input_file): 32 | """Read the output of the CategoryPredictor mrjob, returning 33 | total category counts (count of # of reviews for each 34 | category), and counts of words for each category. 35 | """ 36 | 37 | job = category_predictor.CategoryPredictor() 38 | 39 | category_counts = None 40 | word_counts = {} 41 | 42 | with open(input_file) as src: 43 | for line in src: 44 | category, counts = job.parse_output_line(line) 45 | 46 | if category == 'all': 47 | category_counts = counts 48 | else: 49 | word_counts[category] = counts 50 | 51 | return category_counts, word_counts 52 | 53 | @classmethod 54 | def normalize_counts(cls, counts): 55 | """Convert a dictionary of counts into a log-probability 56 | distribution. 57 | """ 58 | total = sum(counts.itervalues()) 59 | lg_total = math.log(total) 60 | 61 | return dict((key, math.log(cnt) - lg_total) for key, cnt in counts.iteritems()) 62 | 63 | def __init__(self, input_file): 64 | """input_file: the output of the CategoryPredictor job.""" 65 | category_counts, word_counts = self.load_data(input_file) 66 | 67 | self.word_given_cat_prob = {} 68 | for cat, counts in word_counts.iteritems(): 69 | self.word_given_cat_prob[cat] = self.normalize_counts(counts) 70 | 71 | # filter out categories which have no words 72 | seen_categories = set(word_counts) 73 | seen_category_counts = dict((cat, count) for cat, count in category_counts.iteritems() \ 74 | if cat in seen_categories) 75 | self.category_prob = self.normalize_counts(seen_category_counts) 76 | 77 | def classify(self, text): 78 | """Classify some text using the result of the 79 | CategoryPredictor MRJob. We use a basic naive-bayes model, 80 | eg, argmax_category p(category) * p(words | category) == 81 | p(category) * pi_{i \in words} p(word_i | category). 82 | 83 | p(category) is stored in self.category_prob, p(word | category 84 | is in self.word_given_cat_prob. 85 | """ 86 | # start with prob(category) 87 | lg_scores = self.category_prob.copy() 88 | 89 | # then multiply in the individual word probabilities 90 | # NOTE: we're actually adding here, but that's because our 91 | # distributions are made up of log probabilities, which are 92 | # more accurate for small probabilities. See 93 | # http://en.wikipedia.org/wiki/Log_probability for more 94 | # details. 
100 |         for word in category_predictor.words(text):
101 |             for cat in lg_scores:
102 |                 cat_probs = self.word_given_cat_prob[cat]
103 | 
104 |                 if word in cat_probs:
105 |                     lg_scores[cat] += cat_probs[word]
106 |                 else:
107 |                     lg_scores[cat] += cat_probs['UNK']
108 | 
109 |         # convert scores to a non-log value
110 |         scores = dict((cat, math.exp(score)) for cat, score in lg_scores.iteritems())
111 | 
112 |         # normalize the scores again - this isn't strictly necessary,
113 |         # but it's nice to report probabilities with our guesses
114 |         total = sum(scores.itervalues())
115 |         return dict((cat, prob / total) for cat, prob in scores.iteritems())
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     input_file = sys.argv[1]
120 |     text = sys.argv[2]
121 | 
122 |     guesses = ReviewCategoryClassifier(input_file).classify(text)
123 | 
124 |     best_guesses = sorted(guesses.iteritems(), key=lambda (_, prob): prob, reverse=True)[:5]
125 | 
126 |     for guess, prob in best_guesses:
127 |         print 'Category: "%s" - %.2f%% chance' % (guess, prob * 100)
128 | 
--------------------------------------------------------------------------------
/positive_category_words/weighted_category_positivity.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import re
16 | 
17 | from mrjob.job import MRJob
18 | from mrjob.protocol import JSONValueProtocol
19 | 
20 | def avg_and_total(iterable):
21 |     """Compute the average and the total of a numeric iterable."""
22 |     items = 0
23 |     total = 0.0
24 | 
25 |     for item in iterable:
26 |         total += item
27 |         items += 1
28 | 
29 |     return total / items, total
30 | 
31 | # Considerably lower than for the simple global script, since category
32 | # data is much more sparse
33 | MINIMUM_OCCURENCES = 50
34 | 
35 | # Require reviews from AT LEAST this many distinct businesses before
36 | # we include a word (prevents very popular restaurant names from
37 | # showing up in the list)
38 | MINIMUM_BUSINESSES = 3
39 | 
40 | class WeightedPositiveWords(MRJob):
41 |     """Find the most positive words in the dataset."""
42 | 
43 |     # The input is the dataset - interpret each line as a single json
44 |     # value (the key will be None)
45 |     INPUT_PROTOCOL = JSONValueProtocol
46 | 
47 |     def review_category_mapper(self, _, data):
48 |         """Walk over reviews and businesses, emitting business_id-keyed pairs."""
49 |         if data['type'] == 'review':
50 |             yield data['business_id'], ('review', (data['text'], data['stars']))
51 | 
52 |         elif data['type'] == 'business':
53 |             # skip businesses with no categories
54 |             if data['categories']:
55 |                 yield data['business_id'], ('categories', data['categories'])
56 | 
57 |     def category_join_reducer(self, business_id, reviews_or_categories):
58 |         """Take in business_id, ((review text and rating) or category information), emit
59 |         category, (biz_id, (review, rating)).
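60 |         This is a reduce-side join on business_id: a business's reviews
61 |         and its category list meet in the same reducer call.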
60 | """ 61 | categories = None 62 | reviews = [] 63 | 64 | for data_type, data in reviews_or_categories: 65 | if data_type == 'review': 66 | reviews.append(data) 67 | else: 68 | categories = data 69 | 70 | # no categories found, skip this 71 | if not categories: 72 | return 73 | 74 | for category in categories: 75 | for review_positivity in reviews: 76 | yield category, (business_id, review_positivity) 77 | 78 | def review_mapper(self, category, biz_review_positivity): 79 | """Take in category, (biz_id, (review, rating)) and split the 80 | review into individual unique words. Emit 81 | (category, word), (biz_id, rating), which will then be used to 82 | gather info about each category / word pair. 83 | """ 84 | biz_id, (review, positivity) = biz_review_positivity 85 | 86 | # normalize words by lowercasing and dropping non-alpha 87 | # characters 88 | norm = lambda word: re.sub('[^a-z]', '', word.lower()) 89 | # only include a word once per-review (which de-emphasizes 90 | # proper nouns) 91 | words = set(norm(word) for word in review.split()) 92 | 93 | for word in words: 94 | yield (category, word), (biz_id, positivity) 95 | 96 | def positivity_reducer(self, category_word, biz_positivities): 97 | """Read (category, word), (biz_id, positivity), and compute 98 | the average positivity for the category-word pair. Skip words 99 | that don't occur frequently enough or for not enough unique 100 | businesses. 101 | 102 | Emits rating, (category, # reviews with word, word). 103 | """ 104 | 105 | category, word = category_word 106 | 107 | businesses = set() 108 | positivities = [] 109 | for biz_id, positivity in biz_positivities: 110 | businesses.add(biz_id) 111 | positivities.append(positivity) 112 | 113 | # don't include words that only show up for a few businesses 114 | if len(businesses) < MINIMUM_BUSINESSES: 115 | return 116 | 117 | avg, total = avg_and_total(positivities) 118 | 119 | if total < MINIMUM_OCCURENCES: 120 | return 121 | 122 | yield int(avg * 100), (category, total, word) 123 | 124 | def steps(self): 125 | return [ self.mr(self.review_category_mapper, self.category_join_reducer), 126 | self.mr(self.review_mapper, self.positivity_reducer)] 127 | 128 | 129 | if __name__ == "__main__": 130 | WeightedPositiveWords().run() 131 | -------------------------------------------------------------------------------- /review_autopilot/autopilot.py: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Yelp and Contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Gather the data necessary to generate reviews using a simple markov 16 | model (see http://en.wikipedia.org/wiki/Markov_chain for more 17 | details). We gather word-next word counts for each category, 18 | eliminating rare pairs. 19 | """ 20 | 21 | import re 22 | 23 | from mrjob.job import MRJob 24 | from mrjob.protocol import JSONValueProtocol 25 | 26 | # Chance that the review will end after any given word. 
29 | END_OF_REVIEW_RATE = 0.01
30 | 
31 | MINIMUM_PAIR_COUNT = 5
32 | MINIMUM_FOLLOW_PERCENTAGE = 0.01
33 | 
34 | def words(text):
35 |     """An iterator over tokens (words) in text. Replace this with a
36 |     stemmer or other smarter logic.
37 |     """
38 | 
39 |     for word in text.split():
40 |         # normalize words by lowercasing and dropping non-alpha
41 |         # characters
42 |         normed = re.sub('[^a-z]', '', word.lower())
43 | 
44 |         if not normed:
45 |             continue
46 | 
47 |         yield normed
48 | 
49 | def word_pairs(text):
50 |     """Given some text, yield out pairs of words (e.g. bigrams)."""
51 |     last_word = None
52 | 
53 |     for word in words(text):
54 |         if last_word is not None:
55 |             yield last_word, word
56 |         last_word = word
57 | 
58 |     yield last_word, ""
59 | 
60 | class ReviewAutoPilot(MRJob):
61 |     """Very simple markov model for reviews, parameterized on business category."""
62 | 
63 |     INPUT_PROTOCOL = JSONValueProtocol
64 | 
65 |     def business_join_mapper(self, _, data):
66 |         """Walk through reviews and businesses, yielding out the raw
67 |         data.
68 |         """
69 |         if data['type'] == 'business':
70 |             yield data['business_id'], ('business', data)
71 |         elif data['type'] == 'review':
72 |             yield data['business_id'], ('review', data['text'])
73 | 
74 |     def join_reviews_with_categories_reducer(self, business_id, reviews_or_biz):
75 |         """Join reviews with the categories from the associated
76 |         business.
77 |         """
78 |         categories = None
79 |         reviews = []
80 | 
81 |         for data_type, data in reviews_or_biz:
82 |             if data_type == 'business':
83 |                 categories = data['categories']
84 |             else:
85 |                 reviews.append(data)
86 | 
87 |         # don't bother with businesses that have no categories
88 |         if not categories:
89 |             return
90 | 
91 |         for review in reviews:
92 |             yield categories, review
93 | 
94 |     def review_split_mapper(self, categories, review):
95 |         """Split a review into pairs of words and yield out
96 |         (start word, category), (follow word, count), combining
97 |         repeated pairs into a single emission.
98 |         """
99 |         pair_counts = {}
100 | 
101 |         for pair in word_pairs(review):
102 |             pair_counts[pair] = pair_counts.get(pair, 0) + 1
103 | 
104 |         for (start, follow), count in pair_counts.iteritems():
105 |             for category in categories:
106 |                 yield (start, category), (follow, count)
107 | 
108 |     def follow_probs_reducer(self, start_word_category, follow_word_counts):
109 |         """Given a start word and a category, find the distribution
110 |         over next words. When normalized, these counts define the
111 |         transition probabilities for the markov chain.
112 |         """
113 |         start, category = start_word_category
114 |         follow_counts = {}
115 | 
116 |         for follow_word, count in follow_word_counts:
117 |             follow_counts[follow_word] = follow_counts.get(follow_word, 0) + count
118 | 
119 |         total_transitions = float(sum(follow_counts.itervalues()))
120 | 
121 |         include_word = lambda count: count > MINIMUM_PAIR_COUNT and count / total_transitions > MINIMUM_FOLLOW_PERCENTAGE
122 |         thresholded_follow_counts = dict((word, count) for word, count in follow_counts.iteritems() if include_word(count))
123 | 
124 |         # if every transition was filtered out (each one either occurred
125 |         # too few times or made up too small a percentage of outgoing
126 |         # transitions), skip this start word.
127 |         if not thresholded_follow_counts:
128 |             return
129 | 
130 |         # put a small weight on '' (the empty string), which means 'end of review'.
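131 |         # For example, surviving counts {'foo': 22} become
132 |         # {'foo': 22, '': 0.22}, which normalizes to roughly
133 |         # {'foo': 0.990099, '': 0.009901} - the distribution that
134 |         # test_autopilot's smoke test expects.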
135 |         thresholded_follow_counts[''] = thresholded_follow_counts.get('', 0.0)
136 |         thresholded_follow_counts[''] += END_OF_REVIEW_RATE * float(sum(thresholded_follow_counts.itervalues()))
137 | 
138 |         # re-normalize the remaining transition weights.
139 |         new_total = float(sum(thresholded_follow_counts.itervalues()))
140 |         percentages = dict((follow, count / new_total) for follow, count in thresholded_follow_counts.iteritems())
141 | 
142 |         yield (category, start), percentages
143 | 
144 |     def steps(self):
145 |         return [self.mr(mapper=self.business_join_mapper, reducer=self.join_reviews_with_categories_reducer),
146 |                 self.mr(mapper=self.review_split_mapper, reducer=self.follow_probs_reducer)]
147 | 
148 | if __name__ == "__main__":
149 |     ReviewAutoPilot().run()
150 | 
151 | 
--------------------------------------------------------------------------------
/category_predictor/category_predictor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Yelp and Contributors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | """An MRJob that constructs the data necessary to predict category
16 | information.
17 | """
18 | 
19 | import re
20 | 
21 | from mrjob.job import MRJob
22 | from mrjob.protocol import JSONValueProtocol
23 | 
24 | # require more than this many occurrences for a word to show up for a
25 | # given category
26 | MINIMUM_OCCURENCES = 100
27 | 
28 | def words(text):
29 |     """An iterator over tokens (words) in text. Replace this with a
30 |     stemmer or other smarter logic.
31 |     """
32 | 
33 |     for word in text.split():
34 |         # normalize words by lowercasing and dropping non-alpha
35 |         # characters
36 |         normed = re.sub('[^a-z]', '', word.lower())
37 | 
38 |         if normed:
39 |             yield normed
40 | 
41 | class CategoryPredictor(MRJob):
42 |     """A very simple category predictor. Trains on review data and
43 |     generates a simple naive-bayes model that can predict the category
44 |     of some text.
45 |     """
46 | 
47 |     # The input is the dataset - interpret each line as a single json
48 |     # value (the key will be None)
49 |     INPUT_PROTOCOL = JSONValueProtocol
50 | 
51 |     def review_category_mapper(self, _, data):
52 |         """Visit reviews and businesses, yielding out (business_id,
53 |         (review or category)).
54 |         """
55 |         if data['type'] == 'review':
56 |             yield data['business_id'], ('review', data['text'])
57 |         elif data['type'] == 'business':
58 |             yield data['business_id'], ('categories', data['categories'])
59 | 
60 |     def add_categories_to_reviews_reducer(self, business_id, reviews_or_categories):
61 |         """Yield out (category, review) for each category-review
62 |         pair. We'll do the actual review tokenizing in the next
63 |         mapper, since you typically have much more map-capacity than
64 |         reduce-capacity.
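65 | 
66 |         For example, a business in categories ['Food'] with two reviews
67 |         yields ('all', {'Food': 2}) once, then ('Food', review) for each
68 |         of the two reviews.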
65 | """ 66 | categories = None 67 | reviews = [] 68 | 69 | for data_type, data in reviews_or_categories: 70 | if data_type == 'review': 71 | reviews.append(data) 72 | else: 73 | categories = data 74 | 75 | # We either didn't find a matching business, or this biz 76 | # doesn't have any categories. In either case, we can drop 77 | # these reviews. 78 | if not categories: 79 | return 80 | 81 | # Yield out review counts in the same format as the 82 | # tokenize_reviews_mapper. We'll special case the 'all' key in 83 | # that method, but afterwards it will be treated the same. 84 | yield 'all', dict((cat, len(reviews)) for cat in categories) 85 | 86 | for category in categories: 87 | for review in reviews: 88 | yield category, review 89 | 90 | def tokenize_reviews_mapper(self, category, review): 91 | """Split reviews into words, yielding out (category, {word: count}) and 92 | ('all', {word: count}). We yield out a dictionary of counts 93 | rather than a single entry per-word to reduce the amount of 94 | i/o between mapper and reducer. 95 | """ 96 | # special case - pass through category counts (which are 97 | # already formatted like the output of this mapper) 98 | if category == 'all': 99 | yield category, review 100 | return 101 | 102 | counts = {} 103 | for word in words(review): 104 | counts[word] = counts.get(word, 0) + 1 105 | 106 | yield category, counts 107 | 108 | def sum_counts(self, category, counts): 109 | """Sum up dictionaries of counts, filter out rare words 110 | (bucketing them into an unknown word bucket), and yield the 111 | counts. 112 | """ 113 | raw_count = {} 114 | 115 | # sum up the individual counts 116 | for word_count in counts: 117 | for word, count in word_count.iteritems(): 118 | raw_count[word] = raw_count.get(word, 0) + count 119 | 120 | # don't filter out low-mass categories 121 | if category == 'all': 122 | yield category, raw_count 123 | return 124 | 125 | # filter out low-count words; assign a very low mass to 126 | # unknown words 127 | filtered_counts = {} 128 | for word, count in raw_count.iteritems(): 129 | if count > MINIMUM_OCCURENCES: 130 | filtered_counts[word] = count 131 | 132 | # don't include categories with every word filtered out 133 | if not filtered_counts: 134 | return 135 | 136 | # Assign a small mass to unknown tokens - check out 137 | # http://en.wikipedia.org/wiki/Laplacian_smoothing for background. 138 | filtered_counts['UNK'] = 0.01 139 | 140 | # emit the result 141 | yield category, filtered_counts 142 | 143 | def steps(self): 144 | return [self.mr(mapper=self.review_category_mapper, 145 | reducer=self.add_categories_to_reviews_reducer), 146 | self.mr(mapper=self.tokenize_reviews_mapper, 147 | reducer=self.sum_counts)] 148 | 149 | 150 | if __name__ == "__main__": 151 | CategoryPredictor().run() 152 | 153 | --------------------------------------------------------------------------------