├── LICENSE ├── README.md ├── avgdelay ├── flights_per_day_january │ ├── mapper.py │ └── reducer.py ├── mapper.py └── reducer.py ├── blog_regression ├── framework.py ├── mapper.py ├── params.txt ├── reducer.py ├── spark-app.py └── spark-sklearn-app.py ├── data ├── BlogFeedback.zip ├── avgprice_kwh_state.zip ├── baseball.tgz ├── dating.zip ├── earthquakes.zip ├── flight_data.zip ├── log_data.zip ├── ml-100k.zip ├── ml-1m.zip ├── obama.txt.zip ├── ontime.zip ├── reuters.zip ├── sentiment_analysis.tgz ├── sf_parking.zip ├── shakespeare.txt.zip ├── spam_classifier.zip ├── stock_analysis.tgz ├── textcorpus.zip ├── weblogs.csv.zip ├── wikistats.zip └── winequality.tgz ├── describe ├── mapper.py ├── reducer.py └── spark-app.py ├── flume ├── client.conf ├── collector.conf ├── impression_tracker.py └── setup.sh ├── hive ├── baseball │ ├── init.hql │ └── top_50_players_by_homeruns.hql ├── flight_data │ ├── create_schema.hql │ ├── load_data.hql │ └── select_queries.hql └── log_data │ ├── create_apache_log_table.sql │ └── load_apache_log_data.sql ├── mllib ├── classification │ └── spam_classifier.py ├── clustering │ └── earthquakes_clustering.py └── collaborative_filtering │ └── als │ ├── matchmaker.py │ ├── movie_recommender.py │ └── rateMovies ├── recommender ├── framework.py ├── mapper1.py ├── mapper2.py ├── recommendations.py ├── reducer1.py └── reducer2.py ├── spark ├── movie_ratings.py ├── prime.py ├── sales.py └── wordcount.py ├── sparksql └── SF Parking Availability.ipynb ├── streaming └── framework.py ├── tfidf ├── mapper1.py ├── mapper2.py ├── mapper3.py ├── reducer1.py ├── reducer2.py ├── reducer3.py └── spark-app.py ├── trigrams ├── framework.py ├── mapper.py ├── reducer.py ├── spark-app.py └── stopwords.txt ├── wines ├── framework.py ├── mapper.py ├── reducer.py ├── theta.json └── wines.py └── wordcount ├── StreamingWordCount.zip ├── StreamingWordCount ├── job.sh ├── mapper.py └── reducer.py ├── WordCount.zip └── WordCount ├── SumReducer.java ├── WordCount.java └── WordMapper.java /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/README.md -------------------------------------------------------------------------------- /avgdelay/flights_per_day_january/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/avgdelay/flights_per_day_january/mapper.py -------------------------------------------------------------------------------- /avgdelay/flights_per_day_january/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/avgdelay/flights_per_day_january/reducer.py -------------------------------------------------------------------------------- /avgdelay/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/avgdelay/mapper.py -------------------------------------------------------------------------------- /avgdelay/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/avgdelay/reducer.py -------------------------------------------------------------------------------- /blog_regression/framework.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/framework.py -------------------------------------------------------------------------------- /blog_regression/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/mapper.py -------------------------------------------------------------------------------- /blog_regression/params.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/params.txt -------------------------------------------------------------------------------- /blog_regression/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/reducer.py -------------------------------------------------------------------------------- /blog_regression/spark-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/spark-app.py -------------------------------------------------------------------------------- /blog_regression/spark-sklearn-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/blog_regression/spark-sklearn-app.py -------------------------------------------------------------------------------- /data/BlogFeedback.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/BlogFeedback.zip -------------------------------------------------------------------------------- /data/avgprice_kwh_state.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/avgprice_kwh_state.zip -------------------------------------------------------------------------------- /data/baseball.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/baseball.tgz -------------------------------------------------------------------------------- /data/dating.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/dating.zip -------------------------------------------------------------------------------- /data/earthquakes.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/earthquakes.zip -------------------------------------------------------------------------------- /data/flight_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/flight_data.zip -------------------------------------------------------------------------------- /data/log_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/log_data.zip -------------------------------------------------------------------------------- /data/ml-100k.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/ml-100k.zip -------------------------------------------------------------------------------- /data/ml-1m.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/ml-1m.zip -------------------------------------------------------------------------------- /data/obama.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/obama.txt.zip -------------------------------------------------------------------------------- /data/ontime.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/ontime.zip -------------------------------------------------------------------------------- /data/reuters.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/reuters.zip -------------------------------------------------------------------------------- /data/sentiment_analysis.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/sentiment_analysis.tgz -------------------------------------------------------------------------------- /data/sf_parking.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/sf_parking.zip -------------------------------------------------------------------------------- /data/shakespeare.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/shakespeare.txt.zip -------------------------------------------------------------------------------- /data/spam_classifier.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/spam_classifier.zip -------------------------------------------------------------------------------- /data/stock_analysis.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/stock_analysis.tgz -------------------------------------------------------------------------------- /data/textcorpus.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/textcorpus.zip -------------------------------------------------------------------------------- /data/weblogs.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/weblogs.csv.zip -------------------------------------------------------------------------------- /data/wikistats.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/wikistats.zip -------------------------------------------------------------------------------- /data/winequality.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/data/winequality.tgz -------------------------------------------------------------------------------- /describe/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/describe/mapper.py -------------------------------------------------------------------------------- /describe/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/describe/reducer.py -------------------------------------------------------------------------------- /describe/spark-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/describe/spark-app.py -------------------------------------------------------------------------------- /flume/client.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/flume/client.conf -------------------------------------------------------------------------------- /flume/collector.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/flume/collector.conf -------------------------------------------------------------------------------- /flume/impression_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/flume/impression_tracker.py -------------------------------------------------------------------------------- /flume/setup.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/flume/setup.sh -------------------------------------------------------------------------------- /hive/baseball/init.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/baseball/init.hql -------------------------------------------------------------------------------- /hive/baseball/top_50_players_by_homeruns.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/baseball/top_50_players_by_homeruns.hql -------------------------------------------------------------------------------- /hive/flight_data/create_schema.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/flight_data/create_schema.hql -------------------------------------------------------------------------------- /hive/flight_data/load_data.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/flight_data/load_data.hql -------------------------------------------------------------------------------- /hive/flight_data/select_queries.hql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/flight_data/select_queries.hql -------------------------------------------------------------------------------- /hive/log_data/create_apache_log_table.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/log_data/create_apache_log_table.sql -------------------------------------------------------------------------------- /hive/log_data/load_apache_log_data.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/hive/log_data/load_apache_log_data.sql -------------------------------------------------------------------------------- /mllib/classification/spam_classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/mllib/classification/spam_classifier.py -------------------------------------------------------------------------------- /mllib/clustering/earthquakes_clustering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/mllib/clustering/earthquakes_clustering.py -------------------------------------------------------------------------------- /mllib/collaborative_filtering/als/matchmaker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/mllib/collaborative_filtering/als/matchmaker.py -------------------------------------------------------------------------------- /mllib/collaborative_filtering/als/movie_recommender.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/mllib/collaborative_filtering/als/movie_recommender.py -------------------------------------------------------------------------------- /mllib/collaborative_filtering/als/rateMovies: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/mllib/collaborative_filtering/als/rateMovies -------------------------------------------------------------------------------- /recommender/framework.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/framework.py -------------------------------------------------------------------------------- /recommender/mapper1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/mapper1.py -------------------------------------------------------------------------------- /recommender/mapper2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/mapper2.py -------------------------------------------------------------------------------- /recommender/recommendations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/recommendations.py -------------------------------------------------------------------------------- /recommender/reducer1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/reducer1.py -------------------------------------------------------------------------------- /recommender/reducer2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/recommender/reducer2.py -------------------------------------------------------------------------------- /spark/movie_ratings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/spark/movie_ratings.py -------------------------------------------------------------------------------- /spark/prime.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/spark/prime.py -------------------------------------------------------------------------------- /spark/sales.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/spark/sales.py -------------------------------------------------------------------------------- /spark/wordcount.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/spark/wordcount.py -------------------------------------------------------------------------------- /sparksql/SF Parking Availability.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/sparksql/SF Parking Availability.ipynb -------------------------------------------------------------------------------- /streaming/framework.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/streaming/framework.py -------------------------------------------------------------------------------- /tfidf/mapper1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/mapper1.py -------------------------------------------------------------------------------- /tfidf/mapper2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/mapper2.py -------------------------------------------------------------------------------- /tfidf/mapper3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/mapper3.py -------------------------------------------------------------------------------- /tfidf/reducer1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/reducer1.py -------------------------------------------------------------------------------- /tfidf/reducer2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/reducer2.py -------------------------------------------------------------------------------- /tfidf/reducer3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/reducer3.py -------------------------------------------------------------------------------- /tfidf/spark-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/tfidf/spark-app.py -------------------------------------------------------------------------------- /trigrams/framework.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/trigrams/framework.py -------------------------------------------------------------------------------- /trigrams/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/trigrams/mapper.py -------------------------------------------------------------------------------- /trigrams/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/trigrams/reducer.py -------------------------------------------------------------------------------- /trigrams/spark-app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/trigrams/spark-app.py -------------------------------------------------------------------------------- /trigrams/stopwords.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/trigrams/stopwords.txt -------------------------------------------------------------------------------- /wines/framework.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wines/framework.py -------------------------------------------------------------------------------- /wines/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wines/mapper.py -------------------------------------------------------------------------------- /wines/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wines/reducer.py -------------------------------------------------------------------------------- /wines/theta.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wines/theta.json -------------------------------------------------------------------------------- /wines/wines.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wines/wines.py -------------------------------------------------------------------------------- /wordcount/StreamingWordCount.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/StreamingWordCount.zip -------------------------------------------------------------------------------- /wordcount/StreamingWordCount/job.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/StreamingWordCount/job.sh -------------------------------------------------------------------------------- /wordcount/StreamingWordCount/mapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/StreamingWordCount/mapper.py -------------------------------------------------------------------------------- /wordcount/StreamingWordCount/reducer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/StreamingWordCount/reducer.py -------------------------------------------------------------------------------- /wordcount/WordCount.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/WordCount.zip -------------------------------------------------------------------------------- /wordcount/WordCount/SumReducer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/WordCount/SumReducer.java -------------------------------------------------------------------------------- /wordcount/WordCount/WordCount.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/WordCount/WordCount.java -------------------------------------------------------------------------------- /wordcount/WordCount/WordMapper.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oreillymedia/Data_Analytics_with_Hadoop/HEAD/wordcount/WordCount/WordMapper.java --------------------------------------------------------------------------------