├── README.md
├── Assignments
│   ├── lab_group_accumulators.ipynb
│   ├── schemas-and-accumulators.ipynb
│   ├── changing_document_shape.ipynb
│   ├── unwind_and_group_lab.ipynb
│   ├── entity-resolution.ipynb
│   ├── cursor_like_methods.ipynb
│   ├── lab__graphlookup.ipynb
│   ├── lookup_lab.ipynb
│   ├── expressions_with_project.ipynb
│   ├── linear-regression-on-titanic-data-set.ipynb
│   └── Decision+Tree.ipynb
└── LessonNotes
    ├── linear-regressions-with-mongodb.ipynb
    ├── migrating-schema-lesson.ipynb
    ├── principal-component-analysis.ipynb
    ├── associative_rules__lesson.ipynb
    ├── pearson_correlation.ipynb
    └── tree_like__lesson.ipynb
/README.md: -------------------------------------------------------------------------------- 1 | # MongoDB Aggregation Framework 2 | 3 | Work from the [MongoDB Aggregation Framework](https://www.coursera.org/learn/mongodb-aggregation-framework) course on Coursera. 4 | 5 | ## Content 6 | 7 | ### The Fundamentals of MongoDB Aggregation 8 | 9 | - Aggregation Introduction 10 | - The Concept of Pipelines 11 | - Aggregation Structure and Syntax 12 | - $match: Filtering Documents 13 | - Using $project 14 | - Using Expressions 15 | - Cursor-Like Methods and Stages 16 | - The $group Stage 17 | - $unwind 18 | - The $lookup Stage 19 | 20 | ### Leveraging MongoDB's Flexible Schema 21 | 22 | - mongoimport 23 | - Importance of Schema 24 | - Exploring Schemas 25 | - Migrating Your Schema 26 | - Views 27 | - Supplementing Schemas with Accumulators 28 | - Tree-like Data in Individual Documents 29 | - Expressive Lookup Basics 30 | - Entity Resolution with $lookup 31 | 32 | ### Machine Learning with MongoDB 33 | 34 | - Calculation of Pearson's Rho 35 | - Intro: Association Rule Learning 36 | - Principal Component Analysis 37 | - Intro to Linear Regression 38 | - Decision Trees 39 | - Intro to Clustering Algorithms 40 | -------------------------------------------------------------------------------- /Assignments/lab_group_accumulators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 19 | "course_client = pymongo.MongoClient(course_cluster_uri)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "movies = course_client['aggregations']['movies']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Lab: Group Accumulators\n", 36 | "\n", 37 | "## For this lab, you'll be using group accumulators."
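A quick warm-up before the lab's question: `$group` accepts several accumulator operators side by side in a single stage. The sketch below shows the exact pattern this lab asks for, but self-contained — it assumes a local `mongod` on the default port and a throwaway `test.scores` collection (hypothetical names, not the course cluster):

```python
# Minimal $group accumulator sketch. Assumes a local mongod on
# localhost:27017 and a throwaway collection test.scores -- hypothetical
# names, not the course cluster used in this lab.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
scores = client["test"]["scores"]
scores.drop()
scores.insert_many([{"value": 4.5}, {"value": 7.5}, {"value": 9.2}])

result = scores.aggregate([
    {"$group": {
        "_id": None,                                # a single group over all input documents
        "highest": {"$max": "$value"},
        "lowest": {"$min": "$value"},
        "average": {"$avg": "$value"},
        "sample_st_dev": {"$stdDevSamp": "$value"}  # sample (n-1) standard deviation
    }}
])
print(list(result))
```

`$stdDevSamp` gives the sample standard deviation the question below asks for; `$stdDevPop` would give the population flavor instead.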
38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Question\n", 45 | "\n", 46 | "In this lab, you will need to capture the highest `imdb.rating`, lowest `imdb.rating`, average, and **sample** standard deviation for all films that won an Oscar.\n", 47 | "\n", 48 | "You may find documentation on [group accumulators](https://docs.mongodb.com/manual/reference/operator/aggregation-group/#group-accumulator-operators) helpful!\n", 49 | "\n", 50 | "The matching stage to find films with Oscar wins is provided below." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "matching = {\n", 60 | " \"$match\": {\n", 61 | " \"awards\": { \"$regex\": \"Won \\\\d{1,2} Oscars?\"}\n", 62 | " }\n", 63 | "}" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 13, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "grouping = {\n", 73 | " \"$group\": {\n", 74 | " \"_id\": None,\n", 75 | " \"highest_rating\": { \"$max\": \"$imdb.rating\" },\n", 76 | " \"lowest_rating\": { \"$min\": \"$imdb.rating\" },\n", 77 | " \"average_rating\": { \"$avg\": \"$imdb.rating\" },\n", 78 | " \"sample_st_dev_rating\": { \"$stdDevSamp\": \"$imdb.rating\" }\n", 79 | " }\n", 80 | "}" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 14, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "[{'_id': None,\n", 92 | " 'highest_rating': 9.2,\n", 93 | " 'lowest_rating': 4.5,\n", 94 | " 'average_rating': 7.527024070021882,\n", 95 | " 'sample_st_dev_rating': 0.5988145513344504}]" 96 | ] 97 | }, 98 | "metadata": {}, 99 | "output_type": "display_data" 100 | } 101 | ], 102 | "source": [ 103 | "pipeline = [\n", 104 | " matching,\n", 105 | " grouping\n", 106 | "]\n", 107 | "\n", 108 | "display(list(movies.aggregate(pipeline)))" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.6.5" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 2 140 | } 141 | -------------------------------------------------------------------------------- /Assignments/schemas-and-accumulators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pymongo import MongoClient\n", 10 | "import pprint" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 20 | "course_client = MongoClient(course_cluster_uri)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 15, 26 | "metadata": {}, 27 | 
"outputs": [], 28 | "source": [ 29 | "orders = course_client['coursera-agg']['orders']" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 24, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Replace XXXX with a pipeline to add the fields mean_order_quantity, mean_order_unit_price,\n", 39 | "# order_quantity, and order_total to each document. You can also add a $sort and $limit to your\n", 40 | "# pipeline to answer the verification question.\n", 41 | "pipeline = [\n", 42 | " {\n", 43 | " \"$addFields\": {\n", 44 | " \"mean_order_quantity\": { \"$avg\": \"$purchases.quantity\" },\n", 45 | " \"mean_order_unit_price\": { \"$avg\": \"$purchases.unit_price\" },\n", 46 | " \"order_quantity\": { \"$sum\": \"$purchases.quantity\" },\n", 47 | " \"order_total\": { \n", 48 | " \"$reduce\": {\n", 49 | " \"input\": \"$purchases\",\n", 50 | " \"initialValue\": 0.00,\n", 51 | " \"in\": {\n", 52 | " \"$add\": [\n", 53 | " \"$$value\",\n", 54 | " { \"$multiply\": [ \"$$this.quantity\", \"$$this.unit_price\"] }\n", 55 | " ]\n", 56 | " }\n", 57 | " }\n", 58 | " }\n", 59 | " }\n", 60 | " },\n", 61 | " {\n", 62 | " \"$sort\": { \"order_total\": -1 }\n", 63 | " },\n", 64 | " {\n", 65 | " \"$limit\": 1\n", 66 | " }\n", 67 | "]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 25, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "cursor = orders.aggregate(pipeline)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 26, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "{'_id': 581483,\n", 89 | " 'country': 'United Kingdom',\n", 90 | " 'customer_id': 16446,\n", 91 | " 'date': datetime.datetime(2011, 12, 9, 9, 15),\n", 92 | " 'mean_order_quantity': 80995.0,\n", 93 | " 'mean_order_unit_price': 2.08,\n", 94 | " 'order_quantity': 80995,\n", 95 | " 'order_total': 168469.6,\n", 96 | " 'purchases': [{'description': 'PAPER CRAFT , LITTLE BIRDIE',\n", 97 | " 'quantity': 80995,\n", 98 | " 'stock_code': '23843',\n", 99 | " 'unit_price': 2.08}]}\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "for doc in cursor:\n", 105 | " pprint.pprint(doc)\n", 106 | "#168469.6" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.6.5" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 2 131 | } 132 | -------------------------------------------------------------------------------- /Assignments/changing_document_shape.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 19 | "course_client = 
pymongo.MongoClient(course_cluster_uri)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "movies = course_client['aggregations']['movies']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Lab: Changing Document Shape\n", 36 | "\n", 37 | "## For this lab, you'll be using expressions to change document shape and perform an analysis \n", 38 | "\n", 39 | "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Prelude\n", 47 | "\n", 48 | "Our movies dataset has a lot of different documents, some with more convoluted\n", 49 | "titles than others. \n", 50 | "\n", 51 | "If we'd like to analyze our collection to find movie titles\n", 52 | "that are composed of only one word, we **could** fetch all the movies in the\n", 53 | "dataset and do some processing in a client application, but the Aggregation\n", 54 | "Framework allows us to do this on the server!\n", 55 | "\n", 56 | "Ensure you explore the [string expressions](https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#string-expressions) and the [array expressions](https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#array-expressions) before attempting this lab.\n", 57 | "\n", 58 | "### Question\n", 59 | "\n", 60 | "Using the Aggregation Framework, find a count of the number of movies that have\n", 61 | "a title composed of one word. To clarify, \"Cinderella\" and \"3-25\" should count,\n", 62 | "where as \"Cast Away\" would not.\n", 63 | "\n", 64 | "Don't forget to append the following `counting` variable to your pipeline!" 
65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "counting = {\n", 74 | " \"$count\": \"one_word_titles\"\n", 75 | "}" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "shaping = {\n", 85 | " \"$project\": {\n", 86 | " \"title_size\": { \"$size\": { \"$split\": [ \"$title\", \" \" ] } },\n", 87 | " }\n", 88 | "}" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "matching = {\n", 98 | " \"$match\": { \"title_size\": { \"$eq\": 1 } }\n", 99 | "}" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 8, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "[{'one_word_titles': 44497}]" 111 | ] 112 | }, 113 | "metadata": {}, 114 | "output_type": "display_data" 115 | } 116 | ], 117 | "source": [ 118 | "pipeline = [\n", 119 | " shaping,\n", 120 | " matching,\n", 121 | " counting\n", 122 | "]\n", 123 | "\n", 124 | "display(list(movies.aggregate(pipeline)))\n", 125 | "#8068" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.6.5" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /Assignments/unwind_and_group_lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 19 | "course_client = pymongo.MongoClient(course_cluster_uri)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "movies = course_client['aggregations']['movies']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Lab: Using ``$unwind`` and ``$group``\n", 36 | "\n", 37 | "## For this lab, you'll be using both the ``$unwind`` and ``$group`` stages.\n", 38 | "\n", 39 | "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster." 
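Before the question, a reminder of what `$unwind` does mechanically: one input document with an n-element array becomes n output documents, each carrying a single element in that field plus copies of the remaining fields. A minimal sketch, assuming a local `mongod` and a throwaway `test.unwind_demo` collection (hypothetical names, unrelated to the lab):

```python
# Sketch of $unwind semantics. Assumes a local mongod on localhost:27017
# and a throwaway collection test.unwind_demo -- hypothetical names, not
# the lab's movies collection.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
demo = client["test"]["unwind_demo"]
demo.drop()
demo.insert_one({"title": "Example", "cast": ["Actor A", "Actor B"]})

for doc in demo.aggregate([{"$unwind": "$cast"}]):
    print(doc)  # two output documents, one per cast member
```

Grouping on the unwound field is what turns per-movie documents into per-cast-member statistics in the solution below.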
40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Question\n", 47 | "\n", 48 | "Let's use our increasing understanding of the Aggregation Framework to explore our\n", 49 | "movies collection in more detail. We'd like to calculate how many movies every\n", 50 | "**cast** member has been in, and get an average ``imdb.rating`` for each\n", 51 | "``cast`` member.\n", 52 | "\n", 53 | "Which cast member has been in the most movies with **English** as an available language?\n", 54 | "\n", 55 | "To verify that you've successfully completed this exercise, please submit your answer as the sum of the number of films and average rating for this cast member.\n", 56 | "\n", 57 | "For example, if the cast member was output like so:\n", 58 | "\n", 59 | "    { \"_id\": \"James Dean\", \"numFilms\": 11, \"average\": 7.1 }\n", 60 | "Then the answer would be 18.1." 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "predicate = {\n", 70 | "    \"$match\": { \n", 71 | "        \"imdb.rating\": { \"$exists\": True },\n", 72 | "        \"languages\": { \"$in\": [ \"English\", \"$languages\" ] }, \n", 73 | "        \"cast\": { \"$elemMatch\": { \"$exists\": True } }\n", 74 | "    }\n", 75 | "}" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "unwinding = {\n", 85 | "    \"$unwind\": \"$cast\"\n", 86 | "}" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "grouping = {\n", 96 | "    \"$group\": {\n", 97 | "        \"_id\": \"$cast\",\n", 98 | "        \"num_films\": { \"$sum\": 1 },\n", 99 | "        \"average\": { \"$avg\": \"$imdb.rating\" }\n", 100 | "    } \n", 101 | "}" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "shaping = {\n", 111 | "    \"$project\": {\n", 112 | "        \"_id\": 1,\n", 113 | "        \"num_films\": 1,\n", 114 | "        \"average\": 1\n", 115 | "    }\n", 116 | "}" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 9, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "sorting = {\n", 126 | "    \"$sort\": {\n", 127 | "        \"num_films\": -1\n", 128 | "    }\n", 129 | "}" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "limiting = {\n", 139 | "    \"$limit\": 1\n", 140 | "}" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 11, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "pipeline = [\n", 150 | "    predicate,\n", 151 | "    unwinding,\n", 152 | "    grouping,\n", 153 | "    shaping,\n", 154 | "    sorting,\n", 155 | "    limiting\n", 156 | "]" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 12, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "{'_id': 'John Wayne', 'num_films': 107, 'average': 6.424299065420561}" 168 | ] 169 | }, 170 | "metadata": {}, 171 | "output_type": "display_data" 172 | } 173 | ], 174 | "source": [ 175 | "display(list(movies.aggregate(pipeline))[0])\n", 176 | "#{'_id': 'John Wayne', 'num_films': 107, 'average': 6.424299065420561}" 177 | ] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | 
"language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.6.5" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /Assignments/entity-resolution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pymongo import MongoClient\n", 10 | "import pprint" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "For this lab, use the provided `course-cluster-uri` below." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 27 | "course_client = MongoClient(course_cluster_uri)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "people_master = course_client['coursera-agg']['people_master']" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 11, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# Replace this with a match stage that will return documents that match on\n", 46 | "# first_name OR last_name OR birthday OR email.\n", 47 | "greedy_match = {\n", 48 | " \"$match\": { \n", 49 | " \"$expr\": { \n", 50 | " \"$or\": [\n", 51 | " { \"$eq\": [\"$first_name\", \"$$first_name\"] },\n", 52 | " { \"$eq\": [\"$last_name\", \"$$last_name\"] },\n", 53 | " { \"$eq\": [\"$birthday\", \"$$birthday\"] },\n", 54 | " { \"$eq\": [\"$email\", \"$$email\"] },\n", 55 | " ]\n", 56 | " }\n", 57 | " }\n", 58 | "}" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 9, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Replace this with a stage that will add a field called 'matchScore', where\n", 68 | "# matchScore is the number of fields (first_name, last_name, birthday, email)\n", 69 | "# that match the source document.\n", 70 | "match_score_calculation = {\n", 71 | " \"$addFields\": {\n", 72 | " \"matchScore\": {\n", 73 | " \"$sum\": [\n", 74 | " { \n", 75 | " \"$cond\": [\n", 76 | " { \"$eq\": [\"$first_name\", \"$$first_name\"] }, 1, 0\n", 77 | " ] \n", 78 | " },\n", 79 | " { \n", 80 | " \"$cond\": [\n", 81 | " { \"$eq\": [\"$last_name\", \"$$last_name\"] }, 1, 0\n", 82 | " ] \n", 83 | " },\n", 84 | " { \n", 85 | " \"$cond\": [\n", 86 | " { \"$eq\": [\"$birthday\", \"$$birthday\"] }, 1, 0\n", 87 | " ] \n", 88 | " },\n", 89 | " { \n", 90 | " \"$cond\": [\n", 91 | " { \"$eq\": [\"$email\", \"$$email\"] }, 1, 0\n", 92 | " ] \n", 93 | " }\n", 94 | " ]\n", 95 | " } \n", 96 | " }\n", 97 | "}" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 12, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "cursor = people_master.aggregate([\n", 107 | " {\n", 108 | " \"$lookup\": {\n", 109 | " \"from\": \"people_import\",\n", 110 | " \"let\": {\n", 111 | " 
\"first_name\": \"$first_name\",\n", 112 | " \"last_name\": \"$last_name\",\n", 113 | " \"email\": \"$email\",\n", 114 | " \"birthday\": \"$birthday\",\n", 115 | " },\n", 116 | " \"pipeline\": [\n", 117 | " greedy_match,\n", 118 | " match_score_calculation,\n", 119 | " {\n", 120 | " \"$match\": {\n", 121 | " \"matchScore\": { \"$gte\": 3 }\n", 122 | " }\n", 123 | " },\n", 124 | " {\n", 125 | " \"$sort\": { \"matchScore\": -1 }\n", 126 | " },\n", 127 | " {\n", 128 | " \"$limit\": 5\n", 129 | " }\n", 130 | " ],\n", 131 | " \"as\": \"matches\"\n", 132 | " }\n", 133 | " },\n", 134 | " {\n", 135 | " \"$match\": {\n", 136 | " \"matches.matchScore\": 3\n", 137 | " }\n", 138 | " }\n", 139 | "])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 13, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "19" 151 | ] 152 | }, 153 | "execution_count": 13, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "len(list(cursor))\n", 160 | "#19" 161 | ] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.6.5" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 2 185 | } 186 | -------------------------------------------------------------------------------- /Assignments/cursor_like_methods.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 19 | "course_client = pymongo.MongoClient(course_cluster_uri)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "movies = course_client['aggregations']['movies']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Lab: Using Cursor-like aggregation stages\n", 36 | "\n", 37 | "## For this lab, you'll have to use cursor-like aggregation stages to find the answer for the following scenario.\n", 38 | "\n", 39 | "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster.\n", 40 | "\n", 41 | "### Movie Night\n", 42 | "\n", 43 | "Your organization has a movie night scheduled, and you've again been tasked with coming up with a selection.\n", 44 | "\n", 45 | "HR has polled employees and assembled the following list of preferred actresses and actors." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "favorites = [\n", 55 | "    \"Sandra Bullock\",\n", 56 | "    \"Tom Hanks\",\n", 57 | "    \"Julia Roberts\",\n", 58 | "    \"Kevin Spacey\",\n", 59 | "    \"George Clooney\"\n", 60 | "]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "For movies released in the **USA** with a ``tomatoes.viewer.rating`` greater\n", 68 | "than or equal to **3**, calculate a new field called num_favs that represents how\n", 69 | "many **favorites** appear in the ``cast`` field of the movie.\n", 70 | "\n", 71 | "Sort your results by ``num_favs``, ``tomatoes.viewer.rating``, and ``title``,\n", 72 | "all in descending order.\n", 73 | "\n", 74 | "What is the ``title`` of the **25th** film in the aggregation result?\n", 75 | "\n", 76 | "**Hint**: MongoDB has a great expression for quickly determining whether there are common elements in lists, ``$setIntersection``" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 19, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "#Check that the title exists, countries contains USA, rating >=3 and cast is array.\n", 86 | "predicate = {\n", 87 | "    \"$match\": { \n", 88 | "        \"title\": { \"$exists\": True },\n", 89 | "        \"countries\": { \"$in\": [ \"USA\", \"$countries\" ] }, \n", 90 | "        \"cast\": { \"$elemMatch\": { \"$exists\": True } }, \n", 91 | "        \"tomatoes.viewer.rating\": { \"$gte\": 3 }\n", 92 | "    }\n", 93 | "}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "#Project required fields and create num_favs field\n", 103 | "projection = {\n", 104 | "    \"$project\": {\n", 105 | "        \"title\": 1,\n", 106 | "        \"tomatoes.viewer.rating\": 1,\n", 107 | "        \"num_favs\": { \"$size\": { \"$setIntersection\" : [ favorites, \"$cast\" ] } }\n", 108 | "    }\n", 109 | "}" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "sorting = {\n", 119 | "    \"$sort\": {\n", 120 | "        \"num_favs\": -1,\n", 121 | "        \"tomatoes.viewer.rating\": -1,\n", 122 | "        \"title\": -1\n", 123 | "    }\n", 124 | "}" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 15, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "skipping = {\n", 134 | "    \"$skip\": 24\n", 135 | "}" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 16, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "limiting = {\n", 145 | "    \"$limit\": 1\n", 146 | "}" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 20, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "[{'_id': ObjectId('573a13ddf29313caabdb320f'),\n", 158 | "  'title': 'The Heat',\n", 159 | "  'tomatoes': {'viewer': {'rating': 3.8}},\n", 160 | "  'num_favs': 1}]" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | } 166 | ], 167 | "source": [ 168 | "pipeline = [\n", 169 | "    predicate,\n", 170 | "    projection,\n", 171 | "    sorting,\n", 172 | "    skipping,\n", 173 | "    limiting\n", 174 | "]\n", 175 | "\n", 176 | "display(list(movies.aggregate(pipeline)))\n", 177 | "#The Heat" 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3", 184 | "language": "python", 185 | "name": "python3" 186 | }, 
187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.6.5" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 2 202 | } 203 | -------------------------------------------------------------------------------- /Assignments/lab__graphlookup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Collecting dateparser\n", 13 | " Using cached https://files.pythonhosted.org/packages/ac/9e/1aa87c0c59f9731820bfd20a8b148d97b315530c2c92d1fb300328c8c42f/dateparser-0.7.0-py2.py3-none-any.whl\n", 14 | "Requirement already satisfied: python-dateutil in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from dateparser) (2.7.3)\n", 15 | "Collecting regex (from dateparser)\n", 16 | " Using cached https://files.pythonhosted.org/packages/7c/11/89b423ecd55990abd66fe3742992c5c13f951b8b4447deb1d7cc3e292611/regex-2018.07.11-cp36-none-win_amd64.whl\n", 17 | "Collecting tzlocal (from dateparser)\n", 18 | "Requirement already satisfied: pytz in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from dateparser) (2018.4)\n", 19 | "Requirement already satisfied: six>=1.5 in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from python-dateutil->dateparser) (1.11.0)\n", 20 | "Installing collected packages: regex, tzlocal, dateparser\n", 21 | "Successfully installed dateparser-0.7.0 regex-2018.7.11 tzlocal-1.5.1\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "!pip install dateparser" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import pymongo\n", 36 | "import pprint\n", 37 | "import dateparser\n", 38 | "from bson.son import SON\n", 39 | "\n", 40 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 41 | "course_client = pymongo.MongoClient(course_cluster_uri)\n", 42 | "movies = course_client['aggregations']['movies']" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Lab : $graphLookup\n", 50 | "\n", 51 | "For this lab, you'll be calculating the [degrees of separation](https://en.wikipedia.org/wiki/Six_degrees_of_separation) of directors to \"Steven Spielberg\".\n", 52 | "\n", 53 | "This is a bit like calculating a [\"Kevin Bacon\" number](https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon), but instead of all connections you will only consider connections through the `directors` graph nodes.\n", 54 | "\n", 55 | "Complete the the `$graphLookup` and `$project` stages by correctly constructing the `graph_lookup` and `project_cast` variables below. \n", 56 | "\n", 57 | "To optimize the execution of `$graphLookup` stage, use a `maxDepth` of 6.\n", 58 | "\n", 59 | "For the solution, only provide the numeric portion of the returned output to the validator.\n", 60 | "\n", 61 | "**HINT**: `$reduce` is a powerful expression!" 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 7, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "{'answer': 2}" 73 | ] 74 | }, 75 | "execution_count": 7, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "\n", 82 | "graph_lookup = {\n", 83 | " \"$graphLookup\": {\n", 84 | " \"from\": \"movies\",\n", 85 | " \"startWith\": \"$directors\",\n", 86 | " \"connectFromField\": \"directors\",\n", 87 | " \"connectToField\": \"directors\",\n", 88 | " \"as\": \"network\",\n", 89 | " \"maxDepth\": 6,\n", 90 | " \"depthField\": \"network_level\"\n", 91 | " }\n", 92 | "}\n", 93 | "\n", 94 | "\n", 95 | "project_cast = {\n", 96 | " \"$project\": {\n", 97 | " \"cast\": {\n", 98 | " \"$reduce\": {\n", 99 | " \"input\": \"$cast\",\n", 100 | " \"initialValue\": [],\n", 101 | " \"in\": { \"$concatArrays\" : [\"$$value\", \"$$this\"] }\n", 102 | " }\n", 103 | " }\n", 104 | " }\n", 105 | "}\n", 106 | "\n", 107 | "\n", 108 | "results = movies.aggregate([\n", 109 | " {\n", 110 | " \"$match\": {\n", 111 | " \"directors\": \"Steven Spielberg\"\n", 112 | " }\n", 113 | " },\n", 114 | " {\n", 115 | " \"$project\": {\n", 116 | " \"directors\": 1\n", 117 | " }\n", 118 | " },\n", 119 | " graph_lookup,\n", 120 | " {\n", 121 | " \"$unwind\": \"$network\"\n", 122 | " },\n", 123 | " {\n", 124 | " \"$project\": {\n", 125 | " \"cast\": \"$network.cast\",\n", 126 | " \"level\": \"$network.network_level\"\n", 127 | " }\n", 128 | " },\n", 129 | " {\n", 130 | " \"$group\": {\n", 131 | " \"_id\": \"$level\",\n", 132 | " \"cast\": {\"$addToSet\": \"$cast\"}\n", 133 | " }\n", 134 | " },\n", 135 | " project_cast,\n", 136 | " {\n", 137 | " \"$match\": {\n", 138 | " \"cast\": \"Woody Harrelson\"\n", 139 | " }\n", 140 | " },\n", 141 | " {\n", 142 | " \"$sort\": {\n", 143 | " \"_id\": 1\n", 144 | " }\n", 145 | " },\n", 146 | " {\n", 147 | " \"$project\": {\n", 148 | " \"_id\": 0,\n", 149 | " \"answer\": \"$_id\"\n", 150 | " }\n", 151 | " },\n", 152 | " {\n", 153 | " \"$limit\": 1\n", 154 | " }\n", 155 | "])\n", 156 | "\n", 157 | "list(results)[0]\n", 158 | "#2" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.6.5" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /LessonNotes/linear-regressions-with-mongodb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pandas.io.json import json_normalize\n", 12 | "from pymongo import MongoClient\n", 13 | "from sklearn import linear_model\n", 14 | "from sklearn.model_selection import train_test_split\n", 15 | "import numpy as np\n", 16 | "import seaborn as sns" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": true 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "course_cluster_uri = 
\"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 28 | "course_client = MongoClient(course_cluster_uri)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "weather_db = course_client['100YWeatherSmall']['data']" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "weather_filter = {\n", 51 | " \"$match\": {\n", 52 | " \"airTemperature.value\": { \"$lt\": 900 },\n", 53 | " \"dewPoint.value\": { \"$lt\": 900 },\n", 54 | " \"pressure.value\": { \"$lt\": 9000 },\n", 55 | " }\n", 56 | "}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "weather_projection = {\n", 68 | " \"$project\": {\n", 69 | " \"_id\": 0,\n", 70 | " \"airTemperature.value\": 1,\n", 71 | " \"dewPoint.value\": 1,\n", 72 | " \"pressure.value\": 1,\n", 73 | " }\n", 74 | "}" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "sample_stage = { \"$sample\": { \"size\": 10000 } }" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "cursor = weather_db.aggregate([\n", 97 | " weather_filter,\n", 98 | " weather_projection,\n", 99 | " sample_stage\n", 100 | "])" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "weather_data = list(cursor)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "weather_data[0]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "df = json_normalize(weather_data)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "df.head()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "%matplotlib inline" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": { 162 | "collapsed": true 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "sns.pairplot(df)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "df_x = df.drop(['airTemperature.value'], axis=1)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "df_y = df['airTemperature.value']" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | 
"execution_count": null, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "reg = linear_model.LinearRegression()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "reg.fit(x_train, y_train)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "reg.coef_" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "reg.intercept_" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "reg.predict(x_test)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "np.mean((reg.predict(x_test) - y_test)**2)" 266 | ] 267 | } 268 | ], 269 | "metadata": { 270 | "kernelspec": { 271 | "display_name": "Python 3", 272 | "language": "python", 273 | "name": "python3" 274 | }, 275 | "language_info": { 276 | "codemirror_mode": { 277 | "name": "ipython", 278 | "version": 3 279 | }, 280 | "file_extension": ".py", 281 | "mimetype": "text/x-python", 282 | "name": "python", 283 | "nbconvert_exporter": "python", 284 | "pygments_lexer": "ipython3", 285 | "version": "3.6.5" 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 2 290 | } 291 | -------------------------------------------------------------------------------- /Assignments/lookup_lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 19 | "course_client = pymongo.MongoClient(course_cluster_uri)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "routes = course_client['aggregations']['air_routes']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Lab: Using ``$lookup``\n", 36 | "\n", 37 | "## For this lab, you'll be using the ``$lookup``.\n", 38 | "\n", 39 | "#### The dataset for this lab can be downloaded by clicking the following links - [air_alliances](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/air_alliances.json), [air_routes](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/air_routes.json) - for upload to your own cluster." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "### Question\n", 47 | "\n", 48 | "Which alliance from ``air_alliances`` flies the most **routes** with either a\n", 49 | "Boeing 747 or an Airbus A380 (abbreviated 747 and 380 in ``air_routes``)?\n", 50 | "\n", 51 | "**Note**: Begin from the ``air_routes`` collection!" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# predicate is given this lab\n", 61 | "predicate = {\n", 62 | " \"$match\": {\n", 63 | " \"airplane\": {\"$regex\": \"747|380\"}\n", 64 | " }\n", 65 | "}" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "lookup = {\n", 75 | " \"$lookup\": {\n", 76 | " \"from\": 'air_alliances',\n", 77 | " \"localField\": \"airline.name\",\n", 78 | " \"foreignField\": \"airlines\",\n", 79 | " \"as\": \"alliance\"\n", 80 | " }\n", 81 | "}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "unwinding = {\n", 91 | " \"$unwind\": \"$alliance\"\n", 92 | "}" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "grouping = {\n", 102 | " \"$group\": {\n", 103 | " \"_id\": \"$alliance\",\n", 104 | " \"count\": { \"$sum\": 1 }\n", 105 | " } \n", 106 | "}" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 8, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "sorting = {\n", 116 | " \"$sort\": {\n", 117 | " \"count\": -1\n", 118 | " }\n", 119 | "}" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "pipeline = [\n", 129 | " predicate,\n", 130 | " lookup,\n", 131 | " unwinding,\n", 132 | " grouping,\n", 133 | " sorting\n", 134 | "]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 10, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "[{'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9c'),\n", 146 | " 'name': 'SkyTeam',\n", 147 | " 'airlines': ['Aeroflot',\n", 148 | " 'Aerolinias Argentinas',\n", 149 | " 'Aeromexico',\n", 150 | " 'Air Europa',\n", 151 | " 'Air France',\n", 152 | " 'Alitalia',\n", 153 | " 'China Airlines',\n", 154 | " 'China Eastern Airlines',\n", 155 | " 'China Southern Airlines',\n", 156 | " 'Czech Airlines',\n", 157 | " 'Delta Air Lines',\n", 158 | " 'Garuda Indonesia',\n", 159 | " 'Kenya Airways',\n", 160 | " 'KLM',\n", 161 | " 'Korean Air',\n", 162 | " 'Middle East Airlines',\n", 163 | " 'Saudia',\n", 164 | " 'TAROM',\n", 165 | " 'Vetnam Airlines',\n", 166 | " 'Xiamen Airlines']},\n", 167 | " 'count': 16},\n", 168 | " {'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9b'),\n", 169 | " 'name': 'Star Alliance',\n", 170 | " 'airlines': ['Air Canada',\n", 171 | " 'Adria Airways',\n", 172 | " 'Avianca',\n", 173 | " 'Scandinavian Airlines',\n", 174 | " 'All Nippon Airways',\n", 175 | " 'Brussels Airlines',\n", 176 | " 'Shenzhen Airlines',\n", 177 | " 'Air China',\n", 178 | " 'Air New Zealand',\n", 179 | " 'Asiana Airlines',\n", 180 | " 'Brussels Airlines',\n", 181 | " 'Copa Airlines',\n", 182 | " 'Croatia Airlines',\n", 183 | " 'EgyptAir',\n", 184 | " 'TAP Portugal',\n", 185 | " 'United Airlines',\n", 186 | " 'Turkish Airlines',\n", 187 | " 
'Swiss International Air Lines',\n", 188 | "    'Lufthansa',\n", 189 | "    'EVA Air',\n", 190 | "    'South African Airways',\n", 191 | "    'Singapore Airlines']},\n", 192 | "  'count': 11},\n", 193 | " {'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9d'),\n", 194 | "    'name': 'OneWorld',\n", 195 | "    'airlines': ['Air Berlin',\n", 196 | "     'American Airlines',\n", 197 | "     'British Airways',\n", 198 | "     'Cathay Pacific',\n", 199 | "     'Finnair',\n", 200 | "     'Iberia Airlines',\n", 201 | "     'Japan Airlines',\n", 202 | "     'LATAM Chile',\n", 203 | "     'LATAM Brasil',\n", 204 | "     'Malasya Airlines',\n", 205 | "     'Canadian Airlines',\n", 206 | "     'Quantas',\n", 207 | "     'Qatar Airways',\n", 208 | "     'Royal Jordainian',\n", 209 | "     'SriLanka Airlines',\n", 210 | "     'S7 Airlines']},\n", 211 | "  'count': 11}]" 212 | ] 213 | }, 214 | "metadata": {}, 215 | "output_type": "display_data" 216 | } 217 | ], 218 | "source": [ 219 | "display(list(routes.aggregate(pipeline)))\n", 220 | "#SkyTeam, 16" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.6.5" 241 | } 242 | }, 243 | "nbformat": 4, 244 | "nbformat_minor": 2 245 | } 246 | -------------------------------------------------------------------------------- /LessonNotes/migrating-schema-lesson.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pymongo import MongoClient\n", 12 | "from bson.objectid import ObjectId\n", 13 | "from bson.decimal128 import Decimal128\n", 14 | "import json" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "class JSONEncoder(json.JSONEncoder):\n", 26 | "    def default(self, o):\n", 27 | "        if isinstance(o, ObjectId) or isinstance(o, Decimal128):\n", 28 | "            return str(o)\n", 29 | "        return json.JSONEncoder.default(self, o)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Data source\n", 37 | "\n", 38 | "If you do not change the data URI (*course_cluster_uri*), you can execute most\n", 39 | "of this notebook; however, you will not be able to write to the database.\n", 40 | "\n", 41 | "To successfully execute the pipelines with an $out/save stage in this notebook,\n", 42 | "point to your own Atlas cluster into which you will have imported the *retail.csv* dataset.\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 54 | "course_client = MongoClient(course_cluster_uri)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | 
"source": [ 65 | "retail_col = course_client['coursera-agg']['retail']" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "assemble = {\n", 77 | " \"$group\": {\n", 78 | " \"_id\": {\n", 79 | " \"InvoiceNo\": \"$InvoiceNo\",\n", 80 | " \"CustomerID\": \"$CustomerID\",\n", 81 | " \"Country\": \"$Country\"\n", 82 | " },\n", 83 | " \"InvoiceDate\": { \"$max\": \"$InvoiceDate\" },\n", 84 | " \"Items\": {\n", 85 | " \"$push\": {\n", 86 | " \"StockCode\": \"$StockCode\",\n", 87 | " \"Description\": \"$Description\",\n", 88 | " \"Quantity\": \"$Quantity\",\n", 89 | " \"UnitPrice\": \"$UnitPrice\"\n", 90 | " }\n", 91 | " }\n", 92 | " }\n", 93 | "}" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "beautify = {\n", 105 | " \"$project\": {\n", 106 | " \"_id\": \"$_id.InvoiceNo\",\n", 107 | " \"InvoiceDate\": \"$_id.InvoiceDate\",\n", 108 | " \"CustomerID\": \"$_id.CustomerID\",\n", 109 | " \"Country\": \"$_id.Country\",\n", 110 | " \"Items\": 1\n", 111 | " }\n", 112 | "}" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "cursor = retail_col.aggregate([\n", 124 | " assemble,\n", 125 | " beautify\n", 126 | " ],\n", 127 | " allowDiskUse=True)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "retail_doc = cursor.next()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "print(json.dumps(retail_doc, cls=JSONEncoder, indent=4))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "computed = {\n", 161 | " \"$addFields\" : {\n", 162 | " \"TotalPrice\": {\n", 163 | " \"$reduce\": {\n", 164 | " \"input\": \"$Items\",\n", 165 | " \"initialValue\": Decimal128(\"0.00\"),\n", 166 | " \"in\": {\n", 167 | " \"$add\": [\n", 168 | " \"$$value\",\n", 169 | " { \"$multiply\": [ \"$$this.Quantity\", \"$$this.UnitPrice\" ] }\n", 170 | " ]\n", 171 | " }\n", 172 | " }\n", 173 | " }\n", 174 | " }\n", 175 | "}" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "cursor = retail_col.aggregate([\n", 187 | " assemble,\n", 188 | " beautify,\n", 189 | " computed\n", 190 | " ],\n", 191 | " allowDiskUse=True)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "retail_doc = cursor.next()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "print(json.dumps(retail_doc, cls=JSONEncoder, indent=4))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "save = {\n", 225 
| " \"$out\": \"orders_new\"\n", 226 | "}" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "The following cell will **fail if you are not pointing** to your own Atlas group\n", 234 | "where you have write privileges to the target collection" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "cursor = retail_col.aggregate([\n", 246 | " assemble,\n", 247 | " beautify,\n", 248 | " computed,\n", 249 | " save\n", 250 | " ],\n", 251 | " allowDiskUse=True)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "assemble = {\n", 263 | " \"$group\": {\n", 264 | " \"_id\": {\n", 265 | " \"InvoiceNo\": \"$InvoiceNo\",\n", 266 | " \"CustomerID\": \"$CustomerID\",\n", 267 | " \"Country\": \"$Country\",\n", 268 | " \"InvoiceDate\": { \"$max\": \"$InvoiceDate\" },\n", 269 | " },\n", 270 | " \"Items\": {\n", 271 | " \"$push\": {\n", 272 | " \"StockCode\": \"$StockCode\",\n", 273 | " \"Description\": \"$Description\",\n", 274 | " \"Quantity\": \"$Quantity\",\n", 275 | " \"UnitPrice\": \"$UnitPrice\"\n", 276 | " }\n", 277 | " }\n", 278 | " }\n", 279 | "}" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "The following cell will show the expected error message of trying to build\n", 287 | "an index on *_id*, if you are pointing to your own Atlas cluster where you\n", 288 | "have write privileges" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "cursor = retail_col.aggregate([\n", 300 | " assemble,\n", 301 | " beautify,\n", 302 | " computed,\n", 303 | " save\n", 304 | " ],\n", 305 | " allowDiskUse=True)" 306 | ] 307 | } 308 | ], 309 | "metadata": { 310 | "kernelspec": { 311 | "display_name": "Python 3", 312 | "language": "python", 313 | "name": "python3" 314 | }, 315 | "language_info": { 316 | "codemirror_mode": { 317 | "name": "ipython", 318 | "version": 3 319 | }, 320 | "file_extension": ".py", 321 | "mimetype": "text/x-python", 322 | "name": "python", 323 | "nbconvert_exporter": "python", 324 | "pygments_lexer": "ipython3", 325 | "version": "3.6.5" 326 | } 327 | }, 328 | "nbformat": 4, 329 | "nbformat_minor": 2 330 | } 331 | -------------------------------------------------------------------------------- /LessonNotes/principal-component-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import matplotlib.pyplot as plt\n", 14 | "\n", 15 | "from pandas.io.json import json_normalize\n", 16 | "from pymongo import MongoClient\n", 17 | "from sklearn import preprocessing\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "course_cluster_uri = 
\"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 33 | "course_client = MongoClient(course_cluster_uri)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n", 45 | "wine = course_client['coursera-agg']['wine']" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "pipeline = [\n", 57 | " {\n", 58 | " \"$project\": {\n", 59 | " \"_id\": 0\n", 60 | " }\n", 61 | " }\n", 62 | "]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "cursor = wine.aggregate(pipeline)\n", 74 | "docs = list(cursor)\n", 75 | "df = json_normalize(docs)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "df.head()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "X = df.drop(['Alcohol'], axis=1).values.astype('float64')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "X = preprocessing.scale(X)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "cov_matrix = np.cov(X.T)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "for val in eigenvalues:\n", 138 | " print(val)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "eigen_map = list(zip(eigenvalues, eigenvectors.T))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "eigen_map.sort(key=lambda x: x[0], reverse=True)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "sorted_eigenvalues = [pair[0] for pair in eigen_map]\n", 172 | "sorted_eigenvectors = [pair[1] for pair in eigen_map]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "sorted_eigenvalues" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | 
"source": [ 190 | "print(pd.DataFrame(sorted_eigenvectors, columns=df.drop(['Alcohol'], axis=1).columns))" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "eigenvalue_sum = sum(eigenvalues)\n", 202 | "var_exp = [(v / eigenvalue_sum)*100 for v in sorted_eigenvalues]\n", 203 | "cum_var_exp = np.cumsum(var_exp)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "dims = len(df.drop(['Alcohol'], axis=1).columns)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "plt.clf()\n", 224 | "fig, ax = plt.subplots()\n", 225 | "\n", 226 | "ax.plot(range(dims), cum_var_exp, '-o')\n", 227 | "\n", 228 | "plt.xlabel('Number of Components')\n", 229 | "plt.ylabel('Percent of Variance Explained')\n", 230 | "\n", 231 | "plt.show()" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "ev1 = sorted_eigenvectors[0]\n", 243 | "ev2 = sorted_eigenvectors[1]" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "eigen_matrix = np.hstack((ev1.reshape(dims,1), ev2.reshape(dims,1)))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "eigen_matrix" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "Y = X.dot(eigen_matrix)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "plt.clf()\n", 284 | "fig, ax = plt.subplots()\n", 285 | "ax.scatter(Y.T[0], Y.T[1], alpha=0.2)\n", 286 | "plt.xlabel('PC1')\n", 287 | "plt.ylabel('PC2')\n", 288 | "plt.show()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "from sklearn.decomposition import PCA\n", 300 | "pca = PCA(n_components=2)\n", 301 | "Y_sklearn = pca.fit_transform(X)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "plt.clf()\n", 311 | "fig, ax = plt.subplots()\n", 312 | "ax.scatter(Y_sklearn.T[0], Y_sklearn.T[1], alpha=0.2)\n", 313 | "plt.xlabel('PC1')\n", 314 | "plt.ylabel('PC2')\n", 315 | "plt.show()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": { 322 | "collapsed": true 323 | }, 324 | "outputs": [], 325 | "source": [ 326 | "y = df['Alcohol'].values" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "# Let's split the model for training and testing, and use a logistic regression\n", 338 | "X_train, X_test, y_train, y_test = train_test_split(df.drop('Alcohol', axis=1), y, test_size=0.25)" 339 | ] 340 | }, 341 | { 342 | 
"cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "classifier = LogisticRegression(random_state=0)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "classifier.fit(X_train, y_train)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "y_pred = classifier.score(X_test, y_test)\n", 368 | "y_pred" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "# now with PCA applied\n", 380 | "X_train, X_test, y_train, y_test = train_test_split(Y_sklearn, y, test_size=0.3)" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "classifier_with_pca = LogisticRegression(random_state=0)\n", 390 | "classifier_with_pca.fit(X_train, y_train)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "y_pred = classifier_with_pca.score(X_test, y_test)\n", 400 | "y_pred" 401 | ] 402 | } 403 | ], 404 | "metadata": { 405 | "kernelspec": { 406 | "display_name": "Python 3", 407 | "language": "python", 408 | "name": "python3" 409 | }, 410 | "language_info": { 411 | "codemirror_mode": { 412 | "name": "ipython", 413 | "version": 3 414 | }, 415 | "file_extension": ".py", 416 | "mimetype": "text/x-python", 417 | "name": "python", 418 | "nbconvert_exporter": "python", 419 | "pygments_lexer": "ipython3", 420 | "version": "3.6.5" 421 | } 422 | }, 423 | "nbformat": 4, 424 | "nbformat_minor": 2 425 | } 426 | -------------------------------------------------------------------------------- /Assignments/expressions_with_project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pymongo\n", 10 | "import json" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 20 | "course_client = pymongo.MongoClient(course_cluster_uri)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "movies = course_client['aggregations']['movies']" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Lab: Expression Composition\n", 37 | "\n", 38 | "## For this lab, you'll be composing expressions together \n", 39 | "\n", 40 | "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster." 
41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Prelude\n", 48 | "\n", 49 | "This lab will have you work with data within arrays, a common operation.\n", 50 | "\n", 51 | "Specifically, one of the arrays you'll work with is ``writers``, from the\n", 52 | "**movies** collection.\n", 53 | "\n", 54 | "There are times when we want to make sure that the field is an array, and that\n", 55 | "it is not empty. We can do this within ``$match``\n", 56 | "\n", 57 | " `{ \"$match\": { \"writers\": { \"$elemMatch\": { \"$exists\": true } } } }`\n", 58 | "\n", 59 | "However, the entries within ``writers`` present another problem. A good number\n", 60 | "of entries in ``writers`` look something like the following, where the writer is\n", 61 | "attributed with their specific contribution:\n", 62 | "\n", 63 | " `\"writers\" : [ \"Vincenzo Cerami (story)\", \"Roberto Benigni (story)\" ]`\n", 64 | "\n", 65 | "But the writer also appears in the ``cast`` array as \"Roberto Benigni\"!\n", 66 | "\n", 67 | "Give it a look with the following query" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "{\n", 80 | " \"cast\": [\n", 81 | " \"Roberto Benigni\",\n", 82 | " \"Nicoletta Braschi\",\n", 83 | " \"Giustino Durano\",\n", 84 | " \"Giorgio Cantarini\"\n", 85 | " ],\n", 86 | " \"writers\": [\n", 87 | " \"Vincenzo Cerami (story)\",\n", 88 | " \"Roberto Benigni (story)\"\n", 89 | " ]\n", 90 | "}\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "result = movies.find_one({\"title\": \"Life Is Beautiful\"}, { \"_id\": 0, \"cast\": 1, \"writers\": 1})\n", 96 | "print(json.dumps(result, indent=4))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "This presents a problem, since comparing ``\"Roberto Benigni\"`` to\n", 104 | "``\"Roberto Benigni (story)\"`` will definitely result in a difference.\n", 105 | "\n", 106 | "Thankfully there is a powerful expression to help us, ``$map``. ``$map`` lets us\n", 107 | "iterate over an array, element by element, performing some transformation on\n", 108 | "each element. The result of that transformation will be returned in the same\n", 109 | "place as the original element.\n", 110 | "\n", 111 | "Within ``$map``, the argument to ``input`` can be any expression as long as it\n", 112 | "resolves to an array. The argument to ``as`` is the name we want to use to refer\n", 113 | "to each element of the array when performing whatever logic we want, surrounding\n", 114 | "it with quotes and prepending two `$` signs. The field ``as`` is optional, and if omitted\n", 115 | "each element must be referred to as ``\"$$this\"``\n", 116 | "\n", 117 | " \"writers\": {\n", 118 | " \"$map\": {\n", 119 | " \"input\": \"$writers\",\n", 120 | " \"as\": \"writer\",\n", 121 | " \"in\": \"$$writer\"\n", " }\n", " }\n", 122 | "\n", 123 | "\n", 124 | "``in`` is where the work is performed. Here, we use the ``$arrayElemAt``\n", 125 | "expression, which takes two arguments, the array and the index of the element we\n", 126 | "want. The array comes from the ``$split`` expression, which splits each value on ``\" (\"``.\n", 127 | "\n", 128 | "If the string does not contain the specified pattern, ``$split`` simply returns it\n", 129 | "wrapped in an array, so ``$arrayElemAt`` will always work\n", 130 | "\n", 131 | " \"writers\": {\n", " \"$map\": {\n", 132 | " \"input\": \"$writers\",\n", 133 | " \"as\": \"writer\",\n", 134 | " \"in\": {\n", 135 | " \"$arrayElemAt\": [\n", 136 | " {\n", 137 | " \"$split\": [ \"$$writer\", \" (\" ]\n", 138 | " },\n", 139 | " 0\n", 140 | " ]\n", 141 | " }\n", 142 | " }\n", " }\n", 143 | " \n", 144 | "Let's see it in action to get a full sense of what it does." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 5, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# this stage is provided for you, use it later as well\n", 154 | "mapping = {\n", 155 | " \"$project\": {\n", 156 | " \"_id\": 0,\n", 157 | " \"cast\": 1,\n", 158 | " \"directors\": 1,\n", 159 | " \"writers\": {\n", 160 | " \"$map\": {\n", 161 | " \"input\": \"$writers\",\n", 162 | " \"as\": \"writer\",\n", 163 | " \"in\": {\n", 164 | " \"$arrayElemAt\": [\n", 165 | " { \"$split\": [\"$$writer\", \" (\"] },\n", 166 | " 0\n", 167 | " ]\n", 168 | " }\n", 169 | " }\n", 170 | " }\n", 171 | " }\n", 172 | "}" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "[\n", 185 | " {\n", 186 | " \"cast\": [\n", 187 | " \"Roberto Benigni\",\n", 188 | " \"Nicoletta Braschi\",\n", 189 | " \"Giustino Durano\",\n", 190 | " \"Giorgio Cantarini\"\n", 191 | " ],\n", 192 | " \"directors\": [\n", 193 | " \"Roberto Benigni\"\n", 194 | " ],\n", 195 | " \"writers\": [\n", 196 | " \"Vincenzo Cerami\",\n", 197 | " \"Roberto Benigni\"\n", 198 | " ]\n", 199 | " }\n", 200 | "]\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "\n", 206 | "result = movies.aggregate([\n", 207 | " {\n", 208 | " \"$match\": {\"title\": \"Life Is Beautiful\"}\n", 209 | " },\n", 210 | " mapping\n", 211 | "])\n", 212 | "print(json.dumps(list(result), indent=4))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Question\n", 220 | "\n", 221 | "Let's find how many movies in our **movies** collection are a \"labor of love\",\n", 222 | "where the same person appears in ``cast``, ``directors``, and ``writers``\n", 223 | "\n", 224 | "\n", 225 | "How many movies are \"labors of love\"?\n", 226 | "\n", 227 | "To get a count, ensure you add the following to the end of your pipeline list." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "counting = {\n", 237 | " \"$count\": \"labors_of_love\"\n", 238 | "}" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "The necessary mapping stage is provided for you."
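, "\n", "One way to finish (it mirrors the solution cells that follow): project a field such as\n", "\n", "    \"labors_of_love\": { \"$setIntersection\": [ \"$cast\", \"$directors\", \"$writers\" ] }\n", "\n", "then `$match` documents where that intersection is non-empty, and end with the `counting` stage."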
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 8, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "mapping = {\n", 255 | " \"$project\": {\n", 256 | " \"_id\": 0,\n", 257 | " \"cast\": 1,\n", 258 | " \"directors\": 1,\n", 259 | " \"writers\": {\n", 260 | " \"$map\": {\n", 261 | " \"input\": \"$writers\",\n", 262 | " \"as\": \"writer\",\n", 263 | " \"in\": {\n", 264 | " \"$arrayElemAt\": [\n", 265 | " { \"$split\": [\"$$writer\", \" (\"] },\n", 266 | " 0\n", 267 | " ]\n", 268 | " }\n", 269 | " }\n", 270 | " }\n", 271 | " }\n", 272 | "}" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 24, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "#Filter documents that have all 3 fields: cast, directors, writers\n", 282 | "predicate = {\n", 283 | " \"$match\": { \n", 284 | " \"cast\": { \"$elemMatch\": { \"$exists\": True } }, \n", 285 | " \"directors\": { \"$elemMatch\": { \"$exists\": True } },\n", 286 | " \"writers\": { \"$elemMatch\": { \"$exists\": True } }\n", 287 | " }\n", 288 | "}" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 20, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "#Select only the 3 fields needed\n", 298 | "projection = {\n", 299 | " \"$project\": {\n", 300 | " \"cast\": 1,\n", 301 | " \"directors\": 1,\n", 302 | " \"writers\": 1,\n", 303 | " \"labors_of_love\": { \"$setIntersection\" : [ \"$cast\", \"$directors\", \"$writers\" ] }\n", 304 | " }\n", 305 | "}" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 18, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "#Obtain those where at least one member appears in the 3 groups using a set\n", 315 | "matching = {\n", 316 | " \"$match\": { \"labors_of_love\": { \"$elemMatch\": { \"$exists\": True } } }\n", 317 | "}" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 25, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "[{'labors_of_love': 1597}]" 329 | ] 330 | }, 331 | "metadata": {}, 332 | "output_type": "display_data" 333 | } 334 | ], 335 | "source": [ 336 | "pipeline = [\n", 337 | " predicate,\n", 338 | " mapping,\n", 339 | " projection,\n", 340 | " matching,\n", 341 | " counting\n", 342 | "]\n", 343 | "\n", 344 | "display(list(movies.aggregate(pipeline)))\n", 345 | "\n", 346 | "#1597" 347 | ] 348 | } 349 | ], 350 | "metadata": { 351 | "kernelspec": { 352 | "display_name": "Python 3", 353 | "language": "python", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.6.5" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 2 371 | } 372 | -------------------------------------------------------------------------------- /Assignments/linear-regression-on-titanic-data-set.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pandas.io.json import json_normalize\n", 10 | "from pymongo import MongoClient\n", 11 | "from sklearn import linear_model\n", 12 | "from sklearn.model_selection import train_test_split\n", 13 | "from 
sklearn.metrics import mean_squared_error\n", 14 | "import numpy as np\n", 15 | "import pprint" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 25 | "course_client = MongoClient(course_cluster_uri)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "titanic = course_client['coursera-agg']['titanic']" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Replace {} with a stage to determine the possible values for gender.\n", 44 | "unique_gender_stage = {\n", 45 | " \"$group\": {\n", 46 | " \"_id\": \"$gender\"\n", 47 | " }\n", 48 | "}" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "possible_gender_values = titanic.aggregate([\n", 58 | " {\n", 59 | " \"$match\": {\n", 60 | " \"age\": {\"$type\": \"number\"},\n", 61 | " \"point_of_embarkation\": {\"$ne\": \"\"}\n", 62 | " }\n", 63 | " },\n", 64 | " unique_gender_stage\n", 65 | "])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "[{'_id': 'female'}, {'_id': 'male'}]\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "# Print the distinct list of values for the gender field\n", 83 | "pprint.pprint(list(possible_gender_values))" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 7, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# Replace {} with a stage to determine the possible values for point_of_embarkation\n", 93 | "unique_point_of_embarkation_stage = {\n", 94 | " \"$group\": {\n", 95 | " \"_id\": \"$point_of_embarkation\"\n", 96 | " }\n", 97 | "}" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "possible_point_of_embarkation_values = titanic.aggregate([\n", 107 | " {\n", 108 | " \"$match\": {\n", 109 | " \"age\": {\"$type\": \"number\"},\n", 110 | " \"point_of_embarkation\": {\"$ne\": \"\"}\n", 111 | " }\n", 112 | " },\n", 113 | " unique_point_of_embarkation_stage\n", 114 | "])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 9, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "[{'_id': 'Q'}, {'_id': 'C'}, {'_id': 'S'}]\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Print the distinct list of values for the point_of_embarkation field\n", 132 | "pprint.pprint(list(possible_point_of_embarkation_values))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 14, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Given the possible values for point_of_embarkation and gender replace {} with a stage that\n", 142 | "# will convert those field values to an integer.\n", 143 | "# e.g., For the gender field convert 'female' to 0 and 'male' to 1\n", 144 | "gender_and_point_of_embarkation_conversion_stage = {\n", 145 | " 
\"$addFields\": {\n", 146 | " \"gender\": {\n", 147 | " \"$switch\": {\n", 148 | " \"branches\": [\n", 149 | " { \"case\": { \"$eq\": [\"$gender\", \"female\"] }, \"then\": 0 },\n", 150 | " { \"case\": { \"$eq\": [\"$gender\", \"male\"] }, \"then\": 1 }\n", 151 | " ]\n", 152 | " }\n", 153 | " },\n", 154 | " \"point_of_embarkation\": {\n", 155 | " \"$switch\": {\n", 156 | " \"branches\": [\n", 157 | " { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"C\"] }, \"then\": 0 },\n", 158 | " { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"Q\"] }, \"then\": 1 },\n", 159 | " { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"S\"] }, \"then\": 2 }\n", 160 | " ]\n", 161 | " }\n", 162 | " }\n", 163 | " }\n", 164 | "}" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 15, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "cursor = titanic.aggregate([\n", 174 | " {\n", 175 | " \"$match\": {\n", 176 | " \"age\": {\"$type\": \"number\"},\n", 177 | " \"point_of_embarkation\": {\"$ne\": \"\"}\n", 178 | " }\n", 179 | " },\n", 180 | " gender_and_point_of_embarkation_conversion_stage,\n", 181 | " {\n", 182 | " \"$project\": {\n", 183 | " \"_id\": 0,\n", 184 | " \"ticket_number\": 0,\n", 185 | " \"name\": 0,\n", 186 | " \"passenger_id\": 0,\n", 187 | " \"cabin\": 0\n", 188 | " }\n", 189 | " }\n", 190 | "])" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 16, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Exhaust our cursor into a list\n", 200 | "titanic_data = list(cursor)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 17, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "# Load our dataset into a DataFrame\n", 210 | "df = json_normalize(titanic_data)" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 18, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# Pull out the survived column (only the data we want to correlate against)\n", 220 | "df_x = df.drop(['survived'], axis=1)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 19, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# Only the survived column (the value we want to predict)\n", 230 | "df_y = df['survived']" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 20, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "# Create a Least Squares Linear Regression object\n", 240 | "reg = linear_model.LinearRegression()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 21, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Split our dataset into a training set (80%) and a test set (20%)\n", 250 | "x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 22, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" 262 | ] 263 | }, 264 | "execution_count": 22, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "# Fit a linear model to our training data\n", 271 | "reg.fit(x_train, y_train)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 23, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | 
"array([ 0.11539484, 0.90271544, 0.57035024, 0.63352057, 0.01172327,\n", 283 | " 0.01800834, 0.06378512, 0.62182568, 0.23796354, 0.71916517,\n", 284 | " 1.02356223, 0.12115151, 0.55889098, 0.0294754 , 0.82479006,\n", 285 | " 0.45264152, 0.19836704, 0.3759118 , 0.31355646, 0.22183264,\n", 286 | " 0.44336354, 0.83162647, 0.61957421, 0.2443862 , 0.38782189,\n", 287 | " 0.83087677, 0.40299786, 0.24438707, 0.16732825, 0.09493174,\n", 288 | " 0.61874023, -0.02928499, -0.08826717, 0.64469654, 0.59732346,\n", 289 | " 0.03927108, 0.09652011, 0.14407889, 0.33575427, 0.56339801,\n", 290 | " 0.85238449, 0.15566589, 0.64720588, 0.12115151, 0.12113096,\n", 291 | " 0.44901703, 0.5885736 , 0.20252592, 0.1326563 , 0.31003879,\n", 292 | " 0.68068162, 0.64536789, 0.03974376, 0.61607328, 0.00609504,\n", 293 | " 0.56856501, 0.08091643, 0.07937081, 0.82713745, 0.950898 ,\n", 294 | " 0.57283271, 0.51746255, 0.845916 , 0.20614194, 0.64480443,\n", 295 | " 0.76276758, 0.84160247, 0.11572537, 0.23575847, 0.74036174,\n", 296 | " 0.15553317, 0.17162756, 0.7142957 , 0.25059462, 0.12148119,\n", 297 | " -0.04078978, 0.37687851, 0.20699649, 0.55118714, 0.31387098,\n", 298 | " 0.3820695 , 0.87793894, 0.07505014, 0.74527929, 0.24751388,\n", 299 | " 0.14419279, 0.02750939, 0.13823488, 0.785322 , 0.16128556,\n", 300 | " 0.04770106, 0.76970252, 0.10964672, 0.09814793, 0.71628897,\n", 301 | " 0.14108165, 0.90965933, 0.84238156, 0.1326546 , 0.20457545,\n", 302 | " 0.21711669, 0.38078977, 0.25059462, 0.2436029 , 0.33518191,\n", 303 | " 0.1081298 , 0.15018666, 0.8101361 , 0.53516221, 0.26215909,\n", 304 | " 0.88815928, 0.66823875, 0.38939141, 0.53518871, 0.97887538,\n", 305 | " 0.68330104, 0.37122508, -0.07037902, 0.19425321, 0.4061484 ,\n", 306 | " 0.17287256, 0.12074227, 0.84769297, 0.74951466, 0.23857605,\n", 307 | " 0.6069391 , 0.69202843, 0.53481044, 0.75659347, 0.16258501,\n", 308 | " 0.82871559, 0.05788116, 0.80706308, 0.74356107, 0.13575221,\n", 309 | " 1.05001092, 0.14843955, -0.06901836, 0.02505873, 0.18280612,\n", 310 | " 0.31021567, 0.68662588, 0.74605276])" 311 | ] 312 | }, 313 | "execution_count": 23, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "# Check our test set against our trained linear model\n", 320 | "reg.predict(x_test)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 24, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "0.13166469521402543" 332 | ] 333 | }, 334 | "execution_count": 24, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "# Calculate mean squared error (should be ~0.13-0.15%)\n", 341 | "mean_squared_error(y_test, reg.predict(x_test))" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 25, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "# age: 25,\n", 351 | "# class: 1,\n", 352 | "# fare_paid: 45,\n", 353 | "# gender: 1, (replace Y with the integer you assigned for 'male')\n", 354 | "# parents_children: 0,\n", 355 | "# point_of_embarkation: 0, (replace Z with the integer you assigned for 'C')\n", 356 | "# siblings_spouse: 1\n", 357 | "\n", 358 | "fake_passenger = [[25, 1, 45, 1, 0, 0, 1]]" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 26, 364 | "metadata": { 365 | "scrolled": true 366 | }, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "array([0.50169618])" 372 | ] 373 | }, 374 | "execution_count": 26, 375 | 
"metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "# Use this output to verify your completion of this exercise\n", 381 | "reg.predict(fake_passenger)\n", 382 | "#0.50169618" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.5" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 2 414 | } 415 | -------------------------------------------------------------------------------- /Assignments/Decision+Tree.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%capture\n", 10 | "!pip install pymongo pprint dateparser matplotlib pandas sklearn numpy seaborn" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import pymongo\n", 20 | "import pprint\n", 21 | "import dateparser\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "from sklearn.tree import DecisionTreeClassifier\n", 25 | "from sklearn.metrics import classification_report, confusion_matrix\n", 26 | "from sklearn.ensemble import RandomForestClassifier\n", 27 | "from sklearn.model_selection import train_test_split\n", 28 | "\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 39 | "course_client = pymongo.MongoClient(course_cluster_uri)\n", 40 | "titanic = course_client['coursera-agg']['titanic']" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "initial_project = {\n", 50 | " \"$project\": {\n", 51 | " \"_id\": 0,\n", 52 | " \"name\": 0,\n", 53 | " \"point_of_embarkation\": 0,\n", 54 | " \"ticket_number\": 0,\n", 55 | " \"passenger_id\": 0,\n", 56 | " \"cabin\": 0,\n", 57 | " }\n", 58 | "}" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# todo - correct the age.\n", 68 | "# *HINT* -- If the $type of \"$age\" is a string, set it to 0\n", 69 | "age_correction = {\n", 70 | " \"$cond\": [ { \"$type\": \"string\" }, 0, \"$age\" ]\n", 71 | "}" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 12, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# todo - one hot encode gender_female. 
1 if female, 0 if male\n", 81 | "one_hot_female = {\n", 82 | " \"$cond\": [ { \"$eq\": [ \"$gender\", \"female\" ] }, 1, 0 ]\n", 83 | "}" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 13, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# todo - the inverse of above. 1 if male, 0 if female\n", 93 | "one_hot_male = {\n", 94 | " \"$cond\": [ { \"$eq\": [ \"$gender\", \"male\" ] }, 1, 0 ]\n", 95 | "}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 14, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "encoding_stage = {\n", 105 | " \"$addFields\": {\n", 106 | " \"gender_female\": one_hot_female,\n", 107 | " \"gender_male\": one_hot_male,\n", 108 | " \"age\": age_correction\n", 109 | " }\n", 110 | "}" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 15, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "final_project = {\n", 120 | " \"$project\": {\n", 121 | " \"gender\": 0\n", 122 | " }\n", 123 | "}" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 16, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "pipeline = [initial_project, encoding_stage, final_project]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 17, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "\n", 157 | "\n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | "
ageclassfare_paidgender_femalegender_maleparents_childrensiblings_spousesurvived
0038.050001000
1037.250001010
20316.700010111
30311.133310201
40153.100010011
\n", 229 | "
" 230 | ], 231 | "text/plain": [ 232 | " age class fare_paid gender_female gender_male parents_children \\\n", 233 | "0 0 3 8.0500 0 1 0 \n", 234 | "1 0 3 7.2500 0 1 0 \n", 235 | "2 0 3 16.7000 1 0 1 \n", 236 | "3 0 3 11.1333 1 0 2 \n", 237 | "4 0 1 53.1000 1 0 0 \n", 238 | "\n", 239 | " siblings_spouse survived \n", 240 | "0 0 0 \n", 241 | "1 1 0 \n", 242 | "2 1 1 \n", 243 | "3 0 1 \n", 244 | "4 1 1 " 245 | ] 246 | }, 247 | "execution_count": 17, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))\n", 254 | "df.head()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 18, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "X = df.drop('survived', axis=1)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 19, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "y = df['survived']" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 20, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 21, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "dtree = DecisionTreeClassifier()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 22, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "%%capture\n", 300 | "dtree.fit(X_train, y_train)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 23, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "predictions = dtree.predict(X_test)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 24, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "[[73 13]\n", 322 | " [11 37]]\n", 323 | "\n", 324 | "\n", 325 | " precision recall f1-score support\n", 326 | "\n", 327 | " 0 0.87 0.85 0.86 86\n", 328 | " 1 0.74 0.77 0.76 48\n", 329 | "\n", 330 | "avg / total 0.82 0.82 0.82 134\n", 331 | "\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "print(confusion_matrix(y_test, predictions))\n", 337 | "print(\"\\n\")\n", 338 | "print(classification_report(y_test, predictions))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 25, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "rfc = RandomForestClassifier(n_estimators=20)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 26, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "%%capture\n", 357 | "rfc.fit(X_train, y_train)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 27, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "rfc_pred = rfc.predict(X_test)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 28, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "[[73 13]\n", 379 | " [15 33]]\n", 380 | "\n", 381 | "\n", 382 | " precision recall f1-score support\n", 383 | "\n", 384 | " test 0.83 0.85 0.84 86\n", 385 | "predictions 0.72 0.69 0.70 48\n", 386 | "\n", 387 | "avg / total 0.79 0.79 0.79 134\n", 388 | "\n" 389 | ] 390 | } 391 | ], 392 | "source": [ 393 | "print(confusion_matrix(y_test, rfc_pred))\n", 
394 | "print(\"\\n\")\n", 395 | "print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 29, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "\n", 408 | "After 1000 iterations:\n", 409 | " Single Decision Tree accuracy: 0.8199253731343169\n", 410 | " Random Forest accuracy: 0.7955373134328367\n", 411 | " \n", 412 | " Lab Answer: dtree=0.82, rfc=0.8\n", 413 | "\n" 414 | ] 415 | } 416 | ], 417 | "source": [ 418 | "iterations = 1000\n", 419 | "dtree_avg_accuracy = 0\n", 420 | "rfc_avg_accuracy = 0\n", 421 | "for _ in range(iterations):\n", 422 | " dtree.fit(X_train, y_train)\n", 423 | " dtree_avg_accuracy += dtree.score(X_test, y_test)\n", 424 | " rfc.fit(X_train, y_train)\n", 425 | " rfc_avg_accuracy += rfc.score(X_test, y_test)\n", 426 | " \n", 427 | "print(f\"\"\"\n", 428 | "After {iterations} iterations:\n", 429 | " Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}\n", 430 | " Random Forest accuracy: {rfc_avg_accuracy / iterations}\n", 431 | " \n", 432 | " Lab Answer: dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}\n", 433 | "\"\"\")\n", 434 | "#After 1000 iterations:\n", 435 | "# Single Decision Tree accuracy: 0.8199253731343169\n", 436 | "# Random Forest accuracy: 0.7955373134328367\n", 437 | "# \n", 438 | "# Lab Answer: dtree=0.82, rfc=0.8\n", 439 | "#0.8199253731343169+0.7955373134328367=1.62" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": { 446 | "collapsed": true 447 | }, 448 | "outputs": [], 449 | "source": [] 450 | } 451 | ], 452 | "metadata": { 453 | "kernelspec": { 454 | "display_name": "Python 3", 455 | "language": "python", 456 | "name": "python3" 457 | }, 458 | "language_info": { 459 | "codemirror_mode": { 460 | "name": "ipython", 461 | "version": 3 462 | }, 463 | "file_extension": ".py", 464 | "mimetype": "text/x-python", 465 | "name": "python", 466 | "nbconvert_exporter": "python", 467 | "pygments_lexer": "ipython3", 468 | "version": "3.6.5" 469 | } 470 | }, 471 | "nbformat": 4, 472 | "nbformat_minor": 2 473 | } 474 | -------------------------------------------------------------------------------- /LessonNotes/associative_rules__lesson.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Dependencies Installation\n", 8 | "Before we get started, let's make sure we have all dependencies installed." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "%%capture\n", 20 | "! 
pip3 install pymongo dateparser sklearn pandas numpy scipy matplotlib seaborn mlxtend\n", 21 | "%matplotlib inline\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Association Rules\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Importing Necessary Dependencies" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "# dependencies\n", 47 | "import dateparser\n", 48 | "import pymongo\n", 49 | "import pandas as pd\n", 50 | "from sklearn.cluster import KMeans\n", 51 | "from sklearn import preprocessing\n", 52 | "from mlxtend.frequent_patterns import apriori\n", 53 | "from mlxtend.frequent_patterns import association_rules\n", 54 | "from mlxtend.preprocessing import one_hot\n", 55 | "import numpy as np\n", 56 | "import json\n", 57 | "import matplotlib.pyplot as plt\n", 58 | "import seaborn as sns\n", 59 | "sns.set(style=\"whitegrid\", palette=\"muted\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### The Initial Setup\n", 67 | "\n", 68 | "We'll create a dataframe with some made-up transactions to illustrate the apriori algorithm and association rules. The dictionary key will represent the product bought, and the number will represent the quantity bought." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "transactions = [\n", 80 | " {\n", 81 | " \"beer\": 1,\n", 82 | " \"chips\": 2,\n", 83 | " \"salsa\": 1,\n", 84 | " },\n", 85 | " {\n", 86 | " \"chips\": 1,\n", 87 | " \"salsa\": 1,\n", 88 | " \"chocolate\": 3\n", 89 | " },\n", 90 | " {\n", 91 | " \"chocolate\": 2,\n", 92 | " \"diapers\": 1,\n", 93 | " \"beer\": 2\n", 94 | " },\n", 95 | " {\n", 96 | " \"chips\": 2,\n", 97 | " \"salsa\": 1,\n", 98 | " \"chocolate\": 2\n", 99 | " },\n", 100 | " {\n", 101 | " \"diapers\": 3,\n", 102 | " \"chips\": 1,\n", 103 | " \"salsa\": 2,\n", 104 | " \"beer\": 2\n", 105 | " },\n", 106 | " {\n", 107 | " \"diapers\": 2,\n", 108 | " \"chips\": 1,\n", 109 | " \"salsa\": 1,\n", 110 | " \"chocolate\": 4,\n", 111 | " \"beer\": 3\n", 112 | " }\n", 113 | "]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "transactions = pd.DataFrame.from_dict(transactions)\n", 123 | "transactions" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "### Getting rid of NaN Values\n", 131 | "\n", 132 | "We need to get rid of NaN values, so we'll use a utility method from Pandas to replace them with 0." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "transactions.fillna(0, inplace=True)\n", 142 | "transactions" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### One-hot Encoding\n", 150 | "\n", 151 | "We need to one hot encode the data, so that 1 means they bought the item and 0 means they didn't. We'll quickly walk the dataframe and replace any value greater than 0 with 1."
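, "\n", "An equivalent one-liner (assuming quantities are never negative) is `oh = (transactions > 0).astype(int)`."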
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "oh = transactions\n", 161 | "for column in oh.columns:\n", 162 | " oh.loc[oh[column] > 0, column] = 1\n", 163 | "oh" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Apriori\n", 171 | "\n", 172 | "The first step is to use the apriori algorithm. This will give us our frequent itemsets and their support.\n", 173 | "\n", 174 | "The support of an itemset is the proportion of transactions in the collection in which the itemset appears. It signifies the popularity of an itemset.\n", 175 | "\n", 176 | "Given the above information, we have 6 transactions. Of those, beer appears in 4 of them. So, we'd expect the itemset `[beer]` to have a support value of `4/6` or `.666666667`.\n", 177 | "\n", 178 | "Going through all of them, we can build itemsets that are just one item and calculate their support." 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Now that we have our 1 item itemsets, let's build up our 2 item itemsets. So, if an itemset is [a, b] where a is chips and b is salsa, the support is the fraction of all transactions in which the itemset `[a, b]` appears. We would do this until we have exhausted all possible itemsets.\n", 186 | "\n", 187 | "Also of key importance is being able to define some minimum support threshold below which we do not care about an itemset.\n", 188 | "\n", 189 | "For this, we'll use the `apriori` algorithm from `mlxtend`." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "assocs = apriori(oh, min_support=0.5, use_colnames=True)\n", 199 | "\n", 200 | "assocs = assocs.sort_values(by='support', ascending=False)\n", 201 | "assocs" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "rules = association_rules(assocs, min_threshold=0.5)\n", 211 | "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n", 212 | " display(rules.sort_values(by='lift', ascending=False))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "## Pymongo Setup" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "# pymongo driver configuration\n", 231 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 232 | "course_client = pymongo.MongoClient(course_cluster_uri)\n", 233 | "orders = course_client['coursera-agg']['orders']" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "# Getting our data from MongoDB\n", 241 | "\n", 242 | "We'll need to construct a one-hot encoded dataframe. 
This means that for every document, convert the information in the purchases array into something like:\n", 243 | "\n", 244 | "```\n", 245 | "{\n", 246 | " ...,\n", 247 | " \"purchases\": [\n", 248 | " {\n", 249 | " \"description\": \"WHITE WIRE EGG HOLDER\",\n", 250 | " \"quantity\": 36,\n", 251 | " \"stock_code\": \"84880\",\n", 252 | " \"unit_price\": 4.95\n", 253 | " },\n", 254 | " {\n", 255 | " \"description\": \"JUMBO BAG BAROQUE BLACK WHITE\",\n", 256 | " \"quantity\": 100,\n", 257 | " \"stock_code\": \"85099C\",\n", 258 | " \"unit_price\": 1.65\n", 259 | " },\n", 260 | " {\n", 261 | " \"description\": \"JUMBO BAG RED RETROSPOT\",\n", 262 | " \"quantity\": 100,\n", 263 | " \"stock_code\": \"85099B\",\n", 264 | " \"unit_price\": 1.65\n", 265 | " }\n", 266 | " ],\n", 267 | " }\n", 268 | " ```\n", 269 | " into\n", 270 | " ```\n", 271 | "{\n", 272 | " \"84880\": 1,\n", 273 | " \"85099C\": 1,\n", 274 | " \"85099B\": 1,\n", 275 | "}\n", 276 | "```" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## The Pipeline" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "order_projection = {\n", 293 | " \"$replaceRoot\": {\n", 294 | " \"newRoot\": {\n", 295 | " \"$arrayToObject\": {\n", 296 | " \"$map\": {\n", 297 | " \"input\": \"$purchases\",\n", 298 | " \"in\": {\n", 299 | " \"k\": \"$$this.stock_code\",\n", 300 | " \"v\": 1\n", 301 | " }\n", 302 | " }\n", 303 | " }\n", 304 | " }\n", 305 | " }\n", 306 | " \n", 307 | "}\n", 308 | "\n", 309 | "print(json.dumps(order_projection, indent=2))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "# Constructing the Pipeline\n", 317 | "\n", 318 | "That's it! We'll use just this one stage." 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "pipeline = [\n", 330 | " order_projection\n", 331 | "]" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "# Constructing the pandas Dataframe from MongoDB\n", 339 | "\n", 340 | "Here you will need to construct the DataFrame. Assign it to the variable `df` below." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "df = pd.DataFrame.from_dict(list(orders.aggregate(pipeline)))\n", 350 | "df.head(n=10)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Fixing the NaN values\n", 358 | "\n", 359 | "We will use the Pandas DataFrame [fillna](http://github.com/pandas-dev/pandas/blob/v0.21.0/pandas/core/frame.py#L3029-L3035) method to fill in NaN values for us with 0." 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "df.fillna(0, inplace=True)\n", 369 | "df.head(10)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Association" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "### Apriori\n", 384 | "First, we'll use the `apriori` algorithm from `mlxtend` to extract frequent itemsets. 
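With `min_support=0.02`, an itemset must appear in at least 2% of all orders to be kept.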
" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": { 391 | "collapsed": true, 392 | "scrolled": false 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "assocs = apriori(df, min_support=0.02, use_colnames=True)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n", 406 | " assocs =assocs.sort_values(by='support', ascending=False)\n", 407 | " display(assocs)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "metadata": {}, 413 | "source": [ 414 | "## Association Rules\n", 415 | "\n", 416 | "Now we form the association rules. Try adjusting the `min_threshold` along with the `metric` to find interesting associations. For example, which class appears to be highly associated with `parents_children`? Go back and add a one-hot encoding function for `parents_children` and see if the results are more clear." 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "rules = association_rules(assocs, metric=\"lift\", min_threshold=3)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n", 437 | " display(rules.sort_values(by='lift', ascending=False))" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "query = {\n", 447 | " \"$match\": {\n", 448 | " \"_id.stock_code\": { \"$in\": [\"22697\", \"22698\", \"22699\"]}\n", 449 | " }\n", 450 | "}\n", 451 | "\n", 452 | "project = {\n", 453 | " \"$project\": { \"_id\": 0, \"purchases.stock_code\": 1, \"purchases.description\": 1}\n", 454 | "}\n", 455 | "\n", 456 | "pipeline = [\n", 457 | " {\n", 458 | " \"$unwind\": \"$purchases\"\n", 459 | " },\n", 460 | " {\n", 461 | " \"$group\": {\n", 462 | " \"_id\": {\n", 463 | " \"stock_code\": \"$purchases.stock_code\",\n", 464 | " \"description\": \"$purchases.description\"\n", 465 | " }\n", 466 | " \n", 467 | " }\n", 468 | " },\n", 469 | " query\n", 470 | "]\n", 471 | "display(list(orders.aggregate(pipeline)))" 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.6.5" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /LessonNotes/pearson_correlation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pearson Correlation\n", 8 | "\n", 9 | "Correlation using Pearson Correlation (Pearson's Rho, Pearson Correlation Coefficient, etc...)\n", 10 | "\n", 11 | "## What is Correlation?\n", 12 | "\n", 13 | "Let's define what correlation is. 
Bivariate data (data with two variables) is said to be correlated when there is a strong linear relationship between the two variables. The Pearson correlation is a measure of strength in this linear relationship.\n", 14 | "\n", 15 | "## Pearson correlation\n", 16 | "\n", 17 | "Pearson correlation values can range from -1 to 1, inclusive. Negative values indicate a negative correlation, positive values a positive correlation, and values near 0 represent no correlation. That is to say, given two values `x` and `y`, a negative correlation would be as the value of `x` increases `y` decreases, a positive correlation would be as `x` increases `y` increases, and no correlation would be as `x` increases `y` has no increase or decrease.\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "%%html\n", 29 | "
\n", 30 | " \t\n", 33 | "
Source:\n", 34 | " \n", 35 | " Wikipedia\n", 36 | " \n", 37 | "
\n", 38 | "
" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "%%capture\n", 50 | "!pip install numpy pandas seaborn scipy\n", 51 | "import numpy as np\n", 52 | "import pandas as pd\n", 53 | "import seaborn as sns\n", 54 | "import matplotlib as plt\n", 55 | "from pymongo import MongoClient\n", 56 | "from cmath import sqrt\n", 57 | "from scipy.stats import pearsonr\n", 58 | "%matplotlib inline\n", 59 | "sns.set(color_codes=True)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Real Data\n", 67 | "\n", 68 | "Perfect correlations are virtually non-existent on real data. Let's look at a set of real data, and graph it.\n", 69 | "\n", 70 | "We'll use the ``movies`` dataset available from Atlas, and look at the ``imdb.votes`` and ``imdb.rating`` fields. Let's determine if there is a correlation between the number of votes and the rating." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 82 | "course_client = MongoClient(course_cluster_uri)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "collapsed": true 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "movies = course_client['aggregations']['movies']" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": true 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "pipeline = [\n", 105 | " {\n", 106 | " \"$match\": {\n", 107 | " \"imdb.rating\": { \"$gt\": 0 },\n", 108 | " \"imdb.votes\": { \"$gt\": 0}\n", 109 | " }\n", 110 | " },\n", 111 | " {\n", 112 | " \"$project\": {\n", 113 | " \"_id\": 0,\n", 114 | " \"rating\": \"$imdb.rating\",\n", 115 | " \"votes\": \"$imdb.votes\"\n", 116 | " }\n", 117 | " }\n", 118 | "]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "df = pd.DataFrame.from_dict(list(movies.aggregate(pipeline)))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": { 136 | "collapsed": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "df.head()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "sns.jointplot(x=\"rating\", y=\"votes\", data=df, kind='reg',\n", 152 | " joint_kws={'line_kws':{'color':'red'}, 'scatter_kws': { 'alpha': 0.5, 's': 20}}, size=8 )" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## Calculating Correlation\n", 160 | "\n", 161 | "### The Formula\n", 162 | "\n", 163 | "Math is both beautiful and terrifying. 
This is the formula for a single-pass Pearson Correlation.\n", 164 | "\n", 165 | "![Pearson's Rho Single Pass Formula](https://s3.amazonaws.com/special-partnerships/coursera/pearson.gif)\n", 166 | "\n", 167 | "\n", 168 | "### Groundwork\n", 169 | "\n", 170 | "To calculate the correlation, we need to calculate some values for later use. For the following steps, X and Y refer to rating and votes.\n", 171 | "\n", 172 | "* Calculate the mean for X and the mean for Y. We'll call these m_x and m_y\n", 173 | "* For each X, subtract m_x. We'll call this little x\n", 174 | "* For each Y, subtract m_y. We'll call this little y\n", 175 | "* For each pair of values, multiply X and Y. We'll call this xy\n", 176 | "* For each X and each Y, calculate the square. We'll call these x2 and y2\n", 177 | "\n", 178 | "Let's see these values and how they relate to the input. (Note that little x and little y are computed for illustration only; the single-pass formula itself works with sums of the raw X and Y values.)\n", 179 | "\n", 180 | "First, we'll create a copy.\n", 181 | "\n" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "exm = df.copy()\n", 193 | "exm.head(n=10)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "Next, we will calculate m_x and m_y." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "m_x = sum(exm['rating'])/len(exm['rating'])\n", 212 | "m_y = sum(exm['votes'])/len(exm['votes'])\n", 213 | "print(f\"m_x= {m_x}, m_y= {m_y}\")" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Now we will calculate little x, little y, xy, x2 and y2. We will assign these values to the ``exm`` dataframe, and then view the first 10 rows to see how they all relate." 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "x = list(map(lambda X: X - m_x, exm['rating']))\n", 232 | "y = list(map(lambda Y: Y - m_y, exm['votes']))\n", 233 | "xy = list(map(lambda pair: pair[0] * pair[1], zip(exm['rating'], exm['votes'])))\n", 234 | "x2 = list(map(lambda X: X * X, exm['rating']))\n", 235 | "y2 = list(map(lambda Y: Y * Y, exm['votes']))\n", 236 | "exm = exm.assign(x=x, y=y, xy=xy, x2=x2, y2=y2)\n", 237 | "exm = exm[['rating', 'votes', 'x', 'y', 'xy', 'x2', 'y2']]\n", 238 | "exm.head(n=10)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### The Maths in the Equation\n", 246 | "\n", 247 | "With these necessary values calculated, we can now jump into the equation itself." 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "To start, we'll focus on the top of the equation, which we'll call `top`.\n", 255 | "\n", 256 | "We multiply the number of elements (`elems`) by the sum of xy (`sum_xy`) and call it `product_xy_elems`. We then subtract the sum of X (`sum_x`) multiplied by the sum of Y (`sum_y`), which we call `product_sum_x_sum_y`. So `top = product_xy_elems - product_sum_x_sum_y`.
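Before applying this to the movies data, here is a minimal end-to-end check of the single-pass formula on a tiny hypothetical dataset (the five (X, Y) pairs are invented purely for illustration), compared against numpy's built-in correlation:

```python
# Check the single-pass Pearson formula against np.corrcoef on made-up data.
import numpy as np

X = np.array([1.0, 2.0, 3.0, 4.0, 5.0])   # stand-in for ratings
Y = np.array([2.0, 4.5, 5.5, 8.0, 10.5])  # stand-in for votes

n = len(X)
top = n * np.sum(X * Y) - np.sum(X) * np.sum(Y)
bottom = np.sqrt((n * np.sum(X ** 2) - np.sum(X) ** 2) *
                 (n * np.sum(Y ** 2) - np.sum(Y) ** 2))

print(top / bottom, np.corrcoef(X, Y)[0, 1])  # the two values should agree
```

The cells that follow carry out exactly this arithmetic, one named intermediate value at a time, on the real `rating` and `votes` columns.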
257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "elems = len(exm['votes'])\n", 268 | "print(elems)  # the number of (rating, votes) pairs\n", 269 | "sum_xy = exm['xy'].sum()\n", 270 | "sum_x = exm['rating'].sum()\n", 271 | "sum_y = exm['votes'].sum()\n", 272 | "\n", 273 | "product_xy_elems = elems * sum_xy\n", 274 | "product_sum_x_sum_y = sum_x * sum_y\n", 275 | "\n", 276 | "top = product_xy_elems - product_sum_x_sum_y" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Let's now focus on the bottom of the equation. For the moment we'll ignore the square root.\n", 284 | "\n", 285 | "First, bottom left, which we'll call `bottom_left`. This is composed of two parts.\n", 286 | "\n", 287 | "We multiply `elems` by the sum of the squares of x (`sum_x2`) and call it `product_sum_x2_elems`. We then subtract the square of the sum of x (`sum_x_2`)." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "sum_x2 = exm['x2'].sum()\n", 299 | "sum_x_2 = sum_x * sum_x\n", 300 | "\n", 301 | "product_sum_x2_elems = elems * sum_x2\n", 302 | "\n", 303 | "bottom_left = product_sum_x2_elems - sum_x_2" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Next, bottom right, which we'll call `bottom_right`. This is almost identical to `bottom_left`, but we are now concerned with `y` instead of `x`.\n", 311 | "\n", 312 | "We multiply `elems` by the sum of the squares of y (`sum_y2`) and call it `product_sum_y2_elems`. We then subtract the square of the sum of y (`sum_y_2`)." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "sum_y2 = exm['y2'].sum()\n", 324 | "sum_y_2 = sum_y * sum_y\n", 325 | "product_sum_y2_elems = elems * sum_y2\n", 326 | "bottom_right = product_sum_y2_elems - sum_y_2" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "source": [ 335 | "We can short-circuit the math a bit and multiply `bottom_left` by `bottom_right` and then take the square root of that." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "bottom = sqrt(bottom_left * bottom_right)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "### Finding our correlation" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "r = top/bottom\n", 365 | "print(f\"{round(r.real, 4)}\")" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "We have our correlation! Let's check our accuracy by comparing against the `pearsonr` method available in `scipy.stats`.
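One aside before the comparison: `sqrt` was imported from `cmath` above, so `bottom` (and therefore `r`) is a complex number, which is why the print statement uses `r.real`. Since `bottom_left * bottom_right` is mathematically non-negative, `math.sqrt` would have worked as well; a quick sketch of the difference (the value 2.0 is arbitrary, chosen only to show the return types):

```python
# cmath.sqrt returns a complex number even for non-negative input;
# math.sqrt returns a plain float but raises on negative input.
import math
import cmath

print(math.sqrt(2.0))   # 1.4142135623730951
print(cmath.sqrt(2.0))  # (1.4142135623730951+0j)
```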
373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "p = pearsonr(exm['rating'], exm['votes'])\n", 384 | "print(f\"{round(p[0], 4)}\")" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "## Within MongoDB\n", 392 | "\n", 393 | "Excellent! We can see that we're getting the same results as the library function.\n", 394 | "\n", 395 | "However, this is slower than it needs to be, and we can do all of this work within the aggregation framework!\n", 396 | "\n", 397 | "We'll calculate the same values we did before within a pipeline, and project the resulting correlation into a key called `m`." 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": { 404 | "collapsed": true 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "X = '$imdb.rating'\n", 409 | "Y = '$imdb.votes'" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "elems = { \"$sum\": 1 }\n", 421 | "sum_x = { \"$sum\": X }\n", 422 | "sum_y = { \"$sum\": Y }\n", 423 | "sum_x2 = { \"$sum\": { \"$multiply\": [X, X] } }\n", 424 | "sum_y2 = { \"$sum\": { \"$multiply\": [Y, Y] } }\n", 425 | "sum_xy = { \"$sum\": { \"$multiply\": [X, Y] } }\n", 426 | "\n", 427 | "all_sums = {\n", 428 | "    \"$group\": {\n", 429 | "        \"_id\": None,\n", 430 | "        \"elems\": elems,\n", 431 | "        \"sum_x\": sum_x,\n", 432 | "        \"sum_y\": sum_y,\n", 433 | "        \"sum_x2\": sum_x2,\n", 434 | "        \"sum_y2\": sum_y2,\n", 435 | "        \"sum_xy\": sum_xy\n", 436 | "    }\n", 437 | "}" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "collapsed": true 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "product_sum_x_sum_y = { \"$multiply\": [\"$sum_x\", \"$sum_y\"] }\n", 449 | "product_sum_xy_elems = { \"$multiply\": [\"$sum_xy\", \"$elems\"] }\n", 450 | "top = { \"$subtract\": [ product_sum_xy_elems, product_sum_x_sum_y]}" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": { 457 | "collapsed": true 458 | }, 459 | "outputs": [], 460 | "source": [ 461 | "product_sum_x2_elems = { \"$multiply\": [\"$sum_x2\", \"$elems\"] }\n", 462 | "sum_x_2 = { \"$multiply\": [\"$sum_x\", \"$sum_x\"] }\n", 463 | "bottom_left = { \"$subtract\": [ product_sum_x2_elems, sum_x_2]}" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": true 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "product_sum_y2_elems = { \"$multiply\": [\"$sum_y2\", \"$elems\"] }\n", 475 | "sum_y_2 = { \"$multiply\": [\"$sum_y\", \"$sum_y\"] }\n", 476 | "bottom_right = { \"$subtract\": [product_sum_y2_elems, sum_y_2] }" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": { 483 | "collapsed": true 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "bottom = { \"$sqrt\": { \"$multiply\": [bottom_left, bottom_right] } }\n", 488 | "correlation = { \n", 489 | "    \"$project\": {\n", 490 | "        \"m\": { \"$divide\": [top, bottom] }\n", 491 | "    }\n", 492 | "}" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": { 499 | "collapsed": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "pipeline = [\n", 504 | "    {\n", 505 | "        \"$match\": 
{\n", 506 | " \"imdb.rating\": { \"$gt\": 0 },\n", 507 | " \"imdb.votes\": { \"$gt\": 0}\n", 508 | " }\n", 509 | " },\n", 510 | " all_sums,\n", 511 | " correlation\n", 512 | "]" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": { 519 | "collapsed": true 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "result = list(movies.aggregate(pipeline))\n", 524 | "print(f\"\"\"\n", 525 | "r = {round(r.real, 4)} (calculated by hand)\n", 526 | "p = {round(p[0], 4)} (from scipy)\n", 527 | "m = {round(result[0]['m'], 4)} (from MongoDB)\n", 528 | "\"\"\")" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "collapsed": true 536 | }, 537 | "outputs": [], 538 | "source": [] 539 | } 540 | ], 541 | "metadata": { 542 | "kernelspec": { 543 | "display_name": "Python 3", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": "text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.6.5" 558 | } 559 | }, 560 | "nbformat": 4, 561 | "nbformat_minor": 2 562 | } 563 | -------------------------------------------------------------------------------- /LessonNotes/tree_like__lesson.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "%%capture\n", 12 | "# installing necessary dependencies and importing\n", 13 | "!pip install numpy pandas ete3\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import seaborn as sns\n", 17 | "from pymongo import MongoClient\n", 18 | "from ete3 import Tree, TreeStyle, TextFace, add_face_to_node\n", 19 | "import json\n", 20 | "from IPython.display import Image\n", 21 | "import pprint" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n", 33 | "course_client = MongoClient(course_cluster_uri)\n", 34 | "products = course_client['coursera-agg']['product_categories']" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# let's just get the products involving cats\n", 44 | "df = pd.DataFrame.from_dict(list(products.find({\"name\": { \"$regex\": \"^cat \", \"$options\": 'i' } }, {\"_id\": 0})))\n", 45 | "df" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Tree visualizations using etetoolkit, www.etetoolkit.org\n", 55 | "# You can read in depth about newick trees there!\n", 56 | "current_view = []\n", 57 | "for name in df['name']:\n", 58 | " dat = df.loc[df['name'] == name].values\n", 59 | " z = f\"({dat[0, 0]}){dat[0, 1]}\"\n", 60 | " current_view.append(z)\n", 61 | " \n", 62 | "q = ','.join(current_view)\n", 63 | "a = f\"({q});\"\n", 64 | "t = Tree(a, format=1)\n", 65 | "ts = 
TreeStyle()\n", 66 | "ts.show_leaf_name = False\n", 67 | "def my_layout(node):\n", 68 | " F = TextFace(node.name, tight_text=True)\n", 69 | " add_face_to_node(F, node, column=0, position=\"branch-right\")\n", 70 | "ts.layout_fn = my_layout\n", 71 | "t.render('%%inline', tree_style=ts)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "preferred_view = []\n", 81 | "\n", 82 | "\n", 83 | "dat = df.loc[df['parent'] == 'Cat Supplies'].values\n", 84 | "for i in range(0, len(dat)):\n", 85 | " row = f\"({dat[i, 0]})\"\n", 86 | " preferred_view.append(row)\n", 87 | " \n", 88 | "q = ','.join(preferred_view)\n", 89 | "a = f\"((({q})Cat Supplies)Pet Supplies);\"\n", 90 | "t = Tree(a, format=1)\n", 91 | "ts = TreeStyle()\n", 92 | "ts.show_leaf_name = False\n", 93 | "ts.layout_fn = my_layout\n", 94 | "t.render('%%inline', tree_style=ts)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "just_cat_toys = [\n", 106 | " {\n", 107 | " \"$match\": { \"name\": \"Cat Toys\"}\n", 108 | " },\n", 109 | " {\n", 110 | " \"$graphLookup\": {\n", 111 | " \"from\": \"product_categories\",\n", 112 | " \"startWith\": \"$name\",\n", 113 | " \"connectFromField\": \"parent\",\n", 114 | " \"connectToField\": \"name\",\n", 115 | " \"as\": \"ancestors\",\n", 116 | " }\n", 117 | " },\n", 118 | " {\n", 119 | " \"$project\": { \"_id\": 0 }\n", 120 | " }\n", 121 | "]\n", 122 | "\n", 123 | "cat_toy_lineage = list(products.aggregate(just_cat_toys))" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "pprint.pprint(cat_toy_lineage)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "just_cat_toys_ancestry = [\n", 144 | " {\n", 145 | " \"$match\": { \"name\": \"Cat Toys\"}\n", 146 | " },\n", 147 | " {\n", 148 | " \"$graphLookup\": {\n", 149 | " \"from\": \"product_categories\",\n", 150 | " \"startWith\": \"$name\",\n", 151 | " \"connectFromField\": \"parent\",\n", 152 | " \"connectToField\": \"name\",\n", 153 | " \"as\": \"ancestors\",\n", 154 | " }\n", 155 | " },\n", 156 | " {\n", 157 | " \"$project\": {\n", 158 | " \"_id\": 0,\n", 159 | " \"name\": 1,\n", 160 | " \"ancestors\": {\n", 161 | " \"$setDifference\": [\"$ancestors.parent\", [\"$name\", None]]\n", 162 | " },\n", 163 | " \"parent\": 1\n", 164 | " }\n", 165 | " }\n", 166 | "]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "print(json.dumps(list(products.aggregate(just_cat_toys_ancestry)), indent=4))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "just_cat_toys_unwound = [\n", 187 | " {\n", 188 | " \"$match\": { \"name\": \"Cat Toys\"}\n", 189 | " },\n", 190 | " {\n", 191 | " \"$graphLookup\": {\n", 192 | " \"from\": \"product_categories\",\n", 193 | " \"startWith\": \"$name\",\n", 194 | " \"connectFromField\": \"parent\",\n", 195 | " \"connectToField\": \"name\",\n", 196 | " \"as\": \"ancestors\",\n", 197 | " }\n", 198 | " },\n", 199 | " {\n", 200 | " \"$unwind\": \"$ancestors\"\n", 201 | " },\n", 202 
| " {\n", 203 | " \"$project\": { \"_id\": 0, \"ancestors._id\": 0 }\n", 204 | " }\n", 205 | "]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "print(json.dumps(list(products.aggregate(just_cat_toys_unwound)), indent=4))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "descendants = [\n", 224 | " {\n", 225 | " \"$match\": { \"name\": \"Cat Toys\"}\n", 226 | " },\n", 227 | " {\n", 228 | " \"$graphLookup\": {\n", 229 | " \"from\": \"product_categories\",\n", 230 | " \"startWith\": \"$name\",\n", 231 | " \"connectFromField\": \"parent\",\n", 232 | " \"connectToField\": \"name\",\n", 233 | " \"as\": \"ancestors\",\n", 234 | " }\n", 235 | " },\n", 236 | " {\n", 237 | " \"$unwind\": \"$ancestors\"\n", 238 | " },\n", 239 | " {\n", 240 | " \"$group\": {\n", 241 | " \"_id\": \"$ancestors.name\",\n", 242 | " \"descendants\": { \n", 243 | " \"$addToSet\": \"$name\"\n", 244 | " }\n", 245 | " }\n", 246 | " }\n", 247 | "]\n", 248 | "print(json.dumps(list(products.aggregate(descendants)), indent=4))" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "descendants_pet_supplies = [\n", 258 | " {\n", 259 | " \"$match\": {\n", 260 | " \"name\": { \"$regex\": \"^cat \", \"$options\": 'i' },\n", 261 | " \"parent\": { \"$in\": [\"Cat Supplies\", \"Pet Supplies\"]}\n", 262 | " }\n", 263 | " },\n", 264 | " {\n", 265 | " \"$graphLookup\": {\n", 266 | " \"from\": \"product_categories\",\n", 267 | " \"startWith\": \"$name\",\n", 268 | " \"connectFromField\": \"parent\",\n", 269 | " \"connectToField\": \"name\",\n", 270 | " \"as\": \"ancestors\",\n", 271 | " }\n", 272 | " },\n", 273 | " {\n", 274 | " \"$unwind\": \"$ancestors\"\n", 275 | " },\n", 276 | " {\n", 277 | " \"$group\": {\n", 278 | " \"_id\": \"$ancestors.name\",\n", 279 | " \"descendants\": { \n", 280 | " \"$addToSet\": {\n", 281 | " \"name\": \"$name\",\n", 282 | " \"parent\": \"$parent\"\n", 283 | " }\n", 284 | " }\n", 285 | " }\n", 286 | " },\n", 287 | " {\n", 288 | " \"$match\": {\n", 289 | " \"_id\": { \"$regex\": \"^cat |^pet \", \"$options\": \"i\" }\n", 290 | " }\n", 291 | " }\n", 292 | "]\n", 293 | "print(json.dumps(list(products.aggregate(descendants_pet_supplies)), indent=4))" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": true 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "pipeline_children = [\n", 305 | " {\n", 306 | " \"$graphLookup\": {\n", 307 | " \"from\": \"product_categories\",\n", 308 | " \"startWith\": \"$name\",\n", 309 | " \"connectFromField\": \"parent\",\n", 310 | " \"connectToField\": \"name\",\n", 311 | " \"as\": \"ancestors\",\n", 312 | " }\n", 313 | " },\n", 314 | " {\n", 315 | " \"$unwind\": \"$ancestors\"\n", 316 | " },\n", 317 | " {\n", 318 | " \"$group\": {\n", 319 | " \"_id\": \"$ancestors.name\",\n", 320 | " \"descendants\": { \n", 321 | " \"$addToSet\": {\n", 322 | " \"name\": \"$name\",\n", 323 | " \"parent\": \"$parent\"\n", 324 | " }\n", 325 | " }\n", 326 | " }\n", 327 | " },\n", 328 | " {\n", 329 | " \"$addFields\": {\n", 330 | " \"descendants\": {\n", 331 | " \"$setDifference\": [ \"$descendants.name\", [\"$_id\"]]\n", 332 | " },\n", 333 | " \"children\": {\n", 334 | " \"$map\": {\n", 335 | " \"input\": {\n", 336 | " \"$filter\": 
{\n", 337 | " \"input\": \"$descendants\",\n", 338 | " \"cond\": {\n", 339 | " \"$eq\": [\"$_id\", \"$$this.parent\"]\n", 340 | " }\n", 341 | " }\n", 342 | " },\n", 343 | " \"in\": \"$$this.name\"\n", 344 | " } \n", 345 | " }\n", 346 | " }\n", 347 | " }\n", 348 | "]" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "pipeline = pipeline_children.copy()\n", 358 | "pipeline.insert(0, {\n", 359 | " \"$match\": {\n", 360 | " \"name\": { \"$regex\": \"^cat \", \"$options\": 'i' },\n", 361 | " \"parent\": { \"$in\": [\"Cat Supplies\", \"Pet Supplies\"]}\n", 362 | " }\n", 363 | "})\n", 364 | "pipeline.append({\n", 365 | " \"$match\": {\n", 366 | " \"_id\": { \"$regex\": \"^cat |^pet \", \"$options\": \"i\" }\n", 367 | " }\n", 368 | "})\n", 369 | "tree = list(products.aggregate(pipeline))\n", 370 | "print(json.dumps(tree, indent=4))" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "def descend_tree(node=None, data=None):\n", 382 | " \"\"\"\n", 383 | " This function descends a tree from a given node. The node name and dataframe are required\n", 384 | " \"\"\"\n", 385 | " if node is None or data is None or data.empty:\n", 386 | " raise(f\"Both node and data are required. Got {node} for node and {data} for data\")\n", 387 | " try:\n", 388 | " children = data.loc[data['name'] == node]['children'].values[0]\n", 389 | " if not children:\n", 390 | " return '(' + node + ')'\n", 391 | " else:\n", 392 | " return '(' + ','.join([descend_tree(child, data) for child in children]) + ')' + node\n", 393 | " except:\n", 394 | " return '(' + node + ')'\n" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "df1 = pd.DataFrame.from_dict(tree)\n", 404 | "df1['name'] = df1['_id']\n", 405 | "df1.drop('_id', axis=1, inplace=True)\n", 406 | "t = Tree(f\"{descend_tree('Pet Supplies', df1)};\", format=1)\n", 407 | "ts = TreeStyle()\n", 408 | "ts.show_leaf_name = False\n", 409 | "def my_layout(node):\n", 410 | " F = TextFace(node.name, tight_text=True)\n", 411 | " add_face_to_node(F, node, column=0, position=\"branch-right\")\n", 412 | "ts.layout_fn = my_layout\n", 413 | "t.render('%%inline', tree_style=ts)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": { 420 | "collapsed": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "pipeline_parents = [\n", 425 | " {\n", 426 | " \"$graphLookup\": {\n", 427 | " \"from\": \"product_categories\",\n", 428 | " \"startWith\": \"$name\",\n", 429 | " \"connectFromField\": \"parent\",\n", 430 | " \"connectToField\": \"name\",\n", 431 | " \"as\": \"ancestors\",\n", 432 | " }\n", 433 | " },\n", 434 | " {\n", 435 | " \"$project\": {\n", 436 | " \"name\": 1,\n", 437 | " \"ancestors\": {\n", 438 | " \"$setDifference\": [\"$ancestors.parent\", [\"$name\", None]]\n", 439 | " },\n", 440 | " \"parent\": 1\n", 441 | " }\n", 442 | " }\n", 443 | "]" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": true 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "full_tree = [\n", 455 | " {\n", 456 | " \"$facet\": {\n", 457 | " \"child_tree\": pipeline_children,\n", 458 | " \"parent_tree\": pipeline_parents\n", 459 | " }\n", 460 | " },\n", 461 | " 
{\n", 462 | " \"$unwind\": \"$parent_tree\"\n", 463 | " },\n", 464 | " {\n", 465 | " \"$project\": {\n", 466 | " \"own_child_tree\": {\n", 467 | " \"$arrayElemAt\": [\n", 468 | " {\n", 469 | " \"$filter\": {\n", 470 | " \"input\": \"$child_tree\",\n", 471 | " \"cond\": {\n", 472 | " \"$eq\": [\"$$this._id\", \"$parent_tree.name\"]\n", 473 | " }\n", 474 | " }\n", 475 | " },\n", 476 | " 0\n", 477 | " ]\n", 478 | " },\n", 479 | " \"name\": \"$parent_tree.name\",\n", 480 | " \"parent\": \"$parent_tree.parent\",\n", 481 | " \"ancestors\": \"$parent_tree.ancestors\",\n", 482 | " }\n", 483 | " },\n", 484 | " {\n", 485 | " \"$addFields\": {\n", 486 | " \"children\": \"$own_child_tree.children\",\n", 487 | " \"descendants\": \"$own_child_tree.descendants\"\n", 488 | " }\n", 489 | " },\n", 490 | " {\n", 491 | " \"$addFields\": {\n", 492 | " \"num_children\": { \"$size\": \"$children\" },\n", 493 | " \"num_descendants\": { \"$size\": \"$descendants\" },\n", 494 | " \"num_ancestors\": { \"$size\": \"$ancestors\" }\n", 495 | " }\n", 496 | " },\n", 497 | " {\n", 498 | " \"$project\": { \"own_child_tree\": 0 }\n", 499 | " },\n", 500 | " {\n", 501 | " \"$sort\": { \"num_descendants\": -1 }\n", 502 | " }\n", 503 | "]" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "metadata": { 510 | "collapsed": true 511 | }, 512 | "outputs": [], 513 | "source": [ 514 | "df = pd.DataFrame.from_dict(list(products.aggregate(full_tree)))" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "df.head(10)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": { 530 | "collapsed": true 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "%%capture\n", 535 | "tr = Tree(f\"{descend_tree('Pet Supplies', df)};\", format=1)\n", 536 | "ts = TreeStyle()\n", 537 | "ts.show_branch_length = False\n", 538 | "ts.show_branch_support = False\n", 539 | "ts.show_leaf_name = False\n", 540 | "ts.mode = 'c'\n", 541 | "ts.layout_fn = my_layout" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "tr.render('%%inline', tree_style=ts)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": { 557 | "collapsed": true 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "def is_descendant(candidate=None, of=None, data=None):\n", 562 | " assert isinstance(candidate, str), \"candidate is required and must be of type str\"\n", 563 | " assert isinstance(of, str), \"of is required and must be of type str\"\n", 564 | " assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n", 565 | " try:\n", 566 | " return candidate in data.loc[data['name'] == of]['descendants'].values[0]\n", 567 | " except:\n", 568 | " return False\n", 569 | "\n", 570 | "def is_ancestor(candidate=None, of=None, data=None):\n", 571 | " assert isinstance(candidate, str), \"candidate is required and must be of type str\"\n", 572 | " assert isinstance(of, str), \"of is required and must be of type str\"\n", 573 | " assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n", 574 | " try:\n", 575 | " return candidate in data.loc[data['name'] == of]['ancestors'].values[0]\n", 576 | " except:\n", 577 | " return False\n", 578 | "\n", 579 | "def common_ancestor(node_1=None, node_2=None, 
data=None):\n", 580 | " assert isinstance(node_1, str), \"candidate is required and must be of type str\"\n", 581 | " assert isinstance(node_2, str), \"of is required and must be of type str\"\n", 582 | " assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n", 583 | "\n", 584 | " try:\n", 585 | " node_1_loc = data.loc[data['name'] == node_1]\n", 586 | " node_2_loc = data.loc[data['name'] == node_2]\n", 587 | "\n", 588 | " if node_1 == node_2:\n", 589 | " return node_1\n", 590 | " \n", 591 | " if is_ancestor(node_1, node_2, data):\n", 592 | " return node_1\n", 593 | " if is_descendant(node_1, node_2, data):\n", 594 | " return node_2\n", 595 | " \n", 596 | " node_1_parent = node_1_loc['parent'].values[0]\n", 597 | " node_2_parent = node_2_loc['parent'].values[0]\n", 598 | " \n", 599 | " if node_1_parent == node_2_parent:\n", 600 | " return node_1_parent\n", 601 | " \n", 602 | " return common_ancestor(node_1_parent, node_2_parent, data)\n", 603 | " except:\n", 604 | " return \"no common ancestor found\"" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "is_descendant('Bird Cage Food & Water Dishes', 'Pet Supplies', df)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "is_ancestor('Pet Supplies', 'Bird Cage Accessories', df)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": null, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "common_ancestor('Small Animal Food', 'Pet Food Containers', df)" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "%%capture\n", 643 | "root_nodes = df.loc[df['num_ancestors'] == 0]['name'].values.tolist()\n", 644 | "the_product_tree = '(' + ','.join([descend_tree(root, df) for root in root_nodes]) + ')' + ';'\n", 645 | "t = Tree(the_product_tree, format=1)\n", 646 | "ts = TreeStyle()\n", 647 | "ts.show_branch_length = False\n", 648 | "ts.show_branch_support = False\n", 649 | "ts.layout_fn = my_layout\n", 650 | "# uncomment the line below to generate the entire product graph\n", 651 | "# t.render('product_tree.png', tree_style=ts)\n", 652 | "\"\"\"\n", 653 | "Open the local file \"product_tree.png\" at your own risk!\n", 654 | "\"\"\"" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": null, 660 | "metadata": { 661 | "collapsed": true 662 | }, 663 | "outputs": [], 664 | "source": [] 665 | } 666 | ], 667 | "metadata": { 668 | "kernelspec": { 669 | "display_name": "Python 3", 670 | "language": "python", 671 | "name": "python3" 672 | }, 673 | "language_info": { 674 | "codemirror_mode": { 675 | "name": "ipython", 676 | "version": 3 677 | }, 678 | "file_extension": ".py", 679 | "mimetype": "text/x-python", 680 | "name": "python", 681 | "nbconvert_exporter": "python", 682 | "pygments_lexer": "ipython3", 683 | "version": "3.6.3" 684 | } 685 | }, 686 | "nbformat": 4, 687 | "nbformat_minor": 2 688 | } 689 | --------------------------------------------------------------------------------