├── README.md
├── Assignments
    ├── lab_group_accumulators.ipynb
    ├── schemas-and-accumulators.ipynb
    ├── changing_document_shape.ipynb
    ├── unwind_and_group_lab.ipynb
    ├── entity-resolution.ipynb
    ├── cursor_like_methods.ipynb
    ├── lab__graphlookup.ipynb
    ├── lookup_lab.ipynb
    ├── expressions_with_project.ipynb
    ├── linear-regression-on-titanic-data-set.ipynb
    └── Decision+Tree.ipynb
└── LessonNotes
    ├── linear-regressions-with-mongodb.ipynb
    ├── migrating-schema-lesson.ipynb
    ├── principal-component-analysis.ipynb
    ├── associative_rules__lesson.ipynb
    ├── pearson_correlation.ipynb
    └── tree_like__lesson.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | # MongoDB Aggregation Framework
 2 | 
 3 | Work from the [MongoDB Aggregation Framework](https://www.coursera.org/learn/mongodb-aggregation-framework) course on Coursera.
 4 | 
 5 | ## Content
 6 | 
 7 | ### The Fundamentals of MongoDB Aggregation
 8 | 
 9 | - Aggregation Introduction
10 | - The Concept of Pipelines
11 | - Aggregation Structure and Syntax
12 | - $match: Filtering Documents
13 | - Using $project
14 | - Using Expressions
15 | - Cursor Like Methods Stages
16 | - The $group Stage
17 | - $unwind
18 | - The $lookup Stage
19 | 
20 | ### Leveraging MongoDB's Flexible Schema
21 | 
22 |  - mongoimport
23 |  - Importance of Schema
24 |  - Exploring Schemas
25 |  - Migrating Your Schema
26 |  - Views
27 |  - Supplementing Schemas with Accumulators
28 |  - Tree-like Data in Individual Documents
29 |  - Expressive Lookup Basics
30 |  - Entity Resolution with $lookup
31 |  
32 |  ### Machine Learning with MongoDB
33 |  
34 |   - Calculation of Persons-Rho
35 |   - Intro: Associative Role Learning
36 |   - Principal Component Analysis
37 |   - Intro to Linear Regressions
38 |   - Decision Trees
39 |   - Intro to Clustering Algorithms
40 | 


--------------------------------------------------------------------------------
/Assignments/lab_group_accumulators.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 2,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 19 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "movies = course_client['aggregations']['movies']"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "# Lab: Group Accumulators\n",
 36 |     "\n",
 37 |     "## For this lab, you'll be using group accumulators."
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "### Question\n",
 45 |     "\n",
 46 |     "In this lab, you will need to capture the highest `imdb.rating`, lowest `imdb.rating`, average, and **sample** standard deviation for all films that won an Oscar.\n",
 47 |     "\n",
 48 |     "You may find documentation on [group accumulators](https://docs.mongodb.com/manual/reference/operator/aggregation-group/#group-accumulator-operators) helpful!\n",
 49 |     "\n",
 50 |     "The matching stage to find films with Oscar wins is provided below."
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 4,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "matching = {\n",
 60 |     "    \"$match\": {\n",
 61 |     "        \"awards\": { \"$regex\": \"Won \\\\d{1,2} Oscars?\"}\n",
 62 |     "    }\n",
 63 |     "}"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 13,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "grouping = {\n",
 73 |     "    \"$group\": {\n",
 74 |     "        \"_id\": None,\n",
 75 |     "        \"highest_rating\": { \"$max\": \"$imdb.rating\" },\n",
 76 |     "        \"lowest_rating\": { \"$min\": \"$imdb.rating\" },\n",
 77 |     "        \"average_rating\": { \"$avg\": \"$imdb.rating\" },\n",
 78 |     "        \"sample_st_dev_rating\": { \"$stdDevSamp\": \"$imdb.rating\" }\n",
 79 |     "    }\n",
 80 |     "}"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 14,
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "data": {
 90 |       "text/plain": [
 91 |        "[{'_id': None,\n",
 92 |        "  'highest_rating': 9.2,\n",
 93 |        "  'lowest_rating': 4.5,\n",
 94 |        "  'average_rating': 7.527024070021882,\n",
 95 |        "  'sample_st_dev_rating': 0.5988145513344504}]"
 96 |       ]
 97 |      },
 98 |      "metadata": {},
 99 |      "output_type": "display_data"
100 |     }
101 |    ],
102 |    "source": [
103 |     "pipeline = [\n",
104 |     "    matching,\n",
105 |     "    grouping\n",
106 |     "]\n",
107 |     "\n",
108 |     "display(list(movies.aggregate(pipeline)))"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": []
117 |   }
118 |  ],
119 |  "metadata": {
120 |   "kernelspec": {
121 |    "display_name": "Python 3",
122 |    "language": "python",
123 |    "name": "python3"
124 |   },
125 |   "language_info": {
126 |    "codemirror_mode": {
127 |     "name": "ipython",
128 |     "version": 3
129 |    },
130 |    "file_extension": ".py",
131 |    "mimetype": "text/x-python",
132 |    "name": "python",
133 |    "nbconvert_exporter": "python",
134 |    "pygments_lexer": "ipython3",
135 |    "version": "3.6.5"
136 |   }
137 |  },
138 |  "nbformat": 4,
139 |  "nbformat_minor": 2
140 | }
141 | 


--------------------------------------------------------------------------------
/Assignments/schemas-and-accumulators.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from pymongo import MongoClient\n",
 10 |     "import pprint"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 20 |     "course_client = MongoClient(course_cluster_uri)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 15,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "orders = course_client['coursera-agg']['orders']"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 24,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "# Replace XXXX with a pipeline to add the fields mean_order_quantity, mean_order_unit_price,\n",
 39 |     "# order_quantity, and order_total to each document. You can also add a $sort and $limit to your\n",
 40 |     "# pipeline to answer the verification question.\n",
 41 |     "pipeline = [\n",
 42 |     "    {\n",
 43 |     "        \"$addFields\": {\n",
 44 |     "            \"mean_order_quantity\": { \"$avg\": \"$purchases.quantity\" },\n",
 45 |     "            \"mean_order_unit_price\": { \"$avg\": \"$purchases.unit_price\" },\n",
 46 |     "            \"order_quantity\": { \"$sum\": \"$purchases.quantity\" },\n",
 47 |     "            \"order_total\": { \n",
 48 |     "                \"$reduce\": {\n",
 49 |     "                    \"input\": \"$purchases\",\n",
 50 |     "                    \"initialValue\": 0.00,\n",
 51 |     "                    \"in\": {\n",
 52 |     "                        \"$add\": [\n",
 53 |     "                            \"$$value\",\n",
 54 |     "                            { \"$multiply\": [ \"$$this.quantity\", \"$$this.unit_price\"] }\n",
 55 |     "                        ]\n",
 56 |     "                    }\n",
 57 |     "                }\n",
 58 |     "            }\n",
 59 |     "        }\n",
 60 |     "    },\n",
 61 |     "    {\n",
 62 |     "        \"$sort\": { \"order_total\": -1 }\n",
 63 |     "    },\n",
 64 |     "    {\n",
 65 |     "        \"$limit\": 1\n",
 66 |     "    }\n",
 67 |     "]"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 25,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "cursor = orders.aggregate(pipeline)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 26,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "{'_id': 581483,\n",
 89 |       " 'country': 'United Kingdom',\n",
 90 |       " 'customer_id': 16446,\n",
 91 |       " 'date': datetime.datetime(2011, 12, 9, 9, 15),\n",
 92 |       " 'mean_order_quantity': 80995.0,\n",
 93 |       " 'mean_order_unit_price': 2.08,\n",
 94 |       " 'order_quantity': 80995,\n",
 95 |       " 'order_total': 168469.6,\n",
 96 |       " 'purchases': [{'description': 'PAPER CRAFT , LITTLE BIRDIE',\n",
 97 |       "                'quantity': 80995,\n",
 98 |       "                'stock_code': '23843',\n",
 99 |       "                'unit_price': 2.08}]}\n"
100 |      ]
101 |     }
102 |    ],
103 |    "source": [
104 |     "for doc in cursor:\n",
105 |     "    pprint.pprint(doc)\n",
106 |     "#168469.6"
107 |    ]
108 |   }
109 |  ],
110 |  "metadata": {
111 |   "kernelspec": {
112 |    "display_name": "Python 3",
113 |    "language": "python",
114 |    "name": "python3"
115 |   },
116 |   "language_info": {
117 |    "codemirror_mode": {
118 |     "name": "ipython",
119 |     "version": 3
120 |    },
121 |    "file_extension": ".py",
122 |    "mimetype": "text/x-python",
123 |    "name": "python",
124 |    "nbconvert_exporter": "python",
125 |    "pygments_lexer": "ipython3",
126 |    "version": "3.6.5"
127 |   }
128 |  },
129 |  "nbformat": 4,
130 |  "nbformat_minor": 2
131 | }
132 | 


--------------------------------------------------------------------------------
/Assignments/changing_document_shape.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 2,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 19 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "movies = course_client['aggregations']['movies']"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "# Lab: Changing Document Shape\n",
 36 |     "\n",
 37 |     "## For this lab, you'll be using expressions to change document shape and perform an analysis \n",
 38 |     "\n",
 39 |     "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster."
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "### Prelude\n",
 47 |     "\n",
 48 |     "Our movies dataset has a lot of different documents, some with more convoluted\n",
 49 |     "titles than others. \n",
 50 |     "\n",
 51 |     "If we'd like to analyze our collection to find movie titles\n",
 52 |     "that are composed of only one word, we **could** fetch all the movies in the\n",
 53 |     "dataset and do some processing in a client application, but the Aggregation\n",
 54 |     "Framework allows us to do this on the server!\n",
 55 |     "\n",
 56 |     "Ensure you explore the  [string expressions](https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#string-expressions) and the [array expressions](https://docs.mongodb.com/manual/meta/aggregation-quick-reference/#array-expressions) before attempting this lab.\n",
 57 |     "\n",
 58 |     "### Question\n",
 59 |     "\n",
 60 |     "Using the Aggregation Framework, find a count of the number of movies that have\n",
 61 |     "a title composed of one word. To clarify, \"Cinderella\" and \"3-25\" should count,\n",
 62 |     "where as \"Cast Away\" would not.\n",
 63 |     "\n",
 64 |     "Don't forget to append the following `counting` variable to your pipeline!"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 4,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "counting = {\n",
 74 |     "    \"$count\": \"one_word_titles\"\n",
 75 |     "}"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 5,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "shaping = {\n",
 85 |     "    \"$project\": {\n",
 86 |     "        \"title_size\": { \"$size\": { \"$split\": [ \"$title\", \" \" ] } },\n",
 87 |     "    }\n",
 88 |     "}"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 6,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "matching = {\n",
 98 |     "    \"$match\": { \"title_size\": { \"$eq\": 1 } }\n",
 99 |     "}"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 8,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "data": {
109 |       "text/plain": [
110 |        "[{'one_word_titles': 44497}]"
111 |       ]
112 |      },
113 |      "metadata": {},
114 |      "output_type": "display_data"
115 |     }
116 |    ],
117 |    "source": [
118 |     "pipeline = [\n",
119 |     "    shaping,\n",
120 |     "    matching,\n",
121 |     "    counting\n",
122 |     "]\n",
123 |     "\n",
124 |     "display(list(movies.aggregate(pipeline)))\n",
125 |     "#8068"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": []
136 |   }
137 |  ],
138 |  "metadata": {
139 |   "kernelspec": {
140 |    "display_name": "Python 3",
141 |    "language": "python",
142 |    "name": "python3"
143 |   },
144 |   "language_info": {
145 |    "codemirror_mode": {
146 |     "name": "ipython",
147 |     "version": 3
148 |    },
149 |    "file_extension": ".py",
150 |    "mimetype": "text/x-python",
151 |    "name": "python",
152 |    "nbconvert_exporter": "python",
153 |    "pygments_lexer": "ipython3",
154 |    "version": "3.6.5"
155 |   }
156 |  },
157 |  "nbformat": 4,
158 |  "nbformat_minor": 2
159 | }
160 | 


--------------------------------------------------------------------------------
/Assignments/unwind_and_group_lab.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 2,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 19 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "movies = course_client['aggregations']['movies']"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "# Lab: Using ``$unwind`` and ``$group``\n",
 36 |     "\n",
 37 |     "## For this lab, you'll be using both the ``$unwind`` and ``$group`` stages.\n",
 38 |     "\n",
 39 |     "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster."
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "### Question\n",
 47 |     "\n",
 48 |     "Let's use our increasing understanding of the Aggregation Framework to explore our\n",
 49 |     "movies collection in more detail. We'd like to calculate how many movies every\n",
 50 |     "**cast** member has been in, and get an average ``imdb.rating`` for each\n",
 51 |     "``cast`` member.\n",
 52 |     "\n",
 53 |     "Which cast member has the been in the most movies with **English** as an available language?\n",
 54 |     "\n",
 55 |     "To verify that you've successfully completed this exercise please submit your answer as the sum of the number of films and average rating for this cast member.\n",
 56 |     "\n",
 57 |     "For example, if the cast member was output like so:\n",
 58 |     "\n",
 59 |     "    { \"_id\": \"James Dean\", \"numFilms\": 11, \"average\": 7.1 }\n",
 60 |     "Then the answer would be 18.1."
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 4,
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "predicate = {\n",
 70 |     "    \"$match\": { \n",
 71 |     "        \"imdb.rating\": { \"$exists\": True },\n",
 72 |     "        \"languages\": { \"$in\": [ \"English\", \"$languages\" ] }, \n",
 73 |     "        \"cast\": { \"$elemMatch\": { \"$exists\": True } }\n",
 74 |     "    }\n",
 75 |     "}"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 5,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "unwinding = {\n",
 85 |     "    \"$unwind\": \"$cast\"\n",
 86 |     "}"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 6,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "grouping = {\n",
 96 |     "    \"$group\": {\n",
 97 |     "        \"_id\": \"$cast\",\n",
 98 |     "        \"num_films\": { \"$sum\": 1 },\n",
 99 |     "        \"average\": { \"$avg\": \"$imdb.rating\" }\n",
100 |     "    } \n",
101 |     "}"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 8,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "shaping = {\n",
111 |     "    \"$project\": {\n",
112 |     "        \"_id\": 1,\n",
113 |     "        \"num_films\": 1,\n",
114 |     "        \"average\": 1\n",
115 |     "    }\n",
116 |     "}"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 9,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "sorting = {\n",
126 |     "    \"$sort\": {\n",
127 |     "        \"num_films\": -1\n",
128 |     "    }\n",
129 |     "}"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 10,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "limiting = {\n",
139 |     "    \"$limit\": 1\n",
140 |     "}"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 11,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "pipeline = [\n",
150 |     "    predicate,\n",
151 |     "    unwinding,\n",
152 |     "    grouping,\n",
153 |     "    shaping,\n",
154 |     "    sorting,\n",
155 |     "    limiting\n",
156 |     "]"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 12,
162 |    "metadata": {},
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "{'_id': 'John Wayne', 'num_films': 107, 'average': 6.424299065420561}"
168 |       ]
169 |      },
170 |      "metadata": {},
171 |      "output_type": "display_data"
172 |     }
173 |    ],
174 |    "source": [
175 |     "display(list(movies.aggregate(pipeline))[0])\n",
176 |     "#{'_id': 'John Wayne', 'num_films': 107, 'average': 6.424299065420561}"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.6.5"
197 |   }
198 |  },
199 |  "nbformat": 4,
200 |  "nbformat_minor": 2
201 | }
202 | 


--------------------------------------------------------------------------------
/Assignments/entity-resolution.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from pymongo import MongoClient\n",
 10 |     "import pprint"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "For this lab, use the provided `course-cluster-uri` below."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 2,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 27 |     "course_client = MongoClient(course_cluster_uri)"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "people_master = course_client['coursera-agg']['people_master']"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 11,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# Replace this with a match stage that will return documents that match on\n",
 46 |     "# first_name OR last_name OR birthday OR email.\n",
 47 |     "greedy_match = {\n",
 48 |     "    \"$match\": { \n",
 49 |     "        \"$expr\": { \n",
 50 |     "            \"$or\": [\n",
 51 |     "                { \"$eq\": [\"$first_name\", \"$$first_name\"] },\n",
 52 |     "                { \"$eq\": [\"$last_name\", \"$$last_name\"] },\n",
 53 |     "                { \"$eq\": [\"$birthday\", \"$$birthday\"] },\n",
 54 |     "                { \"$eq\": [\"$email\", \"$$email\"] },\n",
 55 |     "            ]\n",
 56 |     "        }\n",
 57 |     "    }\n",
 58 |     "}"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 9,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# Replace this with a stage that will add a field called 'matchScore', where\n",
 68 |     "# matchScore is the number of fields (first_name, last_name, birthday, email)\n",
 69 |     "# that match the source document.\n",
 70 |     "match_score_calculation = {\n",
 71 |     "    \"$addFields\": {\n",
 72 |     "        \"matchScore\": {\n",
 73 |     "            \"$sum\": [\n",
 74 |     "                { \n",
 75 |     "                    \"$cond\": [\n",
 76 |     "                        { \"$eq\": [\"$first_name\", \"$$first_name\"] }, 1, 0\n",
 77 |     "                    ] \n",
 78 |     "                },\n",
 79 |     "                { \n",
 80 |     "                    \"$cond\": [\n",
 81 |     "                        { \"$eq\": [\"$last_name\", \"$$last_name\"] }, 1, 0\n",
 82 |     "                    ] \n",
 83 |     "                },\n",
 84 |     "                { \n",
 85 |     "                    \"$cond\": [\n",
 86 |     "                        { \"$eq\": [\"$birthday\", \"$$birthday\"] }, 1, 0\n",
 87 |     "                    ] \n",
 88 |     "                },\n",
 89 |     "                { \n",
 90 |     "                    \"$cond\": [\n",
 91 |     "                        { \"$eq\": [\"$email\", \"$$email\"] }, 1, 0\n",
 92 |     "                    ] \n",
 93 |     "                }\n",
 94 |     "            ]\n",
 95 |     "        }   \n",
 96 |     "    }\n",
 97 |     "}"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 12,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "cursor = people_master.aggregate([\n",
107 |     "    {\n",
108 |     "        \"$lookup\": {\n",
109 |     "            \"from\": \"people_import\",\n",
110 |     "            \"let\": {\n",
111 |     "                \"first_name\": \"$first_name\",\n",
112 |     "                \"last_name\": \"$last_name\",\n",
113 |     "                \"email\": \"$email\",\n",
114 |     "                \"birthday\": \"$birthday\",\n",
115 |     "            },\n",
116 |     "            \"pipeline\": [\n",
117 |     "                greedy_match,\n",
118 |     "                match_score_calculation,\n",
119 |     "                {\n",
120 |     "                    \"$match\": {\n",
121 |     "                        \"matchScore\": { \"$gte\": 3 }\n",
122 |     "                    }\n",
123 |     "                },\n",
124 |     "                {\n",
125 |     "                    \"$sort\": { \"matchScore\": -1 }\n",
126 |     "                },\n",
127 |     "                {\n",
128 |     "                    \"$limit\": 5\n",
129 |     "                }\n",
130 |     "            ],\n",
131 |     "            \"as\": \"matches\"\n",
132 |     "        }\n",
133 |     "    },\n",
134 |     "    {\n",
135 |     "        \"$match\": {\n",
136 |     "            \"matches.matchScore\": 3\n",
137 |     "        }\n",
138 |     "    }\n",
139 |     "])"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 13,
145 |    "metadata": {},
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "19"
151 |       ]
152 |      },
153 |      "execution_count": 13,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "len(list(cursor))\n",
160 |     "#19"
161 |    ]
162 |   }
163 |  ],
164 |  "metadata": {
165 |   "kernelspec": {
166 |    "display_name": "Python 3",
167 |    "language": "python",
168 |    "name": "python3"
169 |   },
170 |   "language_info": {
171 |    "codemirror_mode": {
172 |     "name": "ipython",
173 |     "version": 3
174 |    },
175 |    "file_extension": ".py",
176 |    "mimetype": "text/x-python",
177 |    "name": "python",
178 |    "nbconvert_exporter": "python",
179 |    "pygments_lexer": "ipython3",
180 |    "version": "3.6.5"
181 |   }
182 |  },
183 |  "nbformat": 4,
184 |  "nbformat_minor": 2
185 | }
186 | 


--------------------------------------------------------------------------------
/Assignments/cursor_like_methods.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 2,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 19 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "movies = course_client['aggregations']['movies']"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "# Lab: Using Cursor-like aggregation stages\n",
 36 |     "\n",
 37 |     "## For this lab, you'll have to use cursor-like aggregation stages to find the answer for the following scenario.\n",
 38 |     "\n",
 39 |     "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster.\n",
 40 |     "\n",
 41 |     "### Movie Night\n",
 42 |     "\n",
 43 |     "Your organization has a movie night scheduled, and you've again been tasked with coming up with a selection.\n",
 44 |     "\n",
 45 |     "HR has polled employees and assembled the following list of preferred actresses and actors."
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 4,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "favorites = [\n",
 55 |     "  \"Sandra Bullock\",\n",
 56 |     "  \"Tom Hanks\",\n",
 57 |     "  \"Julia Roberts\",\n",
 58 |     "  \"Kevin Spacey\",\n",
 59 |     "  \"George Clooney\"\n",
 60 |     "]"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "For movies released in the **USA** with a ``tomatoes.viewer.rating`` greater\n",
 68 |     "than or equal to **3**, calculate a new field called num_favs that represets how\n",
 69 |     "many **favorites** appear in the ``cast`` field of the movie.\n",
 70 |     "\n",
 71 |     "Sort your results by ``num_favs``, ``tomatoes.viewer.rating``, and ``title``,\n",
 72 |     "all in descending order.\n",
 73 |     "\n",
 74 |     "What is the ``title`` of the **25th** film in the aggregation result?\n",
 75 |     "\n",
 76 |     "**Hint**: MongoDB has a great expression for quickly determining whether there are common elements in lists, ``$setIntersection``"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 19,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "#Check that the title exists, countries contains USA, rating >=3 and cast is array.\n",
 86 |     "predicate = {\n",
 87 |     "    \"$match\": { \n",
 88 |     "        \"title\": { \"$exists\": True },\n",
 89 |     "        \"countries\": { \"$in\": [ \"USA\", \"$countries\" ] }, \n",
 90 |     "        \"cast\": { \"$elemMatch\": { \"$exists\": True } }, \n",
 91 |     "        \"tomatoes.viewer.rating\": { \"$gte\": 3 }\n",
 92 |     "    }\n",
 93 |     "}"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 6,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "#Project required fields and create num_favs field\n",
103 |     "projection = {\n",
104 |     "    \"$project\": {\n",
105 |     "        \"title\": 1,\n",
106 |     "        \"tomatoes.viewer.rating\": 1,\n",
107 |     "        \"num_favs\": { \"$size\": { \"$setIntersection\" : [ favorites, \"$cast\" ] } }\n",
108 |     "    }\n",
109 |     "}"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 17,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "sorting = {\n",
119 |     "    \"$sort\": {\n",
120 |     "        \"num_favs\": -1,\n",
121 |     "        \"tomatoes.viewer.rating\": -1,\n",
122 |     "        \"title\": -1\n",
123 |     "    }\n",
124 |     "}"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": 15,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "skipping = {\n",
134 |     "    \"$skip\": 24\n",
135 |     "}"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 16,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "limiting = {\n",
145 |     "    \"$limit\": 1\n",
146 |     "}"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 20,
152 |    "metadata": {},
153 |    "outputs": [
154 |     {
155 |      "data": {
156 |       "text/plain": [
157 |        "[{'_id': ObjectId('573a13ddf29313caabdb320f'),\n",
158 |        "  'title': 'The Heat',\n",
159 |        "  'tomatoes': {'viewer': {'rating': 3.8}},\n",
160 |        "  'num_favs': 1}]"
161 |       ]
162 |      },
163 |      "metadata": {},
164 |      "output_type": "display_data"
165 |     }
166 |    ],
167 |    "source": [
168 |     "pipeline = [\n",
169 |     "    predicate,\n",
170 |     "    projection,\n",
171 |     "    sorting,\n",
172 |     "    skipping,\n",
173 |     "    limiting\n",
174 |     "]\n",
175 |     "\n",
176 |     "display(list(movies.aggregate(pipeline)))\n",
177 |     "#The Heat"
178 |    ]
179 |   }
180 |  ],
181 |  "metadata": {
182 |   "kernelspec": {
183 |    "display_name": "Python 3",
184 |    "language": "python",
185 |    "name": "python3"
186 |   },
187 |   "language_info": {
188 |    "codemirror_mode": {
189 |     "name": "ipython",
190 |     "version": 3
191 |    },
192 |    "file_extension": ".py",
193 |    "mimetype": "text/x-python",
194 |    "name": "python",
195 |    "nbconvert_exporter": "python",
196 |    "pygments_lexer": "ipython3",
197 |    "version": "3.6.5"
198 |   }
199 |  },
200 |  "nbformat": 4,
201 |  "nbformat_minor": 2
202 | }
203 | 


--------------------------------------------------------------------------------
/Assignments/lab__graphlookup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 3,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Collecting dateparser\n",
 13 |       "  Using cached https://files.pythonhosted.org/packages/ac/9e/1aa87c0c59f9731820bfd20a8b148d97b315530c2c92d1fb300328c8c42f/dateparser-0.7.0-py2.py3-none-any.whl\n",
 14 |       "Requirement already satisfied: python-dateutil in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from dateparser) (2.7.3)\n",
 15 |       "Collecting regex (from dateparser)\n",
 16 |       "  Using cached https://files.pythonhosted.org/packages/7c/11/89b423ecd55990abd66fe3742992c5c13f951b8b4447deb1d7cc3e292611/regex-2018.07.11-cp36-none-win_amd64.whl\n",
 17 |       "Collecting tzlocal (from dateparser)\n",
 18 |       "Requirement already satisfied: pytz in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from dateparser) (2018.4)\n",
 19 |       "Requirement already satisfied: six>=1.5 in c:\\users\\jesus\\anaconda3\\envs\\aggregation-framework\\lib\\site-packages (from python-dateutil->dateparser) (1.11.0)\n",
 20 |       "Installing collected packages: regex, tzlocal, dateparser\n",
 21 |       "Successfully installed dateparser-0.7.0 regex-2018.7.11 tzlocal-1.5.1\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "!pip install dateparser"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 4,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "import pymongo\n",
 36 |     "import pprint\n",
 37 |     "import dateparser\n",
 38 |     "from bson.son import SON\n",
 39 |     "\n",
 40 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 41 |     "course_client = pymongo.MongoClient(course_cluster_uri)\n",
 42 |     "movies = course_client['aggregations']['movies']"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Lab : $graphLookup\n",
 50 |     "\n",
 51 |     "For this lab, you'll be calculating the [degrees of separation](https://en.wikipedia.org/wiki/Six_degrees_of_separation) of directors to \"Steven Spielberg\".\n",
 52 |     "\n",
 53 |     "This is a bit like calculating a [\"Kevin Bacon\" number](https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon), but instead of all connections you will only consider connections through the `directors` graph nodes.\n",
 54 |     "\n",
 55 |     "Complete the the `$graphLookup` and `$project` stages by correctly constructing the `graph_lookup` and `project_cast` variables below. \n",
 56 |     "\n",
 57 |     "To optimize the execution of `$graphLookup` stage, use a `maxDepth` of 6.\n",
 58 |     "\n",
 59 |     "For the solution, only provide the numeric portion of the returned output to the validator.\n",
 60 |     "\n",
 61 |     "**HINT**: `$reduce` is a powerful expression!"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 7,
 67 |    "metadata": {},
 68 |    "outputs": [
 69 |     {
 70 |      "data": {
 71 |       "text/plain": [
 72 |        "{'answer': 2}"
 73 |       ]
 74 |      },
 75 |      "execution_count": 7,
 76 |      "metadata": {},
 77 |      "output_type": "execute_result"
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "\n",
 82 |     "graph_lookup = {\n",
 83 |     "    \"$graphLookup\": {\n",
 84 |     "        \"from\": \"movies\",\n",
 85 |     "        \"startWith\": \"$directors\",\n",
 86 |     "        \"connectFromField\": \"directors\",\n",
 87 |     "        \"connectToField\": \"directors\",\n",
 88 |     "        \"as\": \"network\",\n",
 89 |     "        \"maxDepth\": 6,\n",
 90 |     "        \"depthField\": \"network_level\"\n",
 91 |     "    }\n",
 92 |     "}\n",
 93 |     "\n",
 94 |     "\n",
 95 |     "project_cast = {\n",
 96 |     "     \"$project\": {\n",
 97 |     "        \"cast\": {\n",
 98 |     "            \"$reduce\": {\n",
 99 |     "                \"input\": \"$cast\",\n",
100 |     "                \"initialValue\": [],\n",
101 |     "                \"in\": { \"$concatArrays\" : [\"$$value\", \"$$this\"] }\n",
102 |     "            }\n",
103 |     "        }\n",
104 |     "     }\n",
105 |     "}\n",
106 |     "\n",
107 |     "\n",
108 |     "results = movies.aggregate([\n",
109 |     "    {\n",
110 |     "        \"$match\": {\n",
111 |     "            \"directors\": \"Steven Spielberg\"\n",
112 |     "        }\n",
113 |     "    },\n",
114 |     "    {\n",
115 |     "        \"$project\": {\n",
116 |     "            \"directors\": 1\n",
117 |     "        }\n",
118 |     "    },\n",
119 |     "    graph_lookup,\n",
120 |     "    {\n",
121 |     "        \"$unwind\": \"$network\"\n",
122 |     "    },\n",
123 |     "    {\n",
124 |     "        \"$project\": {\n",
125 |     "            \"cast\": \"$network.cast\",\n",
126 |     "            \"level\": \"$network.network_level\"\n",
127 |     "        }\n",
128 |     "    },\n",
129 |     "    {\n",
130 |     "        \"$group\": {\n",
131 |     "            \"_id\": \"$level\",\n",
132 |     "            \"cast\": {\"$addToSet\": \"$cast\"}\n",
133 |     "        }\n",
134 |     "    },\n",
135 |     "    project_cast,\n",
136 |     "    {\n",
137 |     "        \"$match\": {\n",
138 |     "            \"cast\": \"Woody Harrelson\"\n",
139 |     "        }\n",
140 |     "    },\n",
141 |     "    {\n",
142 |     "        \"$sort\": {\n",
143 |     "            \"_id\": 1\n",
144 |     "        }\n",
145 |     "     },\n",
146 |     "    {\n",
147 |     "        \"$project\": {\n",
148 |     "            \"_id\": 0,\n",
149 |     "            \"answer\": \"$_id\"\n",
150 |     "        }\n",
151 |     "    },\n",
152 |     "    {\n",
153 |     "        \"$limit\": 1\n",
154 |     "    }\n",
155 |     "])\n",
156 |     "\n",
157 |     "list(results)[0]\n",
158 |     "#2"
159 |    ]
160 |   }
161 |  ],
162 |  "metadata": {
163 |   "kernelspec": {
164 |    "display_name": "Python 3",
165 |    "language": "python",
166 |    "name": "python3"
167 |   },
168 |   "language_info": {
169 |    "codemirror_mode": {
170 |     "name": "ipython",
171 |     "version": 3
172 |    },
173 |    "file_extension": ".py",
174 |    "mimetype": "text/x-python",
175 |    "name": "python",
176 |    "nbconvert_exporter": "python",
177 |    "pygments_lexer": "ipython3",
178 |    "version": "3.6.5"
179 |   }
180 |  },
181 |  "nbformat": 4,
182 |  "nbformat_minor": 2
183 | }
184 | 


--------------------------------------------------------------------------------
/LessonNotes/linear-regressions-with-mongodb.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "from pandas.io.json import json_normalize\n",
 12 |     "from pymongo import MongoClient\n",
 13 |     "from sklearn import linear_model\n",
 14 |     "from sklearn.model_selection import train_test_split\n",
 15 |     "import numpy as np\n",
 16 |     "import seaborn as sns"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "metadata": {
 23 |     "collapsed": true
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 28 |     "course_client = MongoClient(course_cluster_uri)"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {
 35 |     "collapsed": true
 36 |    },
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "weather_db = course_client['100YWeatherSmall']['data']"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {
 46 |     "collapsed": true
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "weather_filter = {\n",
 51 |     "    \"$match\": {\n",
 52 |     "        \"airTemperature.value\": { \"$lt\": 900 },\n",
 53 |     "        \"dewPoint.value\": { \"$lt\": 900 },\n",
 54 |     "        \"pressure.value\": { \"$lt\": 9000 },\n",
 55 |     "    }\n",
 56 |     "}"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {
 63 |     "collapsed": true
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "weather_projection = {\n",
 68 |     "    \"$project\": {\n",
 69 |     "        \"_id\": 0,\n",
 70 |     "        \"airTemperature.value\": 1,\n",
 71 |     "        \"dewPoint.value\": 1,\n",
 72 |     "        \"pressure.value\": 1,\n",
 73 |     "    }\n",
 74 |     "}"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {
 81 |     "collapsed": true
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "sample_stage = { \"$sample\": { \"size\": 10000 } }"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {
 92 |     "collapsed": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "cursor = weather_db.aggregate([\n",
 97 |     "    weather_filter,\n",
 98 |     "    weather_projection,\n",
 99 |     "    sample_stage\n",
100 |     "])"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {
107 |     "collapsed": true
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "weather_data = list(cursor)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {
118 |     "collapsed": true
119 |    },
120 |    "outputs": [],
121 |    "source": [
122 |     "weather_data[0]"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {
129 |     "collapsed": true
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "df = json_normalize(weather_data)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {
140 |     "collapsed": true
141 |    },
142 |    "outputs": [],
143 |    "source": [
144 |     "df.head()"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {
151 |     "collapsed": true
152 |    },
153 |    "outputs": [],
154 |    "source": [
155 |     "%matplotlib inline"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {
162 |     "collapsed": true
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "sns.pairplot(df)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "collapsed": true
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "df_x = df.drop(['airTemperature.value'], axis=1)"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {
184 |     "collapsed": true
185 |    },
186 |    "outputs": [],
187 |    "source": [
188 |     "df_y = df['airTemperature.value']"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "reg = linear_model.LinearRegression()"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {
217 |     "collapsed": true
218 |    },
219 |    "outputs": [],
220 |    "source": [
221 |     "reg.fit(x_train, y_train)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {
228 |     "collapsed": true
229 |    },
230 |    "outputs": [],
231 |    "source": [
232 |     "reg.coef_"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": null,
238 |    "metadata": {
239 |     "collapsed": true
240 |    },
241 |    "outputs": [],
242 |    "source": [
243 |     "reg.intercept_"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "reg.predict(x_test)"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {
261 |     "collapsed": true
262 |    },
263 |    "outputs": [],
264 |    "source": [
265 |     "np.mean((reg.predict(x_test) - y_test)**2)"
266 |    ]
267 |   }
268 |  ],
269 |  "metadata": {
270 |   "kernelspec": {
271 |    "display_name": "Python 3",
272 |    "language": "python",
273 |    "name": "python3"
274 |   },
275 |   "language_info": {
276 |    "codemirror_mode": {
277 |     "name": "ipython",
278 |     "version": 3
279 |    },
280 |    "file_extension": ".py",
281 |    "mimetype": "text/x-python",
282 |    "name": "python",
283 |    "nbconvert_exporter": "python",
284 |    "pygments_lexer": "ipython3",
285 |    "version": "3.6.5"
286 |   }
287 |  },
288 |  "nbformat": 4,
289 |  "nbformat_minor": 2
290 | }
291 | 


--------------------------------------------------------------------------------
/Assignments/lookup_lab.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 2,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 19 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "routes = course_client['aggregations']['air_routes']"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "# Lab: Using ``$lookup``\n",
 36 |     "\n",
 37 |     "## For this lab, you'll be using the ``$lookup``.\n",
 38 |     "\n",
 39 |     "#### The dataset for this lab can be downloaded by clicking the following links - [air_alliances](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/air_alliances.json), [air_routes](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/air_routes.json) - for upload to your own cluster."
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "### Question\n",
 47 |     "\n",
 48 |     "Which alliance from ``air_alliances`` flies the most **routes** with either a\n",
 49 |     "Boeing 747 or an Airbus A380 (abbreviated 747 and 380 in ``air_routes``)?\n",
 50 |     "\n",
 51 |     "**Note**: Begin from the ``air_routes`` collection!"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 4,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# predicate is given this lab\n",
 61 |     "predicate = {\n",
 62 |     "  \"$match\": {\n",
 63 |     "      \"airplane\": {\"$regex\": \"747|380\"}\n",
 64 |     "  }\n",
 65 |     "}"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 5,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "lookup = {\n",
 75 |     "    \"$lookup\": {\n",
 76 |     "        \"from\": 'air_alliances',\n",
 77 |     "        \"localField\": \"airline.name\",\n",
 78 |     "        \"foreignField\": \"airlines\",\n",
 79 |     "        \"as\": \"alliance\"\n",
 80 |     "    }\n",
 81 |     "}"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 6,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "unwinding = {\n",
 91 |     "    \"$unwind\": \"$alliance\"\n",
 92 |     "}"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 7,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "grouping = {\n",
102 |     "    \"$group\": {\n",
103 |     "        \"_id\": \"$alliance\",\n",
104 |     "        \"count\": { \"$sum\": 1 }\n",
105 |     "    } \n",
106 |     "}"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 8,
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "sorting = {\n",
116 |     "    \"$sort\": {\n",
117 |     "        \"count\": -1\n",
118 |     "    }\n",
119 |     "}"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 9,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "pipeline = [\n",
129 |     "    predicate,\n",
130 |     "    lookup,\n",
131 |     "    unwinding,\n",
132 |     "    grouping,\n",
133 |     "    sorting\n",
134 |     "]"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 10,
140 |    "metadata": {},
141 |    "outputs": [
142 |     {
143 |      "data": {
144 |       "text/plain": [
145 |        "[{'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9c'),\n",
146 |        "   'name': 'SkyTeam',\n",
147 |        "   'airlines': ['Aeroflot',\n",
148 |        "    'Aerolinias Argentinas',\n",
149 |        "    'Aeromexico',\n",
150 |        "    'Air Europa',\n",
151 |        "    'Air France',\n",
152 |        "    'Alitalia',\n",
153 |        "    'China Airlines',\n",
154 |        "    'China Eastern Airlines',\n",
155 |        "    'China Southern Airlines',\n",
156 |        "    'Czech Airlines',\n",
157 |        "    'Delta Air Lines',\n",
158 |        "    'Garuda Indonesia',\n",
159 |        "    'Kenya Airways',\n",
160 |        "    'KLM',\n",
161 |        "    'Korean Air',\n",
162 |        "    'Middle East Airlines',\n",
163 |        "    'Saudia',\n",
164 |        "    'TAROM',\n",
165 |        "    'Vetnam Airlines',\n",
166 |        "    'Xiamen Airlines']},\n",
167 |        "  'count': 16},\n",
168 |        " {'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9b'),\n",
169 |        "   'name': 'Star Alliance',\n",
170 |        "   'airlines': ['Air Canada',\n",
171 |        "    'Adria Airways',\n",
172 |        "    'Avianca',\n",
173 |        "    'Scandinavian Airlines',\n",
174 |        "    'All Nippon Airways',\n",
175 |        "    'Brussels Airlines',\n",
176 |        "    'Shenzhen Airlines',\n",
177 |        "    'Air China',\n",
178 |        "    'Air New Zealand',\n",
179 |        "    'Asiana Airlines',\n",
180 |        "    'Brussels Airlines',\n",
181 |        "    'Copa Airlines',\n",
182 |        "    'Croatia Airlines',\n",
183 |        "    'EgyptAir',\n",
184 |        "    'TAP Portugal',\n",
185 |        "    'United Airlines',\n",
186 |        "    'Turkish Airlines',\n",
187 |        "    'Swiss International Air Lines',\n",
188 |        "    'Lufthansa',\n",
189 |        "    'EVA Air',\n",
190 |        "    'South African Airways',\n",
191 |        "    'Singapore Airlines']},\n",
192 |        "  'count': 11},\n",
193 |        " {'_id': {'_id': ObjectId('5980bef9a39d0ba3c650ae9d'),\n",
194 |        "   'name': 'OneWorld',\n",
195 |        "   'airlines': ['Air Berlin',\n",
196 |        "    'American Airlines',\n",
197 |        "    'British Airways',\n",
198 |        "    'Cathay Pacific',\n",
199 |        "    'Finnair',\n",
200 |        "    'Iberia Airlines',\n",
201 |        "    'Japan Airlines',\n",
202 |        "    'LATAM Chile',\n",
203 |        "    'LATAM Brasil',\n",
204 |        "    'Malasya Airlines',\n",
205 |        "    'Canadian Airlines',\n",
206 |        "    'Quantas',\n",
207 |        "    'Qatar Airways',\n",
208 |        "    'Royal Jordainian',\n",
209 |        "    'SriLanka Airlines',\n",
210 |        "    'S7 Airlines']},\n",
211 |        "  'count': 11}]"
212 |       ]
213 |      },
214 |      "metadata": {},
215 |      "output_type": "display_data"
216 |     }
217 |    ],
218 |    "source": [
219 |     "display(list(routes.aggregate(pipeline)))\n",
220 |     "#SkyTeam, 16"
221 |    ]
222 |   }
223 |  ],
224 |  "metadata": {
225 |   "kernelspec": {
226 |    "display_name": "Python 3",
227 |    "language": "python",
228 |    "name": "python3"
229 |   },
230 |   "language_info": {
231 |    "codemirror_mode": {
232 |     "name": "ipython",
233 |     "version": 3
234 |    },
235 |    "file_extension": ".py",
236 |    "mimetype": "text/x-python",
237 |    "name": "python",
238 |    "nbconvert_exporter": "python",
239 |    "pygments_lexer": "ipython3",
240 |    "version": "3.6.5"
241 |   }
242 |  },
243 |  "nbformat": 4,
244 |  "nbformat_minor": 2
245 | }
246 | 


--------------------------------------------------------------------------------
/LessonNotes/migrating-schema-lesson.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "from pymongo import MongoClient\n",
 12 |     "from bson.objectid import ObjectId\n",
 13 |     "from bson.decimal128 import Decimal128\n",
 14 |     "import json"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "class JSONEncoder(json.JSONEncoder):\n",
 26 |     "    def default(self, o):\n",
 27 |     "        if isinstance(o, ObjectId) or isinstance(o, Decimal128):\n",
 28 |     "            return str(o)\n",
 29 |     "        return json.JSONEncoder.default(self, o)"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "## Data source\n",
 37 |     "\n",
 38 |     "If you do not change the data uri (*course_cluster_uri*), you can execute most\n",
 39 |     "of this notebook, however you will not be able to write to the database.\n",
 40 |     "\n",
 41 |     "To execute successfully the pipelines with an $out/save stage in this notebook,\n",
 42 |     "point to your own Atlas cluster into which you will have imported the *retail.csv* dataset.\n"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "collapsed": true
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 54 |     "course_client = MongoClient(course_cluster_uri)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {
 61 |     "collapsed": true
 62 |    },
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "retail_col = course_client['coursera-agg']['retail']"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {
 72 |     "collapsed": true
 73 |    },
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "assemble = {\n",
 77 |     "    \"$group\": {\n",
 78 |     "        \"_id\": {\n",
 79 |     "            \"InvoiceNo\": \"$InvoiceNo\",\n",
 80 |     "            \"CustomerID\": \"$CustomerID\",\n",
 81 |     "            \"Country\": \"$Country\"\n",
 82 |     "        },\n",
 83 |     "        \"InvoiceDate\": { \"$max\": \"$InvoiceDate\" },\n",
 84 |     "        \"Items\": {\n",
 85 |     "            \"$push\": {\n",
 86 |     "                \"StockCode\": \"$StockCode\",\n",
 87 |     "                \"Description\": \"$Description\",\n",
 88 |     "                \"Quantity\": \"$Quantity\",\n",
 89 |     "                \"UnitPrice\": \"$UnitPrice\"\n",
 90 |     "            }\n",
 91 |     "        }\n",
 92 |     "    }\n",
 93 |     "}"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {
100 |     "collapsed": true
101 |    },
102 |    "outputs": [],
103 |    "source": [
104 |     "beautify = {\n",
105 |     "    \"$project\": {\n",
106 |     "        \"_id\": \"$_id.InvoiceNo\",\n",
107 |     "        \"InvoiceDate\": \"$_id.InvoiceDate\",\n",
108 |     "        \"CustomerID\": \"$_id.CustomerID\",\n",
109 |     "        \"Country\": \"$_id.Country\",\n",
110 |     "        \"Items\": 1\n",
111 |     "    }\n",
112 |     "}"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {
119 |     "collapsed": true
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "cursor = retail_col.aggregate([\n",
124 |     "    assemble,\n",
125 |     "    beautify\n",
126 |     "  ],\n",
127 |     "  allowDiskUse=True)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {
134 |     "collapsed": true
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "retail_doc = cursor.next()"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {
145 |     "collapsed": true
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "print(json.dumps(retail_doc, cls=JSONEncoder, indent=4))"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "computed = {\n",
161 |     "    \"$addFields\" : {\n",
162 |     "        \"TotalPrice\": {\n",
163 |     "            \"$reduce\": {\n",
164 |     "                \"input\": \"$Items\",\n",
165 |     "                \"initialValue\": Decimal128(\"0.00\"),\n",
166 |     "                \"in\": {\n",
167 |     "                    \"$add\": [\n",
168 |     "                        \"$$value\",\n",
169 |     "                        { \"$multiply\": [ \"$$this.Quantity\", \"$$this.UnitPrice\" ] }\n",
170 |     "                    ]\n",
171 |     "                }\n",
172 |     "            }\n",
173 |     "        }\n",
174 |     "    }\n",
175 |     "}"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {
182 |     "collapsed": true
183 |    },
184 |    "outputs": [],
185 |    "source": [
186 |     "cursor = retail_col.aggregate([\n",
187 |     "    assemble,\n",
188 |     "    beautify,\n",
189 |     "    computed\n",
190 |     "  ],\n",
191 |     "  allowDiskUse=True)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {
198 |     "collapsed": true
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "retail_doc = cursor.next()"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "print(json.dumps(retail_doc, cls=JSONEncoder, indent=4))"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {
220 |     "collapsed": true
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "save = {\n",
225 |     "    \"$out\": \"orders_new\"\n",
226 |     "}"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "The following cell will **fail if you are not pointing** to your own Atlas group\n",
234 |     "where you have write privileges to the target collection"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {
241 |     "collapsed": true
242 |    },
243 |    "outputs": [],
244 |    "source": [
245 |     "cursor = retail_col.aggregate([\n",
246 |     "    assemble,\n",
247 |     "    beautify,\n",
248 |     "    computed,\n",
249 |     "    save\n",
250 |     "  ],\n",
251 |     "  allowDiskUse=True)"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "metadata": {
258 |     "collapsed": true
259 |    },
260 |    "outputs": [],
261 |    "source": [
262 |     "assemble = {\n",
263 |     "    \"$group\": {\n",
264 |     "        \"_id\": {\n",
265 |     "            \"InvoiceNo\": \"$InvoiceNo\",\n",
266 |     "            \"CustomerID\": \"$CustomerID\",\n",
267 |     "            \"Country\": \"$Country\",\n",
268 |     "            \"InvoiceDate\": { \"$max\": \"$InvoiceDate\" },\n",
269 |     "        },\n",
270 |     "        \"Items\": {\n",
271 |     "            \"$push\": {\n",
272 |     "                \"StockCode\": \"$StockCode\",\n",
273 |     "                \"Description\": \"$Description\",\n",
274 |     "                \"Quantity\": \"$Quantity\",\n",
275 |     "                \"UnitPrice\": \"$UnitPrice\"\n",
276 |     "            }\n",
277 |     "        }\n",
278 |     "    }\n",
279 |     "}"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "The following cell will show the expected error message of trying to build\n",
287 |     "an index on *_id*, if you are pointing to your own Atlas cluster where you\n",
288 |     "have write privileges"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {
295 |     "collapsed": true
296 |    },
297 |    "outputs": [],
298 |    "source": [
299 |     "cursor = retail_col.aggregate([\n",
300 |     "    assemble,\n",
301 |     "    beautify,\n",
302 |     "    computed,\n",
303 |     "    save\n",
304 |     "  ],\n",
305 |     "  allowDiskUse=True)"
306 |    ]
307 |   }
308 |  ],
309 |  "metadata": {
310 |   "kernelspec": {
311 |    "display_name": "Python 3",
312 |    "language": "python",
313 |    "name": "python3"
314 |   },
315 |   "language_info": {
316 |    "codemirror_mode": {
317 |     "name": "ipython",
318 |     "version": 3
319 |    },
320 |    "file_extension": ".py",
321 |    "mimetype": "text/x-python",
322 |    "name": "python",
323 |    "nbconvert_exporter": "python",
324 |    "pygments_lexer": "ipython3",
325 |    "version": "3.6.5"
326 |   }
327 |  },
328 |  "nbformat": 4,
329 |  "nbformat_minor": 2
330 | }
331 | 


--------------------------------------------------------------------------------
/LessonNotes/principal-component-analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pandas as pd\n",
 12 |     "import numpy as np\n",
 13 |     "import matplotlib.pyplot as plt\n",
 14 |     "\n",
 15 |     "from pandas.io.json import json_normalize\n",
 16 |     "from pymongo import MongoClient\n",
 17 |     "from sklearn import preprocessing\n",
 18 |     "from sklearn.model_selection import  train_test_split\n",
 19 |     "from sklearn.linear_model import LogisticRegression\n",
 20 |     "\n",
 21 |     "%matplotlib inline"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {
 28 |     "collapsed": true
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 33 |     "course_client = MongoClient(course_cluster_uri)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {
 40 |     "collapsed": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.\n",
 45 |     "wine = course_client['coursera-agg']['wine']"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {
 52 |     "collapsed": true
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "pipeline = [\n",
 57 |     "    {\n",
 58 |     "        \"$project\": {\n",
 59 |     "            \"_id\": 0\n",
 60 |     "        }\n",
 61 |     "    }\n",
 62 |     "]"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {
 69 |     "collapsed": true
 70 |    },
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "cursor = wine.aggregate(pipeline)\n",
 74 |     "docs = list(cursor)\n",
 75 |     "df = json_normalize(docs)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "df.head()"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "X = df.drop(['Alcohol'], axis=1).values.astype('float64')"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {
102 |     "collapsed": true
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "X = preprocessing.scale(X)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {
113 |     "collapsed": true
114 |    },
115 |    "outputs": [],
116 |    "source": [
117 |     "cov_matrix = np.cov(X.T)"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {
124 |     "collapsed": true
125 |    },
126 |    "outputs": [],
127 |    "source": [
128 |     "eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "for val in eigenvalues:\n",
138 |     "    print(val)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {
145 |     "collapsed": true
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "eigen_map = list(zip(eigenvalues, eigenvectors.T))"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "eigen_map.sort(key=lambda x: x[0], reverse=True)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "sorted_eigenvalues = [pair[0] for pair in eigen_map]\n",
172 |     "sorted_eigenvectors = [pair[1] for pair in eigen_map]"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "sorted_eigenvalues"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "print(pd.DataFrame(sorted_eigenvectors, columns=df.drop(['Alcohol'], axis=1).columns))"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "collapsed": true
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "eigenvalue_sum = sum(eigenvalues)\n",
202 |     "var_exp = [(v / eigenvalue_sum)*100 for v in sorted_eigenvalues]\n",
203 |     "cum_var_exp = np.cumsum(var_exp)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {
210 |     "collapsed": true
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "dims = len(df.drop(['Alcohol'], axis=1).columns)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "plt.clf()\n",
224 |     "fig, ax = plt.subplots()\n",
225 |     "\n",
226 |     "ax.plot(range(dims), cum_var_exp, '-o')\n",
227 |     "\n",
228 |     "plt.xlabel('Number of Components')\n",
229 |     "plt.ylabel('Percent of Variance Explained')\n",
230 |     "\n",
231 |     "plt.show()"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "collapsed": true
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "ev1 = sorted_eigenvectors[0]\n",
243 |     "ev2 = sorted_eigenvectors[1]"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "eigen_matrix = np.hstack((ev1.reshape(dims,1), ev2.reshape(dims,1)))"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "eigen_matrix"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {
270 |     "collapsed": true
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "Y = X.dot(eigen_matrix)"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "plt.clf()\n",
284 |     "fig, ax = plt.subplots()\n",
285 |     "ax.scatter(Y.T[0], Y.T[1], alpha=0.2)\n",
286 |     "plt.xlabel('PC1')\n",
287 |     "plt.ylabel('PC2')\n",
288 |     "plt.show()"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {
295 |     "collapsed": true
296 |    },
297 |    "outputs": [],
298 |    "source": [
299 |     "from sklearn.decomposition import PCA\n",
300 |     "pca = PCA(n_components=2)\n",
301 |     "Y_sklearn = pca.fit_transform(X)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": null,
307 |    "metadata": {},
308 |    "outputs": [],
309 |    "source": [
310 |     "plt.clf()\n",
311 |     "fig, ax = plt.subplots()\n",
312 |     "ax.scatter(Y_sklearn.T[0], Y_sklearn.T[1], alpha=0.2)\n",
313 |     "plt.xlabel('PC1')\n",
314 |     "plt.ylabel('PC2')\n",
315 |     "plt.show()"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": null,
321 |    "metadata": {
322 |     "collapsed": true
323 |    },
324 |    "outputs": [],
325 |    "source": [
326 |     "y = df['Alcohol'].values"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {
333 |     "collapsed": true
334 |    },
335 |    "outputs": [],
336 |    "source": [
337 |     "# Let's split the model for training and testing, and use a logistic regression\n",
338 |     "X_train, X_test, y_train, y_test = train_test_split(df.drop('Alcohol', axis=1), y, test_size=0.25)"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": null,
344 |    "metadata": {
345 |     "collapsed": true
346 |    },
347 |    "outputs": [],
348 |    "source": [
349 |     "classifier = LogisticRegression(random_state=0)"
350 |    ]
351 |   },
352 |   {
353 |    "cell_type": "code",
354 |    "execution_count": null,
355 |    "metadata": {},
356 |    "outputs": [],
357 |    "source": [
358 |     "classifier.fit(X_train, y_train)"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "metadata": {},
365 |    "outputs": [],
366 |    "source": [
367 |     "y_pred = classifier.score(X_test, y_test)\n",
368 |     "y_pred"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "metadata": {
375 |     "collapsed": true
376 |    },
377 |    "outputs": [],
378 |    "source": [
379 |     "# now with PCA applied\n",
380 |     "X_train, X_test, y_train, y_test = train_test_split(Y_sklearn, y, test_size=0.3)"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": null,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "classifier_with_pca = LogisticRegression(random_state=0)\n",
390 |     "classifier_with_pca.fit(X_train, y_train)"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "y_pred = classifier_with_pca.score(X_test, y_test)\n",
400 |     "y_pred"
401 |    ]
402 |   }
403 |  ],
404 |  "metadata": {
405 |   "kernelspec": {
406 |    "display_name": "Python 3",
407 |    "language": "python",
408 |    "name": "python3"
409 |   },
410 |   "language_info": {
411 |    "codemirror_mode": {
412 |     "name": "ipython",
413 |     "version": 3
414 |    },
415 |    "file_extension": ".py",
416 |    "mimetype": "text/x-python",
417 |    "name": "python",
418 |    "nbconvert_exporter": "python",
419 |    "pygments_lexer": "ipython3",
420 |    "version": "3.6.5"
421 |   }
422 |  },
423 |  "nbformat": 4,
424 |  "nbformat_minor": 2
425 | }
426 | 


--------------------------------------------------------------------------------
/Assignments/expressions_with_project.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pymongo\n",
 10 |     "import json"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 20 |     "course_client = pymongo.MongoClient(course_cluster_uri)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 3,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "movies = course_client['aggregations']['movies']"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "# Lab: Expression  Composition\n",
 37 |     "\n",
 38 |     "## For this lab, you'll be composing expressions together \n",
 39 |     "\n",
 40 |     "#### The dataset for this lab can be downloaded [here](https://s3.amazonaws.com/edu-static.mongodb.com/lessons/coursera/aggregation/movies.json) for upload to your own cluster."
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "### Prelude\n",
 48 |     "\n",
 49 |     "This lab will have you work with data within arrays, a common operation.\n",
 50 |     "\n",
 51 |     "Specifically, one of the arrays you'll work with is ``writers``, from the\n",
 52 |     "**movies** collection.\n",
 53 |     "\n",
 54 |     "There are times when we want to make sure that the field is an array, and that\n",
 55 |     "it is not empty. We can do this within ``$match``\n",
 56 |     "\n",
 57 |     "  `{ \"$match\": { \"writers\": { \"$elemMatch\": { \"$exists\": true } } }`\n",
 58 |     "\n",
 59 |     "However, the entries within ``writers`` presents another problem. A good amount\n",
 60 |     "of entries in ``writers`` look something like the following, where the writer is\n",
 61 |     "attributed with their specific contribution ::\n",
 62 |     "\n",
 63 |     "  `\"writers\" : [ \"Vincenzo Cerami (story)\", \"Roberto Benigni (story)\" ]`\n",
 64 |     "\n",
 65 |     "But the writer also appears in the ``cast`` array as \"Roberto Benigni\"!\n",
 66 |     "\n",
 67 |     "Give it a look with the following query"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 4,
 73 |    "metadata": {},
 74 |    "outputs": [
 75 |     {
 76 |      "name": "stdout",
 77 |      "output_type": "stream",
 78 |      "text": [
 79 |       "{\n",
 80 |       "    \"cast\": [\n",
 81 |       "        \"Roberto Benigni\",\n",
 82 |       "        \"Nicoletta Braschi\",\n",
 83 |       "        \"Giustino Durano\",\n",
 84 |       "        \"Giorgio Cantarini\"\n",
 85 |       "    ],\n",
 86 |       "    \"writers\": [\n",
 87 |       "        \"Vincenzo Cerami (story)\",\n",
 88 |       "        \"Roberto Benigni (story)\"\n",
 89 |       "    ]\n",
 90 |       "}\n"
 91 |      ]
 92 |     }
 93 |    ],
 94 |    "source": [
 95 |     "result = movies.find_one({\"title\": \"Life Is Beautiful\"}, { \"_id\": 0, \"cast\": 1, \"writers\": 1})\n",
 96 |     "print(json.dumps(result, indent=4))"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "This presents a problem, since comparing ``\"Roberto Benigni\"`` to\n",
104 |     "``\"Roberto Benigni (story)\"`` will definitely result in a difference.\n",
105 |     "\n",
106 |     "Thankfully there is a powerful expression to help us, ``$map``. ``$map`` lets us\n",
107 |     "iterate over an array, element by element, performing some transformation on\n",
108 |     "each element. The result of that transformation will be returned in the same\n",
109 |     "place as the original element.\n",
110 |     "\n",
111 |     "Within ``$map``, the argument to ``input`` can be any expression as long as it\n",
112 |     "resolves to an array. The argument to ``as`` is the name we want to use to refer\n",
113 |     "to each element of the array when performing whatever logic we want, surrounding\n",
114 |     "it with quotes and prepending two `$` signs. The field ``as`` is optional, and if omitted\n",
115 |     "each element must be referred to as ``\"$$this\"``\n",
116 |     "\n",
117 |     "      \"writers\": {\n",
118 |     "        \"$map\": {\n",
119 |     "          \"input\": \"$writers\",\n",
120 |     "          \"as\": \"writer\",\n",
121 |     "          \"in\": \"$$writer\"\n",
122 |     "\n",
123 |     "\n",
124 |     "``in`` is where the work is peformed. Here, we use the ``$arrayElemAt``\n",
125 |     "expression, which takes two arguments, the array and the index of the element we\n",
126 |     "want. We use the ``$split`` expression, splitting the values on ``\" (\"``.\n",
127 |     "\n",
128 |     "If the string did not contain the pattern specified, the only modification is it\n",
129 |     "is wrapped in an array, so ``$arrayElemAt`` will always work\n",
130 |     "\n",
131 |     "      \"writers\": \"$map\": {\n",
132 |     "        \"input\": \"$writers\",\n",
133 |     "        \"as\": \"writer\",\n",
134 |     "        \"in\": {\n",
135 |     "          \"$arrayElemAt\": [\n",
136 |     "            {\n",
137 |     "              \"$split\": [ \"$$writer\", \" (\" ]\n",
138 |     "            },\n",
139 |     "            0\n",
140 |     "          ]\n",
141 |     "        }\n",
142 |     "      }\n",
143 |     "      \n",
144 |     "Let's see it in action to get a full sense of what it does."
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 5,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "# this stage is provided for you, use it later as well\n",
154 |     "mapping = {\n",
155 |     "    \"$project\": {\n",
156 |     "        \"_id\": 0,\n",
157 |     "        \"cast\": 1,\n",
158 |     "        \"directors\": 1,\n",
159 |     "        \"writers\": {\n",
160 |     "            \"$map\": {\n",
161 |     "                \"input\": \"$writers\",\n",
162 |     "                \"as\": \"writer\",\n",
163 |     "                \"in\": {\n",
164 |     "                    \"$arrayElemAt\": [\n",
165 |     "                        { \"$split\": [\"$$writer\", \" (\"] },\n",
166 |     "                        0\n",
167 |     "                    ]\n",
168 |     "                }\n",
169 |     "            }\n",
170 |     "        }\n",
171 |     "    }\n",
172 |     "}"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 6,
178 |    "metadata": {},
179 |    "outputs": [
180 |     {
181 |      "name": "stdout",
182 |      "output_type": "stream",
183 |      "text": [
184 |       "[\n",
185 |       "    {\n",
186 |       "        \"cast\": [\n",
187 |       "            \"Roberto Benigni\",\n",
188 |       "            \"Nicoletta Braschi\",\n",
189 |       "            \"Giustino Durano\",\n",
190 |       "            \"Giorgio Cantarini\"\n",
191 |       "        ],\n",
192 |       "        \"directors\": [\n",
193 |       "            \"Roberto Benigni\"\n",
194 |       "        ],\n",
195 |       "        \"writers\": [\n",
196 |       "            \"Vincenzo Cerami\",\n",
197 |       "            \"Roberto Benigni\"\n",
198 |       "        ]\n",
199 |       "    }\n",
200 |       "]\n"
201 |      ]
202 |     }
203 |    ],
204 |    "source": [
205 |     "\n",
206 |     "result = movies.aggregate([\n",
207 |     "    {\n",
208 |     "        \"$match\": {\"title\": \"Life Is Beautiful\"}\n",
209 |     "    },\n",
210 |     "    mapping\n",
211 |     "])\n",
212 |     "print(json.dumps(list(result), indent=4))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "## Question\n",
220 |     "\n",
221 |     "Let's find how many movies in our **movies** collection are a \"labor of love\",\n",
222 |     "where the same person appears in ``cast``, ``directors``, and ``writers``\n",
223 |     "\n",
224 |     "\n",
225 |     "How many movies are \"labors of love\"?\n",
226 |     "\n",
227 |     "To get a count, ensure you add the following to the end of your pipeline list."
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 7,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "counting = {\n",
237 |     "    \"$count\": \"labors_of_love\"\n",
238 |     "}"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "The necessary mapping stage is provided for you."
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 8,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "mapping = {\n",
255 |     "    \"$project\": {\n",
256 |     "        \"_id\": 0,\n",
257 |     "        \"cast\": 1,\n",
258 |     "        \"directors\": 1,\n",
259 |     "        \"writers\": {\n",
260 |     "            \"$map\": {\n",
261 |     "                \"input\": \"$writers\",\n",
262 |     "                \"as\": \"writer\",\n",
263 |     "                \"in\": {\n",
264 |     "                    \"$arrayElemAt\": [\n",
265 |     "                        { \"$split\": [\"$$writer\", \" (\"] },\n",
266 |     "                        0\n",
267 |     "                    ]\n",
268 |     "                }\n",
269 |     "            }\n",
270 |     "        }\n",
271 |     "    }\n",
272 |     "}"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 24,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "#Filter documents that have all 3 fields: cast, directors, writers\n",
282 |     "predicate = {\n",
283 |     "    \"$match\": { \n",
284 |     "        \"cast\": { \"$elemMatch\": { \"$exists\": True } }, \n",
285 |     "        \"directors\": { \"$elemMatch\": { \"$exists\": True } },\n",
286 |     "        \"writers\": { \"$elemMatch\": { \"$exists\": True } }\n",
287 |     "    }\n",
288 |     "}"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 20,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "#Select only the 3 fields needed\n",
298 |     "projection = {\n",
299 |     "    \"$project\": {\n",
300 |     "        \"cast\": 1,\n",
301 |     "        \"directors\": 1,\n",
302 |     "        \"writers\": 1,\n",
303 |     "        \"labors_of_love\": { \"$setIntersection\" : [ \"$cast\", \"$directors\", \"$writers\" ] }\n",
304 |     "    }\n",
305 |     "}"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 18,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "#Obtain those where at least one member appears in the 3 groups using a set\n",
315 |     "matching = {\n",
316 |     "    \"$match\": {  \"labors_of_love\": { \"$elemMatch\": { \"$exists\": True } } }\n",
317 |     "}"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": 25,
323 |    "metadata": {},
324 |    "outputs": [
325 |     {
326 |      "data": {
327 |       "text/plain": [
328 |        "[{'labors_of_love': 1597}]"
329 |       ]
330 |      },
331 |      "metadata": {},
332 |      "output_type": "display_data"
333 |     }
334 |    ],
335 |    "source": [
336 |     "pipeline = [\n",
337 |     "    predicate,\n",
338 |     "    mapping,\n",
339 |     "    projection,\n",
340 |     "    matching,\n",
341 |     "    counting\n",
342 |     "]\n",
343 |     "\n",
344 |     "display(list(movies.aggregate(pipeline)))\n",
345 |     "\n",
346 |     "#1597"
347 |    ]
348 |   }
349 |  ],
350 |  "metadata": {
351 |   "kernelspec": {
352 |    "display_name": "Python 3",
353 |    "language": "python",
354 |    "name": "python3"
355 |   },
356 |   "language_info": {
357 |    "codemirror_mode": {
358 |     "name": "ipython",
359 |     "version": 3
360 |    },
361 |    "file_extension": ".py",
362 |    "mimetype": "text/x-python",
363 |    "name": "python",
364 |    "nbconvert_exporter": "python",
365 |    "pygments_lexer": "ipython3",
366 |    "version": "3.6.5"
367 |   }
368 |  },
369 |  "nbformat": 4,
370 |  "nbformat_minor": 2
371 | }
372 | 


--------------------------------------------------------------------------------
/Assignments/linear-regression-on-titanic-data-set.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from pandas.io.json import json_normalize\n",
 10 |     "from pymongo import MongoClient\n",
 11 |     "from sklearn import linear_model\n",
 12 |     "from sklearn.model_selection import train_test_split\n",
 13 |     "from sklearn.metrics import mean_squared_error\n",
 14 |     "import numpy as np\n",
 15 |     "import pprint"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 2,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 25 |     "course_client = MongoClient(course_cluster_uri)"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "titanic = course_client['coursera-agg']['titanic']"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 4,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "# Replace {} with a stage to determine the possible values for gender.\n",
 44 |     "unique_gender_stage = {\n",
 45 |     "    \"$group\": {\n",
 46 |     "        \"_id\": \"$gender\"\n",
 47 |     "    }\n",
 48 |     "}"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 5,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "possible_gender_values = titanic.aggregate([\n",
 58 |     "    {\n",
 59 |     "        \"$match\": {\n",
 60 |     "            \"age\": {\"$type\": \"number\"},\n",
 61 |     "            \"point_of_embarkation\": {\"$ne\": \"\"}\n",
 62 |     "        }\n",
 63 |     "    },\n",
 64 |     "    unique_gender_stage\n",
 65 |     "])"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 6,
 71 |    "metadata": {},
 72 |    "outputs": [
 73 |     {
 74 |      "name": "stdout",
 75 |      "output_type": "stream",
 76 |      "text": [
 77 |       "[{'_id': 'female'}, {'_id': 'male'}]\n"
 78 |      ]
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "# Print the distinct list of values for the gender field\n",
 83 |     "pprint.pprint(list(possible_gender_values))"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 7,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "# Replace {} with a stage to determine the possible values for point_of_embarkation\n",
 93 |     "unique_point_of_embarkation_stage = {\n",
 94 |     "    \"$group\": {\n",
 95 |     "        \"_id\": \"$point_of_embarkation\"\n",
 96 |     "    }\n",
 97 |     "}"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 8,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "possible_point_of_embarkation_values = titanic.aggregate([\n",
107 |     "    {\n",
108 |     "        \"$match\": {\n",
109 |     "            \"age\": {\"$type\": \"number\"},\n",
110 |     "            \"point_of_embarkation\": {\"$ne\": \"\"}\n",
111 |     "        }\n",
112 |     "    },\n",
113 |     "    unique_point_of_embarkation_stage\n",
114 |     "])"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 9,
120 |    "metadata": {},
121 |    "outputs": [
122 |     {
123 |      "name": "stdout",
124 |      "output_type": "stream",
125 |      "text": [
126 |       "[{'_id': 'Q'}, {'_id': 'C'}, {'_id': 'S'}]\n"
127 |      ]
128 |     }
129 |    ],
130 |    "source": [
131 |     "# Print the distinct list of values for the point_of_embarkation field\n",
132 |     "pprint.pprint(list(possible_point_of_embarkation_values))"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 14,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# Given the possible values for point_of_embarkation and gender replace {} with a stage that\n",
142 |     "# will convert those field values to an integer.\n",
143 |     "# e.g., For the gender field convert 'female' to 0 and 'male' to 1\n",
144 |     "gender_and_point_of_embarkation_conversion_stage = {\n",
145 |     "    \"$addFields\": {\n",
146 |     "        \"gender\": {\n",
147 |     "            \"$switch\": {\n",
148 |     "                \"branches\": [\n",
149 |     "                    { \"case\": { \"$eq\": [\"$gender\", \"female\"] }, \"then\": 0 },\n",
150 |     "                    { \"case\": { \"$eq\": [\"$gender\", \"male\"] }, \"then\": 1 }\n",
151 |     "                ]\n",
152 |     "            }\n",
153 |     "        },\n",
154 |     "        \"point_of_embarkation\": {\n",
155 |     "            \"$switch\": {\n",
156 |     "                \"branches\": [\n",
157 |     "                    { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"C\"] }, \"then\": 0 },\n",
158 |     "                    { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"Q\"] }, \"then\": 1 },\n",
159 |     "                    { \"case\": { \"$eq\": [\"$point_of_embarkation\", \"S\"] }, \"then\": 2 }\n",
160 |     "                ]\n",
161 |     "            }\n",
162 |     "        }\n",
163 |     "    }\n",
164 |     "}"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 15,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "cursor = titanic.aggregate([\n",
174 |     "    {\n",
175 |     "        \"$match\": {\n",
176 |     "            \"age\": {\"$type\": \"number\"},\n",
177 |     "            \"point_of_embarkation\": {\"$ne\": \"\"}\n",
178 |     "        }\n",
179 |     "    },\n",
180 |     "    gender_and_point_of_embarkation_conversion_stage,\n",
181 |     "    {\n",
182 |     "        \"$project\": {\n",
183 |     "            \"_id\": 0,\n",
184 |     "            \"ticket_number\": 0,\n",
185 |     "            \"name\": 0,\n",
186 |     "            \"passenger_id\": 0,\n",
187 |     "            \"cabin\": 0\n",
188 |     "        }\n",
189 |     "    }\n",
190 |     "])"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 16,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "# Exhaust our cursor into a list\n",
200 |     "titanic_data = list(cursor)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 17,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "# Load our dataset into a DataFrame\n",
210 |     "df = json_normalize(titanic_data)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 18,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "# Pull out the survived column (only the data we want to correlate against)\n",
220 |     "df_x = df.drop(['survived'], axis=1)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 19,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "# Only the survived column (the value we want to predict)\n",
230 |     "df_y = df['survived']"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 20,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "# Create a Least Squares Linear Regression object\n",
240 |     "reg = linear_model.LinearRegression()"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 21,
246 |    "metadata": {},
247 |    "outputs": [],
248 |    "source": [
249 |     "# Split our dataset into a training set (80%) and a test set (20%)\n",
250 |     "x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 22,
256 |    "metadata": {},
257 |    "outputs": [
258 |     {
259 |      "data": {
260 |       "text/plain": [
261 |        "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
262 |       ]
263 |      },
264 |      "execution_count": 22,
265 |      "metadata": {},
266 |      "output_type": "execute_result"
267 |     }
268 |    ],
269 |    "source": [
270 |     "# Fit a linear model to our training data\n",
271 |     "reg.fit(x_train, y_train)"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 23,
277 |    "metadata": {},
278 |    "outputs": [
279 |     {
280 |      "data": {
281 |       "text/plain": [
282 |        "array([ 0.11539484,  0.90271544,  0.57035024,  0.63352057,  0.01172327,\n",
283 |        "        0.01800834,  0.06378512,  0.62182568,  0.23796354,  0.71916517,\n",
284 |        "        1.02356223,  0.12115151,  0.55889098,  0.0294754 ,  0.82479006,\n",
285 |        "        0.45264152,  0.19836704,  0.3759118 ,  0.31355646,  0.22183264,\n",
286 |        "        0.44336354,  0.83162647,  0.61957421,  0.2443862 ,  0.38782189,\n",
287 |        "        0.83087677,  0.40299786,  0.24438707,  0.16732825,  0.09493174,\n",
288 |        "        0.61874023, -0.02928499, -0.08826717,  0.64469654,  0.59732346,\n",
289 |        "        0.03927108,  0.09652011,  0.14407889,  0.33575427,  0.56339801,\n",
290 |        "        0.85238449,  0.15566589,  0.64720588,  0.12115151,  0.12113096,\n",
291 |        "        0.44901703,  0.5885736 ,  0.20252592,  0.1326563 ,  0.31003879,\n",
292 |        "        0.68068162,  0.64536789,  0.03974376,  0.61607328,  0.00609504,\n",
293 |        "        0.56856501,  0.08091643,  0.07937081,  0.82713745,  0.950898  ,\n",
294 |        "        0.57283271,  0.51746255,  0.845916  ,  0.20614194,  0.64480443,\n",
295 |        "        0.76276758,  0.84160247,  0.11572537,  0.23575847,  0.74036174,\n",
296 |        "        0.15553317,  0.17162756,  0.7142957 ,  0.25059462,  0.12148119,\n",
297 |        "       -0.04078978,  0.37687851,  0.20699649,  0.55118714,  0.31387098,\n",
298 |        "        0.3820695 ,  0.87793894,  0.07505014,  0.74527929,  0.24751388,\n",
299 |        "        0.14419279,  0.02750939,  0.13823488,  0.785322  ,  0.16128556,\n",
300 |        "        0.04770106,  0.76970252,  0.10964672,  0.09814793,  0.71628897,\n",
301 |        "        0.14108165,  0.90965933,  0.84238156,  0.1326546 ,  0.20457545,\n",
302 |        "        0.21711669,  0.38078977,  0.25059462,  0.2436029 ,  0.33518191,\n",
303 |        "        0.1081298 ,  0.15018666,  0.8101361 ,  0.53516221,  0.26215909,\n",
304 |        "        0.88815928,  0.66823875,  0.38939141,  0.53518871,  0.97887538,\n",
305 |        "        0.68330104,  0.37122508, -0.07037902,  0.19425321,  0.4061484 ,\n",
306 |        "        0.17287256,  0.12074227,  0.84769297,  0.74951466,  0.23857605,\n",
307 |        "        0.6069391 ,  0.69202843,  0.53481044,  0.75659347,  0.16258501,\n",
308 |        "        0.82871559,  0.05788116,  0.80706308,  0.74356107,  0.13575221,\n",
309 |        "        1.05001092,  0.14843955, -0.06901836,  0.02505873,  0.18280612,\n",
310 |        "        0.31021567,  0.68662588,  0.74605276])"
311 |       ]
312 |      },
313 |      "execution_count": 23,
314 |      "metadata": {},
315 |      "output_type": "execute_result"
316 |     }
317 |    ],
318 |    "source": [
319 |     "# Check our test set against our trained linear model\n",
320 |     "reg.predict(x_test)"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": 24,
326 |    "metadata": {},
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "0.13166469521402543"
332 |       ]
333 |      },
334 |      "execution_count": 24,
335 |      "metadata": {},
336 |      "output_type": "execute_result"
337 |     }
338 |    ],
339 |    "source": [
340 |     "# Calculate mean squared error (should be ~0.13-0.15%)\n",
341 |     "mean_squared_error(y_test, reg.predict(x_test))"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 25,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "# age: 25,\n",
351 |     "# class: 1,\n",
352 |     "# fare_paid: 45,\n",
353 |     "# gender: 1, (replace Y with the integer you assigned for 'male')\n",
354 |     "# parents_children: 0,\n",
355 |     "# point_of_embarkation: 0, (replace Z with the integer you assigned for 'C')\n",
356 |     "# siblings_spouse: 1\n",
357 |     "\n",
358 |     "fake_passenger = [[25, 1, 45, 1, 0, 0, 1]]"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 26,
364 |    "metadata": {
365 |     "scrolled": true
366 |    },
367 |    "outputs": [
368 |     {
369 |      "data": {
370 |       "text/plain": [
371 |        "array([0.50169618])"
372 |       ]
373 |      },
374 |      "execution_count": 26,
375 |      "metadata": {},
376 |      "output_type": "execute_result"
377 |     }
378 |    ],
379 |    "source": [
380 |     "# Use this output to verify your completion of this exercise\n",
381 |     "reg.predict(fake_passenger)\n",
382 |     "#0.50169618"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": []
391 |   }
392 |  ],
393 |  "metadata": {
394 |   "kernelspec": {
395 |    "display_name": "Python 3",
396 |    "language": "python",
397 |    "name": "python3"
398 |   },
399 |   "language_info": {
400 |    "codemirror_mode": {
401 |     "name": "ipython",
402 |     "version": 3
403 |    },
404 |    "file_extension": ".py",
405 |    "mimetype": "text/x-python",
406 |    "name": "python",
407 |    "nbconvert_exporter": "python",
408 |    "pygments_lexer": "ipython3",
409 |    "version": "3.6.5"
410 |   }
411 |  },
412 |  "nbformat": 4,
413 |  "nbformat_minor": 2
414 | }
415 | 


--------------------------------------------------------------------------------
/Assignments/Decision+Tree.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%%capture\n",
 10 |     "!pip install pymongo pprint dateparser matplotlib pandas sklearn numpy seaborn"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 2,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import pymongo\n",
 20 |     "import pprint\n",
 21 |     "import dateparser\n",
 22 |     "import pandas as pd\n",
 23 |     "import numpy as np\n",
 24 |     "from sklearn.tree import DecisionTreeClassifier\n",
 25 |     "from sklearn.metrics import classification_report, confusion_matrix\n",
 26 |     "from sklearn.ensemble import RandomForestClassifier\n",
 27 |     "from sklearn.model_selection import train_test_split\n",
 28 |     "\n",
 29 |     "%matplotlib inline"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 3,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 39 |     "course_client = pymongo.MongoClient(course_cluster_uri)\n",
 40 |     "titanic = course_client['coursera-agg']['titanic']"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 4,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "initial_project = {\n",
 50 |     "    \"$project\": {\n",
 51 |     "        \"_id\": 0,\n",
 52 |     "        \"name\": 0,\n",
 53 |     "        \"point_of_embarkation\": 0,\n",
 54 |     "        \"ticket_number\": 0,\n",
 55 |     "        \"passenger_id\": 0,\n",
 56 |     "        \"cabin\": 0,\n",
 57 |     "    }\n",
 58 |     "}"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# todo - correct the age.\n",
 68 |     "# *HINT* -- If the $type of \"$age\" is a string, set it to 0\n",
 69 |     "age_correction = {\n",
 70 |     "        \"$cond\": [ { \"$type\": \"string\" }, 0, \"$age\" ]\n",
 71 |     "}"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 12,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# todo - one hot encode gender_female. 1 if female, 0 if male\n",
 81 |     "one_hot_female = {\n",
 82 |     "    \"$cond\": [ { \"$eq\": [ \"$gender\", \"female\" ] }, 1, 0 ]\n",
 83 |     "}"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 13,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "# todo - the inverse of above. 1 if male, 0 if female\n",
 93 |     "one_hot_male = {\n",
 94 |     "    \"$cond\": [ { \"$eq\": [ \"$gender\", \"male\" ] }, 1, 0 ]\n",
 95 |     "}"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 14,
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "encoding_stage = {\n",
105 |     "    \"$addFields\": {\n",
106 |     "        \"gender_female\": one_hot_female,\n",
107 |     "        \"gender_male\": one_hot_male,\n",
108 |     "        \"age\": age_correction\n",
109 |     "    }\n",
110 |     "}"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 15,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "final_project = {\n",
120 |     "    \"$project\": {\n",
121 |     "        \"gender\": 0\n",
122 |     "    }\n",
123 |     "}"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 16,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "pipeline = [initial_project, encoding_stage, final_project]"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 17,
138 |    "metadata": {},
139 |    "outputs": [
140 |     {
141 |      "data": {
142 |       "text/html": [
143 |        "<div>\n",
144 |        "<style scoped>\n",
145 |        "    .dataframe tbody tr th:only-of-type {\n",
146 |        "        vertical-align: middle;\n",
147 |        "    }\n",
148 |        "\n",
149 |        "    .dataframe tbody tr th {\n",
150 |        "        vertical-align: top;\n",
151 |        "    }\n",
152 |        "\n",
153 |        "    .dataframe thead th {\n",
154 |        "        text-align: right;\n",
155 |        "    }\n",
156 |        "</style>\n",
157 |        "<table border=\"1\" class=\"dataframe\">\n",
158 |        "  <thead>\n",
159 |        "    <tr style=\"text-align: right;\">\n",
160 |        "      <th></th>\n",
161 |        "      <th>age</th>\n",
162 |        "      <th>class</th>\n",
163 |        "      <th>fare_paid</th>\n",
164 |        "      <th>gender_female</th>\n",
165 |        "      <th>gender_male</th>\n",
166 |        "      <th>parents_children</th>\n",
167 |        "      <th>siblings_spouse</th>\n",
168 |        "      <th>survived</th>\n",
169 |        "    </tr>\n",
170 |        "  </thead>\n",
171 |        "  <tbody>\n",
172 |        "    <tr>\n",
173 |        "      <th>0</th>\n",
174 |        "      <td>0</td>\n",
175 |        "      <td>3</td>\n",
176 |        "      <td>8.0500</td>\n",
177 |        "      <td>0</td>\n",
178 |        "      <td>1</td>\n",
179 |        "      <td>0</td>\n",
180 |        "      <td>0</td>\n",
181 |        "      <td>0</td>\n",
182 |        "    </tr>\n",
183 |        "    <tr>\n",
184 |        "      <th>1</th>\n",
185 |        "      <td>0</td>\n",
186 |        "      <td>3</td>\n",
187 |        "      <td>7.2500</td>\n",
188 |        "      <td>0</td>\n",
189 |        "      <td>1</td>\n",
190 |        "      <td>0</td>\n",
191 |        "      <td>1</td>\n",
192 |        "      <td>0</td>\n",
193 |        "    </tr>\n",
194 |        "    <tr>\n",
195 |        "      <th>2</th>\n",
196 |        "      <td>0</td>\n",
197 |        "      <td>3</td>\n",
198 |        "      <td>16.7000</td>\n",
199 |        "      <td>1</td>\n",
200 |        "      <td>0</td>\n",
201 |        "      <td>1</td>\n",
202 |        "      <td>1</td>\n",
203 |        "      <td>1</td>\n",
204 |        "    </tr>\n",
205 |        "    <tr>\n",
206 |        "      <th>3</th>\n",
207 |        "      <td>0</td>\n",
208 |        "      <td>3</td>\n",
209 |        "      <td>11.1333</td>\n",
210 |        "      <td>1</td>\n",
211 |        "      <td>0</td>\n",
212 |        "      <td>2</td>\n",
213 |        "      <td>0</td>\n",
214 |        "      <td>1</td>\n",
215 |        "    </tr>\n",
216 |        "    <tr>\n",
217 |        "      <th>4</th>\n",
218 |        "      <td>0</td>\n",
219 |        "      <td>1</td>\n",
220 |        "      <td>53.1000</td>\n",
221 |        "      <td>1</td>\n",
222 |        "      <td>0</td>\n",
223 |        "      <td>0</td>\n",
224 |        "      <td>1</td>\n",
225 |        "      <td>1</td>\n",
226 |        "    </tr>\n",
227 |        "  </tbody>\n",
228 |        "</table>\n",
229 |        "</div>"
230 |       ],
231 |       "text/plain": [
232 |        "   age  class  fare_paid  gender_female  gender_male  parents_children  \\\n",
233 |        "0    0      3     8.0500              0            1                 0   \n",
234 |        "1    0      3     7.2500              0            1                 0   \n",
235 |        "2    0      3    16.7000              1            0                 1   \n",
236 |        "3    0      3    11.1333              1            0                 2   \n",
237 |        "4    0      1    53.1000              1            0                 0   \n",
238 |        "\n",
239 |        "   siblings_spouse  survived  \n",
240 |        "0                0         0  \n",
241 |        "1                1         0  \n",
242 |        "2                1         1  \n",
243 |        "3                0         1  \n",
244 |        "4                1         1  "
245 |       ]
246 |      },
247 |      "execution_count": 17,
248 |      "metadata": {},
249 |      "output_type": "execute_result"
250 |     }
251 |    ],
252 |    "source": [
253 |     "df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))\n",
254 |     "df.head()"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 18,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "X = df.drop('survived', axis=1)"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": 19,
269 |    "metadata": {},
270 |    "outputs": [],
271 |    "source": [
272 |     "y = df['survived']"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 20,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": 21,
287 |    "metadata": {},
288 |    "outputs": [],
289 |    "source": [
290 |     "dtree = DecisionTreeClassifier()"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": 22,
296 |    "metadata": {},
297 |    "outputs": [],
298 |    "source": [
299 |     "%%capture\n",
300 |     "dtree.fit(X_train, y_train)"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 23,
306 |    "metadata": {},
307 |    "outputs": [],
308 |    "source": [
309 |     "predictions = dtree.predict(X_test)"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": 24,
315 |    "metadata": {},
316 |    "outputs": [
317 |     {
318 |      "name": "stdout",
319 |      "output_type": "stream",
320 |      "text": [
321 |       "[[73 13]\n",
322 |       " [11 37]]\n",
323 |       "\n",
324 |       "\n",
325 |       "             precision    recall  f1-score   support\n",
326 |       "\n",
327 |       "          0       0.87      0.85      0.86        86\n",
328 |       "          1       0.74      0.77      0.76        48\n",
329 |       "\n",
330 |       "avg / total       0.82      0.82      0.82       134\n",
331 |       "\n"
332 |      ]
333 |     }
334 |    ],
335 |    "source": [
336 |     "print(confusion_matrix(y_test, predictions))\n",
337 |     "print(\"\\n\")\n",
338 |     "print(classification_report(y_test, predictions))"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 25,
344 |    "metadata": {},
345 |    "outputs": [],
346 |    "source": [
347 |     "rfc = RandomForestClassifier(n_estimators=20)"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 26,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "%%capture\n",
357 |     "rfc.fit(X_train, y_train)"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 27,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "rfc_pred = rfc.predict(X_test)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 28,
372 |    "metadata": {},
373 |    "outputs": [
374 |     {
375 |      "name": "stdout",
376 |      "output_type": "stream",
377 |      "text": [
378 |       "[[73 13]\n",
379 |       " [15 33]]\n",
380 |       "\n",
381 |       "\n",
382 |       "             precision    recall  f1-score   support\n",
383 |       "\n",
384 |       "       test       0.83      0.85      0.84        86\n",
385 |       "predictions       0.72      0.69      0.70        48\n",
386 |       "\n",
387 |       "avg / total       0.79      0.79      0.79       134\n",
388 |       "\n"
389 |      ]
390 |     }
391 |    ],
392 |    "source": [
393 |     "print(confusion_matrix(y_test, rfc_pred))\n",
394 |     "print(\"\\n\")\n",
395 |     "print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 29,
401 |    "metadata": {},
402 |    "outputs": [
403 |     {
404 |      "name": "stdout",
405 |      "output_type": "stream",
406 |      "text": [
407 |       "\n",
408 |       "After 1000 iterations:\n",
409 |       "  Single Decision Tree accuracy: 0.8199253731343169\n",
410 |       "  Random Forest accuracy:        0.7955373134328367\n",
411 |       "  \n",
412 |       "  Lab Answer:  dtree=0.82, rfc=0.8\n",
413 |       "\n"
414 |      ]
415 |     }
416 |    ],
417 |    "source": [
418 |     "iterations = 1000\n",
419 |     "dtree_avg_accuracy = 0\n",
420 |     "rfc_avg_accuracy = 0\n",
421 |     "for _ in range(iterations):\n",
422 |     "    dtree.fit(X_train, y_train)\n",
423 |     "    dtree_avg_accuracy += dtree.score(X_test, y_test)\n",
424 |     "    rfc.fit(X_train, y_train)\n",
425 |     "    rfc_avg_accuracy += rfc.score(X_test, y_test)\n",
426 |     "    \n",
427 |     "print(f\"\"\"\n",
428 |     "After {iterations} iterations:\n",
429 |     "  Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}\n",
430 |     "  Random Forest accuracy:        {rfc_avg_accuracy / iterations}\n",
431 |     "  \n",
432 |     "  Lab Answer:  dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}\n",
433 |     "\"\"\")\n",
434 |     "#After 1000 iterations:\n",
435 |     "#  Single Decision Tree accuracy: 0.8199253731343169\n",
436 |     "#  Random Forest accuracy:        0.7955373134328367\n",
437 |     "#  \n",
438 |     "#  Lab Answer:  dtree=0.82, rfc=0.8\n",
439 |     "#0.8199253731343169+0.7955373134328367=1.62"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "metadata": {
446 |     "collapsed": true
447 |    },
448 |    "outputs": [],
449 |    "source": []
450 |   }
451 |  ],
452 |  "metadata": {
453 |   "kernelspec": {
454 |    "display_name": "Python 3",
455 |    "language": "python",
456 |    "name": "python3"
457 |   },
458 |   "language_info": {
459 |    "codemirror_mode": {
460 |     "name": "ipython",
461 |     "version": 3
462 |    },
463 |    "file_extension": ".py",
464 |    "mimetype": "text/x-python",
465 |    "name": "python",
466 |    "nbconvert_exporter": "python",
467 |    "pygments_lexer": "ipython3",
468 |    "version": "3.6.5"
469 |   }
470 |  },
471 |  "nbformat": 4,
472 |  "nbformat_minor": 2
473 | }
474 | 


--------------------------------------------------------------------------------
/LessonNotes/associative_rules__lesson.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Dependencies Installation\n",
  8 |     "Before we get started, let's make sure we have all dependencies installed."
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {
 15 |     "collapsed": true
 16 |    },
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "%%capture\n",
 20 |     "! pip3 install pymongo dateparser sklearn pandas numpy pprint scipy matplotlib seaborn mlxtend\n",
 21 |     "%matplotlib inline\n"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "# Association Rules\n"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "## Importing Necessary Dependencies"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# dependencies\n",
 47 |     "import dateparser\n",
 48 |     "import pymongo\n",
 49 |     "import pandas as pd\n",
 50 |     "from sklearn.cluster import KMeans\n",
 51 |     "from sklearn import preprocessing\n",
 52 |     "from mlxtend.frequent_patterns import apriori\n",
 53 |     "from mlxtend.frequent_patterns import association_rules\n",
 54 |     "from mlxtend.preprocessing import one_hot\n",
 55 |     "import numpy as np\n",
 56 |     "import json\n",
 57 |     "import matplotlib.pyplot as plt\n",
 58 |     "import seaborn as sns\n",
 59 |     "sns.set(style=\"whitegrid\", palette=\"muted\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "### The Initial Setup\n",
 67 |     "\n",
 68 |     "We'll create a dataframe with some made up transactions to illustrate the apriori algorithm and association rules. The dictionary key will represent the product bought, and the number will represent the quantity bought."
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {
 75 |     "collapsed": true
 76 |    },
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "transactions = [\n",
 80 |     "    {\n",
 81 |     "        \"beer\": 1,\n",
 82 |     "        \"chips\": 2,\n",
 83 |     "        \"salsa\": 1,\n",
 84 |     "    },\n",
 85 |     "    {\n",
 86 |     "        \"chips\": 1,\n",
 87 |     "        \"salsa\": 1,\n",
 88 |     "        \"chocolate\": 3\n",
 89 |     "    },\n",
 90 |     "    {\n",
 91 |     "        \"chocolate\": 2,\n",
 92 |     "        \"diapers\": 1,\n",
 93 |     "        \"beer\": 2\n",
 94 |     "    },\n",
 95 |     "    {\n",
 96 |     "        \"chips\": 2,\n",
 97 |     "        \"salsa\": 1,\n",
 98 |     "        \"chocolate\": 2\n",
 99 |     "    },\n",
100 |     "    {\n",
101 |     "        \"diapers\": 3,\n",
102 |     "        \"chips\": 1,\n",
103 |     "        \"salsa\": 2,\n",
104 |     "        \"beer\": 2\n",
105 |     "    },\n",
106 |     "    {\n",
107 |     "        \"diapers\": 2,\n",
108 |     "        \"chips\": 1,\n",
109 |     "        \"salsa\": 1,\n",
110 |     "        \"chocolate\": 4,\n",
111 |     "        \"beer\": 3\n",
112 |     "    }\n",
113 |     "]"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "transactions = pd.DataFrame.from_dict(transactions)\n",
123 |     "transactions"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "### Getting rid of NaN Values\n",
131 |     "\n",
132 |     "We need to get rid of NaN values, so we'll use a utility method from Pandas to replace them with 0."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "transactions.fillna(0, inplace=True)\n",
142 |     "transactions"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "### One-hot Encoding\n",
150 |     "\n",
151 |     "We need to one hot encode the data, so that 1 means they bought the item and 0 means they didn't. We'll quickly search the dataframe and replace values greater than 1 to 1."
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "oh = transactions\n",
161 |     "for column in oh.columns:\n",
162 |     "    oh.loc[oh[column] > 0, column] = 1\n",
163 |     "oh"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "### Apriori\n",
171 |     "\n",
172 |     "The first step is to use the apriori algorithm. This will give us our frequent itemsets and their support.\n",
173 |     "\n",
174 |     "The support of an itemset is the proportion of transaction in the collection in which the itemset appears. It signifies the popularity of an itemset.\n",
175 |     "\n",
176 |     "Given the above information, we have 6 transactions. Of those, beer appears in 4 of them. So, we'd expect the itemset `[beer]` to have a support value of `4/6` or `.666666667`.\n",
177 |     "\n",
178 |     "Going through all of them, we can build itemsets that are just one item and calculate their support."
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "Now that we have our 1 item itemsets, let's build up our 2 item itemsets. So, if an itemset is [a, b] where a is chips and b is salse, the support is the ratio of the apperance of itemset `[a, b]` in all transactions. We would do this until we have exhausted all possible itemsets.\n",
186 |     "\n",
187 |     "Also of key importance is being able to define some minimum threshold for which we do not care about that itemset.\n",
188 |     "\n",
189 |     "For this, we'll use the `apriori` algorithm from `mlxtend`."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "assocs = apriori(oh, min_support=0.5, use_colnames=True)\n",
199 |     "\n",
200 |     "assocs =assocs.sort_values(by='support', ascending=False)\n",
201 |     "assocs"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "rules = association_rules(assocs, min_threshold=0.5)\n",
211 |     "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n",
212 |     "    display(rules.sort_values(by='lift', ascending=False))"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "## Pymongo Setup"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {
226 |     "collapsed": true
227 |    },
228 |    "outputs": [],
229 |    "source": [
230 |     "# pymongo driver configuration\n",
231 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
232 |     "course_client = pymongo.MongoClient(course_cluster_uri)\n",
233 |     "orders = course_client['coursera-agg']['orders']"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "# Getting our data from MongoDB\n",
241 |     "\n",
242 |     "We'll need to construct a one-hot encoded dataframe. This means that for every document, convert the information into the purchases array into something like:\n",
243 |     "\n",
244 |     "```\n",
245 |     "{\n",
246 |     "    ...,\n",
247 |     "    \"purchases\": [\n",
248 |     "        {\n",
249 |     "          \"description\": \"WHITE WIRE EGG HOLDER\",\n",
250 |     "          \"quantity\": 36,\n",
251 |     "          \"stock_code\": \"84880\",\n",
252 |     "          \"unit_price\": 4.95\n",
253 |     "        },\n",
254 |     "        {\n",
255 |     "          \"description\": \"JUMBO  BAG BAROQUE BLACK WHITE\",\n",
256 |     "          \"quantity\": 100,\n",
257 |     "          \"stock_code\": \"85099C\",\n",
258 |     "          \"unit_price\": 1.65\n",
259 |     "        },\n",
260 |     "        {\n",
261 |     "          \"description\": \"JUMBO BAG RED RETROSPOT\",\n",
262 |     "          \"quantity\": 100,\n",
263 |     "          \"stock_code\": \"85099B\",\n",
264 |     "          \"unit_price\": 1.65\n",
265 |     "        }\n",
266 |     "      ],\n",
267 |     "  }\n",
268 |     "  ```\n",
269 |     "  into\n",
270 |     "  ```\n",
271 |     "{\n",
272 |     "    \"84880\": 1,\n",
273 |     "    \"85099C\": 1,\n",
274 |     "    \"85099B\": 1,\n",
275 |     "}\n",
276 |     "```"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "## The Pipeline"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "order_projection = {\n",
293 |     "    \"$replaceRoot\": {\n",
294 |     "            \"newRoot\":  {\n",
295 |     "                \"$arrayToObject\": {\n",
296 |     "                    \"$map\": {\n",
297 |     "                        \"input\": \"$purchases\",\n",
298 |     "                        \"in\": {\n",
299 |     "                            \"k\": \"$$this.stock_code\",\n",
300 |     "                            \"v\": 1\n",
301 |     "                        }\n",
302 |     "                    }\n",
303 |     "                }\n",
304 |     "            }\n",
305 |     "    }\n",
306 |     "            \n",
307 |     "}\n",
308 |     "\n",
309 |     "print(json.dumps(order_projection, indent=2))"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "markdown",
314 |    "metadata": {},
315 |    "source": [
316 |     "# Constructing the Pipeline\n",
317 |     "\n",
318 |     "That's it! We will use our one stage."
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": null,
324 |    "metadata": {
325 |     "collapsed": true
326 |    },
327 |    "outputs": [],
328 |    "source": [
329 |     "pipeline = [\n",
330 |     "    order_projection\n",
331 |     "]"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "markdown",
336 |    "metadata": {},
337 |    "source": [
338 |     "# Constructing the pandas Dataframe from MongoDB\n",
339 |     "\n",
340 |     "Here you will need to construct the DataFrame. Assign it to the variabled `df` below."
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "df = pd.DataFrame.from_dict(list(orders.aggregate(pipeline)))\n",
350 |     "df.head(n=10)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "markdown",
355 |    "metadata": {},
356 |    "source": [
357 |     "## Fixing the NaN values\n",
358 |     "\n",
359 |     "We will use the Pandas DataFrame [fillna](http://github.com/pandas-dev/pandas/blob/v0.21.0/pandas/core/frame.py#L3029-L3035) method to fill in NaN values for us with 0."
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "df.fillna(0, inplace=True)\n",
369 |     "df.head(10)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "## Association"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {},
382 |    "source": [
383 |     "### Apriori\n",
384 |     "First, we'll use the `apriori` algorithm from `mlxtend` to extract frequent itemsets. "
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {
391 |     "collapsed": true,
392 |     "scrolled": false
393 |    },
394 |    "outputs": [],
395 |    "source": [
396 |     "assocs = apriori(df, min_support=0.02, use_colnames=True)"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": null,
402 |    "metadata": {},
403 |    "outputs": [],
404 |    "source": [
405 |     "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n",
406 |     "    assocs =assocs.sort_values(by='support', ascending=False)\n",
407 |     "    display(assocs)"
408 |    ]
409 |   },
410 |   {
411 |    "cell_type": "markdown",
412 |    "metadata": {},
413 |    "source": [
414 |     "## Association Rules\n",
415 |     "\n",
416 |     "Now we form the association rules. Try adjusting the `min_threshold` along with the `metric` to find interesting associations. For example, which class appears to be highly associated with `parents_children`? Go back and add a one-hot encoding function for `parents_children` and see if the results are more clear."
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": null,
422 |    "metadata": {
423 |     "collapsed": true
424 |    },
425 |    "outputs": [],
426 |    "source": [
427 |     "rules = association_rules(assocs, metric=\"lift\", min_threshold=3)"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": [
436 |     "with pd.option_context('display.max_rows', None, 'display.max_columns', 5):\n",
437 |     "    display(rules.sort_values(by='lift', ascending=False))"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "query = {\n",
447 |     "    \"$match\": {\n",
448 |     "        \"_id.stock_code\": { \"$in\": [\"22697\", \"22698\", \"22699\"]}\n",
449 |     "    }\n",
450 |     "}\n",
451 |     "\n",
452 |     "project = {\n",
453 |     "    \"$project\": { \"_id\": 0, \"purchases.stock_code\": 1, \"purchases.description\": 1}\n",
454 |     "}\n",
455 |     "\n",
456 |     "pipeline = [\n",
457 |     "    {\n",
458 |     "        \"$unwind\": \"$purchases\"\n",
459 |     "    },\n",
460 |     "    {\n",
461 |     "        \"$group\": {\n",
462 |     "            \"_id\": {\n",
463 |     "                \"stock_code\": \"$purchases.stock_code\",\n",
464 |     "                \"description\": \"$purchases.description\"\n",
465 |     "            }\n",
466 |     "            \n",
467 |     "        }\n",
468 |     "    },\n",
469 |     "    query\n",
470 |     "]\n",
471 |     "display(list(orders.aggregate(pipeline)))"
472 |    ]
473 |   }
474 |  ],
475 |  "metadata": {
476 |   "kernelspec": {
477 |    "display_name": "Python 3",
478 |    "language": "python",
479 |    "name": "python3"
480 |   },
481 |   "language_info": {
482 |    "codemirror_mode": {
483 |     "name": "ipython",
484 |     "version": 3
485 |    },
486 |    "file_extension": ".py",
487 |    "mimetype": "text/x-python",
488 |    "name": "python",
489 |    "nbconvert_exporter": "python",
490 |    "pygments_lexer": "ipython3",
491 |    "version": "3.6.5"
492 |   }
493 |  },
494 |  "nbformat": 4,
495 |  "nbformat_minor": 1
496 | }
497 | 


--------------------------------------------------------------------------------
/LessonNotes/pearson_correlation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Pearson Correlation\n",
  8 |     "\n",
  9 |     "Correlation using Pearson Correlation (Pearson's Rho, Pearson Correlation Coefficient, etc...)\n",
 10 |     "\n",
 11 |     "## What is Correlation?\n",
 12 |     "\n",
 13 |     "Let's define what correlation is. Bivariate data (data with two variables) is said to be correlated when there is a strong linear relationship between the two variables. The Pearson correlation is a measure of strength in this linear relationship.\n",
 14 |     "\n",
 15 |     "## Pearson correlation\n",
 16 |     "\n",
 17 |     "Pearson correlation values can range from -1 to 1, inclusive. Negative values indicate a negative correlation, positive values a positive correlation, and values near 0 represent no correlation. That is to say, given two values `x` and `y`, a negative correlation would be as the value of `x` increases `y` decreases, a positive correlation would be as `x` increases `y` increases, and no correlation would be as `x` increases `y` has no increase or decrease.\n"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {
 24 |     "collapsed": true
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "%%html\n",
 29 |     "<figure>\n",
 30 |     "  <img\n",
 31 |     "  src=\"https://upload.wikimedia.org/wikipedia/commons/3/34/Correlation_coefficient.png\"\n",
 32 |     "  alt=\"Pearson Correlation Example Plots\">\t\n",
 33 |     "  <figcaption>Source:\n",
 34 |     "    <a href=\"https://en.wikipedia.org/wiki/Pearson_correlation_coefficient\" target=\"_blank\">\n",
 35 |     "      Wikipedia\n",
 36 |     "    </a>\n",
 37 |     "  </figcaption>\n",
 38 |     "</figure>"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {
 45 |     "collapsed": true
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "%%capture\n",
 50 |     "!pip install numpy pandas seaborn scipy\n",
 51 |     "import numpy as np\n",
 52 |     "import pandas as pd\n",
 53 |     "import seaborn as sns\n",
 54 |     "import matplotlib as plt\n",
 55 |     "from pymongo import MongoClient\n",
 56 |     "from cmath import sqrt\n",
 57 |     "from scipy.stats import pearsonr\n",
 58 |     "%matplotlib inline\n",
 59 |     "sns.set(color_codes=True)"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "## Real Data\n",
 67 |     "\n",
 68 |     "Perfect correlations are virtually non-existent on real data. Let's look at a set of real data, and graph it.\n",
 69 |     "\n",
 70 |     "We'll use the ``movies`` dataset available from Atlas, and look at the ``imdb.votes`` and ``imdb.rating`` fields. Let's determine if there is a correlation between the number of votes and the rating."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 82 |     "course_client = MongoClient(course_cluster_uri)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {
 89 |     "collapsed": true
 90 |    },
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "movies = course_client['aggregations']['movies']"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {
100 |     "collapsed": true
101 |    },
102 |    "outputs": [],
103 |    "source": [
104 |     "pipeline = [\n",
105 |     "    {\n",
106 |     "        \"$match\": {\n",
107 |     "            \"imdb.rating\": { \"$gt\": 0 },\n",
108 |     "            \"imdb.votes\": { \"$gt\": 0}\n",
109 |     "        }\n",
110 |     "    },\n",
111 |     "    {\n",
112 |     "        \"$project\": {\n",
113 |     "            \"_id\": 0,\n",
114 |     "            \"rating\": \"$imdb.rating\",\n",
115 |     "            \"votes\": \"$imdb.votes\"\n",
116 |     "        }\n",
117 |     "    }\n",
118 |     "]"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {
125 |     "collapsed": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "df = pd.DataFrame.from_dict(list(movies.aggregate(pipeline)))"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {
136 |     "collapsed": true
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "df.head()"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {
147 |     "collapsed": true
148 |    },
149 |    "outputs": [],
150 |    "source": [
151 |     "sns.jointplot(x=\"rating\", y=\"votes\", data=df, kind='reg',\n",
152 |     "                  joint_kws={'line_kws':{'color':'red'}, 'scatter_kws': { 'alpha': 0.5, 's': 20}}, size=8 )"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {},
158 |    "source": [
159 |     "## Calculating Correlation\n",
160 |     "\n",
161 |     "### The Formula\n",
162 |     "\n",
163 |     "Math is both beautiful and terrifying. This is the formula for a single-pass Pearson Correlation.\n",
164 |     "\n",
165 |     "![Pearson's Rho Single Pass Formula](https://s3.amazonaws.com/special-partnerships/coursera/pearson.gif)\n",
166 |     "\n",
167 |     "\n",
168 |     "### Groundwork\n",
169 |     "\n",
170 |     "To calculate the correlation, we need to calculate some values for later use. For the following steps, X and Y refer to rating and movies.\n",
171 |     "\n",
172 |     "* Calculate the mean for X and the mean for Y. We'll call these m_x and m_y\n",
173 |     "* For each X, substract m_x. We'll call this little x\n",
174 |     "* For each Y, substract m_y. We'll call this little y\n",
175 |     "* For each pair of values, multiply x and y. We'll call this xy\n",
176 |     "* For each little x and each little y, calculate the square. We'll call these x2 and y2\n",
177 |     "\n",
178 |     "Let's see these these values and how they relate to the input.\n",
179 |     "\n",
180 |     "First, we'll create a copy.\n",
181 |     "\n"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {
188 |     "collapsed": true
189 |    },
190 |    "outputs": [],
191 |    "source": [
192 |     "exm = df\n",
193 |     "exm.head(n=10)"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "metadata": {},
199 |    "source": [
200 |     "Next, we will calculate m_x and m_y."
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {
207 |     "collapsed": true
208 |    },
209 |    "outputs": [],
210 |    "source": [
211 |     "m_x = sum(exm['rating'])/len(exm['rating'])\n",
212 |     "m_y = sum(exm['votes'])/len(exm['rating'])\n",
213 |     "print(f\"m_x= {m_x}, m_y= {m_y}\")"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "Now, we will calculate little x, little y, xy, x2 and y2. We will assign these values into the ``exm`` dataframe, and then view the first 10 rows to see how they all relate."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "x = list(map(lambda X: X - m_x, exm['rating']))\n",
232 |     "y = list(map(lambda Y: Y - m_y, exm['votes']))\n",
233 |     "xy = list(map(lambda xy: xy[0] * xy[1], zip(exm['rating'], exm['votes'])))\n",
234 |     "x2 = list(map(lambda x: x * x, exm['rating']))\n",
235 |     "y2 = list(map(lambda y: y * y, exm['votes']))\n",
236 |     "exm = exm.assign(x=x, y=y, xy=xy, x2=x2, y2=y2)\n",
237 |     "exm = exm[['rating', 'votes', 'x', 'y', 'xy', 'x2', 'y2']]\n",
238 |     "exm.head(n=10)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "markdown",
243 |    "metadata": {},
244 |    "source": [
245 |     "### The Maths in the Equation\n",
246 |     "\n",
247 |     "With these necessary values calculated, we can now jump into the equation itself."
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "markdown",
252 |    "metadata": {},
253 |    "source": [
254 |     "To start, we'll focus on the top of the equation, which we'll call `top`.\n",
255 |     "\n",
256 |     "We multiply the number of elements (`elems`), by the sum of xy (`sum_xy`) and call it `product_xy_elems`. After doing so we then subtract the sum of x (`sum_x`) multiplied by the sum of `y` (`sum_y`) and call it `product_sum_x_sum_y`. So `top = product_xy_elems - product_sum_x_sum_y`."
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {
263 |     "collapsed": true
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "elems = len(exm['votes'])\n",
268 |     "print(elems)\n",
269 |     "sum_xy = exm['xy'].sum()\n",
270 |     "sum_x = exm['rating'].sum()\n",
271 |     "sum_y = exm['votes'].sum()\n",
272 |     "\n",
273 |     "product_xy_elems = elems * sum_xy\n",
274 |     "product_sum_x_sum_y = sum_x * sum_y\n",
275 |     "\n",
276 |     "top = product_xy_elems - product_sum_x_sum_y"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "Let's now focus on the bottom of the equation. For the moment we'll ignore the square root.\n",
284 |     "\n",
285 |     "First, bottom left, which we'll call `bottom_left`. This is composed of two parts.\n",
286 |     "\n",
287 |     "We multiply `elems` by the sum of the squares of x (`sum_x2`) and call it `product_sum_x2_elems`. We then subtract the square of the sums of x (`sum_x_2`)."
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {
294 |     "collapsed": true
295 |    },
296 |    "outputs": [],
297 |    "source": [
298 |     "sum_x2 = exm['x2'].sum()\n",
299 |     "sum_x_2 = sum_x * sum_x\n",
300 |     "\n",
301 |     "product_sum_x2_elems = elems * sum_x2\n",
302 |     "\n",
303 |     "bottom_left = product_sum_x2_elems - sum_x_2"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "markdown",
308 |    "metadata": {},
309 |    "source": [
310 |     "Next, bottom right, which we'll call `bottom_right`. This is almost identical to `bottom_left`, but we are now concerned with `y` instead of `x`.\n",
311 |     "\n",
312 |     "We multiply `elems` by the sum of the squares of y (`sum_y2`) and call it `product_sum_y2_elems`. We then subtract the square of the sums of y (`sum_y_2`)."
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {
319 |     "collapsed": true
320 |    },
321 |    "outputs": [],
322 |    "source": [
323 |     "sum_y2 = exm['y2'].sum()\n",
324 |     "sum_y_2 = sum_y * sum_y\n",
325 |     "product_sum_y2_elems = elems * sum_y2\n",
326 |     "bottom_right = product_sum_y2_elems - sum_y_2"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "markdown",
331 |    "metadata": {
332 |     "collapsed": true
333 |    },
334 |    "source": [
335 |     "We can short circuit the math a bit and multiply `bottom_left` by `bottom_right` and then take the square root of that."
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": null,
341 |    "metadata": {
342 |     "collapsed": true
343 |    },
344 |    "outputs": [],
345 |    "source": [
346 |     "bottom = sqrt(bottom_left * bottom_right)"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "### Finding our correlation"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": null,
359 |    "metadata": {
360 |     "collapsed": true
361 |    },
362 |    "outputs": [],
363 |    "source": [
364 |     "r = top/bottom\n",
365 |     "print(f\"{round(r.real, 4)}\")"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "markdown",
370 |    "metadata": {},
371 |    "source": [
372 |     "We have our correlation! Let's see if we were accurate by comparing to the pearsonr library method in available in `scipy.stats`."
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {
379 |     "collapsed": true
380 |    },
381 |    "outputs": [],
382 |    "source": [
383 |     "p = pearsonr(exm['rating'], exm['votes'])\n",
384 |     "print(f\"{round(p[0], 4)}\")"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "markdown",
389 |    "metadata": {},
390 |    "source": [
391 |     "## Within MongoDB\n",
392 |     "\n",
393 |     "Excellent! We can see that we're getting the same results as the library function.\n",
394 |     "\n",
395 |     "However, this is slower than it needs to be, and we can do all of this work within the aggregation framework!\n",
396 |     "\n",
397 |     "We'll calculate the same variables we did before within a pipeline, and assign it to a key called `m`."
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {
404 |     "collapsed": true
405 |    },
406 |    "outputs": [],
407 |    "source": [
408 |     "X = '$imdb.rating'\n",
409 |     "Y = '$imdb.votes'"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": null,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": [
420 |     "elems = { \"$sum\": 1 }\n",
421 |     "sum_x = { \"$sum\": X }\n",
422 |     "sum_y = { \"$sum\": Y }\n",
423 |     "sum_x2 = { \"$sum\": { \"$multiply\": [X, X] } }\n",
424 |     "sum_y2 = { \"$sum\": { \"$multiply\": [Y, Y] } }\n",
425 |     "sum_xy = { \"$sum\": { \"$multiply\": [X, Y] } }\n",
426 |     "\n",
427 |     "all_sums = {\n",
428 |     "    \"$group\": {\n",
429 |     "        \"_id\": None,\n",
430 |     "        \"elems\": elems,\n",
431 |     "        \"sum_x\": sum_x,\n",
432 |     "        \"sum_y\": sum_y,\n",
433 |     "        \"sum_x2\": sum_x2,\n",
434 |     "        \"sum_y2\": sum_y2,\n",
435 |     "        \"sum_xy\": sum_xy\n",
436 |     "    }\n",
437 |     "}"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "metadata": {
444 |     "collapsed": true
445 |    },
446 |    "outputs": [],
447 |    "source": [
448 |     "product_sum_x_sum_y = { \"$multiply\": [\"$sum_x\", \"$sum_y\"] }\n",
449 |     "product_sum_xy_elems = { \"$multiply\": [\"$sum_xy\", \"$elems\"] }\n",
450 |     "top = { \"$subtract\": [ product_sum_xy_elems, product_sum_x_sum_y]}"
451 |    ]
452 |   },
453 |   {
454 |    "cell_type": "code",
455 |    "execution_count": null,
456 |    "metadata": {
457 |     "collapsed": true
458 |    },
459 |    "outputs": [],
460 |    "source": [
461 |     "product_sum_x2_elems = { \"$multiply\": [\"$sum_x2\", \"$elems\"] }\n",
462 |     "sum_x_2 = { \"$multiply\": [\"$sum_x\", \"$sum_x\"] }\n",
463 |     "bottom_left = { \"$subtract\": [ product_sum_x2_elems, sum_x_2]}"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "metadata": {
470 |     "collapsed": true
471 |    },
472 |    "outputs": [],
473 |    "source": [
474 |     "product_sum_y2_elems = { \"$multiply\": [\"$sum_y2\", \"$elems\"] }\n",
475 |     "sum_y_2 = { \"$multiply\": [\"$sum_y\", \"$sum_y\"] }\n",
476 |     "bottom_right = { \"$subtract\": [product_sum_y2_elems, sum_y_2] }"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": null,
482 |    "metadata": {
483 |     "collapsed": true
484 |    },
485 |    "outputs": [],
486 |    "source": [
487 |     "bottom = { \"$sqrt\": { \"$multiply\": [bottom_left, bottom_right] } }\n",
488 |     "correlation = { \n",
489 |     "    \"$project\": {\n",
490 |     "        \"m\": { \"$divide\": [top, bottom] }\n",
491 |     "    }\n",
492 |     "}"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": null,
498 |    "metadata": {
499 |     "collapsed": true
500 |    },
501 |    "outputs": [],
502 |    "source": [
503 |     "pipeline = [\n",
504 |     "    {\n",
505 |     "        \"$match\": {\n",
506 |     "            \"imdb.rating\": { \"$gt\": 0 },\n",
507 |     "            \"imdb.votes\": { \"$gt\": 0}\n",
508 |     "        }\n",
509 |     "    },\n",
510 |     "    all_sums,\n",
511 |     "    correlation\n",
512 |     "]"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": null,
518 |    "metadata": {
519 |     "collapsed": true
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "result = list(movies.aggregate(pipeline))\n",
524 |     "print(f\"\"\"\n",
525 |     "r = {round(r.real, 4)} (calculated by hand)\n",
526 |     "p = {round(p[0], 4)} (from scipy)\n",
527 |     "m = {round(result[0]['m'], 4)} (from MongoDB)\n",
528 |     "\"\"\")"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "code",
533 |    "execution_count": null,
534 |    "metadata": {
535 |     "collapsed": true
536 |    },
537 |    "outputs": [],
538 |    "source": []
539 |   }
540 |  ],
541 |  "metadata": {
542 |   "kernelspec": {
543 |    "display_name": "Python 3",
544 |    "language": "python",
545 |    "name": "python3"
546 |   },
547 |   "language_info": {
548 |    "codemirror_mode": {
549 |     "name": "ipython",
550 |     "version": 3
551 |    },
552 |    "file_extension": ".py",
553 |    "mimetype": "text/x-python",
554 |    "name": "python",
555 |    "nbconvert_exporter": "python",
556 |    "pygments_lexer": "ipython3",
557 |    "version": "3.6.5"
558 |   }
559 |  },
560 |  "nbformat": 4,
561 |  "nbformat_minor": 2
562 | }
563 | 


--------------------------------------------------------------------------------
/LessonNotes/tree_like__lesson.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "%%capture\n",
 12 |     "# installing necessary dependencies and importing\n",
 13 |     "!pip install numpy pandas ete3\n",
 14 |     "import numpy as np\n",
 15 |     "import pandas as pd\n",
 16 |     "import seaborn as sns\n",
 17 |     "from pymongo import MongoClient\n",
 18 |     "from ete3 import Tree, TreeStyle, TextFace, add_face_to_node\n",
 19 |     "import json\n",
 20 |     "from IPython.display import Image\n",
 21 |     "import pprint"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {
 28 |     "collapsed": true
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "course_cluster_uri = \"mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin\"\n",
 33 |     "course_client = MongoClient(course_cluster_uri)\n",
 34 |     "products = course_client['coursera-agg']['product_categories']"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "# let's just get the products involving cats\n",
 44 |     "df = pd.DataFrame.from_dict(list(products.find({\"name\": { \"$regex\": \"^cat \", \"$options\": 'i' } }, {\"_id\": 0})))\n",
 45 |     "df"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "# Tree visualizations using etetoolkit, www.etetoolkit.org\n",
 55 |     "# You can read in depth about newick trees there!\n",
 56 |     "current_view = []\n",
 57 |     "for name in df['name']:\n",
 58 |     "    dat = df.loc[df['name'] == name].values\n",
 59 |     "    z = f\"({dat[0, 0]}){dat[0, 1]}\"\n",
 60 |     "    current_view.append(z)\n",
 61 |     "    \n",
 62 |     "q = ','.join(current_view)\n",
 63 |     "a = f\"({q});\"\n",
 64 |     "t = Tree(a, format=1)\n",
 65 |     "ts = TreeStyle()\n",
 66 |     "ts.show_leaf_name = False\n",
 67 |     "def my_layout(node):\n",
 68 |     "        F = TextFace(node.name, tight_text=True)\n",
 69 |     "        add_face_to_node(F, node, column=0, position=\"branch-right\")\n",
 70 |     "ts.layout_fn = my_layout\n",
 71 |     "t.render('%%inline', tree_style=ts)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "preferred_view = []\n",
 81 |     "\n",
 82 |     "\n",
 83 |     "dat = df.loc[df['parent'] == 'Cat Supplies'].values\n",
 84 |     "for i in range(0, len(dat)):\n",
 85 |     "    row = f\"({dat[i, 0]})\"\n",
 86 |     "    preferred_view.append(row)\n",
 87 |     "    \n",
 88 |     "q = ','.join(preferred_view)\n",
 89 |     "a = f\"((({q})Cat Supplies)Pet Supplies);\"\n",
 90 |     "t = Tree(a, format=1)\n",
 91 |     "ts = TreeStyle()\n",
 92 |     "ts.show_leaf_name = False\n",
 93 |     "ts.layout_fn = my_layout\n",
 94 |     "t.render('%%inline', tree_style=ts)"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {
101 |     "collapsed": true
102 |    },
103 |    "outputs": [],
104 |    "source": [
105 |     "just_cat_toys = [\n",
106 |     "    {\n",
107 |     "        \"$match\": { \"name\": \"Cat Toys\"}\n",
108 |     "    },\n",
109 |     "    {\n",
110 |     "        \"$graphLookup\": {\n",
111 |     "            \"from\": \"product_categories\",\n",
112 |     "            \"startWith\": \"$name\",\n",
113 |     "            \"connectFromField\": \"parent\",\n",
114 |     "            \"connectToField\": \"name\",\n",
115 |     "            \"as\": \"ancestors\",\n",
116 |     "        }\n",
117 |     "    },\n",
118 |     "    {\n",
119 |     "        \"$project\": { \"_id\": 0 }\n",
120 |     "    }\n",
121 |     "]\n",
122 |     "\n",
123 |     "cat_toy_lineage = list(products.aggregate(just_cat_toys))"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "pprint.pprint(cat_toy_lineage)"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {
139 |     "collapsed": true
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "just_cat_toys_ancestry = [\n",
144 |     "    {\n",
145 |     "        \"$match\": { \"name\": \"Cat Toys\"}\n",
146 |     "    },\n",
147 |     "    {\n",
148 |     "        \"$graphLookup\": {\n",
149 |     "            \"from\": \"product_categories\",\n",
150 |     "            \"startWith\": \"$name\",\n",
151 |     "            \"connectFromField\": \"parent\",\n",
152 |     "            \"connectToField\": \"name\",\n",
153 |     "            \"as\": \"ancestors\",\n",
154 |     "        }\n",
155 |     "    },\n",
156 |     "    {\n",
157 |     "        \"$project\": {\n",
158 |     "            \"_id\": 0,\n",
159 |     "            \"name\": 1,\n",
160 |     "            \"ancestors\": {\n",
161 |     "                \"$setDifference\": [\"$ancestors.parent\", [\"$name\", None]]\n",
162 |     "            },\n",
163 |     "            \"parent\": 1\n",
164 |     "        }\n",
165 |     "    }\n",
166 |     "]"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "print(json.dumps(list(products.aggregate(just_cat_toys_ancestry)), indent=4))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {
182 |     "collapsed": true
183 |    },
184 |    "outputs": [],
185 |    "source": [
186 |     "just_cat_toys_unwound = [\n",
187 |     "    {\n",
188 |     "        \"$match\": { \"name\": \"Cat Toys\"}\n",
189 |     "    },\n",
190 |     "    {\n",
191 |     "        \"$graphLookup\": {\n",
192 |     "            \"from\": \"product_categories\",\n",
193 |     "            \"startWith\": \"$name\",\n",
194 |     "            \"connectFromField\": \"parent\",\n",
195 |     "            \"connectToField\": \"name\",\n",
196 |     "            \"as\": \"ancestors\",\n",
197 |     "        }\n",
198 |     "    },\n",
199 |     "    {\n",
200 |     "        \"$unwind\": \"$ancestors\"\n",
201 |     "    },\n",
202 |     "    {\n",
203 |     "        \"$project\": { \"_id\": 0, \"ancestors._id\": 0 }\n",
204 |     "    }\n",
205 |     "]"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "print(json.dumps(list(products.aggregate(just_cat_toys_unwound)), indent=4))"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "descendants = [\n",
224 |     "    {\n",
225 |     "        \"$match\": { \"name\": \"Cat Toys\"}\n",
226 |     "    },\n",
227 |     "    {\n",
228 |     "        \"$graphLookup\": {\n",
229 |     "            \"from\": \"product_categories\",\n",
230 |     "            \"startWith\": \"$name\",\n",
231 |     "            \"connectFromField\": \"parent\",\n",
232 |     "            \"connectToField\": \"name\",\n",
233 |     "            \"as\": \"ancestors\",\n",
234 |     "        }\n",
235 |     "    },\n",
236 |     "    {\n",
237 |     "        \"$unwind\": \"$ancestors\"\n",
238 |     "    },\n",
239 |     "    {\n",
240 |     "        \"$group\": {\n",
241 |     "            \"_id\": \"$ancestors.name\",\n",
242 |     "            \"descendants\": { \n",
243 |     "                \"$addToSet\": \"$name\"\n",
244 |     "            }\n",
245 |     "        }\n",
246 |     "    }\n",
247 |     "]\n",
248 |     "print(json.dumps(list(products.aggregate(descendants)), indent=4))"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "descendants_pet_supplies = [\n",
258 |     "    {\n",
259 |     "        \"$match\": {\n",
260 |     "            \"name\": { \"$regex\": \"^cat \", \"$options\": 'i'  },\n",
261 |     "            \"parent\": { \"$in\": [\"Cat Supplies\", \"Pet Supplies\"]}\n",
262 |     "        }\n",
263 |     "    },\n",
264 |     "    {\n",
265 |     "        \"$graphLookup\": {\n",
266 |     "            \"from\": \"product_categories\",\n",
267 |     "            \"startWith\": \"$name\",\n",
268 |     "            \"connectFromField\": \"parent\",\n",
269 |     "            \"connectToField\": \"name\",\n",
270 |     "            \"as\": \"ancestors\",\n",
271 |     "        }\n",
272 |     "    },\n",
273 |     "    {\n",
274 |     "        \"$unwind\": \"$ancestors\"\n",
275 |     "    },\n",
276 |     "    {\n",
277 |     "        \"$group\": {\n",
278 |     "            \"_id\": \"$ancestors.name\",\n",
279 |     "            \"descendants\": { \n",
280 |     "                \"$addToSet\": {\n",
281 |     "                    \"name\": \"$name\",\n",
282 |     "                    \"parent\": \"$parent\"\n",
283 |     "                }\n",
284 |     "            }\n",
285 |     "        }\n",
286 |     "    },\n",
287 |     "    {\n",
288 |     "        \"$match\": {\n",
289 |     "            \"_id\": { \"$regex\": \"^cat |^pet \", \"$options\": \"i\" }\n",
290 |     "        }\n",
291 |     "    }\n",
292 |     "]\n",
293 |     "print(json.dumps(list(products.aggregate(descendants_pet_supplies)), indent=4))"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {
300 |     "collapsed": true
301 |    },
302 |    "outputs": [],
303 |    "source": [
304 |     "pipeline_children = [\n",
305 |     "    {\n",
306 |     "        \"$graphLookup\": {\n",
307 |     "            \"from\": \"product_categories\",\n",
308 |     "            \"startWith\": \"$name\",\n",
309 |     "            \"connectFromField\": \"parent\",\n",
310 |     "            \"connectToField\": \"name\",\n",
311 |     "            \"as\": \"ancestors\",\n",
312 |     "        }\n",
313 |     "    },\n",
314 |     "    {\n",
315 |     "        \"$unwind\": \"$ancestors\"\n",
316 |     "    },\n",
317 |     "    {\n",
318 |     "        \"$group\": {\n",
319 |     "            \"_id\": \"$ancestors.name\",\n",
320 |     "            \"descendants\": { \n",
321 |     "                \"$addToSet\": {\n",
322 |     "                    \"name\": \"$name\",\n",
323 |     "                    \"parent\": \"$parent\"\n",
324 |     "                }\n",
325 |     "            }\n",
326 |     "        }\n",
327 |     "    },\n",
328 |     "    {\n",
329 |     "        \"$addFields\": {\n",
330 |     "            \"descendants\": {\n",
331 |     "                \"$setDifference\": [ \"$descendants.name\", [\"$_id\"]]\n",
332 |     "            },\n",
333 |     "            \"children\": {\n",
334 |     "                \"$map\": {\n",
335 |     "                    \"input\": {\n",
336 |     "                        \"$filter\": {\n",
337 |     "                            \"input\": \"$descendants\",\n",
338 |     "                            \"cond\": {\n",
339 |     "                                \"$eq\": [\"$_id\", \"$$this.parent\"]\n",
340 |     "                            }\n",
341 |     "                        }\n",
342 |     "                    },\n",
343 |     "                    \"in\": \"$$this.name\"\n",
344 |     "                } \n",
345 |     "            }\n",
346 |     "        }\n",
347 |     "    }\n",
348 |     "]"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": null,
354 |    "metadata": {},
355 |    "outputs": [],
356 |    "source": [
357 |     "pipeline = pipeline_children.copy()\n",
358 |     "pipeline.insert(0, {\n",
359 |     "    \"$match\": {\n",
360 |     "        \"name\": { \"$regex\": \"^cat \", \"$options\": 'i'  },\n",
361 |     "        \"parent\": { \"$in\": [\"Cat Supplies\", \"Pet Supplies\"]}\n",
362 |     "    }\n",
363 |     "})\n",
364 |     "pipeline.append({\n",
365 |     "    \"$match\": {\n",
366 |     "        \"_id\": { \"$regex\": \"^cat |^pet \", \"$options\": \"i\" }\n",
367 |     "    }\n",
368 |     "})\n",
369 |     "tree = list(products.aggregate(pipeline))\n",
370 |     "print(json.dumps(tree, indent=4))"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {
377 |     "collapsed": true
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "def descend_tree(node=None, data=None):\n",
382 |     "    \"\"\"\n",
383 |     "    This function descends a tree from a given node. The node name and dataframe are required\n",
384 |     "    \"\"\"\n",
385 |     "    if node is None or data is None or data.empty:\n",
386 |     "        raise(f\"Both node and data are required. Got {node} for node and {data} for data\")\n",
387 |     "    try:\n",
388 |     "        children = data.loc[data['name'] == node]['children'].values[0]\n",
389 |     "        if not children:\n",
390 |     "            return '(' + node + ')'\n",
391 |     "        else:\n",
392 |     "            return '(' + ','.join([descend_tree(child, data) for child in children]) + ')' + node\n",
393 |     "    except:\n",
394 |     "        return '(' + node + ')'\n"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "code",
399 |    "execution_count": null,
400 |    "metadata": {},
401 |    "outputs": [],
402 |    "source": [
403 |     "df1 = pd.DataFrame.from_dict(tree)\n",
404 |     "df1['name'] = df1['_id']\n",
405 |     "df1.drop('_id', axis=1, inplace=True)\n",
406 |     "t = Tree(f\"{descend_tree('Pet Supplies', df1)};\", format=1)\n",
407 |     "ts = TreeStyle()\n",
408 |     "ts.show_leaf_name = False\n",
409 |     "def my_layout(node):\n",
410 |     "        F = TextFace(node.name, tight_text=True)\n",
411 |     "        add_face_to_node(F, node, column=0, position=\"branch-right\")\n",
412 |     "ts.layout_fn = my_layout\n",
413 |     "t.render('%%inline', tree_style=ts)"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": null,
419 |    "metadata": {
420 |     "collapsed": true
421 |    },
422 |    "outputs": [],
423 |    "source": [
424 |     "pipeline_parents = [\n",
425 |     "    {\n",
426 |     "        \"$graphLookup\": {\n",
427 |     "            \"from\": \"product_categories\",\n",
428 |     "            \"startWith\": \"$name\",\n",
429 |     "            \"connectFromField\": \"parent\",\n",
430 |     "            \"connectToField\": \"name\",\n",
431 |     "            \"as\": \"ancestors\",\n",
432 |     "        }\n",
433 |     "    },\n",
434 |     "    {\n",
435 |     "        \"$project\": {\n",
436 |     "            \"name\": 1,\n",
437 |     "            \"ancestors\": {\n",
438 |     "                \"$setDifference\": [\"$ancestors.parent\", [\"$name\", None]]\n",
439 |     "            },\n",
440 |     "            \"parent\": 1\n",
441 |     "        }\n",
442 |     "    }\n",
443 |     "]"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": null,
449 |    "metadata": {
450 |     "collapsed": true
451 |    },
452 |    "outputs": [],
453 |    "source": [
454 |     "full_tree = [\n",
455 |     "    {\n",
456 |     "        \"$facet\": {\n",
457 |     "            \"child_tree\": pipeline_children,\n",
458 |     "            \"parent_tree\": pipeline_parents\n",
459 |     "        }\n",
460 |     "    },\n",
461 |     "    {\n",
462 |     "        \"$unwind\": \"$parent_tree\"\n",
463 |     "    },\n",
464 |     "    {\n",
465 |     "        \"$project\": {\n",
466 |     "            \"own_child_tree\": {\n",
467 |     "                \"$arrayElemAt\": [\n",
468 |     "                    {\n",
469 |     "                        \"$filter\": {\n",
470 |     "                            \"input\": \"$child_tree\",\n",
471 |     "                            \"cond\": {\n",
472 |     "                                \"$eq\": [\"$$this._id\", \"$parent_tree.name\"]\n",
473 |     "                            }\n",
474 |     "                        }\n",
475 |     "                    },\n",
476 |     "                    0\n",
477 |     "                ]\n",
478 |     "            },\n",
479 |     "            \"name\": \"$parent_tree.name\",\n",
480 |     "            \"parent\": \"$parent_tree.parent\",\n",
481 |     "            \"ancestors\": \"$parent_tree.ancestors\",\n",
482 |     "        }\n",
483 |     "    },\n",
484 |     "    {\n",
485 |     "        \"$addFields\": {\n",
486 |     "            \"children\": \"$own_child_tree.children\",\n",
487 |     "            \"descendants\": \"$own_child_tree.descendants\"\n",
488 |     "        }\n",
489 |     "    },\n",
490 |     "    {\n",
491 |     "        \"$addFields\": {\n",
492 |     "            \"num_children\": { \"$size\": \"$children\" },\n",
493 |     "            \"num_descendants\": { \"$size\": \"$descendants\" },\n",
494 |     "            \"num_ancestors\": { \"$size\": \"$ancestors\" }\n",
495 |     "        }\n",
496 |     "    },\n",
497 |     "    {\n",
498 |     "        \"$project\": { \"own_child_tree\": 0 }\n",
499 |     "    },\n",
500 |     "    {\n",
501 |     "        \"$sort\": { \"num_descendants\": -1 }\n",
502 |     "    }\n",
503 |     "]"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": null,
509 |    "metadata": {
510 |     "collapsed": true
511 |    },
512 |    "outputs": [],
513 |    "source": [
514 |     "df = pd.DataFrame.from_dict(list(products.aggregate(full_tree)))"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": null,
520 |    "metadata": {},
521 |    "outputs": [],
522 |    "source": [
523 |     "df.head(10)"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": null,
529 |    "metadata": {
530 |     "collapsed": true
531 |    },
532 |    "outputs": [],
533 |    "source": [
534 |     "%%capture\n",
535 |     "tr = Tree(f\"{descend_tree('Pet Supplies', df)};\", format=1)\n",
536 |     "ts = TreeStyle()\n",
537 |     "ts.show_branch_length = False\n",
538 |     "ts.show_branch_support = False\n",
539 |     "ts.show_leaf_name = False\n",
540 |     "ts.mode = 'c'\n",
541 |     "ts.layout_fn = my_layout"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "code",
546 |    "execution_count": null,
547 |    "metadata": {},
548 |    "outputs": [],
549 |    "source": [
550 |     "tr.render('%%inline', tree_style=ts)"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": null,
556 |    "metadata": {
557 |     "collapsed": true
558 |    },
559 |    "outputs": [],
560 |    "source": [
561 |     "def is_descendant(candidate=None, of=None, data=None):\n",
562 |     "    assert isinstance(candidate, str), \"candidate is required and must be of type str\"\n",
563 |     "    assert isinstance(of, str), \"of is required and must be of type str\"\n",
564 |     "    assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n",
565 |     "    try:\n",
566 |     "        return candidate in data.loc[data['name'] == of]['descendants'].values[0]\n",
567 |     "    except:\n",
568 |     "        return False\n",
569 |     "\n",
570 |     "def is_ancestor(candidate=None, of=None, data=None):\n",
571 |     "    assert isinstance(candidate, str), \"candidate is required and must be of type str\"\n",
572 |     "    assert isinstance(of, str), \"of is required and must be of type str\"\n",
573 |     "    assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n",
574 |     "    try:\n",
575 |     "        return candidate in data.loc[data['name'] == of]['ancestors'].values[0]\n",
576 |     "    except:\n",
577 |     "        return False\n",
578 |     "\n",
579 |     "def common_ancestor(node_1=None, node_2=None, data=None):\n",
580 |     "    assert isinstance(node_1, str), \"candidate is required and must be of type str\"\n",
581 |     "    assert isinstance(node_2, str), \"of is required and must be of type str\"\n",
582 |     "    assert isinstance(data, pd.DataFrame), \"data is required and must be of a pandas DataFrame\"\n",
583 |     "\n",
584 |     "    try:\n",
585 |     "        node_1_loc = data.loc[data['name'] == node_1]\n",
586 |     "        node_2_loc = data.loc[data['name'] == node_2]\n",
587 |     "\n",
588 |     "        if node_1 == node_2:\n",
589 |     "            return node_1\n",
590 |     "       \n",
591 |     "        if is_ancestor(node_1, node_2, data):\n",
592 |     "            return node_1\n",
593 |     "        if is_descendant(node_1, node_2, data):\n",
594 |     "            return node_2\n",
595 |     "        \n",
596 |     "        node_1_parent = node_1_loc['parent'].values[0]\n",
597 |     "        node_2_parent = node_2_loc['parent'].values[0]\n",
598 |     "        \n",
599 |     "        if node_1_parent == node_2_parent:\n",
600 |     "            return node_1_parent\n",
601 |     "        \n",
602 |     "        return common_ancestor(node_1_parent, node_2_parent, data)\n",
603 |     "    except:\n",
604 |     "        return \"no common ancestor found\""
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "code",
609 |    "execution_count": null,
610 |    "metadata": {},
611 |    "outputs": [],
612 |    "source": [
613 |     "is_descendant('Bird Cage Food & Water Dishes', 'Pet Supplies', df)"
614 |    ]
615 |   },
616 |   {
617 |    "cell_type": "code",
618 |    "execution_count": null,
619 |    "metadata": {},
620 |    "outputs": [],
621 |    "source": [
622 |     "is_ancestor('Pet Supplies', 'Bird Cage Accessories', df)"
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "code",
627 |    "execution_count": null,
628 |    "metadata": {},
629 |    "outputs": [],
630 |    "source": [
631 |     "common_ancestor('Small Animal Food', 'Pet Food Containers', df)"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "code",
636 |    "execution_count": null,
637 |    "metadata": {
638 |     "collapsed": true
639 |    },
640 |    "outputs": [],
641 |    "source": [
642 |     "%%capture\n",
643 |     "root_nodes = df.loc[df['num_ancestors'] == 0]['name'].values.tolist()\n",
644 |     "the_product_tree = '(' + ','.join([descend_tree(root, df) for root in root_nodes]) + ')' + ';'\n",
645 |     "t = Tree(the_product_tree, format=1)\n",
646 |     "ts = TreeStyle()\n",
647 |     "ts.show_branch_length = False\n",
648 |     "ts.show_branch_support = False\n",
649 |     "ts.layout_fn = my_layout\n",
650 |     "# uncomment the line below to generate the entire product graph\n",
651 |     "# t.render('product_tree.png', tree_style=ts)\n",
652 |     "\"\"\"\n",
653 |     "Open the local file \"product_tree.png\" at your own risk!\n",
654 |     "\"\"\""
655 |    ]
656 |   },
657 |   {
658 |    "cell_type": "code",
659 |    "execution_count": null,
660 |    "metadata": {
661 |     "collapsed": true
662 |    },
663 |    "outputs": [],
664 |    "source": []
665 |   }
666 |  ],
667 |  "metadata": {
668 |   "kernelspec": {
669 |    "display_name": "Python 3",
670 |    "language": "python",
671 |    "name": "python3"
672 |   },
673 |   "language_info": {
674 |    "codemirror_mode": {
675 |     "name": "ipython",
676 |     "version": 3
677 |    },
678 |    "file_extension": ".py",
679 |    "mimetype": "text/x-python",
680 |    "name": "python",
681 |    "nbconvert_exporter": "python",
682 |    "pygments_lexer": "ipython3",
683 |    "version": "3.6.3"
684 |   }
685 |  },
686 |  "nbformat": 4,
687 |  "nbformat_minor": 2
688 | }
689 | 


--------------------------------------------------------------------------------