├── .gitignore ├── Amazon_Reviews_ETL.ipynb ├── README.md ├── Vine_Review_Analysis.ipynb ├── images ├── cloud_etl.png ├── customer_table.png ├── products_table.png ├── review_data.png ├── review_id_table.png ├── unpaid_reviews.png ├── vine_df.png └── vine_reviews.png └── table_schema.sql /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Amazon_Reviews_ETL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/" 9 | }, 10 | "id": "V58rxea0HqSa", 11 | "outputId": "40bc4b96-d39c-460a-f1d0-57ca01e340eb" 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "\r", 19 | "0% [Working]\r", 20 | " \r", 21 | "Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]\n", 22 | "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", 23 | "Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease\n", 24 | "Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n", 25 | "Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", 26 | "Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", 27 | "Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n", 28 | "Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", 29 | "Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n", 30 | "Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", 31 | "Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", 32 | "Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", 33 | "Get:14 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,861 kB]\n", 34 | "Get:15 http://archive.ubuntu.com/ubuntu bionic-updates/multiverse amd64 Packages [29.8 kB]\n", 35 | "Get:16 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,297 kB]\n", 36 | "Get:17 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [22.8 kB]\n", 37 | "Get:18 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [1,006 kB]\n", 38 | "Get:19 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,521 kB]\n", 39 | "Get:20 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,294 kB]\n", 40 | "Get:21 http://archive.ubuntu.com/ubuntu bionic-updates/restricted amd64 Packages [1,040 kB]\n", 41 | "Get:22 http://archive.ubuntu.com/ubuntu bionic-backports/main amd64 Packages [12.2 kB]\n", 42 | "Fetched 12.3 MB in 3s (4,000 kB/s)\n", 43 | "Reading package lists... Done\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "import os\n", 49 | "# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version\n", 50 | "# For example:\n", 51 | "# spark_version = 'spark-3.0.3'\n", 52 | "spark_version = 'spark-3.0.3'\n", 53 | "os.environ['SPARK_VERSION']=spark_version\n", 54 | "\n", 55 | "# Install Spark and Java\n", 56 | "!apt-get update\n", 57 | "!apt-get install openjdk-11-jdk-headless -qq > /dev/null\n", 58 | "!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz\n", 59 | "!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz\n", 60 | "!pip install -q findspark\n", 61 | "\n", 62 | "# Set Environment Variables\n", 63 | "import os\n", 64 | "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n", 65 | "os.environ[\"SPARK_HOME\"] = f\"/content/{spark_version}-bin-hadoop2.7\"\n", 66 | "\n", 67 | "# Start a SparkSession\n", 68 | "import findspark\n", 69 | "findspark.init()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": { 76 | "colab": { 77 | "base_uri": "https://localhost:8080/" 78 | }, 79 | "id": "_xKwTpATHqSe", 80 | "outputId": "c231e860-8ba8-4ebd-c401-7c2cf8a9f2aa" 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "--2022-06-18 00:11:50-- https://jdbc.postgresql.org/download/postgresql-42.2.16.jar\n", 88 | "Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228\n", 89 | "Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.\n", 90 | "HTTP request sent, awaiting response... 200 OK\n", 91 | "Length: 1002883 (979K) [application/java-archive]\n", 92 | "Saving to: ‘postgresql-42.2.16.jar’\n", 93 | "\n", 94 | "postgresql-42.2.16. 100%[===================>] 979.38K 5.02MB/s in 0.2s \n", 95 | "\n", 96 | "2022-06-18 00:11:51 (5.02 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]\n", 97 | "\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "# Download the Postgres driver that will allow Spark to interact with Postgres.\n", 103 | "!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "metadata": { 110 | "id": "MMqDAjVS0KN9" 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "from pyspark.sql import SparkSession\n", 115 | "spark = SparkSession.builder.appName(\"M16-Amazon-Challenge\").config(\"spark.driver.extraClassPath\",\"/content/postgresql-42.2.16.jar\").getOrCreate()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "cyBsySGuY-9V" 122 | }, 123 | "source": [ 124 | "### Load Amazon Data into Spark DataFrame" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "colab": { 132 | "base_uri": "https://localhost:8080/" 133 | }, 134 | "id": "CtCmBhQJY-9Z", 135 | "outputId": "e9a4a16e-6c80-478e-acd4-e48e9b65d99e" 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 143 | "|marketplace|customer_id| review_id|product_id|product_parent| product_title| product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase| review_headline| review_body|review_date|\n", 144 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 145 | "| US| 179886| RY01SAV7HZ8QO|B00NTI0CQ2| 667358431|Aketek 1080P LED ...|Home Entertainment| 4| 0| 0| N| Y|good enough for m...|not the best pict...| 2015-08-31|\n", 146 | "| US| 37293769|R1XX8SDGJ4MZ4L|B00BUCLVZU| 621695622|TiVo Mini with IR...|Home Entertainment| 5| 0| 0| N| N|Tell the Cable Co...|Not only do my Ti...| 2015-08-31|\n", 147 | "| US| 8332121|R149Q3B5L33NN5|B00RBX9D5W| 143071132|Apple TV MD199LL/...|Home Entertainment| 5| 0| 0| N| Y| Works perfectly!|Works perfectly! ...| 2015-08-31|\n", 148 | "| US| 47054962|R2ZVD69Z6KPJ4O|B00UJ3IULO| 567816707|New Roku 3 6.5 Fo...|Home Entertainment| 1| 0| 2| N| Y|It doesn't work. ...|It doesn't work. ...| 2015-08-31|\n", 149 | "| US| 23413911|R1DIKG2G33ZLNP|B0037UCTXG| 909557698|Generic DVI-I Dua...|Home Entertainment| 4| 0| 0| N| Y| As pictured|I received the it...| 2015-08-31|\n", 150 | "| US| 4417771|R3L6FGKAW0EYFI|B004N866SU| 414565179|Samsung 3D LED HD...|Home Entertainment| 1| 1| 1| N| N|Only lasts 3-4 ye...|I bought this TV ...| 2015-08-31|\n", 151 | "| US| 47900707| RAO0QZH5VC6VI|B00JE6AOJS| 798450889|Jiake Wireless Bl...|Home Entertainment| 1| 0| 0| N| Y| One Star| Waste of $$$$$$$| 2015-08-31|\n", 152 | "| US| 34112894|R25IK0UAHWNB22|B00COL0B7A| 777554234|3pcs/lot 3in1 3ft...|Home Entertainment| 3| 0| 0| N| Y| Three Stars|Nice but all thre...| 2015-08-31|\n", 153 | "| US| 20691979|R2A9IHKZMTMAL1|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 1| 2| N| Y|Yes...exactly wha...|Oh, yeah...doesn'...| 2015-08-31|\n", 154 | "| US| 25983343| R5XVKTHL6SITI|B00UNL2MUW| 164482798|VIZIO S2920W-C0B ...|Home Entertainment| 5| 0| 0| N| Y|Fantastic sound. ...|Fantastic sound. ...| 2015-08-31|\n", 155 | "| US| 35816068|R2QZZOSTDDY1IE|B00RIC9JB4| 184834831|Hitachi 49\" Class...|Home Entertainment| 3| 3| 6| N| Y| risky|Great tv but Tv d...| 2015-08-31|\n", 156 | "| US| 10628020|R38CUDCFPSNYTD|B00HPMCO6O| 444378461|Sony BDPS5200 3D ...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|EXCELLENT SERVICE...| 2015-08-31|\n", 157 | "| US| 9059625| RM6ZR6NH052YH|B004QGXWS6| 770226547|Sylvania 7-Inch T...|Home Entertainment| 3| 1| 2| N| Y|They worked great...|They worked great...| 2015-08-31|\n", 158 | "| US| 2681147| RUQK5N4WH8UN8|B00FO12XY6| 448806082|Roku HD Streaming...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|Smooth and worked...| 2015-08-31|\n", 159 | "| US| 33449922|R21LWSBQWWJYZ3|B00BD7UVO4| 374427271|LG Electronics BP...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| Works very well| 2015-08-31|\n", 160 | "| US| 43069144| R8W5S53RQ2DF7|B00TRQPEYK| 614207013|LG Electronics LF...|Home Entertainment| 5| 0| 0| N| Y| LG TV|The TV arrived qu...| 2015-08-31|\n", 161 | "| US| 46780686|R3ENME3JQBWXZS|B005STXQG8| 689442799|SquareTrade TV Pr...|Home Entertainment| 5| 0| 1| N| Y| Five Stars|well worth the mo...| 2015-08-31|\n", 162 | "| US| 49037595|R3URL5K7DHHYK7|B00BEL11RA| 910670994|Cambridge - Azur ...|Home Entertainment| 5| 1| 1| N| Y| Love it.|Great Blu-ray pla...| 2015-08-31|\n", 163 | "| US| 27868511|R22YISZKS35YZX|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 2| 3| N| Y|If your on the fe...|Great media cente...| 2015-08-31|\n", 164 | "| US| 3004043|R3IIOLWHWC297U|B00MWCJ8VQ| 946374680|Roku 3500XB Strea...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| great product| 2015-08-31|\n", 165 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 166 | "only showing top 20 rows\n", 167 | "\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from pyspark import SparkFiles\n", 173 | "url = \"https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz\"\n", 174 | "spark.sparkContext.addFile(url)\n", 175 | "df = spark.read.option(\"encoding\", \"UTF-8\").csv(SparkFiles.get(\"\"), sep=\"\\t\", header=True, inferSchema=True)\n", 176 | "df.show()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "id": "2yUSe55VY-9t" 183 | }, 184 | "source": [ 185 | "### Create DataFrames to match tables" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": { 192 | "colab": { 193 | "base_uri": "https://localhost:8080/" 194 | }, 195 | "id": "C8REmY1aY-9u", 196 | "outputId": "1d0f30c0-eedc-405c-b9e2-5585a7185c09" 197 | }, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 204 | "|marketplace|customer_id| review_id|product_id|product_parent| product_title| product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase| review_headline| review_body|review_date|\n", 205 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 206 | "| US| 179886| RY01SAV7HZ8QO|B00NTI0CQ2| 667358431|Aketek 1080P LED ...|Home Entertainment| 4| 0| 0| N| Y|good enough for m...|not the best pict...| 2015-08-31|\n", 207 | "| US| 37293769|R1XX8SDGJ4MZ4L|B00BUCLVZU| 621695622|TiVo Mini with IR...|Home Entertainment| 5| 0| 0| N| N|Tell the Cable Co...|Not only do my Ti...| 2015-08-31|\n", 208 | "| US| 8332121|R149Q3B5L33NN5|B00RBX9D5W| 143071132|Apple TV MD199LL/...|Home Entertainment| 5| 0| 0| N| Y| Works perfectly!|Works perfectly! ...| 2015-08-31|\n", 209 | "| US| 47054962|R2ZVD69Z6KPJ4O|B00UJ3IULO| 567816707|New Roku 3 6.5 Fo...|Home Entertainment| 1| 0| 2| N| Y|It doesn't work. ...|It doesn't work. ...| 2015-08-31|\n", 210 | "| US| 23413911|R1DIKG2G33ZLNP|B0037UCTXG| 909557698|Generic DVI-I Dua...|Home Entertainment| 4| 0| 0| N| Y| As pictured|I received the it...| 2015-08-31|\n", 211 | "| US| 4417771|R3L6FGKAW0EYFI|B004N866SU| 414565179|Samsung 3D LED HD...|Home Entertainment| 1| 1| 1| N| N|Only lasts 3-4 ye...|I bought this TV ...| 2015-08-31|\n", 212 | "| US| 47900707| RAO0QZH5VC6VI|B00JE6AOJS| 798450889|Jiake Wireless Bl...|Home Entertainment| 1| 0| 0| N| Y| One Star| Waste of $$$$$$$| 2015-08-31|\n", 213 | "| US| 34112894|R25IK0UAHWNB22|B00COL0B7A| 777554234|3pcs/lot 3in1 3ft...|Home Entertainment| 3| 0| 0| N| Y| Three Stars|Nice but all thre...| 2015-08-31|\n", 214 | "| US| 20691979|R2A9IHKZMTMAL1|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 1| 2| N| Y|Yes...exactly wha...|Oh, yeah...doesn'...| 2015-08-31|\n", 215 | "| US| 25983343| R5XVKTHL6SITI|B00UNL2MUW| 164482798|VIZIO S2920W-C0B ...|Home Entertainment| 5| 0| 0| N| Y|Fantastic sound. ...|Fantastic sound. ...| 2015-08-31|\n", 216 | "| US| 35816068|R2QZZOSTDDY1IE|B00RIC9JB4| 184834831|Hitachi 49\" Class...|Home Entertainment| 3| 3| 6| N| Y| risky|Great tv but Tv d...| 2015-08-31|\n", 217 | "| US| 10628020|R38CUDCFPSNYTD|B00HPMCO6O| 444378461|Sony BDPS5200 3D ...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|EXCELLENT SERVICE...| 2015-08-31|\n", 218 | "| US| 9059625| RM6ZR6NH052YH|B004QGXWS6| 770226547|Sylvania 7-Inch T...|Home Entertainment| 3| 1| 2| N| Y|They worked great...|They worked great...| 2015-08-31|\n", 219 | "| US| 2681147| RUQK5N4WH8UN8|B00FO12XY6| 448806082|Roku HD Streaming...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|Smooth and worked...| 2015-08-31|\n", 220 | "| US| 33449922|R21LWSBQWWJYZ3|B00BD7UVO4| 374427271|LG Electronics BP...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| Works very well| 2015-08-31|\n", 221 | "| US| 43069144| R8W5S53RQ2DF7|B00TRQPEYK| 614207013|LG Electronics LF...|Home Entertainment| 5| 0| 0| N| Y| LG TV|The TV arrived qu...| 2015-08-31|\n", 222 | "| US| 46780686|R3ENME3JQBWXZS|B005STXQG8| 689442799|SquareTrade TV Pr...|Home Entertainment| 5| 0| 1| N| Y| Five Stars|well worth the mo...| 2015-08-31|\n", 223 | "| US| 49037595|R3URL5K7DHHYK7|B00BEL11RA| 910670994|Cambridge - Azur ...|Home Entertainment| 5| 1| 1| N| Y| Love it.|Great Blu-ray pla...| 2015-08-31|\n", 224 | "| US| 27868511|R22YISZKS35YZX|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 2| 3| N| Y|If your on the fe...|Great media cente...| 2015-08-31|\n", 225 | "| US| 3004043|R3IIOLWHWC297U|B00MWCJ8VQ| 946374680|Roku 3500XB Strea...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| great product| 2015-08-31|\n", 226 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 227 | "only showing top 20 rows\n", 228 | "\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "from pyspark.sql.functions import to_date\n", 234 | "\n", 235 | "# Read in the Review dataset as a DataFrame\n", 236 | "review_data = df\n", 237 | "\n", 238 | "review_data.show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 11, 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "B0TESUDRY-90", 249 | "outputId": "c671bf5e-a5eb-41b3-812a-e17119e4f1bf" 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "+-----------+--------------+\n", 257 | "|customer_id|customer_count|\n", 258 | "+-----------+--------------+\n", 259 | "| 10142992| 1|\n", 260 | "| 16457323| 6|\n", 261 | "| 11935383| 1|\n", 262 | "| 46277736| 1|\n", 263 | "| 13671072| 1|\n", 264 | "| 21453814| 1|\n", 265 | "| 17684885| 1|\n", 266 | "| 20415768| 1|\n", 267 | "| 15212710| 1|\n", 268 | "| 5220924| 1|\n", 269 | "| 46253451| 6|\n", 270 | "| 971908| 1|\n", 271 | "| 32829933| 1|\n", 272 | "| 51221518| 1|\n", 273 | "| 12002637| 2|\n", 274 | "| 16105308| 1|\n", 275 | "| 135867| 1|\n", 276 | "| 47425808| 1|\n", 277 | "| 43138273| 1|\n", 278 | "| 16411995| 1|\n", 279 | "+-----------+--------------+\n", 280 | "only showing top 20 rows\n", 281 | "\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "# Create the customers_table DataFrame\n", 287 | "customers_df = df.groupby(\"customer_id\").agg({\"customer_id\":\"count\"}).withColumnRenamed(\"count(customer_id)\", \"customer_count\")\n", 288 | "customers_df.show()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 13, 294 | "metadata": { 295 | "colab": { 296 | "base_uri": "https://localhost:8080/" 297 | }, 298 | "id": "4FwXA6UvY-96", 299 | "outputId": "4fe66c7a-4c34-4d45-a927-f7aa7dda3269" 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "+----------+--------------------+\n", 307 | "|product_id| product_title|\n", 308 | "+----------+--------------------+\n", 309 | "|B00N9OT6RM|Upstar 19-Inch 72...|\n", 310 | "|B007R9RUPU|Kinivo LS210 Port...|\n", 311 | "|B00U9U9AAM|Samsung J6200, SB...|\n", 312 | "|B00QCLTOQM|Panasonic TC32A40...|\n", 313 | "|B007F9XJW0|Sony DVPFX780 7-I...|\n", 314 | "|B00JA7ZQOY|Minix X7mini Andr...|\n", 315 | "|B008I641TE|SquareTrade 2-Yea...|\n", 316 | "|B00EJ5UJZ8|PowerSmart 4200mA...|\n", 317 | "|B006L8TX94|TiVo Premiere 500...|\n", 318 | "|B00R8K9ZH4|Fosmon HYBO-DUOC ...|\n", 319 | "|B001JHJK22|Mediabridge - RCA...|\n", 320 | "|B00CWEJ5BW|2-Year Electronic...|\n", 321 | "|B000BSHLLW|Ziotek Purse Lock...|\n", 322 | "|B0019OJOTE|FAVI 32-Inch 1080...|\n", 323 | "|B00MOCT3NW|BenQ MH630 1.4A 1...|\n", 324 | "|B00VWV1O3S|12V 8Ah SLA Batte...|\n", 325 | "|B00BBAG0DY|LG Electronics LA...|\n", 326 | "|B00BXF7I8I|Seiki 1080p 60Hz ...|\n", 327 | "|B0011ZOZ36|Vizio VW26LHDTV20...|\n", 328 | "|B00Q8DB4YY|Atoah MXIII TV Bo...|\n", 329 | "+----------+--------------------+\n", 330 | "only showing top 20 rows\n", 331 | "\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "# Create the products_table DataFrame and drop duplicates. \n", 337 | "products_df = df.select([\"product_id\", \"product_title\"]).drop_duplicates()\n", 338 | "products_df.show()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 16, 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "MkqyCuNQY-9-", 349 | "outputId": "0dc15924-7f79-4e57-b9ec-1da784e2071c" 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "+--------------+-----------+----------+--------------+-----------+\n", 357 | "| review_id|customer_id|product_id|product_parent|review_date|\n", 358 | "+--------------+-----------+----------+--------------+-----------+\n", 359 | "| RY01SAV7HZ8QO| 179886|B00NTI0CQ2| 667358431| 2015-08-31|\n", 360 | "|R1XX8SDGJ4MZ4L| 37293769|B00BUCLVZU| 621695622| 2015-08-31|\n", 361 | "|R149Q3B5L33NN5| 8332121|B00RBX9D5W| 143071132| 2015-08-31|\n", 362 | "|R2ZVD69Z6KPJ4O| 47054962|B00UJ3IULO| 567816707| 2015-08-31|\n", 363 | "|R1DIKG2G33ZLNP| 23413911|B0037UCTXG| 909557698| 2015-08-31|\n", 364 | "|R3L6FGKAW0EYFI| 4417771|B004N866SU| 414565179| 2015-08-31|\n", 365 | "| RAO0QZH5VC6VI| 47900707|B00JE6AOJS| 798450889| 2015-08-31|\n", 366 | "|R25IK0UAHWNB22| 34112894|B00COL0B7A| 777554234| 2015-08-31|\n", 367 | "|R2A9IHKZMTMAL1| 20691979|B00QHLSKOE| 885228855| 2015-08-31|\n", 368 | "| R5XVKTHL6SITI| 25983343|B00UNL2MUW| 164482798| 2015-08-31|\n", 369 | "|R2QZZOSTDDY1IE| 35816068|B00RIC9JB4| 184834831| 2015-08-31|\n", 370 | "|R38CUDCFPSNYTD| 10628020|B00HPMCO6O| 444378461| 2015-08-31|\n", 371 | "| RM6ZR6NH052YH| 9059625|B004QGXWS6| 770226547| 2015-08-31|\n", 372 | "| RUQK5N4WH8UN8| 2681147|B00FO12XY6| 448806082| 2015-08-31|\n", 373 | "|R21LWSBQWWJYZ3| 33449922|B00BD7UVO4| 374427271| 2015-08-31|\n", 374 | "| R8W5S53RQ2DF7| 43069144|B00TRQPEYK| 614207013| 2015-08-31|\n", 375 | "|R3ENME3JQBWXZS| 46780686|B005STXQG8| 689442799| 2015-08-31|\n", 376 | "|R3URL5K7DHHYK7| 49037595|B00BEL11RA| 910670994| 2015-08-31|\n", 377 | "|R22YISZKS35YZX| 27868511|B00QHLSKOE| 885228855| 2015-08-31|\n", 378 | "|R3IIOLWHWC297U| 3004043|B00MWCJ8VQ| 946374680| 2015-08-31|\n", 379 | "+--------------+-----------+----------+--------------+-----------+\n", 380 | "only showing top 20 rows\n", 381 | "\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "# Create the review_id_table DataFrame. \n", 387 | "# Convert the 'review_date' column to a date datatype with to_date(\"review_date\", 'yyyy-MM-dd').alias(\"review_date\")\n", 388 | "review_id_df = df.select([\"review_id\", \"customer_id\", \"product_id\", \"product_parent\", to_date(\"review_date\", 'yyyy-MM-dd').alias(\"review_date\")])\n", 389 | "review_id_df.show()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 18, 395 | "metadata": { 396 | "colab": { 397 | "base_uri": "https://localhost:8080/" 398 | }, 399 | "id": "lzMmkdKmY--D", 400 | "outputId": "573c6df0-c4e3-45e7-b53e-375a5c717d3b" 401 | }, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 408 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 409 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 410 | "| RY01SAV7HZ8QO| 4| 0| 0| N| Y|\n", 411 | "|R1XX8SDGJ4MZ4L| 5| 0| 0| N| N|\n", 412 | "|R149Q3B5L33NN5| 5| 0| 0| N| Y|\n", 413 | "|R2ZVD69Z6KPJ4O| 1| 0| 2| N| Y|\n", 414 | "|R1DIKG2G33ZLNP| 4| 0| 0| N| Y|\n", 415 | "|R3L6FGKAW0EYFI| 1| 1| 1| N| N|\n", 416 | "| RAO0QZH5VC6VI| 1| 0| 0| N| Y|\n", 417 | "|R25IK0UAHWNB22| 3| 0| 0| N| Y|\n", 418 | "|R2A9IHKZMTMAL1| 5| 1| 2| N| Y|\n", 419 | "| R5XVKTHL6SITI| 5| 0| 0| N| Y|\n", 420 | "|R2QZZOSTDDY1IE| 3| 3| 6| N| Y|\n", 421 | "|R38CUDCFPSNYTD| 5| 0| 0| N| Y|\n", 422 | "| RM6ZR6NH052YH| 3| 1| 2| N| Y|\n", 423 | "| RUQK5N4WH8UN8| 5| 0| 0| N| Y|\n", 424 | "|R21LWSBQWWJYZ3| 5| 0| 0| N| Y|\n", 425 | "| R8W5S53RQ2DF7| 5| 0| 0| N| Y|\n", 426 | "|R3ENME3JQBWXZS| 5| 0| 1| N| Y|\n", 427 | "|R3URL5K7DHHYK7| 5| 1| 1| N| Y|\n", 428 | "|R22YISZKS35YZX| 5| 2| 3| N| Y|\n", 429 | "|R3IIOLWHWC297U| 5| 0| 0| N| Y|\n", 430 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 431 | "only showing top 20 rows\n", 432 | "\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "# Create the vine_table. DataFrame\n", 438 | "vine_df = df.select([\"review_id\", \"star_rating\", \"helpful_votes\", \"total_votes\", \"vine\", \"verified_purchase\"])\n", 439 | "vine_df.show()" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "id": "jITZhLkmY--J" 446 | }, 447 | "source": [ 448 | "### Connect to the AWS RDS instance and write each DataFrame to its table. " 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 42, 454 | "metadata": { 455 | "id": "7jiUvs1aY--L" 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "# Configure settings for RDS\n", 460 | "mode = \"append\"\n", 461 | "jdbc_url=\"jdbc:postgresql://amazon-vine-reviews.cp2gjvsddsgd.us-east-1.rds.amazonaws.com:5432/postgres\"\n", 462 | "config = {\"user\":\"postgres\", \n", 463 | " \"password\": 'null', \n", 464 | " \"driver\":\"org.postgresql.Driver\"}" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 38, 470 | "metadata": { 471 | "id": "T2zgZ-aKY--Q" 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "# Write review_id_df to table in RDS\n", 476 | "review_id_df.write.jdbc(url=jdbc_url, table='review_id_table', mode=mode, properties=config)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 39, 482 | "metadata": { 483 | "id": "1m3yzn-LY--U" 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "# Write products_df to table in RDS\n", 488 | "# about 3 min\n", 489 | "products_df.write.jdbc(url=jdbc_url, table='products_table', mode=mode, properties=config)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 40, 495 | "metadata": { 496 | "id": "KbXri15fY--Z" 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "# Write customers_df to table in RDS\n", 501 | "# 5 min 14 s\n", 502 | "customers_df.write.jdbc(url=jdbc_url, table='customers_table', mode=mode, properties=config)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 41, 508 | "metadata": { 509 | "id": "XdQknSHLY--e" 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "# Write vine_df to table in RDS\n", 514 | "# 11 minutes\n", 515 | "vine_df.write.jdbc(url=jdbc_url, table='vine_table', mode=mode, properties=config)" 516 | ] 517 | } 518 | ], 519 | "metadata": { 520 | "colab": { 521 | "collapsed_sections": [], 522 | "name": "Amazon_Reviews_ETL.ipynb", 523 | "provenance": [] 524 | }, 525 | "kernelspec": { 526 | "display_name": "Python 3 (ipykernel)", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.10.9" 541 | }, 542 | "nteract": { 543 | "version": "0.12.3" 544 | }, 545 | "vscode": { 546 | "interpreter": { 547 | "hash": "70ffbe79564a77748a18f4c2688eb5ce7975141f398702b634ac4a38c8d62701" 548 | } 549 | } 550 | }, 551 | "nbformat": 4, 552 | "nbformat_minor": 1 553 | } 554 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Marketing Analysis with Big Data 2 | 3 |
10 | Goals • 11 | Dataset • 12 | Tools Used • 13 | Analysis and Challenges • 14 | Results • 15 | Summary 16 |
17 | 18 | #