├── .gitignore ├── Amazon_Reviews_ETL.ipynb ├── README.md ├── Vine_Review_Analysis.ipynb ├── images ├── cloud_etl.png ├── customer_table.png ├── products_table.png ├── review_data.png ├── review_id_table.png ├── unpaid_reviews.png ├── vine_df.png └── vine_reviews.png └── table_schema.sql /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Amazon_Reviews_ETL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/" 9 | }, 10 | "id": "V58rxea0HqSa", 11 | "outputId": "40bc4b96-d39c-460a-f1d0-57ca01e340eb" 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "\r", 19 | "0% [Working]\r", 20 | " \r", 21 | "Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]\n", 22 | "Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", 23 | "Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease\n", 24 | "Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n", 25 | "Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", 26 | "Hit:6 
https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", 27 | "Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n", 28 | "Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", 29 | "Get:9 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n", 30 | "Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", 31 | "Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", 32 | "Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", 33 | "Get:14 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,861 kB]\n", 34 | "Get:15 http://archive.ubuntu.com/ubuntu bionic-updates/multiverse amd64 Packages [29.8 kB]\n", 35 | "Get:16 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,297 kB]\n", 36 | "Get:17 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [22.8 kB]\n", 37 | "Get:18 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [1,006 kB]\n", 38 | "Get:19 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,521 kB]\n", 39 | "Get:20 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,294 kB]\n", 40 | "Get:21 http://archive.ubuntu.com/ubuntu bionic-updates/restricted amd64 Packages [1,040 kB]\n", 41 | "Get:22 http://archive.ubuntu.com/ubuntu bionic-backports/main amd64 Packages [12.2 kB]\n", 42 | "Fetched 12.3 MB in 3s (4,000 kB/s)\n", 43 | "Reading package lists... 
Done\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "import os\n", 49 | "# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version\n", 50 | "# For example:\n", 51 | "# spark_version = 'spark-3.0.3'\n", 52 | "spark_version = 'spark-3.0.3'\n", 53 | "os.environ['SPARK_VERSION']=spark_version\n", 54 | "\n", 55 | "# Install Spark and Java\n", 56 | "!apt-get update\n", 57 | "!apt-get install openjdk-11-jdk-headless -qq > /dev/null\n", 58 | "!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz\n", 59 | "!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz\n", 60 | "!pip install -q findspark\n", 61 | "\n", 62 | "# Set Environment Variables\n", 63 | "import os\n", 64 | "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n", 65 | "os.environ[\"SPARK_HOME\"] = f\"/content/{spark_version}-bin-hadoop2.7\"\n", 66 | "\n", 67 | "# Start a SparkSession\n", 68 | "import findspark\n", 69 | "findspark.init()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": { 76 | "colab": { 77 | "base_uri": "https://localhost:8080/" 78 | }, 79 | "id": "_xKwTpATHqSe", 80 | "outputId": "c231e860-8ba8-4ebd-c401-7c2cf8a9f2aa" 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "--2022-06-18 00:11:50-- https://jdbc.postgresql.org/download/postgresql-42.2.16.jar\n", 88 | "Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228\n", 89 | "Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.\n", 90 | "HTTP request sent, awaiting response... 200 OK\n", 91 | "Length: 1002883 (979K) [application/java-archive]\n", 92 | "Saving to: ‘postgresql-42.2.16.jar’\n", 93 | "\n", 94 | "postgresql-42.2.16. 
100%[===================>] 979.38K 5.02MB/s in 0.2s \n", 95 | "\n", 96 | "2022-06-18 00:11:51 (5.02 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]\n", 97 | "\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "# Download the Postgres driver that will allow Spark to interact with Postgres.\n", 103 | "!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 4, 109 | "metadata": { 110 | "id": "MMqDAjVS0KN9" 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "from pyspark.sql import SparkSession\n", 115 | "spark = SparkSession.builder.appName(\"M16-Amazon-Challenge\").config(\"spark.driver.extraClassPath\",\"/content/postgresql-42.2.16.jar\").getOrCreate()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "id": "cyBsySGuY-9V" 122 | }, 123 | "source": [ 124 | "### Load Amazon Data into Spark DataFrame" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "colab": { 132 | "base_uri": "https://localhost:8080/" 133 | }, 134 | "id": "CtCmBhQJY-9Z", 135 | "outputId": "e9a4a16e-6c80-478e-acd4-e48e9b65d99e" 136 | }, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 143 | "|marketplace|customer_id| review_id|product_id|product_parent| product_title| product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase| review_headline| review_body|review_date|\n", 144 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 145 | "| US| 179886| 
RY01SAV7HZ8QO|B00NTI0CQ2| 667358431|Aketek 1080P LED ...|Home Entertainment| 4| 0| 0| N| Y|good enough for m...|not the best pict...| 2015-08-31|\n", 146 | "| US| 37293769|R1XX8SDGJ4MZ4L|B00BUCLVZU| 621695622|TiVo Mini with IR...|Home Entertainment| 5| 0| 0| N| N|Tell the Cable Co...|Not only do my Ti...| 2015-08-31|\n", 147 | "| US| 8332121|R149Q3B5L33NN5|B00RBX9D5W| 143071132|Apple TV MD199LL/...|Home Entertainment| 5| 0| 0| N| Y| Works perfectly!|Works perfectly! ...| 2015-08-31|\n", 148 | "| US| 47054962|R2ZVD69Z6KPJ4O|B00UJ3IULO| 567816707|New Roku 3 6.5 Fo...|Home Entertainment| 1| 0| 2| N| Y|It doesn't work. ...|It doesn't work. ...| 2015-08-31|\n", 149 | "| US| 23413911|R1DIKG2G33ZLNP|B0037UCTXG| 909557698|Generic DVI-I Dua...|Home Entertainment| 4| 0| 0| N| Y| As pictured|I received the it...| 2015-08-31|\n", 150 | "| US| 4417771|R3L6FGKAW0EYFI|B004N866SU| 414565179|Samsung 3D LED HD...|Home Entertainment| 1| 1| 1| N| N|Only lasts 3-4 ye...|I bought this TV ...| 2015-08-31|\n", 151 | "| US| 47900707| RAO0QZH5VC6VI|B00JE6AOJS| 798450889|Jiake Wireless Bl...|Home Entertainment| 1| 0| 0| N| Y| One Star| Waste of $$$$$$$| 2015-08-31|\n", 152 | "| US| 34112894|R25IK0UAHWNB22|B00COL0B7A| 777554234|3pcs/lot 3in1 3ft...|Home Entertainment| 3| 0| 0| N| Y| Three Stars|Nice but all thre...| 2015-08-31|\n", 153 | "| US| 20691979|R2A9IHKZMTMAL1|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 1| 2| N| Y|Yes...exactly wha...|Oh, yeah...doesn'...| 2015-08-31|\n", 154 | "| US| 25983343| R5XVKTHL6SITI|B00UNL2MUW| 164482798|VIZIO S2920W-C0B ...|Home Entertainment| 5| 0| 0| N| Y|Fantastic sound. ...|Fantastic sound. 
...| 2015-08-31|\n", 155 | "| US| 35816068|R2QZZOSTDDY1IE|B00RIC9JB4| 184834831|Hitachi 49\" Class...|Home Entertainment| 3| 3| 6| N| Y| risky|Great tv but Tv d...| 2015-08-31|\n", 156 | "| US| 10628020|R38CUDCFPSNYTD|B00HPMCO6O| 444378461|Sony BDPS5200 3D ...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|EXCELLENT SERVICE...| 2015-08-31|\n", 157 | "| US| 9059625| RM6ZR6NH052YH|B004QGXWS6| 770226547|Sylvania 7-Inch T...|Home Entertainment| 3| 1| 2| N| Y|They worked great...|They worked great...| 2015-08-31|\n", 158 | "| US| 2681147| RUQK5N4WH8UN8|B00FO12XY6| 448806082|Roku HD Streaming...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|Smooth and worked...| 2015-08-31|\n", 159 | "| US| 33449922|R21LWSBQWWJYZ3|B00BD7UVO4| 374427271|LG Electronics BP...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| Works very well| 2015-08-31|\n", 160 | "| US| 43069144| R8W5S53RQ2DF7|B00TRQPEYK| 614207013|LG Electronics LF...|Home Entertainment| 5| 0| 0| N| Y| LG TV|The TV arrived qu...| 2015-08-31|\n", 161 | "| US| 46780686|R3ENME3JQBWXZS|B005STXQG8| 689442799|SquareTrade TV Pr...|Home Entertainment| 5| 0| 1| N| Y| Five Stars|well worth the mo...| 2015-08-31|\n", 162 | "| US| 49037595|R3URL5K7DHHYK7|B00BEL11RA| 910670994|Cambridge - Azur ...|Home Entertainment| 5| 1| 1| N| Y| Love it.|Great Blu-ray pla...| 2015-08-31|\n", 163 | "| US| 27868511|R22YISZKS35YZX|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 2| 3| N| Y|If your on the fe...|Great media cente...| 2015-08-31|\n", 164 | "| US| 3004043|R3IIOLWHWC297U|B00MWCJ8VQ| 946374680|Roku 3500XB Strea...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| great product| 2015-08-31|\n", 165 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 166 | "only showing top 20 rows\n", 167 | "\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from pyspark 
import SparkFiles\n", 173 | "url = \"https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz\"\n", 174 | "spark.sparkContext.addFile(url)\n", 175 | "df = spark.read.option(\"encoding\", \"UTF-8\").csv(SparkFiles.get(\"\"), sep=\"\\t\", header=True, inferSchema=True)\n", 176 | "df.show()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "id": "2yUSe55VY-9t" 183 | }, 184 | "source": [ 185 | "### Create DataFrames to match tables" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": { 192 | "colab": { 193 | "base_uri": "https://localhost:8080/" 194 | }, 195 | "id": "C8REmY1aY-9u", 196 | "outputId": "1d0f30c0-eedc-405c-b9e2-5585a7185c09" 197 | }, 198 | "outputs": [ 199 | { 200 | "name": "stdout", 201 | "output_type": "stream", 202 | "text": [ 203 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 204 | "|marketplace|customer_id| review_id|product_id|product_parent| product_title| product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase| review_headline| review_body|review_date|\n", 205 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 206 | "| US| 179886| RY01SAV7HZ8QO|B00NTI0CQ2| 667358431|Aketek 1080P LED ...|Home Entertainment| 4| 0| 0| N| Y|good enough for m...|not the best pict...| 2015-08-31|\n", 207 | "| US| 37293769|R1XX8SDGJ4MZ4L|B00BUCLVZU| 621695622|TiVo Mini with IR...|Home Entertainment| 5| 0| 0| N| N|Tell the Cable Co...|Not only do my Ti...| 2015-08-31|\n", 208 | "| US| 8332121|R149Q3B5L33NN5|B00RBX9D5W| 143071132|Apple TV MD199LL/...|Home Entertainment| 5| 0| 0| N| 
Y| Works perfectly!|Works perfectly! ...| 2015-08-31|\n", 209 | "| US| 47054962|R2ZVD69Z6KPJ4O|B00UJ3IULO| 567816707|New Roku 3 6.5 Fo...|Home Entertainment| 1| 0| 2| N| Y|It doesn't work. ...|It doesn't work. ...| 2015-08-31|\n", 210 | "| US| 23413911|R1DIKG2G33ZLNP|B0037UCTXG| 909557698|Generic DVI-I Dua...|Home Entertainment| 4| 0| 0| N| Y| As pictured|I received the it...| 2015-08-31|\n", 211 | "| US| 4417771|R3L6FGKAW0EYFI|B004N866SU| 414565179|Samsung 3D LED HD...|Home Entertainment| 1| 1| 1| N| N|Only lasts 3-4 ye...|I bought this TV ...| 2015-08-31|\n", 212 | "| US| 47900707| RAO0QZH5VC6VI|B00JE6AOJS| 798450889|Jiake Wireless Bl...|Home Entertainment| 1| 0| 0| N| Y| One Star| Waste of $$$$$$$| 2015-08-31|\n", 213 | "| US| 34112894|R25IK0UAHWNB22|B00COL0B7A| 777554234|3pcs/lot 3in1 3ft...|Home Entertainment| 3| 0| 0| N| Y| Three Stars|Nice but all thre...| 2015-08-31|\n", 214 | "| US| 20691979|R2A9IHKZMTMAL1|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 1| 2| N| Y|Yes...exactly wha...|Oh, yeah...doesn'...| 2015-08-31|\n", 215 | "| US| 25983343| R5XVKTHL6SITI|B00UNL2MUW| 164482798|VIZIO S2920W-C0B ...|Home Entertainment| 5| 0| 0| N| Y|Fantastic sound. ...|Fantastic sound. 
...| 2015-08-31|\n", 216 | "| US| 35816068|R2QZZOSTDDY1IE|B00RIC9JB4| 184834831|Hitachi 49\" Class...|Home Entertainment| 3| 3| 6| N| Y| risky|Great tv but Tv d...| 2015-08-31|\n", 217 | "| US| 10628020|R38CUDCFPSNYTD|B00HPMCO6O| 444378461|Sony BDPS5200 3D ...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|EXCELLENT SERVICE...| 2015-08-31|\n", 218 | "| US| 9059625| RM6ZR6NH052YH|B004QGXWS6| 770226547|Sylvania 7-Inch T...|Home Entertainment| 3| 1| 2| N| Y|They worked great...|They worked great...| 2015-08-31|\n", 219 | "| US| 2681147| RUQK5N4WH8UN8|B00FO12XY6| 448806082|Roku HD Streaming...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|Smooth and worked...| 2015-08-31|\n", 220 | "| US| 33449922|R21LWSBQWWJYZ3|B00BD7UVO4| 374427271|LG Electronics BP...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| Works very well| 2015-08-31|\n", 221 | "| US| 43069144| R8W5S53RQ2DF7|B00TRQPEYK| 614207013|LG Electronics LF...|Home Entertainment| 5| 0| 0| N| Y| LG TV|The TV arrived qu...| 2015-08-31|\n", 222 | "| US| 46780686|R3ENME3JQBWXZS|B005STXQG8| 689442799|SquareTrade TV Pr...|Home Entertainment| 5| 0| 1| N| Y| Five Stars|well worth the mo...| 2015-08-31|\n", 223 | "| US| 49037595|R3URL5K7DHHYK7|B00BEL11RA| 910670994|Cambridge - Azur ...|Home Entertainment| 5| 1| 1| N| Y| Love it.|Great Blu-ray pla...| 2015-08-31|\n", 224 | "| US| 27868511|R22YISZKS35YZX|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 2| 3| N| Y|If your on the fe...|Great media cente...| 2015-08-31|\n", 225 | "| US| 3004043|R3IIOLWHWC297U|B00MWCJ8VQ| 946374680|Roku 3500XB Strea...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| great product| 2015-08-31|\n", 226 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 227 | "only showing top 20 rows\n", 228 | "\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "from 
pyspark.sql.functions import to_date\n", 234 | "\n", 235 | "# Read in the Review dataset as a DataFrame\n", 236 | "review_data = df\n", 237 | "\n", 238 | "review_data.show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 11, 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "B0TESUDRY-90", 249 | "outputId": "c671bf5e-a5eb-41b3-812a-e17119e4f1bf" 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "+-----------+--------------+\n", 257 | "|customer_id|customer_count|\n", 258 | "+-----------+--------------+\n", 259 | "| 10142992| 1|\n", 260 | "| 16457323| 6|\n", 261 | "| 11935383| 1|\n", 262 | "| 46277736| 1|\n", 263 | "| 13671072| 1|\n", 264 | "| 21453814| 1|\n", 265 | "| 17684885| 1|\n", 266 | "| 20415768| 1|\n", 267 | "| 15212710| 1|\n", 268 | "| 5220924| 1|\n", 269 | "| 46253451| 6|\n", 270 | "| 971908| 1|\n", 271 | "| 32829933| 1|\n", 272 | "| 51221518| 1|\n", 273 | "| 12002637| 2|\n", 274 | "| 16105308| 1|\n", 275 | "| 135867| 1|\n", 276 | "| 47425808| 1|\n", 277 | "| 43138273| 1|\n", 278 | "| 16411995| 1|\n", 279 | "+-----------+--------------+\n", 280 | "only showing top 20 rows\n", 281 | "\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "# Create the customers_table DataFrame\n", 287 | "customers_df = df.groupby(\"customer_id\").agg({\"customer_id\":\"count\"}).withColumnRenamed(\"count(customer_id)\", \"customer_count\")\n", 288 | "customers_df.show()" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 13, 294 | "metadata": { 295 | "colab": { 296 | "base_uri": "https://localhost:8080/" 297 | }, 298 | "id": "4FwXA6UvY-96", 299 | "outputId": "4fe66c7a-4c34-4d45-a927-f7aa7dda3269" 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "+----------+--------------------+\n", 307 | "|product_id| product_title|\n", 308 | 
"+----------+--------------------+\n", 309 | "|B00N9OT6RM|Upstar 19-Inch 72...|\n", 310 | "|B007R9RUPU|Kinivo LS210 Port...|\n", 311 | "|B00U9U9AAM|Samsung J6200, SB...|\n", 312 | "|B00QCLTOQM|Panasonic TC32A40...|\n", 313 | "|B007F9XJW0|Sony DVPFX780 7-I...|\n", 314 | "|B00JA7ZQOY|Minix X7mini Andr...|\n", 315 | "|B008I641TE|SquareTrade 2-Yea...|\n", 316 | "|B00EJ5UJZ8|PowerSmart 4200mA...|\n", 317 | "|B006L8TX94|TiVo Premiere 500...|\n", 318 | "|B00R8K9ZH4|Fosmon HYBO-DUOC ...|\n", 319 | "|B001JHJK22|Mediabridge - RCA...|\n", 320 | "|B00CWEJ5BW|2-Year Electronic...|\n", 321 | "|B000BSHLLW|Ziotek Purse Lock...|\n", 322 | "|B0019OJOTE|FAVI 32-Inch 1080...|\n", 323 | "|B00MOCT3NW|BenQ MH630 1.4A 1...|\n", 324 | "|B00VWV1O3S|12V 8Ah SLA Batte...|\n", 325 | "|B00BBAG0DY|LG Electronics LA...|\n", 326 | "|B00BXF7I8I|Seiki 1080p 60Hz ...|\n", 327 | "|B0011ZOZ36|Vizio VW26LHDTV20...|\n", 328 | "|B00Q8DB4YY|Atoah MXIII TV Bo...|\n", 329 | "+----------+--------------------+\n", 330 | "only showing top 20 rows\n", 331 | "\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "# Create the products_table DataFrame and drop duplicates. 
\n", 337 | "products_df = df.select([\"product_id\", \"product_title\"]).drop_duplicates()\n", 338 | "products_df.show()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 16, 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "MkqyCuNQY-9-", 349 | "outputId": "0dc15924-7f79-4e57-b9ec-1da784e2071c" 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "+--------------+-----------+----------+--------------+-----------+\n", 357 | "| review_id|customer_id|product_id|product_parent|review_date|\n", 358 | "+--------------+-----------+----------+--------------+-----------+\n", 359 | "| RY01SAV7HZ8QO| 179886|B00NTI0CQ2| 667358431| 2015-08-31|\n", 360 | "|R1XX8SDGJ4MZ4L| 37293769|B00BUCLVZU| 621695622| 2015-08-31|\n", 361 | "|R149Q3B5L33NN5| 8332121|B00RBX9D5W| 143071132| 2015-08-31|\n", 362 | "|R2ZVD69Z6KPJ4O| 47054962|B00UJ3IULO| 567816707| 2015-08-31|\n", 363 | "|R1DIKG2G33ZLNP| 23413911|B0037UCTXG| 909557698| 2015-08-31|\n", 364 | "|R3L6FGKAW0EYFI| 4417771|B004N866SU| 414565179| 2015-08-31|\n", 365 | "| RAO0QZH5VC6VI| 47900707|B00JE6AOJS| 798450889| 2015-08-31|\n", 366 | "|R25IK0UAHWNB22| 34112894|B00COL0B7A| 777554234| 2015-08-31|\n", 367 | "|R2A9IHKZMTMAL1| 20691979|B00QHLSKOE| 885228855| 2015-08-31|\n", 368 | "| R5XVKTHL6SITI| 25983343|B00UNL2MUW| 164482798| 2015-08-31|\n", 369 | "|R2QZZOSTDDY1IE| 35816068|B00RIC9JB4| 184834831| 2015-08-31|\n", 370 | "|R38CUDCFPSNYTD| 10628020|B00HPMCO6O| 444378461| 2015-08-31|\n", 371 | "| RM6ZR6NH052YH| 9059625|B004QGXWS6| 770226547| 2015-08-31|\n", 372 | "| RUQK5N4WH8UN8| 2681147|B00FO12XY6| 448806082| 2015-08-31|\n", 373 | "|R21LWSBQWWJYZ3| 33449922|B00BD7UVO4| 374427271| 2015-08-31|\n", 374 | "| R8W5S53RQ2DF7| 43069144|B00TRQPEYK| 614207013| 2015-08-31|\n", 375 | "|R3ENME3JQBWXZS| 46780686|B005STXQG8| 689442799| 2015-08-31|\n", 376 | "|R3URL5K7DHHYK7| 49037595|B00BEL11RA| 910670994| 
2015-08-31|\n", 377 | "|R22YISZKS35YZX| 27868511|B00QHLSKOE| 885228855| 2015-08-31|\n", 378 | "|R3IIOLWHWC297U| 3004043|B00MWCJ8VQ| 946374680| 2015-08-31|\n", 379 | "+--------------+-----------+----------+--------------+-----------+\n", 380 | "only showing top 20 rows\n", 381 | "\n" 382 | ] 383 | } 384 | ], 385 | "source": [ 386 | "# Create the review_id_table DataFrame. \n", 387 | "# Convert the 'review_date' column to a date datatype with to_date(\"review_date\", 'yyyy-MM-dd').alias(\"review_date\")\n", 388 | "review_id_df = df.select([\"review_id\", \"customer_id\", \"product_id\", \"product_parent\", to_date(\"review_date\", 'yyyy-MM-dd').alias(\"review_date\")])\n", 389 | "review_id_df.show()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 18, 395 | "metadata": { 396 | "colab": { 397 | "base_uri": "https://localhost:8080/" 398 | }, 399 | "id": "lzMmkdKmY--D", 400 | "outputId": "573c6df0-c4e3-45e7-b53e-375a5c717d3b" 401 | }, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 408 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 409 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 410 | "| RY01SAV7HZ8QO| 4| 0| 0| N| Y|\n", 411 | "|R1XX8SDGJ4MZ4L| 5| 0| 0| N| N|\n", 412 | "|R149Q3B5L33NN5| 5| 0| 0| N| Y|\n", 413 | "|R2ZVD69Z6KPJ4O| 1| 0| 2| N| Y|\n", 414 | "|R1DIKG2G33ZLNP| 4| 0| 0| N| Y|\n", 415 | "|R3L6FGKAW0EYFI| 1| 1| 1| N| N|\n", 416 | "| RAO0QZH5VC6VI| 1| 0| 0| N| Y|\n", 417 | "|R25IK0UAHWNB22| 3| 0| 0| N| Y|\n", 418 | "|R2A9IHKZMTMAL1| 5| 1| 2| N| Y|\n", 419 | "| R5XVKTHL6SITI| 5| 0| 0| N| Y|\n", 420 | "|R2QZZOSTDDY1IE| 3| 3| 6| N| Y|\n", 421 | "|R38CUDCFPSNYTD| 5| 0| 0| N| Y|\n", 422 | "| RM6ZR6NH052YH| 3| 1| 2| N| Y|\n", 423 | "| RUQK5N4WH8UN8| 5| 0| 0| N| Y|\n", 424 | "|R21LWSBQWWJYZ3| 5| 0| 0| N| Y|\n", 425 | "| 
R8W5S53RQ2DF7| 5| 0| 0| N| Y|\n", 426 | "|R3ENME3JQBWXZS| 5| 0| 1| N| Y|\n", 427 | "|R3URL5K7DHHYK7| 5| 1| 1| N| Y|\n", 428 | "|R22YISZKS35YZX| 5| 2| 3| N| Y|\n", 429 | "|R3IIOLWHWC297U| 5| 0| 0| N| Y|\n", 430 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 431 | "only showing top 20 rows\n", 432 | "\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "# Create the vine_table. DataFrame\n", 438 | "vine_df = df.select([\"review_id\", \"star_rating\", \"helpful_votes\", \"total_votes\", \"vine\", \"verified_purchase\"])\n", 439 | "vine_df.show()" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "id": "jITZhLkmY--J" 446 | }, 447 | "source": [ 448 | "### Connect to the AWS RDS instance and write each DataFrame to its table. " 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 42, 454 | "metadata": { 455 | "id": "7jiUvs1aY--L" 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "# Configure settings for RDS\n", 460 | "mode = \"append\"\n", 461 | "jdbc_url=\"jdbc:postgresql://amazon-vine-reviews.cp2gjvsddsgd.us-east-1.rds.amazonaws.com:5432/postgres\"\n", 462 | "config = {\"user\":\"postgres\", \n", 463 | " \"password\": 'null', \n", 464 | " \"driver\":\"org.postgresql.Driver\"}" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 38, 470 | "metadata": { 471 | "id": "T2zgZ-aKY--Q" 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "# Write review_id_df to table in RDS\n", 476 | "review_id_df.write.jdbc(url=jdbc_url, table='review_id_table', mode=mode, properties=config)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 39, 482 | "metadata": { 483 | "id": "1m3yzn-LY--U" 484 | }, 485 | "outputs": [], 486 | "source": [ 487 | "# Write products_df to table in RDS\n", 488 | "# about 3 min\n", 489 | "products_df.write.jdbc(url=jdbc_url, table='products_table', mode=mode, properties=config)" 490 | ] 491 | }, 492 | { 493 | 
"cell_type": "code", 494 | "execution_count": 40, 495 | "metadata": { 496 | "id": "KbXri15fY--Z" 497 | }, 498 | "outputs": [], 499 | "source": [ 500 | "# Write customers_df to table in RDS\n", 501 | "# 5 min 14 s\n", 502 | "customers_df.write.jdbc(url=jdbc_url, table='customers_table', mode=mode, properties=config)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 41, 508 | "metadata": { 509 | "id": "XdQknSHLY--e" 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "# Write vine_df to table in RDS\n", 514 | "# 11 minutes\n", 515 | "vine_df.write.jdbc(url=jdbc_url, table='vine_table', mode=mode, properties=config)" 516 | ] 517 | } 518 | ], 519 | "metadata": { 520 | "colab": { 521 | "collapsed_sections": [], 522 | "name": "Amazon_Reviews_ETL.ipynb", 523 | "provenance": [] 524 | }, 525 | "kernelspec": { 526 | "display_name": "Python 3 (ipykernel)", 527 | "language": "python", 528 | "name": "python3" 529 | }, 530 | "language_info": { 531 | "codemirror_mode": { 532 | "name": "ipython", 533 | "version": 3 534 | }, 535 | "file_extension": ".py", 536 | "mimetype": "text/x-python", 537 | "name": "python", 538 | "nbconvert_exporter": "python", 539 | "pygments_lexer": "ipython3", 540 | "version": "3.10.9" 541 | }, 542 | "nteract": { 543 | "version": "0.12.3" 544 | }, 545 | "vscode": { 546 | "interpreter": { 547 | "hash": "70ffbe79564a77748a18f4c2688eb5ce7975141f398702b634ac4a38c8d62701" 548 | } 549 | } 550 | }, 551 | "nbformat": 4, 552 | "nbformat_minor": 1 553 | } 554 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Marketing Analysis with Big Data 2 | 3 |
4 | 5 |
6 | 7 | ##
Build Data Pipeline with pgAdmin, AWS Cloud and Apache Spark to Analyze and Determine Bias in Amazon Vine Reviews
8 | 9 |

10 | Goals  •  11 | Dataset  •  12 | Tools Used  •  13 | Analysis and Challenges  •  14 | Results  •  15 | Summary 16 |

17 | 18 | #
Goals
19 | 20 | Companies pay a small fee to Amazon and provide products to Amazon Vine members, who are then required to publish a review. This project will analyze Amazon reviews written by members of the paid Amazon Vine program. The Amazon Vine program is a service that allows manufacturers and publishers to receive reviews for their products. In this project, you’ll have access to approximately 50 datasets. Each one contains reviews of a specific product, from clothing apparel to wireless products. 21 | 22 | This scope will cover the TV review dataset. First I'll use PySpark to perform the ETL process to extract the dataset, transform the data, connect to an AWS RDS instance, and load the transformed data into pgAdmin. Next, I'll use PySpark to determine if there is any bias toward favorable reviews from Vine members in your dataset. 23 | 24 | #
Dataset
25 | 26 | Amazon S3 bucket containing 50 review datasets. 27 | 28 | - [Amazon Review Datasets:](https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt) I'll be analyzing a TSV file with 22,930 rows of TV reviews 29 | 30 | #
Tools Used
31 | - **Apache Spark:** A unified analytics engine for large-scale data processing 32 | - **Google Colab:** Cloud based developer notebooks, used for testing scripts and performing complex calculations 33 | - **Amazon Web Services:** Cloud based services that perform many functions, such as hosting and data processing 34 | - **AWS RDS:** Relational Database service used for querying data in the cloud 35 | - **AWS S3:** Cloud file storage service 36 | - **pgAdmin:** Software used to build databases and analyze data with SQL 37 | 38 | #
Analysis and Challenges
39 | 40 | After the success of the SellBy project, our group will be running an analysis of Amazon reviews written by members of the paid Amazon Vine program. I analyzed the TV review dataset and used PySpark to perform the ETL process to extract the dataset, transform the data, connect to an AWS RDS instance, and load the transformed data into pgAdmin. I then used PySpark to determine if there is any bias toward favorable reviews from Vine members in the dataset. 41 | 42 | Below you will see dataframes I used to analyze the TV review data. 43 | 44 | ### Review Data 45 | ![Review Data](images/review_data.png) 46 | 47 | ### Review ID Table 48 | ![Review ID Table](images/review_id_table.png) 49 | 50 | ### Customer Table 51 | ![Customer Table](images/customer_table.png) 52 | 53 | ### Product Table 54 | ![Product Table](images/products_table.png) 55 | 56 | ### Vine Table 57 | ![Vine Table](images/vine_df.png) 58 | 59 | #
Results
60 | 61 | ![Vine Reviews](images/vine_reviews.png) 62 | 63 | ### Unpaid Reviews 64 | ![Unpaid Reviews](images/unpaid_reviews.png) 65 | 66 | - In Total there were 255 Vine reviews and 22,675 unpaid reviews 67 | - Of the 255 Vine reviews, 103 were 5 star reviews (40%) 68 | - Of the 22,675 unpaid reviews, 10,310 were 5 star reviews (45%) 69 | 70 | #
Summary
71 | 72 | Based on the results of my analysis comparing Vine and unpaid reviews, I did not see evidence of positivity bias within the paid reviews. A higher percentage of unpaid reviews were 5 stars. 73 | 74 | Here are some additional levels of analysis I am planning to apply to the current data set: 75 | - Compare the number of 1 star reviews between Vine and Unpaid to determine any additional patterns 76 | - Filter the Vine and Unpaid review datasets by verified purchase to add credibility to our review sample analysis 77 | 78 | [Back to top](#marketing-analysis-with-big-data) 79 | -------------------------------------------------------------------------------- /Vine_Review_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/" 9 | }, 10 | "id": "Aw3mKu1NwNn-", 11 | "outputId": "f2838f8a-608c-4a8e-d8f3-f6fa768deb63" 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "\r", 19 | "0% [Working]\r", 20 | " \r", 21 | "Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n", 22 | "\r", 23 | "0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C\r", 24 | " \r", 25 | "Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]\n", 26 | "\r", 27 | "0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Waiting f\r", 28 | "0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [Waiting for headers] [Wait\r", 29 | " \r", 30 | "Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n", 31 | "\r", 32 | "0% [1 InRelease gpgv 1,581 B] [Waiting for headers] [3 InRelease 14.2 kB/88.7 k\r", 33 | " \r", 34 | "Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", 35 | "\r", 36 | "0% [1 
InRelease gpgv 1,581 B] [Waiting for headers] [3 InRelease 14.2 kB/88.7 k\r", 37 | " \r", 38 | "Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease\n", 39 | "Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", 40 | "Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", 41 | "Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n", 42 | "Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", 43 | "Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", 44 | "Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n", 45 | "Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", 46 | "Get:14 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,861 kB]\n", 47 | "Get:15 http://archive.ubuntu.com/ubuntu bionic-updates/multiverse amd64 Packages [29.8 kB]\n", 48 | "Get:16 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,297 kB]\n", 49 | "Get:17 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,521 kB]\n", 50 | "Get:18 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [1,006 kB]\n", 51 | "Get:19 http://security.ubuntu.com/ubuntu bionic-security/multiverse amd64 Packages [22.8 kB]\n", 52 | "Get:20 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,294 kB]\n", 53 | "Get:21 http://archive.ubuntu.com/ubuntu bionic-updates/restricted amd64 Packages [1,040 kB]\n", 54 | "Get:22 http://archive.ubuntu.com/ubuntu bionic-backports/main amd64 Packages [12.2 kB]\n", 55 | "Fetched 12.3 MB in 6s (1,907 kB/s)\n", 56 | "Reading package lists... 
Done\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "import os\n", 62 | "# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version\n", 63 | "# For example:\n", 64 | "# spark_version = 'spark-3.0.3'\n", 65 | "spark_version = 'spark-3.0.3'\n", 66 | "os.environ['SPARK_VERSION']=spark_version\n", 67 | "\n", 68 | "# Install Spark and Java\n", 69 | "!apt-get update\n", 70 | "!apt-get install openjdk-11-jdk-headless -qq > /dev/null\n", 71 | "!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz\n", 72 | "!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz\n", 73 | "!pip install -q findspark\n", 74 | "\n", 75 | "# Set Environment Variables\n", 76 | "import os\n", 77 | "os.environ[\"JAVA_HOME\"] = \"/usr/lib/jvm/java-11-openjdk-amd64\"\n", 78 | "os.environ[\"SPARK_HOME\"] = f\"/content/{spark_version}-bin-hadoop2.7\"\n", 79 | "\n", 80 | "# Start a SparkSession\n", 81 | "import findspark\n", 82 | "findspark.init()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 2, 88 | "metadata": { 89 | "colab": { 90 | "base_uri": "https://localhost:8080/" 91 | }, 92 | "id": "SBs9FrnOwS_b", 93 | "outputId": "9cc41fb5-b927-45ec-9dc4-f09b2b00908f" 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "--2022-06-18 03:00:58-- https://jdbc.postgresql.org/download/postgresql-42.2.16.jar\n", 101 | "Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228\n", 102 | "Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.\n", 103 | "HTTP request sent, awaiting response... 200 OK\n", 104 | "Length: 1002883 (979K) [application/java-archive]\n", 105 | "Saving to: ‘postgresql-42.2.16.jar’\n", 106 | "\n", 107 | "postgresql-42.2.16. 
100%[===================>] 979.38K 5.59MB/s in 0.2s \n", 108 | "\n", 109 | "2022-06-18 03:00:59 (5.59 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]\n", 110 | "\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Download the Postgres driver that will allow Spark to interact with Postgres.\n", 116 | "!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 3, 122 | "metadata": { 123 | "id": "9zlnUTB9wWM_" 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "from pyspark.sql import SparkSession\n", 128 | "spark = SparkSession.builder.appName(\"M16-Amazon-Challenge\").config(\"spark.driver.extraClassPath\",\"/content/postgresql-42.2.16.jar\").getOrCreate()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 4, 134 | "metadata": { 135 | "colab": { 136 | "base_uri": "https://localhost:8080/" 137 | }, 138 | "id": "B-LqaPMgwXiH", 139 | "outputId": "f8e6e200-5c70-452d-9a1f-f1285d88d925" 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 147 | "|marketplace|customer_id| review_id|product_id|product_parent| product_title| product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase| review_headline| review_body|review_date|\n", 148 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 149 | "| US| 179886| RY01SAV7HZ8QO|B00NTI0CQ2| 667358431|Aketek 1080P LED ...|Home Entertainment| 4| 0| 0| N| Y|good enough for m...|not the best pict...| 2015-08-31|\n", 150 | "| US| 
37293769|R1XX8SDGJ4MZ4L|B00BUCLVZU| 621695622|TiVo Mini with IR...|Home Entertainment| 5| 0| 0| N| N|Tell the Cable Co...|Not only do my Ti...| 2015-08-31|\n", 151 | "| US| 8332121|R149Q3B5L33NN5|B00RBX9D5W| 143071132|Apple TV MD199LL/...|Home Entertainment| 5| 0| 0| N| Y| Works perfectly!|Works perfectly! ...| 2015-08-31|\n", 152 | "| US| 47054962|R2ZVD69Z6KPJ4O|B00UJ3IULO| 567816707|New Roku 3 6.5 Fo...|Home Entertainment| 1| 0| 2| N| Y|It doesn't work. ...|It doesn't work. ...| 2015-08-31|\n", 153 | "| US| 23413911|R1DIKG2G33ZLNP|B0037UCTXG| 909557698|Generic DVI-I Dua...|Home Entertainment| 4| 0| 0| N| Y| As pictured|I received the it...| 2015-08-31|\n", 154 | "| US| 4417771|R3L6FGKAW0EYFI|B004N866SU| 414565179|Samsung 3D LED HD...|Home Entertainment| 1| 1| 1| N| N|Only lasts 3-4 ye...|I bought this TV ...| 2015-08-31|\n", 155 | "| US| 47900707| RAO0QZH5VC6VI|B00JE6AOJS| 798450889|Jiake Wireless Bl...|Home Entertainment| 1| 0| 0| N| Y| One Star| Waste of $$$$$$$| 2015-08-31|\n", 156 | "| US| 34112894|R25IK0UAHWNB22|B00COL0B7A| 777554234|3pcs/lot 3in1 3ft...|Home Entertainment| 3| 0| 0| N| Y| Three Stars|Nice but all thre...| 2015-08-31|\n", 157 | "| US| 20691979|R2A9IHKZMTMAL1|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 1| 2| N| Y|Yes...exactly wha...|Oh, yeah...doesn'...| 2015-08-31|\n", 158 | "| US| 25983343| R5XVKTHL6SITI|B00UNL2MUW| 164482798|VIZIO S2920W-C0B ...|Home Entertainment| 5| 0| 0| N| Y|Fantastic sound. ...|Fantastic sound. 
...| 2015-08-31|\n", 159 | "| US| 35816068|R2QZZOSTDDY1IE|B00RIC9JB4| 184834831|Hitachi 49\" Class...|Home Entertainment| 3| 3| 6| N| Y| risky|Great tv but Tv d...| 2015-08-31|\n", 160 | "| US| 10628020|R38CUDCFPSNYTD|B00HPMCO6O| 444378461|Sony BDPS5200 3D ...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|EXCELLENT SERVICE...| 2015-08-31|\n", 161 | "| US| 9059625| RM6ZR6NH052YH|B004QGXWS6| 770226547|Sylvania 7-Inch T...|Home Entertainment| 3| 1| 2| N| Y|They worked great...|They worked great...| 2015-08-31|\n", 162 | "| US| 2681147| RUQK5N4WH8UN8|B00FO12XY6| 448806082|Roku HD Streaming...|Home Entertainment| 5| 0| 0| N| Y| Five Stars|Smooth and worked...| 2015-08-31|\n", 163 | "| US| 33449922|R21LWSBQWWJYZ3|B00BD7UVO4| 374427271|LG Electronics BP...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| Works very well| 2015-08-31|\n", 164 | "| US| 43069144| R8W5S53RQ2DF7|B00TRQPEYK| 614207013|LG Electronics LF...|Home Entertainment| 5| 0| 0| N| Y| LG TV|The TV arrived qu...| 2015-08-31|\n", 165 | "| US| 46780686|R3ENME3JQBWXZS|B005STXQG8| 689442799|SquareTrade TV Pr...|Home Entertainment| 5| 0| 1| N| Y| Five Stars|well worth the mo...| 2015-08-31|\n", 166 | "| US| 49037595|R3URL5K7DHHYK7|B00BEL11RA| 910670994|Cambridge - Azur ...|Home Entertainment| 5| 1| 1| N| Y| Love it.|Great Blu-ray pla...| 2015-08-31|\n", 167 | "| US| 27868511|R22YISZKS35YZX|B00QHLSKOE| 885228855|Matricom G-Box Q²...|Home Entertainment| 5| 2| 3| N| Y|If your on the fe...|Great media cente...| 2015-08-31|\n", 168 | "| US| 3004043|R3IIOLWHWC297U|B00MWCJ8VQ| 946374680|Roku 3500XB Strea...|Home Entertainment| 5| 0| 0| N| Y| Five Stars| great product| 2015-08-31|\n", 169 | "+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+\n", 170 | "only showing top 20 rows\n", 171 | "\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "from pyspark 
import SparkFiles\n", 177 | "url = \"https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Entertainment_v1_00.tsv.gz\"\n", 178 | "spark.sparkContext.addFile(url)\n", 179 | "df = spark.read.option(\"encoding\", \"UTF-8\").csv(SparkFiles.get(\"\"), sep=\"\\t\", header=True, inferSchema=True)\n", 180 | "df.show()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 8, 186 | "metadata": { 187 | "colab": { 188 | "base_uri": "https://localhost:8080/" 189 | }, 190 | "id": "6jp5lEQU1EbH", 191 | "outputId": "f9b21663-bf7b-4339-d378-5544615cc31c" 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 199 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 200 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 201 | "| RY01SAV7HZ8QO| 4| 0| 0| N| Y|\n", 202 | "|R1XX8SDGJ4MZ4L| 5| 0| 0| N| N|\n", 203 | "|R149Q3B5L33NN5| 5| 0| 0| N| Y|\n", 204 | "|R2ZVD69Z6KPJ4O| 1| 0| 2| N| Y|\n", 205 | "|R1DIKG2G33ZLNP| 4| 0| 0| N| Y|\n", 206 | "|R3L6FGKAW0EYFI| 1| 1| 1| N| N|\n", 207 | "| RAO0QZH5VC6VI| 1| 0| 0| N| Y|\n", 208 | "|R25IK0UAHWNB22| 3| 0| 0| N| Y|\n", 209 | "|R2A9IHKZMTMAL1| 5| 1| 2| N| Y|\n", 210 | "| R5XVKTHL6SITI| 5| 0| 0| N| Y|\n", 211 | "|R2QZZOSTDDY1IE| 3| 3| 6| N| Y|\n", 212 | "|R38CUDCFPSNYTD| 5| 0| 0| N| Y|\n", 213 | "| RM6ZR6NH052YH| 3| 1| 2| N| Y|\n", 214 | "| RUQK5N4WH8UN8| 5| 0| 0| N| Y|\n", 215 | "|R21LWSBQWWJYZ3| 5| 0| 0| N| Y|\n", 216 | "| R8W5S53RQ2DF7| 5| 0| 0| N| Y|\n", 217 | "|R3ENME3JQBWXZS| 5| 0| 1| N| Y|\n", 218 | "|R3URL5K7DHHYK7| 5| 1| 1| N| Y|\n", 219 | "|R22YISZKS35YZX| 5| 2| 3| N| Y|\n", 220 | "|R3IIOLWHWC297U| 5| 0| 0| N| Y|\n", 221 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 222 | "only showing top 20 rows\n", 223 | "\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 
| "# Create the vine_table DataFrame\n", 229 | "vine_df = df.select([\"review_id\", \"star_rating\", \"helpful_votes\", \"total_votes\", \"vine\", \"verified_purchase\"])\n", 230 | "vine_df.show()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 10, 236 | "metadata": { 237 | "colab": { 238 | "base_uri": "https://localhost:8080/" 239 | }, 240 | "id": "cJY6BReOwbZ6", 241 | "outputId": "a6db6722-f50a-46d2-fa1b-b76554e75026" 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 249 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 250 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 251 | "|R1VCSDR38XTALJ| 1| 33| 40| N| N|\n", 252 | "|R1KMC6B2JRCZ3D| 3| 42| 47| N| N|\n", 253 | "|R1LV778IDGRJQ6| 1| 4| 23| N| Y|\n", 254 | "| RBZE8H6SHVYWW| 3| 77| 110| N| N|\n", 255 | "|R2LBEAGVJA7HK1| 3| 5| 48| N| N|\n", 256 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 257 | "only showing top 5 rows\n", 258 | "\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "# Filter the data and create a new DataFrame or table to retrieve all the rows where the total_votes count is greater than 20 (the filter below is strict, so reviews with exactly 20 votes are excluded) to pick reviews that are more likely to be helpful and to avoid having division by zero errors later on.\n", 264 | "\n", 265 | "relevant_reviews = vine_df.filter(\"total_votes>20\")\n", 266 | "relevant_reviews.show(5)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 13, 272 | "metadata": { 273 | "colab": { 274 | "base_uri": "https://localhost:8080/" 275 | }, 276 | "id": "knvsFQ1h0bbb", 277 | "outputId": "7300c1b9-02cf-4873-bdb1-17440c71d294" 278 | }, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | 
"+--------------+-----------+-------------+-----------+----+-----------------+\n", 285 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 286 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 287 | "|R1VCSDR38XTALJ| 1| 33| 40| N| N|\n", 288 | "|R1KMC6B2JRCZ3D| 3| 42| 47| N| N|\n", 289 | "| RBZE8H6SHVYWW| 3| 77| 110| N| N|\n", 290 | "|R3UN4H3BW9YTQA| 5| 358| 388| N| Y|\n", 291 | "| R2YKRQ8R3B2Q3| 5| 30| 35| N| N|\n", 292 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 293 | "only showing top 5 rows\n", 294 | "\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "# Filter the new DataFrame or table created in Step 1 and create a new DataFrame or table to retrieve all the rows where the number of helpful_votes divided by total_votes is equal to or greater than 50%.\n", 300 | "\n", 301 | "helpful_reviews = relevant_reviews.filter(\"(helpful_votes/total_votes)>=.5\")\n", 302 | "helpful_reviews.show(5)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 31, 308 | "metadata": { 309 | "colab": { 310 | "base_uri": "https://localhost:8080/" 311 | }, 312 | "id": "WSAeG-Ym1xrX", 313 | "outputId": "e2a17bf2-301f-45ca-933a-4e8a55713317" 314 | }, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 321 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 322 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 323 | "|R3LLICBKRHLRJC| 4| 17| 22| Y| N|\n", 324 | "| RUD135S06DEVE| 5| 41| 51| Y| N|\n", 325 | "|R2WG8UL4SGDPDH| 4| 23| 25| Y| N|\n", 326 | "|R158AOKOA28WS2| 4| 18| 28| Y| N|\n", 327 | "| RGWV3QLJQXU9X| 5| 332| 380| Y| N|\n", 328 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 329 | "only showing top 5 rows\n", 330 | 
"\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "# Filter the DataFrame or table created in Step 2, and create a new DataFrame or table that retrieves all the rows where a review was written as part of the Vine program (paid)\n", 336 | "vine_reviews = helpful_reviews.filter(\"vine=='Y'\")\n", 337 | "vine_reviews.show(5)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 29, 343 | "metadata": { 344 | "colab": { 345 | "base_uri": "https://localhost:8080/" 346 | }, 347 | "id": "v0teSN942JjE", 348 | "outputId": "5bda2d62-3b95-4136-80ba-4cb8f712da9e" 349 | }, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 356 | "| review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|\n", 357 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 358 | "|R1VCSDR38XTALJ| 1| 33| 40| N| N|\n", 359 | "|R1KMC6B2JRCZ3D| 3| 42| 47| N| N|\n", 360 | "| RBZE8H6SHVYWW| 3| 77| 110| N| N|\n", 361 | "|R3UN4H3BW9YTQA| 5| 358| 388| N| Y|\n", 362 | "| R2YKRQ8R3B2Q3| 5| 30| 35| N| N|\n", 363 | "+--------------+-----------+-------------+-----------+----+-----------------+\n", 364 | "only showing top 5 rows\n", 365 | "\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "# Repeat previous step, but this time retrieve all the rows where the review was not part of the Vine program (unpaid)\n", 371 | "unpaid_reviews = helpful_reviews.filter(\"vine=='N'\")\n", 372 | "unpaid_reviews.show(5)" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 36, 378 | "metadata": { 379 | "colab": { 380 | "base_uri": "https://localhost:8080/" 381 | }, 382 | "id": "qm9rif_m4SgT", 383 | "outputId": "7300a65f-8f88-46b2-afba-c7f90cd66aa1" 384 | }, 385 | "outputs": [ 386 | { 387 | "data": { 388 | "text/plain": [ 389 | "0.403921568627451" 390 | ] 391 | }, 392 | "execution_count": 36, 393 | 
"metadata": {}, 394 | "output_type": "execute_result" 395 | } 396 | ], 397 | "source": [ 398 | "# Determine the total number of reviews, the number of 5-star reviews, and the percentage of 5-star reviews for the two types of review (paid vs unpaid)\n", 399 | "total_vine_reviews = vine_reviews.count()\n", 400 | "fivestar_vine_reviews = vine_reviews.filter(\"star_rating==5\").count()\n", 401 | "fivestar_vine_reviews_percentage = fivestar_vine_reviews / total_vine_reviews\n", 402 | "fivestar_vine_reviews_percentage" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 37, 408 | "metadata": { 409 | "colab": { 410 | "base_uri": "https://localhost:8080/" 411 | }, 412 | "id": "Iq6maOZV5Uym", 413 | "outputId": "29761929-da59-42b6-a5a8-652a7aad8b9e" 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "0.4546857772877618" 420 | ] 421 | }, 422 | "execution_count": 37, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "# Determine the total number of reviews, the number of 5-star reviews, and the percentage of 5-star reviews for the two types of review (paid vs unpaid)\n", 429 | "total_unpaid_reviews = unpaid_reviews.count()\n", 430 | "fivestar_unpaid_reviews = unpaid_reviews.filter(\"star_rating==5\").count()\n", 431 | "fivestar_unpaid_reviews_percentage = fivestar_unpaid_reviews / total_unpaid_reviews\n", 432 | "fivestar_unpaid_reviews_percentage" 433 | ] 434 | } 435 | ], 436 | "metadata": { 437 | "colab": { 438 | "collapsed_sections": [], 439 | "name": "Vine_Review_Analysis.ipynb", 440 | "provenance": [] 441 | }, 442 | "kernelspec": { 443 | "display_name": "Python 3 (ipykernel)", 444 | "language": "python", 445 | "name": "python3" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 3 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 
| "pygments_lexer": "ipython3", 457 | "version": "3.10.9" 458 | } 459 | }, 460 | "nbformat": 4, 461 | "nbformat_minor": 1 462 | } 463 | -------------------------------------------------------------------------------- /images/cloud_etl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/cloud_etl.png -------------------------------------------------------------------------------- /images/customer_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/customer_table.png -------------------------------------------------------------------------------- /images/products_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/products_table.png -------------------------------------------------------------------------------- /images/review_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/review_data.png -------------------------------------------------------------------------------- /images/review_id_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/review_id_table.png -------------------------------------------------------------------------------- /images/unpaid_reviews.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/unpaid_reviews.png -------------------------------------------------------------------------------- /images/vine_df.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/vine_df.png -------------------------------------------------------------------------------- /images/vine_reviews.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LoveNui/Marketing_Analysis-AWS-Spark-SQL/aa71d403085a96d75c7680a2b60e9dc329b8ceaa/images/vine_reviews.png -------------------------------------------------------------------------------- /table_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE review_id_table ( -- links each review_id to its customer_id and product_id 2 | review_id TEXT PRIMARY KEY NOT NULL, 3 | customer_id INTEGER, 4 | product_id TEXT, 5 | product_parent INTEGER, 6 | review_date DATE -- expected format: yyyy-mm-dd 7 | ); 8 | 9 | -- Products table: the primary key already guarantees one row per product_id, so no separate UNIQUE constraint is declared 10 | CREATE TABLE products_table ( 11 | product_id TEXT PRIMARY KEY NOT NULL, 12 | product_title TEXT 13 | ); 14 | 15 | -- Customer table for first data set; the primary key keeps customer_id unique 16 | CREATE TABLE customers_table ( 17 | customer_id INTEGER PRIMARY KEY NOT NULL, 18 | customer_count INTEGER -- NOTE(review): presumably the number of reviews per customer; confirm against the ETL notebook 19 | ); 20 | 21 | -- Vine table: per-review star rating, vote counts, and the vine / verified_purchase flags 22 | CREATE TABLE vine_table ( 23 | review_id TEXT PRIMARY KEY, 24 | star_rating INTEGER, 25 | helpful_votes INTEGER, 26 | total_votes INTEGER, 27 | vine TEXT, 28 | verified_purchase TEXT 29 | ); --------------------------------------------------------------------------------