├── .gitignore ├── README.md ├── airline-data ├── README.md ├── airport-graph-analysis.ipynb ├── archive │ └── mini-cluster │ │ ├── README.md │ │ ├── airline-data-to-parquet.ipynb │ │ ├── average-airline-delay-hive-udf.ipynb │ │ └── average-airline-delay.ipynb └── load-airline-data.ipynb ├── apache-access-logs └── access-log-to-parquet.ipynb ├── reddit-data ├── README.md ├── download_and_convert_to_bz2.sh ├── load-reddit-comments-to-parquet.ipynb ├── load-reddit-posts-to-parquet.ipynb └── reddit-bot-commenters-bensons-law.ipynb ├── tools └── download_and_convert_to_bz2.sh └── udf-development ├── build.sbt └── src ├── main └── java │ └── net │ └── diybigdata │ └── udf │ └── FormatYearMonthString.java └── test └── java └── net └── diybigdata └── udf └── FormatYearMonthString_T.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Jupyter settings 92 | metastore_db/ 93 | 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Data Analysis Projects 2 | 3 | These are various Apache Spark data analysis projects done in Jupyter notebooks. Some of these analyses were conducted on the [ODROID XU4 mini cluster](http://diybigdata.net/odroid-xu4-cluster/), while the more recent ones are being performed on the [Personal Compute Cluster](https://diybigdata.net/personal-compute-cluster-2019-edition/). Since the XU4 mini cluster is a significantly constrained system, the projects done there are limited in scope. If you are looking to repeat some of these projects, the Personal Compute Cluster versions are more current. 4 | -------------------------------------------------------------------------------- /airline-data/README.md: -------------------------------------------------------------------------------- 1 | # Airline Activity Analysis 2 | The goal of this project is to study the data available from the Department of Transportation concerning flight activity and timeliness.
The data for this analysis is available at [the Bureau of Transportation Statistics site](http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time). 3 | 4 | The data for, and a description of, these projects can be [found here](https://diybigdata.net/airline-on-time-performance-data-analysis/). 5 | 6 | The analyses done in these notebooks use Apache Spark v2.4 or greater with the [Quantcast File System](https://github.com/quantcast/qfs) as the underlying distributed file system. The code can be easily converted to use [HDFS](https://en.wikipedia.org/wiki/Apache_Hadoop) as needed. 7 | -------------------------------------------------------------------------------- /airline-data/archive/mini-cluster/README.md: -------------------------------------------------------------------------------- 1 | # Airline Activity Analysis on the ODROID XU-4 Mini Cluster 2 | This version of the airline data analysis was designed to be conducted on a resource-constrained computer cluster, such as the [ODROID XU-4 Mini-Cluster](https://diybigdata.net/odroid-xu4-cluster/). 3 | -------------------------------------------------------------------------------- /airline-data/archive/mini-cluster/airline-data-to-parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark.sql.types import StructType, StructField\n", 12 | "from pyspark.sql.types import DoubleType, IntegerType, StringType, DecimalType, LongType\n", 13 | "\n", 14 | "air_schema = StructType([\n", 15 | " StructField(\"Year\", IntegerType()),\n", 16 | " StructField(\"Quarter\", IntegerType()),\n", 17 | " StructField(\"Month\", IntegerType()),\n", 18 | " StructField(\"DayofMonth\", IntegerType()),\n", 19 | " StructField(\"DayOfWeek\", IntegerType()),\n", 20 | " StructField(\"FlightDate\", StringType()),\n", 21 | " StructField(\"UniqueCarrier\", StringType()),\n", 22 | " StructField(\"AirlineID\", LongType()),\n", 23 | " StructField(\"Carrier\", StringType()),\n", 24 | " StructField(\"TailNum\", StringType()),\n", 25 | " StructField(\"FlightNum\", IntegerType()),\n", 26 | " StructField(\"OriginAirportID\", IntegerType()),\n", 27 | " StructField(\"OriginAirportSeqID\", IntegerType()),\n", 28 | " StructField(\"OriginCityMarketID\", IntegerType()),\n", 29 | " StructField(\"Origin\", StringType()),\n", 30 | " StructField(\"OriginCityName\", StringType()),\n", 31 | " StructField(\"OriginState\", StringType()),\n", 32 | " StructField(\"OriginStateFips\", IntegerType()),\n", 33 | " StructField(\"OriginStateName\", StringType()),\n", 34 | " StructField(\"OriginWac\", IntegerType()),\n", 35 | " StructField(\"DestAirportID\", IntegerType()),\n", 36 | " StructField(\"DestAirportSeqID\", IntegerType()),\n", 37 | " StructField(\"DestCityMarketID\", IntegerType()),\n", 38 | " StructField(\"Dest\", StringType()),\n", 39 | " StructField(\"DestCityName\", StringType()),\n", 40 | " StructField(\"DestState\", StringType()),\n", 41 | " StructField(\"DestStateFips\", IntegerType()),\n", 42 | " StructField(\"DestStateName\", StringType()),\n", 43 | " StructField(\"DestWac\", IntegerType()),\n", 44 | " StructField(\"CRSDepTime\", StringType()),\n", 45 | " StructField(\"DepTime\", StringType()),\n", 46 | " StructField(\"DepDelay\", DoubleType()),\n", 47 | " StructField(\"DepDelayMinutes\", DoubleType()),\n", 48 | " StructField(\"DepDel15\",
DoubleType()),\n", 49 | " StructField(\"DepartureDelayGroups\", IntegerType()),\n", 50 | " StructField(\"DepTimeBlk\", StringType()),\n", 51 | " StructField(\"TaxiOut\", DoubleType()),\n", 52 | " StructField(\"WheelsOff\", StringType()),\n", 53 | " StructField(\"WheelsOn\", StringType()),\n", 54 | " StructField(\"TaxiIn\", DoubleType()),\n", 55 | " StructField(\"CRSArrTime\", StringType()),\n", 56 | " StructField(\"ArrTime\", StringType()),\n", 57 | " StructField(\"ArrDelay\", DoubleType()),\n", 58 | " StructField(\"ArrDelayMinutes\", DoubleType()),\n", 59 | " StructField(\"ArrDel15\", DoubleType()),\n", 60 | " StructField(\"ArrivalDelayGroups\", IntegerType()),\n", 61 | " StructField(\"ArrTimeBlk\", StringType()),\n", 62 | " StructField(\"Cancelled\", DoubleType()),\n", 63 | " StructField(\"CancellationCode\", StringType()),\n", 64 | " StructField(\"Diverted\", DoubleType()),\n", 65 | " StructField(\"CRSElapsedTime\", DoubleType()),\n", 66 | " StructField(\"ActualElapsedTime\", DoubleType()),\n", 67 | " StructField(\"AirTime\", DoubleType()),\n", 68 | " StructField(\"Flights\", DoubleType()),\n", 69 | " StructField(\"Distance\", DoubleType()),\n", 70 | " StructField(\"DistanceGroup\", IntegerType()),\n", 71 | " StructField(\"CarrierDelay\", DoubleType()),\n", 72 | " StructField(\"WeatherDelay\", DoubleType()),\n", 73 | " StructField(\"NASDelay\", DoubleType()),\n", 74 | " StructField(\"SecurityDelay\", DoubleType()),\n", 75 | " StructField(\"LateAircraftDelay\", DoubleType()),\n", 76 | " StructField(\"FirstDepTime\", StringType()),\n", 77 | " StructField(\"TotalAddGTime\", StringType()),\n", 78 | " StructField(\"LongestAddGTime\", StringType()),\n", 79 | " StructField(\"DivAirportLandings\", StringType()),\n", 80 | " StructField(\"DivReachedDest\", StringType()),\n", 81 | " StructField(\"DivActualElapsedTime\", StringType()),\n", 82 | " StructField(\"DivArrDelay\", StringType()),\n", 83 | " StructField(\"DivDistance\", StringType()),\n", 84 | " StructField(\"Div1Airport\", StringType()),\n", 85 | " StructField(\"Div1AirportID\", StringType()),\n", 86 | " StructField(\"Div1AirportSeqID\", StringType()),\n", 87 | " StructField(\"Div1WheelsOn\", StringType()),\n", 88 | " StructField(\"Div1TotalGTime\", StringType()),\n", 89 | " StructField(\"Div1LongestGTime\", StringType()),\n", 90 | " StructField(\"Div1WheelsOff\", StringType()),\n", 91 | " StructField(\"Div1TailNum\", StringType()),\n", 92 | " StructField(\"Div2Airport\", StringType()),\n", 93 | " StructField(\"Div2AirportID\", StringType()),\n", 94 | " StructField(\"Div2AirportSeqID\", StringType()),\n", 95 | " StructField(\"Div2WheelsOn\", StringType()),\n", 96 | " StructField(\"Div2TotalGTime\", StringType()),\n", 97 | " StructField(\"Div2LongestGTime\", StringType()),\n", 98 | " StructField(\"Div2WheelsOff\", StringType()),\n", 99 | " StructField(\"Div2TailNum\", StringType()),\n", 100 | " StructField(\"Div3Airport\", StringType()),\n", 101 | " StructField(\"Div3AirportID\", StringType()),\n", 102 | " StructField(\"Div3AirportSeqID\", StringType()),\n", 103 | " StructField(\"Div3WheelsOn\", StringType()),\n", 104 | " StructField(\"Div3TotalGTime\", StringType()),\n", 105 | " StructField(\"Div3LongestGTime\", StringType()),\n", 106 | " StructField(\"Div3WheelsOff\", StringType()),\n", 107 | " StructField(\"Div3TailNum\", StringType()),\n", 108 | " StructField(\"Div4Airport\", StringType()),\n", 109 | " StructField(\"Div4AirportID\", StringType()),\n", 110 | " StructField(\"Div4AirportSeqID\", StringType()),\n", 111 | " 
StructField(\"Div4WheelsOn\", StringType()),\n", 112 | " StructField(\"Div4TotalGTime\", StringType()),\n", 113 | " StructField(\"Div4LongestGTime\", StringType()),\n", 114 | " StructField(\"Div4WheelsOff\", StringType()),\n", 115 | " StructField(\"Div4TailNum\", StringType()),\n", 116 | " StructField(\"Div5Airport\", StringType()),\n", 117 | " StructField(\"Div5AirportID\", StringType()),\n", 118 | " StructField(\"Div5AirportSeqID\", StringType()),\n", 119 | " StructField(\"Div5WheelsOn\", StringType()),\n", 120 | " StructField(\"Div5TotalGTime\", StringType()),\n", 121 | " StructField(\"Div5LongestGTime\", StringType()),\n", 122 | " StructField(\"Div5WheelsOff\", StringType()),\n", 123 | " StructField(\"Div5TailNum\", StringType())\n", 124 | "])\n", 125 | "\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "import itertools\n", 137 | "year_list = ['2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015']\n", 138 | "month_list = ['1','2','3','4','5','6','7','8','9','10','11','12']\n", 139 | "\n", 140 | "air_df_dict = {}\n", 141 | "\n", 142 | "print('Gathering files ...')\n", 143 | "for (year_str,month_str) in list(itertools.product(year_list,month_list)):\n", 144 | " year_month_str = '%s_%s'%(year_str,month_str)\n", 145 | " print('%s, '%(year_month_str), end=\"\")\n", 146 | " air_df_dict[year_month_str] = spark.read.csv( \n", 147 | " 'qfs://master:20000/user/michael/data/airline/On_Time_On_Time_Performance_%s.csv'%(year_month_str), \n", 148 | " header=True, \n", 149 | " schema=air_schema,\n", 150 | " escape='\"')\n", 151 | "print('Done!')" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "from datetime import datetime\n", 163 | "from pyspark.sql.functions import col, udf, unix_timestamp, to_date\n", 164 | "from pyspark.sql.types import DateType\n", 165 | "from pyspark import StorageLevel\n", 166 | "\n", 167 | "airline_data_parts = []\n", 168 | "\n", 169 | "# Should really coalesce to 1 here, but that strains the ODROID XU4 cluster too\n", 170 | "# much.\n", 171 | "print('Processing ', end=\"\")\n", 172 | "for year_month_str, air_df in air_df_dict.items():\n", 173 | " print('%s, '%(year_month_str), end=\"\")\n", 174 | " airline_data = air_df.select(\n", 175 | " \"Year\",\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\"FlightDate\",\"UniqueCarrier\",\"AirlineID\",\n", 176 | " \"Carrier\",\"TailNum\",\"FlightNum\",\"OriginAirportID\",\"OriginAirportSeqID\",\"OriginCityMarketID\",\n", 177 | " \"Origin\",\"OriginCityName\",\"OriginState\",\"OriginStateFips\",\"OriginStateName\",\"OriginWac\",\n", 178 | " \"DestAirportID\",\"DestAirportSeqID\",\"DestCityMarketID\",\"Dest\",\"DestCityName\",\"DestState\",\n", 179 | " \"DestStateFips\",\"DestStateName\",\"DestWac\",\"CRSDepTime\",\"DepTime\",\"DepDelay\",\"DepDelayMinutes\",\n", 180 | " \"DepDel15\",\"DepartureDelayGroups\",\"DepTimeBlk\",\"TaxiOut\",\"WheelsOff\",\"WheelsOn\",\"TaxiIn\",\"CRSArrTime\",\n", 181 | " \"ArrTime\",\"ArrDelay\",\"ArrDelayMinutes\",\"ArrDel15\",\"ArrivalDelayGroups\",\"ArrTimeBlk\",\"Cancelled\",\n", 182 | " \"CancellationCode\",\"Diverted\",\"CRSElapsedTime\",\"ActualElapsedTime\",\"AirTime\",\"Flights\",\"Distance\",\n", 183 | " 
\"DistanceGroup\",\"CarrierDelay\",\"WeatherDelay\",\"NASDelay\",\"SecurityDelay\",\"LateAircraftDelay\"\n", 184 | " ).withColumn(\n", 185 | " 'FlightDate', to_date(col('FlightDate'))\n", 186 | " )\n", 187 | " \n", 188 | " airline_data_parts.append(airline_data)\n", 189 | "\n", 190 | "print('Done!')" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "master_data = airline_data_parts[0]\n", 202 | "\n", 203 | "print('Unionizing data frames 0, ', end=\"\")\n", 204 | "for i in range(1,len(airline_data_parts)):\n", 205 | " print('%d, '%(i), end=\"\")\n", 206 | " master_data = master_data.union(airline_data_parts[i])\n", 207 | "print(\" Done!\")\n", 208 | "print('Starting export to HDFS...')\n", 209 | "master_data.write.partitionBy(\n", 210 | " \"Year\",\"Month\"\n", 211 | " ).parquet(\n", 212 | " 'qfs://master:20000/user/michael/data/airline_data',\n", 213 | " mode='overwrite'\n", 214 | " )\n", 215 | "print('Done!')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "master_data.take(1)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": false 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from pyspark.sql import Row\n", 238 | "\n", 239 | "def mapAirlineIdRow(r):\n", 240 | " airline_id = int(r.Code)\n", 241 | " airline_name_parts = r.Description.split(':')\n", 242 | " airline_name = airline_name_parts[0].strip()\n", 243 | " iata_carrier = airline_name_parts[1].strip()\n", 244 | " out = Row(\n", 245 | " AirlineID=airline_id,\n", 246 | " AirlineName=airline_name,\n", 247 | " Carrier=iata_carrier\n", 248 | " )\n", 249 | " return out;\n", 250 | "\n", 251 | "airline_id_csv = spark.read.csv(\n", 252 | " 'qfs://master:20000/user/michael/data/airline/airline-id-lookup-table.csv',\n", 253 | " header=True,\n", 254 | " escape='\"'\n", 255 | ")\n", 256 | "\n", 257 | "airline_id_df = airline_id_csv.rdd.map(mapAirlineIdRow).toDF().coalesce(1)\n", 258 | "airline_id_df.write.parquet(\n", 259 | " 'qfs://master:20000/user/michael/data/airline_id_table',\n", 260 | " mode='overwrite'\n", 261 | " )" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "airline_id_df.take(1)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "airport_schema = StructType([\n", 284 | " StructField(\"Code\", StringType()),\n", 285 | " StructField(\"Description\", StringType()),\n", 286 | "])\n", 287 | "\n", 288 | "def mapAirportIdRow(r):\n", 289 | " airport_id = r.Code\n", 290 | " airport_city = ''\n", 291 | " airport_name = ''\n", 292 | " airport_name_parts = r.Description.split(':')\n", 293 | " if len(airport_name_parts) is 2:\n", 294 | " airport_city = airport_name_parts[0].strip()\n", 295 | " airport_name = airport_name_parts[1].strip()\n", 296 | " elif len(airport_name_parts) is 1:\n", 297 | " airport_city = airport_name_parts[0]\n", 298 | " airport_name = r.Code\n", 299 | " \n", 300 | " out = Row(\n", 301 | " AirportID=airport_id,\n", 302 | " City=airport_city,\n", 303 | " Name=airport_name\n", 304 | " )\n", 305 | " return out;\n", 306 | 
"\n", 307 | "airport_id_csv = spark.read.csv(\n", 308 | " 'qfs://master:20000/user/michael/data/airline/airport-information.csv',\n", 309 | " header=True,\n", 310 | " escape='\"',\n", 311 | " schema=airport_schema\n", 312 | ")\n", 313 | "\n", 314 | "airport_id_df = airport_id_csv.rdd.map(mapAirportIdRow).toDF().coalesce(1)\n", 315 | "airport_id_df.write.parquet(\n", 316 | " 'qfs://master:20000/user/michael/data/airport_id_table',\n", 317 | " mode='overwrite'\n", 318 | " )\n", 319 | "\n", 320 | "airport_id_df.take(1)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.4.3" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 0 354 | } 355 | -------------------------------------------------------------------------------- /airline-data/load-airline-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Airline Data into Parquet\n", 8 | "This notebook will load the raw CSV data downloaded from the [Bureau of Transportation Statistics's website](https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time) into a parquet file partitioned by year and month. This notebook assumes that the raw files will be in a directory on the QFS file system named `/data/airline/raw/`, and will output the parquet files into a directory named `/data/airline/processed/`." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import pyspark.sql.functions as F\n", 18 | "import pyspark.sql.types as T\n", 19 | "\n", 20 | "spark = SparkSession\\\n", 21 | " .builder\\\n", 22 | " .appName(\"AirlineDataLoad\")\\\n", 23 | " .getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "air_schema = T.StructType([\n", 33 | " T.StructField(\"Year\", T.IntegerType()),\n", 34 | " T.StructField(\"Quarter\", T.IntegerType()),\n", 35 | " T.StructField(\"Month\", T.IntegerType()),\n", 36 | " T.StructField(\"DayofMonth\", T.IntegerType()),\n", 37 | " T.StructField(\"DayOfWeek\", T.IntegerType()),\n", 38 | " T.StructField(\"FlightDate\", T.StringType()),\n", 39 | " T.StructField(\"UniqueCarrier\", T.StringType()),\n", 40 | " T.StructField(\"AirlineID\", T.LongType()),\n", 41 | " T.StructField(\"Carrier\", T.StringType()),\n", 42 | " T.StructField(\"TailNum\", T.StringType()),\n", 43 | " T.StructField(\"FlightNum\", T.IntegerType()),\n", 44 | " T.StructField(\"OriginAirportID\", T.IntegerType()),\n", 45 | " T.StructField(\"OriginAirportSeqID\", T.IntegerType()),\n", 46 | " T.StructField(\"OriginCityMarketID\", T.IntegerType()),\n", 47 | " T.StructField(\"Origin\", T.StringType()),\n", 48 | " T.StructField(\"OriginCityName\", T.StringType()),\n", 49 | " T.StructField(\"OriginState\", T.StringType()),\n", 50 | " T.StructField(\"OriginStateFips\", T.IntegerType()),\n", 51 | " T.StructField(\"OriginStateName\", T.StringType()),\n", 52 | " T.StructField(\"OriginWac\", T.IntegerType()),\n", 53 | " T.StructField(\"DestAirportID\", T.IntegerType()),\n", 54 | " T.StructField(\"DestAirportSeqID\", T.IntegerType()),\n", 55 | " T.StructField(\"DestCityMarketID\", T.IntegerType()),\n", 56 | " T.StructField(\"Dest\", T.StringType()),\n", 57 | " T.StructField(\"DestCityName\", T.StringType()),\n", 58 | " T.StructField(\"DestState\", T.StringType()),\n", 59 | " T.StructField(\"DestStateFips\", T.IntegerType()),\n", 60 | " T.StructField(\"DestStateName\", T.StringType()),\n", 61 | " T.StructField(\"DestWac\", T.IntegerType()),\n", 62 | " T.StructField(\"CRSDepTime\", T.StringType()),\n", 63 | " T.StructField(\"DepTime\", T.StringType()),\n", 64 | " T.StructField(\"DepDelay\", T.DoubleType()),\n", 65 | " T.StructField(\"DepDelayMinutes\", T.DoubleType()),\n", 66 | " T.StructField(\"DepDel15\", T.DoubleType()),\n", 67 | " T.StructField(\"DepartureDelayGroups\", T.IntegerType()),\n", 68 | " T.StructField(\"DepTimeBlk\", T.StringType()),\n", 69 | " T.StructField(\"TaxiOut\", T.DoubleType()),\n", 70 | " T.StructField(\"WheelsOff\", T.StringType()),\n", 71 | " T.StructField(\"WheelsOn\", T.StringType()),\n", 72 | " T.StructField(\"TaxiIn\", T.DoubleType()),\n", 73 | " T.StructField(\"CRSArrTime\", T.StringType()),\n", 74 | " T.StructField(\"ArrTime\", T.StringType()),\n", 75 | " T.StructField(\"ArrDelay\", T.DoubleType()),\n", 76 | " T.StructField(\"ArrDelayMinutes\", T.DoubleType()),\n", 77 | " T.StructField(\"ArrDel15\", T.DoubleType()),\n", 78 | " T.StructField(\"ArrivalDelayGroups\", T.IntegerType()),\n", 79 | " T.StructField(\"ArrTimeBlk\", T.StringType()),\n", 80 | " T.StructField(\"Cancelled\", T.DoubleType()),\n", 81 | " T.StructField(\"CancellationCode\", T.StringType()),\n", 82 | " T.StructField(\"Diverted\", T.DoubleType()),\n", 83 | " T.StructField(\"CRSElapsedTime\", T.DoubleType()),\n", 84 | " 
T.StructField(\"ActualElapsedTime\", T.DoubleType()),\n", 85 | " T.StructField(\"AirTime\", T.DoubleType()),\n", 86 | " T.StructField(\"Flights\", T.DoubleType()),\n", 87 | " T.StructField(\"Distance\", T.DoubleType()),\n", 88 | " T.StructField(\"DistanceGroup\", T.IntegerType()),\n", 89 | " T.StructField(\"CarrierDelay\", T.DoubleType()),\n", 90 | " T.StructField(\"WeatherDelay\", T.DoubleType()),\n", 91 | " T.StructField(\"NASDelay\", T.DoubleType()),\n", 92 | " T.StructField(\"SecurityDelay\", T.DoubleType()),\n", 93 | " T.StructField(\"LateAircraftDelay\", T.DoubleType()),\n", 94 | " T.StructField(\"FirstDepTime\", T.StringType()),\n", 95 | " T.StructField(\"TotalAddGTime\", T.StringType()),\n", 96 | " T.StructField(\"LongestAddGTime\", T.StringType()),\n", 97 | " T.StructField(\"DivAirportLandings\", T.StringType()),\n", 98 | " T.StructField(\"DivReachedDest\", T.StringType()),\n", 99 | " T.StructField(\"DivActualElapsedTime\", T.StringType()),\n", 100 | " T.StructField(\"DivArrDelay\", T.StringType()),\n", 101 | " T.StructField(\"DivDistance\", T.StringType()),\n", 102 | " T.StructField(\"Div1Airport\", T.StringType()),\n", 103 | " T.StructField(\"Div1AirportID\", T.StringType()),\n", 104 | " T.StructField(\"Div1AirportSeqID\", T.StringType()),\n", 105 | " T.StructField(\"Div1WheelsOn\", T.StringType()),\n", 106 | " T.StructField(\"Div1TotalGTime\", T.StringType()),\n", 107 | " T.StructField(\"Div1LongestGTime\", T.StringType()),\n", 108 | " T.StructField(\"Div1WheelsOff\", T.StringType()),\n", 109 | " T.StructField(\"Div1TailNum\", T.StringType()),\n", 110 | " T.StructField(\"Div2Airport\", T.StringType()),\n", 111 | " T.StructField(\"Div2AirportID\", T.StringType()),\n", 112 | " T.StructField(\"Div2AirportSeqID\", T.StringType()),\n", 113 | " T.StructField(\"Div2WheelsOn\", T.StringType()),\n", 114 | " T.StructField(\"Div2TotalGTime\", T.StringType()),\n", 115 | " T.StructField(\"Div2LongestGTime\", T.StringType()),\n", 116 | " T.StructField(\"Div2WheelsOff\", T.StringType()),\n", 117 | " T.StructField(\"Div2TailNum\", T.StringType()),\n", 118 | " T.StructField(\"Div3Airport\", T.StringType()),\n", 119 | " T.StructField(\"Div3AirportID\", T.StringType()),\n", 120 | " T.StructField(\"Div3AirportSeqID\", T.StringType()),\n", 121 | " T.StructField(\"Div3WheelsOn\", T.StringType()),\n", 122 | " T.StructField(\"Div3TotalGTime\", T.StringType()),\n", 123 | " T.StructField(\"Div3LongestGTime\", T.StringType()),\n", 124 | " T.StructField(\"Div3WheelsOff\", T.StringType()),\n", 125 | " T.StructField(\"Div3TailNum\", T.StringType()),\n", 126 | " T.StructField(\"Div4Airport\", T.StringType()),\n", 127 | " T.StructField(\"Div4AirportID\", T.StringType()),\n", 128 | " T.StructField(\"Div4AirportSeqID\", T.StringType()),\n", 129 | " T.StructField(\"Div4WheelsOn\", T.StringType()),\n", 130 | " T.StructField(\"Div4TotalGTime\", T.StringType()),\n", 131 | " T.StructField(\"Div4LongestGTime\", T.StringType()),\n", 132 | " T.StructField(\"Div4WheelsOff\", T.StringType()),\n", 133 | " T.StructField(\"Div4TailNum\", T.StringType()),\n", 134 | " T.StructField(\"Div5Airport\", T.StringType()),\n", 135 | " T.StructField(\"Div5AirportID\", T.StringType()),\n", 136 | " T.StructField(\"Div5AirportSeqID\", T.StringType()),\n", 137 | " T.StructField(\"Div5WheelsOn\", T.StringType()),\n", 138 | " T.StructField(\"Div5TotalGTime\", T.StringType()),\n", 139 | " T.StructField(\"Div5LongestGTime\", T.StringType()),\n", 140 | " T.StructField(\"Div5WheelsOff\", T.StringType()),\n", 141 | " 
T.StructField(\"Div5TailNum\", T.StringType())\n", 142 | "])" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "raw_df = spark.read.csv( \n", 152 | " 'qfs:///data/airline/raw/On_Time_On_Time_Performance_*.csv', \n", 153 | " header=True, \n", 154 | " schema=air_schema,\n", 155 | " escape='\"')\n", 156 | "\n", 157 | "airline_data = raw_df.select(\n", 158 | " \"Year\",\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\"FlightDate\",\"UniqueCarrier\",\"AirlineID\",\n", 159 | " \"Carrier\",\"TailNum\",\"FlightNum\",\"OriginAirportID\",\"OriginAirportSeqID\",\"OriginCityMarketID\",\n", 160 | " \"Origin\",\"OriginCityName\",\"OriginState\",\"OriginStateFips\",\"OriginStateName\",\"OriginWac\",\n", 161 | " \"DestAirportID\",\"DestAirportSeqID\",\"DestCityMarketID\",\"Dest\",\"DestCityName\",\"DestState\",\n", 162 | " \"DestStateFips\",\"DestStateName\",\"DestWac\",\"CRSDepTime\",\"DepTime\",\"DepDelay\",\"DepDelayMinutes\",\n", 163 | " \"DepDel15\",\"DepartureDelayGroups\",\"DepTimeBlk\",\"TaxiOut\",\"WheelsOff\",\"WheelsOn\",\"TaxiIn\",\"CRSArrTime\",\n", 164 | " \"ArrTime\",\"ArrDelay\",\"ArrDelayMinutes\",\"ArrDel15\",\"ArrivalDelayGroups\",\"ArrTimeBlk\",\"Cancelled\",\n", 165 | " \"CancellationCode\",\"Diverted\",\"CRSElapsedTime\",\"ActualElapsedTime\",\"AirTime\",\"Flights\",\"Distance\",\n", 166 | " \"DistanceGroup\",\"CarrierDelay\",\"WeatherDelay\",\"NASDelay\",\"SecurityDelay\",\"LateAircraftDelay\"\n", 167 | " ).withColumn(\n", 168 | " 'FlightDate', F.to_date(F.col('FlightDate'),'yyyy-MM-dd')\n", 169 | " )\n", 170 | "\n", 171 | "airline_data.repartition('Year').write.partitionBy(\n", 172 | " \"Year\",\"Month\"\n", 173 | " ).parquet(\n", 174 | " 'qfs:///data/airline/processed/airline_data',\n", 175 | " mode='overwrite'\n", 176 | " )" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "from pyspark.sql import Row\n", 186 | "\n", 187 | "def mapAirlineIdRow(r):\n", 188 | " airline_id = int(r.Code)\n", 189 | " airline_name_parts = r.Description.split(':')\n", 190 | " airline_name = airline_name_parts[0].strip()\n", 191 | " iata_carrier = airline_name_parts[1].strip()\n", 192 | " out = Row(\n", 193 | " AirlineID=airline_id,\n", 194 | " AirlineName=airline_name,\n", 195 | " Carrier=iata_carrier\n", 196 | " )\n", 197 | " return out;\n", 198 | "\n", 199 | "airline_id_csv = spark.read.csv(\n", 200 | " 'qfs:///data/airline/raw/LUT-DOT_airline_IDs.csv',\n", 201 | " header=True,\n", 202 | " escape='\"'\n", 203 | ")\n", 204 | "\n", 205 | "airline_id_df = airline_id_csv.rdd.map(mapAirlineIdRow).toDF().coalesce(1)\n", 206 | "airline_id_df.write.parquet(\n", 207 | " 'qfs:///data/airline/processed/DOT_airline_codes_table',\n", 208 | " mode='overwrite'\n", 209 | " )\n", 210 | " \n", 211 | "airline_id_df.take(1)\n", 212 | "\n", 213 | "airport_schema = T.StructType([\n", 214 | " T.StructField(\"Code\", T.StringType()),\n", 215 | " T.StructField(\"Description\", T.StringType()),\n", 216 | "])\n", 217 | "\n", 218 | "def mapAirportIdRow(r):\n", 219 | " airport_id = r.Code\n", 220 | " airport_city = ''\n", 221 | " airport_name = ''\n", 222 | " airport_name_parts = r.Description.split(':')\n", 223 | " if len(airport_name_parts) is 2:\n", 224 | " airport_city = airport_name_parts[0].strip()\n", 225 | " airport_name = airport_name_parts[1].strip()\n", 226 | " elif len(airport_name_parts) is 1:\n", 227 | " 
airport_city = airport_name_parts[0]\n", 228 | " airport_name = r.Code\n", 229 | " \n", 230 | " out = Row(\n", 231 | " \n", 232 | " AirportID=airport_id,\n", 233 | " City=airport_city,\n", 234 | " Name=airport_name\n", 235 | " )\n", 236 | " return out;\n", 237 | "\n", 238 | "airport_codes_csv = spark.read.csv(\n", 239 | " 'qfs:///data/airline/raw/LUT-airport_codes.csv',\n", 240 | " header=True,\n", 241 | " escape='\"',\n", 242 | " schema=airport_schema\n", 243 | ")\n", 244 | "\n", 245 | "airport_codes_df = airport_codes_csv.rdd.map(mapAirportIdRow).toDF().coalesce(1)\n", 246 | "airport_codes_df.write.parquet(\n", 247 | " 'qfs:///data/airline/processed/airport_codes_table',\n", 248 | " mode='overwrite'\n", 249 | " )\n", 250 | "\n", 251 | "airport_id_csv = spark.read.csv(\n", 252 | " 'qfs:///data/airline/raw/LUT-DOT_airport_IDs.csv',\n", 253 | " header=True,\n", 254 | " escape='\"',\n", 255 | " schema=airport_schema\n", 256 | ")\n", 257 | "\n", 258 | "airport_id_df = (\n", 259 | " airport_id_csv\n", 260 | " .rdd.map(mapAirportIdRow)\n", 261 | " .toDF()\n", 262 | " .withColumn(\n", 263 | " 'AirportID',\n", 264 | " F.col('AirportID').cast(T.IntegerType())\n", 265 | " )\n", 266 | " .coalesce(1)\n", 267 | ")\n", 268 | "airport_id_df.write.parquet(\n", 269 | " 'qfs:///data/airline/processed/airport_id_table',\n", 270 | " mode='overwrite'\n", 271 | " )" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.5.3" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /apache-access-logs/access-log-to-parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# Parse an Apache access log. 
Assumes Python 3\n", 12 | "import re\n", 13 | "from pyspark.sql import Row\n", 14 | "from datetime import datetime\n", 15 | "\n", 16 | "APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] \"(\S+) (\S+) (\S+)\" (\d{3}) (\d+) \"([^\"]*)\" \"([^\"]*)\"$'\n", 17 | "DATETIME_PARSE_PATTERN = '%d/%b/%Y:%H:%M:%S %z'\n", 18 | "\n", 19 | "# Returns a Row containing the Apache Access Log info\n", 20 | "def parse_apache_log_line(logline):\n", 21 | " match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)\n", 22 | " if match is None:\n", 23 | " return None\n", 24 | " date_obj = datetime.strptime(match.group(4),DATETIME_PARSE_PATTERN)\n", 25 | " return Row(\n", 26 | " ipAddress = match.group(1),\n", 27 | " clientIdentd = match.group(2),\n", 28 | " userId = match.group(3),\n", 29 | " dateTime = match.group(4),\n", 30 | " timestamp = date_obj.timestamp(),\n", 31 | " month = date_obj.strftime('%Y-%m'),\n", 32 | " method = match.group(5),\n", 33 | " endpoint = match.group(6),\n", 34 | " protocol = match.group(7),\n", 35 | " referrer = match.group(10),\n", 36 | " userAgent = match.group(11),\n", 37 | " responseCode = int(match.group(8)),\n", 38 | " contentSize = int(match.group(9)))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "access_logs_raw = sc.textFile(\"hdfs://master:9000/user/michael/data/diybigdata.20160808.log\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "access_logs = access_logs_raw.map(parse_apache_log_line).filter(lambda x: x is not None)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "access_logs_df = access_logs.toDF()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "root\n", 86 | " |-- clientIdentd: string (nullable = true)\n", 87 | " |-- contentSize: long (nullable = true)\n", 88 | " |-- dateTime: string (nullable = true)\n", 89 | " |-- endpoint: string (nullable = true)\n", 90 | " |-- ipAddress: string (nullable = true)\n", 91 | " |-- method: string (nullable = true)\n", 92 | " |-- month: string (nullable = true)\n", 93 | " |-- protocol: string (nullable = true)\n", 94 | " |-- referrer: string (nullable = true)\n", 95 | " |-- responseCode: long (nullable = true)\n", 96 | " |-- timestamp: double (nullable = true)\n", 97 | " |-- userAgent: string (nullable = true)\n", 98 | " |-- userId: string (nullable = true)\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "access_logs_df.printSchema()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 6, 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "access_logs_df.write.partitionBy(\n", 116 | " \"month\"\n", 117 | " ).parquet(\n", 118 | " \"hdfs://master:9000/user/michael/data/diybigdata.20160808.parquet\",\n", 119 | " mode='overwrite'\n", 120 | " )" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": {
135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.4.3" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /reddit-data/README.md: -------------------------------------------------------------------------------- 1 | # Loading Pushshift.io Data 2 | In order to fetch data from [pushshift.io](https://files.pushshift.io/reddit/), use the shell script [download_and_convert_to_bz2.sh](./download_and_convert_to_bz2.sh) to download the comments and/or submissions files and convert them to the `bz2` compression format, which works better with Spark. The Jupyter notebooks in this directory that load the pushshift.io downloads to parquet expect the files to be `bz2` compressed. 3 | 4 | # Reddit Comment Data Analysis 5 | The analyses in this directory pertain to the Reddit Comments Data that can be [downloaded here](http://academictorrents.com/details/85a5bd50e4c365f8df70240ffd4ecc7dec59912b). When analyzing this data, the first step is to load it into the parquet file format. All analyses in this directory expect that the data has been loaded into the parquet file format as implemented in the `load-reddit-*-to-parquet` notebooks. 6 | 7 | The analyses performed on this data set are: 8 | * *Identification of Bot Commenters* ([`reddit-bot-commenters-bensons-law.ipynb`](./reddit-bot-commenters-bensons-law.ipynb)) - This analysis uses Benford's Law to identify the commenters on Reddit that are most likely to be bots based on their commenting patterns. 9 | 10 | -------------------------------------------------------------------------------- /reddit-data/download_and_convert_to_bz2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # there should be two arguments. 5 | # $1 - the directory finalized files should be placed in 6 | # $2 - the file containing the URLs to download, one per line.
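#    Hypothetical example invocation (the destination directory and URL list file below are placeholders):
#      ./download_and_convert_to_bz2.sh /data/reddit/comments/raw reddit_comment_urls.txt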
7 | # 8 | # The following tools should be installed on your system: 9 | # - lbzip2 10 | # - xz-utils 11 | # - zstd 12 | # 13 | 14 | NUM_COMPRESSION_THREADS=12 15 | 16 | if [ $# -ne 2 ]; then 17 | echo "usage: download_and_convert_to_bz2.sh /path/to/destination/directory /path/to/url_list.txt" 18 | exit 1 19 | fi 20 | 21 | # manage arguments 22 | destination_dir=${1%/} 23 | readarray url_list < $2 24 | 25 | # the main loop 26 | echo "Fetching URLs listed in ${2}" 27 | for url in ${url_list[@]}; do 28 | echo "Processing URL = ${url}" 29 | download_file_name="${url##*/}" 30 | download_file_extension="${download_file_name##*.}" 31 | uncompressed_file_name="${download_file_name%.*}" 32 | final_file_name=${download_file_name} 33 | 34 | # download the files 35 | wget $url 36 | 37 | # if the file extension of the download is not bz2, decompress and recompress as bz2 38 | if [ "$download_file_extension" != "bz2" ]; then 39 | if [ "$download_file_extension" == "zst" ]; then 40 | zstd -v -d --memory=2048MB $download_file_name 41 | elif [ "$download_file_extension" == "xz" ]; then 42 | xz -v -k -T $NUM_COMPRESSION_THREADS -d $download_file_name 43 | else 44 | echo "Unrecognized file type for ${url}" 45 | exit 1 46 | fi 47 | lbzip2 -v -n $((NUM_COMPRESSION_THREADS)) $uncompressed_file_name 48 | rm $download_file_name 49 | final_file_name="${uncompressed_file_name}.bz2" 50 | fi 51 | mv -v -f $final_file_name $destination_dir 52 | echo "Finalized ${final_file_name}" 53 | echo "" 54 | done 55 | 56 | echo "Finished processing $2" 57 | exit 0 58 | -------------------------------------------------------------------------------- /reddit-data/load-reddit-comments-to-parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Load Reddit Comments Data into Parquet \n", 8 | "This notebook loads the raw [Reddit comments dataset](http://academictorrents.com/details/85a5bd50e4c365f8df70240ffd4ecc7dec59912b) into a parquet file format. It augments the data with several improved time columns, and then partitions the data by year/month/day. The file paths in this notebook should be modified for your system."
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "ExecuteTime": { 16 | "end_time": "2022-09-15T08:34:31.416045Z", 17 | "start_time": "2022-09-15T08:34:30.146765Z" 18 | } 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import pyspark.sql.functions as F\n", 23 | "import pyspark.sql.types as T\n", 24 | "import pyspark.sql.utils as U\n", 25 | "from pyspark.sql.window import Window as W\n", 26 | "\n", 27 | "import pandas as pd\n", 28 | "\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "\n", 31 | "spark = SparkSession\\\n", 32 | " .builder\\\n", 33 | " .appName(\"RedditCommentsLoadToParquet\")\\\n", 34 | " .getOrCreate()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "ExecuteTime": { 42 | "end_time": "2022-09-15T08:34:31.427088Z", 43 | "start_time": "2022-09-15T08:34:31.420675Z" 44 | } 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "reddit_comments_schema = T.StructType([\n", 49 | " T.StructField(\"id\", T.StringType()),\n", 50 | " T.StructField(\"parent_id\", T.StringType()),\n", 51 | " T.StructField(\"author\", T.StringType()),\n", 52 | " T.StructField(\"link_id\", T.StringType()),\n", 53 | " T.StructField(\"subreddit\", T.StringType()),\n", 54 | " T.StructField(\"subreddit_id\", T.StringType()),\n", 55 | " T.StructField(\"edited\", T.BooleanType()),\n", 56 | " T.StructField(\"score\", T.LongType()),\n", 57 | " T.StructField(\"body\", T.StringType()),\n", 58 | " T.StructField(\"created_utc\", T.LongType()),\n", 59 | " T.StructField(\"retrieved_utc\", T.LongType()),\n", 60 | " T.StructField(\"retrieved_on\", T.LongType()),\n", 61 | "])" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "ExecuteTime": { 69 | "end_time": "2022-09-15T08:47:13.192762Z", 70 | "start_time": "2022-09-15T08:34:32.598344Z" 71 | }, 72 | "code_folding": [] 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "import gc\n", 77 | "\n", 78 | "spark.conf.set(\"spark.sql.session.timeZone\", \"UTC\")\n", 79 | "\n", 80 | "def has_column(df, col_name):\n", 81 | " if col_name in df.columns:\n", 82 | " return F.lit(True)\n", 83 | " else:\n", 84 | " return F.lit(False)\n", 85 | "\n", 86 | "load_months = [\n", 87 | "# (2021, 7),\n", 88 | "# (2021, 8),\n", 89 | "# (2021, 9),\n", 90 | "# (2021, 10),\n", 91 | "# (2021, 11),\n", 92 | "# (2021, 12),\n", 93 | "# (2022, 1),\n", 94 | "# (2022, 2),\n", 95 | "# (2022, 3),\n", 96 | "# (2022, 4),\n", 97 | " (2022, 8),\n", 98 | "]\n", 99 | "\n", 100 | "for year, month in load_months:\n", 101 | " file_path = 'qfs:///data/reddit/comments/raw/RC_{0}-{1:02d}*.bz2'.format(year, month)\n", 102 | " print('loading data for year-month {0}-{1:02d} at file path {2}'.format(year, month, file_path))\n", 103 | " reddit_df = (\n", 104 | " spark.read.json(\n", 105 | " file_path,\n", 106 | " schema=reddit_comments_schema,\n", 107 | " )\n", 108 | " .withColumn(\n", 109 | " 'retrieved_on',\n", 110 | " F.when(\n", 111 | " F.col('retrieved_utc').isNotNull(),\n", 112 | " F.col('retrieved_utc')\n", 113 | " ).otherwise(\n", 114 | " F.col('retrieved_on')\n", 115 | " )\n", 116 | " )\n", 117 | " ) \n", 118 | "\n", 119 | " reddit_finalized = (\n", 120 | " reddit_df\n", 121 | " .select(\n", 122 | " 'author',\n", 123 | " 'link_id',\n", 124 | " 'retrieved_on',\n", 125 | " 'subreddit',\n", 126 | " 'subreddit_id',\n", 127 | " 'id',\n", 128 | " 'parent_id',\n", 129 | " 'edited',\n", 130 | " 'score',\n", 131 | " 'body',\n", 132 | " 'created_utc',\n", 133 | 
" F.from_unixtime('created_utc', 'yyyy-MM-dd').alias('created_date'),\n", 134 | " F.from_unixtime('created_utc', 'dd').alias('day')\n", 135 | " )\n", 136 | " .repartition('day')\n", 137 | " ).cache()\n", 138 | " print(' There are {0} total rows in month data set.'.format(reddit_finalized.count()))\n", 139 | "\n", 140 | " out_path = 'qfs:///data/reddit/comments/processed/year={0}/month={1:02d}'.format(year, month)\n", 141 | " print(' writing to: {0}'.format(out_path))\n", 142 | " reddit_finalized.write.partitionBy(\n", 143 | " 'day'\n", 144 | " ).parquet(\n", 145 | " out_path,\n", 146 | " mode='overwrite'\n", 147 | " )\n", 148 | " print('\\n')\n", 149 | " reddit_finalized.unpersist()\n", 150 | " del reddit_finalized\n", 151 | " del reddit_df\n", 152 | " gc.collect()\n", 153 | " " 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "ExecuteTime": { 161 | "end_time": "2022-09-15T08:47:43.932518Z", 162 | "start_time": "2022-09-15T08:47:13.195656Z" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "reddit_processed = spark.read.parquet('qfs:///data/reddit/comments/processed/')\n", 168 | "reddit_processed.printSchema()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "ExecuteTime": { 176 | "end_time": "2022-09-15T08:54:46.366672Z", 177 | "start_time": "2022-09-15T08:47:43.935354Z" 178 | } 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "(\n", 183 | " reddit_processed\n", 184 | " .groupBy('year')\n", 185 | " .agg(\n", 186 | " F.count('*').alias('count'),\n", 187 | " F.countDistinct('author').alias('authors')\n", 188 | " )\n", 189 | " .orderBy('year')\n", 190 | ").toPandas()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "ExecuteTime": { 198 | "end_time": "2022-08-12T16:12:27.139557Z", 199 | "start_time": "2022-08-12T16:06:08.767951Z" 200 | } 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "(\n", 205 | " reddit_processed\n", 206 | " .groupBy('year')\n", 207 | " .agg(\n", 208 | " F.count('*').alias('count'),\n", 209 | " F.countDistinct('author').alias('authors')\n", 210 | " )\n", 211 | " .orderBy('year')\n", 212 | ").toPandas()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "ExecuteTime": { 220 | "end_time": "2022-09-15T14:39:31.024042Z", 221 | "start_time": "2022-09-15T14:39:16.984361Z" 222 | } 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "(\n", 227 | " reddit_processed\n", 228 | " .filter(\n", 229 | " (F.col('year') == 2022)\n", 230 | " &(F.col('month') == 8)\n", 231 | " )\n", 232 | " .groupBy('year','month','day')\n", 233 | " .agg(\n", 234 | " F.count('*').alias('count'),\n", 235 | " F.countDistinct('author').alias('authors')\n", 236 | " )\n", 237 | " .orderBy('year','month','day')\n", 238 | ").toPandas()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3 (ipykernel)", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.10.7" 266 | }, 267 | "toc": { 
268 | "base_numbering": 1, 269 | "nav_menu": { 270 | "height": "217px", 271 | "width": "201px" 272 | }, 273 | "number_sections": true, 274 | "sideBar": true, 275 | "skip_h1_title": false, 276 | "title_cell": "Table of Contents", 277 | "title_sidebar": "Contents", 278 | "toc_cell": false, 279 | "toc_position": {}, 280 | "toc_section_display": true, 281 | "toc_window_display": false 282 | } 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | -------------------------------------------------------------------------------- /reddit-data/load-reddit-posts-to-parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load Reddit Submissions into Parquet\n", 8 | "\n", 9 | "The raw data was pulled from [pushshift.io](https://files.pushshift.io/reddit/submissions/)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "ExecuteTime": { 17 | "end_time": "2022-09-15T14:39:59.203555Z", 18 | "start_time": "2022-09-15T14:39:57.940891Z" 19 | } 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import pyspark.sql.functions as F\n", 24 | "import pyspark.sql.types as T\n", 25 | "import pyspark.sql.utils as U\n", 26 | "from pyspark.sql.window import Window as W\n", 27 | "\n", 28 | "import pandas as pd\n", 29 | "\n", 30 | "pd.set_option('display.max_colwidth', None)\n", 31 | "\n", 32 | "spark = SparkSession\\\n", 33 | " .builder\\\n", 34 | " .appName(\"RedditPostsLoadToParquet\")\\\n", 35 | " .getOrCreate()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "ExecuteTime": { 43 | "end_time": "2022-09-15T15:03:57.158114Z", 44 | "start_time": "2022-09-15T14:40:07.942034Z" 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "year_range = range(2022,2023)\n", 50 | "\n", 51 | "json_schema = T.StructType([\n", 52 | " T.StructField(\"author\", T.StringType()),\n", 53 | " T.StructField(\"created_utc\", T.LongType()),\n", 54 | " T.StructField(\"domain\", T.StringType()),\n", 55 | " T.StructField(\"edited\", T.BooleanType()),\n", 56 | " T.StructField(\"id\", T.StringType()),\n", 57 | " T.StructField(\"is_crosspostable\", T.BooleanType()),\n", 58 | " T.StructField(\"is_self\", T.BooleanType()),\n", 59 | " T.StructField(\"is_video\", T.BooleanType()),\n", 60 | " T.StructField(\"num_comments\", T.LongType()),\n", 61 | " T.StructField(\"num_crossposts\", T.LongType()),\n", 62 | " T.StructField(\"over_18\", T.BooleanType()),\n", 63 | " T.StructField(\"permalink\", T.StringType()),\n", 64 | " T.StructField(\"promoted\", T.BooleanType()),\n", 65 | " T.StructField(\"score\", T.LongType()),\n", 66 | " T.StructField(\"selftext\", T.StringType()),\n", 67 | " T.StructField(\"spam\", T.BooleanType()),\n", 68 | " T.StructField(\"stickied\", T.BooleanType()),\n", 69 | " T.StructField(\"subreddit\", T.StringType()),\n", 70 | " T.StructField(\"subreddit_id\", T.StringType()),\n", 71 | " T.StructField(\"thumbnail\", T.StringType()),\n", 72 | " T.StructField(\"title\", T.StringType()),\n", 73 | " T.StructField(\"ups\", T.StringType()),\n", 74 | " T.StructField(\"url\", T.StringType()), \n", 75 | "])\n", 76 | "\n", 77 | "def has_column(df, col_name):\n", 78 | " if col_name in df.columns:\n", 79 | " return F.lit(True)\n", 80 | " else:\n", 81 | " return F.lit(False)\n", 82 | "\n", 83 | "for year in year_range:\n", 84 | " print('Processing submissions data for year {0}'.format(year))\n",
85 | " file_pattern = 'qfs:///data/reddit/submissions/raw/RS_*{0}-*.bz2'.format(year)\n", 86 | " submissions_raw = (\n", 87 | " spark.read.json(\n", 88 | " file_pattern,\n", 89 | " encoding='utf-8',\n", 90 | " schema=json_schema,\n", 91 | " )\n", 92 | " )\n", 93 | " df = (\n", 94 | " submissions_raw\n", 95 | " .withColumn(\n", 96 | " 'created_date',\n", 97 | " F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')\n", 98 | " )\n", 99 | " .withColumn(\n", 100 | " 'month',\n", 101 | " F.from_unixtime(F.col('created_utc'), 'MM')\n", 102 | " )\n", 103 | " .withColumn(\n", 104 | " 'day',\n", 105 | " F.from_unixtime(F.col('created_utc'), 'dd')\n", 106 | " )\n", 107 | " .withColumn(\n", 108 | " 'created_date',\n", 109 | " F.from_unixtime(F.col('created_utc'), 'dd')\n", 110 | " )\n", 111 | " ) \n", 112 | " df.write.partitionBy(\n", 113 | " 'month', 'day'\n", 114 | " ).parquet(\n", 115 | " 'qfs:///data/reddit/submissions/processed/year={0}/'.format(year),\n", 116 | " mode='overwrite'\n", 117 | " )\n", 118 | "\n", 119 | " " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2022-09-15T15:04:21.681074Z", 128 | "start_time": "2022-09-15T15:03:57.161435Z" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "submissions_df = spark.read.parquet('qfs:///data/reddit/submissions/processed/')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2022-09-15T15:04:21.695868Z", 142 | "start_time": "2022-09-15T15:04:21.683797Z" 143 | } 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "submissions_df.printSchema()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "ExecuteTime": { 155 | "end_time": "2022-07-29T10:08:18.918367Z", 156 | "start_time": "2022-07-29T10:08:07.636296Z" 157 | } 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "submissions_df.count()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "ExecuteTime": { 169 | "end_time": "2022-09-15T15:04:31.142038Z", 170 | "start_time": "2022-09-15T15:04:21.698757Z" 171 | } 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "submissions_df.count()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "ExecuteTime": { 183 | "end_time": "2022-09-15T16:04:57.119540Z", 184 | "start_time": "2022-09-15T15:56:36.679141Z" 185 | } 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "submissions_df.filter('author = \"MichaelKamprath\"').toPandas()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "ExecuteTime": { 197 | "end_time": "2022-09-15T15:12:50.759818Z", 198 | "start_time": "2022-09-15T15:12:50.224480Z" 199 | }, 200 | "code_folding": [ 201 | 14 202 | ] 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "import matplotlib.pyplot as plt\n", 207 | "import numpy as np\n", 208 | "import pandas as pd\n", 209 | "from pandas.plotting import register_matplotlib_converters\n", 210 | "\n", 211 | "register_matplotlib_converters()\n", 212 | "pd.set_option('display.max_colwidth', None)\n", 213 | "\n", 214 | "def plot_line_graph(\n", 215 | " df,\n", 216 | " y_axis_column,\n", 217 | " x_axis_column,\n", 218 | " segment_column=None,\n", 219 | " segment_values=None, # a list of values from segment_column to be graphed\n", 220 | " 
segment_labels=None, # a dictionary with segment_values as key and name a value\n", 221 | " xlabel=None,\n", 222 | " ylabel=None,\n", 223 | " line_width=2,\n", 224 | " xlabel_rotation=None,\n", 225 | " x_axis_is_dates=True,\n", 226 | " y_axis_log_scale=False,\n", 227 | " title=None,\n", 228 | " legend_location='lower left',\n", 229 | "):\n", 230 | " df_pd = df.toPandas()\n", 231 | " fig, ax = plt.subplots()\n", 232 | "\n", 233 | " if segment_column is None:\n", 234 | " x_axis_values = df_pd[x_axis_column]\n", 235 | " if ylabel is None:\n", 236 | " item_label = y_axis_column\n", 237 | " else: \n", 238 | " item_label = ylabel\n", 239 | " if x_axis_is_dates:\n", 240 | " ax.plot_date(\n", 241 | " x_axis_values,\n", 242 | " df_pd[y_axis_column],\n", 243 | " '-',\n", 244 | " linewidth = line_width,\n", 245 | " label = item_label,\n", 246 | " )\n", 247 | " else:\n", 248 | " ax.plot(\n", 249 | " x_axis_values,\n", 250 | " df_pd[y_axis_column],\n", 251 | " label = item_label,\n", 252 | " linewidth = line_width,\n", 253 | " ) \n", 254 | " else:\n", 255 | " if segment_values is None:\n", 256 | " segment_value_list = [r.val for r in df.select(F.col(segment_column).alias('val')).distinct().collect()]\n", 257 | " else:\n", 258 | " segment_value_list = segment_values\n", 259 | " for i in segment_value_list:\n", 260 | " data = df_pd[df_pd[segment_column] == i]\n", 261 | " x_axis_values = data[x_axis_column]\n", 262 | " if segment_labels is not None:\n", 263 | " item_label = segment_labels[i]\n", 264 | " else:\n", 265 | " item_label = \"{0}\".format(i)\n", 266 | " \n", 267 | " if x_axis_is_dates:\n", 268 | " ax.plot_date(\n", 269 | " x_axis_values,\n", 270 | " data[y_axis_column],\n", 271 | " '-',\n", 272 | " linewidth = line_width,\n", 273 | " label = item_label,\n", 274 | " ) \n", 275 | " else:\n", 276 | " ax.plot(\n", 277 | " x_axis_values,\n", 278 | " data[y_axis_column],\n", 279 | " label = item_label,\n", 280 | " linewidth = line_width\n", 281 | " )\n", 282 | " \n", 283 | " fig.set_size_inches(20,12)\n", 284 | " if xlabel is not None:\n", 285 | " plt.xlabel(xlabel)\n", 286 | " if ylabel is not None:\n", 287 | " plt.ylabel(ylabel)\n", 288 | " if xlabel_rotation is not None:\n", 289 | " plt.xticks(rotation=xlabel_rotation)\n", 290 | " if x_axis_is_dates:\n", 291 | " fig.autofmt_xdate()\n", 292 | " if y_axis_log_scale:\n", 293 | " plt.grid()\n", 294 | " plt.yscale(\"log\")\n", 295 | " if title is not None:\n", 296 | " fig.suptitle(title, fontsize=18)\n", 297 | " ax.legend(loc=legend_location)\n", 298 | " plt.show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "ExecuteTime": { 306 | "end_time": "2022-09-15T15:12:50.923459Z", 307 | "start_time": "2022-09-15T15:12:50.762273Z" 308 | } 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "monthly_submissions = (\n", 313 | " submissions_df\n", 314 | " .withColumn(\n", 315 | " 'year_month', \n", 316 | " F.to_date(\n", 317 | " F.format_string('%4d-%02d', F.col('year'), F.col('month')),\n", 318 | " format='yyyy-MM'\n", 319 | " ) \n", 320 | " )\n", 321 | " .groupBy('year_month')\n", 322 | " .agg(\n", 323 | " F.count('*').alias('count'),\n", 324 | " F.countDistinct('author').alias('authors')\n", 325 | " )\n", 326 | " .orderBy(F.col('year_month'))\n", 327 | " ).cache()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "ExecuteTime": { 335 | "end_time": "2022-07-04T10:53:25.010451Z", 336 | "start_time": "2022-07-04T10:53:24.767770Z" 337 | } 
338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "(\n", 342 | "    submissions_df\n", 343 | "    .filter(F.col('month').isNull())\n", 344 | "    .select(\n", 345 | "        'author',\n", 346 | "        'subreddit_id',\n", 347 | "        'permalink',\n", 348 | "        'selftext',\n", 349 | "        'created_utc',\n", 350 | "        'created_date',\n", 351 | "        'year',\n", 352 | "        'month',\n", 353 | "        'day',\n", 354 | "    )\n", 355 | ").limit(20).toPandas()" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "ExecuteTime": { 363 | "end_time": "2022-07-04T10:53:25.514523Z", 364 | "start_time": "2022-07-04T10:53:25.012170Z" 365 | } 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "submissions_df.filter(F.col('month').isNull()).groupBy('year').agg(F.count('*').alias('count')).toPandas()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "ExecuteTime": { 377 | "end_time": "2022-07-04T10:56:48.296261Z", 378 | "start_time": "2022-07-04T10:53:25.517524Z" 379 | } 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "monthly_submissions.orderBy(F.col('year_month')).limit(20).toPandas()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": { 390 | "ExecuteTime": { 391 | "end_time": "2022-08-12T17:37:04.642659Z", 392 | "start_time": "2022-08-12T17:37:04.010513Z" 393 | } 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "plot_line_graph(\n", 398 | "    monthly_submissions,\n", 399 | "    'authors',\n", 400 | "    'year_month',\n", 401 | "    xlabel='Date',\n", 402 | "    ylabel='Authors',\n", 403 | ")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [] 412 | } 413 | ], 414 | "metadata": { 415 | "kernelspec": { 416 | "display_name": "Python 3 (ipykernel)", 417 | "language": "python", 418 | "name": "python3" 419 | }, 420 | "language_info": { 421 | "codemirror_mode": { 422 | "name": "ipython", 423 | "version": 3 424 | }, 425 | "file_extension": ".py", 426 | "mimetype": "text/x-python", 427 | "name": "python", 428 | "nbconvert_exporter": "python", 429 | "pygments_lexer": "ipython3", 430 | "version": "3.7.9" 431 | }, 432 | "toc": { 433 | "base_numbering": 1, 434 | "nav_menu": {}, 435 | "number_sections": true, 436 | "sideBar": true, 437 | "skip_h1_title": false, 438 | "title_cell": "Table of Contents", 439 | "title_sidebar": "Contents", 440 | "toc_cell": false, 441 | "toc_position": {}, 442 | "toc_section_display": true, 443 | "toc_window_display": false 444 | } 445 | }, 446 | "nbformat": 4, 447 | "nbformat_minor": 4 448 | } 449 | -------------------------------------------------------------------------------- /reddit-data/reddit-bot-commenters-bensons-law.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reddit Bot Commenters \n", 8 | "Identifies likely bot commenters on Reddit using Benford's Law. See [original blog post](https://diybigdata.net/2020/03/using-benfords-law-to-identify-bots-on-reddit/) for a discussion on this technique.\n", 9 | "\n",
10 | "The core of this code is the `generateBenfordsLawAnalysis()` function, which takes a user event log data frame that must have a user ID column and an event timestamp column, and it returns the chi squared score of how close each user's activity is to the ideal Benford's Law distribution. Scores closer to zero mean the user's activity more closely adheres to the ideal distribution. " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": { 17 | "ExecuteTime": { 18 | "end_time": "2022-08-12T17:58:46.627964Z", 19 | "start_time": "2022-08-12T17:58:45.607762Z" 20 | } 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import pyspark.sql.functions as F\n", 25 | "import pyspark.sql.types as T\n", 26 | "from pyspark.sql.window import Window as W\n", 27 | "\n", 28 | "import pandas as pd\n", 29 | "\n", 30 | "pd.set_option('display.max_colwidth', None)\n", 31 | "\n", 32 | "spark = SparkSession\\\n", 33 | "    .builder\\\n", 34 | "    .appName(\"RedditBotCommenters\")\\\n", 35 | "    .getOrCreate()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "ExecuteTime": { 43 | "end_time": "2022-08-12T17:58:46.636108Z", 44 | "start_time": "2022-08-12T17:58:46.630656Z" 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "orig_shuffle_partitions = spark.conf.get(\"spark.sql.shuffle.partitions\")\n", 50 | "spark.conf.set(\"spark.sql.shuffle.partitions\", 500)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "ExecuteTime": { 58 | "end_time": "2022-08-12T17:59:26.166948Z", 59 | "start_time": "2022-08-12T17:58:46.638715Z" 60 | } 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "reddit_df = (\n", 65 | "    spark.read.parquet('qfs:///data/reddit/comments/processed')\n", 66 | "    # filter out moderator and deleted authors\n", 67 | "    .filter(~F.col('author').isin('[deleted]','AutoModerator'))\n", 68 | ")\n", 69 | "\n", 70 | "reddit_df.printSchema()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "ExecuteTime": { 78 | "end_time": "2022-08-12T17:59:42.896570Z", 79 | "start_time": "2022-08-12T17:59:26.168915Z" 80 | } 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "submissions_df = spark.read.parquet('qfs:///data/reddit/submissions/processed')\n", 85 | "submissions_df.printSchema()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "ExecuteTime": { 93 | "end_time": "2022-08-12T17:59:42.969137Z", 94 | "start_time": "2022-08-12T17:59:42.899315Z" 95 | } 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "combined_df = (\n", 100 | "    reddit_df\n", 101 | "    .select(\n", 102 | "        'author',\n", 103 | "        'created_utc',\n", 104 | "    )\n", 105 | "    .union(\n", 106 | "        submissions_df\n", 107 | "        .select(\n", 108 | "            'author',\n", 109 | "            'created_utc',\n", 110 | "        )\n", 111 | "        \n", 112 | "    )\n", 113 | "    .filter(\n", 114 | "        F.col('author').isNotNull()\n", 115 | "        &(F.length(F.col('author')) > 0)\n", 116 | "    )\n", 117 | "    .repartition('author')\n", 118 | ")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "`generateBenfordsLawAnalysis`\n", 126 | "\n",
127 | "A function to perform Benford's Law analysis against a data frame of user activities in order to determine which users' activities best (or least) adhere to the Benford's Law distribution. The data frame is essentially an event log keyed by a user ID and has a timestamp for each event row. Only the user ID and timestamp columns are used for analysis.\n", 128 | "\n", 129 | "### Arguments \n", 130 | "* `df` - The data frame with the timestamped user activity to be analyzed\n", 131 | "* `user_col` - a string identifying the name of the column of df that contains the user IDs\n", 132 | "* `timestamp_col` - a string identifying the name of the column of df that contains the event timestamps. Must be `T.LongType()`.\n", 133 | "* `event_threshold` - the minimum number of events a user must have for the Benford's Law analysis to be performed on it. Defaults to 100.\n", 134 | "\n", 135 | "### Returns \n", 136 | "A dataframe with the following columns:\n", 137 | "* `user_col` - The user IDs. The column name will be the same as in the original dataframe.\n", 138 | "* `frequency_count` - the number of events found for the user\n", 139 | "* `chi_squared` - the chi squared score indicating how similar the user's activity is to the ideal Benford's Law distribution.\n", 140 | "* `digit_share` - A list containing the relative share each first digit has among the user's activity. The list is ordered from digit 1 to digit 9.\n", 141 | "\n" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "ExecuteTime": { 149 | "end_time": "2022-08-12T17:59:42.989415Z", 150 | "start_time": "2022-08-12T17:59:42.972762Z" 151 | }, 152 | "code_folding": [] 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "from math import log10, sqrt\n", 157 | "\n", 158 | "def _getUsersAndDigit(df, user_col, event_threshold):\n", 159 | "    digits_df = (\n", 160 | "        spark\n", 161 | "        .createDataFrame(\n", 162 | "            [[1], [2], [3], [4], [5], [6], [7], [8], [9]],\n", 163 | "            schema=T.StructType([\n", 164 | "                T.StructField(\n", 165 | "                    \"first_digit\", \n", 166 | "                    T.IntegerType()\n", 167 | "                )\n", 168 | "            ])\n", 169 | "        )\n", 170 | "        .coalesce(1)\n", 171 | "    )\n", 172 | "    users_and_digits = (\n", 173 | "        df\n", 174 | "        .groupBy(user_col)\n", 175 | "        .agg(F.count('*').alias('count'))\n", 176 | "        .filter(F.col('count') > event_threshold )\n", 177 | "        .select(user_col)\n", 178 | "        .repartition(user_col)\n", 179 | "        .crossJoin(digits_df)\n", 180 | "    )\n", 181 | "    return users_and_digits\n", 182 | "\n", 183 | "def _generateFirstDigitShare(df, user_col, timestamp_col):\n", 184 | "    user_event_window = W.partitionBy(user_col).orderBy(timestamp_col)\n", 185 | "    user_cum_dist_window = W.partitionBy(user_col).orderBy('first_digit')\n", 186 | "    \n", 187 | "    event_time_delta = F.col(timestamp_col) - F.lag(F.col(timestamp_col)).over(user_event_window)\n", 188 | "\n", 189 | "    first_digit_share = (\n", 190 | "        df\n", 191 | "        .select(\n", 192 | "            user_col,\n", 193 | "            timestamp_col,\n", 194 | "            event_time_delta.alias('time_delta')\n", 195 | "        )\n", 196 | "        .filter(F.col('time_delta').isNotNull())\n", 197 | "        .withColumn(\n", 198 | "            'first_digit',\n", 199 | "            F.substring(F.col('time_delta').cast(T.StringType()), 0, 1).cast(T.IntegerType())\n", 200 | "        )\n", 201 | "        .withColumn(\n", 202 | "            'first_digit_cum_dist',\n", 203 | "            F.cume_dist().over(user_cum_dist_window)\n", 204 | "        )\n", 205 | "        .groupBy(user_col, 'first_digit', 'first_digit_cum_dist')\n", 206 | "        .agg(\n", 207 | "            F.count(timestamp_col).alias('frequency_count')\n", 208 | "        )\n", 209 | "        .withColumn(\n", 210 | "            'first_digit_share',\n", 211 | "            F.col('first_digit_cum_dist') \n", 212 | "            - F.coalesce(\n", 213 | "                F.lag('first_digit_cum_dist').over(user_cum_dist_window), \n", 214 | "                F.lit(0)\n", 215 | "            )\n", 216 | "        )\n", 217 | "        
.repartition(user_col)\n", 218 | " )\n", 219 | " return first_digit_share\n", 220 | "\n", 221 | "def _expectedBenfordsShare():\n", 222 | " digits = [1, 2, 3, 4, 5, 6, 7, 8, 9]\n", 223 | " expected_share_list = [(d, log10(d+1)-log10(d)) for d in digits]\n", 224 | "\n", 225 | " expected_share_df = (\n", 226 | " spark\n", 227 | " .createDataFrame(\n", 228 | " expected_share_list,\n", 229 | " schema=T.StructType([\n", 230 | " T.StructField(\n", 231 | " 'first_digit', \n", 232 | " T.IntegerType()\n", 233 | " ),\n", 234 | " T.StructField(\n", 235 | " 'expected_share',\n", 236 | " T.DoubleType()\n", 237 | " )\n", 238 | " ])\n", 239 | " )\n", 240 | " .coalesce(1)\n", 241 | " )\n", 242 | " \n", 243 | " return expected_share_df\n", 244 | "\n", 245 | "def generateBenfordsLawAnalysis(df, user_col, timestamp_col, event_threshold = 100):\n", 246 | " user_digts_df = _getUsersAndDigit(df, user_col, event_threshold)\n", 247 | " first_digit_share_df = _generateFirstDigitShare(df, user_col, timestamp_col)\n", 248 | " expected_share_df = _expectedBenfordsShare()\n", 249 | " \n", 250 | " finalized_first_digit_share_df = (\n", 251 | " first_digit_share_df\n", 252 | " .join(\n", 253 | " user_digts_df,\n", 254 | " on=[user_col,'first_digit'],\n", 255 | " how='right'\n", 256 | " )\n", 257 | " .na.fill(0)\n", 258 | " .cache()\n", 259 | " ) \n", 260 | " user_benford_distances = (\n", 261 | " finalized_first_digit_share_df\n", 262 | " .join(\n", 263 | " F.broadcast(expected_share_df),\n", 264 | " on='first_digit',\n", 265 | " how='inner'\n", 266 | " )\n", 267 | " .withColumn(\n", 268 | " 'chi_squared_addends',\n", 269 | " F.pow(\n", 270 | " (F.col('first_digit_share') - F.col('expected_share')),\n", 271 | " F.lit(2)\n", 272 | " ) / F.col('expected_share')\n", 273 | " )\n", 274 | " .orderBy(user_col, 'first_digit')\n", 275 | " .groupBy(user_col)\n", 276 | " .agg(\n", 277 | " F.sum('frequency_count').alias('frequency_count'),\n", 278 | " F.sum('chi_squared_addends').alias('chi_squared'),\n", 279 | " F.collect_list(F.col('first_digit_share')).alias('digit_share')\n", 280 | " )\n", 281 | " )\n", 282 | " return user_benford_distances " 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "ExecuteTime": { 290 | "end_time": "2022-08-12T18:23:56.132897Z", 291 | "start_time": "2022-08-12T17:59:42.991016Z" 292 | }, 293 | "scrolled": false 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "new_df = generateBenfordsLawAnalysis(reddit_df, 'author', 'created_utc')\n", 298 | "\n", 299 | "new_df.orderBy(F.col('chi_squared').desc()).limit(50).toPandas()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "ExecuteTime": { 307 | "end_time": "2022-08-12T20:16:38.977709Z", 308 | "start_time": "2022-08-12T20:15:52.691420Z" 309 | } 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "new_df.write.parquet(\n", 314 | " 'qfs:///user/spark/reddit/author_bot_chi_squared_score/',\n", 315 | " mode='overwrite'\n", 316 | ")" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [] 325 | } 326 | ], 327 | "metadata": { 328 | "kernelspec": { 329 | "display_name": "Python 3 (ipykernel)", 330 | "language": "python", 331 | "name": "python3" 332 | }, 333 | "language_info": { 334 | "codemirror_mode": { 335 | "name": "ipython", 336 | "version": 3 337 | }, 338 | "file_extension": ".py", 339 | "mimetype": "text/x-python", 340 | "name": "python", 341 | 
"nbconvert_exporter": "python", 342 | "pygments_lexer": "ipython3", 343 | "version": "3.10.7" 344 | }, 345 | "toc": { 346 | "base_numbering": 1, 347 | "nav_menu": {}, 348 | "number_sections": true, 349 | "sideBar": true, 350 | "skip_h1_title": false, 351 | "title_cell": "Table of Contents", 352 | "title_sidebar": "Contents", 353 | "toc_cell": false, 354 | "toc_position": {}, 355 | "toc_section_display": true, 356 | "toc_window_display": false 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 4 361 | } 362 | -------------------------------------------------------------------------------- /tools/download_and_convert_to_bz2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # This script will download each URL listed in a passed configuration files, 5 | # and then if needed convert it's compression to bz2, which plays nicer with 6 | # Apache Spark. 7 | # 8 | # This script has two arguments. 9 | # $1 - the directory finalized files should be placed in 10 | # $2 - the file containing the URLs to download, one URL per line. 11 | # 12 | # All processing will be done in the current working directory. 13 | # Files are downloaded and processed one at a time. 14 | # 15 | # Requires the following tools be installed: 16 | # zstd 17 | # xz 18 | # lbzip2 19 | # 20 | # Set NUM_COMPRESSION_THREADS environment variable for the number of threads then 21 | # various compression tools will use. Defaults to 12. 22 | # 23 | 24 | NUM_COMPRESSION_THREADS=${NUM_COMPRESSION_THREADS:-12} 25 | 26 | if [ $# -ne 2 ]; then 27 | echo "usage: download_and_convert_to_bz2.sh /path/to/destination/directory /path/to/url_list.txt" 28 | exit 1 29 | fi 30 | 31 | # manage arguments 32 | destination_dir=${1%/} 33 | readarray url_list < $2 34 | 35 | # the main loop 36 | echo "Fetching URLs list in ${2}" 37 | for url in ${url_list[@]}; do 38 | echo "Processing URL = ${url}" 39 | download_file_name="${url##*/}" 40 | download_file_extension="${download_file_name##*.}" 41 | uncompressed_file_name="${download_file_name%.*}" 42 | final_file_name=${download_file_name} 43 | 44 | # download the files 45 | wget $url 46 | 47 | # if file extension of download is not bz2 deompress and recompress as bz2 48 | if [ "$download_file_extension" != "bz2" ]; then 49 | if [ "$download_file_extension" == "zst" ]; then 50 | zstd -v -d $download_file_name 51 | elif [ "$download_file_extension" == "xz" ]; then 52 | xz -v -k -T $NUM_COMPRESSION_THREADS -d $download_file_name 53 | else 54 | echo "Unrecognized file type for ${url}" 55 | exit 1 56 | fi 57 | lbzip2 -v -n $((NUM_COMPRESSION_THREADS)) $uncompressed_file_name 58 | rm $download_file_name 59 | final_file_name="${uncompressed_file_name}.bz2" 60 | fi 61 | mv -v -f $final_file_name $destination_dir 62 | echo "Finalized ${final_file_name}" 63 | echo "" 64 | done 65 | 66 | echo "Finished processing $2" 67 | exit 0 68 | -------------------------------------------------------------------------------- /udf-development/build.sbt: -------------------------------------------------------------------------------- 1 | name := "diybigdata-udf" 2 | 3 | // orgnization name (e.g., the package name of the project) 4 | organization := "net.diybigdata" 5 | 6 | version := "1.0-SNAPSHOT" 7 | 8 | // project description 9 | description := "DIY Big Data Hive UDFs" 10 | 11 | // Enables publishing to maven repo 12 | publishMavenStyle := true 13 | 14 | // Do not append Scala versions to the generated artifacts 15 | crossPaths := false 16 | 17 | // This 
forbids including Scala-related libraries among the dependencies 18 | autoScalaLibrary := false 19 | 20 | // Use the latest Scala version with Spark 2+ 21 | scalaVersion := "2.11.6" 22 | scalacOptions ++= Seq("-unchecked", "-feature", "-deprecation") 23 | 24 | // Add repositories where library dependencies can be found 25 | resolvers += "Cloudera" at "https://repository.cloudera.com/content/repositories/releases/" 26 | resolvers += "Central" at "http://central.maven.org/maven2/" 27 | resolvers += "Spring Plugins" at "http://repo.spring.io/plugins-release/" 28 | 29 | // library dependencies. (organization name) % (project name) % (version) 30 | libraryDependencies ++= Seq( 31 | 	"org.apache.hive" % "hive-exec" % "2.1.0" % "provided", 32 | 	"org.apache.hadoop" % "hadoop-core" % "2.6.0-mr1-cdh5.8.2", 33 | 	"com.novocode" % "junit-interface" % "0.11" % "test" 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /udf-development/src/main/java/net/diybigdata/udf/FormatYearMonthString.java: -------------------------------------------------------------------------------- 1 | package net.diybigdata.udf; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | import org.apache.hadoop.hive.ql.exec.Description; 5 | 6 | @Description( 7 | 	name = "FormatYearMonthString", 8 | 	value = "_FUNC_(InputDataType) - Converts the passed year and month integers to a formatted string.", 9 | 	extended = "Example:\n" 10 | 	+ "  > SELECT _FUNC_(InputDataType) FROM tablename;") 11 | 12 | public class FormatYearMonthString extends UDF { 13 | 	public String evaluate( Integer year, Integer month ) { 14 | 		return String.format("%1$d-%2$02d", year, month ); 15 | 	} 16 | } 17 | -------------------------------------------------------------------------------- /udf-development/src/test/java/net/diybigdata/udf/FormatYearMonthString_T.java: -------------------------------------------------------------------------------- 1 | package net.diybigdata.udf; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import org.junit.Test; 5 | 6 | import net.diybigdata.udf.FormatYearMonthString; 7 | 8 | public class FormatYearMonthString_T { 9 | 10 | 	@Test 11 | 	public void testStringFormatting() { 12 | 		FormatYearMonthString udf = new FormatYearMonthString(); 13 | 14 | 		assertEquals( 15 | 			"evaluate(1936, 12)", 16 | 			"1936-12", 17 | 			udf.evaluate( 1936, 12 ) 18 | 		); 19 | 		assertEquals( 20 | 			"evaluate(1980, 07)", 21 | 			"1980-07", 22 | 			udf.evaluate( 1980, 07 ) 23 | 		); 24 | 	} 25 | } 26 | --------------------------------------------------------------------------------
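
A note on the statistic computed in `reddit-data/reddit-bot-commenters-bensons-law.ipynb`: the notebook's `_expectedBenfordsShare()` builds the ideal Benford first-digit distribution, and `generateBenfordsLawAnalysis()` sums the `chi_squared_addends` column into each user's `chi_squared` score. Written out explicitly (the symbol $s_d$ for a user's observed first-digit share is introduced here for exposition only and does not appear in the notebook), the quantities are

$$ P(d) = \log_{10}(d+1) - \log_{10}(d) = \log_{10}\left(1 + \frac{1}{d}\right), \qquad d = 1, \dots, 9 $$

$$ \chi^2_{\text{user}} = \sum_{d=1}^{9} \frac{\left(s_d - P(d)\right)^2}{P(d)} $$

where $s_d$ is the share of a user's inter-event time deltas whose leading digit is $d$. As in the notebook code, the statistic is computed on shares rather than on raw counts, so it serves as a relative distance measure for ranking users rather than a formal chi-squared test.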
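
Usage note for the `udf-development` project: the sketch below shows one plausible way to register and call `FormatYearMonthString` from a Hive-enabled Spark session. It assumes the jar produced by `sbt package` is named `diybigdata-udf-1.0-SNAPSHOT.jar`, which follows from `name`, `version`, and `crossPaths := false` in build.sbt; the jar path and the table name `some_table` are placeholders, not values taken from this repository.

```python
from pyspark.sql import SparkSession

# Hive support lets Spark register a classic Hive UDF (a class that
# extends org.apache.hadoop.hive.ql.exec.UDF, as FormatYearMonthString does).
spark = (
    SparkSession.builder
    .appName("FormatYearMonthStringExample")
    .enableHiveSupport()
    .getOrCreate()
)

# Register the UDF from the built jar. The path below is a placeholder.
spark.sql("""
    CREATE TEMPORARY FUNCTION format_year_month
    AS 'net.diybigdata.udf.FormatYearMonthString'
    USING JAR '/path/to/diybigdata-udf-1.0-SNAPSHOT.jar'
""")

# 'some_table' is a placeholder for any table with integer Year and Month columns.
spark.sql(
    "SELECT format_year_month(Year, Month) AS year_month FROM some_table LIMIT 5"
).show()
```

The same `CREATE TEMPORARY FUNCTION ... AS ... USING JAR ...` statement should also work directly in Hive itself.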