├── .gitignore
├── README.md
├── airline-data
│   ├── README.md
│   ├── airport-graph-analysis.ipynb
│   ├── archive
│   │   └── mini-cluster
│   │       ├── README.md
│   │       ├── airline-data-to-parquet.ipynb
│   │       ├── average-airline-delay-hive-udf.ipynb
│   │       └── average-airline-delay.ipynb
│   └── load-airline-data.ipynb
├── apache-access-logs
│   └── access-log-to-parquet.ipynb
├── reddit-data
│   ├── README.md
│   ├── download_and_convert_to_bz2.sh
│   ├── load-reddit-comments-to-parquet.ipynb
│   ├── load-reddit-posts-to-parquet.ipynb
│   └── reddit-bot-commenters-bensons-law.ipynb
├── tools
│   └── download_and_convert_to_bz2.sh
└── udf-development
    ├── build.sbt
    └── src
        ├── main
        │   └── java
        │       └── net
        │           └── diybigdata
        │               └── udf
        │                   └── FormatYearMonthString.java
        └── test
            └── java
                └── net
                    └── diybigdata
                        └── udf
                            └── FormatYearMonthString_T.java
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | # Spark / Hive local metastore
92 | metastore_db/
93 |
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Data Analysis Projects
2 |
3 | These are various Apache Spark data analysis projects done in Jupyter notebooks. Some of these analyses were conducted on the [ODROID XU4 mini cluster](http://diybigdata.net/odroid-xu4-cluster/), while the more recent ones are performed on the [Personal Compute Cluster](https://diybigdata.net/personal-compute-cluster-2019-edition/). Since the XU4 mini cluster is a significantly constrained system, the projects done there are limited in scope. If you are looking to repeat any of these projects, the Personal Compute Cluster versions are more current.
4 |
--------------------------------------------------------------------------------
/airline-data/README.md:
--------------------------------------------------------------------------------
1 | # Airline Activity Analysis
2 | The goal of this project is to study the data available from the Department of Transportation concerning flight activity and timeliness. The data for this analysis is available at [the Bureau of Transportation Statistics site](http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time).
3 |
4 | The data for and description of these projects can be [found here](https://diybigdata.net/airline-on-time-performance-data-analysis/).
5 |
6 | The analyses done in these notebooks use Apache Spark v2.4 or greater with the [Quantcast File System](https://github.com/quantcast/qfs) as the underlying distributed file system. The code can be easily converted to use [HDFS](https://en.wikipedia.org/wiki/Apache_Hadoop) as needed.
7 |
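8 | As a minimal sketch of that conversion (assuming an existing `spark` session, as in the notebooks; the HDFS host and port below are placeholders, not values taken from this project), only the file system URI in the paths handed to Spark needs to change:
9 |
10 | ```python
11 | # Reading the processed parquet data from QFS, as these notebooks do
12 | airline_data = spark.read.parquet('qfs:///data/airline/processed/airline_data')
13 |
14 | # The equivalent read against HDFS swaps only the URI scheme (and host/port if one
15 | # is not configured as the default file system)
16 | airline_data = spark.read.parquet('hdfs://namenode:9000/data/airline/processed/airline_data')
17 | ```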
--------------------------------------------------------------------------------
/airline-data/archive/mini-cluster/README.md:
--------------------------------------------------------------------------------
1 | # Airline Activity Analysis on the ODROID XU-4 Mini Cluster
2 | This version of the airline data analysis was designed to be conducted on a resource-constrained computer cluster, such as the [ODROID XU-4 Mini-Cluster](https://diybigdata.net/odroid-xu4-cluster/).
3 |
--------------------------------------------------------------------------------
/airline-data/archive/mini-cluster/airline-data-to-parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark.sql.types import StructType, StructField\n",
12 | "from pyspark.sql.types import DoubleType, IntegerType, StringType, DecimalType, LongType\n",
13 | "\n",
14 | "air_schema = StructType([\n",
15 | " StructField(\"Year\", IntegerType()),\n",
16 | " StructField(\"Quarter\", IntegerType()),\n",
17 | " StructField(\"Month\", IntegerType()),\n",
18 | " StructField(\"DayofMonth\", IntegerType()),\n",
19 | " StructField(\"DayOfWeek\", IntegerType()),\n",
20 | " StructField(\"FlightDate\", StringType()),\n",
21 | " StructField(\"UniqueCarrier\", StringType()),\n",
22 | " StructField(\"AirlineID\", LongType()),\n",
23 | " StructField(\"Carrier\", StringType()),\n",
24 | " StructField(\"TailNum\", StringType()),\n",
25 | " StructField(\"FlightNum\", IntegerType()),\n",
26 | " StructField(\"OriginAirportID\", IntegerType()),\n",
27 | " StructField(\"OriginAirportSeqID\", IntegerType()),\n",
28 | " StructField(\"OriginCityMarketID\", IntegerType()),\n",
29 | " StructField(\"Origin\", StringType()),\n",
30 | " StructField(\"OriginCityName\", StringType()),\n",
31 | " StructField(\"OriginState\", StringType()),\n",
32 | " StructField(\"OriginStateFips\", IntegerType()),\n",
33 | " StructField(\"OriginStateName\", StringType()),\n",
34 | " StructField(\"OriginWac\", IntegerType()),\n",
35 | " StructField(\"DestAirportID\", IntegerType()),\n",
36 | " StructField(\"DestAirportSeqID\", IntegerType()),\n",
37 | " StructField(\"DestCityMarketID\", IntegerType()),\n",
38 | " StructField(\"Dest\", StringType()),\n",
39 | " StructField(\"DestCityName\", StringType()),\n",
40 | " StructField(\"DestState\", StringType()),\n",
41 | " StructField(\"DestStateFips\", IntegerType()),\n",
42 | " StructField(\"DestStateName\", StringType()),\n",
43 | " StructField(\"DestWac\", IntegerType()),\n",
44 | " StructField(\"CRSDepTime\", StringType()),\n",
45 | " StructField(\"DepTime\", StringType()),\n",
46 | " StructField(\"DepDelay\", DoubleType()),\n",
47 | " StructField(\"DepDelayMinutes\", DoubleType()),\n",
48 | " StructField(\"DepDel15\", DoubleType()),\n",
49 | " StructField(\"DepartureDelayGroups\", IntegerType()),\n",
50 | " StructField(\"DepTimeBlk\", StringType()),\n",
51 | " StructField(\"TaxiOut\", DoubleType()),\n",
52 | " StructField(\"WheelsOff\", StringType()),\n",
53 | " StructField(\"WheelsOn\", StringType()),\n",
54 | " StructField(\"TaxiIn\", DoubleType()),\n",
55 | " StructField(\"CRSArrTime\", StringType()),\n",
56 | " StructField(\"ArrTime\", StringType()),\n",
57 | " StructField(\"ArrDelay\", DoubleType()),\n",
58 | " StructField(\"ArrDelayMinutes\", DoubleType()),\n",
59 | " StructField(\"ArrDel15\", DoubleType()),\n",
60 | " StructField(\"ArrivalDelayGroups\", IntegerType()),\n",
61 | " StructField(\"ArrTimeBlk\", StringType()),\n",
62 | " StructField(\"Cancelled\", DoubleType()),\n",
63 | " StructField(\"CancellationCode\", StringType()),\n",
64 | " StructField(\"Diverted\", DoubleType()),\n",
65 | " StructField(\"CRSElapsedTime\", DoubleType()),\n",
66 | " StructField(\"ActualElapsedTime\", DoubleType()),\n",
67 | " StructField(\"AirTime\", DoubleType()),\n",
68 | " StructField(\"Flights\", DoubleType()),\n",
69 | " StructField(\"Distance\", DoubleType()),\n",
70 | " StructField(\"DistanceGroup\", IntegerType()),\n",
71 | " StructField(\"CarrierDelay\", DoubleType()),\n",
72 | " StructField(\"WeatherDelay\", DoubleType()),\n",
73 | " StructField(\"NASDelay\", DoubleType()),\n",
74 | " StructField(\"SecurityDelay\", DoubleType()),\n",
75 | " StructField(\"LateAircraftDelay\", DoubleType()),\n",
76 | " StructField(\"FirstDepTime\", StringType()),\n",
77 | " StructField(\"TotalAddGTime\", StringType()),\n",
78 | " StructField(\"LongestAddGTime\", StringType()),\n",
79 | " StructField(\"DivAirportLandings\", StringType()),\n",
80 | " StructField(\"DivReachedDest\", StringType()),\n",
81 | " StructField(\"DivActualElapsedTime\", StringType()),\n",
82 | " StructField(\"DivArrDelay\", StringType()),\n",
83 | " StructField(\"DivDistance\", StringType()),\n",
84 | " StructField(\"Div1Airport\", StringType()),\n",
85 | " StructField(\"Div1AirportID\", StringType()),\n",
86 | " StructField(\"Div1AirportSeqID\", StringType()),\n",
87 | " StructField(\"Div1WheelsOn\", StringType()),\n",
88 | " StructField(\"Div1TotalGTime\", StringType()),\n",
89 | " StructField(\"Div1LongestGTime\", StringType()),\n",
90 | " StructField(\"Div1WheelsOff\", StringType()),\n",
91 | " StructField(\"Div1TailNum\", StringType()),\n",
92 | " StructField(\"Div2Airport\", StringType()),\n",
93 | " StructField(\"Div2AirportID\", StringType()),\n",
94 | " StructField(\"Div2AirportSeqID\", StringType()),\n",
95 | " StructField(\"Div2WheelsOn\", StringType()),\n",
96 | " StructField(\"Div2TotalGTime\", StringType()),\n",
97 | " StructField(\"Div2LongestGTime\", StringType()),\n",
98 | " StructField(\"Div2WheelsOff\", StringType()),\n",
99 | " StructField(\"Div2TailNum\", StringType()),\n",
100 | " StructField(\"Div3Airport\", StringType()),\n",
101 | " StructField(\"Div3AirportID\", StringType()),\n",
102 | " StructField(\"Div3AirportSeqID\", StringType()),\n",
103 | " StructField(\"Div3WheelsOn\", StringType()),\n",
104 | " StructField(\"Div3TotalGTime\", StringType()),\n",
105 | " StructField(\"Div3LongestGTime\", StringType()),\n",
106 | " StructField(\"Div3WheelsOff\", StringType()),\n",
107 | " StructField(\"Div3TailNum\", StringType()),\n",
108 | " StructField(\"Div4Airport\", StringType()),\n",
109 | " StructField(\"Div4AirportID\", StringType()),\n",
110 | " StructField(\"Div4AirportSeqID\", StringType()),\n",
111 | " StructField(\"Div4WheelsOn\", StringType()),\n",
112 | " StructField(\"Div4TotalGTime\", StringType()),\n",
113 | " StructField(\"Div4LongestGTime\", StringType()),\n",
114 | " StructField(\"Div4WheelsOff\", StringType()),\n",
115 | " StructField(\"Div4TailNum\", StringType()),\n",
116 | " StructField(\"Div5Airport\", StringType()),\n",
117 | " StructField(\"Div5AirportID\", StringType()),\n",
118 | " StructField(\"Div5AirportSeqID\", StringType()),\n",
119 | " StructField(\"Div5WheelsOn\", StringType()),\n",
120 | " StructField(\"Div5TotalGTime\", StringType()),\n",
121 | " StructField(\"Div5LongestGTime\", StringType()),\n",
122 | " StructField(\"Div5WheelsOff\", StringType()),\n",
123 | " StructField(\"Div5TailNum\", StringType())\n",
124 | "])\n",
125 | "\n"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {
132 | "collapsed": false
133 | },
134 | "outputs": [],
135 | "source": [
136 | "import itertools\n",
137 | "year_list = ['2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015']\n",
138 | "month_list = ['1','2','3','4','5','6','7','8','9','10','11','12']\n",
139 | "\n",
140 | "air_df_dict = {}\n",
141 | "\n",
142 | "print('Gathering files ...')\n",
143 | "for (year_str,month_str) in list(itertools.product(year_list,month_list)):\n",
144 | " year_month_str = '%s_%s'%(year_str,month_str)\n",
145 | " print('%s, '%(year_month_str), end=\"\")\n",
146 | " air_df_dict[year_month_str] = spark.read.csv( \n",
147 | " 'qfs://master:20000/user/michael/data/airline/On_Time_On_Time_Performance_%s.csv'%(year_month_str), \n",
148 | " header=True, \n",
149 | " schema=air_schema,\n",
150 | " escape='\"')\n",
151 | "print('Done!')"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "from datetime import datetime\n",
163 | "from pyspark.sql.functions import col, udf, unix_timestamp, to_date\n",
164 | "from pyspark.sql.types import DateType\n",
165 | "from pyspark import StorageLevel\n",
166 | "\n",
167 | "airline_data_parts = []\n",
168 | "\n",
169 | "# Should really coalesce to 1 here, but that strains the ODROID XU4 cluster too\n",
170 | "# much.\n",
171 | "print('Processing ', end=\"\")\n",
172 | "for year_month_str, air_df in air_df_dict.items():\n",
173 | " print('%s, '%(year_month_str), end=\"\")\n",
174 | " airline_data = air_df.select(\n",
175 | " \"Year\",\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\"FlightDate\",\"UniqueCarrier\",\"AirlineID\",\n",
176 | " \"Carrier\",\"TailNum\",\"FlightNum\",\"OriginAirportID\",\"OriginAirportSeqID\",\"OriginCityMarketID\",\n",
177 | " \"Origin\",\"OriginCityName\",\"OriginState\",\"OriginStateFips\",\"OriginStateName\",\"OriginWac\",\n",
178 | " \"DestAirportID\",\"DestAirportSeqID\",\"DestCityMarketID\",\"Dest\",\"DestCityName\",\"DestState\",\n",
179 | " \"DestStateFips\",\"DestStateName\",\"DestWac\",\"CRSDepTime\",\"DepTime\",\"DepDelay\",\"DepDelayMinutes\",\n",
180 | " \"DepDel15\",\"DepartureDelayGroups\",\"DepTimeBlk\",\"TaxiOut\",\"WheelsOff\",\"WheelsOn\",\"TaxiIn\",\"CRSArrTime\",\n",
181 | " \"ArrTime\",\"ArrDelay\",\"ArrDelayMinutes\",\"ArrDel15\",\"ArrivalDelayGroups\",\"ArrTimeBlk\",\"Cancelled\",\n",
182 | " \"CancellationCode\",\"Diverted\",\"CRSElapsedTime\",\"ActualElapsedTime\",\"AirTime\",\"Flights\",\"Distance\",\n",
183 | " \"DistanceGroup\",\"CarrierDelay\",\"WeatherDelay\",\"NASDelay\",\"SecurityDelay\",\"LateAircraftDelay\"\n",
184 | " ).withColumn(\n",
185 | " 'FlightDate', to_date(col('FlightDate'))\n",
186 | " )\n",
187 | " \n",
188 | " airline_data_parts.append(airline_data)\n",
189 | "\n",
190 | "print('Done!')"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {
197 | "collapsed": false
198 | },
199 | "outputs": [],
200 | "source": [
201 | "master_data = airline_data_parts[0]\n",
202 | "\n",
203 | "print('Unionizing data frames 0, ', end=\"\")\n",
204 | "for i in range(1,len(airline_data_parts)):\n",
205 | " print('%d, '%(i), end=\"\")\n",
206 | " master_data = master_data.union(airline_data_parts[i])\n",
207 | "print(\" Done!\")\n",
208 | "print('Starting export to QFS...')\n",
209 | "master_data.write.partitionBy(\n",
210 | " \"Year\",\"Month\"\n",
211 | " ).parquet(\n",
212 | " 'qfs://master:20000/user/michael/data/airline_data',\n",
213 | " mode='overwrite'\n",
214 | " )\n",
215 | "print('Done!')"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "collapsed": false
223 | },
224 | "outputs": [],
225 | "source": [
226 | "master_data.take(1)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "collapsed": false
234 | },
235 | "outputs": [],
236 | "source": [
237 | "from pyspark.sql import Row\n",
238 | "\n",
239 | "def mapAirlineIdRow(r):\n",
240 | " airline_id = int(r.Code)\n",
241 | " airline_name_parts = r.Description.split(':')\n",
242 | " airline_name = airline_name_parts[0].strip()\n",
243 | " iata_carrier = airline_name_parts[1].strip()\n",
244 | " out = Row(\n",
245 | " AirlineID=airline_id,\n",
246 | " AirlineName=airline_name,\n",
247 | " Carrier=iata_carrier\n",
248 | " )\n",
249 | " return out;\n",
250 | "\n",
251 | "airline_id_csv = spark.read.csv(\n",
252 | " 'qfs://master:20000/user/michael/data/airline/airline-id-lookup-table.csv',\n",
253 | " header=True,\n",
254 | " escape='\"'\n",
255 | ")\n",
256 | "\n",
257 | "airline_id_df = airline_id_csv.rdd.map(mapAirlineIdRow).toDF().coalesce(1)\n",
258 | "airline_id_df.write.parquet(\n",
259 | " 'qfs://master:20000/user/michael/data/airline_id_table',\n",
260 | " mode='overwrite'\n",
261 | " )"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": null,
267 | "metadata": {
268 | "collapsed": false
269 | },
270 | "outputs": [],
271 | "source": [
272 | "airline_id_df.take(1)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": null,
278 | "metadata": {
279 | "collapsed": false
280 | },
281 | "outputs": [],
282 | "source": [
283 | "airport_schema = StructType([\n",
284 | " StructField(\"Code\", StringType()),\n",
285 | " StructField(\"Description\", StringType()),\n",
286 | "])\n",
287 | "\n",
288 | "def mapAirportIdRow(r):\n",
289 | " airport_id = r.Code\n",
290 | " airport_city = ''\n",
291 | " airport_name = ''\n",
292 | " airport_name_parts = r.Description.split(':')\n",
293 | " if len(airport_name_parts) == 2:\n",
294 | " airport_city = airport_name_parts[0].strip()\n",
295 | " airport_name = airport_name_parts[1].strip()\n",
296 | " elif len(airport_name_parts) == 1:\n",
297 | " airport_city = airport_name_parts[0]\n",
298 | " airport_name = r.Code\n",
299 | " \n",
300 | " out = Row(\n",
301 | " AirportID=airport_id,\n",
302 | " City=airport_city,\n",
303 | " Name=airport_name\n",
304 | " )\n",
305 | " return out;\n",
306 | "\n",
307 | "airport_id_csv = spark.read.csv(\n",
308 | " 'qfs://master:20000/user/michael/data/airline/airport-information.csv',\n",
309 | " header=True,\n",
310 | " escape='\"',\n",
311 | " schema=airport_schema\n",
312 | ")\n",
313 | "\n",
314 | "airport_id_df = airport_id_csv.rdd.map(mapAirportIdRow).toDF().coalesce(1)\n",
315 | "airport_id_df.write.parquet(\n",
316 | " 'qfs://master:20000/user/michael/data/airport_id_table',\n",
317 | " mode='overwrite'\n",
318 | " )\n",
319 | "\n",
320 | "airport_id_df.take(1)"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "metadata": {
327 | "collapsed": true
328 | },
329 | "outputs": [],
330 | "source": []
331 | }
332 | ],
333 | "metadata": {
334 | "kernelspec": {
335 | "display_name": "Python 3",
336 | "language": "python",
337 | "name": "python3"
338 | },
339 | "language_info": {
340 | "codemirror_mode": {
341 | "name": "ipython",
342 | "version": 3
343 | },
344 | "file_extension": ".py",
345 | "mimetype": "text/x-python",
346 | "name": "python",
347 | "nbconvert_exporter": "python",
348 | "pygments_lexer": "ipython3",
349 | "version": "3.4.3"
350 | }
351 | },
352 | "nbformat": 4,
353 | "nbformat_minor": 0
354 | }
355 |
--------------------------------------------------------------------------------
/airline-data/load-airline-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load Airline Data into Parquet\n",
8 | "This notebook will load the raw CSV data downloaded from the [Bureau of Transportation Statistics's website](https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time) into a parquet file partitioned by year and month. This notebook assumes that the raw files will be in a directory on the QFS file system named `/data/airline/raw/`, and will output the parquet files into a directory named `/data/airline/processed/`."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import pyspark.sql.functions as F\n",
18 | "import pyspark.sql.types as T\n",
19 | "\n",
20 | "spark = SparkSession\\\n",
21 | " .builder\\\n",
22 | " .appName(\"AirlineDataLoad\")\\\n",
23 | " .getOrCreate()"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "air_schema = T.StructType([\n",
33 | " T.StructField(\"Year\", T.IntegerType()),\n",
34 | " T.StructField(\"Quarter\", T.IntegerType()),\n",
35 | " T.StructField(\"Month\", T.IntegerType()),\n",
36 | " T.StructField(\"DayofMonth\", T.IntegerType()),\n",
37 | " T.StructField(\"DayOfWeek\", T.IntegerType()),\n",
38 | " T.StructField(\"FlightDate\", T.StringType()),\n",
39 | " T.StructField(\"UniqueCarrier\", T.StringType()),\n",
40 | " T.StructField(\"AirlineID\", T.LongType()),\n",
41 | " T.StructField(\"Carrier\", T.StringType()),\n",
42 | " T.StructField(\"TailNum\", T.StringType()),\n",
43 | " T.StructField(\"FlightNum\", T.IntegerType()),\n",
44 | " T.StructField(\"OriginAirportID\", T.IntegerType()),\n",
45 | " T.StructField(\"OriginAirportSeqID\", T.IntegerType()),\n",
46 | " T.StructField(\"OriginCityMarketID\", T.IntegerType()),\n",
47 | " T.StructField(\"Origin\", T.StringType()),\n",
48 | " T.StructField(\"OriginCityName\", T.StringType()),\n",
49 | " T.StructField(\"OriginState\", T.StringType()),\n",
50 | " T.StructField(\"OriginStateFips\", T.IntegerType()),\n",
51 | " T.StructField(\"OriginStateName\", T.StringType()),\n",
52 | " T.StructField(\"OriginWac\", T.IntegerType()),\n",
53 | " T.StructField(\"DestAirportID\", T.IntegerType()),\n",
54 | " T.StructField(\"DestAirportSeqID\", T.IntegerType()),\n",
55 | " T.StructField(\"DestCityMarketID\", T.IntegerType()),\n",
56 | " T.StructField(\"Dest\", T.StringType()),\n",
57 | " T.StructField(\"DestCityName\", T.StringType()),\n",
58 | " T.StructField(\"DestState\", T.StringType()),\n",
59 | " T.StructField(\"DestStateFips\", T.IntegerType()),\n",
60 | " T.StructField(\"DestStateName\", T.StringType()),\n",
61 | " T.StructField(\"DestWac\", T.IntegerType()),\n",
62 | " T.StructField(\"CRSDepTime\", T.StringType()),\n",
63 | " T.StructField(\"DepTime\", T.StringType()),\n",
64 | " T.StructField(\"DepDelay\", T.DoubleType()),\n",
65 | " T.StructField(\"DepDelayMinutes\", T.DoubleType()),\n",
66 | " T.StructField(\"DepDel15\", T.DoubleType()),\n",
67 | " T.StructField(\"DepartureDelayGroups\", T.IntegerType()),\n",
68 | " T.StructField(\"DepTimeBlk\", T.StringType()),\n",
69 | " T.StructField(\"TaxiOut\", T.DoubleType()),\n",
70 | " T.StructField(\"WheelsOff\", T.StringType()),\n",
71 | " T.StructField(\"WheelsOn\", T.StringType()),\n",
72 | " T.StructField(\"TaxiIn\", T.DoubleType()),\n",
73 | " T.StructField(\"CRSArrTime\", T.StringType()),\n",
74 | " T.StructField(\"ArrTime\", T.StringType()),\n",
75 | " T.StructField(\"ArrDelay\", T.DoubleType()),\n",
76 | " T.StructField(\"ArrDelayMinutes\", T.DoubleType()),\n",
77 | " T.StructField(\"ArrDel15\", T.DoubleType()),\n",
78 | " T.StructField(\"ArrivalDelayGroups\", T.IntegerType()),\n",
79 | " T.StructField(\"ArrTimeBlk\", T.StringType()),\n",
80 | " T.StructField(\"Cancelled\", T.DoubleType()),\n",
81 | " T.StructField(\"CancellationCode\", T.StringType()),\n",
82 | " T.StructField(\"Diverted\", T.DoubleType()),\n",
83 | " T.StructField(\"CRSElapsedTime\", T.DoubleType()),\n",
84 | " T.StructField(\"ActualElapsedTime\", T.DoubleType()),\n",
85 | " T.StructField(\"AirTime\", T.DoubleType()),\n",
86 | " T.StructField(\"Flights\", T.DoubleType()),\n",
87 | " T.StructField(\"Distance\", T.DoubleType()),\n",
88 | " T.StructField(\"DistanceGroup\", T.IntegerType()),\n",
89 | " T.StructField(\"CarrierDelay\", T.DoubleType()),\n",
90 | " T.StructField(\"WeatherDelay\", T.DoubleType()),\n",
91 | " T.StructField(\"NASDelay\", T.DoubleType()),\n",
92 | " T.StructField(\"SecurityDelay\", T.DoubleType()),\n",
93 | " T.StructField(\"LateAircraftDelay\", T.DoubleType()),\n",
94 | " T.StructField(\"FirstDepTime\", T.StringType()),\n",
95 | " T.StructField(\"TotalAddGTime\", T.StringType()),\n",
96 | " T.StructField(\"LongestAddGTime\", T.StringType()),\n",
97 | " T.StructField(\"DivAirportLandings\", T.StringType()),\n",
98 | " T.StructField(\"DivReachedDest\", T.StringType()),\n",
99 | " T.StructField(\"DivActualElapsedTime\", T.StringType()),\n",
100 | " T.StructField(\"DivArrDelay\", T.StringType()),\n",
101 | " T.StructField(\"DivDistance\", T.StringType()),\n",
102 | " T.StructField(\"Div1Airport\", T.StringType()),\n",
103 | " T.StructField(\"Div1AirportID\", T.StringType()),\n",
104 | " T.StructField(\"Div1AirportSeqID\", T.StringType()),\n",
105 | " T.StructField(\"Div1WheelsOn\", T.StringType()),\n",
106 | " T.StructField(\"Div1TotalGTime\", T.StringType()),\n",
107 | " T.StructField(\"Div1LongestGTime\", T.StringType()),\n",
108 | " T.StructField(\"Div1WheelsOff\", T.StringType()),\n",
109 | " T.StructField(\"Div1TailNum\", T.StringType()),\n",
110 | " T.StructField(\"Div2Airport\", T.StringType()),\n",
111 | " T.StructField(\"Div2AirportID\", T.StringType()),\n",
112 | " T.StructField(\"Div2AirportSeqID\", T.StringType()),\n",
113 | " T.StructField(\"Div2WheelsOn\", T.StringType()),\n",
114 | " T.StructField(\"Div2TotalGTime\", T.StringType()),\n",
115 | " T.StructField(\"Div2LongestGTime\", T.StringType()),\n",
116 | " T.StructField(\"Div2WheelsOff\", T.StringType()),\n",
117 | " T.StructField(\"Div2TailNum\", T.StringType()),\n",
118 | " T.StructField(\"Div3Airport\", T.StringType()),\n",
119 | " T.StructField(\"Div3AirportID\", T.StringType()),\n",
120 | " T.StructField(\"Div3AirportSeqID\", T.StringType()),\n",
121 | " T.StructField(\"Div3WheelsOn\", T.StringType()),\n",
122 | " T.StructField(\"Div3TotalGTime\", T.StringType()),\n",
123 | " T.StructField(\"Div3LongestGTime\", T.StringType()),\n",
124 | " T.StructField(\"Div3WheelsOff\", T.StringType()),\n",
125 | " T.StructField(\"Div3TailNum\", T.StringType()),\n",
126 | " T.StructField(\"Div4Airport\", T.StringType()),\n",
127 | " T.StructField(\"Div4AirportID\", T.StringType()),\n",
128 | " T.StructField(\"Div4AirportSeqID\", T.StringType()),\n",
129 | " T.StructField(\"Div4WheelsOn\", T.StringType()),\n",
130 | " T.StructField(\"Div4TotalGTime\", T.StringType()),\n",
131 | " T.StructField(\"Div4LongestGTime\", T.StringType()),\n",
132 | " T.StructField(\"Div4WheelsOff\", T.StringType()),\n",
133 | " T.StructField(\"Div4TailNum\", T.StringType()),\n",
134 | " T.StructField(\"Div5Airport\", T.StringType()),\n",
135 | " T.StructField(\"Div5AirportID\", T.StringType()),\n",
136 | " T.StructField(\"Div5AirportSeqID\", T.StringType()),\n",
137 | " T.StructField(\"Div5WheelsOn\", T.StringType()),\n",
138 | " T.StructField(\"Div5TotalGTime\", T.StringType()),\n",
139 | " T.StructField(\"Div5LongestGTime\", T.StringType()),\n",
140 | " T.StructField(\"Div5WheelsOff\", T.StringType()),\n",
141 | " T.StructField(\"Div5TailNum\", T.StringType())\n",
142 | "])"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "raw_df = spark.read.csv( \n",
152 | " 'qfs:///data/airline/raw/On_Time_On_Time_Performance_*.csv', \n",
153 | " header=True, \n",
154 | " schema=air_schema,\n",
155 | " escape='\"')\n",
156 | "\n",
157 | "airline_data = raw_df.select(\n",
158 | " \"Year\",\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\"FlightDate\",\"UniqueCarrier\",\"AirlineID\",\n",
159 | " \"Carrier\",\"TailNum\",\"FlightNum\",\"OriginAirportID\",\"OriginAirportSeqID\",\"OriginCityMarketID\",\n",
160 | " \"Origin\",\"OriginCityName\",\"OriginState\",\"OriginStateFips\",\"OriginStateName\",\"OriginWac\",\n",
161 | " \"DestAirportID\",\"DestAirportSeqID\",\"DestCityMarketID\",\"Dest\",\"DestCityName\",\"DestState\",\n",
162 | " \"DestStateFips\",\"DestStateName\",\"DestWac\",\"CRSDepTime\",\"DepTime\",\"DepDelay\",\"DepDelayMinutes\",\n",
163 | " \"DepDel15\",\"DepartureDelayGroups\",\"DepTimeBlk\",\"TaxiOut\",\"WheelsOff\",\"WheelsOn\",\"TaxiIn\",\"CRSArrTime\",\n",
164 | " \"ArrTime\",\"ArrDelay\",\"ArrDelayMinutes\",\"ArrDel15\",\"ArrivalDelayGroups\",\"ArrTimeBlk\",\"Cancelled\",\n",
165 | " \"CancellationCode\",\"Diverted\",\"CRSElapsedTime\",\"ActualElapsedTime\",\"AirTime\",\"Flights\",\"Distance\",\n",
166 | " \"DistanceGroup\",\"CarrierDelay\",\"WeatherDelay\",\"NASDelay\",\"SecurityDelay\",\"LateAircraftDelay\"\n",
167 | " ).withColumn(\n",
168 | " 'FlightDate', F.to_date(F.col('FlightDate'),'yyyy-MM-dd')\n",
169 | " )\n",
170 | "\n",
171 | "airline_data.repartition('Year').write.partitionBy(\n",
172 | " \"Year\",\"Month\"\n",
173 | " ).parquet(\n",
174 | " 'qfs:///data/airline/processed/airline_data',\n",
175 | " mode='overwrite'\n",
176 | " )"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "from pyspark.sql import Row\n",
186 | "\n",
187 | "def mapAirlineIdRow(r):\n",
188 | " airline_id = int(r.Code)\n",
189 | " airline_name_parts = r.Description.split(':')\n",
190 | " airline_name = airline_name_parts[0].strip()\n",
191 | " iata_carrier = airline_name_parts[1].strip()\n",
192 | " out = Row(\n",
193 | " AirlineID=airline_id,\n",
194 | " AirlineName=airline_name,\n",
195 | " Carrier=iata_carrier\n",
196 | " )\n",
197 | " return out;\n",
198 | "\n",
199 | "airline_id_csv = spark.read.csv(\n",
200 | " 'qfs:///data/airline/raw/LUT-DOT_airline_IDs.csv',\n",
201 | " header=True,\n",
202 | " escape='\"'\n",
203 | ")\n",
204 | "\n",
205 | "airline_id_df = airline_id_csv.rdd.map(mapAirlineIdRow).toDF().coalesce(1)\n",
206 | "airline_id_df.write.parquet(\n",
207 | " 'qfs:///data/airline/processed/DOT_airline_codes_table',\n",
208 | " mode='overwrite'\n",
209 | " )\n",
210 | " \n",
211 | "airline_id_df.take(1)\n",
212 | "\n",
213 | "airport_schema = T.StructType([\n",
214 | " T.StructField(\"Code\", T.StringType()),\n",
215 | " T.StructField(\"Description\", T.StringType()),\n",
216 | "])\n",
217 | "\n",
218 | "def mapAirportIdRow(r):\n",
219 | " airport_id = r.Code\n",
220 | " airport_city = ''\n",
221 | " airport_name = ''\n",
222 | " airport_name_parts = r.Description.split(':')\n",
223 | " if len(airport_name_parts) == 2:\n",
224 | " airport_city = airport_name_parts[0].strip()\n",
225 | " airport_name = airport_name_parts[1].strip()\n",
226 | " elif len(airport_name_parts) == 1:\n",
227 | " airport_city = airport_name_parts[0]\n",
228 | " airport_name = r.Code\n",
229 | " \n",
230 | " out = Row(\n",
231 | " \n",
232 | " AirportID=airport_id,\n",
233 | " City=airport_city,\n",
234 | " Name=airport_name\n",
235 | " )\n",
236 | " return out;\n",
237 | "\n",
238 | "airport_codes_csv = spark.read.csv(\n",
239 | " 'qfs:///data/airline/raw/LUT-airport_codes.csv',\n",
240 | " header=True,\n",
241 | " escape='\"',\n",
242 | " schema=airport_schema\n",
243 | ")\n",
244 | "\n",
245 | "airport_codes_df = airport_codes_csv.rdd.map(mapAirportIdRow).toDF().coalesce(1)\n",
246 | "airport_codes_df.write.parquet(\n",
247 | " 'qfs:///data/airline/processed/airport_codes_table',\n",
248 | " mode='overwrite'\n",
249 | " )\n",
250 | "\n",
251 | "airport_id_csv = spark.read.csv(\n",
252 | " 'qfs:///data/airline/raw/LUT-DOT_airport_IDs.csv',\n",
253 | " header=True,\n",
254 | " escape='\"',\n",
255 | " schema=airport_schema\n",
256 | ")\n",
257 | "\n",
258 | "airport_id_df = (\n",
259 | " airport_id_csv\n",
260 | " .rdd.map(mapAirportIdRow)\n",
261 | " .toDF()\n",
262 | " .withColumn(\n",
263 | " 'AirportID',\n",
264 | " F.col('AirportID').cast(T.IntegerType())\n",
265 | " )\n",
266 | " .coalesce(1)\n",
267 | ")\n",
268 | "airport_id_df.write.parquet(\n",
269 | " 'qfs:///data/airline/processed/airport_id_table',\n",
270 | " mode='overwrite'\n",
271 | " )"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 3",
285 | "language": "python",
286 | "name": "python3"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 3
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython3",
298 | "version": "3.5.3"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 2
303 | }
304 |
--------------------------------------------------------------------------------
/apache-access-logs/access-log-to-parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# Parse an Apache access log. Assumes Python 3\n",
12 | "import re\n",
13 | "from pyspark.sql import Row\n",
14 | "from datetime import datetime\n",
15 | "\n",
16 | "APACHE_ACCESS_LOG_PATTERN = '^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] \"(\S+) (\S+) (\S+)\" (\d{3}) (\d+) \"((?:[^\"]|\")+)\" \"((?:[^\"]|\")+)\"$'\n",
17 | "DATETIME_PARSE_PATTERN = '%d/%b/%Y:%H:%M:%S %z'\n",
18 | "\n",
19 | "# Returns a Row containing the Apache Access Log info\n",
20 | "def parse_apache_log_line(logline):\n",
21 | " match = re.search(APACHE_ACCESS_LOG_PATTERN, logline)\n",
22 | " if match is None:\n",
23 | " return None\n",
24 | " date_obj = datetime.strptime(match.group(4),DATETIME_PARSE_PATTERN)\n",
25 | " return Row(\n",
26 | " ipAddress = match.group(1),\n",
27 | " clientIdentd = match.group(2),\n",
28 | " userId = match.group(3),\n",
29 | " dateTime = match.group(4),\n",
30 | " timestamp = date_obj.timestamp(),\n",
31 | " month = date_obj.strftime('%Y-%m'),\n",
32 | " method = match.group(5),\n",
33 | " endpoint = match.group(6),\n",
34 | " protocol = match.group(7),\n",
35 | " referrer = match.group(10),\n",
36 | " userAgent = match.group(11),\n",
37 | " responseCode = int(match.group(8)),\n",
38 | " contentSize = int(match.group(9)))"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 2,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "access_logs_raw = sc.textFile(\"hdfs://master:9000/user/michael/data/diybigdata.20160808.log\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {
56 | "collapsed": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "access_logs = access_logs_raw.map(parse_apache_log_line).filter(lambda x: x is not None)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 4,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [],
70 | "source": [
71 | "access_logs_df = access_logs.toDF()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 5,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "root\n",
86 | " |-- clientIdentd: string (nullable = true)\n",
87 | " |-- contentSize: long (nullable = true)\n",
88 | " |-- dateTime: string (nullable = true)\n",
89 | " |-- endpoint: string (nullable = true)\n",
90 | " |-- ipAddress: string (nullable = true)\n",
91 | " |-- method: string (nullable = true)\n",
92 | " |-- month: string (nullable = true)\n",
93 | " |-- protocol: string (nullable = true)\n",
94 | " |-- referrer: string (nullable = true)\n",
95 | " |-- responseCode: long (nullable = true)\n",
96 | " |-- timestamp: double (nullable = true)\n",
97 | " |-- userAgent: string (nullable = true)\n",
98 | " |-- userId: string (nullable = true)\n",
99 | "\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "access_logs_df.printSchema()"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 6,
110 | "metadata": {
111 | "collapsed": false
112 | },
113 | "outputs": [],
114 | "source": [
115 | "access_logs_df.write.partitionBy(\n",
116 | " \"month\"\n",
117 | " ).parquet(\n",
118 | " \"hdfs://master:9000/user/michael/data/diybigdata.20160808.parquet\",\n",
119 | " mode='overwrite'\n",
120 | " )"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": null,
126 | "metadata": {
127 | "collapsed": true
128 | },
129 | "outputs": [],
130 | "source": []
131 | }
132 | ],
133 | "metadata": {
134 | "kernelspec": {
135 | "display_name": "Python 3",
136 | "language": "python",
137 | "name": "python3"
138 | },
139 | "language_info": {
140 | "codemirror_mode": {
141 | "name": "ipython",
142 | "version": 3
143 | },
144 | "file_extension": ".py",
145 | "mimetype": "text/x-python",
146 | "name": "python",
147 | "nbconvert_exporter": "python",
148 | "pygments_lexer": "ipython3",
149 | "version": "3.4.3"
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 0
154 | }
155 |
--------------------------------------------------------------------------------
/reddit-data/README.md:
--------------------------------------------------------------------------------
1 | # Loading Pushshift.io Data
2 | In order to fetch data from [pushshift.io](https://files.pushshift.io/reddit/), use the shell script [download_and_convert_to_bz2.sh](./download_and_convert_to_bz2.sh) to download the comments and/or submissions files and convert them to the `bz2` compression format, which works better with Spark. The Jupyter notebooks in this directory that load the pushshift.io downloads to parquet expect the files to be `bz2` compressed.
3 |
4 | # Reddit Comment Data Analysis
5 | The analyses in this directory pertain to the Reddit Comments Data that can be [downloaded here](http://academictorrents.com/details/85a5bd50e4c365f8df70240ffd4ecc7dec59912b). When analyzing this data, the first step is to load it into the parquet file format. All analyses in this directory expect that the data has been loaded into the parquet file format as implemented in the `load-reddit-*-to-parquet` notebooks.
6 |
7 | The analyses performed on this data set are:
8 | * *Identification of Bot Commenters* ([`reddit-bot-commenters-bensons-law.ipynb`](./reddit-bot-commenters-bensons-law.ipynb)) - This analysis uses Benford's Law to identify the commenters on Reddit that are most likely to be bots based on their commenting patterns. A sketch of the general approach follows this list.
9 |
10 |
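11 | The general shape of a Benford's Law screen can be sketched as follows. This is a simplified, hypothetical PySpark snippet, not the exact code from the notebook: it assumes an existing `spark` session and a `comments_df` DataFrame with `author` and `score` columns, and it uses comment scores purely as an illustrative quantity to compare against the Benford expectation P(d) = log10(1 + 1/d).
12 |
13 | ```python
14 | import math
15 | import pyspark.sql.functions as F
16 | from pyspark.sql.window import Window
17 |
18 | # Benford's Law expected frequencies for leading digits 1..9
19 | benford_df = spark.createDataFrame(
20 |     [(d, math.log10(1.0 + 1.0 / d)) for d in range(1, 10)],
21 |     ['lead_digit', 'expected_freq']
22 | )
23 |
24 | # Leading digit of each positive comment score
25 | digits = (
26 |     comments_df
27 |     .filter(F.col('score') > 0)
28 |     .withColumn('lead_digit', F.substring(F.col('score').cast('string'), 1, 1).cast('int'))
29 | )
30 |
31 | # Observed per-author leading-digit frequencies
32 | author_window = Window.partitionBy('author')
33 | observed = (
34 |     digits
35 |     .groupBy('author', 'lead_digit')
36 |     .agg(F.count('*').alias('digit_count'))
37 |     .withColumn('observed_freq', F.col('digit_count') / F.sum('digit_count').over(author_window))
38 | )
39 |
40 | # Rank authors by total absolute deviation from the Benford expectation;
41 | # unusually large deviations suggest non-organic, bot-like activity
42 | deviation = (
43 |     observed
44 |     .join(benford_df, 'lead_digit')
45 |     .groupBy('author')
46 |     .agg(F.sum(F.abs(F.col('observed_freq') - F.col('expected_freq'))).alias('benford_deviation'))
47 |     .orderBy(F.col('benford_deviation').desc())
48 | )
49 | ```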
--------------------------------------------------------------------------------
/reddit-data/download_and_convert_to_bz2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # there should be two arguments.
5 | # $1 - the directory finalized files should be placed in
6 | # $2 - the file containing the URLs to download, one per line.
7 | #
8 | # The following tools should be installed on your system:
9 | # - lbzip2
10 | # - xz-utils
11 | # - zstd
12 | #
13 |
14 | NUM_COMPRESSION_THREADS=12
15 |
16 | if [ $# -ne 2 ]; then
17 | echo "usage: download_and_convert_to_bz2.sh /path/to/destination/directory /path/to/url_list.txt"
18 | exit 1
19 | fi
20 |
21 | # manage arguments
22 | destination_dir=${1%/}
23 | readarray url_list < $2
24 |
25 | # the main loop
26 | echo "Fetching URLs listed in ${2}"
27 | for url in ${url_list[@]}; do
28 | echo "Processing URL = ${url}"
29 | download_file_name="${url##*/}"
30 | download_file_extension="${download_file_name##*.}"
31 | uncompressed_file_name="${download_file_name%.*}"
32 | final_file_name=${download_file_name}
33 |
34 | # download the files
35 | wget $url
36 |
37 | # if the file extension of the download is not bz2, decompress and recompress as bz2
38 | if [ "$download_file_extension" != "bz2" ]; then
39 | if [ "$download_file_extension" == "zst" ]; then
40 | zstd -v -d --memory=2048MB $download_file_name
41 | elif [ "$download_file_extension" == "xz" ]; then
42 | xz -v -k -T $NUM_COMPRESSION_THREADS -d $download_file_name
43 | else
44 | echo "Unrecognized file type for ${url}"
45 | exit 1
46 | fi
47 | lbzip2 -v -n $((NUM_COMPRESSION_THREADS)) $uncompressed_file_name
48 | rm $download_file_name
49 | final_file_name="${uncompressed_file_name}.bz2"
50 | fi
51 | mv -v -f $final_file_name $destination_dir
52 | echo "Finalized ${final_file_name}"
53 | echo ""
54 | done
55 |
56 | echo "Finished processing $2"
57 | exit 0
58 |
--------------------------------------------------------------------------------
/reddit-data/load-reddit-comments-to-parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Load Reddit Comments Data into Parquet \n",
8 | "This notebook loads the raw [Reddit comments dataset](http://academictorrents.com/details/85a5bd50e4c365f8df70240ffd4ecc7dec59912b) into the parquet file format. It augments the data with several derived time columns, and partitions the data by year/month/day. The file paths in this notebook should be modified for your system."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {
15 | "ExecuteTime": {
16 | "end_time": "2022-09-15T08:34:31.416045Z",
17 | "start_time": "2022-09-15T08:34:30.146765Z"
18 | }
19 | },
20 | "outputs": [],
21 | "source": [
22 | "import pyspark.sql.functions as F\n",
23 | "import pyspark.sql.types as T\n",
24 | "import pyspark.sql.utils as U\n",
25 | "from pyspark.sql.window import Window as W\n",
26 | "\n",
27 | "import pandas as pd\n",
28 | "\n",
29 | "pd.set_option('display.max_colwidth', None)\n",
30 | "\n",
31 | "spark = SparkSession\\\n",
32 | " .builder\\\n",
33 | " .appName(\"RedditCommentsLoadToParquet\")\\\n",
34 | " .getOrCreate()"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {
41 | "ExecuteTime": {
42 | "end_time": "2022-09-15T08:34:31.427088Z",
43 | "start_time": "2022-09-15T08:34:31.420675Z"
44 | }
45 | },
46 | "outputs": [],
47 | "source": [
48 | "reddit_comments_schema = T.StructType([\n",
49 | " T.StructField(\"id\", T.StringType()),\n",
50 | " T.StructField(\"parent_id\", T.StringType()),\n",
51 | " T.StructField(\"author\", T.StringType()),\n",
52 | " T.StructField(\"link_id\", T.StringType()),\n",
53 | " T.StructField(\"subreddit\", T.StringType()),\n",
54 | " T.StructField(\"subreddit_id\", T.StringType()),\n",
55 | " T.StructField(\"edited\", T.BooleanType()),\n",
56 | " T.StructField(\"score\", T.LongType()),\n",
57 | " T.StructField(\"body\", T.StringType()),\n",
58 | " T.StructField(\"created_utc\", T.LongType()),\n",
59 | " T.StructField(\"retrieved_utc\", T.LongType()),\n",
60 | " T.StructField(\"retrieved_on\", T.LongType()),\n",
61 | "])"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "ExecuteTime": {
69 | "end_time": "2022-09-15T08:47:13.192762Z",
70 | "start_time": "2022-09-15T08:34:32.598344Z"
71 | },
72 | "code_folding": []
73 | },
74 | "outputs": [],
75 | "source": [
76 | "import gc\n",
77 | "\n",
78 | "spark.conf.set(\"spark.sql.session.timeZone\", \"UTC\")\n",
79 | "\n",
80 | "def has_column(df, col_name):\n",
81 | " if col_name in df.columns:\n",
82 | " return F.lit(True)\n",
83 | " else:\n",
84 | " return F.lit(False)\n",
85 | "\n",
86 | "load_months = [\n",
87 | "# (2021, 7),\n",
88 | "# (2021, 8),\n",
89 | "# (2021, 9),\n",
90 | "# (2021, 10),\n",
91 | "# (2021, 11),\n",
92 | "# (2021, 12),\n",
93 | "# (2022, 1),\n",
94 | "# (2022, 2),\n",
95 | "# (2022, 3),\n",
96 | "# (2022, 4),\n",
97 | " (2022, 8),\n",
98 | "]\n",
99 | "\n",
100 | "for year, month in load_months:\n",
101 | " file_path = 'qfs:///data/reddit/comments/raw/RC_{0}-{1:02d}*.bz2'.format(year, month)\n",
102 | " print('loading data for year-month {0}-{1:02d} at file path {2}'.format(year, month, file_path))\n",
103 | " reddit_df = (\n",
104 | " spark.read.json(\n",
105 | " file_path,\n",
106 | " schema=reddit_comments_schema,\n",
107 | " )\n",
108 | " .withColumn(\n",
109 | " 'retrieved_on',\n",
110 | " F.when(\n",
111 | " F.col('retrieved_utc').isNotNull(),\n",
112 | " F.col('retrieved_utc')\n",
113 | " ).otherwise(\n",
114 | " F.col('retrieved_on')\n",
115 | " )\n",
116 | " )\n",
117 | " ) \n",
118 | "\n",
119 | " reddit_finalized = (\n",
120 | " reddit_df\n",
121 | " .select(\n",
122 | " 'author',\n",
123 | " 'link_id',\n",
124 | " 'retrieved_on',\n",
125 | " 'subreddit',\n",
126 | " 'subreddit_id',\n",
127 | " 'id',\n",
128 | " 'parent_id',\n",
129 | " 'edited',\n",
130 | " 'score',\n",
131 | " 'body',\n",
132 | " 'created_utc',\n",
133 | " F.from_unixtime('created_utc', 'yyyy-MM-dd').alias('created_date'),\n",
134 | " F.from_unixtime('created_utc', 'dd').alias('day')\n",
135 | " )\n",
136 | " .repartition('day')\n",
137 | " ).cache()\n",
138 | " print(' There are {0} total rows in month data set.'.format(reddit_finalized.count()))\n",
139 | "\n",
140 | " out_path = 'qfs:///data/reddit/comments/processed/year={0}/month={1:02d}'.format(year, month)\n",
141 | " print(' writing to: {0}'.format(out_path))\n",
142 | " reddit_finalized.write.partitionBy(\n",
143 | " 'day'\n",
144 | " ).parquet(\n",
145 | " out_path,\n",
146 | " mode='overwrite'\n",
147 | " )\n",
148 | " print('\\n')\n",
149 | " reddit_finalized.unpersist()\n",
150 | " del reddit_finalized\n",
151 | " del reddit_df\n",
152 | " gc.collect()\n",
153 | " "
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "ExecuteTime": {
161 | "end_time": "2022-09-15T08:47:43.932518Z",
162 | "start_time": "2022-09-15T08:47:13.195656Z"
163 | }
164 | },
165 | "outputs": [],
166 | "source": [
167 | "reddit_processed = spark.read.parquet('qfs:///data/reddit/comments/processed/')\n",
168 | "reddit_processed.printSchema()"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {
175 | "ExecuteTime": {
176 | "end_time": "2022-09-15T08:54:46.366672Z",
177 | "start_time": "2022-09-15T08:47:43.935354Z"
178 | }
179 | },
180 | "outputs": [],
181 | "source": [
182 | "(\n",
183 | " reddit_processed\n",
184 | " .groupBy('year')\n",
185 | " .agg(\n",
186 | " F.count('*').alias('count'),\n",
187 | " F.countDistinct('author').alias('authors')\n",
188 | " )\n",
189 | " .orderBy('year')\n",
190 | ").toPandas()"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": null,
196 | "metadata": {
197 | "ExecuteTime": {
198 | "end_time": "2022-08-12T16:12:27.139557Z",
199 | "start_time": "2022-08-12T16:06:08.767951Z"
200 | }
201 | },
202 | "outputs": [],
203 | "source": [
204 | "(\n",
205 | " reddit_processed\n",
206 | " .groupBy('year')\n",
207 | " .agg(\n",
208 | " F.count('*').alias('count'),\n",
209 | " F.countDistinct('author').alias('authors')\n",
210 | " )\n",
211 | " .orderBy('year')\n",
212 | ").toPandas()"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "ExecuteTime": {
220 | "end_time": "2022-09-15T14:39:31.024042Z",
221 | "start_time": "2022-09-15T14:39:16.984361Z"
222 | }
223 | },
224 | "outputs": [],
225 | "source": [
226 | "(\n",
227 | " reddit_processed\n",
228 | " .filter(\n",
229 | " (F.col('year') == 2022)\n",
230 | " &(F.col('month') == 8)\n",
231 | " )\n",
232 | " .groupBy('year','month','day')\n",
233 | " .agg(\n",
234 | " F.count('*').alias('count'),\n",
235 | " F.countDistinct('author').alias('authors')\n",
236 | " )\n",
237 | " .orderBy('year','month','day')\n",
238 | ").toPandas()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": []
247 | }
248 | ],
249 | "metadata": {
250 | "kernelspec": {
251 | "display_name": "Python 3 (ipykernel)",
252 | "language": "python",
253 | "name": "python3"
254 | },
255 | "language_info": {
256 | "codemirror_mode": {
257 | "name": "ipython",
258 | "version": 3
259 | },
260 | "file_extension": ".py",
261 | "mimetype": "text/x-python",
262 | "name": "python",
263 | "nbconvert_exporter": "python",
264 | "pygments_lexer": "ipython3",
265 | "version": "3.10.7"
266 | },
267 | "toc": {
268 | "base_numbering": 1,
269 | "nav_menu": {
270 | "height": "217px",
271 | "width": "201px"
272 | },
273 | "number_sections": true,
274 | "sideBar": true,
275 | "skip_h1_title": false,
276 | "title_cell": "Table of Contents",
277 | "title_sidebar": "Contents",
278 | "toc_cell": false,
279 | "toc_position": {},
280 | "toc_section_display": true,
281 | "toc_window_display": false
282 | }
283 | },
284 | "nbformat": 4,
285 | "nbformat_minor": 2
286 | }
287 |
--------------------------------------------------------------------------------
/reddit-data/load-reddit-posts-to-parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load Reddit Submissions into Parquet\n",
8 | "\n",
9 | "The raw data was pulled from [pushshift.io](https://files.pushshift.io/reddit/submissions/)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {
16 | "ExecuteTime": {
17 | "end_time": "2022-09-15T14:39:59.203555Z",
18 | "start_time": "2022-09-15T14:39:57.940891Z"
19 | }
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import pyspark.sql.functions as F\n",
24 | "import pyspark.sql.types as T\n",
25 | "import pyspark.sql.utils as U\n",
26 | "from pyspark.sql.window import Window as W\n",
27 | "\n",
28 | "import pandas as pd\n",
29 | "\n",
30 | "pd.set_option('display.max_colwidth', None)\n",
31 | "\n",
32 | "spark = SparkSession\\\n",
33 | " .builder\\\n",
34 | " .appName(\"RedditPostsLoadToParquet\")\\\n",
35 | " .getOrCreate()"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "ExecuteTime": {
43 | "end_time": "2022-09-15T15:03:57.158114Z",
44 | "start_time": "2022-09-15T14:40:07.942034Z"
45 | }
46 | },
47 | "outputs": [],
48 | "source": [
49 | "year_range = range(2022,2023)\n",
50 | "\n",
51 | "json_schema = T.StructType([\n",
52 | " T.StructField(\"author\", T.StringType()),\n",
53 | " T.StructField(\"created_utc\", T.LongType()),\n",
54 | " T.StructField(\"domain\", T.StringType()),\n",
55 | " T.StructField(\"edited\", T.BooleanType()),\n",
56 | " T.StructField(\"id\", T.StringType()),\n",
57 | " T.StructField(\"is_crosspostable\", T.BooleanType()),\n",
58 | " T.StructField(\"is_self\", T.BooleanType()),\n",
59 | " T.StructField(\"is_video\", T.BooleanType()),\n",
60 | " T.StructField(\"num_comments\", T.LongType()),\n",
61 | " T.StructField(\"num_crossposts\", T.LongType()),\n",
62 | " T.StructField(\"over_18\", T.BooleanType()),\n",
63 | " T.StructField(\"permalink\", T.StringType()),\n",
64 | " T.StructField(\"promoted\", T.BooleanType()),\n",
65 | " T.StructField(\"score\", T.LongType()),\n",
66 | " T.StructField(\"selftext\", T.StringType()),\n",
67 | " T.StructField(\"spam\", T.BooleanType()),\n",
68 | " T.StructField(\"stickied\", T.BooleanType()),\n",
69 | " T.StructField(\"subreddit\", T.StringType()),\n",
70 | " T.StructField(\"subreddit_id\", T.StringType()),\n",
71 | " T.StructField(\"thumbnail\", T.StringType()),\n",
72 | " T.StructField(\"title\", T.StringType()),\n",
73 | " T.StructField(\"ups\", T.StringType()),\n",
74 | " T.StructField(\"url\", T.StringType()), \n",
75 | "])\n",
76 | "\n",
77 | "def has_column(df, col_name):\n",
78 | " if col_name in df.columns:\n",
79 | " return F.lit(True)\n",
80 | " else:\n",
81 | " return F.lit(False)\n",
82 | "\n",
83 | "for year in year_range:\n",
84 | " print('Processing submissions date for year {0}'.format(year))\n",
85 | " file_pattern = 'qfs:///data/reddit/submissions/raw/RS_*{0}-*.bz2'.format(year)\n",
86 | " submissions_raw = (\n",
87 | " spark.read.json(\n",
88 | " file_pattern,\n",
89 | " encoding='utf-8',\n",
90 | " schema=json_schema,\n",
91 | " )\n",
92 | " )\n",
93 | " df = (\n",
94 | " submissions_raw\n",
95 | " .withColumn(\n",
96 | " 'created_date',\n",
97 | " F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')\n",
98 | " )\n",
99 | " .withColumn(\n",
100 | " 'month',\n",
101 | " F.from_unixtime(F.col('created_utc'), 'MM')\n",
102 | " )\n",
103 | " .withColumn(\n",
104 | " 'day',\n",
105 | " F.from_unixtime(F.col('created_utc'), 'dd')\n",
106 | " )\n",
111 | " ) \n",
112 | " df.write.partitionBy(\n",
113 | " 'month', 'day'\n",
114 | " ).parquet(\n",
115 | " 'qfs:///data/reddit/submissions/processed/year={0}/'.format(year),\n",
116 | " mode='overwrite'\n",
117 | " )\n",
118 | "\n",
119 | " "
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "ExecuteTime": {
127 | "end_time": "2022-09-15T15:04:21.681074Z",
128 | "start_time": "2022-09-15T15:03:57.161435Z"
129 | }
130 | },
131 | "outputs": [],
132 | "source": [
133 | "submissions_df = spark.read.parquet('qfs:///data/reddit/submissions/processed/')"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "ExecuteTime": {
141 | "end_time": "2022-09-15T15:04:21.695868Z",
142 | "start_time": "2022-09-15T15:04:21.683797Z"
143 | }
144 | },
145 | "outputs": [],
146 | "source": [
147 | "submissions_df.printSchema()"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {
154 | "ExecuteTime": {
155 | "end_time": "2022-07-29T10:08:18.918367Z",
156 | "start_time": "2022-07-29T10:08:07.636296Z"
157 | }
158 | },
159 | "outputs": [],
160 | "source": [
161 | "submissions_df.count()"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "ExecuteTime": {
169 | "end_time": "2022-09-15T15:04:31.142038Z",
170 | "start_time": "2022-09-15T15:04:21.698757Z"
171 | }
172 | },
173 | "outputs": [],
174 | "source": [
175 | "submissions_df.count()"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {
182 | "ExecuteTime": {
183 | "end_time": "2022-09-15T16:04:57.119540Z",
184 | "start_time": "2022-09-15T15:56:36.679141Z"
185 | }
186 | },
187 | "outputs": [],
188 | "source": [
189 | "submissions_df.filter('author = \"MichaelKamprath\"').toPandas()"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": null,
195 | "metadata": {
196 | "ExecuteTime": {
197 | "end_time": "2022-09-15T15:12:50.759818Z",
198 | "start_time": "2022-09-15T15:12:50.224480Z"
199 | },
200 | "code_folding": [
201 | 14
202 | ]
203 | },
204 | "outputs": [],
205 | "source": [
206 | "import matplotlib.pyplot as plt\n",
207 | "import numpy as np\n",
208 | "import pandas as pd\n",
209 | "from pandas.plotting import register_matplotlib_converters\n",
210 | "\n",
211 | "register_matplotlib_converters()\n",
212 | "pd.set_option('display.max_colwidth', None)\n",
213 | "\n",
214 | "def plot_line_graph(\n",
215 | " df,\n",
216 | " y_axis_column,\n",
217 | " x_axis_column,\n",
218 | " segment_column=None,\n",
219 | " segment_values=None, # a list of values from segment_column to be graphed\n",
220 | " segment_labels=None, # a dictionary with segment_values as key and name a value\n",
221 | " xlabel=None,\n",
222 | " ylabel=None,\n",
223 | " line_width=2,\n",
224 | " xlabel_rotation=None,\n",
225 | " x_axis_is_dates=True,\n",
226 | " y_axis_log_scale=False,\n",
227 | " title=None,\n",
228 | " legend_location='lower left',\n",
229 | "):\n",
230 | " df_pd = df.toPandas()\n",
231 | " fig, ax = plt.subplots()\n",
232 | "\n",
233 | " if segment_column is None:\n",
234 | " x_axis_values = df_pd[x_axis_column]\n",
235 | " if ylabel is None:\n",
236 | " item_label = y_axis_column\n",
237 | " else: \n",
238 | " item_label = ylabel\n",
239 | " if x_axis_is_dates:\n",
240 | " ax.plot_date(\n",
241 | " x_axis_values,\n",
242 | " df_pd[y_axis_column],\n",
243 | " '-',\n",
244 | " linewidth = line_width,\n",
245 | " label = item_label,\n",
246 | " )\n",
247 | " else:\n",
248 | " ax.plot(\n",
249 | " x_axis_values,\n",
250 | " df_pd[y_axis_column],\n",
251 | " label = item_label,\n",
252 | " linewidth = line_width,\n",
253 | " ) \n",
254 | " else:\n",
255 | " if segment_values is None:\n",
256 | " segment_value_list = [r.val for r in df.select(F.col(segment_column).alias('val')).distinct().collect()]\n",
257 | " else:\n",
258 | " segment_value_list = segment_values\n",
259 | " for i in segment_value_list:\n",
260 | " data = df_pd[df_pd[segment_column] == i]\n",
261 | " x_axis_values = data[x_axis_column]\n",
262 | " if segment_labels is not None:\n",
263 | " item_label = segment_labels[i]\n",
264 | " else:\n",
265 | " item_label = \"{0}\".format(i)\n",
266 | " \n",
267 | " if x_axis_is_dates:\n",
268 | " ax.plot_date(\n",
269 | " x_axis_values,\n",
270 | " data[y_axis_column],\n",
271 | " '-',\n",
272 | " linewidth = line_width,\n",
273 | " label = item_label,\n",
274 | " ) \n",
275 | " else:\n",
276 | " ax.plot(\n",
277 | " x_axis_values,\n",
278 | " data[y_axis_column],\n",
279 | " label = item_label,\n",
280 | " linewidth = line_width\n",
281 | " )\n",
282 | " \n",
283 | " fig.set_size_inches(20,12)\n",
284 | " if xlabel is not None:\n",
285 | " plt.xlabel(xlabel)\n",
286 | " if ylabel is not None:\n",
287 | " plt.ylabel(ylabel)\n",
288 | " if xlabel_rotation is not None:\n",
289 | " plt.xticks(rotation=xlabel_rotation)\n",
290 | " if x_axis_is_dates:\n",
291 | " fig.autofmt_xdate()\n",
292 | " if y_axis_log_scale:\n",
293 | " plt.grid()\n",
294 | " plt.yscale(\"log\")\n",
295 | " if title is not None:\n",
296 | " fig.suptitle(title, fontsize=18)\n",
297 | " ax.legend(loc=legend_location)\n",
298 | " plt.show()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "ExecuteTime": {
306 | "end_time": "2022-09-15T15:12:50.923459Z",
307 | "start_time": "2022-09-15T15:12:50.762273Z"
308 | }
309 | },
310 | "outputs": [],
311 | "source": [
312 | "monthly_submissions = (\n",
313 | " submissions_df\n",
314 | " .withColumn(\n",
315 | " 'year_month', \n",
316 | " F.to_date(\n",
317 | " F.format_string('%4d-%02d', F.col('year'), F.col('month')),\n",
318 | " format='yyyy-MM'\n",
319 | " ) \n",
320 | " )\n",
321 | " .groupBy('year_month')\n",
322 | " .agg(\n",
323 | " F.count('*').alias('count'),\n",
324 | " F.countDistinct('author').alias('authors')\n",
325 | " )\n",
326 | " .orderBy(F.col('year_month'))\n",
327 | " ).cache()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": null,
333 | "metadata": {
334 | "ExecuteTime": {
335 | "end_time": "2022-07-04T10:53:25.010451Z",
336 | "start_time": "2022-07-04T10:53:24.767770Z"
337 | }
338 | },
339 | "outputs": [],
340 | "source": [
341 | "(\n",
342 | " submissions_df\n",
343 | " .filter(F.col('month').isNull())\n",
344 | " .select(\n",
345 | " 'author',\n",
346 | " 'subreddit_id',\n",
347 | " 'permalink',\n",
348 | " 'selftext',\n",
349 | " 'created_utc',\n",
350 | " 'created_date',\n",
351 | " 'year',\n",
352 | " 'month',\n",
353 | " 'day',\n",
354 | " )\n",
355 | ").limit(20).toPandas()"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "ExecuteTime": {
363 | "end_time": "2022-07-04T10:53:25.514523Z",
364 | "start_time": "2022-07-04T10:53:25.012170Z"
365 | }
366 | },
367 | "outputs": [],
368 | "source": [
369 | "submissions_df.filter(F.col('month').isNull()).groupBy('year').agg(F.count('*').alias('count')).toPandas()"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "metadata": {
376 | "ExecuteTime": {
377 | "end_time": "2022-07-04T10:56:48.296261Z",
378 | "start_time": "2022-07-04T10:53:25.517524Z"
379 | }
380 | },
381 | "outputs": [],
382 | "source": [
383 | "monthly_submissions.orderBy(F.col('year_month')).limit(20).toPandas()"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": null,
389 | "metadata": {
390 | "ExecuteTime": {
391 | "end_time": "2022-08-12T17:37:04.642659Z",
392 | "start_time": "2022-08-12T17:37:04.010513Z"
393 | }
394 | },
395 | "outputs": [],
396 | "source": [
397 | "plot_line_graph(\n",
398 | " monthly_submissions,\n",
399 | " 'authors',\n",
400 | " 'year_month',\n",
401 | " xlabel='Date',\n",
402 | " ylabel='Authors',\n",
403 | ")"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": null,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": []
412 | }
413 | ],
414 | "metadata": {
415 | "kernelspec": {
416 | "display_name": "Python 3 (ipykernel)",
417 | "language": "python",
418 | "name": "python3"
419 | },
420 | "language_info": {
421 | "codemirror_mode": {
422 | "name": "ipython",
423 | "version": 3
424 | },
425 | "file_extension": ".py",
426 | "mimetype": "text/x-python",
427 | "name": "python",
428 | "nbconvert_exporter": "python",
429 | "pygments_lexer": "ipython3",
430 | "version": "3.7.9"
431 | },
432 | "toc": {
433 | "base_numbering": 1,
434 | "nav_menu": {},
435 | "number_sections": true,
436 | "sideBar": true,
437 | "skip_h1_title": false,
438 | "title_cell": "Table of Contents",
439 | "title_sidebar": "Contents",
440 | "toc_cell": false,
441 | "toc_position": {},
442 | "toc_section_display": true,
443 | "toc_window_display": false
444 | }
445 | },
446 | "nbformat": 4,
447 | "nbformat_minor": 4
448 | }
449 |
--------------------------------------------------------------------------------
/reddit-data/reddit-bot-commenters-bensons-law.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Reddit Bot Commenters \n",
8 | "Identifies likely bot commenters on Reddit using Benford's Law. See [original blog post](https://diybigdata.net/2020/03/using-benfords-law-to-identify-bots-on-reddit/) for a discussion on this technique.\n",
9 | "\n",
10 | "The core of this code is the `generateBenfordsLawAnalysis()` function, which takes a user event log data frame that must have a user ID column and a event timestamp column, and it returns the chi squared score of close each user's activity is to the ideal Benford's Law distribution. Scores closer to zero mean the user's activity more closely adheres to the ideal distribution. "
11 | ]
12 | },
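{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "For reference, the expected first-digit shares and the per-user score computed below can be written as follows (here $p_d$ denotes the expected Benford's Law share of first digit $d$ and $s_d$ a user's observed share; these symbols are just notation for this note, not names used in the code):\n",
  "\n",
  "$$p_d = \\log_{10}(d+1) - \\log_{10}(d) = \\log_{10}\\left(1 + \\frac{1}{d}\\right), \\qquad \\chi^2 = \\sum_{d=1}^{9} \\frac{(s_d - p_d)^2}{p_d}$$\n",
  "\n",
  "The first digits are taken from the time deltas between each user's consecutive events.\n"
 ]
},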
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {
17 | "ExecuteTime": {
18 | "end_time": "2022-08-12T17:58:46.627964Z",
19 | "start_time": "2022-08-12T17:58:45.607762Z"
20 | }
21 | },
22 | "outputs": [],
23 | "source": [
24 | "import pyspark.sql.functions as F\n",
25 | "import pyspark.sql.types as T\n",
26 | "from pyspark.sql.window import Window as W\n",
27 | "\n",
28 | "import pandas as pd\n",
29 | "\n",
30 | "pd.set_option('display.max_colwidth', None)\n",
31 | "\n",
32 | "spark = SparkSession\\\n",
33 | " .builder\\\n",
34 | " .appName(\"RedditBotCommenters\")\\\n",
35 | " .getOrCreate()"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {
42 | "ExecuteTime": {
43 | "end_time": "2022-08-12T17:58:46.636108Z",
44 | "start_time": "2022-08-12T17:58:46.630656Z"
45 | }
46 | },
47 | "outputs": [],
48 | "source": [
49 | "orig_suffle_partitions = spark.conf.get(\"spark.sql.shuffle.partitions\")\n",
50 | "spark.conf.set(\"spark.sql.shuffle.partitions\", 500)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {
57 | "ExecuteTime": {
58 | "end_time": "2022-08-12T17:59:26.166948Z",
59 | "start_time": "2022-08-12T17:58:46.638715Z"
60 | }
61 | },
62 | "outputs": [],
63 | "source": [
64 | "reddit_df = (\n",
65 | " spark.read.parquet('qfs:///data/reddit/comments/processed')\n",
66 | " # filter out moderator and deleted authors\n",
67 | " .filter(~F.col('author').isin('[deleted]','AutoModerator'))\n",
68 | ")\n",
69 | "\n",
70 | "reddit_df.printSchema()"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "ExecuteTime": {
78 | "end_time": "2022-08-12T17:59:42.896570Z",
79 | "start_time": "2022-08-12T17:59:26.168915Z"
80 | }
81 | },
82 | "outputs": [],
83 | "source": [
84 | "submissions_df = spark.read.parquet('qfs:///data/reddit/submissions/processed')\n",
85 | "submissions_df.printSchema()"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "ExecuteTime": {
93 | "end_time": "2022-08-12T17:59:42.969137Z",
94 | "start_time": "2022-08-12T17:59:42.899315Z"
95 | }
96 | },
97 | "outputs": [],
98 | "source": [
99 | "combined_df = (\n",
100 | " reddit_df\n",
101 | " .select(\n",
102 | " 'author',\n",
103 | " 'created_utc',\n",
104 | " )\n",
105 | " .union(\n",
106 | " submissions_df\n",
107 | " .select(\n",
108 | " 'author',\n",
109 | " 'created_utc',\n",
110 | " )\n",
111 | " \n",
112 | " )\n",
113 | " .filter(\n",
114 | " F.col('author').isNotNull()\n",
115 | " &(F.length(F.col('author')) > 0)\n",
116 | " )\n",
117 | " .repartition('author')\n",
118 | ")"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "`generateBenfordsLawAnalysis`\n",
126 | "\n",
127 | "A function to perform Benford's Law analysis against a data frame of user activities in order to determine which user's activities best (or least) adhere to the Benford's Law distribution. The data frame is ostensibly a event log keyed by a user ID and has a timestamp for each event row. Only the user ID and timesamps columns are used for analysis.\n",
128 | "\n",
129 | "### Arguments \n",
130 | "* `df` - The data frame with the timestamped user activity to be analyzed\n",
131 | "* `user_col` - a string identifying the name of the column of df that contains the user IDs\n",
132 | "* `timestamp_col` - a string identifying the name of the column of df that contains the event timestamps. Must be `T.LongType()`.\n",
133 | "* `event_threshold` - the minimum number of events a user must have for the Benford's Law analysis to performed on it. Defaults to 100.\n",
134 | "\n",
135 | "### Returns \n",
136 | "A dataframe with the following columns:\n",
137 | "* `user_col` - The user IDs. The column name will be the same as the original dataframe.\n",
138 | "* `frequency_count` - the number of events found for the user\n",
139 | "* `chi_squared` - the chi squared score indicating how similar the user's activity is to the ideal Benford's Law distribution.\n",
140 | "* `digit_share` - A list containing the relative share each first digit has among the user's activity. The list is ordered from digit 1 to digit 9.\n",
141 | "\n"
142 | ]
143 | },
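{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A minimal usage sketch follows. The names \`activity_df\`, \`user_id\`, and \`event_ts\` are illustrative placeholders for any event-log dataframe, not objects defined in this notebook:\n",
  "\n",
  "```python\n",
  "scores_df = generateBenfordsLawAnalysis(\n",
  "    activity_df,\n",
  "    user_col='user_id',\n",
  "    timestamp_col='event_ts',  # must be a LongType epoch timestamp\n",
  "    event_threshold=250,\n",
  ")\n",
  "\n",
  "# users whose activity is farthest from the ideal Benford's Law distribution sort first\n",
  "scores_df.orderBy(F.col('chi_squared').desc()).show(20, truncate=False)\n",
  "```\n"
 ]
},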
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "metadata": {
148 | "ExecuteTime": {
149 | "end_time": "2022-08-12T17:59:42.989415Z",
150 | "start_time": "2022-08-12T17:59:42.972762Z"
151 | },
152 | "code_folding": []
153 | },
154 | "outputs": [],
155 | "source": [
156 | "from math import log10, sqrt\n",
157 | "\n",
158 | "def _getUsersAndDigit(df, user_col, event_threshold):\n",
159 | " digits_df = (\n",
160 | " spark\n",
161 | " .createDataFrame(\n",
162 | " [[1], [2], [3], [4], [5], [6], [7], [8], [9]],\n",
163 | " schema=T.StructType([\n",
164 | " T.StructField(\n",
165 | " \"first_digit\", \n",
166 | " T.IntegerType()\n",
167 | " )\n",
168 | " ])\n",
169 | " )\n",
170 | " .coalesce(1)\n",
171 | " )\n",
172 | " users_and_digits = (\n",
173 | " df\n",
174 | " .groupBy(user_col)\n",
175 | " .agg(F.count('*').alias('count'))\n",
176 | " .filter(F.col('count') > event_threshold )\n",
177 | " .select(user_col)\n",
178 | " .repartition(user_col)\n",
179 | " .crossJoin(digits_df)\n",
180 | " )\n",
181 | " return users_and_digits\n",
182 | "\n",
183 | "def _generateFirstDigitShare(df, user_col, timestamp_col):\n",
184 | " user_event_window = W.partitionBy(user_col).orderBy(timestamp_col)\n",
185 | " user_cum_dist_window = W.partitionBy(user_col).orderBy('first_digit')\n",
186 | " \n",
187 | " event_time_delta = F.col(timestamp_col) - F.lag(F.col(timestamp_col)).over(user_event_window)\n",
188 | "\n",
189 | " first_digit_share = (\n",
190 | " df\n",
191 | " .select(\n",
192 | " user_col,\n",
193 | " timestamp_col,\n",
194 | " event_time_delta.alias('time_delta')\n",
195 | " )\n",
196 | " .filter(F.col('time_delta').isNotNull())\n",
197 | " .withColumn(\n",
198 | " 'first_digit',\n",
199 | " F.substring(F.col('time_delta').cast(T.StringType()), 0, 1).cast(T.IntegerType())\n",
200 | " )\n",
201 | " .withColumn(\n",
202 | " 'first_digit_cum_dist',\n",
203 | " F.cume_dist().over(user_cum_dist_window)\n",
204 | " )\n",
205 | " .groupBy(user_col, 'first_digit', 'first_digit_cum_dist')\n",
206 | " .agg(\n",
207 | " F.count(timestamp_col).alias('frequency_count')\n",
208 | " )\n",
209 | " .withColumn(\n",
210 | " 'first_digit_share',\n",
211 | " F.col('first_digit_cum_dist') \n",
212 | " - F.coalesce(\n",
213 | " F.lag('first_digit_cum_dist').over(user_cum_dist_window), \n",
214 | " F.lit(0)\n",
215 | " )\n",
216 | " )\n",
217 | " .repartition(user_col)\n",
218 | " )\n",
219 | " return first_digit_share\n",
220 | "\n",
221 | "def _expectedBenfordsShare():\n",
222 | " digits = [1, 2, 3, 4, 5, 6, 7, 8, 9]\n",
223 | " expected_share_list = [(d, log10(d+1)-log10(d)) for d in digits]\n",
224 | "\n",
225 | " expected_share_df = (\n",
226 | " spark\n",
227 | " .createDataFrame(\n",
228 | " expected_share_list,\n",
229 | " schema=T.StructType([\n",
230 | " T.StructField(\n",
231 | " 'first_digit', \n",
232 | " T.IntegerType()\n",
233 | " ),\n",
234 | " T.StructField(\n",
235 | " 'expected_share',\n",
236 | " T.DoubleType()\n",
237 | " )\n",
238 | " ])\n",
239 | " )\n",
240 | " .coalesce(1)\n",
241 | " )\n",
242 | " \n",
243 | " return expected_share_df\n",
244 | "\n",
245 | "def generateBenfordsLawAnalysis(df, user_col, timestamp_col, event_threshold = 100):\n",
246 | " user_digts_df = _getUsersAndDigit(df, user_col, event_threshold)\n",
247 | " first_digit_share_df = _generateFirstDigitShare(df, user_col, timestamp_col)\n",
248 | " expected_share_df = _expectedBenfordsShare()\n",
249 | " \n",
250 | " finalized_first_digit_share_df = (\n",
251 | " first_digit_share_df\n",
252 | " .join(\n",
253 | " user_digts_df,\n",
254 | " on=[user_col,'first_digit'],\n",
255 | " how='right'\n",
256 | " )\n",
257 | " .na.fill(0)\n",
258 | " .cache()\n",
259 | " ) \n",
260 | " user_benford_distances = (\n",
261 | " finalized_first_digit_share_df\n",
262 | " .join(\n",
263 | " F.broadcast(expected_share_df),\n",
264 | " on='first_digit',\n",
265 | " how='inner'\n",
266 | " )\n",
267 | " .withColumn(\n",
268 | " 'chi_squared_addends',\n",
269 | " F.pow(\n",
270 | " (F.col('first_digit_share') - F.col('expected_share')),\n",
271 | " F.lit(2)\n",
272 | " ) / F.col('expected_share')\n",
273 | " )\n",
274 | " .orderBy(user_col, 'first_digit')\n",
275 | " .groupBy(user_col)\n",
276 | " .agg(\n",
277 | " F.sum('frequency_count').alias('frequency_count'),\n",
278 | " F.sum('chi_squared_addends').alias('chi_squared'),\n",
279 | " F.collect_list(F.col('first_digit_share')).alias('digit_share')\n",
280 | " )\n",
281 | " )\n",
282 | " return user_benford_distances "
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "ExecuteTime": {
290 | "end_time": "2022-08-12T18:23:56.132897Z",
291 | "start_time": "2022-08-12T17:59:42.991016Z"
292 | },
293 | "scrolled": false
294 | },
295 | "outputs": [],
296 | "source": [
297 | "new_df = generateBenfordsLawAnalysis(reddit_df, 'author', 'created_utc')\n",
298 | "\n",
299 | "new_df.orderBy(F.col('chi_squared').desc()).limit(50).toPandas()"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "ExecuteTime": {
307 | "end_time": "2022-08-12T20:16:38.977709Z",
308 | "start_time": "2022-08-12T20:15:52.691420Z"
309 | }
310 | },
311 | "outputs": [],
312 | "source": [
313 | "new_df.write.parquet(\n",
314 | " 'qfs:///user/spark/reddit/author_bot_chi_squared_score/',\n",
315 | " mode='overwrite'\n",
316 | ")"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": null,
322 | "metadata": {},
323 | "outputs": [],
324 | "source": []
325 | }
326 | ],
327 | "metadata": {
328 | "kernelspec": {
329 | "display_name": "Python 3 (ipykernel)",
330 | "language": "python",
331 | "name": "python3"
332 | },
333 | "language_info": {
334 | "codemirror_mode": {
335 | "name": "ipython",
336 | "version": 3
337 | },
338 | "file_extension": ".py",
339 | "mimetype": "text/x-python",
340 | "name": "python",
341 | "nbconvert_exporter": "python",
342 | "pygments_lexer": "ipython3",
343 | "version": "3.10.7"
344 | },
345 | "toc": {
346 | "base_numbering": 1,
347 | "nav_menu": {},
348 | "number_sections": true,
349 | "sideBar": true,
350 | "skip_h1_title": false,
351 | "title_cell": "Table of Contents",
352 | "title_sidebar": "Contents",
353 | "toc_cell": false,
354 | "toc_position": {},
355 | "toc_section_display": true,
356 | "toc_window_display": false
357 | }
358 | },
359 | "nbformat": 4,
360 | "nbformat_minor": 4
361 | }
362 |
--------------------------------------------------------------------------------
/tools/download_and_convert_to_bz2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #
4 | # This script will download each URL listed in a passed configuration file,
5 | # and then, if needed, convert its compression to bz2, which plays nicer with
6 | # Apache Spark.
7 | #
8 | # This script has two arguments.
9 | # $1 - the directory finalized files should be placed in
10 | # $2 - the file containing the URLs to download, one URL per line.
11 | #
12 | # All processing will be done in the current working directory.
13 | # Files are downloaded and processed one at a time.
14 | #
15 | # Requires the following tools be installed:
16 | # zstd
17 | # xz
18 | # lbzip2
19 | #
20 | # Set the NUM_COMPRESSION_THREADS environment variable to control the number of
21 | # threads that the various compression tools will use. Defaults to 12.
22 | #
23 |
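# Example invocation (the paths and file names below are illustrative only):
#
#   NUM_COMPRESSION_THREADS=8 ./download_and_convert_to_bz2.sh /data/reddit/raw reddit_urls.txt
#
# where reddit_urls.txt lists one archive URL (.bz2, .zst, or .xz) per line.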
24 | NUM_COMPRESSION_THREADS=${NUM_COMPRESSION_THREADS:-12}
25 |
26 | if [ $# -ne 2 ]; then
27 | echo "usage: download_and_convert_to_bz2.sh /path/to/destination/directory /path/to/url_list.txt"
28 | exit 1
29 | fi
30 |
31 | # manage arguments
32 | destination_dir=${1%/}
33 | readarray -t url_list < "$2"
34 |
35 | # the main loop
36 | echo "Fetching URLs list in ${2}"
37 | for url in ${url_list[@]}; do
38 | echo "Processing URL = ${url}"
39 | download_file_name="${url##*/}"
40 | download_file_extension="${download_file_name##*.}"
41 | uncompressed_file_name="${download_file_name%.*}"
42 | final_file_name=${download_file_name}
43 |
44 | # download the files
45 | wget $url
46 |
47 | # if the downloaded file's extension is not bz2, decompress and recompress as bz2
48 | if [ "$download_file_extension" != "bz2" ]; then
49 | if [ "$download_file_extension" == "zst" ]; then
50 | zstd -v -d $download_file_name
51 | elif [ "$download_file_extension" == "xz" ]; then
52 | xz -v -k -T $NUM_COMPRESSION_THREADS -d $download_file_name
53 | else
54 | echo "Unrecognized file type for ${url}"
55 | exit 1
56 | fi
57 | lbzip2 -v -n $((NUM_COMPRESSION_THREADS)) $uncompressed_file_name
58 | rm $download_file_name
59 | final_file_name="${uncompressed_file_name}.bz2"
60 | fi
61 | mv -v -f $final_file_name $destination_dir
62 | echo "Finalized ${final_file_name}"
63 | echo ""
64 | done
65 |
66 | echo "Finished processing $2"
67 | exit 0
68 |
--------------------------------------------------------------------------------
/udf-development/build.sbt:
--------------------------------------------------------------------------------
1 | name := "diybigdata-udf"
2 |
3 | // organization name (e.g., the package name of the project)
4 | organization := "net.diybigdata"
5 |
6 | version := "1.0-SNAPSHOT"
7 |
8 | // project description
9 | description := "DIY Big Data Hive UDFs"
10 |
11 | // Enables publishing to maven repo
12 | publishMavenStyle := true
13 |
14 | // Do not append Scala versions to the generated artifacts
15 | crossPaths := false
16 |
17 | // This forbids including Scala related libraries into the dependency
18 | autoScalaLibrary := false
19 |
20 | // Use the latest Scala version with Spark 2+
21 | scalaVersion := "2.11.6"
22 | scalacOptions ++= Seq("-unchecked", "-feature", "-deprecation")
23 |
24 | // Add repositories where library dependencies can be found
25 | resolvers += "Cloudera" at "https://repository.cloudera.com/content/repositories/releases/"
26 | resolvers += "Central" at "http://central.maven.org/maven2/"
27 | resolvers += "Spring Plugins" at "http://repo.spring.io/plugins-release/"
28 |
29 | // library dependencies: (organization name) % (project name) % (version)
30 | libraryDependencies ++= Seq(
31 | "org.apache.hive" % "hive-exec" % "2.1.0" % "provided",
32 | "org.apache.hadoop" % "hadoop-core" % "2.6.0-mr1-cdh5.8.2",
33 | "com.novocode" % "junit-interface" % "0.11" % "test"
34 | )
35 |
36 |
--------------------------------------------------------------------------------
/udf-development/src/main/java/net/diybigdata/udf/FormatYearMonthString.java:
--------------------------------------------------------------------------------
1 | package net.diybigdata.udf;
2 |
3 | import org.apache.hadoop.hive.ql.exec.UDF;
4 | import org.apache.hadoop.hive.ql.exec.Description;
5 |
6 | @Description(
7 | name = "FormatYearMonthString",
8 | value = "_FUNC_(InputDataType) - Converts the passed year and month integers to a formatted string.",
9 | extended = "Example:\n"
10 | + " > SELECT _FUNC_(InputDataType) FROM tablename;")
11 |
12 | public class FormatYearMonthString extends UDF {
13 | public String evaluate( Integer year, Integer month ) {
14 | return String.format("%1$d-%2$02d", year, month );
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/udf-development/src/test/java/net/diybigdata/udf/FormatYearMonthString_T.java:
--------------------------------------------------------------------------------
1 | package net.diybigdata.udf;
2 |
3 | import static org.junit.Assert.assertEquals;
4 | import org.junit.Test;
5 |
6 | import net.diybigdata.udf.FormatYearMonthString;
7 |
8 | public class FormatYearMonthString_T {
9 |
10 | @Test
11 | public void testStringFormatting() {
12 | FormatYearMonthString udf = new FormatYearMonthString();
13 |
14 | assertEquals(
15 | "evaluate(1936, 12)",
16 | "1936-12",
17 | udf.evaluate( 1936, 12 )
18 | );
19 | assertEquals(
20 | "evaluate(1980, 07)",
21 | "1980-07",
22 | udf.evaluate( 1980, 07 )
23 | );
24 | }
25 | }
26 |
--------------------------------------------------------------------------------