"""Reddit API credentials used by the streaming notebooks.

Secrets must never be committed to the repository.  The values are read from
environment variables instead, so every notebook can keep doing
``from credentials import CLIENT_ID, CLIENT_SECRET`` unchanged.

SECURITY: the client secret and account password that were previously
hard-coded in this file are exposed in version control history and should be
revoked / rotated in the Reddit app settings.
"""
import os

# Maps each public constant of this module to the environment variable that
# supplies its value.
_ENV_NAMES = {
    "CLIENT_ID": "REDDIT_CLIENT_ID",
    "CLIENT_SECRET": "REDDIT_CLIENT_SECRET",
    "USERNAME": "REDDIT_USERNAME",
    "PASSWORD": "REDDIT_PASSWORD",
    "USER_AGENT": "REDDIT_USER_AGENT",
}


def load_credentials(environ=None):
    """Return the Reddit credentials found in *environ*.

    Args:
        environ: Mapping to read from; defaults to ``os.environ``.

    Returns:
        A dict keyed by ``CLIENT_ID``, ``CLIENT_SECRET``, ``USERNAME``,
        ``PASSWORD`` and ``USER_AGENT``.  A variable that is not set yields an
        empty string.
    """
    env = os.environ if environ is None else environ
    return {name: env.get(variable, "") for name, variable in _ENV_NAMES.items()}


_CREDENTIALS = load_credentials()

CLIENT_ID = _CREDENTIALS["CLIENT_ID"]
CLIENT_SECRET = _CREDENTIALS["CLIENT_SECRET"]
USERNAME = _CREDENTIALS["USERNAME"]
PASSWORD = _CREDENTIALS["PASSWORD"]
USER_AGENT = _CREDENTIALS["USER_AGENT"]
# pip install praw
#
# NOTE: this cell is itself the listening server (it binds host:port and then
# waits in accept()), so a separate `nc -lk 9999` listener is not needed and
# can keep the bind below from succeeding.  Run this cell first, then start
# the consumer that connects to the port.
import json
import logging
import socket

import praw

from credentials import CLIENT_ID, CLIENT_SECRET

logging.basicConfig(filename='stream_json_error.log', level=logging.ERROR)

USER_AGENT = 'MyBot/0.0.1'

host = "127.0.0.1"
port = 9999

subred_name = "reddit"


def create_socket(host, port):
    """
    Create a listening TCP socket bound to the specified host and port.

    Args:
        host: Address to bind to.
        port: TCP port to listen on.

    Returns:
        The listening socket; the caller is responsible for closing it.

    Raises:
        OSError: If the socket cannot be configured, bound or put in
            listening mode.  The socket is closed before re-raising.
    """
    s = socket.socket()
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((host, port))
        print(f"Listening on port: {port}")
        s.listen()
    except OSError:
        # Do not leak the file descriptor when the port is unavailable.
        s.close()
        raise
    return s


def _parent_body(comment):
    """Return the body of the comment that *comment* replies to, or ""."""
    # comment.parent() yields a Comment for replies and a Submission for
    # top-level comments.  Wrapping its id in reddit.comment(...) again, as the
    # code used to do, fails for the Submission case and dropped every
    # top-level comment from the stream.
    parent = comment.parent()
    if isinstance(parent, praw.models.Comment):
        return parent.body
    return ""


def stream_json(reddit, subreddit, socket):
    """
    Stream comments from the specified subreddit and send them through the socket.

    One client connection is accepted; every new comment is then written to it
    as a single JSON document terminated by a newline.

    Args:
        reddit: The praw.Reddit client.  No longer needed for the parent
            lookup; kept so existing callers keep working.
        subreddit: Subreddit whose comment stream is forwarded.
        socket: Listening socket to accept the consumer on.  (The name shadows
            the ``socket`` module inside this function; it is kept because it
            is part of the signature.)

    Raises:
        OSError: If the consumer disconnects or the send fails.  The client
            connection is closed before the error propagates.
    """
    # Accept the connection once
    c, addr = socket.accept()
    # `with` guarantees the client socket is closed even when the loop is left
    # through an exception or a KeyboardInterrupt.
    with c:
        for comment in subreddit.stream.comments():
            try:
                post = comment.submission
                my_object = {
                    "comment": comment.body,
                    "prev_comment": _parent_body(comment),
                    "post": post.selftext,
                    "post_date": post.created_utc,
                    "comment_date": comment.created_utc,
                }
            except praw.exceptions.PRAWException as ex:
                logging.error("Error while streaming comments: %s", ex)
                continue
            # Send data with a newline character.  sendall() keeps writing
            # until the whole payload is out; send() may stop part-way.
            c.sendall((json.dumps(my_object) + '\n').encode('utf-8'))
            print(f'Sent data: {my_object}')


def main():
    """Connect to Reddit, open the server socket and forward the comment stream."""
    # Set up Reddit API
    reddit = praw.Reddit(client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         user_agent=USER_AGENT)

    subreddit = reddit.subreddit(subred_name)

    # Set up socket
    with create_socket(host, port) as s:
        # Stream comments and send them through the socket
        stream_json(reddit, subreddit, s)


if __name__ == "__main__":
    main()
# Producer app
#
# We are using sockets as a way to emit data for spark streaming to consume it.

# pip install praw
import datetime
import json
import socket

import praw

# Reddit information.  The credentials come from credentials.py, like in the
# other notebooks of this project, instead of being pasted into the notebook.
from credentials import CLIENT_ID, CLIENT_SECRET, PASSWORD, USERNAME

SECRET_TOKEN = CLIENT_SECRET
USER_AGENT = 'MyBot/0.0.1'

reddit = praw.Reddit(client_id=CLIENT_ID,
                     client_secret=SECRET_TOKEN,
                     user_agent=USER_AGENT)


def _parent_body(comment):
    """Return the body of the comment that *comment* replies to, or ""."""
    # comment.parent() yields a Comment for replies and a Submission for
    # top-level comments; only a Comment has a body.
    parent = comment.parent()
    if isinstance(parent, praw.models.Comment):
        return parent.body
    return ""


def stream_data(subreddit, conn=None):
    """Print every new comment of *subreddit* and send it to the client.

    Args:
        subreddit: Subreddit (or '+'-joined multi-subreddit) to stream.
        conn: Connected client socket.  When omitted, the module-level
            connection ``c`` created by the accept loop is used, which is what
            this function always relied on.

    Returns:
        The last message sent, or "" when nothing was sent.  The comment
        stream does not normally end, so in practice the function is left
        through an exception (client disconnect, interrupt).

    Raises:
        OSError: If sending to the client fails.
    """
    conn = c if conn is None else conn
    sentence = ""
    for comment in subreddit.stream.comments():
        try:
            parent_text = _parent_body(comment)
            post = comment.submission
            creation_date = datetime.datetime.utcfromtimestamp(post.created_utc)
            print('-comment:')
            print(comment.body)
            print('-Parent comment:')
            print(parent_text)
            print('-creation date:')
            print(creation_date)
            sentence = f"Comment: {comment.body}\nParent Comment: {parent_text}\nCreation Date: {creation_date}\n"
        except praw.exceptions.PRAWException as ex:
            print(f'-Parsing ERROR, with message: {ex}')
            continue
        # Send the information to the socket.  sendall() writes the complete
        # payload; send() may stop part-way.
        conn.sendall(sentence.encode("utf-8"))
    return sentence


# Create a socket and send messages non stop
s = socket.socket()
host = "127.0.0.1"
port = 9998

# SO_REUSEADDR (as in reddit_producer) lets the cell be re-run right after a
# restart; SO_REUSEPORT is not defined on every platform.
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind((host, port))

print("Listening on port: %s" % str(port))

s.listen(5)

subreddit1 = reddit.subreddit('funny')
subreddit2 = reddit.subreddit('python')
# The comment stream of one subreddit never finishes, so streaming
# subreddit1 and then subreddit2 never reached the second one.  A '+'-joined
# name gives a single stream that covers both.
all_subreddits = reddit.subreddit('funny+python')

try:
    while True:
        c, addr = s.accept()
        try:
            sentence = stream_data(all_subreddits, c)
            print("message sent!")
            print(sentence)
        except OSError as ex:
            # The consumer went away; go back to waiting for the next one.
            print(f'-Connection ERROR, with message: {ex}')
        finally:
            c.close()
finally:
    s.close()
"source": [] 182 | } 183 | ], 184 | "metadata": { 185 | "kernelspec": { 186 | "display_name": "Python 3 (ipykernel)", 187 | "language": "python", 188 | "name": "python3" 189 | }, 190 | "language_info": { 191 | "codemirror_mode": { 192 | "name": "ipython", 193 | "version": 3 194 | }, 195 | "file_extension": ".py", 196 | "mimetype": "text/x-python", 197 | "name": "python", 198 | "nbconvert_exporter": "python", 199 | "pygments_lexer": "ipython3", 200 | "version": "3.11.4" 201 | } 202 | }, 203 | "nbformat": 4, 204 | "nbformat_minor": 4 205 | } 206 | -------------------------------------------------------------------------------- /publisher.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "ExecuteTime": { 9 | "end_time": "2023-06-21T18:53:59.823754Z", 10 | "start_time": "2023-06-21T18:50:45.121703Z" 11 | } 12 | }, 13 | "outputs": [ 14 | { 15 | "name": "stdout", 16 | "output_type": "stream", 17 | "text": [ 18 | "Listening on port: 9999\n", 19 | "Sent data: {'comment': 'His \"joke\" explained clearly. Crystal. \\n\\nhttps://pca.st/episode/f8b63290-5616-42e1-a956-a0486f29e21e', 'prev_comment': 'His “joke” is the least of our issues. His behavior and communications with us has been all over the place—saying one thing to us while saying something completely different externally; recording and leaking a private phone call—to the point where I don’t know how we could do business with him.', 'post': \"Dear redditors,\\n\\nFor those of you who don’t know me, I’m Steve aka u/spez. I am one of the founders of Reddit, and I’ve been CEO since 2015. On Wednesday, I celebrated my 18th cake-day, which is about 17 years and 9 months longer than I thought this project would last. 
To be with you here today on Reddit—even in a heated moment like this—is an honor.\\n\\nI want to talk with you today about what’s happening within the community and frustration stemming from changes we are making to access our API. I spoke to a number of moderators on Wednesday and yesterday afternoon and our product and community teams have had further conversations with mods as well.\\n\\nFirst, let me share the background on this topic as well as some clarifying details. On 4/18, we [shared](https://www.reddit.com/r/reddit/comments/12qwagm/an_update_regarding_reddits_api/) that we would update access to the API, including premium access for third parties who require additional capabilities and higher usage limits. Reddit needs to be a self-sustaining business, and to do that, we can no longer subsidize commercial entities that require large-scale data use. \\n\\nThere’s been a lot of confusion over what these changes mean, and I want to highlight what these changes mean for moderators and developers. 
\\n\\n* **Terms of Service**\\n * Effective June 19, 2023, our updated [Data API Terms](https://www.redditinc.com/policies/data-api-terms), together with our [Developer Terms](https://www.redditinc.com/policies/developer-terms), replaced the existing Data API terms.\\n* **Free Data API** \\n * Effective July 1, 2023, the rate limits to use the Data API free of charge are:\\n * 100 queries per minute per OAuth client id if you are using OAuth authentication and 10 queries per minute if you are not using OAuth authentication.\\n * Today, over 90% of apps fall into this category and can continue to access the Data API for free.\\n* **Premium Enterprise API / Third-party apps**\\n * Effective July 1, 2023, the rate for apps that require higher usage limits is $0.24 per 1K API calls (less than $1.00 per user / month for a typical Reddit third-party app).\\n * Some apps such as Apollo, Reddit is Fun, and Sync have decided this pricing doesn’t work for their businesses and will close before pricing goes into effect. \\n * For the other apps, we will continue talking. We acknowledge that the timeline we gave was tight; we are happy to engage with folks who want to work with us.\\n* **Mod Tools**\\n * We know many communities rely on tools like RES, ContextMod, Toolbox, etc., and these tools will continue to have free access to the Data API.\\n * We’re working together with Pushshift to [restore access](https://www.reddit.com/r/pushshift/comments/13w6j20/advancing_communityled_moderation_an_update_on/?utm_source=share&utm_medium=web2x&context=3) for verified moderators.\\n* **Mod Bots**\\n * If you’re creating free bots that help moderators and users (e.g. haikubot, setlistbot, etc), please continue to do so. You can contact us [here](https://support.reddithelp.com/hc/en-us/requests/new?ticket_form_id=14868593862164) if you have a bot that requires access to the Data API above the free limits. 
\\n * Developer Platform is a new platform designed to let users and developers expand the Reddit experience by providing powerful features for building moderation tools, creative tools, games, and more. We are currently in a closed beta with hundreds of developers (sign up [here](https://developers.reddit.com/)). For those of you who have been around a while, it is the spiritual successor to both the API and Custom CSS.\\n* **Explicit Content** \\n * Effective July 5, 2023, we will limit access to mature content via our Data API as part of an ongoing effort to provide guardrails to how explicit content and communities on Reddit are discovered and viewed. \\n * This change will not impact any moderator bots or extensions.\\nIn our conversations with moderators and developers, we heard two areas of feedback we plan to address. \\n\\n* **Accessibility** \\\\- We want everyone to be able to use Reddit. As a result, non-commercial, accessibility-focused apps and tools will continue to have free access. We’re working with apps like RedReader and Dystopia and a few others to ensure they can continue to access the Data API.\\n* **Better mobile moderation** \\\\- We need more efficient moderation tools, especially on mobile. They are coming. We’ve [launched ](https://www.reddit.com/r/modnews/comments/142kh8s/improvement_to_the_mobile_mod_queue/?utm_source=share&utm_medium=web2x&context=3)improvements to some tools recently and will continue to do so. About 3% of mod actions come from third-party apps, and we’ve reached out to communities who moderate almost exclusively using these apps to ensure we address their needs.\\n\\nMods, I appreciate all the time you’ve spent with us this week, and all the time prior as well. Your feedback is invaluable. We respect when you and your communities take action to highlight the things you need, including, at times, going private. 
We are all responsible for ensuring Reddit provides an open accessible place for people to find community and belonging. \\n\\nI will be sticking around to answer questions along with other admins. We know answers are tough to find, so we're switching the default sort to Q&A mode. You can view responses from the following admins here:\\n\\n* u/spez\\n* u/KeyserSosa\\n* u/Go_JasonWaterfalls\\n* u/FlyingLaserTurtle\\n\\n\\\\- Steve\\n\\nP.S. old.reddit.com isn’t going anywhere, and explicit content is *still* allowed on Reddit as long as it abides by our content policy. \\n\\nedit: formatting\", 'author': 'Yowzz', 'link_url': 'https://www.reddit.com/r/reddit/comments/145bram/addressing_the_community_about_changes_to_our_api/', 'link_permalink': 'https://www.reddit.com/r/reddit/comments/145bram/addressing_the_community_about_changes_to_our_api/', 'post_date': 1687315472.0, 'ups': 1, 'likes': None}\n" 20 | ] 21 | }, 22 | { 23 | "ename": "KeyboardInterrupt", 24 | "evalue": "", 25 | "output_type": "error", 26 | "traceback": [ 27 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 28 | "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", 29 | "Cell \u001B[0;32mIn[1], line 58\u001B[0m\n\u001B[1;32m 53\u001B[0m reddit \u001B[38;5;241m=\u001B[39m praw\u001B[38;5;241m.\u001B[39mReddit(client_id\u001B[38;5;241m=\u001B[39mCLIENT_ID,\n\u001B[1;32m 54\u001B[0m client_secret\u001B[38;5;241m=\u001B[39mSECRET_TOKEN,\n\u001B[1;32m 55\u001B[0m user_agent\u001B[38;5;241m=\u001B[39mUSER_AGENT)\n\u001B[1;32m 57\u001B[0m subreddit \u001B[38;5;241m=\u001B[39m reddit\u001B[38;5;241m.\u001B[39msubreddit(subred_name)\n\u001B[0;32m---> 58\u001B[0m \u001B[43mstream_json\u001B[49m\u001B[43m(\u001B[49m\u001B[43msubreddit\u001B[49m\u001B[43m)\u001B[49m\n", 30 | "Cell \u001B[0;32mIn[1], line 46\u001B[0m, in \u001B[0;36mstream_json\u001B[0;34m(subreddit)\u001B[0m\n\u001B[1;32m 34\u001B[0m my_object \u001B[38;5;241m=\u001B[39m 
{\n\u001B[1;32m 35\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcomment\u001B[39m\u001B[38;5;124m\"\u001B[39m: comment_body,\n\u001B[1;32m 36\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mprev_comment\u001B[39m\u001B[38;5;124m\"\u001B[39m: prev_body,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 43\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mlikes\u001B[39m\u001B[38;5;124m\"\u001B[39m: comment\u001B[38;5;241m.\u001B[39mlikes,\n\u001B[1;32m 44\u001B[0m }\n\u001B[1;32m 45\u001B[0m \u001B[38;5;66;03m# Send through socket\u001B[39;00m\n\u001B[0;32m---> 46\u001B[0m c, addr \u001B[38;5;241m=\u001B[39m \u001B[43ms\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43maccept\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 47\u001B[0m c\u001B[38;5;241m.\u001B[39msend(json\u001B[38;5;241m.\u001B[39mdumps(my_object)\u001B[38;5;241m.\u001B[39mencode(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mutf-8\u001B[39m\u001B[38;5;124m'\u001B[39m))\n\u001B[1;32m 48\u001B[0m c\u001B[38;5;241m.\u001B[39mclose()\n", 31 | "File \u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/socket.py:293\u001B[0m, in \u001B[0;36msocket.accept\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 286\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21maccept\u001B[39m(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 287\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124;03m\"\"\"accept() -> (socket object, address info)\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \n\u001B[1;32m 289\u001B[0m \u001B[38;5;124;03m Wait for an incoming connection. 
import json
import logging
import socket

import praw

# credentials.py defines CLIENT_ID / CLIENT_SECRET; the YOUR_CLIENT_ID /
# YOUR_CLIENT_SECRET names imported before do not exist there.
from credentials import CLIENT_ID, CLIENT_SECRET

logging.basicConfig(filename='stream_json_error.log', level=logging.ERROR)

SECRET_TOKEN = CLIENT_SECRET
USER_AGENT = 'MyBot/0.0.1'

host = "127.0.0.1"
port = 9999

subred_name = "reddit"

# Socket Prep
s = socket.socket()
# SO_REUSEADDR (as in reddit_producer) instead of SO_REUSEPORT, which is not
# defined on every platform.
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
s.bind((host, port))
print("Listening on port: %s" % str(port))
s.listen()


def _parent_body(comment):
    """Return the body of the comment that *comment* replies to, or ""."""
    # comment.parent() yields a Comment for replies and a Submission for
    # top-level comments; only a Comment has a body.
    parent = comment.parent()
    if isinstance(parent, praw.models.Comment):
        return parent.body
    return ""


def stream_json(subreddit):
    """Send every new comment of *subreddit* to one consumer as JSON lines.

    The consumer is accepted once on the module-level listening socket ``s``
    and then receives one newline-terminated JSON document per comment.
    Accepting (and closing) a connection per comment, as before, delivered a
    single message and then blocked in accept() waiting for a client that
    never reconnected.

    Args:
        subreddit: Subreddit whose comment stream is forwarded.

    Raises:
        OSError: If the consumer disconnects or the send fails.  The client
            connection is closed before the error propagates.
    """
    c, addr = s.accept()
    with c:
        for comment in subreddit.stream.comments():
            try:
                post = comment.submission
                my_object = {
                    "comment": comment.body,
                    "prev_comment": _parent_body(comment),
                    "post": post.selftext,
                    "author": str(comment.author),
                    "link_url": comment.link_url,
                    "link_permalink": comment.link_permalink,
                    # NOTE(review): this is the comment's timestamp although
                    # the key says post_date - confirm what the consumer
                    # expects before changing it.
                    "post_date": comment.created_utc,
                    "ups": comment.ups,
                    "likes": comment.likes,
                }
            except praw.exceptions.PRAWException as ex:
                # Errors used to be swallowed silently; record them in the
                # log file configured above.
                logging.error("Error while streaming comments: %s", ex)
                continue
            # Send through socket, newline-delimited so a line-based reader
            # can split the messages.
            c.sendall((json.dumps(my_object) + '\n').encode('utf-8'))
            print(f'Sent data: {my_object}')


reddit = praw.Reddit(client_id=CLIENT_ID,
                     client_secret=SECRET_TOKEN,
                     user_agent=USER_AGENT)

subreddit = reddit.subreddit(subred_name)
stream_json(subreddit)
-------------------------------------------------------------------------------- /reddit_consumer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## SparkStreaming Hackathon\n", 8 | "### Course: Real-time Data Analysis\n", 9 | "### Authors: Ruben Tak, Nils Jennissen, David Landeo\n", 10 | "This task involves setting up a data streaming pipeline to extract and process posts and comments from Reddit. The data will be structured and sent through a socket, then received and processed by another process. References to users, posts, and external sites will be extracted and counted, and the top 10 important words will be identified using TF-IDF. Optional features include sentiment analysis, additional metrics, saving results to a database, creating a Jupyter Notebook dashboard, and visualizing the results on a web page. The deliverables include Python code, instructions, output data files, and optional Docker setup." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "scrolled": true, 18 | "ExecuteTime": { 19 | "end_time": "2023-06-21T19:37:26.726664Z", 20 | "start_time": "2023-06-21T19:37:24.738614Z" 21 | } 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stderr", 26 | "output_type": "stream", 27 | "text": [ 28 | "23/06/21 21:37:26 WARN Utils: Your hostname, Nilss-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.108 instead (on interface en0)\n", 29 | "23/06/21 21:37:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 30 | "Exception in thread \"main\" java.lang.ExceptionInInitializerError\n", 31 | "\tat org.apache.spark.unsafe.array.ByteArrayMethods.(ByteArrayMethods.java:54)\n", 32 | "\tat org.apache.spark.internal.config.package$.(package.scala:1006)\n", 33 | "\tat org.apache.spark.internal.config.package$.(package.scala)\n", 34 | "\tat org.apache.spark.deploy.SparkSubmitArguments.$anonfun$loadEnvironmentArguments$3(SparkSubmitArguments.scala:157)\n", 35 | "\tat scala.Option.orElse(Option.scala:447)\n", 36 | "\tat org.apache.spark.deploy.SparkSubmitArguments.loadEnvironmentArguments(SparkSubmitArguments.scala:157)\n", 37 | "\tat org.apache.spark.deploy.SparkSubmitArguments.(SparkSubmitArguments.scala:115)\n", 38 | "\tat org.apache.spark.deploy.SparkSubmit$$anon$2$$anon$3.(SparkSubmit.scala:990)\n", 39 | "\tat org.apache.spark.deploy.SparkSubmit$$anon$2.parseArguments(SparkSubmit.scala:990)\n", 40 | "\tat org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:85)\n", 41 | "\tat org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1007)\n", 42 | "\tat org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1016)\n", 43 | "\tat org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)\n", 44 | "Caused by: java.lang.reflect.InaccessibleObjectException: Unable to make private java.nio.DirectByteBuffer(long,int) accessible: module java.base does not 
\"opens java.nio\" to unnamed module @15c89abd\n", 45 | "\tat java.base/java.lang.reflect.AccessibleObject.throwInaccessibleObjectException(AccessibleObject.java:387)\n", 46 | "\tat java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:363)\n", 47 | "\tat java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:311)\n", 48 | "\tat java.base/java.lang.reflect.Constructor.checkCanSetAccessible(Constructor.java:192)\n", 49 | "\tat java.base/java.lang.reflect.Constructor.setAccessible(Constructor.java:185)\n", 50 | "\tat org.apache.spark.unsafe.Platform.(Platform.java:56)\n", 51 | "\t... 13 more\n" 52 | ] 53 | }, 54 | { 55 | "ename": "Exception", 56 | "evalue": "Java gateway process exited before sending its port number", 57 | "output_type": "error", 58 | "traceback": [ 59 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 60 | "\u001B[0;31mException\u001B[0m Traceback (most recent call last)", 61 | "Cell \u001B[0;32mIn[1], line 163\u001B[0m\n\u001B[1;32m 153\u001B[0m schema \u001B[38;5;241m=\u001B[39m StructType([\n\u001B[1;32m 154\u001B[0m StructField(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtype\u001B[39m\u001B[38;5;124m\"\u001B[39m, StringType(), \u001B[38;5;28;01mTrue\u001B[39;00m),\n\u001B[1;32m 155\u001B[0m StructField(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m\"\u001B[39m, StringType(), \u001B[38;5;28;01mTrue\u001B[39;00m),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 159\u001B[0m StructField(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcreated_utc\u001B[39m\u001B[38;5;124m\"\u001B[39m, FloatType(), \u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m 160\u001B[0m ])\n\u001B[1;32m 162\u001B[0m \u001B[38;5;66;03m# Create a SparkSession\u001B[39;00m\n\u001B[0;32m--> 163\u001B[0m spark \u001B[38;5;241m=\u001B[39m 
\u001B[43mSparkSession\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbuilder\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mappName\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mreddit\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetOrCreate\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 165\u001B[0m \u001B[38;5;66;03m# Read the data from the socket as a streaming DataFrame\u001B[39;00m\n\u001B[1;32m 166\u001B[0m raw_data \u001B[38;5;241m=\u001B[39m spark\u001B[38;5;241m.\u001B[39mreadStream\u001B[38;5;241m.\u001B[39mformat(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124msocket\u001B[39m\u001B[38;5;124m\"\u001B[39m)\u001B[38;5;241m.\u001B[39moption(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhost\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mlocalhost\u001B[39m\u001B[38;5;124m\"\u001B[39m)\u001B[38;5;241m.\u001B[39moption(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mport\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;241m9999\u001B[39m)\u001B[38;5;241m.\u001B[39mload()\n", 62 | "File \u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/site-packages/pyspark/sql/session.py:186\u001B[0m, in \u001B[0;36mSparkSession.Builder.getOrCreate\u001B[0;34m(self)\u001B[0m\n\u001B[1;32m 184\u001B[0m sparkConf\u001B[38;5;241m.\u001B[39mset(key, value)\n\u001B[1;32m 185\u001B[0m \u001B[38;5;66;03m# This SparkContext may be an existing one.\u001B[39;00m\n\u001B[0;32m--> 186\u001B[0m sc \u001B[38;5;241m=\u001B[39m \u001B[43mSparkContext\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mgetOrCreate\u001B[49m\u001B[43m(\u001B[49m\u001B[43msparkConf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 187\u001B[0m \u001B[38;5;66;03m# Do not update `SparkConf` for existing `SparkContext`, as it's shared\u001B[39;00m\n\u001B[1;32m 188\u001B[0m \u001B[38;5;66;03m# by all sessions.\u001B[39;00m\n\u001B[1;32m 
189\u001B[0m session \u001B[38;5;241m=\u001B[39m SparkSession(sc)\n", 63 | "File \u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/site-packages/pyspark/context.py:378\u001B[0m, in \u001B[0;36mSparkContext.getOrCreate\u001B[0;34m(cls, conf)\u001B[0m\n\u001B[1;32m 376\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m SparkContext\u001B[38;5;241m.\u001B[39m_lock:\n\u001B[1;32m 377\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m SparkContext\u001B[38;5;241m.\u001B[39m_active_spark_context \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m--> 378\u001B[0m \u001B[43mSparkContext\u001B[49m\u001B[43m(\u001B[49m\u001B[43mconf\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mconf\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;129;43;01mor\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mSparkConf\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 379\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m SparkContext\u001B[38;5;241m.\u001B[39m_active_spark_context\n", 64 | "File \u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/site-packages/pyspark/context.py:133\u001B[0m, in \u001B[0;36mSparkContext.__init__\u001B[0;34m(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)\u001B[0m\n\u001B[1;32m 128\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m gateway \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m \u001B[38;5;129;01mand\u001B[39;00m gateway\u001B[38;5;241m.\u001B[39mgateway_parameters\u001B[38;5;241m.\u001B[39mauth_token \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m 129\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[1;32m 130\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mYou are trying to pass an insecure Py4j gateway to Spark. 
This\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[1;32m 131\u001B[0m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m is not allowed as it is a security risk.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m--> 133\u001B[0m \u001B[43mSparkContext\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_ensure_initialized\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgateway\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mgateway\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mconf\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mconf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 134\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m 135\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,\n\u001B[1;32m 136\u001B[0m conf, jsc, profiler_cls)\n", 65 | "File \u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/site-packages/pyspark/context.py:327\u001B[0m, in \u001B[0;36mSparkContext._ensure_initialized\u001B[0;34m(cls, instance, gateway, conf)\u001B[0m\n\u001B[1;32m 325\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m SparkContext\u001B[38;5;241m.\u001B[39m_lock:\n\u001B[1;32m 326\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m SparkContext\u001B[38;5;241m.\u001B[39m_gateway:\n\u001B[0;32m--> 327\u001B[0m SparkContext\u001B[38;5;241m.\u001B[39m_gateway \u001B[38;5;241m=\u001B[39m gateway \u001B[38;5;129;01mor\u001B[39;00m \u001B[43mlaunch_gateway\u001B[49m\u001B[43m(\u001B[49m\u001B[43mconf\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 328\u001B[0m SparkContext\u001B[38;5;241m.\u001B[39m_jvm \u001B[38;5;241m=\u001B[39m SparkContext\u001B[38;5;241m.\u001B[39m_gateway\u001B[38;5;241m.\u001B[39mjvm\n\u001B[1;32m 330\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m instance:\n", 66 | "File 
\u001B[0;32m~/miniforge3/envs/spark-streaming-hackathon/lib/python3.10/site-packages/pyspark/java_gateway.py:105\u001B[0m, in \u001B[0;36mlaunch_gateway\u001B[0;34m(conf, popen_kwargs)\u001B[0m\n\u001B[1;32m 102\u001B[0m time\u001B[38;5;241m.\u001B[39msleep(\u001B[38;5;241m0.1\u001B[39m)\n\u001B[1;32m 104\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m os\u001B[38;5;241m.\u001B[39mpath\u001B[38;5;241m.\u001B[39misfile(conn_info_file):\n\u001B[0;32m--> 105\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mJava gateway process exited before sending its port number\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 107\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(conn_info_file, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mrb\u001B[39m\u001B[38;5;124m\"\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m info:\n\u001B[1;32m 108\u001B[0m gateway_port \u001B[38;5;241m=\u001B[39m read_int(info)\n", 67 | "\u001B[0;31mException\u001B[0m: Java gateway process exited before sending its port number" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "from pyspark.sql import SparkSession\n", 73 | "from pyspark.sql.types import StructType, StructField, StringType, FloatType\n", 74 | "from pyspark.sql.functions import from_json, regexp_extract, split, window, count, min, max, avg, udf\n", 75 | "from pyspark.ml.feature import CountVectorizer, IDF\n", 76 | "from textblob import TextBlob\n", 77 | "from pyspark.sql.functions import explode, to_date\n", 78 | "from pyspark.sql.types import TimestampType\n", 79 | "import pandas as pd\n", 80 | "from pyspark.sql.functions import pandas_udf, PandasUDFType\n", 81 | "from pyspark.sql.functions import from_unixtime\n", 82 | "from typing import Iterator\n", 83 | "import pandas as pd\n", 84 | "\n", 85 | "def preprocess_comment(df):\n", 86 | " return df.withColumn(\"words\", split(df[\"comment\"], \" \"))\n", 87 | 
"\n", 88 | "def calculate_tfidf(df):\n", 89 | " if df.rdd.isEmpty():\n", 90 | " return None, None, None\n", 91 | "\n", 92 | " vectorizer = CountVectorizer(inputCol=\"words\", outputCol=\"raw_features\")\n", 93 | " vectorizer_model = vectorizer.fit(df)\n", 94 | " count_vectorized = vectorizer_model.transform(df)\n", 95 | "\n", 96 | " idf = IDF(inputCol=\"raw_features\", outputCol=\"features\")\n", 97 | " idf_model = idf.fit(count_vectorized)\n", 98 | " tfidf = idf_model.transform(count_vectorized)\n", 99 | "\n", 100 | " return tfidf, vectorizer_model, idf_model\n", 101 | "\n", 102 | "def extract_references(df):\n", 103 | " return df.select(\"comment\", \"prev_comment\", \"post\", \"created_utc\",\n", 104 | " regexp_extract(\"comment\", '/u/([^\\\\s/]+)', 1).alias('user_reference'),\n", 105 | " regexp_extract(\"comment\", '/r/([^\\\\s/]+)', 1).alias('post_reference'),\n", 106 | " regexp_extract(\"comment\", 'http[s]?://([^\\\\s/]+)', 1).alias('site_reference'))\n", 107 | "\n", 108 | "def calculate_time_range(df):\n", 109 | " # Filter rows with non-null values in the created_utc column\n", 110 | " df = df.filter(df[\"created_utc\"].isNotNull())\n", 111 | "\n", 112 | " if df.rdd.isEmpty():\n", 113 | " return None, None\n", 114 | "\n", 115 | " time_range = df.agg(min(\"created_utc\").alias(\"min_date\"), max(\"created_utc\").alias(\"max_date\")).collect()[0]\n", 116 | " return time_range[\"min_date\"], time_range[\"max_date\"]\n", 117 | "\n", 118 | "\n", 119 | "def calculate_sentiment(df):\n", 120 | " if df.rdd.isEmpty():\n", 121 | " return df\n", 122 | "\n", 123 | " @pandas_udf(FloatType())\n", 124 | " def sentiment_score(series: pd.Series) -> pd.Series:\n", 125 | " return series.apply(lambda text: TextBlob(text).sentiment.polarity)\n", 126 | "\n", 127 | " df = df.withColumn(\"sentiment\", sentiment_score(df[\"comment\"]))\n", 128 | " return df\n", 129 | "\n", 130 | "def most_common_words(df, n=10):\n", 131 | " words_df = df.select(explode(split(df[\"post\"], \" 
\")).alias(\"word\"))\n", 132 | " word_counts = words_df.groupBy(\"word\").agg(count(\"*\").alias(\"count\")).orderBy(\"count\", ascending=False)\n", 133 | " return word_counts.limit(n)\n", 134 | "\n", 135 | "def process_batch(df, epoch_id):\n", 136 | " if df.rdd.isEmpty():\n", 137 | " return\n", 138 | "\n", 139 | " # Preprocess comments\n", 140 | " preprocessed_comments = preprocess_comment(df)\n", 141 | "\n", 142 | " # Print schema and sample data for debugging\n", 143 | " preprocessed_comments.printSchema()\n", 144 | " preprocessed_comments.show(5)\n", 145 | "\n", 146 | " # Calculate TF-IDF\n", 147 | " tfidf, vectorizer_model, idf_model = calculate_tfidf(preprocessed_comments)\n", 148 | "\n", 149 | " if vectorizer_model and idf_model:\n", 150 | " # Get top 10 important words\n", 151 | " vocab = vectorizer_model.vocabulary\n", 152 | " top_10_words = idf_model.idf.toArray().argsort()[-10:]\n", 153 | " top_10_words = [vocab[idx] for idx in top_10_words]\n", 154 | " if top_10_words:\n", 155 | " print(\"Top 10 important words:\")\n", 156 | " print(top_10_words)\n", 157 | "\n", 158 | " # Extract references from the data\n", 159 | " references_df = extract_references(preprocessed_comments)\n", 160 | "\n", 161 | " # Save the raw data to a temporary table in Spark\n", 162 | " preprocessed_comments.createOrReplaceTempView(\"raw\")\n", 163 | "\n", 164 | " # Save the raw data to disk\n", 165 | " preprocessed_comments.write.json(\"output/raw\", mode=\"append\")\n", 166 | "\n", 167 | " # Calculate occurrences of references\n", 168 | " user_ref_counts = references_df.groupBy(window(\"created_utc\", \"60 seconds\", \"5 seconds\"), \"user_reference\").agg(count(\"*\").alias(\"count\")).orderBy(\"window\", \"count\", ascending=False)\n", 169 | " post_ref_counts = references_df.groupBy(window(\"created_utc\", \"60 seconds\", \"5 seconds\"), \"post_reference\").agg(count(\"*\").alias(\"count\")).orderBy(\"window\", \"count\", ascending=False)\n", 170 | " site_ref_counts = 
references_df.groupBy(window(\"created_utc\", \"60 seconds\", \"5 seconds\"), \"site_reference\").agg(count(\"*\").alias(\"count\")).orderBy(\"window\", \"count\", ascending=False)\n", 171 | "\n", 172 | " # Print the occurrences if not empty\n", 173 | " if not user_ref_counts.rdd.isEmpty():\n", 174 | " print(\"User references:\")\n", 175 | " user_ref_counts.show()\n", 176 | " if not post_ref_counts.rdd.isEmpty():\n", 177 | " print(\"Post references:\")\n", 178 | " post_ref_counts.show()\n", 179 | " if not site_ref_counts.rdd.isEmpty():\n", 180 | " print(\"Site references:\")\n", 181 | " site_ref_counts.show()\n", 182 | "\n", 183 | " # Calculate time range\n", 184 | " min_date, max_date = calculate_time_range(preprocessed_comments)\n", 185 | " if min_date and max_date:\n", 186 | " print(f\"Time range of the data: {min_date} - {max_date}\")\n", 187 | "\n", 188 | " # Calculate sentiment\n", 189 | " sentiment_df = calculate_sentiment(preprocessed_comments)\n", 190 | " if not sentiment_df.rdd.isEmpty():\n", 191 | " avg_sentiment = sentiment_df.agg(avg(\"sentiment\").alias(\"average_sentiment\")).collect()[0][\"average_sentiment\"]\n", 192 | " print(f\"Average sentiment: {avg_sentiment}\")\n", 193 | " else:\n", 194 | " print(\"No sentiment data available.\")\n", 195 | "\n", 196 | " # Calculate most common words\n", 197 | " common_words_df = most_common_words(preprocessed_comments)\n", 198 | " print(\"Most common words in post titles:\")\n", 199 | " common_words_df.show()\n", 200 | "\n", 201 | "def calculate_sentiment(df):\n", 202 | " if df.rdd.isEmpty():\n", 203 | " return df\n", 204 | "\n", 205 | " @pandas_udf(FloatType(), PandasUDFType.SCALAR)\n", 206 | " def sentiment_score(series: pd.Series) -> pd.Series:\n", 207 | " return series.apply(lambda text: TextBlob(text).sentiment.polarity)\n", 208 | "\n", 209 | " df = df.withColumn(\"sentiment\", sentiment_score(df[\"comment\"]))\n", 210 | " return df\n", 211 | "\n", 212 | "# Create a SparkSession\n", 213 | "spark = 
SparkSession.builder.appName(\"reddit\").getOrCreate()\n", 214 | "\n", 215 | "# Define the schema\n", 216 | "schema = StructType([\n", 217 | " StructField(\"comment\", StringType(), True),\n", 218 | " StructField(\"prev_comment\", StringType(), True),\n", 219 | " StructField(\"post\", StringType(), True),\n", 220 | " StructField(\"created_utc\", StringType(), True),\n", 221 | "])\n", 222 | "\n", 223 | "# Read the data from the socket as a streaming DataFrame\n", 224 | "raw_data = spark.readStream.format(\"socket\").option(\"host\", \"localhost\").option(\"port\", 9999).load()\n", 225 | "\n", 226 | "# Parse the JSON data and apply the schema\n", 227 | "parsed_data = raw_data.select(from_json(raw_data.value, schema).alias(\"data\")).select(\"data.*\")\n", 228 | "\n", 229 | "# Convert the created_utc field to TimestampType\n", 230 | "parsed_data = parsed_data.withColumn(\"created_utc\", from_unixtime(parsed_data[\"created_utc\"]).cast(TimestampType()))\n", 231 | "\n", 232 | "# Process each batch of data\n", 233 | "query = parsed_data.writeStream.foreachBatch(process_batch).start()\n", 234 | "\n", 235 | "query.awaitTermination()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "outputs": [], 242 | "source": [], 243 | "metadata": { 244 | "collapsed": false 245 | } 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3 (ipykernel)", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.11.4" 265 | }, 266 | "vscode": { 267 | "interpreter": { 268 | "hash": "1db35aecd77fcc020a8642668a5b7619c380276ef0ce04f324d75e02f2d7512b" 269 | } 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 4 274 | } 275 | 
-------------------------------------------------------------------------------- /sub_works.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "## SparkStreaming Hackathon\n", 7 | "### Course: Real-time Data Analysis\n", 8 | "### Authors: Ruben Tak, Nils Jennissen, David Landeo\n", 9 | "This task involves setting up a data streaming pipeline to extract and process posts and comments from Reddit. The data will be structured and sent through a socket, then received and processed by another process. References to users, posts, and external sites will be extracted and counted, and the top 10 important words will be identified using TF-IDF. Optional features include sentiment analysis, additional metrics, saving results to a database, creating a Jupyter Notebook dashboard, and visualizing the results on a web page. The deliverables include Python code, instructions, output data files, and optional Docker setup." 10 | ], 11 | "metadata": { 12 | "collapsed": false 13 | } 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "scrolled": true 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "/usr/local/spark/python/pyspark/streaming/context.py:72: FutureWarning: DStream is deprecated as of Spark 3.4.0. 
Migrate to Structured Streaming.\n", 27 | " warnings.warn(\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "from pyspark import SparkConf\n", 33 | "from pyspark.sql import SparkSession\n", 34 | "from pyspark.streaming import StreamingContext\n", 35 | "from pyspark.sql.types import StructType, StructField, StringType\n", 36 | "from pyspark.sql.functions import col, count, window, lit, desc, split, explode\n", 37 | "from pyspark.sql.types import ArrayType\n", 38 | "\n", 39 | "import json\n", 40 | "import re\n", 41 | "\n", 42 | "# Function to extract references to users, posts, and external sites from text\n", 43 | "def get_references(text):\n", 44 | " ''' This function extracts references to users, posts, and external sites from text.\n", 45 | " It returns a list of references.'''\n", 46 | " user_pattern = r'/u/(\\w+)'\n", 47 | " post_pattern = r'/r/(\\w+)'\n", 48 | " url_pattern = r'(https?://[^\\s]+)'\n", 49 | "\n", 50 | " references = []\n", 51 | "\n", 52 | " # Search for user references\n", 53 | " match = re.search(user_pattern, text)\n", 54 | " while match:\n", 55 | " references.append(match.group(1))\n", 56 | " text = text[:match.start()] + text[match.end():]\n", 57 | " match = re.search(user_pattern, text)\n", 58 | "\n", 59 | " # Search for post references\n", 60 | " match = re.search(post_pattern, text)\n", 61 | " while match:\n", 62 | " references.append(match.group(1))\n", 63 | " text = text[:match.start()] + text[match.end():]\n", 64 | " match = re.search(post_pattern, text)\n", 65 | "\n", 66 | " # Search for url references\n", 67 | " match = re.search(url_pattern, text)\n", 68 | " while match:\n", 69 | " references.append(match.group(1))\n", 70 | " text = text[:match.start()] + text[match.end():]\n", 71 | " match = re.search(url_pattern, text)\n", 72 | "\n", 73 | " return references\n", 74 | "\n", 75 | "# Create a SparkSession and StreamingContext\n", 76 | "spark_conf = SparkConf().setAppName(\"reddit\")\n", 77 | "spark_session = 
SparkSession.builder.config(conf=spark_conf).getOrCreate()\n", 78 | "streaming_context = StreamingContext(spark_session.sparkContext, 5)\n", 79 | "\n", 80 | "# Create a DStream\n", 81 | "lines = streaming_context.socketTextStream(\"localhost\", 9999)\n", 82 | "comments = lines.map(lambda json_data: json.loads(json_data))\n", 83 | "\n", 84 | "# Define the schema for the DataFrame\n", 85 | "schema = StructType([\n", 86 | " StructField(\"comment\", StringType(), True),\n", 87 | " StructField(\"prev_comment\", StringType(), True),\n", 88 | " StructField(\"post\", StringType(), True),\n", 89 | " StructField(\"author\", StringType(), True),\n", 90 | " StructField(\"post_date\", StringType(), True)\n", 91 | "])\n", 92 | "\n", 93 | "# Update the base_path according to your desired output location\n", 94 | "base_path = \"./data/raw/reddit_v5\"\n", 95 | "\n", 96 | "# Convert each RDD in the DStream to a DataFrame and process it\n", 97 | "def process_rdd(time, rdd):\n", 98 | " ''' This function is applied to each RDD in the DStream.\n", 99 | " It converts the RDD to a DataFrame, splits each comment on references to users, posts, and external sites,\n", 100 | " defines a row count per 5-second window (with a 60-second watermark),\n", 101 | " and saves the processed data to disk.'''\n", 102 | " if not rdd.isEmpty():\n", 103 | " df = spark_session.createDataFrame(rdd, schema)\n", 104 | "\n", 105 | " # Split the comment text on references to users, posts, and external sites (refs holds the text between the references, not the references themselves)\n", 106 | " data_frame_with_refs = df.withColumn(\"refs\", split(col(\"comment\"), r'/u/(\\w+)|/r/(\\w+)|(https?://[^\\s]+)'))\n", 107 | "\n", 108 | " # Count rows with non-null refs per window. NOTE(review): this is a 5-second tumbling window with a 60-second watermark, not a 60-second window sliding every 5 seconds, and windowed_counts is never used below\n", 109 | " windowed_counts = data_frame_with_refs \\\n", 110 | " .withWatermark(\"post_date\", \"60 seconds\") \\\n", 111 | " .groupBy(window(\"post_date\", \"5 seconds\")) \\\n", 112 | " .agg(count(\"refs\").alias(\"reference_count\"))\n", 113 | "\n", 114 | " # Get top 10 words in window using word count\n", 115 | " 
word_count = data_frame_with_refs.selectExpr(\"explode(split(comment, ' ')) as word\") \\\n", 116 | " .groupBy(\"word\") \\\n", 117 | " .agg(count(\"*\").alias(\"count\")) \\\n", 118 | " .orderBy(desc(\"count\")) \\\n", 119 | " .limit(10)\n", 120 | "\n", 121 | " # Convert the top 10 words to a string representation\n", 122 | " top10_str = str(word_count.select(\"word\").rdd.flatMap(lambda x: x).collect())\n", 123 | "\n", 124 | " # Add the top10 words to the dataframe\n", 125 | " data_frame_with_refs = data_frame_with_refs.withColumn(\"top10\", lit(top10_str))\n", 126 | "\n", 127 | " # Get the time range of the data\n", 128 | " min_time = df.selectExpr(\"MIN(post_date)\").first()[0]\n", 129 | " max_time = df.selectExpr(\"MAX(post_date)\").first()[0]\n", 130 | "\n", 131 | " # Add the time, min_time, max_time to the dataframe\n", 132 | " data_frame_with_refs = data_frame_with_refs.withColumn(\"time\", lit(time))\n", 133 | " data_frame_with_refs = data_frame_with_refs.withColumn(\"min_time\", lit(min_time))\n", 134 | " data_frame_with_refs = data_frame_with_refs.withColumn(\"max_time\", lit(max_time))\n", 135 | "\n", 136 | " # Save the processed data to disk with folder names separated by hyphens\n", 137 | " output_path = f\"{base_path}/{time.strftime('%Y-%m-%d-%H-%M-%S')}\"\n", 138 | " data_frame_with_refs.write.json(output_path)\n", 139 | "\n", 140 | " # Print some information for verification\n", 141 | " # print(f\"Time: {time}, Data Range: {min_time} - {max_time}\")\n", 142 | " # Show the output\n", 143 | " data_frame_with_refs.show()\n", 144 | "\n", 145 | "comments.foreachRDD(process_rdd)\n", 146 | "\n", 147 | "# Start the streaming context\n", 148 | "streaming_context.start()\n", 149 | "# No streaming_context.awaitTermination() added here to make the cell non-blocking and to use other cells in parallel." 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 2, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "" 161 | ] 162 | }, 163 | "execution_count": 2, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | }, 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 172 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 173 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 174 | "|You're a complete...|We can always do ...|Dear redditors,\\n...|harley247|https://www.reddi...|https://www.reddi...|1.687317098E9| 1|[You're a complet...|['moron', 'dude',...|2023-06-21 19:45:55|1.687317098E9|1.687317098E9| null|\n", 175 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 176 | "\n", 177 | "+-------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+---------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 178 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 179 | 
"+-------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+---------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 180 | "|BOOOOO 🍅🍅🍅|We can always do ...|Dear redditors,\\n...|XxxGoldDustWomanxxX|https://www.reddi...|https://www.reddi...|1.687318709E9| 1|[BOOOOO 🍅🍅🍅]|['🍅🍅🍅', 'BOOOOO']|2023-06-21 19:46:30|1.687318709E9|1.687318709E9| null|\n", 181 | "+-------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+---------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 182 | "\n", 183 | "+----------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+------------+-----------------+-------------------+-------------+-------------+-----------------+\n", 184 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 185 | "+----------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+------------+-----------------+-------------------+-------------+-------------+-----------------+\n", 186 | "|3.7 now...|I love how it wen...|Dear redditors,\\n...|NoEngineering5990|https://www.reddi...|https://www.reddi...|1.687320159E9| 1|[3.7 now...]|['3.7', 'now...']|2023-06-21 19:47:10|1.687320159E9|1.687320159E9| null|\n", 187 | "+----------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+------------+-----------------+-------------------+-------------+-------------+-----------------+\n", 188 | "\n", 189 | 
"+--------------------+------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 190 | "| comment|prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 191 | "+--------------------+------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 192 | "|But it's raining ...| Go outside|Dear redditors,\\n...|NoEngineering5990|https://www.reddi...|https://www.reddi...|1.68732027E9| 1|[But it's raining...|['cant', 'without...|2023-06-21 19:47:35|1.68732027E9|1.68732027E9| null|\n", 193 | "+--------------------+------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 194 | "\n", 195 | "+-------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+-------+---------+-------------------+-------------+-------------+-----------------+\n", 196 | "|comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 197 | "+-------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+-------+---------+-------------------+-------------+-------------+-----------------+\n", 198 | "| Lmfao|https://www.reddi...|Dear redditors,\\n...|EnricoShapka|https://www.reddi...|https://www.reddi...|1.687320393E9| 1|[Lmfao]|['Lmfao']|2023-06-21 19:48:10|1.687320393E9|1.687320393E9| null|\n", 199 | 
"+-------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+-------+---------+-------------------+-------------+-------------+-----------------+\n", 200 | "\n", 201 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 202 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 203 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 204 | "|This is what happ...|Hey spez, \\n\\n\\n1...|Dear redditors,\\n...|NoEngineering5990|https://www.reddi...|https://www.reddi...|1.68732043E9| 1|[This is what hap...|['to', 'they', 'g...|2023-06-21 19:48:35|1.68732043E9|1.68732043E9| null|\n", 205 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 206 | "\n", 207 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 208 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 209 | 
"+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 210 | "|>shell of former ...|According to u/sp...|Dear redditors,\\n...|NoEngineering5990|https://www.reddi...|https://www.reddi...|1.687320518E9| 1|[>shell of former...|['>shell', 'forme...|2023-06-21 19:48:50|1.687320518E9|1.687320518E9| null|\n", 211 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 212 | "\n", 213 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 214 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 215 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 216 | "|/u/spez So you're...|We are following ...|Dear redditors,\\n...|Mega_Trix|https://www.reddi...|https://www.reddi...|1.687321627E9| 1|[, So you're ess...|['admitting', 'un...|2023-06-21 19:48:55|1.687321627E9|1.687321627E9| null|\n", 217 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 218 | "\n", 219 | 
"+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 220 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 221 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 222 | "|Why are you actin...|Lol. \\n\\nEverythi...|Dear redditors,\\n...|Sablemint|https://www.reddi...|https://www.reddi...|1.687321653E9| 1|[Why are you acti...|['you', \"isn't\", ...|2023-06-21 19:49:05|1.687321653E9|1.687321653E9| null|\n", 223 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 224 | "\n", 225 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 226 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 227 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 228 | "|I just tried it. ...|Hey. 
We actually ...|Dear redditors,\\n...|SupremeLisper|https://www.reddi...|https://www.reddi...|1.687321734E9| 1|[I just tried it....|['reddit', 'to', ...|2023-06-21 19:49:15|1.687321734E9|1.687321734E9| null|\n", 229 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 230 | "\n", 231 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 232 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 233 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 234 | "|Liar. I’m deletin...|I acknowledge it ...|Dear redditors,\\n...|VultureBoySam|https://www.reddi...|https://www.reddi...|1.687322266E9| 2|[Liar. 
I’m deleti...|['July', '1st', '...|2023-06-21 19:49:20|1.687322266E9|1.687322266E9| null|\n", 235 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 236 | "\n", 237 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 238 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 239 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 240 | "|Do you really thi...|sounds like you s...|Dear redditors,\\n...|NissaWalker|https://www.reddi...|https://www.reddi...|1.687322727E9| 1|[Do you really th...|['I', 'be', 'abou...|2023-06-21 19:49:25|1.687322727E9|1.687322727E9| null|\n", 241 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 242 | "\n", 243 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 244 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 245 | 
"+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 246 | "|Get a life you lo...|Messaging from In...|Dear redditors,\\n...|SovietSteve|https://www.reddi...|https://www.reddi...|1.687324002E9| 2|[Get a life you l...|['you', '‘Enemies...|2023-06-21 19:49:30|1.687324002E9|1.687324002E9| null|\n", 247 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 248 | "\n", 249 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 250 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 251 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 252 | "|You had a chance ...|We can always do ...|Dear redditors,\\n...|MessAdmin|https://www.reddi...|https://www.reddi...|1.687324453E9| 1|[You had a chance...|['to', 'feel?', '...|2023-06-21 19:49:40|1.687324453E9|1.687324453E9| null|\n", 253 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 254 | "\n", 255 | 
"+------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 256 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 257 | "+------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 258 | "|Bro dont even care|We can always do ...|Dear redditors,\\n...|berny1244|https://www.reddi...|https://www.reddi...|1.687325295E9| 1|[Bro dont even care]|['even', 'care', ...|2023-06-21 19:49:45|1.687325295E9|1.687325295E9| null|\n", 259 | "+------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 260 | "\n", 261 | "+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 262 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 263 | "+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 264 | "|Cut the shit, Spe...|We can always do ...|Dear redditors,\\n...|Chef_Boy_Hard_Dick|https://www.reddi...|https://www.reddi...|1.687326178E9| 1|[Cut the shit, 
Sp...|['to', 'Twitter?'...|2023-06-21 19:49:50|1.687326178E9|1.687326178E9| null|\n", 265 | "+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 266 | "\n", 267 | "+-------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 268 | "|comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 269 | "+-------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 270 | "| resign|We can always do ...|Dear redditors,\\n...|Zytharros|https://www.reddi...|https://www.reddi...|1.687326736E9| 1|[resign]|['resign']|2023-06-21 19:49:55|1.687326736E9|1.687326736E9| null|\n", 271 | "+-------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 272 | "\n", 273 | "+--------------------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 274 | "| comment|prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 275 | 
"+--------------------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 276 | "|just ban me alrea...| [removed]|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.687327671E9| 1|[just ban me alre...|['me', 'you', 'gi...|2023-06-21 19:50:05|1.687327671E9|1.687327671E9| null|\n", 277 | "+--------------------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 278 | "\n", 279 | "+-------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 280 | "|comment| prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 281 | "+-------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 282 | "| u/spez|just ban me alrea...|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.687327696E9| 1|[u/spez]|['u/spez']|2023-06-21 19:50:20|1.687327696E9|1.687327696E9| null|\n", 283 | "+-------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 284 | "\n", 285 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 286 | "|comment|prev_comment| post|author| link_url| 
link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 287 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 288 | "| u/spez| u/spez|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.687327707E9| 1|[u/spez]|['u/spez']|2023-06-21 19:50:50|1.687327707E9|1.687327707E9| null|\n", 289 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 290 | "\n", 291 | "+-------+------------+--------------------+------+--------------------+--------------------+------------+---+--------+----------+-------------------+------------+------------+-----------------+\n", 292 | "|comment|prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 293 | "+-------+------------+--------------------+------+--------------------+--------------------+------------+---+--------+----------+-------------------+------------+------------+-----------------+\n", 294 | "| u/spez| u/spez|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.68732772E9| 1|[u/spez]|['u/spez']|2023-06-21 19:51:15|1.68732772E9|1.68732772E9| null|\n", 295 | "+-------+------------+--------------------+------+--------------------+--------------------+------------+---+--------+----------+-------------------+------------+------------+-----------------+\n", 296 | "\n", 297 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 298 | "|comment|prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| 
max_time|average_sentiment|\n", 299 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 300 | "| u/spez| u/spez|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.687327729E9| 1|[u/spez]|['u/spez']|2023-06-21 19:51:40|1.687327729E9|1.687327729E9| null|\n", 301 | "+-------+------------+--------------------+------+--------------------+--------------------+-------------+---+--------+----------+-------------------+-------------+-------------+-----------------+\n", 302 | "\n", 303 | "+-----------------+------------+--------------------+------+--------------------+--------------------+-------------+---+-------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 304 | "| comment|prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 305 | "+-----------------+------------+--------------------+------+--------------------+--------------------+-------------+---+-------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 306 | "|u/spez is a dummy| u/spez|Dear redditors,\\n...|ITPoet|https://www.reddi...|https://www.reddi...|1.687327741E9| 1|[u/spez is a dummy]|['is', 'u/spez', ...|2023-06-21 19:52:05|1.687327741E9|1.687327741E9| null|\n", 307 | "+-----------------+------------+--------------------+------+--------------------+--------------------+-------------+---+-------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 308 | "\n", 309 | 
"+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 310 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 311 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 312 | "|maybe you should ...|sounds like you s...|Dear redditors,\\n...|Intelligent_Cress932|https://www.reddi...|https://www.reddi...|1.687329387E9| 3|[maybe you should...|['touch', 'you', ...|2023-06-21 19:52:40|1.687329387E9|1.687329387E9| null|\n", 313 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 314 | "\n", 315 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 316 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 317 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 318 | "|Amazing Ted Lasso...|\"Trent Crimm. 
The...|Dear redditors,\\n...|Lunaticultistt|https://www.reddi...|https://www.reddi...|1.687329846E9| 1|[Amazing Ted Lass...|['Amazing', 'Lass...|2023-06-21 19:53:10|1.687329846E9|1.687329846E9| null|\n", 319 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 320 | "\n", 321 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 322 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 323 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 324 | "|This is a 7 year ...|But that’s the po...|Dear redditors,\\n...|EvilStevilTheKenevil|https://www.reddi...|https://www.reddi...|1.687334704E9| 2|[This is a 7 year...|['is', 'a', '7', ...|2023-06-21 19:53:55|1.687334704E9|1.687334704E9| null|\n", 325 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 326 | "\n", 327 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 328 | "| comment| prev_comment| post| author| 
link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 329 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 330 | "|i'm pretty sure n...|Stop giving reddi...|Dear redditors,\\n...|Separate_Feedback862|https://www.reddi...|https://www.reddi...|1.687337121E9| 1|[i'm pretty sure ...|['credit.', 'sure...|2023-06-21 19:54:30|1.687337121E9|1.687337121E9| null|\n", 331 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 332 | "\n", 333 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 334 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 335 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 336 | "|What are you doin...|sounds like you s...|Dear redditors,\\n...|Separate_Feedback862|https://www.reddi...|https://www.reddi...|1.687337262E9| 1|[What are you doi...|['who', 'seen', '...|2023-06-21 19:54:55|1.687337262E9|1.687337262E9| null|\n", 337 | 
"+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 338 | "\n", 339 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 340 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 341 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 342 | "|I’m kinda confuse...|Almost all of thi...|Dear redditors,\\n...|mushroomboie|https://www.reddi...|https://www.reddi...|1.687337902E9| 1|[I’m kinda confus...|['is', 'from', 'B...|2023-06-21 19:55:25|1.687337902E9|1.687337902E9| null|\n", 343 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 344 | "\n", 345 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 346 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 347 | 
"+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 348 | "|yep otherwise how...|just stop i want ...|Dear redditors,\\n...|Separate_Feedback862|https://www.reddi...|https://www.reddi...|1.687337948E9| 1|[yep otherwise ho...|['will', 'you', '...|2023-06-21 19:55:50|1.687337948E9|1.687337948E9| null|\n", 349 | "+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 350 | "\n", 351 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 352 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 353 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 354 | "|How does this cha...|According to u/sp...|Dear redditors,\\n...|mushroomboie|https://www.reddi...|https://www.reddi...|1.687338064E9| 1|[How does this ch...|['this', 'affect'...|2023-06-21 19:56:15|1.687338064E9|1.687338064E9| null|\n", 355 | "+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 356 | "\n", 357 | 
"+--------------------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 358 | "| comment| prev_comment| post|author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 359 | "+--------------------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 360 | "|Okay, but here's ...|His “joke” is the...|Dear redditors,\\n...| None|https://www.reddi...|https://www.reddi...|1.687338953E9| 1|[Okay, but here's...|[\"here's\", 'Okay,...|2023-06-21 19:56:40|1.687338953E9|1.687338953E9| null|\n", 361 | "+--------------------+--------------------+--------------------+------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 362 | "\n", 363 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 364 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 365 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 366 | "|yall should serio...|Apologies for the...|Dear redditors,\\n...|PontusGW_|https://www.reddi...|https://www.reddi...|1.687339642E9| 1|[yall should seri...|['💀💀💀', 'your'...|2023-06-21 
19:57:10|1.687339642E9|1.687339642E9| null|\n", 367 | "+--------------------+--------------------+--------------------+---------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 368 | "\n", 369 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 370 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 371 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 372 | "|wow you really ha...|You're not entitl...|Dear redditors,\\n...|Cool_Hedgehog8390|https://www.reddi...|https://www.reddi...|1.687341113E9| 1|[wow you really h...|['', 'you', 'to',...|2023-06-21 19:57:35|1.687341113E9|1.687341113E9| null|\n", 373 | "+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 374 | "\n", 375 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 376 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 377 | 
"+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 378 | "|> available via H...|The thing that ma...|Dear redditors,\\n...|trycatchebola|https://www.reddi...|https://www.reddi...|1.68734163E9| 1|[> available via ...|['is', 'the', 'wi...|2023-06-21 19:58:00|1.68734163E9|1.68734163E9| null|\n", 379 | "+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 380 | "\n", 381 | "+---------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+-----------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 382 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 383 | "+---------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+-----------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 384 | "|Let Reddit die!|**I am the develo...|Dear redditors,\\n...|Educational-Ad-8491|https://www.reddi...|https://www.reddi...|1.687342145E9| 1|[Let Reddit die!]|['Reddit', 'die!'...|2023-06-21 19:58:30|1.687342145E9|1.687342145E9| null|\n", 385 | "+---------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+-----------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 386 | "\n", 387 | 
"+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 388 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 389 | "+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 390 | "|He got pissed alr...|We can always do ...|Dear redditors,\\n...|YanameeUwU|https://www.reddi...|https://www.reddi...|1.687342243E9| 1|[He got pissed al...|['more', 'less', ...|2023-06-21 19:58:55|1.687342243E9|1.687342243E9| null|\n", 391 | "+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 392 | "\n", 393 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 394 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 395 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 396 | "|Get fucked you fu...|We can always do ...|Dear redditors,\\n...|too_much_to_do|https://www.reddi...|https://www.reddi...|1.687344268E9| 1|[Get fucked you 
f...|['you', 'donut.',...|2023-06-21 19:59:15|1.687344268E9|1.687344268E9| null|\n", 397 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 398 | "\n", 399 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 400 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 401 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 402 | "|It’s Reddit man.....|>How do you addre...|Dear redditors,\\n...|Mammoth-Vast4598|https://www.reddi...|https://www.reddi...|1.687344838E9| 1|[It’s Reddit man....|['It’s', 'take', ...|2023-06-21 19:59:20|1.687344838E9|1.687344838E9| null|\n", 403 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 404 | "\n", 405 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 406 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 407 | 
"+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 408 | "|That’s umm… not w...|And here it is: h...|Dear redditors,\\n...|Ok-Opportunity-9915|https://www.reddi...|https://www.reddi...|1.687348491E9| 1|[That’s umm… not ...|['talking', 'were...|2023-06-21 19:59:25|1.687348491E9|1.687348491E9| null|\n", 409 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 410 | "\n", 411 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 412 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 413 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 414 | "|Because they were...|Why are you assum...|Dear redditors,\\n...|Ok-Opportunity-9915|https://www.reddi...|https://www.reddi...|1.68734895E9| 1|[Because they wer...|['I', 'this', 'If...|2023-06-21 19:59:35|1.68734895E9|1.68734895E9| null|\n", 415 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 416 | "\n", 417 | 
"+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 418 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 419 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 420 | "|no, it’s how Redd...|EXACTLY THAT\"S RA...|Dear redditors,\\n...|Ok-Opportunity-9915|https://www.reddi...|https://www.reddi...|1.687349347E9| 1|[no, it’s how Red...|['how', 'it’s', '...|2023-06-21 19:59:40|1.687349347E9|1.687349347E9| null|\n", 421 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 422 | "\n", 423 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 424 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 425 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 426 | "|The level of delu...|Lol. 
\\n\\nEverythi...|Dear redditors,\\n...|Ok-Opportunity-9915|https://www.reddi...|https://www.reddi...|1.687349791E9| 2|[The level of del...|['one', 'in', 'of...|2023-06-21 19:59:45|1.687349791E9|1.687349791E9| null|\n", 427 | "+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 428 | "\n", 429 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 430 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 431 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 432 | "|I'm just glad a l...|I've attempted to...|Dear redditors,\\n...|activeXdiamond|https://www.reddi...|https://www.reddi...|1.687350558E9| 1|[I'm just glad a ...|['the', 'of', 'a'...|2023-06-21 19:59:50|1.687350558E9|1.687350558E9| null|\n", 433 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 434 | "\n", 435 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 436 | "| comment| prev_comment| post| author| link_url| 
link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 437 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 438 | "|What makes this e...|Same - the second...|Dear redditors,\\n...|activeXdiamond|https://www.reddi...|https://www.reddi...|1.687350702E9| 2|[What makes this ...|['the', 'that', '...|2023-06-21 20:00:00|1.687350702E9|1.687350702E9| null|\n", 439 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 440 | "\n", 441 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 442 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 443 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 444 | "|While also taking...|Same. 
95% of what...|Dear redditors,\\n...|activeXdiamond|https://www.reddi...|https://www.reddi...|1.687350823E9| 1|[While also takin...|['taking', 'you',...|2023-06-21 20:00:05|1.687350823E9|1.687350823E9| null|\n", 445 | "+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 446 | "\n", 447 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 448 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 449 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 450 | "|He's correct thou...|Get a life you lo...|Dear redditors,\\n...|epicboi1337|https://www.reddi...|https://www.reddi...|1.68735093E9| 1|[He's correct tho...|['has', 'to', 'pu...|2023-06-21 20:00:20|1.68735093E9|1.68735093E9| null|\n", 451 | "+--------------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+---+--------------------+--------------------+-------------------+------------+------------+-----------------+\n", 452 | "\n", 453 | "+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+------+--------+-------------------+-------------+-------------+-----------------+\n", 454 | "|comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| 
max_time|average_sentiment|\n", 455 | "+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+------+--------+-------------------+-------------+-------------+-----------------+\n", 456 | "| lmao|I can't believe I...|Dear redditors,\\n...|Right-Huckleberry574|https://www.reddi...|https://www.reddi...|1.687351348E9| 1|[lmao]|['lmao']|2023-06-21 20:00:45|1.687351348E9|1.687351348E9| null|\n", 457 | "+-------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+---+------+--------+-------------------+-------------+-------------+-----------------+\n", 458 | "\n", 459 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 460 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 461 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 462 | "|>Go point out in ...|I've already prov...|Hi everyone, I’m ...|DefendSection230|https://www.reddi...|https://www.reddi...|1.687351691E9| 1|[>Go point out in...|['the', 'to', 'ri...|2023-06-21 20:01:10|1.687351691E9|1.687351691E9| null|\n", 463 | "+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 464 | "\n", 465 | 
"+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 466 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 467 | "+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 468 | "|Agreed. Investors...|Ah no, they'll in...|Dear redditors,\\n...|noobatious|https://www.reddi...|https://www.reddi...|1.687353192E9| 1|[Agreed. Investor...|['in', 'are', 'wh...|2023-06-21 20:01:35|1.687353192E9|1.687353192E9| null|\n", 469 | "+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 470 | "\n", 471 | "+--------------------+--------------------+--------------------+--------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 472 | "| comment| prev_comment| post| author| link_url| link_permalink| post_date|ups| refs| top10| time| min_time| max_time|average_sentiment|\n", 473 | "+--------------------+--------------------+--------------------+--------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 474 | "|The best thing th...|We can always do ...|Dear redditors,\\n...|i_J3ff1n|https://www.reddi...|https://www.reddi...|1.687353621E9| 1|[The best thing t...|['still', 
'accoun...|2023-06-21 20:02:15|1.687353621E9|1.687353621E9| null|\n", 475 | "+--------------------+--------------------+--------------------+--------+--------------------+--------------------+-------------+---+--------------------+--------------------+-------------------+-------------+-------------+-----------------+\n", 476 | "\n" 477 | ] 478 | } 479 | ], 480 | "source": [ 481 | "from pyspark import SparkConf\n", 482 | "from pyspark.sql import SparkSession\n", 483 | "from pyspark.sql.types import StructType, StructField, StringType\n", 484 | "from pyspark.sql.functions import col, from_unixtime\n", 485 | "from pyspark.sql.types import TimestampType\n", 486 | "from pyspark.sql import functions as F\n", 487 | "from pyspark.sql.window import Window\n", 488 | "\n", 489 | "host = \"localhost\"\n", 490 | "port = 9999\n", 491 | "\n", 492 | "# Create a SparkSession\n", 493 | "spark_conf = SparkConf().setAppName(\"reddit\")\n", 494 | "spark_session = SparkSession.builder.config(conf=spark_conf).getOrCreate()\n", 495 | "\n", 496 | "# Update the input_path according to the location in the first code chunk\n", 497 | "input_path = \"./data/raw/reddit_v5/*/*.json\"\n", 498 | "\n", 499 | "schema = StructType([\n", 500 | " StructField(\"comment\", StringType(), True),\n", 501 | " StructField(\"prev_comment\", StringType(), True),\n", 502 | " StructField(\"post\", StringType(), True),\n", 503 | " StructField(\"author\", StringType(), True),\n", 504 | " StructField(\"post_date\", StringType(), True),\n", 505 | " StructField(\"top10\", StringType(), True),\n", 506 | " StructField(\"time\", StringType(), True),\n", 507 | " StructField(\"min_time\", StringType(), True),\n", 508 | " StructField(\"max_time\", StringType(), True)\n", 509 | "])\n", 510 | "\n", 511 | "# Read the JSON data saved by the first code chunk\n", 512 | "streaming_df = spark_session.readStream \\\n", 513 | " .format(\"json\") \\\n", 514 | " .schema(schema) \\\n", 515 | " .option(\"path\", input_path) \\\n", 516 | " 
.load()\n", 517 | "\n", 518 | "# Perform transformations on the data\n", 519 | "transformed_df = streaming_df \\\n", 520 | " .withColumn('second_post_date', col('post_date').cast(\"float\")) \\\n", 521 | " .withColumn('third_post_date', col('second_post_date').cast(\"int\")) \\\n", 522 | " .withColumn('post_date_ts', from_unixtime(col('third_post_date')).cast(TimestampType())) \\\n", 523 | " .withColumn('min_time2', col('min_time').cast(\"float\")) \\\n", 524 | " .withColumn('min_time3', col('min_time2').cast(\"int\")) \\\n", 525 | " .withColumn('min_time_ts', from_unixtime(col('min_time3')).cast(TimestampType())) \\\n", 526 | " .withColumn('max_time2', col('max_time').cast(\"float\")) \\\n", 527 | " .withColumn('max_time3', col('max_time2').cast(\"int\")) \\\n", 528 | " .withColumn('max_time_ts', from_unixtime(col('max_time3')).cast(TimestampType())) \\\n", 529 | " .drop('post_date', 'min_time', 'max_time', 'second_post_date', 'third_post_date', 'min_time2', 'min_time3', 'max_time2', 'max_time3')\n", 530 | "\n", 531 | "# Save the transformed data to disk\n", 532 | "# Update the output_path and checkpt_path each time you rerun this cell\n", 533 | "output_path = \"./data/processed/reddit_v1\"\n", 534 | "checkpt_path = \"./metadata/processed/reddit_v1\"\n", 535 | "\n", 536 | "transformed_df.writeStream \\\n", 537 | " .format(\"json\") \\\n", 538 | " .option(\"checkpointLocation\", checkpt_path) \\\n", 539 | " .option(\"path\", output_path) \\\n", 540 | " .outputMode(\"append\") \\\n", 541 | " .start()" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [] 550 | } 551 | ], 552 | "metadata": { 553 | "kernelspec": { 554 | "display_name": "Python 3 (ipykernel)", 555 | "language": "python", 556 | "name": "python3" 557 | }, 558 | "language_info": { 559 | "codemirror_mode": { 560 | "name": "ipython", 561 | "version": 3 562 | }, 563 | "file_extension": ".py", 564 | "mimetype": 
"text/x-python", 565 | "name": "python", 566 | "nbconvert_exporter": "python", 567 | "pygments_lexer": "ipython3", 568 | "version": "3.11.4" 569 | }, 570 | "vscode": { 571 | "interpreter": { 572 | "hash": "1db35aecd77fcc020a8642668a5b7619c380276ef0ce04f324d75e02f2d7512b" 573 | } 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 4 578 | } 579 | --------------------------------------------------------------------------------