├── README.md ├── hadoop_install.ipynb ├── hadoop_mapreduce.ipynb ├── map_reduce.ipynb ├── mapper.py └── reducer.py /README.md: -------------------------------------------------------------------------------- 1 | # Hadoop 2 | ### This contains how to install Hadoop on Google Colab and how to run MapReduce in Hadoop. 3 | 4 | ## 1. Hadoop_install.ipynb 5 | #### This file contains the Hadoop installation on Google Colab. 6 | Steps in the Hadoop installation: 7 | * 1. Installing Hadoop using this link: https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz 8 | * 2. Configuring Hadoop’s Java Home: Hadoop requires that you set the path to Java, either as an environment variable or in the Hadoop configuration file. 9 | * 3. Running Hadoop 10 | ## 2. map_reduce.ipynb 11 | Upload mapper.py and reducer.py. 12 | ### mapper.py 13 | It will read data from STDIN, split it into words, and output a list of lines mapping words to their counts to STDOUT. 14 | 15 | ### reducer.py 16 | It will read the results of mapper.py from STDIN, sum the occurrences of each word to a final count, and then output its results to STDOUT. 17 | -------------------------------------------------------------------------------- /hadoop_install.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "hadoop_install.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMNzJCDbblLt/1SgwmYCB+3" 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "myPIGP-mwKBD", 20 | "colab_type": "text" 21 | }, 22 | "source": [ 23 | "# Hadoop Installation Part\n", 24 | "Hadoop is a Java-based programming framework that supports the processing and storage of extremely large datasets on a cluster of inexpensive machines. It was the first major open source project in the big data playing field and is sponsored by the Apache Software Foundation." 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "j9bT9M1yvyXG", 31 | "colab_type": "text" 32 | }, 33 | "source": [ 34 | "## Step 1: Installing Hadoop" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "metadata": { 40 | "id": "bijZAdD_cBMK", 41 | "colab_type": "code", 42 | "colab": { 43 | "base_uri": "https://localhost:8080/", 44 | "height": 204 45 | }, 46 | "outputId": "989d7a35-005b-4014-a0cf-ff458ddd46d7" 47 | }, 48 | "source": [ 49 | "!wget https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz\n" 50 | ], 51 | "execution_count": 1, 52 | "outputs": [ 53 | { 54 | "output_type": "stream", 55 | "text": [ 56 | "--2020-09-10 05:21:56-- https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz\n", 57 | "Resolving downloads.apache.org (downloads.apache.org)... 88.99.95.219, 2a01:4f8:10a:201a::2\n", 58 | "Connecting to downloads.apache.org (downloads.apache.org)|88.99.95.219|:443... connected.\n", 59 | "HTTP request sent, awaiting response... 
200 OK\n", 60 | "Length: 500749234 (478M) [application/x-gzip]\n", 61 | "Saving to: ‘hadoop-3.3.0.tar.gz.1’\n", 62 | "\n", 63 | "hadoop-3.3.0.tar.gz 100%[===================>] 477.55M 10.4MB/s in 47s \n", 64 | "\n", 65 | "2020-09-10 05:22:44 (10.1 MB/s) - ‘hadoop-3.3.0.tar.gz.1’ saved [500749234/500749234]\n", 66 | "\n" 67 | ], 68 | "name": "stdout" 69 | } 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "colab_type": "text", 76 | "id": "Mj40txsTw6DZ" 77 | }, 78 | "source": [ 79 | "we’ll use the tar command with the -x flag to extract, -z to uncompress, -v for verbose output, and -f to specify that we’re extracting from a file" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "id": "nVce513-cBHm", 86 | "colab_type": "code", 87 | "colab": {} 88 | }, 89 | "source": [ 90 | "!tar -xzvf hadoop-3.3.0.tar.gz" 91 | ], 92 | "execution_count": null, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "metadata": { 98 | "id": "JF-ze-YOdync", 99 | "colab_type": "code", 100 | "colab": {} 101 | }, 102 | "source": [ 103 | "#copy hadoop file to user/local\n", 104 | "!cp -r hadoop-3.3.0/ /usr/local/" 105 | ], 106 | "execution_count": 3, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "id": "Vh6Dqbbrwqpe", 113 | "colab_type": "text" 114 | }, 115 | "source": [ 116 | "## Step2:Configuring Hadoop’s Java Home\n", 117 | "Hadoop requires that you set the path to Java, either as an environment variable or in the Hadoop configuration file." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "metadata": { 123 | "id": "_OUc19ZtcBG5", 124 | "colab_type": "code", 125 | "colab": { 126 | "base_uri": "https://localhost:8080/", 127 | "height": 34 128 | }, 129 | "outputId": "a3821595-1aa0-4f19-a6df-05c7f74c1ae1" 130 | }, 131 | "source": [ 132 | "#To find the default Java path\n", 133 | "!readlink -f /usr/bin/java | sed \"s:bin/java::\"" 134 | ], 135 | "execution_count": 4, 136 | "outputs": [ 137 | { 138 | "output_type": "stream", 139 | "text": [ 140 | "/usr/lib/jvm/java-11-openjdk-amd64/\n" 141 | ], 142 | "name": "stdout" 143 | } 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "id": "MxaPBWRKxXta", 150 | "colab_type": "text" 151 | }, 152 | "source": [ 153 | "To set java path, go to /usr/local/hadoop-3.3.0/etc/hadoop/hadoop-env.sh then\n", 154 | "\n", 155 | ". . .\n", 156 | "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/\n", 157 | " . . . 
" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": { 163 | "id": "Oj00rPPZyEWZ", 164 | "colab_type": "text" 165 | }, 166 | "source": [ 167 | "# Step 3: Running Hadoop" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "metadata": { 173 | "id": "Zhf-zK7NcBDF", 174 | "colab_type": "code", 175 | "colab": { 176 | "base_uri": "https://localhost:8080/", 177 | "height": 952 178 | }, 179 | "outputId": "4b567ac3-57ad-4992-9e1d-e5084ae61b46" 180 | }, 181 | "source": [ 182 | "#Running Hadoop\n", 183 | "!/usr/local/hadoop-3.3.0/bin/hadoop" 184 | ], 185 | "execution_count": 5, 186 | "outputs": [ 187 | { 188 | "output_type": "stream", 189 | "text": [ 190 | "Usage: hadoop [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]\n", 191 | " or hadoop [OPTIONS] CLASSNAME [CLASSNAME OPTIONS]\n", 192 | " where CLASSNAME is a user-provided Java class\n", 193 | "\n", 194 | " OPTIONS is none or any of:\n", 195 | "\n", 196 | "buildpaths attempt to add class files from build tree\n", 197 | "--config dir Hadoop config directory\n", 198 | "--debug turn on shell script debug mode\n", 199 | "--help usage information\n", 200 | "hostnames list[,of,host,names] hosts to use in slave mode\n", 201 | "hosts filename list of hosts to use in slave mode\n", 202 | "loglevel level set the log4j level for this command\n", 203 | "workers turn on worker mode\n", 204 | "\n", 205 | " SUBCOMMAND is one of:\n", 206 | "\n", 207 | "\n", 208 | " Admin Commands:\n", 209 | "\n", 210 | "daemonlog get/set the log level for each daemon\n", 211 | "\n", 212 | " Client Commands:\n", 213 | "\n", 214 | "archive create a Hadoop archive\n", 215 | "checknative check native Hadoop and compression libraries availability\n", 216 | "classpath prints the class path needed to get the Hadoop jar and the\n", 217 | " required libraries\n", 218 | "conftest validate configuration XML files\n", 219 | "credential interact with credential providers\n", 220 | "distch distributed metadata changer\n", 221 | "distcp copy file or directories recursively\n", 222 | "dtutil operations related to delegation tokens\n", 223 | "envvars display computed Hadoop environment variables\n", 224 | "fs run a generic filesystem user client\n", 225 | "gridmix submit a mix of synthetic job, modeling a profiled from\n", 226 | " production load\n", 227 | "jar run a jar file. 
NOTE: please use \"yarn jar\" to launch YARN\n", 228 | " applications, not this command.\n", 229 | "jnipath prints the java.library.path\n", 230 | "kdiag Diagnose Kerberos Problems\n", 231 | "kerbname show auth_to_local principal conversion\n", 232 | "key manage keys via the KeyProvider\n", 233 | "rumenfolder scale a rumen input trace\n", 234 | "rumentrace convert logs into a rumen trace\n", 235 | "s3guard manage metadata on S3\n", 236 | "trace view and modify Hadoop tracing settings\n", 237 | "version print the version\n", 238 | "\n", 239 | " Daemon Commands:\n", 240 | "\n", 241 | "kms run KMS, the Key Management Server\n", 242 | "registrydns run the registry DNS server\n", 243 | "\n", 244 | "SUBCOMMAND may print help when invoked w/o parameters or with -h.\n" 245 | ], 246 | "name": "stdout" 247 | } 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "metadata": { 253 | "id": "uI-YBPIzcBCA", 254 | "colab_type": "code", 255 | "colab": {} 256 | }, 257 | "source": [ 258 | "!mkdir ~/input\n", 259 | "!cp /usr/local/hadoop-3.3.0/etc/hadoop/*.xml ~/input" 260 | ], 261 | "execution_count": 6, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "metadata": { 267 | "id": "6DuDJIsPcA98", 268 | "colab_type": "code", 269 | "colab": { 270 | "base_uri": "https://localhost:8080/", 271 | "height": 68 272 | }, 273 | "outputId": "d2e82548-75aa-4e40-a048-48928825d374" 274 | }, 275 | "source": [ 276 | "!ls ~/input" 277 | ], 278 | "execution_count": 7, 279 | "outputs": [ 280 | { 281 | "output_type": "stream", 282 | "text": [ 283 | "capacity-scheduler.xml\thdfs-rbf-site.xml kms-acls.xml yarn-site.xml\n", 284 | "core-site.xml\t\thdfs-site.xml\t kms-site.xml\n", 285 | "hadoop-policy.xml\thttpfs-site.xml mapred-site.xml\n" 286 | ], 287 | "name": "stdout" 288 | } 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "metadata": { 294 | "id": "RZi5zOGKyySH", 295 | "colab_type": "code", 296 | "colab": {} 297 | }, 298 | "source": [ 299 | "!/usr/local/hadoop-3.3.0/bin/hadoop jar /usr/local/hadoop-3.3.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0.jar grep ~/input ~/grep_example 'allowed[.]*'" 300 | ], 301 | "execution_count": null, 302 | "outputs": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "metadata": { 307 | "id": "mtr0xWbfcA5J", 308 | "colab_type": "code", 309 | "colab": { 310 | "base_uri": "https://localhost:8080/", 311 | "height": 51 312 | }, 313 | "outputId": "8debaca6-e5f5-423f-e85f-0b3f0753d3e8" 314 | }, 315 | "source": [ 316 | "!cat ~/grep_example/*" 317 | ], 318 | "execution_count": 9, 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "text": [ 323 | "22\tallowed.\n", 324 | "1\tallowed\n" 325 | ], 326 | "name": "stdout" 327 | } 328 | ] 329 | } 330 | ] 331 | } -------------------------------------------------------------------------------- /hadoop_mapreduce.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "hadoop_mapreduce.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyNryPaSxVhByIGr0X0SJTxf" 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "myPIGP-mwKBD", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "#Hadoop Instalation Part\n", 25 | "Hadoop is a Java-based programming framework that supports the processing and storage of extremely 
large datasets on a cluster of inexpensive machines. It was the first major open source project in the big data playing field and is sponsored by the Apache Software Foundation." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "j9bT9M1yvyXG", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "## Step 1:Installing Hadoop" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "metadata": { 41 | "id": "bijZAdD_cBMK", 42 | "colab_type": "code", 43 | "colab": { 44 | "base_uri": "https://localhost:8080/", 45 | "height": 204 46 | }, 47 | "outputId": "989d7a35-005b-4014-a0cf-ff458ddd46d7" 48 | }, 49 | "source": [ 50 | "!wget https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz\n" 51 | ], 52 | "execution_count": null, 53 | "outputs": [ 54 | { 55 | "output_type": "stream", 56 | "text": [ 57 | "--2020-09-10 05:21:56-- https://downloads.apache.org/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz\n", 58 | "Resolving downloads.apache.org (downloads.apache.org)... 88.99.95.219, 2a01:4f8:10a:201a::2\n", 59 | "Connecting to downloads.apache.org (downloads.apache.org)|88.99.95.219|:443... connected.\n", 60 | "HTTP request sent, awaiting response... 200 OK\n", 61 | "Length: 500749234 (478M) [application/x-gzip]\n", 62 | "Saving to: ‘hadoop-3.3.0.tar.gz.1’\n", 63 | "\n", 64 | "hadoop-3.3.0.tar.gz 100%[===================>] 477.55M 10.4MB/s in 47s \n", 65 | "\n", 66 | "2020-09-10 05:22:44 (10.1 MB/s) - ‘hadoop-3.3.0.tar.gz.1’ saved [500749234/500749234]\n", 67 | "\n" 68 | ], 69 | "name": "stdout" 70 | } 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "colab_type": "text", 77 | "id": "Mj40txsTw6DZ" 78 | }, 79 | "source": [ 80 | "we’ll use the tar command with the -x flag to extract, -z to uncompress, -v for verbose output, and -f to specify that we’re extracting from a file" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "metadata": { 86 | "id": "nVce513-cBHm", 87 | "colab_type": "code", 88 | "colab": {} 89 | }, 90 | "source": [ 91 | "!tar -xzvf hadoop-3.3.0.tar.gz" 92 | ], 93 | "execution_count": null, 94 | "outputs": [] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "metadata": { 99 | "id": "JF-ze-YOdync", 100 | "colab_type": "code", 101 | "colab": {} 102 | }, 103 | "source": [ 104 | "#copy hadoop file to user/local\n", 105 | "!cp -r hadoop-3.3.0/ /usr/local/" 106 | ], 107 | "execution_count": null, 108 | "outputs": [] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "id": "Vh6Dqbbrwqpe", 114 | "colab_type": "text" 115 | }, 116 | "source": [ 117 | "## Step2:Configuring Hadoop’s Java Home\n", 118 | "Hadoop requires that you set the path to Java, either as an environment variable or in the Hadoop configuration file." 
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "id": "_OUc19ZtcBG5", 125 | "colab_type": "code", 126 | "colab": { 127 | "base_uri": "https://localhost:8080/", 128 | "height": 34 129 | }, 130 | "outputId": "a3821595-1aa0-4f19-a6df-05c7f74c1ae1" 131 | }, 132 | "source": [ 133 | "#To find the default Java path\n", 134 | "!readlink -f /usr/bin/java | sed \"s:bin/java::\"" 135 | ], 136 | "execution_count": null, 137 | "outputs": [ 138 | { 139 | "output_type": "stream", 140 | "text": [ 141 | "/usr/lib/jvm/java-11-openjdk-amd64/\n" 142 | ], 143 | "name": "stdout" 144 | } 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "MxaPBWRKxXta", 151 | "colab_type": "text" 152 | }, 153 | "source": [ 154 | "To set java path, go to /usr/local/hadoop-3.3.0/etc/hadoop/hadoop-env.sh then\n", 155 | "\n", 156 | ". . .\n", 157 | "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/\n", 158 | " . . . " 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "id": "Oj00rPPZyEWZ", 165 | "colab_type": "text" 166 | }, 167 | "source": [ 168 | "# Step 3: Running Hadoop" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "Zhf-zK7NcBDF", 175 | "colab_type": "code", 176 | "colab": { 177 | "base_uri": "https://localhost:8080/", 178 | "height": 952 179 | }, 180 | "outputId": "4b567ac3-57ad-4992-9e1d-e5084ae61b46" 181 | }, 182 | "source": [ 183 | "#Running Hadoop\n", 184 | "!/usr/local/hadoop-3.3.0/bin/hadoop" 185 | ], 186 | "execution_count": null, 187 | "outputs": [ 188 | { 189 | "output_type": "stream", 190 | "text": [ 191 | "Usage: hadoop [OPTIONS] SUBCOMMAND [SUBCOMMAND OPTIONS]\n", 192 | " or hadoop [OPTIONS] CLASSNAME [CLASSNAME OPTIONS]\n", 193 | " where CLASSNAME is a user-provided Java class\n", 194 | "\n", 195 | " OPTIONS is none or any of:\n", 196 | "\n", 197 | "buildpaths attempt to add class files from build tree\n", 198 | "--config dir Hadoop config directory\n", 199 | "--debug turn on shell script debug mode\n", 200 | "--help usage information\n", 201 | "hostnames list[,of,host,names] hosts to use in slave mode\n", 202 | "hosts filename list of hosts to use in slave mode\n", 203 | "loglevel level set the log4j level for this command\n", 204 | "workers turn on worker mode\n", 205 | "\n", 206 | " SUBCOMMAND is one of:\n", 207 | "\n", 208 | "\n", 209 | " Admin Commands:\n", 210 | "\n", 211 | "daemonlog get/set the log level for each daemon\n", 212 | "\n", 213 | " Client Commands:\n", 214 | "\n", 215 | "archive create a Hadoop archive\n", 216 | "checknative check native Hadoop and compression libraries availability\n", 217 | "classpath prints the class path needed to get the Hadoop jar and the\n", 218 | " required libraries\n", 219 | "conftest validate configuration XML files\n", 220 | "credential interact with credential providers\n", 221 | "distch distributed metadata changer\n", 222 | "distcp copy file or directories recursively\n", 223 | "dtutil operations related to delegation tokens\n", 224 | "envvars display computed Hadoop environment variables\n", 225 | "fs run a generic filesystem user client\n", 226 | "gridmix submit a mix of synthetic job, modeling a profiled from\n", 227 | " production load\n", 228 | "jar run a jar file. 
NOTE: please use \"yarn jar\" to launch YARN\n", 229 | " applications, not this command.\n", 230 | "jnipath prints the java.library.path\n", 231 | "kdiag Diagnose Kerberos Problems\n", 232 | "kerbname show auth_to_local principal conversion\n", 233 | "key manage keys via the KeyProvider\n", 234 | "rumenfolder scale a rumen input trace\n", 235 | "rumentrace convert logs into a rumen trace\n", 236 | "s3guard manage metadata on S3\n", 237 | "trace view and modify Hadoop tracing settings\n", 238 | "version print the version\n", 239 | "\n", 240 | " Daemon Commands:\n", 241 | "\n", 242 | "kms run KMS, the Key Management Server\n", 243 | "registrydns run the registry DNS server\n", 244 | "\n", 245 | "SUBCOMMAND may print help when invoked w/o parameters or with -h.\n" 246 | ], 247 | "name": "stdout" 248 | } 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "uI-YBPIzcBCA", 255 | "colab_type": "code", 256 | "colab": {} 257 | }, 258 | "source": [ 259 | "!mkdir ~/input\n", 260 | "!cp /usr/local/hadoop-3.3.0/etc/hadoop/*.xml ~/input" 261 | ], 262 | "execution_count": null, 263 | "outputs": [] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "metadata": { 268 | "id": "6DuDJIsPcA98", 269 | "colab_type": "code", 270 | "colab": { 271 | "base_uri": "https://localhost:8080/", 272 | "height": 68 273 | }, 274 | "outputId": "d2e82548-75aa-4e40-a048-48928825d374" 275 | }, 276 | "source": [ 277 | "!ls ~/input" 278 | ], 279 | "execution_count": null, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "text": [ 284 | "capacity-scheduler.xml\thdfs-rbf-site.xml kms-acls.xml yarn-site.xml\n", 285 | "core-site.xml\t\thdfs-site.xml\t kms-site.xml\n", 286 | "hadoop-policy.xml\thttpfs-site.xml mapred-site.xml\n" 287 | ], 288 | "name": "stdout" 289 | } 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "id": "RZi5zOGKyySH", 296 | "colab_type": "code", 297 | "colab": {} 298 | }, 299 | "source": [ 300 | "!/usr/local/hadoop-3.3.0/bin/hadoop jar /usr/local/hadoop-3.3.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.0.jar grep ~/input ~/grep_example 'allowed[.]*'" 301 | ], 302 | "execution_count": null, 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "metadata": { 308 | "id": "mtr0xWbfcA5J", 309 | "colab_type": "code", 310 | "colab": { 311 | "base_uri": "https://localhost:8080/", 312 | "height": 51 313 | }, 314 | "outputId": "8debaca6-e5f5-423f-e85f-0b3f0753d3e8" 315 | }, 316 | "source": [ 317 | "!cat ~/grep_example/*" 318 | ], 319 | "execution_count": null, 320 | "outputs": [ 321 | { 322 | "output_type": "stream", 323 | "text": [ 324 | "22\tallowed.\n", 325 | "1\tallowed\n" 326 | ], 327 | "name": "stdout" 328 | } 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "AQc3SKkOzNxn", 335 | "colab_type": "text" 336 | }, 337 | "source": [ 338 | "**Download 20newsgroups dataset available at** http://qwone.com/~jason/20Newsgroups." 
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "vgEGfCcGOswd", 345 | "colab_type": "code", 346 | "colab": {} 347 | }, 348 | "source": [ 349 | "!wget http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz\n", 350 | "\n", 351 | "!tar -xzvf 20news-18828.tar.gz" 352 | ], 353 | "execution_count": null, 354 | "outputs": [] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": { 359 | "id": "oxCFNl3SQHDl", 360 | "colab_type": "text" 361 | }, 362 | "source": [ 363 | "#Hadoop Streaming" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "bLRsjubgOs2p", 370 | "colab_type": "code", 371 | "colab": { 372 | "base_uri": "https://localhost:8080/", 373 | "height": 119 374 | }, 375 | "outputId": "69620c94-081b-45ac-dab6-ce32db7ef007" 376 | }, 377 | "source": [ 378 | "!find / -name 'hadoop-streaming*.jar'" 379 | ], 380 | "execution_count": null, 381 | "outputs": [ 382 | { 383 | "output_type": "stream", 384 | "text": [ 385 | "/usr/local/hadoop-3.3.0/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar\n", 386 | "/usr/local/hadoop-3.3.0/share/hadoop/tools/sources/hadoop-streaming-3.3.0-sources.jar\n", 387 | "/usr/local/hadoop-3.3.0/share/hadoop/tools/sources/hadoop-streaming-3.3.0-test-sources.jar\n", 388 | "/content/hadoop-3.3.0/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar\n", 389 | "/content/hadoop-3.3.0/share/hadoop/tools/sources/hadoop-streaming-3.3.0-sources.jar\n", 390 | "/content/hadoop-3.3.0/share/hadoop/tools/sources/hadoop-streaming-3.3.0-test-sources.jar\n" 391 | ], 392 | "name": "stdout" 393 | } 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "metadata": { 399 | "id": "eu5IAGT2Os6D", 400 | "colab_type": "code", 401 | "colab": {} 402 | }, 403 | "source": [ 404 | "!chmod u+rwx /content/mapper.py\n", 405 | "!chmod u+rwx /content/reducer.py" 406 | ], 407 | "execution_count": null, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "metadata": { 413 | "id": "ru1lPx9yOsvO", 414 | "colab_type": "code", 415 | "colab": { 416 | "base_uri": "https://localhost:8080/", 417 | "height": 1000 418 | }, 419 | "outputId": "023b36ba-21fc-4290-e4db-1487084e5fb6" 420 | }, 421 | "source": [ 422 | "!/usr/local/hadoop-3.3.0/bin/hadoop jar /usr/local/hadoop-3.3.0/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar -input /content/20news-18828/alt.atheism/49960 -output /content/output -file /content/mapper.py -file /content/reducer.py -mapper 'python mapper.py' -reducer 'python reducer.py'" 423 | ], 424 | "execution_count": null, 425 | "outputs": [ 426 | { 427 | "output_type": "stream", 428 | "text": [ 429 | "2020-09-05 15:03:48,933 WARN streaming.StreamJob: -file option is deprecated, please use generic option -files instead.\n", 430 | "packageJobJar: [/content/mapper.py, /content/reducer.py] [] /tmp/streamjob18407869961627485547.jar tmpDir=null\n", 431 | "2020-09-05 15:03:49,611 INFO impl.MetricsConfig: Loaded properties from hadoop-metrics2.properties\n", 432 | "2020-09-05 15:03:49,758 INFO impl.MetricsSystemImpl: Scheduled Metric snapshot period at 10 second(s).\n", 433 | "2020-09-05 15:03:49,758 INFO impl.MetricsSystemImpl: JobTracker metrics system started\n", 434 | "2020-09-05 15:03:49,785 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!\n", 435 | "2020-09-05 15:03:49,912 INFO mapred.FileInputFormat: Total input files to process : 1\n", 436 | "2020-09-05 15:03:49,932 INFO mapreduce.JobSubmitter: number of splits:1\n", 437 | "2020-09-05 15:03:50,194 INFO mapreduce.JobSubmitter: 
Submitting tokens for job: job_local713819920_0001\n", 438 | "2020-09-05 15:03:50,194 INFO mapreduce.JobSubmitter: Executing with tokens: []\n", 439 | "2020-09-05 15:03:50,622 INFO mapred.LocalDistributedCacheManager: Localized file:/content/mapper.py as file:/tmp/hadoop-root/mapred/local/job_local713819920_0001_a88c2a90-6a94-4a97-8309-7a43c9913551/mapper.py\n", 440 | "2020-09-05 15:03:50,657 INFO mapred.LocalDistributedCacheManager: Localized file:/content/reducer.py as file:/tmp/hadoop-root/mapred/local/job_local713819920_0001_9a513f55-7ab1-42d4-b83b-6e6312e3b2de/reducer.py\n", 441 | "2020-09-05 15:03:50,747 INFO mapreduce.Job: The url to track the job: http://localhost:8080/\n", 442 | "2020-09-05 15:03:50,748 INFO mapreduce.Job: Running job: job_local713819920_0001\n", 443 | "2020-09-05 15:03:50,763 INFO mapred.LocalJobRunner: OutputCommitter set in config null\n", 444 | "2020-09-05 15:03:50,766 INFO mapred.LocalJobRunner: OutputCommitter is org.apache.hadoop.mapred.FileOutputCommitter\n", 445 | "2020-09-05 15:03:50,777 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 2\n", 446 | "2020-09-05 15:03:50,777 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false\n", 447 | "2020-09-05 15:03:50,843 INFO mapred.LocalJobRunner: Waiting for map tasks\n", 448 | "2020-09-05 15:03:50,847 INFO mapred.LocalJobRunner: Starting task: attempt_local713819920_0001_m_000000_0\n", 449 | "2020-09-05 15:03:50,887 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 2\n", 450 | "2020-09-05 15:03:50,890 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false\n", 451 | "2020-09-05 15:03:50,919 INFO mapred.Task: Using ResourceCalculatorProcessTree : [ ]\n", 452 | "2020-09-05 15:03:50,932 INFO mapred.MapTask: Processing split: file:/content/20news-18828/alt.atheism/49960:0+11599\n", 453 | "2020-09-05 15:03:50,949 INFO mapred.MapTask: numReduceTasks: 1\n", 454 | "2020-09-05 15:03:51,024 INFO mapred.MapTask: (EQUATOR) 0 kvi 26214396(104857584)\n", 455 | "2020-09-05 15:03:51,024 INFO mapred.MapTask: mapreduce.task.io.sort.mb: 100\n", 456 | "2020-09-05 15:03:51,024 INFO mapred.MapTask: soft limit at 83886080\n", 457 | "2020-09-05 15:03:51,024 INFO mapred.MapTask: bufstart = 0; bufvoid = 104857600\n", 458 | "2020-09-05 15:03:51,024 INFO mapred.MapTask: kvstart = 26214396; length = 6553600\n", 459 | "2020-09-05 15:03:51,027 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer\n", 460 | "2020-09-05 15:03:51,037 INFO streaming.PipeMapRed: PipeMapRed exec [/usr/local/bin/python, mapper.py]\n", 461 | "2020-09-05 15:03:51,045 INFO Configuration.deprecation: mapred.work.output.dir is deprecated. Instead, use mapreduce.task.output.dir\n", 462 | "2020-09-05 15:03:51,045 INFO Configuration.deprecation: map.input.start is deprecated. Instead, use mapreduce.map.input.start\n", 463 | "2020-09-05 15:03:51,046 INFO Configuration.deprecation: mapred.task.is.map is deprecated. Instead, use mapreduce.task.ismap\n", 464 | "2020-09-05 15:03:51,046 INFO Configuration.deprecation: mapred.task.id is deprecated. Instead, use mapreduce.task.attempt.id\n", 465 | "2020-09-05 15:03:51,047 INFO Configuration.deprecation: mapred.tip.id is deprecated. 
Instead, use mapreduce.task.id\n", 466 | "2020-09-05 15:03:51,047 INFO Configuration.deprecation: mapred.local.dir is deprecated. Instead, use mapreduce.cluster.local.dir\n", 467 | "2020-09-05 15:03:51,048 INFO Configuration.deprecation: map.input.file is deprecated. Instead, use mapreduce.map.input.file\n", 468 | "2020-09-05 15:03:51,049 INFO Configuration.deprecation: mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords\n", 469 | "2020-09-05 15:03:51,049 INFO Configuration.deprecation: map.input.length is deprecated. Instead, use mapreduce.map.input.length\n", 470 | "2020-09-05 15:03:51,050 INFO Configuration.deprecation: mapred.job.id is deprecated. Instead, use mapreduce.job.id\n", 471 | "2020-09-05 15:03:51,050 INFO Configuration.deprecation: user.name is deprecated. Instead, use mapreduce.job.user.name\n", 472 | "2020-09-05 15:03:51,051 INFO Configuration.deprecation: mapred.task.partition is deprecated. Instead, use mapreduce.task.partition\n", 473 | "2020-09-05 15:03:51,082 INFO streaming.PipeMapRed: R/W/S=1/0/0 in:NA [rec/s] out:NA [rec/s]\n", 474 | "2020-09-05 15:03:51,083 INFO streaming.PipeMapRed: R/W/S=10/0/0 in:NA [rec/s] out:NA [rec/s]\n", 475 | "2020-09-05 15:03:51,084 INFO streaming.PipeMapRed: R/W/S=100/0/0 in:NA [rec/s] out:NA [rec/s]\n", 476 | "2020-09-05 15:03:51,757 INFO mapreduce.Job: Job job_local713819920_0001 running in uber mode : false\n", 477 | "2020-09-05 15:03:51,758 INFO mapreduce.Job: map 0% reduce 0%\n", 478 | "2020-09-05 15:03:51,962 INFO streaming.PipeMapRed: Records R/W=293/1\n", 479 | "2020-09-05 15:03:52,097 INFO streaming.PipeMapRed: MRErrorThread done\n", 480 | "2020-09-05 15:03:52,097 INFO streaming.PipeMapRed: mapRedFinished\n", 481 | "2020-09-05 15:03:52,100 INFO mapred.LocalJobRunner: \n", 482 | "2020-09-05 15:03:52,100 INFO mapred.MapTask: Starting flush of map output\n", 483 | "2020-09-05 15:03:52,100 INFO mapred.MapTask: Spilling map output\n", 484 | "2020-09-05 15:03:52,100 INFO mapred.MapTask: bufstart = 0; bufend = 10685; bufvoid = 104857600\n", 485 | "2020-09-05 15:03:52,101 INFO mapred.MapTask: kvstart = 26214396(104857584); kvend = 26209920(104839680); length = 4477/6553600\n", 486 | "2020-09-05 15:03:52,130 INFO mapred.MapTask: Finished spill 0\n", 487 | "2020-09-05 15:03:52,142 INFO mapred.Task: Task:attempt_local713819920_0001_m_000000_0 is done. 
And is in the process of committing\n", 488 | "2020-09-05 15:03:52,145 INFO mapred.LocalJobRunner: Records R/W=293/1\n", 489 | "2020-09-05 15:03:52,145 INFO mapred.Task: Task 'attempt_local713819920_0001_m_000000_0' done.\n", 490 | "2020-09-05 15:03:52,152 INFO mapred.Task: Final Counters for attempt_local713819920_0001_m_000000_0: Counters: 17\n", 491 | "\tFile System Counters\n", 492 | "\t\tFILE: Number of bytes read=15021\n", 493 | "\t\tFILE: Number of bytes written=628522\n", 494 | "\t\tFILE: Number of read operations=0\n", 495 | "\t\tFILE: Number of large read operations=0\n", 496 | "\t\tFILE: Number of write operations=0\n", 497 | "\tMap-Reduce Framework\n", 498 | "\t\tMap input records=293\n", 499 | "\t\tMap output records=1120\n", 500 | "\t\tMap output bytes=10685\n", 501 | "\t\tMap output materialized bytes=12931\n", 502 | "\t\tInput split bytes=96\n", 503 | "\t\tCombine input records=0\n", 504 | "\t\tSpilled Records=1120\n", 505 | "\t\tFailed Shuffles=0\n", 506 | "\t\tMerged Map outputs=0\n", 507 | "\t\tGC time elapsed (ms)=28\n", 508 | "\t\tTotal committed heap usage (bytes)=353370112\n", 509 | "\tFile Input Format Counters \n", 510 | "\t\tBytes Read=11599\n", 511 | "2020-09-05 15:03:52,152 INFO mapred.LocalJobRunner: Finishing task: attempt_local713819920_0001_m_000000_0\n", 512 | "2020-09-05 15:03:52,152 INFO mapred.LocalJobRunner: map task executor complete.\n", 513 | "2020-09-05 15:03:52,157 INFO mapred.LocalJobRunner: Waiting for reduce tasks\n", 514 | "2020-09-05 15:03:52,159 INFO mapred.LocalJobRunner: Starting task: attempt_local713819920_0001_r_000000_0\n", 515 | "2020-09-05 15:03:52,171 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 2\n", 516 | "2020-09-05 15:03:52,171 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false\n", 517 | "2020-09-05 15:03:52,176 INFO mapred.Task: Using ResourceCalculatorProcessTree : [ ]\n", 518 | "2020-09-05 15:03:52,179 INFO mapred.ReduceTask: Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@7c884daf\n", 519 | "2020-09-05 15:03:52,181 WARN impl.MetricsSystemImpl: JobTracker metrics system already initialized!\n", 520 | "2020-09-05 15:03:52,199 INFO reduce.MergeManagerImpl: MergerManager: memoryLimit=2389914368, maxSingleShuffleLimit=597478592, mergeThreshold=1577343488, ioSortFactor=10, memToMemMergeOutputsThreshold=10\n", 521 | "2020-09-05 15:03:52,202 INFO reduce.EventFetcher: attempt_local713819920_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events\n", 522 | "2020-09-05 15:03:52,243 INFO reduce.LocalFetcher: localfetcher#1 about to shuffle output of map attempt_local713819920_0001_m_000000_0 decomp: 12927 len: 12931 to MEMORY\n", 523 | "2020-09-05 15:03:52,247 INFO reduce.InMemoryMapOutput: Read 12927 bytes from map-output for attempt_local713819920_0001_m_000000_0\n", 524 | "2020-09-05 15:03:52,249 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 12927, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->12927\n", 525 | "2020-09-05 15:03:52,251 INFO reduce.EventFetcher: EventFetcher is interrupted.. 
Returning\n", 526 | "2020-09-05 15:03:52,253 INFO mapred.LocalJobRunner: 1 / 1 copied.\n", 527 | "2020-09-05 15:03:52,253 INFO reduce.MergeManagerImpl: finalMerge called with 1 in-memory map-outputs and 0 on-disk map-outputs\n", 528 | "2020-09-05 15:03:52,260 INFO mapred.Merger: Merging 1 sorted segments\n", 529 | "2020-09-05 15:03:52,260 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 12914 bytes\n", 530 | "2020-09-05 15:03:52,269 INFO reduce.MergeManagerImpl: Merged 1 segments, 12927 bytes to disk to satisfy reduce memory limit\n", 531 | "2020-09-05 15:03:52,270 INFO reduce.MergeManagerImpl: Merging 1 files, 12931 bytes from disk\n", 532 | "2020-09-05 15:03:52,270 INFO reduce.MergeManagerImpl: Merging 0 segments, 0 bytes from memory into reduce\n", 533 | "2020-09-05 15:03:52,270 INFO mapred.Merger: Merging 1 sorted segments\n", 534 | "2020-09-05 15:03:52,271 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 12914 bytes\n", 535 | "2020-09-05 15:03:52,272 INFO mapred.LocalJobRunner: 1 / 1 copied.\n", 536 | "2020-09-05 15:03:52,289 INFO streaming.PipeMapRed: PipeMapRed exec [/usr/local/bin/python, reducer.py]\n", 537 | "2020-09-05 15:03:52,294 INFO Configuration.deprecation: mapred.job.tracker is deprecated. Instead, use mapreduce.jobtracker.address\n", 538 | "2020-09-05 15:03:52,298 INFO Configuration.deprecation: mapred.map.tasks is deprecated. Instead, use mapreduce.job.maps\n", 539 | "2020-09-05 15:03:52,325 INFO streaming.PipeMapRed: R/W/S=1/0/0 in:NA [rec/s] out:NA [rec/s]\n", 540 | "2020-09-05 15:03:52,325 INFO streaming.PipeMapRed: R/W/S=10/0/0 in:NA [rec/s] out:NA [rec/s]\n", 541 | "2020-09-05 15:03:52,326 INFO streaming.PipeMapRed: R/W/S=100/0/0 in:NA [rec/s] out:NA [rec/s]\n", 542 | "2020-09-05 15:03:52,346 INFO streaming.PipeMapRed: R/W/S=1000/0/0 in:NA [rec/s] out:NA [rec/s]\n", 543 | "2020-09-05 15:03:52,463 INFO streaming.PipeMapRed: Records R/W=1120/1\n", 544 | "2020-09-05 15:03:52,468 INFO streaming.PipeMapRed: MRErrorThread done\n", 545 | "2020-09-05 15:03:52,477 INFO streaming.PipeMapRed: mapRedFinished\n", 546 | "2020-09-05 15:03:52,478 INFO mapred.Task: Task:attempt_local713819920_0001_r_000000_0 is done. 
And is in the process of committing\n", 547 | "2020-09-05 15:03:52,480 INFO mapred.LocalJobRunner: 1 / 1 copied.\n", 548 | "2020-09-05 15:03:52,480 INFO mapred.Task: Task attempt_local713819920_0001_r_000000_0 is allowed to commit now\n", 549 | "2020-09-05 15:03:52,484 INFO output.FileOutputCommitter: Saved output of task 'attempt_local713819920_0001_r_000000_0' to file:/content/output\n", 550 | "2020-09-05 15:03:52,496 INFO mapred.LocalJobRunner: Records R/W=1120/1 > reduce\n", 551 | "2020-09-05 15:03:52,496 INFO mapred.Task: Task 'attempt_local713819920_0001_r_000000_0' done.\n", 552 | "2020-09-05 15:03:52,499 INFO mapred.Task: Final Counters for attempt_local713819920_0001_r_000000_0: Counters: 24\n", 553 | "\tFile System Counters\n", 554 | "\t\tFILE: Number of bytes read=40915\n", 555 | "\t\tFILE: Number of bytes written=649151\n", 556 | "\t\tFILE: Number of read operations=0\n", 557 | "\t\tFILE: Number of large read operations=0\n", 558 | "\t\tFILE: Number of write operations=0\n", 559 | "\tMap-Reduce Framework\n", 560 | "\t\tCombine input records=0\n", 561 | "\t\tCombine output records=0\n", 562 | "\t\tReduce input groups=787\n", 563 | "\t\tReduce shuffle bytes=12931\n", 564 | "\t\tReduce input records=1120\n", 565 | "\t\tReduce output records=787\n", 566 | "\t\tSpilled Records=1120\n", 567 | "\t\tShuffled Maps =1\n", 568 | "\t\tFailed Shuffles=0\n", 569 | "\t\tMerged Map outputs=1\n", 570 | "\t\tGC time elapsed (ms)=0\n", 571 | "\t\tTotal committed heap usage (bytes)=353370112\n", 572 | "\tShuffle Errors\n", 573 | "\t\tBAD_ID=0\n", 574 | "\t\tCONNECTION=0\n", 575 | "\t\tIO_ERROR=0\n", 576 | "\t\tWRONG_LENGTH=0\n", 577 | "\t\tWRONG_MAP=0\n", 578 | "\t\tWRONG_REDUCE=0\n", 579 | "\tFile Output Format Counters \n", 580 | "\t\tBytes Written=7698\n", 581 | "2020-09-05 15:03:52,499 INFO mapred.LocalJobRunner: Finishing task: attempt_local713819920_0001_r_000000_0\n", 582 | "2020-09-05 15:03:52,500 INFO mapred.LocalJobRunner: reduce task executor complete.\n", 583 | "2020-09-05 15:03:52,761 INFO mapreduce.Job: map 100% reduce 100%\n", 584 | "2020-09-05 15:03:52,761 INFO mapreduce.Job: Job job_local713819920_0001 completed successfully\n", 585 | "2020-09-05 15:03:52,775 INFO mapreduce.Job: Counters: 30\n", 586 | "\tFile System Counters\n", 587 | "\t\tFILE: Number of bytes read=55936\n", 588 | "\t\tFILE: Number of bytes written=1277673\n", 589 | "\t\tFILE: Number of read operations=0\n", 590 | "\t\tFILE: Number of large read operations=0\n", 591 | "\t\tFILE: Number of write operations=0\n", 592 | "\tMap-Reduce Framework\n", 593 | "\t\tMap input records=293\n", 594 | "\t\tMap output records=1120\n", 595 | "\t\tMap output bytes=10685\n", 596 | "\t\tMap output materialized bytes=12931\n", 597 | "\t\tInput split bytes=96\n", 598 | "\t\tCombine input records=0\n", 599 | "\t\tCombine output records=0\n", 600 | "\t\tReduce input groups=787\n", 601 | "\t\tReduce shuffle bytes=12931\n", 602 | "\t\tReduce input records=1120\n", 603 | "\t\tReduce output records=787\n", 604 | "\t\tSpilled Records=2240\n", 605 | "\t\tShuffled Maps =1\n", 606 | "\t\tFailed Shuffles=0\n", 607 | "\t\tMerged Map outputs=1\n", 608 | "\t\tGC time elapsed (ms)=28\n", 609 | "\t\tTotal committed heap usage (bytes)=706740224\n", 610 | "\tShuffle Errors\n", 611 | "\t\tBAD_ID=0\n", 612 | "\t\tCONNECTION=0\n", 613 | "\t\tIO_ERROR=0\n", 614 | "\t\tWRONG_LENGTH=0\n", 615 | "\t\tWRONG_MAP=0\n", 616 | "\t\tWRONG_REDUCE=0\n", 617 | "\tFile Input Format Counters \n", 618 | "\t\tBytes Read=11599\n", 619 | "\tFile Output Format Counters \n", 
620 | "\t\tBytes Written=7698\n", 621 | "2020-09-05 15:03:52,775 INFO streaming.StreamJob: Output directory: /content/output\n" 622 | ], 623 | "name": "stdout" 624 | } 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "metadata": { 630 | "id": "jiW5mPIcy_Jp", 631 | "colab_type": "code", 632 | "colab": { 633 | "base_uri": "https://localhost:8080/", 634 | "height": 34 635 | }, 636 | "outputId": "4e5d8d06-0965-4b2c-8204-503305e6fa49" 637 | }, 638 | "source": [ 639 | "!ls /content/output" 640 | ], 641 | "execution_count": null, 642 | "outputs": [ 643 | { 644 | "output_type": "stream", 645 | "text": [ 646 | "part-00000 _SUCCESS\n" 647 | ], 648 | "name": "stdout" 649 | } 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "metadata": { 655 | "id": "CTFjOm59kc04", 656 | "colab_type": "code", 657 | "colab": { 658 | "base_uri": "https://localhost:8080/", 659 | "height": 1000 660 | }, 661 | "outputId": "81c7ce84-4a64-47c2-d525-ab83c560e12a" 662 | }, 663 | "source": [ 664 | "!cat /content/output/part-00000" 665 | ], 666 | "execution_count": null, 667 | "outputs": [ 668 | { 669 | "output_type": "stream", 670 | "text": [ 671 | "034529887x\t1\n", 672 | "0511211216\t1\n", 673 | "071\t5\n", 674 | "080182494x\t1\n", 675 | "0801834074\t1\n", 676 | "0877226423\t1\n", 677 | "0877227675\t1\n", 678 | "0908\t1\n", 679 | "0910309264\t1\n", 680 | "1\t1\n", 681 | "10\t1\n", 682 | "11\t1\n", 683 | "1266\t1\n", 684 | "1271\t1\n", 685 | "14\t1\n", 686 | "140195\t1\n", 687 | "14215\t1\n", 688 | "14226\t1\n", 689 | "142282197\t1\n", 690 | "17701900\t1\n", 691 | "1881\t1\n", 692 | "1977\t1\n", 693 | "1981\t1\n", 694 | "1986\t1\n", 695 | "1988\t1\n", 696 | "1989\t1\n", 697 | "1990\t1\n", 698 | "1992\t1\n", 699 | "20th\t1\n", 700 | "226\t1\n", 701 | "24hour\t1\n", 702 | "2568900\t1\n", 703 | "272\t1\n", 704 | "273\t1\n", 705 | "2nd\t1\n", 706 | "3005\t1\n", 707 | "316\t1\n", 708 | "372\t1\n", 709 | "3d\t1\n", 710 | "3nl\t1\n", 711 | "4\t1\n", 712 | "41\t2\n", 713 | "430\t2\n", 714 | "4581244\t1\n", 715 | "4679525\t1\n", 716 | "490\t1\n", 717 | "495\t2\n", 718 | "4rh\t1\n", 719 | "4rl\t1\n", 720 | "512\t2\n", 721 | "53701\t1\n", 722 | "541\t1\n", 723 | "59\t1\n", 724 | "608\t1\n", 725 | "664\t1\n", 726 | "700\t1\n", 727 | "702\t1\n", 728 | "7119\t1\n", 729 | "716\t1\n", 730 | "7215\t1\n", 731 | "7251\t1\n", 732 | "750\t1\n", 733 | "7723\t1\n", 734 | "787140195\t1\n", 735 | "787522973\t1\n", 736 | "831\t1\n", 737 | "8372475\t1\n", 738 | "88\t1\n", 739 | "880\t2\n", 740 | "8964079\t1\n", 741 | "8ew\t1\n", 742 | "91605\t1\n", 743 | "aah\t1\n", 744 | "aap\t2\n", 745 | "abortions\t1\n", 746 | "absurdities\t1\n", 747 | "accompanied\t1\n", 748 | "accounts\t1\n", 749 | "address\t1\n", 750 | "addresses\t2\n", 751 | "adulteries\t1\n", 752 | "aesthetics\t1\n", 753 | "african\t3\n", 754 | "africanamericans\t1\n", 755 | "agnostic\t1\n", 756 | "al\t1\n", 757 | "alien\t1\n", 758 | "allen\t2\n", 759 | "also\t3\n", 760 | "altatheism\t1\n", 761 | "altatheismarchivename\t1\n", 762 | "altatheismmoderated\t1\n", 763 | "alternate\t1\n", 764 | "alternative\t1\n", 765 | "although\t2\n", 766 | "america\t1\n", 767 | "american\t5\n", 768 | "americans\t2\n", 769 | "amherst\t1\n", 770 | "amongst\t1\n", 771 | "amusing\t1\n", 772 | "ancient\t1\n", 773 | "andor\t1\n", 774 | "another\t1\n", 775 | "anselm\t1\n", 776 | "anthology\t3\n", 777 | "anyone\t1\n", 778 | "appendix\t2\n", 779 | "approachable\t1\n", 780 | "archive\t1\n", 781 | "archivename\t1\n", 782 | "archives\t1\n", 783 | "archiveservermantiscouk\t1\n", 784 | "area\t2\n", 785 | 
"argues\t1\n", 786 | "arguments\t5\n", 787 | "articles\t1\n", 788 | "assassinated\t1\n", 789 | "assisted\t1\n", 790 | "association\t2\n", 791 | "assorted\t3\n", 792 | "atheism\t6\n", 793 | "atheismindex\t1\n", 794 | "atheismresources\t1\n", 795 | "atheist\t10\n", 796 | "atheisten\t2\n", 797 | "atheistic\t1\n", 798 | "atomic\t2\n", 799 | "atoms\t1\n", 800 | "atrocities\t1\n", 801 | "attempt\t1\n", 802 | "attempts\t1\n", 803 | "attention\t1\n", 804 | "atwood\t1\n", 805 | "atwoods\t1\n", 806 | "austin\t2\n", 807 | "authors\t1\n", 808 | "available\t2\n", 809 | "axiarchism\t1\n", 810 | "back\t1\n", 811 | "ball\t2\n", 812 | "ballantine\t1\n", 813 | "baltimore\t1\n", 814 | "bank\t1\n", 815 | "bantam\t1\n", 816 | "based\t2\n", 817 | "bay\t1\n", 818 | "beam\t1\n", 819 | "became\t1\n", 820 | "become\t1\n", 821 | "began\t1\n", 822 | "begins\t1\n", 823 | "belief\t2\n", 824 | "believed\t1\n", 825 | "beneath\t1\n", 826 | "berkeley\t1\n", 827 | "berlin\t2\n", 828 | "best\t1\n", 829 | "better\t1\n", 830 | "beyond\t1\n", 831 | "bible\t7\n", 832 | "biblebeliever\t1\n", 833 | "biblical\t1\n", 834 | "bibliography\t1\n", 835 | "bizarre\t1\n", 836 | "black\t2\n", 837 | "blueprints\t1\n", 838 | "book\t4\n", 839 | "books\t11\n", 840 | "box\t3\n", 841 | "brain\t2\n", 842 | "britain\t1\n", 843 | "british\t1\n", 844 | "bucherdienst\t1\n", 845 | "buffalo\t3\n", 846 | "bumper\t1\n", 847 | "bund\t1\n", 848 | "ca\t1\n", 849 | "cameron\t1\n", 850 | "canticle\t1\n", 851 | "canyon\t1\n", 852 | "card\t1\n", 853 | "cardiffs\t1\n", 854 | "carries\t1\n", 855 | "cars\t1\n", 856 | "case\t1\n", 857 | "catalog\t1\n", 858 | "cathedral\t1\n", 859 | "catholic\t1\n", 860 | "centuries\t1\n", 861 | "century\t1\n", 862 | "challenging\t1\n", 863 | "characters\t2\n", 864 | "charge\t1\n", 865 | "chilling\t1\n", 866 | "christ\t1\n", 867 | "christian\t4\n", 868 | "christianity\t4\n", 869 | "christians\t2\n", 870 | "church\t1\n", 871 | "clarendon\t1\n", 872 | "classical\t2\n", 873 | "claus\t1\n", 874 | "clerical\t1\n", 875 | "closed\t1\n", 876 | "cohen\t1\n", 877 | "coherence\t1\n", 878 | "compared\t1\n", 879 | "comply\t1\n", 880 | "comprehensive\t3\n", 881 | "compromise\t1\n", 882 | "conceived\t1\n", 883 | "concentrating\t1\n", 884 | "concept\t1\n", 885 | "concluded\t1\n", 886 | "conduit\t1\n", 887 | "congress\t2\n", 888 | "considering\t1\n", 889 | "considers\t1\n", 890 | "construct\t1\n", 891 | "containing\t1\n", 892 | "contains\t3\n", 893 | "contemporary\t1\n", 894 | "contempory\t1\n", 895 | "contradictions\t2\n", 896 | "contradicts\t1\n", 897 | "conway\t1\n", 898 | "copying\t1\n", 899 | "covering\t1\n", 900 | "craftsmen\t1\n", 901 | "creed\t2\n", 902 | "crimes\t1\n", 903 | "criticized\t1\n", 904 | "critique\t1\n", 905 | "critiques\t1\n", 906 | "d1000\t2\n", 907 | "d3000\t1\n", 908 | "darwin\t4\n", 909 | "davy\t1\n", 910 | "day\t1\n", 911 | "de\t2\n", 912 | "dead\t2\n", 913 | "death\t1\n", 914 | "december\t1\n", 915 | "decisively\t1\n", 916 | "defences\t1\n", 917 | "defining\t1\n", 918 | "deity\t2\n", 919 | "delight\t1\n", 920 | "deluxe\t1\n", 921 | "demand\t1\n", 922 | "demonstrates\t1\n", 923 | "der\t3\n", 924 | "derived\t1\n", 925 | "des\t1\n", 926 | "descartes\t1\n", 927 | "describe\t1\n", 928 | "description\t1\n", 929 | "designs\t3\n", 930 | "detailed\t1\n", 931 | "developments\t1\n", 932 | "devil\t1\n", 933 | "diary\t1\n", 934 | "dick\t3\n", 935 | "dictionary\t1\n", 936 | "die\t1\n", 937 | "diener\t1\n", 938 | "different\t2\n", 939 | "difficult\t1\n", 940 | "direct\t1\n", 941 | "directly\t1\n", 942 | "disch\t1\n", 943 | 
"dismissively\t1\n", 944 | "divine\t2\n", 945 | "doctors\t1\n", 946 | "dogmatic\t1\n", 947 | "doomsday\t2\n", 948 | "drive\t1\n", 949 | "droemerknaur\t1\n", 950 | "dull\t1\n", 951 | "dunkle\t1\n", 952 | "earth\t2\n", 953 | "earthers\t1\n", 954 | "east\t1\n", 955 | "easy\t1\n", 956 | "edgar\t1\n", 957 | "edition\t3\n", 958 | "editor\t1\n", 959 | "edmund\t1\n", 960 | "effect\t1\n", 961 | "emphasis\t1\n", 962 | "england\t1\n", 963 | "enlighting\t1\n", 964 | "erste\t1\n", 965 | "et\t1\n", 966 | "etc\t1\n", 967 | "ethical\t1\n", 968 | "ev\t2\n", 969 | "even\t1\n", 970 | "events\t1\n", 971 | "evil\t1\n", 972 | "evolution\t3\n", 973 | "examiner\t1\n", 974 | "examines\t1\n", 975 | "example\t1\n", 976 | "existence\t5\n", 977 | "exists\t3\n", 978 | "explicitly\t1\n", 979 | "expressed\t1\n", 980 | "faith\t2\n", 981 | "fallacies\t1\n", 982 | "fallible\t1\n", 983 | "faq\t1\n", 984 | "fate\t1\n", 985 | "fax\t2\n", 986 | "feet\t1\n", 987 | "fernwright\t1\n", 988 | "ffrf\t1\n", 989 | "fiction\t1\n", 990 | "fictitious\t1\n", 991 | "figmonetcomcom\t1\n", 992 | "files\t1\n", 993 | "filling\t1\n", 994 | "fired\t1\n", 995 | "first\t1\n", 996 | "fish\t6\n", 997 | "focusses\t1\n", 998 | "following\t1\n", 999 | "foote\t2\n", 1000 | "forbids\t1\n", 1001 | "formalistic\t1\n", 1002 | "foundation\t2\n", 1003 | "founded\t1\n", 1004 | "france\t1\n", 1005 | "francisco\t1\n", 1006 | "freedom\t2\n", 1007 | "freethinker\t1\n", 1008 | "freethought\t2\n", 1009 | "friend\t1\n", 1010 | "fundamentalists\t2\n", 1011 | "fuss\t1\n", 1012 | "galactic\t1\n", 1013 | "gem\t1\n", 1014 | "george\t1\n", 1015 | "german\t1\n", 1016 | "germany\t4\n", 1017 | "get\t3\n", 1018 | "giant\t1\n", 1019 | "glenn\t1\n", 1020 | "gnostic\t1\n", 1021 | "go\t1\n", 1022 | "god\t13\n", 1023 | "gods\t3\n", 1024 | "goes\t1\n", 1025 | "gold\t1\n", 1026 | "gordon\t1\n", 1027 | "gottes\t1\n", 1028 | "great\t3\n", 1029 | "group\t1\n", 1030 | "grows\t1\n", 1031 | "gw\t1\n", 1032 | "hall\t1\n", 1033 | "handbook\t1\n", 1034 | "handmaids\t1\n", 1035 | "handwaving\t1\n", 1036 | "hanged\t1\n", 1037 | "hannover\t1\n", 1038 | "hardcover\t2\n", 1039 | "haught\t1\n", 1040 | "haughts\t1\n", 1041 | "help\t1\n", 1042 | "hero\t1\n", 1043 | "hidden\t1\n", 1044 | "high\t1\n", 1045 | "history\t6\n", 1046 | "holloway\t1\n", 1047 | "hollywood\t1\n", 1048 | "holy\t2\n", 1049 | "hopkins\t1\n", 1050 | "horrors\t2\n", 1051 | "however\t1\n", 1052 | "hrsg\t1\n", 1053 | "humanism\t5\n", 1054 | "humanist\t1\n", 1055 | "hume\t1\n", 1056 | "hunted\t1\n", 1057 | "ibdk\t1\n", 1058 | "ibka\t3\n", 1059 | "idea\t2\n", 1060 | "ie\t1\n", 1061 | "ill\t1\n", 1062 | "illustrated\t1\n", 1063 | "immoralities\t2\n", 1064 | "implicitly\t1\n", 1065 | "imputation\t1\n", 1066 | "includes\t3\n", 1067 | "including\t2\n", 1068 | "incoherent\t2\n", 1069 | "inductive\t1\n", 1070 | "information\t1\n", 1071 | "informationen\t1\n", 1072 | "ink\t1\n", 1073 | "inside\t1\n", 1074 | "intellectual\t1\n", 1075 | "internationaler\t2\n", 1076 | "invades\t1\n", 1077 | "invasion\t1\n", 1078 | "ironic\t1\n", 1079 | "isbn\t5\n", 1080 | "islington\t1\n", 1081 | "j\t1\n", 1082 | "james\t3\n", 1083 | "joe\t1\n", 1084 | "johns\t1\n", 1085 | "journal\t2\n", 1086 | "jr\t3\n", 1087 | "justification\t2\n", 1088 | "k\t2\n", 1089 | "kant\t1\n", 1090 | "kierkegaard\t1\n", 1091 | "kind\t1\n", 1092 | "king\t1\n", 1093 | "kingdom\t1\n", 1094 | "know\t1\n", 1095 | "konfessionslosen\t2\n", 1096 | "konfessionslosesn\t1\n", 1097 | "kung\t1\n", 1098 | "l\t1\n", 1099 | "lambs\t1\n", 1100 | "laser\t1\n", 1101 | "lastmodified\t1\n", 1102 | 
"late\t1\n", 1103 | "laurel\t1\n", 1104 | "leaving\t1\n", 1105 | "legal\t1\n", 1106 | "leibowitz\t2\n", 1107 | "lelies\t1\n", 1108 | "less\t1\n", 1109 | "letters\t1\n", 1110 | "library\t1\n", 1111 | "life\t1\n", 1112 | "like\t1\n", 1113 | "lines\t1\n", 1114 | "lion\t1\n", 1115 | "listening\t1\n", 1116 | "listing\t1\n", 1117 | "lists\t1\n", 1118 | "live\t1\n", 1119 | "lives\t1\n", 1120 | "living\t1\n", 1121 | "london\t4\n", 1122 | "looks\t1\n", 1123 | "luxuries\t1\n", 1124 | "lynn\t2\n", 1125 | "mackie\t2\n", 1126 | "mackies\t1\n", 1127 | "madison\t1\n", 1128 | "madness\t1\n", 1129 | "magazine\t1\n", 1130 | "mail\t2\n", 1131 | "mailbased\t1\n", 1132 | "mailing\t1\n", 1133 | "mainly\t1\n", 1134 | "mainstream\t1\n", 1135 | "make\t1\n", 1136 | "makes\t1\n", 1137 | "making\t1\n", 1138 | "man\t1\n", 1139 | "mantiscouk\t1\n", 1140 | "many\t3\n", 1141 | "margaret\t1\n", 1142 | "martin\t1\n", 1143 | "martins\t1\n", 1144 | "materialien\t1\n", 1145 | "mathew\t2\n", 1146 | "mathewmantiscouk\t1\n", 1147 | "may\t1\n", 1148 | "maze\t1\n", 1149 | "md\t1\n", 1150 | "men\t1\n", 1151 | "met\t1\n", 1152 | "michael\t1\n", 1153 | "miller\t1\n", 1154 | "mind\t1\n", 1155 | "miracle\t2\n", 1156 | "miz\t1\n", 1157 | "mizvertrieb\t1\n", 1158 | "monks\t1\n", 1159 | "monthly\t1\n", 1160 | "moral\t1\n", 1161 | "morality\t1\n", 1162 | "moulded\t1\n", 1163 | "murder\t1\n", 1164 | "music\t1\n", 1165 | "must\t1\n", 1166 | "mysteries\t1\n", 1167 | "mysteriously\t1\n", 1168 | "n1\t1\n", 1169 | "n19\t1\n", 1170 | "nation\t1\n", 1171 | "national\t2\n", 1172 | "necessarily\t1\n", 1173 | "negative\t1\n", 1174 | "neither\t1\n", 1175 | "net\t2\n", 1176 | "new\t4\n", 1177 | "newer\t1\n", 1178 | "newman\t1\n", 1179 | "newsletter\t1\n", 1180 | "nonbelief\t1\n", 1181 | "nonexistence\t1\n", 1182 | "nonfiction\t1\n", 1183 | "norm\t2\n", 1184 | "north\t1\n", 1185 | "noteworthy\t1\n", 1186 | "novel\t3\n", 1187 | "novels\t2\n", 1188 | "noyes\t1\n", 1189 | "number\t2\n", 1190 | "ny\t2\n", 1191 | "obscure\t1\n", 1192 | "observations\t1\n", 1193 | "oceans\t1\n", 1194 | "odd\t1\n", 1195 | "often\t3\n", 1196 | "old\t2\n", 1197 | "older\t1\n", 1198 | "one\t3\n", 1199 | "ones\t1\n", 1200 | "opinions\t1\n", 1201 | "organization\t1\n", 1202 | "organizations\t1\n", 1203 | "origin\t1\n", 1204 | "origins\t1\n", 1205 | "outlawed\t1\n", 1206 | "outstanding\t1\n", 1207 | "oxford\t2\n", 1208 | "pages\t4\n", 1209 | "paid\t1\n", 1210 | "pangborn\t1\n", 1211 | "papal\t1\n", 1212 | "paper\t3\n", 1213 | "paperback\t1\n", 1214 | "paperbacks\t1\n", 1215 | "papsttums\t1\n", 1216 | "paraphernalia\t1\n", 1217 | "particular\t1\n", 1218 | "particularly\t1\n", 1219 | "passage\t1\n", 1220 | "people\t6\n", 1221 | "per\t1\n", 1222 | "performed\t1\n", 1223 | "period\t1\n", 1224 | "persecution\t1\n", 1225 | "persons\t1\n", 1226 | "peter\t1\n", 1227 | "philadelphia\t1\n", 1228 | "philip\t2\n", 1229 | "philips\t1\n", 1230 | "philosophical\t3\n", 1231 | "philosophy\t1\n", 1232 | "pink\t1\n", 1233 | "place\t1\n", 1234 | "planet\t1\n", 1235 | "plantinga\t1\n", 1236 | "plastic\t1\n", 1237 | "platinga\t1\n", 1238 | "po\t3\n", 1239 | "polished\t1\n", 1240 | "politisches\t1\n", 1241 | "popular\t1\n", 1242 | "positions\t2\n", 1243 | "positive\t1\n", 1244 | "possibly\t1\n", 1245 | "post\t2\n", 1246 | "postfach\t3\n", 1247 | "posthumous\t1\n", 1248 | "postpaid\t1\n", 1249 | "pothealer\t2\n", 1250 | "pp\t1\n", 1251 | "pregnant\t1\n", 1252 | "premise\t1\n", 1253 | "present\t2\n", 1254 | "press\t8\n", 1255 | "price\t1\n", 1256 | "principal\t1\n", 1257 | "probably\t1\n", 1258 | 
"produce\t1\n", 1259 | "prometheus\t5\n", 1260 | "promoting\t1\n", 1261 | "proof\t1\n", 1262 | "property\t1\n", 1263 | "publish\t4\n", 1264 | "punished\t1\n", 1265 | "push\t1\n", 1266 | "quarterly\t1\n", 1267 | "quickly\t1\n", 1268 | "quite\t1\n", 1269 | "quotations\t2\n", 1270 | "r\t2\n", 1271 | "radio\t1\n", 1272 | "raise\t1\n", 1273 | "rambling\t1\n", 1274 | "range\t1\n", 1275 | "ranges\t1\n", 1276 | "rather\t2\n", 1277 | "rational\t1\n", 1278 | "rationalism\t1\n", 1279 | "rationalist\t1\n", 1280 | "read\t1\n", 1281 | "reading\t1\n", 1282 | "readings\t1\n", 1283 | "reality\t1\n", 1284 | "realm\t1\n", 1285 | "reason\t1\n", 1286 | "rebut\t1\n", 1287 | "recent\t1\n", 1288 | "red\t1\n", 1289 | "refreshingly\t1\n", 1290 | "refutations\t1\n", 1291 | "refuting\t1\n", 1292 | "rejected\t1\n", 1293 | "relevance\t1\n", 1294 | "religion\t6\n", 1295 | "religious\t3\n", 1296 | "rely\t1\n", 1297 | "remained\t1\n", 1298 | "remote\t1\n", 1299 | "replacements\t1\n", 1300 | "reply\t1\n", 1301 | "resources\t4\n", 1302 | "restatements\t1\n", 1303 | "retroactively\t1\n", 1304 | "returns\t1\n", 1305 | "review\t1\n", 1306 | "revised\t2\n", 1307 | "revoked\t1\n", 1308 | "richard\t1\n", 1309 | "right\t2\n", 1310 | "road\t2\n", 1311 | "rosa\t2\n", 1312 | "saint\t1\n", 1313 | "san\t1\n", 1314 | "santa\t2\n", 1315 | "saying\t1\n", 1316 | "sceptical\t1\n", 1317 | "schizophrenic\t1\n", 1318 | "scholarly\t1\n", 1319 | "searches\t1\n", 1320 | "second\t1\n", 1321 | "secular\t3\n", 1322 | "secularization\t1\n", 1323 | "see\t2\n", 1324 | "seems\t1\n", 1325 | "seite\t1\n", 1326 | "seldes\t1\n", 1327 | "sell\t2\n", 1328 | "send\t2\n", 1329 | "series\t1\n", 1330 | "server\t1\n", 1331 | "set\t2\n", 1332 | "sf\t1\n", 1333 | "sheets\t1\n", 1334 | "short\t2\n", 1335 | "sidgwick\t1\n", 1336 | "similarity\t1\n", 1337 | "simple\t1\n", 1338 | "sinful\t1\n", 1339 | "single\t1\n", 1340 | "small\t1\n", 1341 | "society\t3\n", 1342 | "somewhat\t3\n", 1343 | "sort\t1\n", 1344 | "south\t1\n", 1345 | "spent\t1\n", 1346 | "square\t1\n", 1347 | "star\t1\n", 1348 | "statements\t1\n", 1349 | "states\t1\n", 1350 | "stein\t1\n", 1351 | "stick\t1\n", 1352 | "stickers\t1\n", 1353 | "stories\t2\n", 1354 | "story\t2\n", 1355 | "street\t2\n", 1356 | "study\t1\n", 1357 | "style\t1\n", 1358 | "subject\t1\n", 1359 | "subjects\t1\n", 1360 | "substance\t1\n", 1361 | "subtitled\t1\n", 1362 | "summons\t1\n", 1363 | "supposedly\t1\n", 1364 | "suppressed\t1\n", 1365 | "sure\t1\n", 1366 | "swinburne\t6\n", 1367 | "symbol\t1\n", 1368 | "system\t1\n", 1369 | "take\t1\n", 1370 | "tale\t2\n", 1371 | "technology\t1\n", 1372 | "technologybased\t1\n", 1373 | "telephone\t4\n", 1374 | "temple\t2\n", 1375 | "tendentious\t2\n", 1376 | "terminally\t1\n", 1377 | "terminology\t1\n", 1378 | "theism\t3\n", 1379 | "theists\t1\n", 1380 | "theocracy\t1\n", 1381 | "theres\t1\n", 1382 | "theses\t1\n", 1383 | "think\t1\n", 1384 | "thomas\t1\n", 1385 | "thoughtprovoking\t1\n", 1386 | "thoughts\t1\n", 1387 | "times\t2\n", 1388 | "traces\t1\n", 1389 | "translation\t1\n", 1390 | "tries\t1\n", 1391 | "trilogy\t1\n", 1392 | "true\t1\n", 1393 | "truth\t1\n", 1394 | "try\t1\n", 1395 | "turner\t1\n", 1396 | "twisted\t1\n", 1397 | "tx\t2\n", 1398 | "uh\t1\n", 1399 | "ultimate\t1\n", 1400 | "ultimately\t1\n", 1401 | "unable\t1\n", 1402 | "unbelief\t2\n", 1403 | "uncovering\t1\n", 1404 | "und\t3\n", 1405 | "unfortunately\t1\n", 1406 | "united\t1\n", 1407 | "university\t3\n", 1408 | "unknown\t1\n", 1409 | "unsupportable\t1\n", 1410 | "upon\t1\n", 1411 | "us\t3\n", 1412 | "usa\t4\n", 1413 | 
"usage\t1\n", 1414 | "use\t1\n", 1415 | "used\t2\n", 1416 | "valis\t1\n", 1417 | "values\t1\n", 1418 | "various\t3\n", 1419 | "version\t3\n", 1420 | "versions\t1\n", 1421 | "vicars\t1\n", 1422 | "views\t1\n", 1423 | "volume\t2\n", 1424 | "walter\t1\n", 1425 | "way\t2\n", 1426 | "wc1r\t2\n", 1427 | "well\t2\n", 1428 | "western\t1\n", 1429 | "whether\t1\n", 1430 | "white\t1\n", 1431 | "whose\t1\n", 1432 | "wi\t1\n", 1433 | "wide\t1\n", 1434 | "wired\t1\n", 1435 | "without\t4\n", 1436 | "woman\t1\n", 1437 | "womans\t1\n", 1438 | "women\t1\n", 1439 | "womens\t1\n", 1440 | "word\t1\n", 1441 | "work\t2\n", 1442 | "works\t1\n", 1443 | "world\t1\n", 1444 | "worldview\t2\n", 1445 | "worth\t1\n", 1446 | "wp\t1\n", 1447 | "write\t6\n", 1448 | "writing\t1\n", 1449 | "writings\t1\n", 1450 | "written\t2\n", 1451 | "wrote\t3\n", 1452 | "york\t2\n", 1453 | "youll\t1\n", 1454 | "young\t1\n", 1455 | "zeit\t1\n", 1456 | "zur\t1\n", 1457 | "ÿ\t1\n" 1458 | ], 1459 | "name": "stdout" 1460 | } 1461 | ] 1462 | } 1463 | ] 1464 | } -------------------------------------------------------------------------------- /mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """mapper.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1yCwGyMXJT2qt3_58aLOOiJXO0GIaPcJd 8 | """ 9 | 10 | 11 | 12 | import sys 13 | import io 14 | import re 15 | import nltk 16 | nltk.download('stopwords',quiet=True) 17 | from nltk.corpus import stopwords 18 | punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~''' 19 | 20 | stop_words = set(stopwords.words('english')) 21 | input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin1') 22 | for line in input_stream: 23 | line = line.strip() 24 | line = re.sub(r'[^\w\s]', '',line) 25 | line = line.lower() 26 | for x in line: 27 | if x in punctuations: 28 | line=line.replace(x, " ") 29 | 30 | words=line.split() 31 | for word in words: 32 | if word not in stop_words: 33 | print('%s\t%s' % (word, 1)) -------------------------------------------------------------------------------- /reducer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """reducer.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1YzJ-vUsO5VYCyMrfPMow3s2IdxXkyQ0i 8 | """ 9 | 10 | from operator import itemgetter 11 | import sys 12 | 13 | current_word = None 14 | current_count = 0 15 | word = None 16 | 17 | # input comes from STDIN 18 | for line in sys.stdin: 19 | # remove leading and trailing whitespace 20 | line = line.strip() 21 | line=line.lower() 22 | 23 | # parse the input we got from mapper.py 24 | word, count = line.split('\t', 1) 25 | try: 26 | count = int(count) 27 | except ValueError: 28 | #count was not a number, so silently 29 | #ignore/discard this line 30 | continue 31 | 32 | # this IF-switch only works because Hadoop sorts map output 33 | # by key (here: word) before it is passed to the reducer 34 | if current_word == word: 35 | current_count += count 36 | else: 37 | if current_word: 38 | # write result to STDOUT 39 | print ('%s\t%s' % (current_word, current_count)) 40 | current_count = count 41 | current_word = word 42 | 43 | # do not forget to output the last word if needed! 
44 | if current_word == word: 45 | print('%s\t%s' % (current_word, current_count)) 46 | 47 | 48 | 49 | --------------------------------------------------------------------------------
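### Testing mapper.py and reducer.py locally
Hadoop Streaming pipes each input split to mapper.py on STDIN and then feeds the sorted mapper output to reducer.py on STDIN, so the two scripts can be sanity-checked with an ordinary shell pipeline before submitting a streaming job. The command below is a minimal sketch, assuming Python 3 with nltk installed, both scripts in the current directory, and sample.txt standing in for any plain-text input file (a hypothetical name):

    cat sample.txt | python mapper.py | sort | python reducer.py

The sort step stands in for Hadoop's shuffle-and-sort phase: reducer.py only emits a word's total when the incoming key changes, so unsorted input would produce duplicate partial counts.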