├── .ipynb_checkpoints ├── Hadoop Installation-checkpoint.ipynb └── Hadoop Tutorial-checkpoint.ipynb ├── Big-Data-Essentials ├── Week1 - Distributed File Systems │ ├── 1 Unix Command Line Interface │ │ ├── 1 File system exploration.pdf │ │ ├── 2 File system managing.pdf │ │ ├── 3 File content exploration 1.pdf │ │ ├── 4 File content exploration 2.pdf │ │ ├── 5 Processes.pdf │ │ └── Basic bash commands.csv │ ├── 2 HDFS Architechture and Scalability │ │ ├── 1 Scaling Distributed File System.pdf │ │ ├── 2 Block and Replica Stated and Recovery Process 1.pdf │ │ ├── 3 Block and Replica Stated and Recovery Process 2.pdf │ │ ├── 4 HDFS Client.pdf │ │ ├── 5 Web UI and REST API.pdf │ │ ├── 6 Namenode Architechture.pdf │ │ ├── Gentle Introduction into curl │ │ └── hadoop-hdfs-commands-cheatsheet.pdf │ └── 3 File Types │ │ ├── 1 Introduction.pdf │ │ ├── 2 Text formats.pdf │ │ ├── 3 Binary formats 1.pdf │ │ ├── 4 Binary formats 2.pdf │ │ ├── 5 Compression.pdf │ │ └── File formats extra ├── Week2 - Solving Problems with MapReduce │ ├── 1 Unreliable Components │ │ ├── 1 Unreliable Components 1.pdf │ │ ├── 2 Unreliable Components 2.pdf │ │ ├── 3 MapReduce.pdf │ │ ├── 4 Distributed Shell.pdf │ │ └── 5 Fault Tolerance.pdf │ ├── 2 Streaming Applications in Python │ │ ├── 1 Streaming.pdf │ │ ├── 2 Streaming in Python.pdf │ │ ├── 3 WordCount in Python.pdf │ │ ├── 4 Distributed Cache.pdf │ │ ├── 5 Environment Counters.pdf │ │ └── 6 Testing.pdf │ └── 3 Job Configuration Comparator Partitioner │ │ ├── 1 Combiner.pdf │ │ ├── 2 Partitioner.pdf │ │ ├── 3 Comparator.pdf │ │ ├── 4 Speculative Execution and Backup Tasks.pdf │ │ └── 5 Compression.pdf ├── Week3 - Solving Problems with MapReduce - Practice │ ├── NameCount.ipynb │ ├── StopWordsTask2.ipynb │ ├── WordGroup.ipynb │ └── WordsRatingTask1.ipynb ├── Week4 - Introduction to Apache Spark │ ├── 1 Core concepts and abstractions │ │ ├── 1 Welcome.pdf │ │ ├── 2 RDDs.pdf │ │ ├── 3 Transformations 1.pdf │ │ ├── 4 Transformations 2.pdf │ │ ├── 5 Actions.pdf │ │ └── 6 Resiliency.pdf │ ├── 2 Advanced topics │ │ ├── 1 Execution and Scheduling.pdf │ │ ├── 2 Caching and Persistence.pdf │ │ ├── 3 Broadcast variables.pdf │ │ └── 4 Accumulator variables.pdf │ ├── 3 Working with Spark in Python │ │ ├── 1 Getting started with Spark and Python.pdf │ │ ├── 2 Working with text files.pdf │ │ ├── 3 Joins.pdf │ │ ├── 4 Broadcast and Accumulator variables.pdf │ │ ├── 5 Spark UI.pdf │ │ └── 6 Cluster mode.pdf │ └── Dropped Text.txt ├── Week5 - Introduction to Apache Spark - Practice │ ├── Spakr+Assignment1+-+Pairs.ipynb │ ├── Spark+Assignment1+-+Pairs.ipynb │ └── Spark+Assignment2+-+Collocations.ipynb ├── Week6 - Real-World Applications │ ├── 1 Working with samples │ │ ├── Working-with-samples.ipynb │ │ ├── data_dictionary_trip_records_yellow.pdf │ │ ├── sample100.csv │ │ └── sample10000.csv │ ├── 2 Telecommunications Analytics │ │ ├── 1 Map and Reduce Side Joins.pdf │ │ ├── 2 Tabular Data and KeyFieldSelection.pdf │ │ └── 3 Data Skew and Salting.pdf │ ├── 3 Working with social graphs │ │ ├── 1 Twitter graph case study.pdf │ │ └── 2 Shortest path.pdf │ ├── Shortest_path.ipynb │ └── Tf-Idf.ipynb ├── data_dictionary_trip_records_yellow.pdf └── sample10000.csv ├── Hadoop Installation.ipynb ├── Hadoop Tutorial.ipynb ├── LICENSE └── README.md /.ipynb_checkpoints/Hadoop Installation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 
| { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "'Hadoop Installation.ipynb' LICENSE\r\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# use '!' to access shell commands in the notebook\n", 18 | "!ls" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Installing Prerequisites" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Start with installing prerequisites:\n", 33 | "\n", 34 | "### JAVA-JDK" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "[sudo] password for incursio: \n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "!sudo apt-get install default-java" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "In jupyter notebook, ``sudo`` command will then require PASS_KEY but you won't be able to ENTER your password. Therefore, below two are methods to input password to run ``sudo`` command." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "#### use echo to give your PASS_KEY as an input" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 34, 71 | "metadata": { 72 | "scrolled": true 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading package lists... Done \n", 80 | "Building dependency tree \n", 81 | "Reading state information... Done\n", 82 | "default-jdk is already the newest version (2:1.10-63ubuntu1~02).\n", 83 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "!echo 'XXXXXXXX' | sudo -S apt-get install default-jdk" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "#### save you password in a file and use it to provide PASS_KEY" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 35, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Reading package lists... Done \n", 108 | "Building dependency tree \n", 109 | "Reading state information... Done\n", 110 | "default-jdk is already the newest version (2:1.10-63ubuntu1~02).\n", 111 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "!sudo -S apt-get install default-jdk < /path/to/file" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Add a Dedicated hadoop user\n", 124 | "\n", 125 | "It'll create a new user to install/run hadoop keeping it separated from other user accounts." 
126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 53, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "[sudo] password for incursio: Sorry, try again.\n", 140 | "[sudo] password for incursio: \n", 141 | "sudo: 1 incorrect password attempt\n", 142 | "[sudo] password for incursio: Sorry, try again.\n", 143 | "[sudo] password for incursio: \n", 144 | "sudo: 1 incorrect password attempt\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "#!sudo -S addgroup hadoop < /path/to/file\n", 150 | "#!sudo adduser --ingroup hadoop hduser < path/to/file\n", 151 | "\n", 152 | "PASS_KEY = 'XXXXXXX'\n", 153 | "!echo $PASS_KEY | sudo -S addgroup hadoop\n", 154 | "!echo $PASS_KEY | sudo -S adduser --ingroup hadoop hduser " 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Configuring SSH" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 48, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "su: must be run from a terminal\r\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "!echo PASS_KEY | su - hduser" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Now, to execute following commands you will have to open the terminal.\n", 186 | "\n", 187 | "**Copy the following cell(s)**\n", 188 | "\n", 189 | "The below command will transfer the terminal access to newly created ``hduser``" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "su - hduser" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Hadoop requires an SSH access to manage nodes present all over the cluster. This command will generate an SSH key with empty(string) password.\n", 206 | "In general, it's not recommended to use empty(string) password, but since we don't want to enter the passphrase each time Hadoop connects to its nodes therefore, **leave it empty**." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "ssh-keygen -t rsa -P \"\"" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "The command creates a new file and appends generated key to it. " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "cat .ssh/id_rsa.pub >> .ssh/authorized_keys" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Now we'll need root access through ``hduser``, thus we'll add ``hduser`` to the list of sudoers." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "#open nano to edit /etc/sudoers.tmp\n", 248 | "sudo visudo\n", 249 | "\n", 250 | "#and append the following at the EOF\n", 251 | "hduser ALL=(ALL:ALL) ALL" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "We want to disable IPv6." 
259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "#open system call configuration file\n", 268 | "sudo gedit /etc/sysctl.conf\n", 269 | "\n", 270 | "#and append the following at the EOF\n", 271 | "#disable ipv6 \n", 272 | "net.ipv6.conf.all.disable_ipv6 = 1 \n", 273 | "net.ipv6.conf.default.disable_ipv6 = 1 \n", 274 | "net.ipv6.conf.lo.disable_ipv6 = 1" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Now, **reboot** system. On boot, check whether the ipv6 has been disabled." 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "#it should return 1\n", 291 | "cat /proc/sys/net/ipv6/conf/all/disable_ipv6" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Install hadoop" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Run the following cell(s) in the terminal with access to ``hduser``" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Download Hadoop" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#change directory\n", 322 | "cd /usr/local\n", 323 | "\n", 324 | "#download hadoop 3.1 in this directory\n", 325 | "#to download other/newer version check the link http://www-eu.apache.org/dist/hadoop/core/\n", 326 | "wget http://www-eu.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz\n", 327 | "\n", 328 | "#extract the tar file\n", 329 | "sudo tar xzf hadoop-3.1.0.tar.gz\n", 330 | "\n", 331 | "#rename it to hadoop\n", 332 | "sudo mv hadoop-3.1.0 hadoop\n", 333 | "\n", 334 | "#change the owner of files to hduser\n", 335 | "sudo chown -R hduser:hadoop hadoop" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Set environment variables" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "Set hadoop and java home environment variables" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "#Open the editor\n", 359 | "sudo gedit ~/.bashrc\n", 360 | "\n", 361 | "#and append the following lines at the EOF\n", 362 | "\n", 363 | "export HADOOP_HOME=/usr/local/hadoop\n", 364 | "export JAVA_HOME=/usr/lib/jvm/default-java\n", 365 | "\n", 366 | "# Some convenient aliases and functions for running Hadoop-related commands \n", 367 | "unalias fs &> /dev/null \n", 368 | "alias fs=\"hadoop fs\" \n", 369 | "unalias hls &> /dev/null \n", 370 | "alias hls=\"fs -ls\" \n", 371 | "\n", 372 | "# Add Hadoop bin/ directory to PATH \n", 373 | "export PATH=$PATH:$HADOOP_HOME/bin\n", 374 | "# Add Hadoop sbin/ directory to PATH \n", 375 | "export PATH=$PATH:$HADOOP_HOME/sbin" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "Now edit the ``hadoop-env.sh`` and update JAVA_HOME" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "#You know the drill\n", 392 | "sudo gedit $HADOOP_HOME/etc/hadoop/hadoop-env.sh\n", 393 | "\n", 394 | "#update JAVA_HOME (don't 
append, instead search for likewise line of code, it might be in the comments!)\n", 395 | "export JAVA_HOME=/usr/lib/jvm/default-java\n", 396 | "\n", 397 | "#you can also update HADOOP_HOME (not necessary)\n", 398 | "export HADOOP_HOME=/usr/local/hadoop" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "## Start Hadoop Cluster" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "### Standalone Mode" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "In this mode, hadoop will be set to run in a non-distributed mode, as a single java process. Using this mode we can check whether the installation is upto-mark." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "#create a directory to store input files\n", 429 | "mkdir $HADOOP_HOME/input\n", 430 | "\n", 431 | "#now to verify no-errors in the installation, we will run a sample using example jar file\n", 432 | "#copy all xml files to the newly created directory\n", 433 | "cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input\n", 434 | "\n", 435 | "#1st argument is the /path/to/hadoop command (required to run MapReduce)\n", 436 | "#2nd argument is jar, specifying MapReduce is in JAVA archive\n", 437 | "#3rd argument is come along MapReduce example, it name of jar can be version specific (check your file/version)\n", 438 | "#4th argument is grep, to execute regular expression example\n", 439 | "#5th argument is input directory, containing all the .xml files\n", 440 | "#6th argument is output directory, which will be created and will contain output files\n", 441 | "#7th argument is 'dfs[a-z.]+', basically the string to be searched\n", 442 | "$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.0.jar grep input output 'dfs[a-z.]+'" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "### Pseudo Distributed Mode" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "In this mode, hadoop runs on a single node in a pseudo distributed mode where each hadoop daemon run as separate java process." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "#### Configuring site xml(s)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "Create the ``tmp`` directory" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "sudo mkdir -p /app/hadoop/tmp\n", 480 | "sudo chown hduser:hadoop /app/hadoop/tmp" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Now edit ``core-site.xml`` and ``hdfs-site.xml``. 
You'll find these files in **$HADOOP_HOME/etc/hadoop** directory.\n", 488 | "\n", 489 | "Start with **core-site.xml**" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "sudo gedit $HADOOP_HOME/etc/hadoop/core-site.xml\n", 499 | "\n", 500 | "#paste these lines between tags\n", 501 | "\n", 502 | " hadoop.tmp.dir\n", 503 | " /app/hadoop/tmp\n", 504 | " A base for other temporary directories.\n", 505 | "\n", 506 | "\n", 507 | "\n", 508 | " fs.defaultFS\n", 509 | " hdfs://localhost:9000\n", 510 | " The name of the default file system. A URI whose\n", 511 | " scheme and authority determine the FileSystem implementation. The\n", 512 | " uri's scheme determines the config property (fs.SCHEME.impl) naming\n", 513 | " the FileSystem implementation class. The uri's authority is used to\n", 514 | " determine the host, port, etc. for a filesystem.\n", 515 | "" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "**hdfs-site.xml**" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "sudo gedit $HADOOP_HOME/etc/hadoop/hdfs-site.xml\n", 532 | "\n", 533 | "#paste these lines between tags\n", 534 | "\n", 535 | " dfs.replication\n", 536 | " 1\n", 537 | " Default block replication.\n", 538 | " The actual number of replications can be specified when the file is created.\n", 539 | " The default is used if replication is not specified in create time.\n", 540 | " \n", 541 | "" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "Format namenode (you'll need to do this only the first time you set up hadoop cluster i.e, the time of installation)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "#On running this command, you'll get the o/p with SHUTDOWN_MSG at the end.\n", 558 | "#Don't worry it's not an error\n", 559 | "$HADOOP_HOME/bin/hdfs namenode -format" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "Now it's time to start the HADOOP CLUSTER!!!" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "#start the namenode and datanode daemon\n", 576 | "$HADOOP_HOME/sbin/start-dfs.sh" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "Yeah, it's done!" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "You can check the cluster nodes" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "#run this command\n", 600 | "jps\n", 601 | "\n", 602 | "#also you can browse the namenode web interface on this link\n", 603 | "http://localhost:9870/" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "Now create the HDFS directories which we'll need to execute MapReduce jobs." 
611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "$HADOOP_HOME/bin/hdfs " 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "To stop the network" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "$HADOOP_HOME/sbin/stop-dfs.sh" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "#### YARN on a single node" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "Open the ``.bashrc`` and append other environment variables." 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "sudo gedit ~/.bashrc\n", 659 | "\n", 660 | "#and append the following\n", 661 | "export HADOOP_MAPRED_HOME=${HADOOP_HOME}\n", 662 | "export HADOOP_COMMON_HOME=${HADOOP_HOME}\n", 663 | "export HADOOP_HDFS_HOME=${HADOOP_HOME}\n", 664 | "export HADOOP_YARN_HOME=${HADOOP_HOME}" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | " **mapred-site.xml**" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "sudo gedit $HADOOP_HOME/etc/hadoop/mapred-site.xml\n", 681 | "\n", 682 | "#paste these lines between tags\n", 683 | "\n", 684 | " mapred.job.tracker\n", 685 | " localhost:54311\n", 686 | " The host and port that the MapReduce job tracker runs\n", 687 | " at. If \"local\", then jobs are run in-process as a single map\n", 688 | " and reduce task.\n", 689 | " \n", 690 | "" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "**yarn-site.xml**" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "sudo gedit $HADOOP_HOME/etc/hadoop/yarn-site.xml\n", 707 | "\n", 708 | "#paste these lines between tags\n", 709 | "\n", 710 | " yarn.nodemanager.aux-services\n", 711 | " mapreduce_shuffle\n", 712 | "\n", 713 | "\n", 714 | " yarn.nodemanager.env-whitelist\n", 715 | " JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME\n", 716 | "" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "**To check the distributed filesystem working properly**" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "#Type and check the results for the following commands\n", 733 | "#list of files and directories on your distributed filesystem\n", 734 | "hdfs dfs -ls\n", 735 | "\n", 736 | "#now create a relative path\n", 737 | "hdfs dfs -mkdir /user\n", 738 | "hdfs dfs -mkdir /user/\n", 739 | "\n", 740 | "#relative and absolute path\n", 741 | "hdfs dfs -mkdir /cluster #----> This directory will be created in your dfs home i.e., where directory user is!\n", 742 | "\n", 743 | "hdfs dfs -mkdir cluster #----> This will be created inside /user/\n", 744 | "\n", 745 | "#You can view the added directories in the WebUI too\n", 746 | "#browse localhost:9870\n", 747 | "\n", 748 | "#and check the option Utilities -> Browse 
filesystem\n", 749 | "#it displays some webhdfs Server Error [This error is shown for the java versions >=9]\n", 750 | "#open and edit hadoop-env.sh\n", 751 | "export HADOOP_OPTS=\"--add-modules java.activation\"" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "**I hope your hadoop distributed file system is working fine!**\n", 759 | "\n", 760 | "**In case of trouble or mistake in the code, notify me!**\n", 761 | "\n", 762 | "Now, we can move further and learn the basic commands and their usage in hadoop filesystem. Tutorial is available [HERE](https://github.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/blob/master/Hadoop%20Tutorial.ipynb)" 763 | ] 764 | } 765 | ], 766 | "metadata": { 767 | "kernelspec": { 768 | "display_name": "Python 3", 769 | "language": "python", 770 | "name": "python3" 771 | }, 772 | "language_info": { 773 | "codemirror_mode": { 774 | "name": "ipython", 775 | "version": 3 776 | }, 777 | "file_extension": ".py", 778 | "mimetype": "text/x-python", 779 | "name": "python", 780 | "nbconvert_exporter": "python", 781 | "pygments_lexer": "ipython3", 782 | "version": "3.6.1" 783 | }, 784 | "toc": { 785 | "base_numbering": 1, 786 | "nav_menu": {}, 787 | "number_sections": true, 788 | "sideBar": true, 789 | "skip_h1_title": false, 790 | "title_cell": "Table of Contents", 791 | "title_sidebar": "Contents", 792 | "toc_cell": false, 793 | "toc_position": { 794 | "height": "calc(100% - 180px)", 795 | "left": "10px", 796 | "top": "150px", 797 | "width": "165px" 798 | }, 799 | "toc_section_display": true, 800 | "toc_window_display": true 801 | } 802 | }, 803 | "nbformat": 4, 804 | "nbformat_minor": 2 805 | } 806 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/Hadoop Tutorial-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.1" 28 | }, 29 | "toc": { 30 | "base_numbering": 1, 31 | "nav_menu": {}, 32 | "number_sections": true, 33 | "sideBar": true, 34 | "skip_h1_title": false, 35 | "title_cell": "Table of Contents", 36 | "title_sidebar": "Contents", 37 | "toc_cell": false, 38 | "toc_position": {}, 39 | "toc_section_display": true, 40 | "toc_window_display": false 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 2 45 | } 46 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/1 File system exploration.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/1 File system exploration.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix 
Command Line Interface/2 File system managing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/2 File system managing.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/3 File content exploration 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/3 File content exploration 1.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/4 File content exploration 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/4 File content exploration 2.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/5 Processes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/5 Processes.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/1 Unix Command Line Interface/Basic bash commands.csv: -------------------------------------------------------------------------------- 1 | Command,Description 2 | awk,"""Aho, Weinberger and Kernigan"", Bell Labs, 1970s. Interpreted programming language for text processing." 3 | awk -F,(see above) + Set the field separator. 4 | cat,"Display the contents of a file at the command line, is also used to copy and or append text files into a document. Named after its function to con-cat-enate files." 5 | cd,Change the current working directory. Also known as chdir (change directory). 6 | cd /,Change the current directory to root directory. 7 | cd ..,Change the current directory to parent directory. 8 | cd ~,Change the current directory to your home directory. 9 | cp,Make copies of files and directories. 10 | cp -r,Copy directories recursively. 11 | cut,"Drop sections of each line of input by bytes, characters, or fields, separated by a delimiter (the tab character by default)." 12 | cut -d -f,"-d is for delimiter instead of tab character, -f select only those fields (ex.: “cut -d “,“ -f1 multilined_file.txt” - will mean that we select only the first field from each comma-separated line in the file)" 13 | du,Estimate (and display) the file space usage - space used under a particular directory or files on a file system. 14 | df,Display the amount of available disk space being used by file systems. 15 | df -h,Use human readable format. 
16 | free,Display the total amount of free and used memory (use vm_stat instead on MacOS). 17 | free -m,Display the amount of memory in megabytes. 18 | free -g,Display the amount of memory in gigabytes. 19 | grep,"Process text and print any lines which match a regular expression (""global regular expression print"")" 20 | head,"Print the beginning of a text file or piped data. By default, outputs the first 10 lines of its input to the command line." 21 | head -n,Output the first n lines of input data (ex.: “head -5 multilined_file.txt”). 22 | kill,Send a signal to kill a process. The default signal for kill is TERM (which will terminate the process). 23 | less,"Is similar to more, but has the extended capability of allowing both forward and backward navigation through the file." 24 | ls,List the contents of a directory. 25 | ls -l,"List the contents of a directory + use a long format, displaying Unix file types, permissions, number of hard links, owner, group, size, last-modified date and filename." 26 | ls -lh,"List the contents of a directory + print sizes in human readable format. (e.g. 1K, 234M, 2G, etc.)" 27 | ls -lS,Sort by file size 28 | man,"Display the manual pages which provide documentation about commands, system calls, library routines and the kernel." 29 | mkdir,"Create a directory on a file system (""make directory"")" 30 | more,Display the contents of a text file one screen at a time. 31 | mv,Rename files or directories or move them to a different directory. 32 | nice,Run a command with a modified scheduling priority. 33 | ps,"Provide information about the currently running processes, including their process identification numbers (PIDs) (""process status"")." 34 | ps a,Select all processes except both session leaders and processes not associated with a terminal. 35 | pwd,"Abbreviated from ""print working directory"", pwd writes the full pathname of the current working directory." 36 | rm,Remove files or directories. 37 | rm -r,Remove directories and their contents recursively. 38 | sort,Sort the contents of a text file. 39 | sort -r,Sort the output in the reverse order. Reverse means to reverse the result of comparisons. 40 | sort -k,"-k or --key=POS1[,POS2] Start a key at POS1 (origin 1), end it at POS2 (default end of the line) (ex.: “sort -k2,2 multilined_file.txt”)." 41 | sort -n,Compare according to string numerical value. 42 | tail,"Print the tail end of a text file or piped data. By default, outputs the last 10 lines of its input to the command line." 43 | tail -n,Output the last n lines of input data (ex.: “tail -2 multilined_file.txt”). 44 | top,"Produce an ordered list of running processes selected by user-specified criteria, and updates it periodically." 45 | touch,Update the access date and/or modification date of a file or directory or create an empty file. 46 | tr,"Replace or remove specific characters in its input data set (""translate"")." 47 | tr -d,"Delete characters, do not translate." 48 | vim,"Is a text editor (""vi improved""). It can be used for editing any kind of text and is especially suited for editing computer programs." 49 | wc,"Print a count of lines, words and bytes for each input file (""word count"")" 50 | wc -c,Print only the number of characters. 51 | wc -l,Print only the number of lines.
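
The commands in this cheat sheet are most useful when chained together with pipes. The short shell session below is only an illustrative sketch: it assumes a comma-separated input file named sample10000.csv (such as the taxi-trip sample included in this repository), and the field number passed to cut is a placeholder to adapt to the actual column layout.

# inspect the beginning of the file to understand its structure
head -5 sample10000.csv

# count the number of lines in the file
wc -l sample10000.csv

# extract the first comma-separated field, count how often each distinct
# value occurs and show the 10 most frequent ones
# (uniq is not listed in the table above; it collapses adjacent identical
# lines, which is why the input is sorted first)
cut -d "," -f1 sample10000.csv | sort | uniq -c | sort -rn | head -10

# keep only the lines that match a pattern and count them
grep "2016" sample10000.csv | wc -l
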
52 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/1 Scaling Distributed File System.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/1 Scaling Distributed File System.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/2 Block and Replica Stated and Recovery Process 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/2 Block and Replica Stated and Recovery Process 1.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/3 Block and Replica Stated and Recovery Process 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/3 Block and Replica Stated and Recovery Process 2.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/4 HDFS Client.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/4 HDFS Client.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/5 Web UI and REST API.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/5 Web UI and REST API.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/6 Namenode Architechture.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/6 Namenode Architechture.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/Gentle Introduction into curl: -------------------------------------------------------------------------------- 1 | >> curl is a tool to 
transfer data from or to a server, using one of the supported protocols (DICT, FILE, FTP, FTPS, GOPHER, HTTP, HTTPS, IMAP, IMAPS, LDAP, LDAPS, POP3, POP3S, RTMP, RTSP, SCP, SFTP, SMB, SMBS, SMTP, SMTPS, TELNET and TFTP) 2 | 3 | We are concerned about HTTP / HTTPS protocols 4 | 5 | >> The command is designed to work without user interaction. 6 | 7 | The command is designed to work without user interference. Thus there is no need to open a browser, type urls and download them manually. 8 | 9 | >> curl offers a busload of useful tricks like proxy support, user authentication, FTP upload, HTTP post, SSL connections, cookies, file transfer resume, Metalink, and more… the number of features will make your head spin! 10 | 11 | The best way to understand how to use curl is to take a look at the examples (and try them at home). For example, if you want to download 5 Web pages from the internet you can do it manually: open your browser, type url, save a page, then repeat 4 more times. Or you can easily do it via the command line: 12 | 13 | 14 | 15 | # download one Website page 16 | 17 | $ curl https://www.wikipedia.org 18 | 19 | ... 20 | 21 | content of the main page wikipedia.org 22 | 23 | ... 24 | 25 | 26 | 27 | # download one Website page and save output to a file “wiki.html” 28 | 29 | $ curl https://www.wikipedia.org -o wiki.html 30 | 31 | ... progress bar ... 32 | 33 | 34 | 35 | # download several Website pages and save to the appropriate files 36 | 37 | $ curl https://www.wikipedia.org -o wiki.html https://www.coursera.org/ -o coursera.html 38 | 39 | % Total % Received % Xferd Average Speed Time Time Time Current 40 | 41 | Dload Upload Total Spent Left Speed 42 | 43 | 100 86413 100 86413 0 0 127k 0 --:--:-- --:--:-- --:--:-- 127k 44 | 45 | 100 355k 100 355k 0 0 155k 0 0:00:02 0:00:02 --:--:-- 206k 46 | 47 | 48 | 49 | Imagine how much time you will save by downloading thousands of web pages this way. 50 | 51 | Two flags “-i” and “-L” are very handy for this course. They will help you to understand video materials and solve practical assignments. 52 | 53 | -i, --include 54 | 55 | Include the HTTP response headers in the output. The HTTP response headers can include things like server name, cookies, date of the document, HTTP version and more… 56 | 57 | -L, --location 58 | 59 | (HTTP) If the server reports that the requested page has moved to a different location (indicated with a Location: header and a 3XX response code), this option will make curl redo the request on the new place. If used together with -i, --include or -I, --head, headers from all requested pages will be shown. 60 | 61 | … 62 | 63 | A few examples to familiarize yourself with the usage: 64 | 65 | 66 | 67 | $ curl http://www.google.com 68 | 69 | 70 | 71 | 302 Moved 72 | 73 |

302 Moved

74 | 75 | The document has moved 76 | 77 | here. 78 | 79 | 80 | 81 | 82 | 83 | # print headers 84 | 85 | $ curl -i http://www.google.com 86 | 87 | HTTP/1.1 302 Found 88 | 89 | Cache-Control: private 90 | 91 | Content-Type: text/html; charset=UTF-8 92 | 93 | Referrer-Policy: no-referrer 94 | 95 | Location: http://www.google.ru/?gfe_rd=cr&dcr=0&ei=qO-zWe-zL8bG7gSt5aLQAw 96 | 97 | Content-Length: 268 98 | 99 | Date: Sat, 09 Sep 2017 13:42:00 GMT 100 | 101 | 102 | 103 | ... The document has moved ... 104 | 105 | 106 | 107 | # you can either follow redirects manually: 108 | 109 | $ curl http://www.google.ru/?gfe_rd=cr&dcr=0&ei=qO-zWe-zL8bG7gSt5aLQAw 110 | 111 | ... some HTML output ... 112 | 113 | 114 | 115 | # or follow redirects automatically 116 | 117 | $ curl -L http://www.google.com 118 | 119 | ... the same HTML output ... 120 | 121 | 122 | 123 | This should be the end of our gentle introduction into “curl”. By the way, do you know that you can provide “like / dislike” for any learning item? Your feedback is highly welcome as it helps us to make the content even better for you. If you have any other suggestions or ideas how to improve the content please provide them via “report problem” ---> “content improvement”. -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/hadoop-hdfs-commands-cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/2 HDFS Architechture and Scalability/hadoop-hdfs-commands-cheatsheet.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/1 Introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/1 Introduction.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/2 Text formats.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/2 Text formats.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/3 Binary formats 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/3 Binary formats 1.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/4 Binary formats 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - 
Distributed File Systems/3 File Types/4 Binary formats 2.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/5 Compression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/5 Compression.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week1 - Distributed File Systems/3 File Types/File formats extra: -------------------------------------------------------------------------------- 1 | As you have learned from the videos, there are plenty of file formats available in the Hadoop ecosystem. 2 | 3 | Here is a brief list of extra links that may be helpful: 4 | 5 | CSV library for Python https://docs.python.org/2/library/csv.html 6 | JSON library for Python https://docs.python.org/2/library/json.html 7 | Apache Avro website http://avro.apache.org/ 8 | RCFile paper http://web.cse.ohio-state.edu/hpcs/WWW/HTML/publications/papers/TR-11-4.pdf 9 | Apache Parquet website https://parquet.apache.org/ 10 | If you happen to find any other helpful material, please post it to the forums. 11 | 12 | Cheers! -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/1 Unreliable Components 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/1 Unreliable Components 1.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/2 Unreliable Components 2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/2 Unreliable Components 2.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/3 MapReduce.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/3 MapReduce.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/4 Distributed Shell.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/4 Distributed Shell.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable 
Components/5 Fault Tolerance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/1 Unreliable Components/5 Fault Tolerance.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/1 Streaming.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/1 Streaming.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/2 Streaming in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/2 Streaming in Python.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/3 WordCount in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/3 WordCount in Python.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/4 Distributed Cache.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/4 Distributed Cache.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/5 Environment Counters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/5 Environment Counters.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/6 Testing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/2 Streaming Applications in Python/6 Testing.pdf -------------------------------------------------------------------------------- 
/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/1 Combiner.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/1 Combiner.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/2 Partitioner.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/2 Partitioner.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/3 Comparator.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/3 Comparator.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/4 Speculative Execution and Backup Tasks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/4 Speculative Execution and Backup Tasks.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/5 Compression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week2 - Solving Problems with MapReduce/3 Job Configuration Comparator Partitioner/5 Compression.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week3 - Solving Problems with MapReduce - Practice/NameCount.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 27, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Overwriting mapper1.py\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%writefile mapper1.py\n", 18 | "\n", 19 | "# Your code for mapper here.\n", 20 | "import sys\n", 21 | "import re\n", 22 | "\n", 23 | "reload(sys)\n", 24 | "sys.setdefaultencoding('utf-8') # required to convert to unicode\n", 25 | "\n", 26 | "for line in sys.stdin:\n", 27 | " try:\n", 28 | " article_id, text = unicode(line.strip()).split('\\t', 1)\n", 29 | " except ValueError as e:\n", 30 | " continue\n", 31 | " words = 
re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 32 | " for word in words:\n", 33 | " print >> sys.stderr, \"reporter:counter:Wiki stats,Total words,%d\" % 1\n", 34 | " if ((word[1:].islower()) and (word[0].isupper()) and (not word[0].isdigit())):\n", 35 | " print \"%s\\t%d\\t%d\" % (word.lower(), 1, 1)\n", 36 | " else:\n", 37 | " print \"%s\\t%d\\t%d\" % (word.lower(), 0, 1)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 28, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Overwriting reducer1.py\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "%%writefile reducer1.py\n", 55 | "\n", 56 | "# Your code for reducer here.\n", 57 | "import sys\n", 58 | "\n", 59 | "current_key = None\n", 60 | "word_name_sum = 0\n", 61 | "word_total_sum = 0\n", 62 | "\n", 63 | "for line in sys.stdin:\n", 64 | " try:\n", 65 | " key, count_name, count_total = line.strip().split('\\t', 2)\n", 66 | " count_name = int(count_name)\n", 67 | " count_total = int(count_total)\n", 68 | " except ValueError as e:\n", 69 | " continue\n", 70 | " if current_key != key:\n", 71 | " if current_key:\n", 72 | " print \"%s\\t%d\\t%d\" % (current_key, word_name_sum, word_total_sum)\n", 73 | " word_name_sum = 0\n", 74 | " word_total_sum = 0\n", 75 | " current_key = key\n", 76 | " word_name_sum += count_name\n", 77 | " word_total_sum += count_total\n", 78 | "\n", 79 | "if current_key:\n", 80 | " print \"%s\\t%d\\t%d\" % (current_key, word_name_sum, word_total_sum)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 29, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "Overwriting mapper2.py\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "%%writefile mapper2.py\n", 98 | "\n", 99 | "import sys\n", 100 | "reload(sys)\n", 101 | "\n", 102 | "for line in sys.stdin:\n", 103 | " try:\n", 104 | " word, count_name, count_total = line.strip().split('\\t', 2)\n", 105 | " count_name = int(count_name)\n", 106 | " count_total = int(count_total)\n", 107 | " if float(count_name)/count_total > 0.995:\n", 108 | " print '%d\\t%s' % (count_name, word)\n", 109 | " except ValueError as e:\n", 110 | " continue" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 30, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Overwriting reducer2.py\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "%%writefile reducer2.py\n", 128 | "\n", 129 | "import sys\n", 130 | "reload(sys)\n", 131 | "\n", 132 | "for line in sys.stdin:\n", 133 | " try:\n", 134 | " count, word = line.strip().split('\\t', 1)\n", 135 | " count = int(count)\n", 136 | " print '%s\\t%d' % (word, count)\n", 137 | " except ValueError as e:\n", 138 | " continue" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 31, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "french\t5741\n" 151 | ] 152 | }, 153 | { 154 | "name": "stderr", 155 | "output_type": "stream", 156 | "text": [ 157 | "rm: `assignment1_wordcount_1534517230823602': No such file or directory\n", 158 | "18/08/17 14:47:14 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 159 | "18/08/17 14:47:14 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 160 | "18/08/17 14:47:14 INFO mapred.FileInputFormat: Total input 
files to process : 1\n", 161 | "18/08/17 14:47:14 INFO mapreduce.JobSubmitter: number of splits:2\n", 162 | "18/08/17 14:47:15 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534512728888_0013\n", 163 | "18/08/17 14:47:15 INFO impl.YarnClientImpl: Submitted application application_1534512728888_0013\n", 164 | "18/08/17 14:47:15 INFO mapreduce.Job: The url to track the job: http://d1967598ec2c:8088/proxy/application_1534512728888_0013/\n", 165 | "18/08/17 14:47:15 INFO mapreduce.Job: Running job: job_1534512728888_0013\n", 166 | "18/08/17 14:47:20 INFO mapreduce.Job: Job job_1534512728888_0013 running in uber mode : false\n", 167 | "18/08/17 14:47:20 INFO mapreduce.Job: map 0% reduce 0%\n", 168 | "18/08/17 14:47:36 INFO mapreduce.Job: map 24% reduce 0%\n", 169 | "18/08/17 14:47:42 INFO mapreduce.Job: map 36% reduce 0%\n", 170 | "18/08/17 14:47:48 INFO mapreduce.Job: map 43% reduce 0%\n", 171 | "18/08/17 14:47:54 INFO mapreduce.Job: map 50% reduce 0%\n", 172 | "18/08/17 14:48:00 INFO mapreduce.Job: map 62% reduce 0%\n", 173 | "18/08/17 14:48:06 INFO mapreduce.Job: map 67% reduce 0%\n", 174 | "18/08/17 14:48:10 INFO mapreduce.Job: map 83% reduce 0%\n", 175 | "18/08/17 14:48:14 INFO mapreduce.Job: map 100% reduce 0%\n", 176 | "18/08/17 14:48:15 INFO mapreduce.Job: map 100% reduce 13%\n", 177 | "18/08/17 14:48:17 INFO mapreduce.Job: map 100% reduce 25%\n", 178 | "18/08/17 14:48:18 INFO mapreduce.Job: map 100% reduce 38%\n", 179 | "18/08/17 14:48:19 INFO mapreduce.Job: map 100% reduce 50%\n", 180 | "18/08/17 14:48:20 INFO mapreduce.Job: map 100% reduce 75%\n", 181 | "18/08/17 14:48:21 INFO mapreduce.Job: map 100% reduce 100%\n", 182 | "18/08/17 14:48:22 INFO mapreduce.Job: Job job_1534512728888_0013 completed successfully\n", 183 | "18/08/17 14:48:22 INFO mapreduce.Job: Counters: 51\n", 184 | "\tFile System Counters\n", 185 | "\t\tFILE: Number of bytes read=20643993\n", 186 | "\t\tFILE: Number of bytes written=32273180\n", 187 | "\t\tFILE: Number of read operations=0\n", 188 | "\t\tFILE: Number of large read operations=0\n", 189 | "\t\tFILE: Number of write operations=0\n", 190 | "\t\tHDFS: Number of bytes read=76874501\n", 191 | "\t\tHDFS: Number of bytes written=6248688\n", 192 | "\t\tHDFS: Number of read operations=30\n", 193 | "\t\tHDFS: Number of large read operations=0\n", 194 | "\t\tHDFS: Number of write operations=16\n", 195 | "\tJob Counters \n", 196 | "\t\tKilled reduce tasks=1\n", 197 | "\t\tLaunched map tasks=2\n", 198 | "\t\tLaunched reduce tasks=8\n", 199 | "\t\tData-local map tasks=2\n", 200 | "\t\tTotal time spent by all maps in occupied slots (ms)=99573\n", 201 | "\t\tTotal time spent by all reduces in occupied slots (ms)=23117\n", 202 | "\t\tTotal time spent by all map tasks (ms)=99573\n", 203 | "\t\tTotal time spent by all reduce tasks (ms)=23117\n", 204 | "\t\tTotal vcore-milliseconds taken by all map tasks=99573\n", 205 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=23117\n", 206 | "\t\tTotal megabyte-milliseconds taken by all map tasks=101962752\n", 207 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=23671808\n", 208 | "\tMap-Reduce Framework\n", 209 | "\t\tMap input records=4100\n", 210 | "\t\tMap output records=11937375\n", 211 | "\t\tMap output bytes=121717186\n", 212 | "\t\tMap output materialized bytes=10291082\n", 213 | "\t\tInput split bytes=228\n", 214 | "\t\tCombine input records=11937375\n", 215 | "\t\tCombine output records=641788\n", 216 | "\t\tReduce input groups=427175\n", 217 | "\t\tReduce shuffle bytes=10291082\n", 218 | 
"\t\tReduce input records=641788\n", 219 | "\t\tReduce output records=427175\n", 220 | "\t\tSpilled Records=1925364\n", 221 | "\t\tShuffled Maps =16\n", 222 | "\t\tFailed Shuffles=0\n", 223 | "\t\tMerged Map outputs=16\n", 224 | "\t\tGC time elapsed (ms)=1074\n", 225 | "\t\tCPU time spent (ms)=168880\n", 226 | "\t\tPhysical memory (bytes) snapshot=2310426624\n", 227 | "\t\tVirtual memory (bytes) snapshot=20201512960\n", 228 | "\t\tTotal committed heap usage (bytes)=1607467008\n", 229 | "\tShuffle Errors\n", 230 | "\t\tBAD_ID=0\n", 231 | "\t\tCONNECTION=0\n", 232 | "\t\tIO_ERROR=0\n", 233 | "\t\tWRONG_LENGTH=0\n", 234 | "\t\tWRONG_MAP=0\n", 235 | "\t\tWRONG_REDUCE=0\n", 236 | "\tWiki stats\n", 237 | "\t\tTotal words=11937375\n", 238 | "\tFile Input Format Counters \n", 239 | "\t\tBytes Read=76874273\n", 240 | "\tFile Output Format Counters \n", 241 | "\t\tBytes Written=6248688\n", 242 | "18/08/17 14:48:22 INFO streaming.StreamJob: Output directory: assignment1_wordcount_1534517230823602\n", 243 | "rm: `assignment1_sorting_1534517303029649': No such file or directory\n", 244 | "18/08/17 14:48:26 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 245 | "18/08/17 14:48:26 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 246 | "18/08/17 14:48:27 INFO mapred.FileInputFormat: Total input files to process : 8\n", 247 | "18/08/17 14:48:27 INFO mapreduce.JobSubmitter: number of splits:8\n", 248 | "18/08/17 14:48:28 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534512728888_0014\n", 249 | "18/08/17 14:48:28 INFO impl.YarnClientImpl: Submitted application application_1534512728888_0014\n", 250 | "18/08/17 14:48:28 INFO mapreduce.Job: The url to track the job: http://d1967598ec2c:8088/proxy/application_1534512728888_0014/\n", 251 | "18/08/17 14:48:28 INFO mapreduce.Job: Running job: job_1534512728888_0014\n", 252 | "18/08/17 14:48:34 INFO mapreduce.Job: Job job_1534512728888_0014 running in uber mode : false\n", 253 | "18/08/17 14:48:34 INFO mapreduce.Job: map 0% reduce 0%\n", 254 | "18/08/17 14:48:39 INFO mapreduce.Job: map 25% reduce 0%\n", 255 | "18/08/17 14:48:40 INFO mapreduce.Job: map 75% reduce 0%\n", 256 | "18/08/17 14:48:42 INFO mapreduce.Job: map 88% reduce 0%\n", 257 | "18/08/17 14:48:43 INFO mapreduce.Job: map 100% reduce 0%\n", 258 | "18/08/17 14:48:45 INFO mapreduce.Job: map 100% reduce 100%\n", 259 | "18/08/17 14:48:46 INFO mapreduce.Job: Job job_1534512728888_0014 completed successfully\n", 260 | "18/08/17 14:48:46 INFO mapreduce.Job: Counters: 49\n", 261 | "\tFile System Counters\n", 262 | "\t\tFILE: Number of bytes read=1779953\n", 263 | "\t\tFILE: Number of bytes written=4820469\n", 264 | "\t\tFILE: Number of read operations=0\n", 265 | "\t\tFILE: Number of large read operations=0\n", 266 | "\t\tFILE: Number of write operations=0\n", 267 | "\t\tHDFS: Number of bytes read=6249768\n", 268 | "\t\tHDFS: Number of bytes written=1509249\n", 269 | "\t\tHDFS: Number of read operations=27\n", 270 | "\t\tHDFS: Number of large read operations=0\n", 271 | "\t\tHDFS: Number of write operations=2\n", 272 | "\tJob Counters \n", 273 | "\t\tLaunched map tasks=8\n", 274 | "\t\tLaunched reduce tasks=1\n", 275 | "\t\tData-local map tasks=8\n", 276 | "\t\tTotal time spent by all maps in occupied slots (ms)=26828\n", 277 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3533\n", 278 | "\t\tTotal time spent by all map tasks (ms)=26828\n", 279 | "\t\tTotal time spent by all reduce tasks (ms)=3533\n", 280 | "\t\tTotal vcore-milliseconds 
taken by all map tasks=26828\n", 281 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3533\n", 282 | "\t\tTotal megabyte-milliseconds taken by all map tasks=27471872\n", 283 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3617792\n", 284 | "\tMap-Reduce Framework\n", 285 | "\t\tMap input records=427175\n", 286 | "\t\tMap output records=135348\n", 287 | "\t\tMap output bytes=1509250\n", 288 | "\t\tMap output materialized bytes=1779995\n", 289 | "\t\tInput split bytes=1080\n", 290 | "\t\tCombine input records=0\n", 291 | "\t\tCombine output records=0\n", 292 | "\t\tReduce input groups=581\n", 293 | "\t\tReduce shuffle bytes=1779995\n", 294 | "\t\tReduce input records=135348\n", 295 | "\t\tReduce output records=135348\n", 296 | "\t\tSpilled Records=270696\n", 297 | "\t\tShuffled Maps =8\n", 298 | "\t\tFailed Shuffles=0\n", 299 | "\t\tMerged Map outputs=8\n", 300 | "\t\tGC time elapsed (ms)=1050\n", 301 | "\t\tCPU time spent (ms)=14090\n", 302 | "\t\tPhysical memory (bytes) snapshot=2572472320\n", 303 | "\t\tVirtual memory (bytes) snapshot=18108641280\n", 304 | "\t\tTotal committed heap usage (bytes)=1756889088\n", 305 | "\tShuffle Errors\n", 306 | "\t\tBAD_ID=0\n", 307 | "\t\tCONNECTION=0\n", 308 | "\t\tIO_ERROR=0\n", 309 | "\t\tWRONG_LENGTH=0\n", 310 | "\t\tWRONG_MAP=0\n", 311 | "\t\tWRONG_REDUCE=0\n", 312 | "\tFile Input Format Counters \n", 313 | "\t\tBytes Read=6248688\n", 314 | "\tFile Output Format Counters \n", 315 | "\t\tBytes Written=1509249\n", 316 | "18/08/17 14:48:46 INFO streaming.StreamJob: Output directory: assignment1_sorting_1534517303029649\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "%%bash\n", 322 | "\n", 323 | "OUT_DIR=\"assignment1_wordcount_\"$(date +\"%s%6N\")\n", 324 | "\n", 325 | "NUM_REDUCERS=8\n", 326 | "\n", 327 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR} > /dev/null\n", 328 | "\n", 329 | "# Code for your first job\n", 330 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 331 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 332 | " -D mapred.jab.name=\"Streaming wordCount\" \\\n", 333 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 334 | " -files mapper1.py,reducer1.py \\\n", 335 | " -mapper \"python mapper1.py\" \\\n", 336 | " -combiner \"python reducer1.py\" \\\n", 337 | " -reducer \"python reducer1.py\" \\\n", 338 | " -input /data/wiki/en_articles_part \\\n", 339 | " -output ${OUT_DIR} > /dev/null\n", 340 | "\n", 341 | "OUT_DIR1=\"assignment1_sorting_\"$(date +\"%s%6N\")\n", 342 | "\n", 343 | "NUM_REDUCERS=1\n", 344 | "\n", 345 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR1} > /dev/null\n", 346 | "\n", 347 | "# Code for your second job\n", 348 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 349 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 350 | " -D mapred.jab.name=\"Streaming sorting\" \\\n", 351 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 352 | " -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \\\n", 353 | " -D mapreduce.partition.keycomparator.options=\"-nr\" \\\n", 354 | " -files mapper2.py,reducer2.py \\\n", 355 | " -mapper \"python mapper2.py\" \\\n", 356 | " -reducer \"python reducer2.py\" \\\n", 357 | " -input ${OUT_DIR} \\\n", 358 | " -output ${OUT_DIR1} > /dev/null\n", 359 | "\n", 360 | "# Code for obtaining the results\n", 361 | "hdfs dfs -cat ${OUT_DIR1}/part-00000 | sed -n 5p\n", 362 | "hdfs dfs 
-rm -r -skipTrash ${OUT_DIR1}* > /dev/null" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 2", 383 | "language": "python", 384 | "name": "python2" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 2 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython2", 396 | "version": "2.7.12" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 2 401 | } 402 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week3 - Solving Problems with MapReduce - Practice/StopWordsTask2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "Xsd0oermS80K" 8 | }, 9 | "source": [ 10 | "# Hadoop Streaming assignment 2: Stop Words" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "V_0wfvKjS80L" 18 | }, 19 | "source": [ 20 | "The purpose of this task is to improve the previous \"Word rating\" program. You have to calculate how many stop words are there in the input dataset. Stop words list is in `/datasets/stop_words_en.txt` file. \n", 21 | "\n", 22 | "Use Hadoop counters to compute the number of stop words and total words in the dataset. The result is the percentage of stop words in the entire dataset (without percent symbol).\n", 23 | "\n", 24 | "There are several points for this task:\n", 25 | "\n", 26 | "1) As an output, you have to get the percentage of stop words in the entire dataset without percent symbol (correct answer on sample dataset is `41.603`).\n", 27 | "\n", 28 | "2) As you can see in the Hadoop Streaming userguide \"you will need to use `-files` option to tell the framework to pack your executable files as a part of a job submission.\"\n", 29 | "\n", 30 | "3) Do not forget to redirect junk output to `/dev/null`.\n", 31 | "\n", 32 | "4) You may modify mappers/reducers from \"Word rating\" task and parse its output to get the answer on \"Stop Words\" task.\n", 33 | "\n", 34 | "5) You may use mapper/reducer to get `\"Stop Words\"` and `\"Total Words\"` amounts and redirect them to sys.stderr. After that you may redirect the output of MapReduce to the parsed function. In this function you may find rows correspond to these amounts and compute the percentage.\n", 35 | "\n", 36 | "Here you can find the draft for the main steps of the task. You can use other methods to get the solution." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "colab_type": "text", 43 | "id": "EpO7whLNS80M" 44 | }, 45 | "source": [ 46 | "## Step 1. Create the mapper.\n", 47 | "\n", 48 | "Hint: Create the mapper, which calculates Total word and Stop word amounts. You may redirect this information to sys.stderr. 
This will make it possible to parse these data on the next steps.\n", 49 | "\n", 50 | "Example of the redirections:\n", 51 | "\n", 52 | "`print >> sys.stderr, \"reporter:counter:Wiki stats,Total words,%d\" % count`\n", 53 | "\n", 54 | "Remember about the Distributed cache. If we add option `-files mapper.py,reducer.py,/datasets/stop_words_en.txt`, then `mapper.py, reducer.py` and `stop_words_en.txt` file will be in the same directory on the datanodes. Hence, it is necessary to use a relative path `stop_words_en.txt` from the mapper to access this txt file." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 39, 60 | "metadata": { 61 | "colab": { 62 | "autoexec": { 63 | "startup": false, 64 | "wait_interval": 0 65 | } 66 | }, 67 | "colab_type": "code", 68 | "id": "KU56zkC0S80M" 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Overwriting mapper.py\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "%%writefile mapper.py\n", 81 | "\n", 82 | "import sys\n", 83 | "import re\n", 84 | "\n", 85 | "\n", 86 | "reload(sys)\n", 87 | "sys.setdefaultencoding('utf-8') # required to convert to unicode\n", 88 | "\n", 89 | "path = 'stop_words_en.txt'\n", 90 | "def read_stop_words(file_path):\n", 91 | " return set(word.strip().lower() for word in open(file_path))\n", 92 | "stop_words = read_stop_words(path)\n", 93 | "\n", 94 | "\n", 95 | "# Your code for reading stop words here\n", 96 | "for line in sys.stdin:\n", 97 | " try:\n", 98 | " article_id, text = unicode(line.strip()).split('\\t', 1)\n", 99 | " except ValueError as e:\n", 100 | " continue\n", 101 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 102 | " for word in words:\n", 103 | " print >> sys.stderr, \"reporter:counter:Wiki stats,Total words,%d\" % 1\n", 104 | " if word.lower() in stop_words:\n", 105 | " print >> sys.stderr, \"reporter:counter:Wiki stats,Stop words,%d\" % 1\n", 106 | " print \"%s\\t%d\" % (word, 1)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "colab_type": "text", 113 | "id": "-3EM-tBES80O" 114 | }, 115 | "source": [ 116 | "## Step 2. Create the reducer.\n", 117 | "\n", 118 | "Create the reducer, which will accumulate the information after the mapper step. You may implement the combiner if you want. It can be useful from optimizing and speed up your computations (see the lectures from the Week 2 for more details)." 
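The reducers in these notebooks keep only the current key and a running sum, which works because Hadoop Streaming delivers reducer input sorted by key, so equal keys are always adjacent. A minimal local sketch of that contract in plain Python, where the sample pairs are made up and itertools.groupby stands in for the shuffle/sort phase:

    # Equal keys must be adjacent before a per-key running sum can work,
    # exactly as Hadoop guarantees for reducer input.
    from itertools import groupby

    mapped = [("the", 1), ("cat", 1), ("the", 1), ("sat", 1), ("the", 1)]  # hypothetical mapper output

    shuffled = sorted(mapped, key=lambda kv: kv[0])   # stand-in for the shuffle/sort phase

    for key, group in groupby(shuffled, key=lambda kv: kv[0]):
        total = sum(count for _, count in group)      # same accumulation the reducer does line by line
        print("%s\t%d" % (key, total))

The same property is what makes it safe to reuse the reducer as a combiner here: partial per-key sums can be merged again by the final reduce without changing the result.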
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 3, 124 | "metadata": { 125 | "colab": { 126 | "autoexec": { 127 | "startup": false, 128 | "wait_interval": 0 129 | } 130 | }, 131 | "colab_type": "code", 132 | "id": "yslvpwpfS80P" 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "Overwriting reducer.py\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "%%writefile reducer.py\n", 145 | "\n", 146 | "# Your code for reducer here.\n", 147 | "import sys\n", 148 | "\n", 149 | "current_key = None\n", 150 | "word_sum = 0\n", 151 | "\n", 152 | "for line in sys.stdin:\n", 153 | " try:\n", 154 | " key, count = line.strip().split('\\t', 1)\n", 155 | " count = int(count)\n", 156 | " except ValueError as e:\n", 157 | " continue\n", 158 | " if current_key != key:\n", 159 | " if current_key:\n", 160 | " print \"%s\\t%d\" % (current_key, word_sum)\n", 161 | " word_sum = 0\n", 162 | " current_key = key\n", 163 | " word_sum += count\n", 164 | "\n", 165 | "if current_key:\n", 166 | " print \"%s\\t%d\" % (current_key, word_sum)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "colab_type": "text", 173 | "id": "MZQKrDecS80Q" 174 | }, 175 | "source": [ 176 | "## Step 3. Create the parsed function.\n", 177 | "\n", 178 | "Hint: Create the function, which will parse MapReduce sys.stderr for Total word and Stop word amounts.\n", 179 | "\n", 180 | "The `./counter_process.py` script should do the following:\n", 181 | "\n", 182 | "- parse hadoop logs from Stderr,\n", 183 | "\n", 184 | "- retrieve values of 2 user-defined counters,\n", 185 | "\n", 186 | "- compute percentage and output it into the stdout." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 36, 192 | "metadata": { 193 | "colab": { 194 | "autoexec": { 195 | "startup": false, 196 | "wait_interval": 0 197 | } 198 | }, 199 | "colab_type": "code", 200 | "id": "XptufhbMS80R" 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "Overwriting counter_process.py\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "%%writefile counter_process.py\n", 213 | "\n", 214 | "import sys\n", 215 | "import re\n", 216 | "\n", 217 | "# Your functions may be here.\n", 218 | "def ratioStopTotal(string1, string2):\n", 219 | " for line in sys.stdin:\n", 220 | " if string1 in line:\n", 221 | " flag_stop = int(re.findall('\\d+', line)[0])\n", 222 | " if string2 in line:\n", 223 | " flag_nonstop = int(re.findall('\\d+', line)[0])\n", 224 | " \n", 225 | " return float(flag_stop*100)/flag_nonstop\n", 226 | "\n", 227 | "\n", 228 | "if __name__ == '__main__':\n", 229 | " # Your code here.\n", 230 | " print(ratioStopTotal(sys.argv[1], sys.argv[2]))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "colab_type": "text", 237 | "id": "O_7XrxbKS80S" 238 | }, 239 | "source": [ 240 | "## Step 4. Bash commands\n", 241 | "\n", 242 | " Hints: \n", 243 | "\n", 244 | "1) If you want to redirect standard output to txt file you may use the following argument in yarn jar:\n", 245 | "\n", 246 | "```\n", 247 | "yarn ... \\\n", 248 | " ... \\\n", 249 | " -output ${OUT_DIR} > /dev/null 2> $LOGS\n", 250 | "```\n", 251 | "\n", 252 | "2) For printing the percentage of stop words in the entire dataset you may parse the MapReduce output. Parsed script may be written in Python code. \n", 253 | "\n", 254 | "To get the result you may use the UNIX pipe operator `|`. 
The output of the first command acts as an input to the second command (see lecture file-content-exploration-2 for more details).\n", 255 | "\n", 256 | "With this operator you may use command `cat` to redirect the output of MapReduce to ./counter_process.py with arguments, which correspond to the `\"Stop words\"` and `\"Total words\"` counters. Example is the following:\n", 257 | "\n", 258 | "`cat $LOGS | python ./counter_process.py \"Stop words\" \"Total words\"`\n", 259 | "\n", 260 | "Now something about Hadoop counters naming. \n", 261 | " - Built-in Hadoop counters usually have UPPER_CASE names. To make the grading system possible to distinguish your custom counters and system ones please use the following pattern for their naming: `[Aa]aaa...` (all except the first letters should be in lowercase);\n", 262 | " - Another points is how Hadoop sorts the counters. It sorts them lexicographically. Grading system reads your first counter as Stop words counter and the second as Total words. Please name you counters in such way that Hadoop set the Stop words counter before the Total words. \n", 263 | " \n", 264 | "E.g. \"Stop words\" and \"Total words\" names are Ok because they correspond both requirements.\n", 265 | "\n", 266 | "3) In Python code sys.argv is a list, which contains the command-line arguments passed to the script. The name of the script is in `sys.argv[0]`. Other arguments begin from `sys.argv[1]`.\n", 267 | "\n", 268 | "Hence, if you have two arguments, which you send from the Bash to your python script, you may use arguments in your script with the following command:\n", 269 | "\n", 270 | "`function(sys.argv[1], sys.argv[2])`\n", 271 | "\n", 272 | "4) Do not forget about printing your MapReduce output in the last cell. You may use the next command:\n", 273 | "\n", 274 | "`cat $LOGS >&2`" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 37, 280 | "metadata": { 281 | "colab": { 282 | "autoexec": { 283 | "startup": false, 284 | "wait_interval": 0 285 | } 286 | }, 287 | "colab_type": "code", 288 | "id": "bG3omEqzS80S" 289 | }, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "41.60262201698447\n" 296 | ] 297 | }, 298 | { 299 | "name": "stderr", 300 | "output_type": "stream", 301 | "text": [ 302 | "18/08/17 10:23:53 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 303 | "18/08/17 10:23:53 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 304 | "18/08/17 10:23:54 INFO mapred.FileInputFormat: Total input files to process : 1\n", 305 | "18/08/17 10:23:54 INFO mapreduce.JobSubmitter: number of splits:2\n", 306 | "18/08/17 10:23:54 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534472873650_0014\n", 307 | "18/08/17 10:23:54 INFO impl.YarnClientImpl: Submitted application application_1534472873650_0014\n", 308 | "18/08/17 10:23:54 INFO mapreduce.Job: The url to track the job: http://d1c261698f21:8088/proxy/application_1534472873650_0014/\n", 309 | "18/08/17 10:23:54 INFO mapreduce.Job: Running job: job_1534472873650_0014\n", 310 | "18/08/17 10:24:00 INFO mapreduce.Job: Job job_1534472873650_0014 running in uber mode : false\n", 311 | "18/08/17 10:24:00 INFO mapreduce.Job: map 0% reduce 0%\n", 312 | "18/08/17 10:24:16 INFO mapreduce.Job: map 22% reduce 0%\n", 313 | "18/08/17 10:24:23 INFO mapreduce.Job: map 33% reduce 0%\n", 314 | "18/08/17 10:24:29 INFO mapreduce.Job: map 42% reduce 0%\n", 315 | "18/08/17 10:24:35 INFO mapreduce.Job: map 46% 
reduce 0%\n", 316 | "18/08/17 10:24:41 INFO mapreduce.Job: map 55% reduce 0%\n", 317 | "18/08/17 10:24:47 INFO mapreduce.Job: map 65% reduce 0%\n", 318 | "18/08/17 10:24:52 INFO mapreduce.Job: map 82% reduce 0%\n", 319 | "18/08/17 10:24:53 INFO mapreduce.Job: map 83% reduce 0%\n", 320 | "18/08/17 10:24:56 INFO mapreduce.Job: map 100% reduce 0%\n", 321 | "18/08/17 10:24:58 INFO mapreduce.Job: map 100% reduce 13%\n", 322 | "18/08/17 10:24:59 INFO mapreduce.Job: map 100% reduce 25%\n", 323 | "18/08/17 10:25:01 INFO mapreduce.Job: map 100% reduce 50%\n", 324 | "18/08/17 10:25:02 INFO mapreduce.Job: map 100% reduce 75%\n", 325 | "18/08/17 10:25:03 INFO mapreduce.Job: map 100% reduce 100%\n", 326 | "18/08/17 10:25:04 INFO mapreduce.Job: Job job_1534472873650_0014 completed successfully\n", 327 | "18/08/17 10:25:04 INFO mapreduce.Job: Counters: 52\n", 328 | "\tFile System Counters\n", 329 | "\t\tFILE: Number of bytes read=19935013\n", 330 | "\t\tFILE: Number of bytes written=31232187\n", 331 | "\t\tFILE: Number of read operations=0\n", 332 | "\t\tFILE: Number of large read operations=0\n", 333 | "\t\tFILE: Number of write operations=0\n", 334 | "\t\tHDFS: Number of bytes read=76874501\n", 335 | "\t\tHDFS: Number of bytes written=5884946\n", 336 | "\t\tHDFS: Number of read operations=30\n", 337 | "\t\tHDFS: Number of large read operations=0\n", 338 | "\t\tHDFS: Number of write operations=16\n", 339 | "\tJob Counters \n", 340 | "\t\tKilled reduce tasks=1\n", 341 | "\t\tLaunched map tasks=2\n", 342 | "\t\tLaunched reduce tasks=8\n", 343 | "\t\tData-local map tasks=2\n", 344 | "\t\tTotal time spent by all maps in occupied slots (ms)=102170\n", 345 | "\t\tTotal time spent by all reduces in occupied slots (ms)=28355\n", 346 | "\t\tTotal time spent by all map tasks (ms)=102170\n", 347 | "\t\tTotal time spent by all reduce tasks (ms)=28355\n", 348 | "\t\tTotal vcore-milliseconds taken by all map tasks=102170\n", 349 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=28355\n", 350 | "\t\tTotal megabyte-milliseconds taken by all map tasks=104622080\n", 351 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=29035520\n", 352 | "\tMap-Reduce Framework\n", 353 | "\t\tMap input records=4100\n", 354 | "\t\tMap output records=11937375\n", 355 | "\t\tMap output bytes=97842522\n", 356 | "\t\tMap output materialized bytes=9943111\n", 357 | "\t\tInput split bytes=228\n", 358 | "\t\tCombine input records=11937375\n", 359 | "\t\tCombine output records=718673\n", 360 | "\t\tReduce input groups=474354\n", 361 | "\t\tReduce shuffle bytes=9943111\n", 362 | "\t\tReduce input records=718673\n", 363 | "\t\tReduce output records=474354\n", 364 | "\t\tSpilled Records=2156019\n", 365 | "\t\tShuffled Maps =16\n", 366 | "\t\tFailed Shuffles=0\n", 367 | "\t\tMerged Map outputs=16\n", 368 | "\t\tGC time elapsed (ms)=1721\n", 369 | "\t\tCPU time spent (ms)=189620\n", 370 | "\t\tPhysical memory (bytes) snapshot=2352189440\n", 371 | "\t\tVirtual memory (bytes) snapshot=20214956032\n", 372 | "\t\tTotal committed heap usage (bytes)=1605369856\n", 373 | "\tShuffle Errors\n", 374 | "\t\tBAD_ID=0\n", 375 | "\t\tCONNECTION=0\n", 376 | "\t\tIO_ERROR=0\n", 377 | "\t\tWRONG_LENGTH=0\n", 378 | "\t\tWRONG_MAP=0\n", 379 | "\t\tWRONG_REDUCE=0\n", 380 | "\tWiki stats\n", 381 | "\t\tTotal words=11937375\n", 382 | "\tWiki_stats\n", 383 | "\t\tStop words=4966261\n", 384 | "\tFile Input Format Counters \n", 385 | "\t\tBytes Read=76874273\n", 386 | "\tFile Output Format Counters \n", 387 | "\t\tBytes Written=5884946\n", 388 | "18/08/17 
10:25:04 INFO streaming.StreamJob: Output directory: coursera_mr_task21534501431810293\n" 389 | ] 390 | } 391 | ], 392 | "source": [ 393 | "%%bash\n", 394 | "\n", 395 | "OUT_DIR=\"coursera_mr_task2\"$(date +\"%s%6N\")\n", 396 | "NUM_REDUCERS=8\n", 397 | "LOGS=\"stderr_logs.txt\"\n", 398 | "\n", 399 | "# Stub code for your job\n", 400 | "\n", 401 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 402 | "# ... \\\n", 403 | "# -output ${OUT_DIR} > /dev/null 2> $LOGS\n", 404 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 405 | " -D mapred.jab.name=\"Streaming wordCount\" \\\n", 406 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 407 | " -files mapper.py,reducer.py,/datasets/stop_words_en.txt \\\n", 408 | " -mapper \"python mapper.py\" \\\n", 409 | " -combiner \"python reducer.py\" \\\n", 410 | " -reducer \"python reducer.py\" \\\n", 411 | " -input /data/wiki/en_articles_part \\\n", 412 | " -output ${OUT_DIR} > /dev/null 2> $LOGS\n", 413 | " \n", 414 | "cat $LOGS | python ./counter_process.py \"Stop words\" \"Total words\"\n", 415 | "cat $LOGS >&2\n", 416 | "\n", 417 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR} > /dev/null\n" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [] 426 | } 427 | ], 428 | "metadata": { 429 | "colab": { 430 | "collapsed_sections": [], 431 | "default_view": {}, 432 | "name": "702_to_students.ipynb", 433 | "provenance": [], 434 | "version": "0.3.2", 435 | "views": {} 436 | }, 437 | "kernelspec": { 438 | "display_name": "Python 3", 439 | "language": "python", 440 | "name": "python3" 441 | }, 442 | "language_info": { 443 | "codemirror_mode": { 444 | "name": "ipython", 445 | "version": 3 446 | }, 447 | "file_extension": ".py", 448 | "mimetype": "text/x-python", 449 | "name": "python", 450 | "nbconvert_exporter": "python", 451 | "pygments_lexer": "ipython3", 452 | "version": "3.6.1" 453 | }, 454 | "toc": { 455 | "base_numbering": 1, 456 | "nav_menu": {}, 457 | "number_sections": true, 458 | "sideBar": true, 459 | "skip_h1_title": false, 460 | "title_cell": "Table of Contents", 461 | "title_sidebar": "Contents", 462 | "toc_cell": false, 463 | "toc_position": {}, 464 | "toc_section_display": true, 465 | "toc_window_display": false 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 1 470 | } 471 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week3 - Solving Problems with MapReduce - Practice/WordGroup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Writing mapper1.py\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%writefile mapper1.py\n", 18 | "\n", 19 | "# Your code for mapper here.\n", 20 | "import sys\n", 21 | "import re\n", 22 | "\n", 23 | "reload(sys)\n", 24 | "sys.setdefaultencoding('utf-8') # required to convert to unicode\n", 25 | "\n", 26 | "path = 'stop_words_en.txt'\n", 27 | "def read_stop_words(file_path):\n", 28 | " return set(word.strip().lower() for word in open(file_path))\n", 29 | "stop_words = read_stop_words(path)\n", 30 | "\n", 31 | "for line in sys.stdin:\n", 32 | " try:\n", 33 | " article_id, text = unicode(line.strip()).split('\\t', 1)\n", 34 | " except ValueError as e:\n", 35 | " 
continue\n", 36 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 37 | " for word in words:\n", 38 | " if word.lower() not in stop_words:\n", 39 | " print >> sys.stderr, \"reporter:counter:Wiki stats,Total words,%d\" % 1\n", 40 | " print \"%s\\t%d\" % (word.lower(), 1)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "Writing reducer1.py\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "%%writefile reducer1.py\n", 58 | "\n", 59 | "# Your code for reducer here.\n", 60 | "import sys\n", 61 | "\n", 62 | "current_key = None\n", 63 | "word_sum = 0\n", 64 | "\n", 65 | "for line in sys.stdin:\n", 66 | " try:\n", 67 | " key, count = line.strip().split('\\t', 1)\n", 68 | " count = int(count)\n", 69 | " except ValueError as e:\n", 70 | " continue\n", 71 | " if current_key != key:\n", 72 | " if current_key:\n", 73 | " print \"%s\\t%d\" % (current_key, word_sum)\n", 74 | " word_sum = 0\n", 75 | " current_key = key\n", 76 | " word_sum += count\n", 77 | "\n", 78 | "if current_key:\n", 79 | " print \"%s\\t%d\" % (current_key, word_sum)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "Writing mapper2.py\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "%%writefile mapper2.py\n", 97 | "\n", 98 | "import sys\n", 99 | "reload(sys)\n", 100 | "\n", 101 | "for line in sys.stdin:\n", 102 | " try:\n", 103 | " word, count = line.strip().split('\\t', 1)\n", 104 | " count = int(count)\n", 105 | " if word.isalpha():\n", 106 | " word_permuted = ''.join(sorted(word))\n", 107 | " print \"%s\\t%s\\t%d\" % (word_permuted, word, count)\n", 108 | " except ValueError as e:\n", 109 | " continue" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Writing reducer2.py\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "%%writefile reducer2.py\n", 127 | "\n", 128 | "# Your code for reducer here.\n", 129 | "import sys\n", 130 | "\n", 131 | "current_key = None\n", 132 | "permuted_sum = 0\n", 133 | "word_group = []\n", 134 | "\n", 135 | "for line in sys.stdin:\n", 136 | " try:\n", 137 | " key, word, count = line.strip().split('\\t', 2)\n", 138 | " count = int(count)\n", 139 | " except ValueError as e:\n", 140 | " continue\n", 141 | " if current_key != key:\n", 142 | " if current_key:\n", 143 | " if len(word_group) > 1:\n", 144 | " word_string = ','.join(sorted(word_group))\n", 145 | " print \"%d\\t%d\\t%s\" % (permuted_sum, len(word_group), word_string)\n", 146 | " current_key = key\n", 147 | " permuted_sum = 0\n", 148 | " word_group = []\n", 149 | " permuted_sum += count\n", 150 | " word_group.append(word)\n", 151 | "\n", 152 | "if current_key:\n", 153 | " if len(word_group) > 1:\n", 154 | " word_string = ','.join(sorted(word_group))\n", 155 | " print \"%d\\t%d\\t%s\" % (permuted_sum, len(word_group), word_string)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "7820\t5\tenglish,helsing,hesling,shengli,shingle\n" 168 | ] 169 | }, 170 | { 171 | "name": "stderr", 172 | "output_type": "stream", 173 | "text": [ 174 | "rm: 
`wordgroup_wordcount_1534566984903939': No such file or directory\n", 175 | "18/08/18 04:36:28 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 176 | "18/08/18 04:36:28 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 177 | "18/08/18 04:36:30 INFO mapred.FileInputFormat: Total input files to process : 1\n", 178 | "18/08/18 04:36:30 INFO mapreduce.JobSubmitter: number of splits:2\n", 179 | "18/08/18 04:36:30 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534520873842_0001\n", 180 | "18/08/18 04:36:31 INFO impl.YarnClientImpl: Submitted application application_1534520873842_0001\n", 181 | "18/08/18 04:36:31 INFO mapreduce.Job: The url to track the job: http://2ce61352a0f0:8088/proxy/application_1534520873842_0001/\n", 182 | "18/08/18 04:36:31 INFO mapreduce.Job: Running job: job_1534520873842_0001\n", 183 | "18/08/18 04:36:38 INFO mapreduce.Job: Job job_1534520873842_0001 running in uber mode : false\n", 184 | "18/08/18 04:36:38 INFO mapreduce.Job: map 0% reduce 0%\n", 185 | "18/08/18 04:36:54 INFO mapreduce.Job: map 38% reduce 0%\n", 186 | "18/08/18 04:37:00 INFO mapreduce.Job: map 58% reduce 0%\n", 187 | "18/08/18 04:37:06 INFO mapreduce.Job: map 67% reduce 0%\n", 188 | "18/08/18 04:37:09 INFO mapreduce.Job: map 83% reduce 0%\n", 189 | "18/08/18 04:37:11 INFO mapreduce.Job: map 100% reduce 0%\n", 190 | "18/08/18 04:37:14 INFO mapreduce.Job: map 100% reduce 13%\n", 191 | "18/08/18 04:37:15 INFO mapreduce.Job: map 100% reduce 25%\n", 192 | "18/08/18 04:37:16 INFO mapreduce.Job: map 100% reduce 38%\n", 193 | "18/08/18 04:37:17 INFO mapreduce.Job: map 100% reduce 50%\n", 194 | "18/08/18 04:37:18 INFO mapreduce.Job: map 100% reduce 75%\n", 195 | "18/08/18 04:37:19 INFO mapreduce.Job: map 100% reduce 88%\n", 196 | "18/08/18 04:37:20 INFO mapreduce.Job: map 100% reduce 100%\n", 197 | "18/08/18 04:37:20 INFO mapreduce.Job: Job job_1534520873842_0001 completed successfully\n", 198 | "18/08/18 04:37:20 INFO mapreduce.Job: Counters: 51\n", 199 | "\tFile System Counters\n", 200 | "\t\tFILE: Number of bytes read=16340085\n", 201 | "\t\tFILE: Number of bytes written=25820726\n", 202 | "\t\tFILE: Number of read operations=0\n", 203 | "\t\tFILE: Number of large read operations=0\n", 204 | "\t\tFILE: Number of write operations=0\n", 205 | "\t\tHDFS: Number of bytes read=76874501\n", 206 | "\t\tHDFS: Number of bytes written=5367177\n", 207 | "\t\tHDFS: Number of read operations=30\n", 208 | "\t\tHDFS: Number of large read operations=0\n", 209 | "\t\tHDFS: Number of write operations=16\n", 210 | "\tJob Counters \n", 211 | "\t\tKilled reduce tasks=1\n", 212 | "\t\tLaunched map tasks=2\n", 213 | "\t\tLaunched reduce tasks=8\n", 214 | "\t\tData-local map tasks=2\n", 215 | "\t\tTotal time spent by all maps in occupied slots (ms)=60117\n", 216 | "\t\tTotal time spent by all reduces in occupied slots (ms)=20628\n", 217 | "\t\tTotal time spent by all map tasks (ms)=60117\n", 218 | "\t\tTotal time spent by all reduce tasks (ms)=20628\n", 219 | "\t\tTotal vcore-milliseconds taken by all map tasks=60117\n", 220 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=20628\n", 221 | "\t\tTotal megabyte-milliseconds taken by all map tasks=61559808\n", 222 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=21123072\n", 223 | "\tMap-Reduce Framework\n", 224 | "\t\tMap input records=4100\n", 225 | "\t\tMap output records=6971114\n", 226 | "\t\tMap output bytes=68112723\n", 227 | "\t\tMap output materialized bytes=8139244\n", 228 | "\t\tInput split 
bytes=228\n", 229 | "\t\tCombine input records=6971114\n", 230 | "\t\tCombine output records=576488\n", 231 | "\t\tReduce input groups=426864\n", 232 | "\t\tReduce shuffle bytes=8139244\n", 233 | "\t\tReduce input records=576488\n", 234 | "\t\tReduce output records=426864\n", 235 | "\t\tSpilled Records=1729464\n", 236 | "\t\tShuffled Maps =16\n", 237 | "\t\tFailed Shuffles=0\n", 238 | "\t\tMerged Map outputs=16\n", 239 | "\t\tGC time elapsed (ms)=913\n", 240 | "\t\tCPU time spent (ms)=105250\n", 241 | "\t\tPhysical memory (bytes) snapshot=2338676736\n", 242 | "\t\tVirtual memory (bytes) snapshot=20216901632\n", 243 | "\t\tTotal committed heap usage (bytes)=1606418432\n", 244 | "\tShuffle Errors\n", 245 | "\t\tBAD_ID=0\n", 246 | "\t\tCONNECTION=0\n", 247 | "\t\tIO_ERROR=0\n", 248 | "\t\tWRONG_LENGTH=0\n", 249 | "\t\tWRONG_MAP=0\n", 250 | "\t\tWRONG_REDUCE=0\n", 251 | "\tWiki stats\n", 252 | "\t\tTotal words=6971114\n", 253 | "\tFile Input Format Counters \n", 254 | "\t\tBytes Read=76874273\n", 255 | "\tFile Output Format Counters \n", 256 | "\t\tBytes Written=5367177\n", 257 | "18/08/18 04:37:20 INFO streaming.StreamJob: Output directory: wordgroup_wordcount_1534566984903939\n", 258 | "rm: `wordgroup_permutation_1534567040795948': No such file or directory\n", 259 | "18/08/18 04:37:24 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 260 | "18/08/18 04:37:24 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 261 | "18/08/18 04:37:24 INFO mapred.FileInputFormat: Total input files to process : 8\n", 262 | "18/08/18 04:37:25 INFO mapreduce.JobSubmitter: number of splits:8\n", 263 | "18/08/18 04:37:25 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534520873842_0002\n", 264 | "18/08/18 04:37:25 INFO impl.YarnClientImpl: Submitted application application_1534520873842_0002\n", 265 | "18/08/18 04:37:25 INFO mapreduce.Job: The url to track the job: http://2ce61352a0f0:8088/proxy/application_1534520873842_0002/\n", 266 | "18/08/18 04:37:25 INFO mapreduce.Job: Running job: job_1534520873842_0002\n", 267 | "18/08/18 04:37:32 INFO mapreduce.Job: Job job_1534520873842_0002 running in uber mode : false\n", 268 | "18/08/18 04:37:32 INFO mapreduce.Job: map 0% reduce 0%\n", 269 | "18/08/18 04:37:36 INFO mapreduce.Job: map 13% reduce 0%\n", 270 | "18/08/18 04:37:37 INFO mapreduce.Job: map 75% reduce 0%\n", 271 | "18/08/18 04:37:40 INFO mapreduce.Job: map 100% reduce 0%\n", 272 | "18/08/18 04:37:42 INFO mapreduce.Job: map 100% reduce 100%\n", 273 | "18/08/18 04:37:43 INFO mapreduce.Job: Job job_1534520873842_0002 completed successfully\n", 274 | "18/08/18 04:37:43 INFO mapreduce.Job: Counters: 49\n", 275 | "\tFile System Counters\n", 276 | "\t\tFILE: Number of bytes read=5141933\n", 277 | "\t\tFILE: Number of bytes written=11541144\n", 278 | "\t\tFILE: Number of read operations=0\n", 279 | "\t\tFILE: Number of large read operations=0\n", 280 | "\t\tFILE: Number of write operations=0\n", 281 | "\t\tHDFS: Number of bytes read=5368241\n", 282 | "\t\tHDFS: Number of bytes written=508018\n", 283 | "\t\tHDFS: Number of read operations=27\n", 284 | "\t\tHDFS: Number of large read operations=0\n", 285 | "\t\tHDFS: Number of write operations=2\n", 286 | "\tJob Counters \n", 287 | "\t\tLaunched map tasks=8\n", 288 | "\t\tLaunched reduce tasks=1\n", 289 | "\t\tData-local map tasks=8\n", 290 | "\t\tTotal time spent by all maps in occupied slots (ms)=22684\n", 291 | "\t\tTotal time spent by all reduces in occupied slots (ms)=2975\n", 292 | "\t\tTotal time spent by 
all map tasks (ms)=22684\n", 293 | "\t\tTotal time spent by all reduce tasks (ms)=2975\n", 294 | "\t\tTotal vcore-milliseconds taken by all map tasks=22684\n", 295 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=2975\n", 296 | "\t\tTotal megabyte-milliseconds taken by all map tasks=23228416\n", 297 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3046400\n", 298 | "\tMap-Reduce Framework\n", 299 | "\t\tMap input records=426864\n", 300 | "\t\tMap output records=226590\n", 301 | "\t\tMap output bytes=4688747\n", 302 | "\t\tMap output materialized bytes=5141975\n", 303 | "\t\tInput split bytes=1064\n", 304 | "\t\tCombine input records=0\n", 305 | "\t\tCombine output records=0\n", 306 | "\t\tReduce input groups=188979\n", 307 | "\t\tReduce shuffle bytes=5141975\n", 308 | "\t\tReduce input records=226590\n", 309 | "\t\tReduce output records=22359\n", 310 | "\t\tSpilled Records=453180\n", 311 | "\t\tShuffled Maps =8\n", 312 | "\t\tFailed Shuffles=0\n", 313 | "\t\tMerged Map outputs=8\n", 314 | "\t\tGC time elapsed (ms)=837\n", 315 | "\t\tCPU time spent (ms)=12790\n", 316 | "\t\tPhysical memory (bytes) snapshot=2555068416\n", 317 | "\t\tVirtual memory (bytes) snapshot=18100936704\n", 318 | "\t\tTotal committed heap usage (bytes)=1757413376\n", 319 | "\tShuffle Errors\n", 320 | "\t\tBAD_ID=0\n", 321 | "\t\tCONNECTION=0\n", 322 | "\t\tIO_ERROR=0\n", 323 | "\t\tWRONG_LENGTH=0\n", 324 | "\t\tWRONG_MAP=0\n", 325 | "\t\tWRONG_REDUCE=0\n", 326 | "\tFile Input Format Counters \n", 327 | "\t\tBytes Read=5367177\n", 328 | "\tFile Output Format Counters \n", 329 | "\t\tBytes Written=508018\n", 330 | "18/08/18 04:37:43 INFO streaming.StreamJob: Output directory: wordgroup_permutation_1534567040795948\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "%%bash\n", 336 | "\n", 337 | "OUT_DIR=\"wordgroup_wordcount_\"$(date +\"%s%6N\")\n", 338 | "\n", 339 | "NUM_REDUCERS=8\n", 340 | "\n", 341 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR} > /dev/null\n", 342 | "\n", 343 | "# Code for your first job\n", 344 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 345 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 346 | " -D mapred.jab.name=\"Streaming wordCount\" \\\n", 347 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 348 | " -files mapper1.py,reducer1.py,/datasets/stop_words_en.txt \\\n", 349 | " -mapper \"python mapper1.py\" \\\n", 350 | " -combiner \"python reducer1.py\" \\\n", 351 | " -reducer \"python reducer1.py\" \\\n", 352 | " -input /data/wiki/en_articles_part \\\n", 353 | " -output ${OUT_DIR} > /dev/null\n", 354 | "\n", 355 | "OUT_DIR1=\"wordgroup_permutation_\"$(date +\"%s%6N\")\n", 356 | "\n", 357 | "NUM_REDUCERS=1\n", 358 | "\n", 359 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR1} > /dev/null\n", 360 | "\n", 361 | "# Code for your second job\n", 362 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 363 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 364 | " -D mapred.jab.name=\"Streaming permutations\" \\\n", 365 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 366 | " -files mapper2.py,reducer2.py \\\n", 367 | " -mapper \"python mapper2.py\" \\\n", 368 | " -reducer \"python reducer2.py\" \\\n", 369 | " -input ${OUT_DIR} \\\n", 370 | " -output ${OUT_DIR1} > /dev/null\n", 371 | "\n", 372 | "# Code for obtaining the results\n", 373 | "hdfs dfs -cat ${OUT_DIR1}/part-00000 | grep -P '(,|\\t)english($|,)'\n", 374 | "hdfs dfs -rm -r 
-skipTrash ${OUT_DIR1}* > /dev/null" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [] 383 | } 384 | ], 385 | "metadata": { 386 | "kernelspec": { 387 | "display_name": "Python 2", 388 | "language": "python", 389 | "name": "python2" 390 | }, 391 | "language_info": { 392 | "codemirror_mode": { 393 | "name": "ipython", 394 | "version": 2 395 | }, 396 | "file_extension": ".py", 397 | "mimetype": "text/x-python", 398 | "name": "python", 399 | "nbconvert_exporter": "python", 400 | "pygments_lexer": "ipython2", 401 | "version": "2.7.12" 402 | } 403 | }, 404 | "nbformat": 4, 405 | "nbformat_minor": 2 406 | } 407 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week3 - Solving Problems with MapReduce - Practice/WordsRatingTask1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "Ebk-YqgqEOw1" 8 | }, 9 | "source": [ 10 | "# Hadoop Streaming assignment 1: Words Rating" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "colab_type": "text", 17 | "id": "XDLjl7hGEOw2" 18 | }, 19 | "source": [ 20 | "The purpose of this task is to create your own WordCount program for Wikipedia dump processing and learn basic concepts of the MapReduce.\n", 21 | "\n", 22 | "In this task you have to find the 7th word by popularity and its quantity in the reverse order (most popular first) in Wikipedia data (`/data/wiki/en_articles_part`).\n", 23 | "\n", 24 | "There are several points for this task:\n", 25 | "\n", 26 | "1) As an output, you have to get the 7th word and its quantity separated by a tab character.\n", 27 | "\n", 28 | "2) You must use the second job to obtain a totally ordered result.\n", 29 | "\n", 30 | "3) Do not forget to redirect all trash and output to /dev/null.\n", 31 | "\n", 32 | "Here you can find the draft of the task main steps. You can use other methods for solution obtaining." 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "colab_type": "text", 39 | "id": "lIUikVIDEOw2" 40 | }, 41 | "source": [ 42 | "## Step 1. Create mapper and reducer.\n", 43 | "\n", 44 | "Hint: Demo task contains almost all the necessary pieces to complete this assignment. You may use the demo to implement the first MapReduce Job." 
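Taken together, the two jobs implement a word count followed by a global sort by frequency, and the 7th row of the fully sorted output is the answer. A minimal in-memory sketch of that end-to-end logic in plain Python, where the input string is a made-up stand-in for the Wikipedia articles:

    from collections import Counter

    text = "a b a c a b d e f g h a b c"          # hypothetical stand-in for /data/wiki/en_articles_part
    counts = Counter(text.lower().split())        # job 1: word -> count

    # job 2: total order, most popular first (the word is an arbitrary tie-break for this toy input)
    ranking = sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))

    word, count = ranking[6]                      # 7th row, e.g. what `sed -n 7p` on sorted output would print
    print("%s\t%d" % (word, count))

On the real dataset the selection has to be made on the output of the second job, since only that job runs with a single reducer and is therefore totally ordered; the first job uses 8 reducers.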
45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 1, 50 | "metadata": { 51 | "colab": { 52 | "autoexec": { 53 | "startup": false, 54 | "wait_interval": 0 55 | } 56 | }, 57 | "colab_type": "code", 58 | "id": "A7T5mxDwEOw4" 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Writing mapper1.py\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%%writefile mapper1.py\n", 71 | "\n", 72 | "# Your code for mapper here.\n", 73 | "import sys\n", 74 | "import re\n", 75 | "\n", 76 | "reload(sys)\n", 77 | "sys.setdefaultencoding('utf-8') # required to convert to unicode\n", 78 | "\n", 79 | "for line in sys.stdin:\n", 80 | " try:\n", 81 | " article_id, text = unicode(line.strip()).split('\\t', 1)\n", 82 | " except ValueError as e:\n", 83 | " continue\n", 84 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 85 | " for word in words:\n", 86 | " print >> sys.stderr, \"reporter:counter:Wiki stats,Total words,%d\" % 1\n", 87 | " print \"%s\\t%d\" % (word.lower(), 1)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 2, 93 | "metadata": { 94 | "colab": { 95 | "autoexec": { 96 | "startup": false, 97 | "wait_interval": 0 98 | } 99 | }, 100 | "colab_type": "code", 101 | "id": "oE8KFKqpEOw5" 102 | }, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Writing reducer1.py\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%%writefile reducer1.py\n", 114 | "\n", 115 | "# Your code for reducer here.\n", 116 | "import sys\n", 117 | "\n", 118 | "current_key = None\n", 119 | "word_sum = 0\n", 120 | "\n", 121 | "for line in sys.stdin:\n", 122 | " try:\n", 123 | " key, count = line.strip().split('\\t', 1)\n", 124 | " count = int(count)\n", 125 | " except ValueError as e:\n", 126 | " continue\n", 127 | " if current_key != key:\n", 128 | " if current_key:\n", 129 | " print \"%s\\t%d\" % (current_key, word_sum)\n", 130 | " word_sum = 0\n", 131 | " current_key = key\n", 132 | " word_sum += count\n", 133 | "\n", 134 | "if current_key:\n", 135 | " print \"%s\\t%d\" % (current_key, word_sum)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 0, 141 | "metadata": { 142 | "colab": { 143 | "autoexec": { 144 | "startup": false, 145 | "wait_interval": 0 146 | } 147 | }, 148 | "colab_type": "code", 149 | "id": "LJtc3_u0uXSz" 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# You can use this cell for other experiments: for example, for combiner." 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "colab_type": "text", 160 | "id": "KYZpcwY9EOw6" 161 | }, 162 | "source": [ 163 | "## Step 2. Create sort job.\n", 164 | "\n", 165 | "Hint: You may use MapReduce comparator to solve this step. Make sure that the keys are sorted in ascending order." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 0, 171 | "metadata": { 172 | "colab": { 173 | "autoexec": { 174 | "startup": false, 175 | "wait_interval": 0 176 | } 177 | }, 178 | "colab_type": "code", 179 | "id": "g-NMzQ79EOw7" 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# Your code for sort job here. 
Don't forget to use magic writefile" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": { 190 | "colab": { 191 | "autoexec": { 192 | "startup": false, 193 | "wait_interval": 0 194 | } 195 | }, 196 | "colab_type": "code", 197 | "id": "qs6SKs7wO4be" 198 | }, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Overwriting mapper2.py\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "%%writefile mapper2.py\n", 210 | "\n", 211 | "import sys\n", 212 | "reload(sys)\n", 213 | "\n", 214 | "for line in sys.stdin:\n", 215 | " try:\n", 216 | " word, count = line.strip().split('\\t', 1)\n", 217 | " count = int(count)\n", 218 | " print '%d\\t%s' % (count, word)\n", 219 | " except ValueError as e:\n", 220 | " continue" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Overwriting reducer2.py\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "%%writefile reducer2.py\n", 238 | "\n", 239 | "import sys\n", 240 | "reload(sys)\n", 241 | "\n", 242 | "for line in sys.stdin:\n", 243 | " try:\n", 244 | " count, word = line.strip().split('\\t', 1)\n", 245 | " count = int(count)\n", 246 | " print '%s\\t%d' % (word, count)\n", 247 | " except ValueError as e:\n", 248 | " continue" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": { 254 | "colab_type": "text", 255 | "id": "4OBE81GZEOw8" 256 | }, 257 | "source": [ 258 | "## Step 3. Bash commands\n", 259 | "\n", 260 | " Hint: For printing the exact row you may use basic UNIX commands. For instance, sed/head/tail/... (if you know other commands, you can use them).\n", 261 | "\n", 262 | "To run both jobs, you must use two consecutive yarn-commands. Remember that the input for the second job is the ouput for the first job." 
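As in the NameCount notebook above, the sorting job can set mapreduce.job.output.key.comparator.class to KeyFieldBasedComparator with mapreduce.partition.keycomparator.options="-nr", i.e. compare keys numerically (-n) and in reverse (-r), analogous to `sort -nr`. A small local sketch of the ordering this gives on mapper2's count<TAB>word output (sample lines modeled loosely on the job output below):

    # mapper2 emits "count<TAB>word"; a numeric, reversed comparison on the key
    # puts the most frequent words first.
    lines = ["126420\tis", "9\tzebra", "312\tthe"]   # invented sample of mapper2 output

    def numeric_key(line):
        count, _word = line.split("\t", 1)
        return int(count)

    for line in sorted(lines, key=numeric_key, reverse=True):
        count, word = line.split("\t", 1)
        print("%s\t%s" % (word, count))              # reducer2 then swaps back to "word<TAB>count"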
263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 9, 268 | "metadata": { 269 | "colab": { 270 | "autoexec": { 271 | "startup": false, 272 | "wait_interval": 0 273 | } 274 | }, 275 | "colab_type": "code", 276 | "id": "1q9NczdIEOw9" 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "is\t126420\n" 284 | ] 285 | }, 286 | { 287 | "name": "stderr", 288 | "output_type": "stream", 289 | "text": [ 290 | "rm: `assignment1_wordcount_1534486609377997': No such file or directory\n", 291 | "18/08/17 06:16:53 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 292 | "18/08/17 06:16:53 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 293 | "18/08/17 06:16:54 INFO mapred.FileInputFormat: Total input files to process : 1\n", 294 | "18/08/17 06:16:54 INFO mapreduce.JobSubmitter: number of splits:2\n", 295 | "18/08/17 06:16:54 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534370228375_0013\n", 296 | "18/08/17 06:16:54 INFO impl.YarnClientImpl: Submitted application application_1534370228375_0013\n", 297 | "18/08/17 06:16:55 INFO mapreduce.Job: The url to track the job: http://94eb3f8bc528:8088/proxy/application_1534370228375_0013/\n", 298 | "18/08/17 06:16:55 INFO mapreduce.Job: Running job: job_1534370228375_0013\n", 299 | "18/08/17 06:17:02 INFO mapreduce.Job: Job job_1534370228375_0013 running in uber mode : false\n", 300 | "18/08/17 06:17:02 INFO mapreduce.Job: map 0% reduce 0%\n", 301 | "18/08/17 06:17:18 INFO mapreduce.Job: map 28% reduce 0%\n", 302 | "18/08/17 06:17:24 INFO mapreduce.Job: map 42% reduce 0%\n", 303 | "18/08/17 06:17:30 INFO mapreduce.Job: map 48% reduce 0%\n", 304 | "18/08/17 06:17:36 INFO mapreduce.Job: map 56% reduce 0%\n", 305 | "18/08/17 06:17:42 INFO mapreduce.Job: map 77% reduce 0%\n", 306 | "18/08/17 06:17:48 INFO mapreduce.Job: map 83% reduce 0%\n", 307 | "18/08/17 06:17:55 INFO mapreduce.Job: map 100% reduce 13%\n", 308 | "18/08/17 06:17:56 INFO mapreduce.Job: map 100% reduce 63%\n", 309 | "18/08/17 06:17:59 INFO mapreduce.Job: map 100% reduce 88%\n", 310 | "18/08/17 06:18:00 INFO mapreduce.Job: map 100% reduce 100%\n", 311 | "18/08/17 06:18:00 INFO mapreduce.Job: Job job_1534370228375_0013 completed successfully\n", 312 | "18/08/17 06:18:00 INFO mapreduce.Job: Counters: 51\n", 313 | "\tFile System Counters\n", 314 | "\t\tFILE: Number of bytes read=17954826\n", 315 | "\t\tFILE: Number of bytes written=28258223\n", 316 | "\t\tFILE: Number of read operations=0\n", 317 | "\t\tFILE: Number of large read operations=0\n", 318 | "\t\tFILE: Number of write operations=0\n", 319 | "\t\tHDFS: Number of bytes read=76874501\n", 320 | "\t\tHDFS: Number of bytes written=5370513\n", 321 | "\t\tHDFS: Number of read operations=30\n", 322 | "\t\tHDFS: Number of large read operations=0\n", 323 | "\t\tHDFS: Number of write operations=16\n", 324 | "\tJob Counters \n", 325 | "\t\tKilled reduce tasks=1\n", 326 | "\t\tLaunched map tasks=2\n", 327 | "\t\tLaunched reduce tasks=8\n", 328 | "\t\tData-local map tasks=2\n", 329 | "\t\tTotal time spent by all maps in occupied slots (ms)=89237\n", 330 | "\t\tTotal time spent by all reduces in occupied slots (ms)=56195\n", 331 | "\t\tTotal time spent by all map tasks (ms)=89237\n", 332 | "\t\tTotal time spent by all reduce tasks (ms)=56195\n", 333 | "\t\tTotal vcore-milliseconds taken by all map tasks=89237\n", 334 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=56195\n", 335 | "\t\tTotal 
megabyte-milliseconds taken by all map tasks=91378688\n", 336 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=57543680\n", 337 | "\tMap-Reduce Framework\n", 338 | "\t\tMap input records=4100\n", 339 | "\t\tMap output records=11937375\n", 340 | "\t\tMap output bytes=97842436\n", 341 | "\t\tMap output materialized bytes=8952763\n", 342 | "\t\tInput split bytes=228\n", 343 | "\t\tCombine input records=11937375\n", 344 | "\t\tCombine output records=640011\n", 345 | "\t\tReduce input groups=427175\n", 346 | "\t\tReduce shuffle bytes=8952763\n", 347 | "\t\tReduce input records=640011\n", 348 | "\t\tReduce output records=427175\n", 349 | "\t\tSpilled Records=1920033\n", 350 | "\t\tShuffled Maps =16\n", 351 | "\t\tFailed Shuffles=0\n", 352 | "\t\tMerged Map outputs=16\n", 353 | "\t\tGC time elapsed (ms)=1313\n", 354 | "\t\tCPU time spent (ms)=166360\n", 355 | "\t\tPhysical memory (bytes) snapshot=2304663552\n", 356 | "\t\tVirtual memory (bytes) snapshot=20216516608\n", 357 | "\t\tTotal committed heap usage (bytes)=1602748416\n", 358 | "\tShuffle Errors\n", 359 | "\t\tBAD_ID=0\n", 360 | "\t\tCONNECTION=0\n", 361 | "\t\tIO_ERROR=0\n", 362 | "\t\tWRONG_LENGTH=0\n", 363 | "\t\tWRONG_MAP=0\n", 364 | "\t\tWRONG_REDUCE=0\n", 365 | "\tWiki stats\n", 366 | "\t\tTotal words=11937375\n", 367 | "\tFile Input Format Counters \n", 368 | "\t\tBytes Read=76874273\n", 369 | "\tFile Output Format Counters \n", 370 | "\t\tBytes Written=5370513\n", 371 | "18/08/17 06:18:00 INFO streaming.StreamJob: Output directory: assignment1_wordcount_1534486609377997\n", 372 | "rm: `assignment1_sorting_1534486680927516': No such file or directory\n", 373 | "18/08/17 06:18:04 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 374 | "18/08/17 06:18:04 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 375 | "18/08/17 06:18:05 INFO mapred.FileInputFormat: Total input files to process : 8\n", 376 | "18/08/17 06:18:05 INFO mapreduce.JobSubmitter: number of splits:8\n", 377 | "18/08/17 06:18:05 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1534370228375_0014\n", 378 | "18/08/17 06:18:05 INFO impl.YarnClientImpl: Submitted application application_1534370228375_0014\n", 379 | "18/08/17 06:18:05 INFO mapreduce.Job: The url to track the job: http://94eb3f8bc528:8088/proxy/application_1534370228375_0014/\n", 380 | "18/08/17 06:18:05 INFO mapreduce.Job: Running job: job_1534370228375_0014\n", 381 | "18/08/17 06:18:11 INFO mapreduce.Job: Job job_1534370228375_0014 running in uber mode : false\n", 382 | "18/08/17 06:18:11 INFO mapreduce.Job: map 0% reduce 0%\n", 383 | "18/08/17 06:18:16 INFO mapreduce.Job: map 13% reduce 0%\n", 384 | "18/08/17 06:18:17 INFO mapreduce.Job: map 75% reduce 0%\n", 385 | "18/08/17 06:18:20 INFO mapreduce.Job: map 88% reduce 0%\n", 386 | "18/08/17 06:18:21 INFO mapreduce.Job: map 100% reduce 0%\n", 387 | "18/08/17 06:18:23 INFO mapreduce.Job: map 100% reduce 100%\n", 388 | "18/08/17 06:18:23 INFO mapreduce.Job: Job job_1534370228375_0014 completed successfully\n", 389 | "18/08/17 06:18:24 INFO mapreduce.Job: Counters: 50\n", 390 | "\tFile System Counters\n", 391 | "\t\tFILE: Number of bytes read=6224888\n", 392 | "\t\tFILE: Number of bytes written=13710339\n", 393 | "\t\tFILE: Number of read operations=0\n", 394 | "\t\tFILE: Number of large read operations=0\n", 395 | "\t\tFILE: Number of write operations=0\n", 396 | "\t\tHDFS: Number of bytes read=5371593\n", 397 | "\t\tHDFS: Number of bytes written=5370513\n", 398 | "\t\tHDFS: Number of read 
operations=27\n", 399 | "\t\tHDFS: Number of large read operations=0\n", 400 | "\t\tHDFS: Number of write operations=2\n", 401 | "\tJob Counters \n", 402 | "\t\tKilled map tasks=1\n", 403 | "\t\tLaunched map tasks=8\n", 404 | "\t\tLaunched reduce tasks=1\n", 405 | "\t\tData-local map tasks=8\n", 406 | "\t\tTotal time spent by all maps in occupied slots (ms)=26584\n", 407 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3953\n", 408 | "\t\tTotal time spent by all map tasks (ms)=26584\n", 409 | "\t\tTotal time spent by all reduce tasks (ms)=3953\n", 410 | "\t\tTotal vcore-milliseconds taken by all map tasks=26584\n", 411 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3953\n", 412 | "\t\tTotal megabyte-milliseconds taken by all map tasks=27222016\n", 413 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=4047872\n", 414 | "\tMap-Reduce Framework\n", 415 | "\t\tMap input records=427175\n", 416 | "\t\tMap output records=427175\n", 417 | "\t\tMap output bytes=5370522\n", 418 | "\t\tMap output materialized bytes=6224930\n", 419 | "\t\tInput split bytes=1080\n", 420 | "\t\tCombine input records=0\n", 421 | "\t\tCombine output records=0\n", 422 | "\t\tReduce input groups=2075\n", 423 | "\t\tReduce shuffle bytes=6224930\n", 424 | "\t\tReduce input records=427175\n", 425 | "\t\tReduce output records=427175\n", 426 | "\t\tSpilled Records=854350\n", 427 | "\t\tShuffled Maps =8\n", 428 | "\t\tFailed Shuffles=0\n", 429 | "\t\tMerged Map outputs=8\n", 430 | "\t\tGC time elapsed (ms)=991\n", 431 | "\t\tCPU time spent (ms)=16860\n", 432 | "\t\tPhysical memory (bytes) snapshot=2560761856\n", 433 | "\t\tVirtual memory (bytes) snapshot=18105384960\n", 434 | "\t\tTotal committed heap usage (bytes)=1762656256\n", 435 | "\tShuffle Errors\n", 436 | "\t\tBAD_ID=0\n", 437 | "\t\tCONNECTION=0\n", 438 | "\t\tIO_ERROR=0\n", 439 | "\t\tWRONG_LENGTH=0\n", 440 | "\t\tWRONG_MAP=0\n", 441 | "\t\tWRONG_REDUCE=0\n", 442 | "\tFile Input Format Counters \n", 443 | "\t\tBytes Read=5370513\n", 444 | "\tFile Output Format Counters \n", 445 | "\t\tBytes Written=5370513\n", 446 | "18/08/17 06:18:24 INFO streaming.StreamJob: Output directory: assignment1_sorting_1534486680927516\n" 447 | ] 448 | } 449 | ], 450 | "source": [ 451 | "%%bash\n", 452 | "\n", 453 | "OUT_DIR=\"assignment1_wordcount_\"$(date +\"%s%6N\")\n", 454 | "\n", 455 | "NUM_REDUCERS=8\n", 456 | "\n", 457 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR} > /dev/null\n", 458 | "\n", 459 | "# Code for your first job\n", 460 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 461 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 462 | " -D mapred.jab.name=\"Streaming wordCount\" \\\n", 463 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 464 | " -files mapper1.py,reducer1.py \\\n", 465 | " -mapper \"python mapper1.py\" \\\n", 466 | " -combiner \"python reducer1.py\" \\\n", 467 | " -reducer \"python reducer1.py\" \\\n", 468 | " -input /data/wiki/en_articles_part \\\n", 469 | " -output ${OUT_DIR} > /dev/null\n", 470 | "\n", 471 | "OUT_DIR1=\"assignment1_sorting_\"$(date +\"%s%6N\")\n", 472 | "\n", 473 | "NUM_REDUCERS=1\n", 474 | "\n", 475 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR1} > /dev/null\n", 476 | "\n", 477 | "# Code for your second job\n", 478 | "# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar ...\n", 479 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 480 | " -D mapred.jab.name=\"Streaming sorting\" 
\\\n", 481 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 482 | " -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \\\n", 483 | " -D mapreduce.partition.keycomparator.options=\"-nr\" \\\n", 484 | " -files mapper2.py,reducer2.py \\\n", 485 | " -mapper \"python mapper2.py\" \\\n", 486 | " -reducer \"python reducer2.py\" \\\n", 487 | " -input ${OUT_DIR} \\\n", 488 | " -output ${OUT_DIR1} > /dev/null\n", 489 | "\n", 490 | "# Code for obtaining the results\n", 491 | "hdfs dfs -cat ${OUT_DIR1}/part-00000 | sed -n 7p\n", 492 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR1}* > /dev/null" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [] 501 | } 502 | ], 503 | "metadata": { 504 | "colab": { 505 | "collapsed_sections": [], 506 | "default_view": {}, 507 | "name": "701_to_students.ipynb", 508 | "provenance": [], 509 | "version": "0.3.2", 510 | "views": {} 511 | }, 512 | "kernelspec": { 513 | "display_name": "Python 3", 514 | "language": "python", 515 | "name": "python3" 516 | }, 517 | "language_info": { 518 | "codemirror_mode": { 519 | "name": "ipython", 520 | "version": 3 521 | }, 522 | "file_extension": ".py", 523 | "mimetype": "text/x-python", 524 | "name": "python", 525 | "nbconvert_exporter": "python", 526 | "pygments_lexer": "ipython3", 527 | "version": "3.6.1" 528 | }, 529 | "toc": { 530 | "base_numbering": 1, 531 | "nav_menu": {}, 532 | "number_sections": true, 533 | "sideBar": true, 534 | "skip_h1_title": false, 535 | "title_cell": "Table of Contents", 536 | "title_sidebar": "Contents", 537 | "toc_cell": false, 538 | "toc_position": {}, 539 | "toc_section_display": true, 540 | "toc_window_display": false 541 | } 542 | }, 543 | "nbformat": 4, 544 | "nbformat_minor": 1 545 | } 546 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/1 Welcome.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/1 Welcome.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/2 RDDs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/2 RDDs.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/3 Transformations 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/3 Transformations 1.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/4 Transformations 2.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/4 Transformations 2.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/5 Actions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/5 Actions.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/6 Resiliency.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/1 Core concepts and abstractions/6 Resiliency.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/1 Execution and Scheduling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/1 Execution and Scheduling.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/2 Caching and Persistence.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/2 Caching and Persistence.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/3 Broadcast variables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/3 Broadcast variables.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/4 Accumulator variables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/2 Advanced topics/4 Accumulator variables.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/1 Getting started with Spark and Python.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/1 Getting started with Spark and Python.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/2 Working with text files.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/2 Working with text files.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/3 Joins.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/3 Joins.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/4 Broadcast and Accumulator variables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/4 Broadcast and Accumulator variables.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/5 Spark UI.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/5 Spark UI.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/6 Cluster mode.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week4 - Introduction to Apache Spark/3 Working with Spark in Python/6 Cluster mode.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week4 - Introduction to Apache Spark/Dropped Text.txt: -------------------------------------------------------------------------------- 1 | are -------------------------------------------------------------------------------- /Big-Data-Essentials/Week5 - Introduction to Apache Spark - Practice/Spakr+Assignment1+-+Pairs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#! 
/usr/bin/env python\n", 10 | "\n", 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "sc = SparkContext(conf=SparkConf().setAppName(\"MyApp\").setMaster(\"local[2]\"))\n", 13 | "\n", 14 | "import re" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "narodnaya_gazeta\t1\n", 27 | "narodnaya_volya\t9\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "def parse_article(line):\n", 33 | " try:\n", 34 | " article_id, text = unicode(line.rstrip()).split('\\t', 1)\n", 35 | " except ValueError as e:\n", 36 | " return []\n", 37 | " text = re.sub(\"^\\W+|\\W+$\", \"\", text, flags=re.UNICODE)\n", 38 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 39 | " word_pair = list(zip(words[:-1], words[1:]))\n", 40 | " word_pair = [str(pair[1].lower()) for pair in word_pair if pair[0].lower() == 'narodnaya']\n", 41 | " return word_pair\n", 42 | "\n", 43 | "wiki = sc.textFile(\"/data/wiki/en_articles_part/articles-part\", 16).flatMap(parse_article).map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y).sortByKey() \n", 44 | "result = wiki.map(lambda x: ('narodnaya_' + str(x[0]), x[1])).collect()\n", 45 | "\n", 46 | "for pair_count in result:\n", 47 | " print \"%s\\t%d\" % (pair_count)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 2", 61 | "language": "python", 62 | "name": "python2" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 2 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython2", 74 | "version": "2.7.12" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week5 - Introduction to Apache Spark - Practice/Spark+Assignment1+-+Pairs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#! 
/usr/bin/env python\n", 10 | "\n", 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "sc = SparkContext(conf=SparkConf().setAppName(\"MyApp\").setMaster(\"local[2]\"))\n", 13 | "\n", 14 | "import re" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 5, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "narodnaya_gazeta\t1\n", 27 | "narodnaya_volya\t9\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "def parse_article(line):\n", 33 | " try:\n", 34 | " article_id, text = unicode(line.rstrip()).split('\\t', 1)\n", 35 | " except ValueError as e:\n", 36 | " return []\n", 37 | " text = re.sub(\"^\\W+|\\W+$\", \"\", text, flags=re.UNICODE)\n", 38 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 39 | " word_pair = list(zip(words[:-1], words[1:]))\n", 40 | " word_pair = [word2.lower() for word1, word2 in word_pair if word1.lower() == 'narodnaya']\n", 41 | " return word_pair\n", 42 | "\n", 43 | "wiki = sc.textFile(\"/data/wiki/en_articles_part/articles-part\", 16).flatMap(parse_article).filter(lambda x: x != []).map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y).sortByKey() \n", 44 | "result = wiki.map(lambda x: ('narodnaya_' + x[0], x[1])).collect()\n", 45 | "\n", 46 | "for pair_count in result:\n", 47 | " print \"%s\\t%d\" % (pair_count)\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 2", 61 | "language": "python", 62 | "name": "python2" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 2 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython2", 74 | "version": "2.7.12" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 2 79 | } 80 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week5 - Introduction to Apache Spark - Practice/Spark+Assignment2+-+Collocations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#! 
/usr/bin/env python\n", 10 | "\n", 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "sc = SparkContext(conf=SparkConf().setAppName(\"MyApp\").setMaster(\"local[2]\"))\n", 13 | "\n", 14 | "import math\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "with open('/datasets/stop_words_en.txt') as f:\n", 25 | " stop_words = [word.strip() for word in f]" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 17, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "def npmi(word1, word2, count, word_count_dict, total_words, total_pairs):\n", 35 | " p_a = word_count_dict[word1] / total_words\n", 36 | " p_b = word_count_dict[word2] / total_words\n", 37 | " p_a_b = count / total_pairs\n", 38 | " return - math.log(p_a_b / (p_a * p_b)) / math.log(p_a_b) " 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "def parse_article(line):\n", 48 | " try:\n", 49 | " article_id, text = unicode(line.rstrip()).split('\\t', 1)\n", 50 | " text = re.sub(\"^\\W+|\\W+$\", \"\", text, flags=re.UNICODE)\n", 51 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 52 | " words = [word.lower() for word in words]\n", 53 | " return words\n", 54 | " except ValueError as e:\n", 55 | " return []" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 6, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "wiki = sc.textFile(\"/data/wiki/en_articles_part/articles-part\", 16).map(parse_article)\n", 65 | "\n", 66 | "wiki_filtered = wiki.filter(lambda x: x != [])\n", 67 | "wiki_filtered = wiki_filtered.map(lambda x: [word for word in x if word not in stop_words])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "total_words = wiki_filtered.map(lambda words: len(words)).reduce(lambda a,b: a+b)\n", 77 | "\n", 78 | "wiki_mapped = wiki_filtered.flatMap(lambda words: list(zip(words[:-1], words[1:])))\n", 79 | "\n", 80 | "total_pairs = wiki_mapped.map(lambda words: 1).reduce(lambda x,y: x+y)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 8, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "word_occur = wiki_filtered.flatMap(lambda words: [(word, 1) for word in words]).reduceByKey(lambda x,y: x+y)\n", 90 | "word_occur = word_occur.collect()\n", 91 | "word_count_dict = dict()\n", 92 | "for word, count in word_occur:\n", 93 | " word_count_dict[word] = count" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 35, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "total_words = float(total_words)\n", 103 | "total_pairs = float(total_pairs)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 9, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "wiki_mostoccur = wiki_mapped.map(lambda x: (x,1)).reduceByKey(lambda x,y: x+y).filter(lambda x: x[1] >= 500)\n", 113 | "wiki_npmi = wiki_mostoccur.map(lambda (word, count): (word, npmi(word[0], word[1], count, \n", 114 | " word_count_dict, total_words, total_pairs)))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 37, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "los_angeles\n", 127 | "external_links\n", 128 | 
"united_states\n", 129 | "prime_minister\n", 130 | "san_francisco\n", 131 | "et_al\n", 132 | "new_york\n", 133 | "supreme_court\n", 134 | "19th_century\n", 135 | "20th_century\n", 136 | "references_external\n", 137 | "soviet_union\n", 138 | "air_force\n", 139 | "baseball_player\n", 140 | "university_press\n", 141 | "roman_catholic\n", 142 | "united_kingdom\n", 143 | "references_reading\n", 144 | "notes_references\n", 145 | "award_best\n", 146 | "north_america\n", 147 | "new_zealand\n", 148 | "civil_war\n", 149 | "catholic_church\n", 150 | "world_war\n", 151 | "war_ii\n", 152 | "south_africa\n", 153 | "took_place\n", 154 | "roman_empire\n", 155 | "united_nations\n", 156 | "american_singer-songwriter\n", 157 | "high_school\n", 158 | "american_actor\n", 159 | "american_actress\n", 160 | "american_baseball\n", 161 | "york_city\n", 162 | "american_football\n", 163 | "years_later\n", 164 | "north_american\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "wiki_npmi = wiki_npmi.map(lambda (word, npmi_value): (npmi_value, word)) \\\n", 170 | " .sortByKey(ascending=False) \\\n", 171 | " .map(lambda (npmi_value, word): word[0]+'_'+word[1])\n", 172 | "\n", 173 | "for pair in wiki_npmi.take(39):\n", 174 | " print \"%s\" % (pair)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 2", 188 | "language": "python", 189 | "name": "python2" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 2 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython2", 201 | "version": "2.7.12" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/1 Working with samples/data_dictionary_trip_records_yellow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/1 Working with samples/data_dictionary_trip_records_yellow.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/1 Working with samples/sample100.csv: -------------------------------------------------------------------------------- 1 | VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount 2 | 2,2016-12-15 12:18:37,2016-12-15 12:28:45,6,.50,1,N,141,237,2,7.5,0,0.5,0,0,0.3,8.3 3 | 1,2016-12-20 22:39:23,2016-12-20 22:50:27,1,2.00,1,N,246,164,1,9.5,0.5,0.5,3,0,0.3,13.8 4 | 1,2016-12-09 23:02:06,2016-12-09 23:11:03,1,1.10,1,N,114,79,1,7.5,0.5,0.5,1.75,0,0.3,10.55 5 | 1,2016-12-11 15:40:19,2016-12-11 16:09:53,1,11.90,1,N,138,224,1,34,0,0.5,8.05,5.54,0.3,48.39 6 | 2,2016-12-19 11:34:28,2016-12-19 11:47:51,1,1.96,1,N,261,144,2,10.5,0,0.5,0,0,0.3,11.3 7 | 2,2016-12-29 14:12:30,2016-12-29 14:22:57,5,1.81,1,N,237,143,2,9,0,0.5,0,0,0.3,9.8 8 | 1,2016-12-19 19:30:59,2016-12-19 
19:38:48,1,1.50,1,N,237,239,1,7.5,1,0.5,1.85,0,0.3,11.15 9 | 2,2016-12-28 14:39:49,2016-12-28 14:50:42,2,1.47,1,N,43,237,2,9,0,0.5,0,0,0.3,9.8 10 | 2,2016-12-31 20:15:07,2016-12-31 20:19:10,1,.62,1,N,263,140,2,4.5,0.5,0.5,0,0,0.3,5.8 11 | 2,2016-12-04 21:42:44,2016-12-04 21:49:46,1,1.06,1,N,68,90,2,6.5,0.5,0.5,0,0,0.3,7.8 12 | 1,2016-12-22 12:28:58,2016-12-22 12:34:38,1,1.20,1,N,238,236,2,6.5,0,0.5,0,0,0.3,7.3 13 | 2,2016-12-01 02:05:10,2016-12-01 02:37:36,1,11.98,1,N,24,181,1,36.5,0.5,0.5,11.34,0,0.3,49.14 14 | 2,2016-12-07 15:31:55,2016-12-07 16:23:04,1,3.64,1,N,170,237,1,29.5,0,0.5,1,0,0.3,31.3 15 | 2,2016-12-17 10:42:28,2016-12-17 11:00:32,1,5.29,1,N,140,211,1,18.5,0,0.5,2,0,0.3,21.3 16 | 2,2016-12-11 11:44:18,2016-12-11 11:51:40,1,.57,1,N,237,237,1,6.5,0,0.5,1.46,0,0.3,8.76 17 | 1,2016-12-01 17:12:46,2016-12-01 17:49:16,1,9.70,1,N,141,138,2,34,1,0.5,0,5.54,0.3,41.34 18 | 1,2016-12-21 14:04:49,2016-12-21 14:09:13,1,.40,1,N,140,237,1,4.5,0,0.5,1.05,0,0.3,6.35 19 | 1,2016-12-07 17:22:12,2016-12-07 17:39:59,1,1.70,1,N,161,142,1,12.5,1,0.5,3.55,0,0.3,17.85 20 | 2,2016-12-09 22:49:05,2016-12-09 23:15:47,1,2.85,1,N,161,229,2,17.5,0.5,0.5,0,0,0.3,18.8 21 | 2,2016-12-11 16:56:37,2016-12-11 17:04:48,1,1.66,1,N,237,236,2,7.5,0,0.5,0,0,0.3,8.3 22 | 1,2016-12-31 17:09:39,2016-12-31 17:23:26,1,2.40,1,N,246,144,1,11.5,0,0.5,2.45,0,0.3,14.75 23 | 1,2016-12-29 10:53:43,2016-12-29 11:08:08,1,3.20,1,N,262,161,2,13.5,0,0.5,0,0,0.3,14.3 24 | 1,2016-12-22 06:49:44,2016-12-22 06:52:55,1,.80,1,N,249,186,1,5,0,0.5,1.15,0,0.3,6.95 25 | 1,2016-12-21 04:50:45,2016-12-21 04:52:14,1,.70,1,N,233,162,2,4,0.5,0.5,0,0,0.3,5.3 26 | 2,2016-12-16 14:29:19,2016-12-16 15:57:09,2,11.15,1,N,142,138,1,56,0,0.5,18.7,5.54,0.3,81.04 27 | 2,2016-12-11 19:37:30,2016-12-11 19:56:13,1,2.31,1,N,162,234,2,13,0,0.5,0,0,0.3,13.8 28 | 1,2016-12-27 14:54:40,2016-12-27 15:01:17,1,.60,1,N,170,161,1,6,0,0.5,0,0,0.3,6.8 29 | 2,2016-12-15 13:29:03,2016-12-15 13:52:23,1,4.03,1,N,236,50,2,17,0,0.5,0,0,0.3,17.8 30 | 2,2016-12-05 21:21:17,2016-12-05 21:48:36,5,16.36,1,N,132,256,2,45,0.5,0.5,0,0,0.3,46.3 31 | 2,2016-12-11 13:10:18,2016-12-11 13:15:29,3,1.37,1,N,231,249,1,6.5,0,0.5,1.3,0,0.3,8.6 32 | 2,2016-12-23 10:53:09,2016-12-23 11:04:36,5,1.94,1,N,45,79,2,7.5,0,0.5,0,0,0.3,8.3 33 | 1,2016-12-05 19:27:57,2016-12-05 19:37:50,1,1.80,1,N,239,236,1,9,1,0.5,2.2,0,0.3,13 34 | 1,2016-12-27 19:15:05,2016-12-27 19:22:42,2,1.60,1,N,142,236,2,8,1,0.5,0,0,0.3,9.8 35 | 2,2016-12-23 15:35:25,2016-12-23 16:02:33,1,3.63,1,N,163,237,1,19.5,0,0.5,4.06,0,0.3,24.36 36 | 2,2016-12-26 21:31:55,2016-12-26 21:57:04,1,4.40,1,N,186,41,1,19.5,0.5,0.5,4.16,0,0.3,24.96 37 | 2,2016-12-13 20:14:25,2016-12-13 20:31:53,5,7.62,1,N,138,145,1,22.5,0.5,0.5,4.76,0,0.3,28.56 38 | 1,2016-12-12 02:06:28,2016-12-12 02:42:19,4,8.80,1,N,68,82,1,30.5,0.5,0.5,5,0,0.3,36.8 39 | 2,2016-12-13 17:15:02,2016-12-13 17:37:11,2,3.92,1,N,68,261,1,17,1,0.5,3.76,0,0.3,22.56 40 | 2,2016-12-06 13:59:36,2016-12-06 14:12:10,1,.82,1,N,263,262,1,9,0,0.5,1.96,0,0.3,11.76 41 | 1,2016-12-02 22:28:11,2016-12-02 22:59:39,2,6.30,1,N,144,238,1,25.5,0.5,0.5,5.35,0,0.3,32.15 42 | 1,2016-12-07 11:33:11,2016-12-07 11:52:53,1,1.80,1,N,114,87,1,13,0,0.5,3.45,0,0.3,17.25 43 | 1,2016-12-22 17:15:53,2016-12-22 17:36:54,2,1.20,1,N,43,161,1,13.5,1,0.5,3.05,0,0.3,18.35 44 | 2,2016-12-02 13:52:24,2016-12-02 14:07:35,1,3.99,1,N,261,33,2,15,0,0.5,0,0,0.3,15.8 45 | 2,2016-12-22 22:02:47,2016-12-22 22:04:24,1,.05,1,N,48,48,1,3,0.5,0.5,1,0,0.3,5.3 46 | 2,2016-12-22 23:32:41,2016-12-22 
23:43:25,1,2.03,1,N,161,224,1,9,0.5,0.5,2.06,0,0.3,12.36 47 | 1,2016-12-04 11:07:44,2016-12-04 11:12:28,1,.60,1,N,236,262,1,5,0,0.5,1,0,0.3,6.8 48 | 2,2016-12-28 16:36:47,2016-12-28 16:43:53,2,.94,1,N,79,4,2,6.5,1,0.5,0,0,0.3,8.3 49 | 1,2016-12-11 21:43:11,2016-12-11 21:46:29,1,.50,1,N,113,79,1,4.5,0.5,0.5,1.15,0,0.3,6.95 50 | 2,2016-12-10 23:10:49,2016-12-10 23:47:36,1,3.98,1,N,211,140,1,23,0.5,0.5,3,0,0.3,27.3 51 | 1,2016-12-07 10:16:06,2016-12-07 10:21:03,1,.70,1,N,137,107,1,5,0,0.5,1.15,0,0.3,6.95 52 | 2,2016-12-22 10:26:55,2016-12-22 10:34:27,1,1.50,1,N,137,148,1,7.5,0,0.5,1.66,0,0.3,9.96 53 | 2,2016-12-23 00:32:31,2016-12-23 00:41:06,6,1.46,1,N,249,107,1,8,0.5,0.5,2.79,0,0.3,12.09 54 | 2,2016-12-19 14:26:29,2016-12-19 14:30:45,1,.62,1,N,137,170,1,5,0,0.5,1.16,0,0.3,6.96 55 | 2,2016-12-08 21:53:15,2016-12-08 22:02:16,3,2.58,1,N,13,158,1,10.5,0.5,0.5,3.54,0,0.3,15.34 56 | 1,2016-12-04 13:56:59,2016-12-04 14:02:07,2,.80,1,N,238,239,1,6,0,0.5,1.35,0,0.3,8.15 57 | 1,2016-12-19 17:00:08,2016-12-19 17:10:36,1,1.10,1,N,142,237,1,8,1,0.5,1.95,0,0.3,11.75 58 | 1,2016-12-29 11:04:19,2016-12-29 11:24:53,1,2.30,1,N,229,43,2,14.5,0,0.5,0,0,0.3,15.3 59 | 2,2016-12-28 13:42:20,2016-12-28 14:08:26,1,1.65,1,N,228,228,1,16,0,0.5,3.36,0,0.3,20.16 60 | 1,2016-12-06 23:32:31,2016-12-06 23:46:15,1,1.80,1,N,158,164,1,10.5,0.5,0.5,2.36,0,0.3,14.16 61 | 1,2016-12-27 01:43:20,2016-12-27 01:46:58,1,.70,1,N,231,87,1,4.5,0.5,0.5,1.15,0,0.3,6.95 62 | 1,2016-12-13 19:16:12,2016-12-13 19:22:48,1,1.20,1,N,162,107,1,6.5,1,0.5,1.66,0,0.3,9.96 63 | 2,2016-12-10 16:29:18,2016-12-10 17:02:47,5,1.73,1,N,107,45,1,20,0,0.5,2,0,0.3,22.8 64 | 2,2016-12-27 09:50:40,2016-12-27 10:22:29,1,17.02,2,N,132,162,1,52,0,0.5,11.67,5.54,0.3,70.01 65 | 1,2016-12-09 10:22:27,2016-12-09 10:32:51,1,1.80,1,N,151,152,1,9.5,0,0.5,2.05,0,0.3,12.35 66 | 1,2016-12-06 21:58:00,2016-12-06 22:01:30,1,.60,1,N,75,75,2,4.5,0.5,0.5,0,0,0.3,5.8 67 | 1,2016-12-05 22:24:55,2016-12-05 22:54:51,1,19.30,2,N,132,236,1,52,0,0.5,11.65,5.54,0.3,69.99 68 | 2,2016-12-09 10:58:18,2016-12-09 11:05:41,1,.80,1,N,68,158,1,6,0,0.5,1,0,0.3,7.8 69 | 2,2016-12-24 17:41:04,2016-12-24 17:41:55,6,.26,1,N,158,158,2,3,0,0.5,0,0,0.3,3.8 70 | 2,2016-12-22 23:59:06,2016-12-23 00:07:17,1,1.44,1,N,234,144,1,7.5,0.5,0.5,1.08,0,0.3,9.88 71 | 1,2016-12-14 16:30:40,2016-12-14 16:37:22,2,1.20,1,N,186,113,2,6.5,1,0.5,0,0,0.3,8.3 72 | 1,2016-12-06 13:10:04,2016-12-06 13:20:25,1,1.50,1,N,163,239,2,8.5,0,0.5,0,0,0.3,9.3 73 | 2,2016-12-03 13:38:11,2016-12-03 13:51:02,2,.69,1,N,233,161,2,9,0,0.5,0,0,0.3,9.8 74 | 2,2016-12-06 13:40:22,2016-12-06 13:53:16,1,1.58,1,N,246,170,1,10,0,0.5,2.16,0,0.3,12.96 75 | 2,2016-12-13 17:33:54,2016-12-13 17:50:58,2,1.13,1,N,170,233,1,11,1,0.5,2.56,0,0.3,15.36 76 | 2,2016-12-29 15:22:23,2016-12-29 15:27:01,2,.59,1,N,48,43,2,5,0,0.5,0,0,0.3,5.8 77 | 2,2016-12-05 11:45:46,2016-12-05 12:07:43,2,5.66,1,N,88,161,1,21.5,0,0.5,4.46,0,0.3,26.76 78 | 2,2016-12-03 16:26:02,2016-12-03 16:30:05,3,.31,1,N,113,249,2,4.5,0,0.5,0,0,0.3,5.3 79 | 2,2016-12-02 22:57:00,2016-12-02 23:26:51,1,17.12,2,N,132,170,1,52,0,0.5,5,5.54,0.3,63.34 80 | 1,2016-12-19 08:16:16,2016-12-19 08:26:27,1,.90,1,N,48,186,1,8,0,0.5,2,0,0.3,10.8 81 | 1,2016-12-08 20:03:19,2016-12-08 20:11:45,1,1.50,1,N,142,238,2,8,0.5,0.5,0,0,0.3,9.3 82 | 2,2016-12-23 13:08:08,2016-12-23 13:21:15,1,2.40,1,N,161,239,1,11,0,0.5,2.36,0,0.3,14.16 83 | 1,2016-12-19 22:05:22,2016-12-19 22:13:23,1,1.90,1,N,211,137,1,8.5,0.5,0.5,1.95,0,0.3,11.75 84 | 1,2016-12-30 12:42:39,2016-12-30 
12:47:20,2,.50,1,N,239,239,1,5,0,0.5,1.15,0,0.3,6.95 85 | 2,2016-12-19 20:35:56,2016-12-19 20:45:24,2,1.61,1,N,186,233,1,8,0.5,0.5,1.58,0,0.3,10.88 86 | 1,2016-12-24 12:12:49,2016-12-24 12:17:05,1,.80,1,N,236,75,1,5,0,0.5,1.15,0,0.3,6.95 87 | 1,2016-12-28 11:39:23,2016-12-28 11:57:17,4,1.40,1,N,162,163,1,12,0,0.5,2.56,0,0.3,15.36 88 | 2,2016-12-07 22:53:15,2016-12-07 23:02:14,3,.91,1,N,48,50,2,7.5,0.5,0.5,0,0,0.3,8.8 89 | 2,2016-12-17 17:00:12,2016-12-17 17:09:01,3,2.57,1,N,141,74,1,9.5,0,0.5,2.58,0,0.3,12.88 90 | 1,2016-12-12 22:10:34,2016-12-12 22:15:59,1,1.20,1,N,170,107,1,6,0.5,0.5,3.5,0,0.3,10.8 91 | 1,2016-12-26 23:08:13,2016-12-26 23:13:53,3,.70,1,N,144,211,2,5.5,0.5,0.5,0,0,0.3,6.8 92 | 2,2016-12-03 01:25:43,2016-12-03 01:28:54,1,.38,1,N,79,79,1,4,0.5,0.5,0.5,0,0.3,5.8 93 | 1,2016-12-21 22:05:49,2016-12-21 22:14:15,2,1.00,1,N,232,231,1,7,0.5,0.5,1.65,0,0.3,9.95 94 | 2,2016-12-20 02:05:01,2016-12-20 02:08:38,2,.67,1,N,100,68,2,4.5,0.5,0.5,0,0,0.3,5.8 95 | 2,2016-12-14 22:54:48,2016-12-14 23:03:55,1,2.42,1,N,249,13,1,9.5,0.5,0.5,2.16,0,0.3,12.96 96 | 1,2016-12-17 17:04:54,2016-12-17 17:09:08,2,.80,1,N,50,142,2,5,0,0.5,0,0,0.3,5.8 97 | 1,2016-12-24 15:54:38,2016-12-24 16:07:04,1,2.00,1,N,163,263,1,10,0,0.5,2.7,0,0.3,13.5 98 | 1,2016-12-20 18:57:57,2016-12-20 19:12:10,2,1.80,1,N,234,45,1,10.5,1,0.5,3.65,0,0.3,15.95 99 | 1,2016-12-10 13:07:05,2016-12-10 13:34:44,2,3.40,1,N,50,79,1,18.5,0,0.5,1,0,0.3,20.3 100 | 2,2016-12-17 12:06:42,2016-12-17 12:17:44,1,.92,1,N,107,186,2,8,0,0.5,0,0,0.3,8.8 101 | 1,2016-12-08 21:43:22,2016-12-08 21:48:13,2,1.40,1,N,162,236,1,6.5,0.5,0.5,1,0,0.3,8.8 102 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/1 Map and Reduce Side Joins.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/1 Map and Reduce Side Joins.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/2 Tabular Data and KeyFieldSelection.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/2 Tabular Data and KeyFieldSelection.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/3 Data Skew and Salting.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/2 Telecommunications Analytics/3 Data Skew and Salting.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/3 Working with social graphs/1 Twitter graph case study.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/3 Working with social graphs/1 Twitter graph case study.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/3 Working with social graphs/2 Shortest path.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/Week6 - Real-World Applications/3 Working with social graphs/2 Shortest path.pdf -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/Shortest_path.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#! /usr/bin/env python\n", 10 | "\n", 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "sc = SparkContext(conf=SparkConf().setAppName(\"MyApp\").setMaster(\"local\"))" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "12,422,53,52,107,20,23,274,34\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "def parse_edge(s):\n", 30 | " user, follower = s.split(\"\\t\")\n", 31 | " return (int(user), int(follower))\n", 32 | "\n", 33 | "n = 5\n", 34 | "raw_data = sc.textFile(\"/data/twitter/twitter_sample_small.txt\")\n", 35 | "edges = raw_data.map(parse_edge).cache()\n", 36 | "\n", 37 | "#edges from follower nodes to main nodes\n", 38 | "forward_edges = edges.map(lambda e: (e[1], e[0])).partitionBy(n).persist()\n", 39 | "\n", 40 | "def step(item):\n", 41 | " # prev_v_l : previous vertices list\n", 42 | " # prev_l : previous vertex\n", 43 | " prev_v, prev_v_l, next_v = item[0], item[1][0], item[1][1]\n", 44 | " return (next_v, prev_v_l + [next_v])\n", 45 | "\n", 46 | "x = 12\n", 47 | "prev_v = [12]\n", 48 | "paths = sc.parallelize([(x, prev_v)]).partitionBy(n)\n", 49 | "flag = 0\n", 50 | "while True:\n", 51 | " paths = paths.join(forward_edges, n).map(step)\n", 52 | " flag = paths.filter(lambda i: i[0] == 34).count()\n", 53 | " if flag:\n", 54 | " break\n", 55 | "\n", 56 | "paths = paths.filter(lambda i: i[0] == 34).collect()\n", 57 | "prev_v_l = paths[0][1]\n", 58 | "print(','.join(map(str, prev_v_l)))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 2", 72 | "language": "python", 73 | "name": "python2" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 2 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython2", 85 | "version": "2.7.12" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /Big-Data-Essentials/Week6 - Real-World Applications/Tf-Idf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 
| { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Writing mapper.py\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%writefile mapper.py\n", 18 | "from __future__ import division\n", 19 | "import sys\n", 20 | "import re\n", 21 | "from collections import Counter\n", 22 | "\n", 23 | "reload(sys)\n", 24 | "sys.setdefaultencoding('utf-8') # required to convert to unicode\n", 25 | "\n", 26 | "with open('stop_words_en.txt') as f:\n", 27 | " stop_words = set(f.read().split())\n", 28 | "\n", 29 | "for line in sys.stdin:\n", 30 | " try:\n", 31 | " article_id, text = unicode(line.strip()).split('\\t', 1)\n", 32 | " article_id = int(article_id)\n", 33 | " except ValueError as e:\n", 34 | " continue\n", 35 | " words = re.split(\"\\W*\\s+\\W*\", text, flags=re.UNICODE)\n", 36 | " words = [x.lower() for x in words if x.lower() not in stop_words]\n", 37 | " words_set = set(words)\n", 38 | " \n", 39 | " num_of_words_in_doc = len(words)\n", 40 | " counts = Counter(words)\n", 41 | " \n", 42 | " for word in words_set:\n", 43 | " num_of_word_in_doc = counts[word]\n", 44 | " tf = num_of_word_in_doc / num_of_words_in_doc\n", 45 | " print \"%s\\t%d\\t%f\" % (word, article_id, tf)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Writing reducer.py\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "%%writefile reducer.py\n", 63 | "from __future__ import division\n", 64 | "import sys\n", 65 | "from math import log\n", 66 | "\n", 67 | "current_word = None\n", 68 | "articles_dict = dict()\n", 69 | "\n", 70 | "for line in sys.stdin:\n", 71 | " try:\n", 72 | " word, article_id, tf = line.strip().split('\\t')\n", 73 | " article_id = int(article_id)\n", 74 | " tf = float(tf)\n", 75 | " except ValueError as e:\n", 76 | " continue\n", 77 | " \n", 78 | " if current_word != word:\n", 79 | " if current_word:\n", 80 | " idf = 1 / log(1 + len(articles_dict))\n", 81 | " for key_article_id, tf in articles_dict.iteritems():\n", 82 | " tfidf = tf * idf\n", 83 | " print \"%s\\t%d\\t%f\" % (current_word, key_article_id, tfidf)\n", 84 | " articles_dict = dict()\n", 85 | " current_word = word\n", 86 | " articles_dict[article_id] = tf\n", 87 | "\n", 88 | "if current_word:\n", 89 | " print \"%s\\t%d\\t%f\" % (current_word, article_id, tfidf)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "0.000351\n" 102 | ] 103 | }, 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "19/01/22 06:40:32 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 109 | "19/01/22 06:40:32 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 110 | "19/01/22 06:40:33 INFO mapred.FileInputFormat: Total input files to process : 1\n", 111 | "19/01/22 06:40:33 INFO mapreduce.JobSubmitter: number of splits:2\n", 112 | "19/01/22 06:40:33 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1548115547210_0003\n", 113 | "19/01/22 06:40:33 INFO impl.YarnClientImpl: Submitted application application_1548115547210_0003\n", 114 | "19/01/22 06:40:33 INFO mapreduce.Job: The url to track the job: http://982ca0bb2233:8088/proxy/application_1548115547210_0003/\n", 115 | "19/01/22 06:40:33 INFO 
mapreduce.Job: Running job: job_1548115547210_0003\n", 116 | "19/01/22 06:40:39 INFO mapreduce.Job: Job job_1548115547210_0003 running in uber mode : false\n", 117 | "19/01/22 06:40:39 INFO mapreduce.Job: map 0% reduce 0%\n", 118 | "19/01/22 06:40:55 INFO mapreduce.Job: map 66% reduce 0%\n", 119 | "19/01/22 06:40:58 INFO mapreduce.Job: map 83% reduce 0%\n", 120 | "19/01/22 06:40:59 INFO mapreduce.Job: map 100% reduce 0%\n", 121 | "19/01/22 06:41:06 INFO mapreduce.Job: map 100% reduce 13%\n", 122 | "19/01/22 06:41:07 INFO mapreduce.Job: map 100% reduce 25%\n", 123 | "19/01/22 06:41:08 INFO mapreduce.Job: map 100% reduce 38%\n", 124 | "19/01/22 06:41:09 INFO mapreduce.Job: map 100% reduce 50%\n", 125 | "19/01/22 06:41:10 INFO mapreduce.Job: map 100% reduce 63%\n", 126 | "19/01/22 06:41:11 INFO mapreduce.Job: map 100% reduce 75%\n", 127 | "19/01/22 06:41:12 INFO mapreduce.Job: map 100% reduce 88%\n", 128 | "19/01/22 06:41:13 INFO mapreduce.Job: map 100% reduce 100%\n", 129 | "19/01/22 06:41:13 INFO mapreduce.Job: Job job_1548115547210_0003 completed successfully\n", 130 | "19/01/22 06:41:13 INFO mapreduce.Job: Counters: 50\n", 131 | "\tFile System Counters\n", 132 | "\t\tFILE: Number of bytes read=83708136\n", 133 | "\t\tFILE: Number of bytes written=168815726\n", 134 | "\t\tFILE: Number of read operations=0\n", 135 | "\t\tFILE: Number of large read operations=0\n", 136 | "\t\tFILE: Number of write operations=0\n", 137 | "\t\tHDFS: Number of bytes read=76874501\n", 138 | "\t\tHDFS: Number of bytes written=76761713\n", 139 | "\t\tHDFS: Number of read operations=30\n", 140 | "\t\tHDFS: Number of large read operations=0\n", 141 | "\t\tHDFS: Number of write operations=16\n", 142 | "\tJob Counters \n", 143 | "\t\tKilled reduce tasks=1\n", 144 | "\t\tLaunched map tasks=2\n", 145 | "\t\tLaunched reduce tasks=8\n", 146 | "\t\tData-local map tasks=2\n", 147 | "\t\tTotal time spent by all maps in occupied slots (ms)=33473\n", 148 | "\t\tTotal time spent by all reduces in occupied slots (ms)=36919\n", 149 | "\t\tTotal time spent by all map tasks (ms)=33473\n", 150 | "\t\tTotal time spent by all reduce tasks (ms)=36919\n", 151 | "\t\tTotal vcore-milliseconds taken by all map tasks=33473\n", 152 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=36919\n", 153 | "\t\tTotal megabyte-milliseconds taken by all map tasks=34276352\n", 154 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=37805056\n", 155 | "\tMap-Reduce Framework\n", 156 | "\t\tMap input records=4100\n", 157 | "\t\tMap output records=3472743\n", 158 | "\t\tMap output bytes=76762592\n", 159 | "\t\tMap output materialized bytes=83708184\n", 160 | "\t\tInput split bytes=228\n", 161 | "\t\tCombine input records=0\n", 162 | "\t\tCombine output records=0\n", 163 | "\t\tReduce input groups=426865\n", 164 | "\t\tReduce shuffle bytes=83708184\n", 165 | "\t\tReduce input records=3472743\n", 166 | "\t\tReduce output records=3472685\n", 167 | "\t\tSpilled Records=6945486\n", 168 | "\t\tShuffled Maps =16\n", 169 | "\t\tFailed Shuffles=0\n", 170 | "\t\tMerged Map outputs=16\n", 171 | "\t\tGC time elapsed (ms)=792\n", 172 | "\t\tCPU time spent (ms)=51470\n", 173 | "\t\tPhysical memory (bytes) snapshot=2249990144\n", 174 | "\t\tVirtual memory (bytes) snapshot=20222672896\n", 175 | "\t\tTotal committed heap usage (bytes)=1554513920\n", 176 | "\tShuffle Errors\n", 177 | "\t\tBAD_ID=0\n", 178 | "\t\tCONNECTION=0\n", 179 | "\t\tIO_ERROR=0\n", 180 | "\t\tWRONG_LENGTH=0\n", 181 | "\t\tWRONG_MAP=0\n", 182 | "\t\tWRONG_REDUCE=0\n", 183 | "\tFile 
Input Format Counters \n", 184 | "\t\tBytes Read=76874273\n", 185 | "\tFile Output Format Counters \n", 186 | "\t\tBytes Written=76761713\n", 187 | "19/01/22 06:41:13 INFO streaming.StreamJob: Output directory: Tf-Idf_assignment\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "%%bash\n", 193 | "\n", 194 | "OUT_DIR=\"Tf-Idf_assignment\"\n", 195 | "NUM_REDUCERS=8\n", 196 | "\n", 197 | "hdfs dfs -rm -r -skipTrash ${OUT_DIR} > /dev/null\n", 198 | "\n", 199 | "yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \\\n", 200 | " -D mapred.jab.name=\"Streaming tf_idf\" \\\n", 201 | " -D mapreduce.job.reduces=${NUM_REDUCERS} \\\n", 202 | " -files mapper.py,reducer.py,/datasets/stop_words_en.txt \\\n", 203 | " -mapper \"python mapper.py\" \\\n", 204 | " -reducer \"python reducer.py\" \\\n", 205 | " -input /data/wiki/en_articles_part \\\n", 206 | " -output ${OUT_DIR} > /dev/null\n", 207 | "\n", 208 | "hdfs dfs -cat ${OUT_DIR}/* | grep -P 'labor\\t12\\t' | cut -f3" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 2", 222 | "language": "python", 223 | "name": "python2" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 2 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython2", 235 | "version": "2.7.12" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 2 240 | } 241 | -------------------------------------------------------------------------------- /Big-Data-Essentials/data_dictionary_trip_records_yellow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/81170dbd126b0626b7f34f2b308f52a9a63fc45e/Big-Data-Essentials/data_dictionary_trip_records_yellow.pdf -------------------------------------------------------------------------------- /Hadoop Installation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "'Hadoop Installation.ipynb' LICENSE\r\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# use '!' to access shell commands in the notebook\n", 18 | "!ls" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Installing Prerequisites" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "Start with installing prerequisites:\n", 33 | "\n", 34 | "### JAVA-JDK" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "[sudo] password for incursio: \n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "!sudo apt-get install default-java" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "In jupyter notebook, ``sudo`` command will then require PASS_KEY but you won't be able to ENTER your password. Therefore, below two are methods to input password to run ``sudo`` command." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "#### use echo to give your PASS_KEY as an input" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 34, 71 | "metadata": { 72 | "scrolled": true 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading package lists... Done \n", 80 | "Building dependency tree \n", 81 | "Reading state information... Done\n", 82 | "default-jdk is already the newest version (2:1.10-63ubuntu1~02).\n", 83 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "!echo 'XXXXXXXX' | sudo -S apt-get install default-jdk" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "#### save you password in a file and use it to provide PASS_KEY" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 35, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Reading package lists... Done \n", 108 | "Building dependency tree \n", 109 | "Reading state information... Done\n", 110 | "default-jdk is already the newest version (2:1.10-63ubuntu1~02).\n", 111 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "!sudo -S apt-get install default-jdk < /path/to/file" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "### Add a Dedicated hadoop user\n", 124 | "\n", 125 | "It'll create a new user to install/run hadoop keeping it separated from other user accounts." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 53, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "[sudo] password for incursio: Sorry, try again.\n", 140 | "[sudo] password for incursio: \n", 141 | "sudo: 1 incorrect password attempt\n", 142 | "[sudo] password for incursio: Sorry, try again.\n", 143 | "[sudo] password for incursio: \n", 144 | "sudo: 1 incorrect password attempt\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "#!sudo -S addgroup hadoop < /path/to/file\n", 150 | "#!sudo adduser --ingroup hadoop hduser < path/to/file\n", 151 | "\n", 152 | "PASS_KEY = 'XXXXXXX'\n", 153 | "!echo $PASS_KEY | sudo -S addgroup hadoop\n", 154 | "!echo $PASS_KEY | sudo -S adduser --ingroup hadoop hduser " 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Configuring SSH" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 48, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "su: must be run from a terminal\r\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "!echo PASS_KEY | su - hduser" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Now, to execute following commands you will have to open the terminal.\n", 186 | "\n", 187 | "**Copy the following cell(s)**\n", 188 | "\n", 189 | "The below command will transfer the terminal access to newly created ``hduser``" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "su - hduser" 199 | ] 200 | }, 201 | { 202 | 
"cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "Hadoop requires an SSH access to manage nodes present all over the cluster. This command will generate an SSH key with empty(string) password.\n", 206 | "In general, it's not recommended to use empty(string) password, but since we don't want to enter the passphrase each time Hadoop connects to its nodes therefore, **leave it empty**." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "ssh-keygen -t rsa -P \"\"" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "The command creates a new file and appends generated key to it. " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "cat .ssh/id_rsa.pub >> .ssh/authorized_keys" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Now we'll need root access through ``hduser``, thus we'll add ``hduser`` to the list of sudoers." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "#open nano to edit /etc/sudoers.tmp\n", 248 | "sudo visudo\n", 249 | "\n", 250 | "#and append the following at the EOF\n", 251 | "hduser ALL=(ALL:ALL) ALL" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "We want to disable IPv6." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "#open system call configuration file\n", 268 | "sudo gedit /etc/sysctl.conf\n", 269 | "\n", 270 | "#and append the following at the EOF\n", 271 | "#disable ipv6 \n", 272 | "net.ipv6.conf.all.disable_ipv6 = 1 \n", 273 | "net.ipv6.conf.default.disable_ipv6 = 1 \n", 274 | "net.ipv6.conf.lo.disable_ipv6 = 1" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "Now, **reboot** system. On boot, check whether the ipv6 has been disabled." 
282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "#it should return 1\n", 291 | "cat /proc/sys/net/ipv6/conf/all/disable_ipv6" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Install hadoop" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Run the following cell(s) in the terminal with access to ``hduser``" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Download Hadoop" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#change directory\n", 322 | "cd /usr/local\n", 323 | "\n", 324 | "#download hadoop 3.1 in this directory\n", 325 | "#to download other/newer version check the link http://www-eu.apache.org/dist/hadoop/core/\n", 326 | "wget http://www-eu.apache.org/dist/hadoop/core/hadoop-3.1.0/hadoop-3.1.0.tar.gz\n", 327 | "\n", 328 | "#extract the tar file\n", 329 | "sudo tar xzf hadoop-3.1.0.tar.gz\n", 330 | "\n", 331 | "#rename it to hadoop\n", 332 | "sudo mv hadoop-3.1.0 hadoop\n", 333 | "\n", 334 | "#change the owner of files to hduser\n", 335 | "sudo chown -R hduser:hadoop hadoop" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Set environment variables" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "Set hadoop and java home environment variables" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "#Open the editor\n", 359 | "sudo gedit ~/.bashrc\n", 360 | "\n", 361 | "#and append the following lines at the EOF\n", 362 | "\n", 363 | "export HADOOP_HOME=/usr/local/hadoop\n", 364 | "export JAVA_HOME=/usr/lib/jvm/default-java\n", 365 | "\n", 366 | "# Some convenient aliases and functions for running Hadoop-related commands \n", 367 | "unalias fs &> /dev/null \n", 368 | "alias fs=\"hadoop fs\" \n", 369 | "unalias hls &> /dev/null \n", 370 | "alias hls=\"fs -ls\" \n", 371 | "\n", 372 | "# Add Hadoop bin/ directory to PATH \n", 373 | "export PATH=$PATH:$HADOOP_HOME/bin\n", 374 | "# Add Hadoop sbin/ directory to PATH \n", 375 | "export PATH=$PATH:$HADOOP_HOME/sbin" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "metadata": {}, 381 | "source": [ 382 | "Now edit the ``hadoop-env.sh`` and update JAVA_HOME" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "#You know the drill\n", 392 | "sudo gedit $HADOOP_HOME/etc/hadoop/hadoop-env.sh\n", 393 | "\n", 394 | "#update JAVA_HOME (don't append, instead search for likewise line of code, it might be in the comments!)\n", 395 | "export JAVA_HOME=/usr/lib/jvm/default-java\n", 396 | "\n", 397 | "#you can also update HADOOP_HOME (not necessary)\n", 398 | "export HADOOP_HOME=/usr/local/hadoop" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": {}, 404 | "source": [ 405 | "## Start Hadoop Cluster" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "### Standalone Mode" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "In this mode, hadoop will 
be set to run in a non-distributed mode, as a single Java process. Using this mode we can check whether the installation works correctly." 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "#create a directory to store input files\n", 429 | "mkdir $HADOOP_HOME/input\n", 430 | "\n", 431 | "#now, to verify that the installation is error-free, we will run a sample job using the examples jar file\n", 432 | "#copy all xml files to the newly created directory\n", 433 | "cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input\n", 434 | "\n", 435 | "#1st argument is the /path/to/hadoop command (required to run MapReduce)\n", 436 | "#2nd argument is jar, specifying that the MapReduce program is packaged in a Java archive\n", 437 | "#3rd argument is the bundled MapReduce examples jar; its name is version specific (check your file/version)\n", 438 | "#4th argument is grep, to run the regular expression example\n", 439 | "#5th argument is the input directory, containing all the .xml files\n", 440 | "#6th argument is the output directory, which will be created and will contain the output files\n", 441 | "#7th argument is 'dfs[a-z.]+', the regular expression to search for\n", 442 | "$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.0.jar grep input output 'dfs[a-z.]+'" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "### Pseudo Distributed Mode" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "In this mode, Hadoop runs on a single node in a pseudo-distributed fashion, where each Hadoop daemon runs as a separate Java process." 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "#### Configuring site xml(s)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "Create the ``tmp`` directory" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "sudo mkdir -p /app/hadoop/tmp\n", 480 | "sudo chown hduser:hadoop /app/hadoop/tmp" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Now edit ``core-site.xml`` and ``hdfs-site.xml``. You'll find these files in the **$HADOOP_HOME/etc/hadoop** directory.\n", 488 | "\n", 489 | "Start with **core-site.xml**" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "sudo gedit $HADOOP_HOME/etc/hadoop/core-site.xml\n", 499 | "\n", 500 | "#paste these lines between the <configuration> </configuration> tags\n", 501 | "<property>\n", 502 | " <name>hadoop.tmp.dir</name>\n", 503 | " <value>/app/hadoop/tmp</value>\n", 504 | " <description>A base for other temporary directories.</description>\n", 505 | "</property>\n", 506 | "\n", 507 | "<property>\n", 508 | " <name>fs.defaultFS</name>\n", 509 | " <value>hdfs://localhost:9000</value>\n", 510 | " <description>The name of the default file system. A URI whose\n", 511 | " scheme and authority determine the FileSystem implementation. The\n", 512 | " uri's scheme determines the config property (fs.SCHEME.impl) naming\n", 513 | " the FileSystem implementation class. The uri's authority is used to\n", 514 | " determine the host, port, etc. 
for a filesystem.</description>\n", 515 | "</property>" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "**hdfs-site.xml**" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "sudo gedit $HADOOP_HOME/etc/hadoop/hdfs-site.xml\n", 532 | "\n", 533 | "#paste these lines between the <configuration> </configuration> tags\n", 534 | "<property>\n", 535 | " <name>dfs.replication</name>\n", 536 | " <value>1</value>\n", 537 | " <description>Default block replication.\n", 538 | " The actual number of replications can be specified when the file is created.\n", 539 | " The default is used if replication is not specified at create time.\n", 540 | " </description>\n", 541 | "</property>" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "Format the namenode (you only need to do this the first time you set up the Hadoop cluster, i.e., at installation time)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "#On running this command, you'll get output ending with a SHUTDOWN_MSG.\n", 558 | "#Don't worry, it's not an error\n", 559 | "$HADOOP_HOME/bin/hdfs namenode -format" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "Now it's time to start the HADOOP CLUSTER!!!" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "#start the namenode and datanode daemons\n", 576 | "$HADOOP_HOME/sbin/start-dfs.sh" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "Yeah, it's done!" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "You can check the running cluster daemons" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "#run this command; it should list NameNode, DataNode and SecondaryNameNode\n", 600 | "jps\n", 601 | "\n", 602 | "#you can also browse the namenode web interface at this link\n", 603 | "#http://localhost:9870/" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "Now create the HDFS directories which we'll need to execute MapReduce jobs." 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "#create your HDFS home directory (assuming user hduser; adjust to your username)\n$HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/hduser" 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "To stop the cluster daemons" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "$HADOOP_HOME/sbin/stop-dfs.sh" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [ 642 | "#### YARN on a single node" 643 | ] 644 | }, 645 | { 646 | "cell_type": "markdown", 647 | "metadata": {}, 648 | "source": [ 649 | "Open ``.bashrc`` again and append the additional environment variables." 
650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "metadata": {}, 656 | "outputs": [], 657 | "source": [ 658 | "sudo gedit ~/.bashrc\n", 659 | "\n", 660 | "#and append the following\n", 661 | "export HADOOP_MAPRED_HOME=${HADOOP_HOME}\n", 662 | "export HADOOP_COMMON_HOME=${HADOOP_HOME}\n", 663 | "export HADOOP_HDFS_HOME=${HADOOP_HOME}\n", 664 | "export HADOOP_YARN_HOME=${HADOOP_HOME}" 665 | ] 666 | }, 667 | { 668 | "cell_type": "markdown", 669 | "metadata": {}, 670 | "source": [ 671 | "**mapred-site.xml**" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "metadata": {}, 678 | "outputs": [], 679 | "source": [ 680 | "sudo gedit $HADOOP_HOME/etc/hadoop/mapred-site.xml\n", 681 | "\n", 682 | "#paste these lines between the <configuration> </configuration> tags\n", 683 | "<property>\n", 684 | " <name>mapred.job.tracker</name>\n", 685 | " <value>localhost:54311</value>\n", 686 | " <description>The host and port that the MapReduce job tracker runs\n", 687 | " at. If \"local\", then jobs are run in-process as a single map\n", 688 | " and reduce task.\n", 689 | " </description>\n", 690 | "</property>" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "**yarn-site.xml**" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "sudo gedit $HADOOP_HOME/etc/hadoop/yarn-site.xml\n", 707 | "\n", 708 | "#paste these lines between the <configuration> </configuration> tags\n", 709 | "<property>\n", 710 | " <name>yarn.nodemanager.aux-services</name>\n", 711 | " <value>mapreduce_shuffle</value>\n", 712 | "</property>\n", 713 | "<property>\n", 714 | " <name>yarn.nodemanager.env-whitelist</name>\n", 715 | " <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>\n", 716 | "</property>" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "**To check that the distributed filesystem is working properly**" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "#Type the following commands and check their results\n", 733 | "#list the files and directories in your distributed filesystem\n", 734 | "hdfs dfs -ls\n", 735 | "\n", 736 | "#now create your home directory (relative paths are resolved against it)\n", 737 | "hdfs dfs -mkdir /user\n", 738 | "hdfs dfs -mkdir /user/<username>\n", 739 | "\n", 740 | "#relative and absolute paths\n", 741 | "hdfs dfs -mkdir /cluster #----> This directory will be created at the HDFS root, i.e., next to /user!\n", 742 | "\n", 743 | "hdfs dfs -mkdir cluster #----> This one will be created inside /user/<username>\n", 744 | "\n", 745 | "#You can view the added directories in the WebUI too\n", 746 | "#browse localhost:9870\n", 747 | "\n", 748 | "#and check the option Utilities -> Browse filesystem\n", 749 | "#if it displays a webhdfs Server Error [this error appears for Java versions >= 9],\n", 750 | "#open hadoop-env.sh and add the following line\n", 751 | "export HADOOP_OPTS=\"--add-modules java.activation\"" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "**I hope your Hadoop distributed file system is working fine!**\n", 759 | "\n", 760 | "**In case of trouble or a mistake in the code, notify me!**\n", 761 | "\n", 762 | "Now we can move on and learn the basic commands and their usage in the Hadoop filesystem. 
Tutorial is available [HERE](https://github.com/rajatgarg149/BigData-Essentials-HDFS-SPARK-RDD/blob/master/Hadoop%20Tutorial.ipynb)" 763 | ] 764 | } 765 | ], 766 | "metadata": { 767 | "kernelspec": { 768 | "display_name": "Python 3", 769 | "language": "python", 770 | "name": "python3" 771 | }, 772 | "language_info": { 773 | "codemirror_mode": { 774 | "name": "ipython", 775 | "version": 3 776 | }, 777 | "file_extension": ".py", 778 | "mimetype": "text/x-python", 779 | "name": "python", 780 | "nbconvert_exporter": "python", 781 | "pygments_lexer": "ipython3", 782 | "version": "3.6.6" 783 | }, 784 | "toc": { 785 | "base_numbering": 1, 786 | "nav_menu": {}, 787 | "number_sections": true, 788 | "sideBar": true, 789 | "skip_h1_title": false, 790 | "title_cell": "Table of Contents", 791 | "title_sidebar": "Contents", 792 | "toc_cell": false, 793 | "toc_position": { 794 | "height": "calc(100% - 180px)", 795 | "left": "10px", 796 | "top": "150px", 797 | "width": "165px" 798 | }, 799 | "toc_section_display": true, 800 | "toc_window_display": true 801 | } 802 | }, 803 | "nbformat": 4, 804 | "nbformat_minor": 2 805 | } 806 | -------------------------------------------------------------------------------- /Hadoop Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.6.1" 28 | }, 29 | "toc": { 30 | "base_numbering": 1, 31 | "nav_menu": {}, 32 | "number_sections": true, 33 | "sideBar": true, 34 | "skip_h1_title": false, 35 | "title_cell": "Table of Contents", 36 | "title_sidebar": "Contents", 37 | "toc_cell": false, 38 | "toc_position": {}, 39 | "toc_section_display": true, 40 | "toc_window_display": false 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 2 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Rajat Garg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigData Essentials : HDFS-SPARK-RDD 2 | 3 | - [Distributed File Systems]() 4 | 5 | - [Solving Problems with MapReduce]() 6 | 7 | - [Solving Problems with MapReduce - Practice](Big-Data-Essentials/Week3%20-%20Solving%20Problems%20with%20MapReduce%20-%20Practice) 8 | 9 | - [Introduction to Apache Spark]() 10 | 11 | - [Introduction to Apache Spark - Practice](Big-Data-Essentials/Week5%20-%20Introduction%20to%20Apache%20Spark%20-%20Practice) 12 | 13 | - [Real-World Applications]() 14 | --------------------------------------------------------------------------------