├── .gitattributes ├── .gitignore ├── Chapter04 ├── Hive_Exercise.txt └── WordcountExercise_Code.txt ├── Chapter05 ├── 01_download_data.txt ├── 02_saving_data.txt ├── 03_r_shiny_web_application.txt ├── MongoDBLog.txt ├── cms.q ├── installing_kdb.txt ├── installing_r_packages.txt └── packt.css ├── Chapter06 ├── .ipynb_checkpoints │ └── Packt_Notebook-checkpoint.ipynb ├── Packt_Notebook.dbc ├── Packt_Notebook.ipynb ├── Packt_Notebook.py └── old │ ├── Packt_Notebook.dbc │ ├── Packt_Notebook.ipynb │ └── Packt_Notebook.py ├── Chapter07 ├── .Rhistory ├── chapter7_R_code.txt ├── tutorial.R ├── world_gdp.csv ├── world_gdp_per_capita.csv ├── world_life_expectancy.csv └── world_population.csv ├── Chapter08 ├── Chapter8.R ├── Regularisation.xlsx ├── history.csv └── rulespackt │ ├── app.R │ ├── cms_factor_dt.rds │ ├── cms_rules.rds │ ├── cms_rules_dt.rds │ └── www │ ├── fonts.css │ ├── fonts │ └── rubik │ │ ├── rubik-v7-latin-300.eot │ │ ├── rubik-v7-latin-300.svg │ │ ├── rubik-v7-latin-300.ttf │ │ ├── rubik-v7-latin-300.woff │ │ ├── rubik-v7-latin-300.woff2 │ │ ├── rubik-v7-latin-300italic.eot │ │ ├── rubik-v7-latin-300italic.svg │ │ ├── rubik-v7-latin-300italic.ttf │ │ ├── rubik-v7-latin-300italic.woff │ │ ├── rubik-v7-latin-300italic.woff2 │ │ ├── rubik-v7-latin-500.eot │ │ ├── rubik-v7-latin-500.svg │ │ ├── rubik-v7-latin-500.ttf │ │ ├── rubik-v7-latin-500.woff │ │ ├── rubik-v7-latin-500.woff2 │ │ ├── rubik-v7-latin-500italic.eot │ │ ├── rubik-v7-latin-500italic.svg │ │ ├── rubik-v7-latin-500italic.ttf │ │ ├── rubik-v7-latin-500italic.woff │ │ ├── rubik-v7-latin-500italic.woff2 │ │ ├── rubik-v7-latin-700.eot │ │ ├── rubik-v7-latin-700.svg │ │ ├── rubik-v7-latin-700.ttf │ │ ├── rubik-v7-latin-700.woff │ │ ├── rubik-v7-latin-700.woff2 │ │ ├── rubik-v7-latin-700italic.eot │ │ ├── rubik-v7-latin-700italic.svg │ │ ├── rubik-v7-latin-700italic.ttf │ │ ├── rubik-v7-latin-700italic.woff │ │ ├── rubik-v7-latin-700italic.woff2 │ │ ├── rubik-v7-latin-900.eot │ │ ├── rubik-v7-latin-900.svg │ │ ├── rubik-v7-latin-900.ttf │ │ ├── rubik-v7-latin-900.woff │ │ ├── rubik-v7-latin-900.woff2 │ │ ├── rubik-v7-latin-900italic.eot │ │ ├── rubik-v7-latin-900italic.svg │ │ ├── rubik-v7-latin-900italic.ttf │ │ ├── rubik-v7-latin-900italic.woff │ │ ├── rubik-v7-latin-900italic.woff2 │ │ ├── rubik-v7-latin-italic.eot │ │ ├── rubik-v7-latin-italic.svg │ │ ├── rubik-v7-latin-italic.ttf │ │ ├── rubik-v7-latin-italic.woff │ │ ├── rubik-v7-latin-italic.woff2 │ │ ├── rubik-v7-latin-regular.eot │ │ ├── rubik-v7-latin-regular.svg │ │ ├── rubik-v7-latin-regular.ttf │ │ ├── rubik-v7-latin-regular.woff │ │ └── rubik-v7-latin-regular.woff2 │ └── packt2.css ├── LICENSE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Windows image file caches 2 | Thumbs.db 3 | ehthumbs.db 4 | 5 | # Folder config file 6 | Desktop.ini 7 | 8 | # Recycle Bin used on file shares 9 | 
$RECYCLE.BIN/ 10 | 11 | # Windows Installer files 12 | *.cab 13 | *.msi 14 | *.msm 15 | *.msp 16 | 17 | # Windows shortcuts 18 | *.lnk 19 | 20 | # ========================= 21 | # Operating System Files 22 | # ========================= 23 | 24 | # OSX 25 | # ========================= 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | 31 | # Thumbnails 32 | ._* 33 | 34 | # Files that might appear in the root of a volume 35 | .DocumentRevisions-V100 36 | .fseventsd 37 | .Spotlight-V100 38 | .TemporaryItems 39 | .Trashes 40 | .VolumeIcon.icns 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | -------------------------------------------------------------------------------- /Chapter04/Hive_Exercise.txt: -------------------------------------------------------------------------------- 1 | 2 | # Download the oil import prices csv file cd /home/cloudera; wget -O oil.csv "https://stats.oecd.org/sdmx-json/data/DP_LIVE/.OILIMPPRICE.../OECD?contentType=csv&detail=code&separator=comma&csv-lang=en" # Cleanse the CSV File 3 | 4 | # Remove all quotation marks 5 | [cloudera@quickstart ~]$ sed -i 's/\"//g' oil.csv # Remove all non-printable characters (Source: http://alvinalexander.com/blog/post/linux-unix/how-remove-non-printable-ascii-characters-file-unix) [cloudera@quickstart ~]$ tr -cd '\11\12\15\40-\176' < oil.csv > oil_clean.csv # Rename oil_clean.csv back to oil.csv 6 | [cloudera@quickstart ~]$ mv oil_clean.csv oil.csv mv: overwrite `oil.csv'? yes # Hive commands to create the table and load data 7 | 8 | CREATE TABLE IF NOT EXISTS OIL (location String, indicator String, subject String, measure String, frequency String, time String, value Float, flagCode String) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE tblproperties("skip.header.line.count"="1"); LOAD DATA LOCAL INPATH '/home/cloudera/oil.csv' INTO TABLE OIL; SELECT * FROM OIL; /* Queries */ 9 | SELECT LOCATION, MIN(value) as MINPRICE, AVG(value) as AVGPRICE, MAX(value) as MAXPRICE FROM OIL WHERE FREQUENCY LIKE "A" GROUP BY LOCATION; # Hive Join Exercise 10 | 11 | # ENTER THE FOLLOWING IN THE UNIX TERMINAL # DOWNLOAD LATITUDE-LONGITUDE CSV FILE cd /home/cloudera; wget -O latlong.csv "https://gist.githubusercontent.com/tadast/8827699/raw/7255fdfbf292c592b75cf5f7a19c16ea59735f74/countries_codes_and_coordinates.csv" # REMOVE QUOTATION MARKS sed -i 's/\"//g' latlong.csv # Hive commands to create the table and load data 12 | CREATE TABLE IF NOT EXISTS LATLONG (country String, alpha2 String, alpha3 String, numCode Int, latitude Float, longitude Float) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS TEXTFILE TBLPROPERTIES("skip.header.line.count"="1"); LOAD DATA LOCAL INPATH '/home/cloudera/latlong.csv' INTO TABLE LATLONG; SELECT DISTINCT * FROM (SELECT location, avg(value) as AVGPRICE from oil GROUP BY location) x LEFT JOIN (SELECT TRIM(ALPHA3) AS alpha3, latitude, longitude from LATLONG) y ON (x.location = y.alpha3); -------------------------------------------------------------------------------- /Chapter04/WordcountExercise_Code.txt: -------------------------------------------------------------------------------- 1 | 2 | # WORDCOUNT EXERCISE 3 | 4 | Step 1: Create getCyrusFiles.sh – This script will be used to retrieve the data from the web # Make directory named cyrus [cloudera@quickstart ~]$ mkdir cyrus # Create a file named getCyrusFiles.sh
[cloudera@quickstart ~]$ vi getCyrusFiles.sh # Use vi or nano to enter the following text for i in `seq 10` do curl www.artamene.org/documents/cyrus$i.txt -o cyrus$i.txt done Step 2: Create processCyrusFiles.sh – This script will be used to concatenate and cleanse the files that were downloaded in the prior step [cloudera@quickstart ~]$ vi processCyrusFiles.sh # Use vi or nano to enter the following text cd ~/cyrus; for i in `ls cyrus*.txt`; do cat $i >> cyrusorig.txt; done cat cyrusorig.txt | tr -dc '[:print:]' | tr A-Z a-z > cyrusprint.txt Step 3: Change the permissions to 755 to make the .sh files executable at the command prompt [cloudera@quickstart ~]$ chmod 755 getCyrusFiles.sh [cloudera@quickstart ~]$ chmod 755 processCyrusFiles.sh Step 4: Execute getCyrusFiles.sh [cloudera@quickstart cyrus]$ ./getCyrusFiles.sh 5 | 6 | # The output from Step 4 7 | % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 908k 100 908k 0 0 372k 0 0:00:02 0:00:02 --:--:-- 421k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1125k 100 1125k 0 0 414k 0 0:00:02 0:00:02 --:--:-- 471k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1084k 100 1084k 0 0 186k 0 0:00:05 0:00:05 --:--:-- 236k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1048k 100 1048k 0 0 267k 0 0:00:03 0:00:03 --:--:-- 291k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1116k 100 1116k 0 0 351k 0 0:00:03 0:00:03 --:--:-- 489k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1213k 100 1213k 0 0 440k 0 0:00:02 0:00:02 --:--:-- 488k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1119k 100 1119k 0 0 370k 0 0:00:03 0:00:03 --:--:-- 407k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1132k 100 1132k 0 0 190k 0 0:00:05 0:00:05 --:--:-- 249k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1084k 100 1084k 0 0 325k 0 0:00:03 0:00:03 --:--:-- 365k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1259k 100 1259k 0 0 445k 0 0:00:02 0:00:02 --:--:-- 486k # Check the files in the current directory, cyrus [cloudera@quickstart cyrus]$ ls cyrus10.txt cyrus3.txt cyrus6.txt cyrus9.txt cyrus1.txt cyrus4.txt cyrus7.txt getCyrusFiles.sh cyrus2.txt cyrus5.txt cyrus8.txt processCyrusFiles.sh Step 5: Execute processCyrusFiles.sh [cloudera@quickstart cyrus]$ ./processCyrusFiles.sh # Check the files in the current directory, cyrus [cloudera@quickstart cyrus]$ ls cyrus10.txt cyrus3.txt cyrus6.txt cyrus9.txt getCyrusFiles.sh cyrus1.txt cyrus4.txt cyrus7.txt cyrusorig.txt processCyrusFiles.sh cyrus2.txt cyrus5.txt cyrus8.txt cyrusprint.txt # Check for cyrusprint.txt - this is the file we will use for Wordcount [cloudera@quickstart cyrus]$ ls -altrh cyrusprint.txt -rw-rw-r-- 1 cloudera cloudera 11M Jun 28 20:02 cyrusprint.txt # Check for the number of words in cyrusprint.txt (1,953,931) [cloudera@quickstart cyrus]$ wc -w cyrusprint.txt 1953931 cyrusprint.txt Step 6: Execute the following steps to copy the final file, named cyrusprint.txt to HDFS, create the mapper.py and reducer.py scripts. 
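(Optional aside, not part of the original exercise: Hadoop Streaming simply pipes text through the mapper and reducer via stdin and stdout, so once the two scripts shown below have been created and made executable with chmod, they can be smoke-tested locally on a slice of the input before the cluster job is submitted. The sort command stands in for the shuffle phase; the 100000-character sample size and the head -10 are arbitrary and purely illustrative.)
# Local sanity check of the streaming scripts (hypothetical command, run after the chmod step below)
[cloudera@quickstart cyrus]$ head -c 100000 cyrusprint.txt | ./mapper.py | sort -k1,1 | ./reducer.py | head -10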
The files, mapper.py and reducer.py were referenced from Glenn Klockwood’s website (http://www.glennklockwood.com/data-intensive/hadoop/streaming.html), which provides a wealth of information on MapReduce and related topics in general. # Check the contents of the HDFS directory /user/cloudera [cloudera@quickstart cyrus]$ hdfs dfs -ls /user/cloudera 8 | 9 | # Create /user/cloudera/input in HDFS [cloudera@quickstart cyrus]$ hdfs dfs -mkdir /user/cloudera/input 10 | 11 | # Copy cyrusprint.txt to /user/cloudera/input directory [cloudera@quickstart cyrus]$ hdfs dfs -put cyrusprint.txt /user/cloudera/input/ [cloudera@quickstart cyrus]$ vi mapper.py # Use vi or nano to enter the following text and save in a file named mapper.py #!/usr/bin/env python #the above just indicates to use python to interpret this file #This mapper code will read lines of text from stdin and output tab-separated (word, 1) pairs # import sys sys.path.append('.') for line in sys.stdin: line = line.strip() keys = line.split() for key in keys: value = 1 print ("%s\t%d" % (key,value)) [cloudera@quickstart cyrus]$ vi reducer.py # Use vi or nano to enter the following text and save in a file named reducer.py #!/usr/bin/env python #This reducer code sums the counts for each word emitted by the mapper import sys sys.path.append('.') last_key = None running_total = 0 for input_line in sys.stdin: input_line = input_line.strip() this_key, value = input_line.split("\t", 1) value = int(value) if last_key == this_key: running_total += value else: if last_key: print("%s\t%d" % (last_key, running_total)) running_total = value last_key = this_key if last_key == this_key: print( "%s\t%d" % (last_key, running_total) ) # Change permissions to make the files executable 12 |
# The command to start the MapReduce Job is as follows: hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -input /user/cloudera/input -output /user/cloudera/output -mapper /home/cloudera/cyrus/mapper.py -reducer /home/cloudera/cyrus/reducer.py 13 | [cloudera@quickstart cyrus]$ hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar -input /user/cloudera/input -output /user/cloudera/output -mapper /home/cloudera/cyrus/mapper.py -reducer /home/cloudera/cyrus/reducer.py packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.6.0-cdh5.10.0.jar] /tmp/streamjob1786353270976133464.jar tmpDir=null 17/06/28 20:11:21 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 17/06/28 20:11:21 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 17/06/28 20:11:22 INFO mapred.FileInputFormat: Total input paths to process : 1 17/06/28 20:11:22 INFO mapreduce.JobSubmitter: number of splits:2 17/06/28 20:11:23 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1498704103152_0002 17/06/28 20:11:23 INFO impl.YarnClientImpl: Submitted application application_1498704103152_0002 17/06/28 20:11:23 INFO mapreduce.Job: The url to track the job: http://quickstart.cloudera:8088/proxy/application_1498704103152_0002/ 17/06/28 20:11:23 INFO mapreduce.Job: Running job: job_1498704103152_0002 17/06/28 20:11:30 INFO mapreduce.Job: Job job_1498704103152_0002 running in uber mode : false 17/06/28 20:11:30 INFO mapreduce.Job: map 0% reduce 0% 17/06/28 20:11:41 INFO mapreduce.Job: map 50% reduce 0% 17/06/28 20:11:54 INFO mapreduce.Job: map 83% reduce 0% 17/06/28 20:11:57 INFO mapreduce.Job: map 100% reduce 0% 17/06/28 20:12:04 INFO mapreduce.Job: map 100% reduce 100% 17/06/28 20:12:04 INFO mapreduce.Job: Job job_1498704103152_0002 completed successfully 17/06/28 20:12:04 INFO mapreduce.Job: Counters: 50 File System Counters FILE: Number of bytes read=18869506 FILE: Number of bytes written=38108830 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=16633042 HDFS: Number of bytes written=547815 HDFS: Number of read operations=9 HDFS: Number of large read operations=0 HDFS: Number of write operations=2 Job Counters Killed map tasks=1 Launched map tasks=3 Launched reduce tasks=1 Data-local map tasks=3 Total time spent by all maps in occupied slots (ms)=39591 Total time spent by all reduces in occupied slots (ms)=18844 Total time spent by all map tasks (ms)=39591 Total time spent by all reduce tasks (ms)=18844 Total vcore-seconds taken by all map tasks=39591 Total vcore-seconds taken by all reduce tasks=18844 Total megabyte-seconds taken by all map tasks=40541184 Total megabyte-seconds taken by all reduce tasks=19296256 Map-Reduce Framework Map input records=1 Map output records=1953931 Map output bytes=14961638 Map output materialized bytes=18869512 Input split bytes=236 Combine input records=0 Combine output records=0 Reduce input groups=45962 Reduce shuffle bytes=18869512 Reduce input records=1953931 Reduce output records=45962 Spilled Records=3907862 Shuffled Maps =2 Failed Shuffles=0 Merged Map outputs=2 GC time elapsed (ms)=352 CPU time spent (ms)=8400 Physical memory (bytes) snapshot=602038272 Virtual memory (bytes) snapshot=4512694272 Total committed heap usage (bytes)=391979008 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=16632806 File Output Format Counters Bytes Written=547815 17/06/28 20:12:04 INFO 
streaming.StreamJob: Output directory: /user/cloudera/output Step 8: The results are stored in HDFS under the /user/cloudera/output directory in files with a part- prefix # Check for the results file that will be available under /user/cloudera/output directory [cloudera@quickstart cyrus]$ hdfs dfs -ls /user/cloudera/output Found 2 items -rw-r--r-- 1 cloudera cloudera 0 2017-06-28 20:12 /user/cloudera/output/_SUCCESS -rw-r--r-- 1 cloudera cloudera 547815 2017-06-28 20:12 /user/cloudera/output/part-00000 Step 9: To view the contents of the file, use hdfs dfs -cat and provide the name of the file. In this case we are viewing the first 10 lines of the output. [cloudera@quickstart cyrus]$ hdfs dfs -cat /user/cloudera/output/part-00000 | head -10 ! 1206 !) 1 !quoy, 1 ' 3 '' 1 '. 1 'a 32 'appelloit 1 'auoit 1 'auroit 10 -------------------------------------------------------------------------------- /Chapter05/01_download_data.txt: -------------------------------------------------------------------------------- 1 | # Replace YOURAPPTOKEN and 12000000 with your API key and desired record limit, respectively cd /home/packt; time wget -O cms2016.csv 'https://openpaymentsdata.cms.gov/resource/vq63-hu5i.csv?$app_token=YOURAPPTOKEN&$query=select Physician_First_Name as firstName,Physician_Last_Name as lastName,Recipient_City as city,Recipient_State as state,Submitting_Applicable_Manufacturer_or_Applicable_GPO_Name as company,Total_Amount_of_Payment_USDollars as payment,Date_of_Payment as date,Nature_of_Payment_or_Transfer_of_Value as paymentNature,Product_Category_or_Therapeutic_Area_1 as category,Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1 as product where covered_recipient_type like "Covered Recipient Physician" limit 12000000' -------------------------------------------------------------------------------- /Chapter05/02_saving_data.txt: -------------------------------------------------------------------------------- 1 | # Launch the Q Console by typing: packt@vagrant:~$ rlwrap ~/q/l32/q -s 4 -p 5001 KDB+ 3.5 2017.06.15 Copyright (C) 1993-2017 Kx Systems l32/ 1()core 3951MB packt vagrant 127.0.1.1 NONEXPIRE Welcome to kdb+ 32bit edition For support please see http://groups.google.com/d/forum/personal-kdbplus Tutorials can be found at http://code.kx.com/wiki/Tutorials To exit, type \\ To remove this startup msg, edit q.q q) # Enter the following at the Q console. Explanations for each of the commands have been provided in the comments (using /): /change to the home directory for user packt \cd /home/packt/ /Define the schema of the cms table d:(`category`city`company`date`firstName`lastName`payment`paymentNature`product`state)!"SSSZSSFSSS"; /Read the headers from the cms csv file. These will be our table column names columns:system "head -1 cms2016.csv"; columns:`$"," vs ssr[raze columns;"\"";""]; /Run Garbage Collection .Q.gc[]; /Load the cms csv file \ts cms2016:(d columns;enlist",")0:`:cms2016.csv; /Add a month column to the data \ts cms2016: `month`date xasc update month:`month$date, date:`date$date from cms2016 .Q.gc[]; /Modify character columns to be lower case.
The data contains u \ts update lower firstName from `cms2016 \ts update lower lastName from `cms2016 \ts update lower city from `cms2016 \ts update lower state from `cms2016 \ts update lower product from `cms2016 \ts update lower category from `cms2016 \ts update lower paymentNature from `cms2016 \ts update lower company from `cms2016 .Q.gc[] cms2016:`month`date`firstName`lastName`company`state`city`product`category`payment`paymentNature xcols cms2016 count cms2016 /11 million /Function to save the data that was read from the CMS csv file savedata:{show (string .z.T)," Writing: ",string x;cms::delete month from select from cms2016 where month=x; .Q.dpft[`:cms;x;`date;`cms]} /Save the data in monthly partitions in the current folder savedata each 2016.01m +til 12 -------------------------------------------------------------------------------- /Chapter05/03_r_shiny_web_application.txt: -------------------------------------------------------------------------------- 1 | # # This is a Shiny web application. You can run the application by clicking # the 'Run App' button above. # # Find out more about building applications with Shiny here: # # http://shiny.rstudio.com/ # library(shiny) library(shinydashboard) library(data.table) library(DT) library(rjson) library(jsonlite) library(shinyjs) library(rkdb) ui <- dashboardPage (skin="purple", dashboardHeader(title = "CMS Open Payments 2016"), dashboardSidebar( useShinyjs(), sidebarMenu( uiOutput("month"), uiOutput("company"), uiOutput("product"), uiOutput("state"), uiOutput("city"), uiOutput("showData"), uiOutput("displayColumns"), uiOutput("aggregationColumns"), actionButton("queryButton", "View Results") ) ),dashboardBody( tags$head(tags$link(rel = "stylesheet", type = "text/css", href = "packt.css")), textOutput("stats"), dataTableOutput("tableData") ), title = "CMS Open Payments Data Mining" ) # Define server logic required to draw a histogram server <- function(input, output, session) { h <- open_connection("localhost","5001") minDate <- execute(h,"minDate") maxDate <- execute(h,"maxDate") startDate <- minDate endDate <- startDate + 31 cmsdata <- data.table(dbColumns=c("month","date","firstName","lastName","city","state","company","product", "category","payment","paymentNature"), webColumns=c("Month","Date","First Name","Last Name","City","State","Company","Product", "Category","Payment","Payment Nature")) companyData <- execute(h,"exec distinct showCompany from alldata") gbyVars <- c("Company","Product","State","City","Category","Payment Nature") PLACEHOLDERLIST <- list( placeholder = 'Please select an option below', onInitialize = I('function() { this.setValue(""); }') ) PLACEHOLDERLIST2 <- list( placeholder = 'Select All', onInitialize = I('function() { this.setValue(""); }') ) output$month <- renderUI({ dateRangeInput("date", label = 'PAYMENT DATE', start = startDate, end = endDate, min = minDate, max = maxDate) }) output$company <- renderUI({ selectizeInput("company","COMPANY" , companyData, multiple = TRUE,options = PLACEHOLDERLIST) }) output$product <- renderUI({ productQuery <- paste0("getShowInfo[`product;\"",paste(input$company,collapse="|"),"\"]") productVals <- execute(h,productQuery) selectizeInput("product", "DRUG/PRODUCT" , productVals, multiple = TRUE,options = PLACEHOLDERLIST2) }) output$state <- renderUI({ stateQuery <- paste0("getShowInfo[`state;\"",paste(input$company,collapse="|"),"\"]") stateVals <- execute(h,stateQuery) selectizeInput("state", "STATE" , stateVals, multiple = TRUE,options = PLACEHOLDERLIST2) }) output$city 
<- renderUI({ cityQuery <- paste0("getShowInfo[`city;\"",paste(input$company,collapse="|"),"\"]") cityVals <- execute(h,cityQuery) selectizeInput("city", "CITY" , cityVals, multiple = TRUE,options = PLACEHOLDERLIST2) }) output$showData <- renderUI({ selectInput("showData", label = "DISPLAY TYPE", choices = list("Show Data" = 1, "Aggregate Data" = 2), selected = 1) }) output$displayColumns <- renderUI({ if (is.null(input$showData)) {selectInput("columns", "SHOW DATA",cmsdata$webColumns, selectize = FALSE, multiple = TRUE, size=11)} else if(input$showData == 1) {selectInput("columns", "SHOW DATA",cmsdata$webColumns, selectize = FALSE, multiple = TRUE, size=11) } else if(input$showData == 2) {selectInput("aggVars", "AGGREGATE DATA",gbyVars, selectize = FALSE, multiple = TRUE, size=6) } }) output$aggregationColumns <- renderUI ({ conditionalPanel( condition = "input.showData != 1", selectInput("aggData", "CALCULATE METRICS" , c("Total Payment","Number of Payments","Minimum Payment","Maximum Payment","Average Payment"), selectize = TRUE, multiple = TRUE) )}) getTableData <- eventReactive(input$queryButton, { disable("queryButton") queryInfo <- (list(date=as.character(input$date),company=input$company, product=input$product, state=input$state, city=input$city,columns=cmsdata$dbColumns[cmsdata$webColumns %in% input$columns],showData=input$showData)) if (input$showData !=1) {queryInfo <- c(queryInfo, list(aggVars=cmsdata$dbColumns[cmsdata$webColumns %in% input$aggVars], aggData=input$aggData))} else {queryInfo <- c(queryInfo)} JSON <- rjson::toJSON(queryInfo) getQuery <- paste0("getRes \"",URLencode(JSON),"\"") finalResults <- execute(h,getQuery) enable("queryButton") print (finalResults) fres <<- finalResults print (class(finalResults[[1]])) print (finalResults) finalResults }) output$tableData <- renderDataTable({ datatable(getTableData()[[1]])}) output$stats <- renderText({(getTableData())[[2]]}) } # Run the application shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /Chapter05/MongoDBLog.txt: -------------------------------------------------------------------------------- 1 | 2 | [cloudera@quickstart ~]$ lsb_release -a 3 | LSB Version: :base-4.0-amd64:base-4.0-noarch:core-4.0-amd64:core-4.0-noarch 4 | Distributor ID: CentOS 5 | Description: CentOS release 6.7 (Final) 6 | Release: 6.7 7 | Codename: Final 8 | 9 | 10 | [root@quickstart cloudera]# sudo nano /etc/yum.repos.d/mongodb-org-3.4.repo 11 | 12 | [cloudera@quickstart ~]$ sudo yum install -y mongodb-org 13 | Loaded plugins: fastestmirror, security 14 | Setting up Install Process 15 | Determining fastest mirrors 16 | epel/metalink | 12 kB 00:00 17 | * base: mirrors.mit.edu 18 | * epel: mirror.math.princeton.edu 19 | * extras: repos.dfw.quadranet.com 20 | * updates: mirrors.tripadvisor.com 21 | base | 3.7 kB 00:00 22 | base/primary_db | 4.7 MB 00:00 23 | cloudera-cdh5 | 951 B 00:00 24 | cloudera-cdh5/primary | 43 kB 00:00 25 | cloudera-cdh5 146/146 26 | cloudera-gplextras5 | 951 B 00:00 27 | cloudera-gplextras5/primary | 2.4 kB 00:00 28 | cloudera-gplextras5 9/9 29 | cloudera-kafka | 951 B 00:00 30 | cloudera-kafka/primary | 1.9 kB 00:00 31 | cloudera-kafka 4/4 32 | cloudera-manager | 951 B 00:00 33 | cloudera-manager/primary | 4.3 kB 00:00 34 | cloudera-manager 7/7 35 | epel | 4.3 kB 00:00 36 | epel/primary_db | 5.9 MB 00:00 37 | extras | 3.4 kB 00:00 38 | extras/primary_db | 29 kB 00:00 39 | mongodb-org-3.4 | 2.5 kB 00:00 40 | mongodb-org-3.4/primary_db | 34 kB 00:00 41 | 
updates | 3.4 kB 00:00 42 | updates/primary_db | 2.5 MB 00:00 43 | Resolving Dependencies 44 | --> Running transaction check 45 | ---> Package mongodb-org.x86_64 0:3.4.6-1.el6 will be installed 46 | --> Processing Dependency: mongodb-org-tools = 3.4.6 for package: mongodb-org-3.4.6-1.el6.x86_64 47 | --> Processing Dependency: mongodb-org-shell = 3.4.6 for package: mongodb-org-3.4.6-1.el6.x86_64 48 | --> Processing Dependency: mongodb-org-server = 3.4.6 for package: mongodb-org-3.4.6-1.el6.x86_64 49 | --> Processing Dependency: mongodb-org-mongos = 3.4.6 for package: mongodb-org-3.4.6-1.el6.x86_64 50 | --> Running transaction check 51 | ---> Package mongodb-org-mongos.x86_64 0:3.4.6-1.el6 will be installed 52 | ---> Package mongodb-org-server.x86_64 0:3.4.6-1.el6 will be installed 53 | ---> Package mongodb-org-shell.x86_64 0:3.4.6-1.el6 will be installed 54 | ---> Package mongodb-org-tools.x86_64 0:3.4.6-1.el6 will be installed 55 | --> Finished Dependency Resolution 56 | 57 | Dependencies Resolved 58 | 59 | ============================================================================================= 60 | Package Arch Version Repository Size 61 | ============================================================================================= 62 | Installing: 63 | mongodb-org x86_64 3.4.6-1.el6 mongodb-org-3.4 5.8 k 64 | Installing for dependencies: 65 | mongodb-org-mongos x86_64 3.4.6-1.el6 mongodb-org-3.4 12 M 66 | mongodb-org-server x86_64 3.4.6-1.el6 mongodb-org-3.4 20 M 67 | mongodb-org-shell x86_64 3.4.6-1.el6 mongodb-org-3.4 11 M 68 | mongodb-org-tools x86_64 3.4.6-1.el6 mongodb-org-3.4 49 M 69 | 70 | Transaction Summary 71 | ============================================================================================= 72 | Install 5 Package(s) 73 | 74 | Total download size: 91 M 75 | Installed size: 258 M 76 | Downloading Packages: 77 | (1/5): mongodb-org-3.4.6-1.el6.x86_64.rpm | 5.8 kB 00:00 78 | (2/5): mongodb-org-mongos-3.4.6-1.el6.x86_64.rpm | 12 MB 00:00 79 | (3/5): mongodb-org-server-3.4.6-1.el6.x86_64.rpm | 20 MB 00:00 80 | (4/5): mongodb-org-shell-3.4.6-1.el6.x86_64.rpm | 11 MB 00:00 81 | (5/5): mongodb-org-tools-3.4.6-1.el6.x86_64.rpm | 49 MB 00:09 82 | --------------------------------------------------------------------------------------------- 83 | Total 7.8 MB/s | 91 MB 00:11 84 | warning: rpmts_HdrFromFdno: Header V3 RSA/SHA1 Signature, key ID a15703c6: NOKEY 85 | Retrieving key from https://www.mongodb.org/static/pgp/server-3.4.asc 86 | Importing GPG key 0xA15703C6: 87 | Userid: "MongoDB 3.4 Release Signing Key " 88 | From : https://www.mongodb.org/static/pgp/server-3.4.asc 89 | Running rpm_check_debug 90 | Running Transaction Test 91 | Transaction Test Succeeded 92 | Running Transaction 93 | Installing : mongodb-org-shell-3.4.6-1.el6.x86_64 1/5 94 | Installing : mongodb-org-mongos-3.4.6-1.el6.x86_64 2/5 95 | Installing : mongodb-org-tools-3.4.6-1.el6.x86_64 3/5 96 | Installing : mongodb-org-server-3.4.6-1.el6.x86_64 4/5 97 | Installing : mongodb-org-3.4.6-1.el6.x86_64 5/5 98 | Verifying : mongodb-org-3.4.6-1.el6.x86_64 1/5 99 | Verifying : mongodb-org-server-3.4.6-1.el6.x86_64 2/5 100 | Verifying : mongodb-org-tools-3.4.6-1.el6.x86_64 3/5 101 | Verifying : mongodb-org-mongos-3.4.6-1.el6.x86_64 4/5 102 | Verifying : mongodb-org-shell-3.4.6-1.el6.x86_64 5/5 103 | 104 | Installed: 105 | mongodb-org.x86_64 0:3.4.6-1.el6 106 | 107 | Dependency Installed: 108 | mongodb-org-mongos.x86_64 0:3.4.6-1.el6 mongodb-org-server.x86_64 0:3.4.6-1.el6 109 | mongodb-org-shell.x86_64 
0:3.4.6-1.el6 mongodb-org-tools.x86_64 0:3.4.6-1.el6 110 | 111 | Complete! 112 | 113 | 114 | ### You need to start the mongo daemon before you can use it ### 115 | [cloudera@quickstart ~]$ mongo 116 | MongoDB shell version v3.4.6 117 | connecting to: mongodb://127.0.0.1:27017 118 | 2017-07-30T10:50:58.708-0700 W NETWORK [thread1] Failed to connect to 127.0.0.1:27017, in(checking socket for error after poll), reason: Connection refused 119 | 2017-07-30T10:50:58.708-0700 E QUERY [thread1] Error: couldn't connect to server 127.0.0.1:27017, connection attempt failed : 120 | connect@src/mongo/shell/mongo.js:237:13 121 | @(connect):1:6 122 | exception: connect failed 123 | 124 | 125 | ### Create mongo dbpath ### 126 | 127 | [cloudera@quickstart ~]$ mkdir mongodata 128 | 129 | ### Start mongod ### 130 | [cloudera@quickstart ~]$ mongod --dbpath mongodata 131 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] MongoDB starting : pid=16093 port=27017 dbpath=mongodata 64-bit host=quickstart.cloudera 132 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] db version v3.4.6 133 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] git version: c55eb86ef46ee7aede3b1e2a5d184a7df4bfb5b5 134 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] OpenSSL version: OpenSSL 1.0.1e-fips 11 Feb 2013 135 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] allocator: tcmalloc 136 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] modules: none 137 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] build environment: 138 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] distmod: rhel62 139 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] distarch: x86_64 140 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] target_arch: x86_64 141 | 2017-07-30T10:52:17.200-0700 I CONTROL [initandlisten] options: { storage: { dbPath: "mongodata" } } 142 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] 143 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine 144 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core/prodnotes-filesystem 145 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] wiredtiger_open config: create,cache_size=1403M,session_max=20000,eviction=(threads_min=4,threads_max=4),config_base=false,statistics=(fast),log=(enabled=true,archive=true,path=journal,compressor=snappy),file_manager=(close_idle_time=100000),checkpoint=(wait=60,log_size=2GB),statistics_log=(wait=0), 146 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 147 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database. 148 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted. 149 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 150 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 151 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/enabled is 'always'. 
152 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** We suggest setting it to 'never' 153 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 154 | 2017-07-30T10:52:17.306-0700 I FTDC [initandlisten] Initializing full-time diagnostic data capture with directory 'mongodata/diagnostic.data' 155 | 2017-07-30T10:52:17.320-0700 I INDEX [initandlisten] build index on: admin.system.version properties: { v: 2, key: { version: 1 }, name: "incompatible_with_version_32", ns: "admin.system.version" } 156 | 2017-07-30T10:52:17.320-0700 I INDEX [initandlisten] building index using bulk method; build may temporarily use up to 500 megabytes of RAM 157 | 2017-07-30T10:52:17.321-0700 I INDEX [initandlisten] build index done. scanned 0 total records. 0 secs 158 | 2017-07-30T10:52:17.321-0700 I COMMAND [initandlisten] setting featureCompatibilityVersion to 3.4 159 | 2017-07-30T10:52:17.321-0700 I NETWORK [thread1] waiting for connections on port 27017 160 | 161 | 162 | ### In a separate terminal, download data files - laureates.json and country.json ### 163 | 164 | [cloudera@quickstart ~]$ cd mongodata 165 | [cloudera@quickstart mongodata]$ curl -o laureates.json "http://api.nobelprize.org/v1/laureate.json 166 | > "^C 167 | [cloudera@quickstart mongodata]$ curl -o laureates.json "http://api.nobelprize.org/v1/laureate.json" 168 | % Total % Received % Xferd Average Speed Time Time Time Current 169 | Dload Upload Total Spent Left Speed 170 | 100 428k 0 428k 0 0 292k 0 --:--:-- 0:00:01 --:--:-- 354k 171 | [cloudera@quickstart mongodata]$ 172 | 173 | cat laureates.json | sed 's/^{"laureates"://g' | sed 's/}$//g' > mongofile.json 174 | 175 | ### Download laureates.json 176 | 177 | [cloudera@quickstart mongodata]$ cat laureates.json | sed 's/^{"laureates"://g' | sed 's/}$//g' > mongofile.json 178 | 179 | [cloudera@quickstart mongodata]$ mongoimport --jsonArray --db nobel --collection laureates --file mongofile.json 180 | 2017-07-30T11:06:35.228-0700 connected to: localhost 181 | 2017-07-30T11:06:35.295-0700 imported 910 documents 182 | 183 | ### Download country.json 184 | 185 | [cloudera@quickstart mongodata]$ curl -o country.json "https://raw.githubusercontent.com/xbsd/packtbigdata/master/country.json" 186 | % Total % Received % Xferd Average Speed Time Time Time Current 187 | Dload Upload Total Spent Left Speed 188 | 100 113k 100 113k 0 0 360k 0 --:--:-- --:--:-- --:--:-- 885k 189 | 190 | [cloudera@quickstart mongodata]$ mongoimport --jsonArray --db nobel --collection country --file country.json 191 | 2017-07-30T12:10:35.554-0700 connected to: localhost 192 | 2017-07-30T12:10:35.580-0700 imported 250 documents 193 | 194 | #### 195 | 196 | 197 | ### MONGO SHELL ### 198 | [cloudera@quickstart mongodata]$ mongo 199 | MongoDB shell version v3.4.6 200 | connecting to: mongodb://127.0.0.1:27017 201 | MongoDB server version: 3.4.6 202 | Server has startup warnings: 203 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] 204 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] ** WARNING: Using the XFS filesystem is strongly recommended with the WiredTiger storage engine 205 | 2017-07-30T10:52:17.228-0700 I STORAGE [initandlisten] ** See http://dochub.mongodb.org/core/prodnotes-filesystem 206 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 207 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** WARNING: Access control is not enabled for the database. 208 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** Read and write access to data and configuration is unrestricted. 
209 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 210 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 211 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** WARNING: /sys/kernel/mm/transparent_hugepage/enabled is 'always'. 212 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] ** We suggest setting it to 'never' 213 | 2017-07-30T10:52:17.298-0700 I CONTROL [initandlisten] 214 | > use nobel 215 | switched to db nobel 216 | > show collections 217 | country 218 | laureates 219 | > 220 | 221 | ### Collections in MongoDB are the equivalent to tables in SQL 222 | 223 | ### 1. Common Operations 224 | 225 | ### See collection statistics 226 | 227 | db.laureates.stats() 228 | 229 | "ns" : "nobel.laureates", # Name Space 230 | "size" : 484053, # Size in Bytes 231 | "count" : 910, # Number of Records 232 | "avgObjSize" : 531, # Average Object Size 233 | "storageSize" : 225280, # Data size 234 | 235 | 236 | # Check space used 237 | 238 | > db.laureates.storageSize() 239 | 225280 240 | 241 | 242 | # See number of records 243 | 244 | > db.laureates.count() 245 | 910 246 | # See number of distinct prize categories 247 | 248 | > db.laureates.distinct("prizes.category") 249 | [ 250 | "physics", 251 | "chemistry", 252 | "peace", 253 | "medicine", 254 | "literature", 255 | "economics" 256 | ] 257 | 258 | 259 | ### See first record for laureates 260 | 261 | > db.laureates.findOne() 262 | 263 | { 264 | "_id" : ObjectId("597e202bcd8724f48de485d4"), 265 | "id" : "1", 266 | "firstname" : "Wilhelm Conrad", 267 | "surname" : "Röntgen", 268 | "born" : "1845-03-27", 269 | "died" : "1923-02-10", 270 | "bornCountry" : "Prussia (now Germany)", 271 | "bornCountryCode" : "DE", 272 | "bornCity" : "Lennep (now Remscheid)", 273 | "diedCountry" : "Germany", 274 | "diedCountryCode" : "DE", 275 | "diedCity" : "Munich", 276 | "gender" : "male", 277 | "prizes" : [ 278 | { 279 | "year" : "1901", 280 | "category" : "physics", 281 | "share" : "1", 282 | "motivation" : "\"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him\"", 283 | "affiliations" : [ 284 | { 285 | "name" : "Munich University", 286 | "city" : "Munich", 287 | "country" : "Germany" 288 | } 289 | ] 290 | } 291 | ] 292 | } 293 | 294 | ### See all records for laureates 295 | 296 | > db.laureates.find() 297 | 298 | { "_id" : ObjectId("597e202bcd8724f48de485d4"), "id" : "1", "firstname" : "Wilhelm Conrad", "surname" : "Röntgen", "born" : "1845-03-27", "died" : "1923-02-10", "bornCountry" : "Prussia (now Germany)", "bornCountryCode" : "DE", "bornCity" : "Lennep (now Remscheid)", "diedCountry" : "Germany", "diedCountryCode" : "DE", "diedCity" : "Munich", "gender" : "male", "prizes" : [ { "year" : "1901", "category" : "physics", "share" : "1", "motivation" : "\"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him\"", "affiliations" : [ { "name" : "Munich University", "city" : "Munich", "country" : "Germany" } ] } ] } 299 | 300 | ... 
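# (Aside, not part of the original session: find() also accepts a second argument, a projection document, which limits the fields returned. The query below is illustrative and its output is omitted; the field names match those shown in the findOne() result above.)
# Return only the name and birth date of the first five laureates
> db.laureates.find({}, {firstname: 1, surname: 1, born: 1, _id: 0}).limit(5)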
301 | 302 | # Query a field - Find all Nobel Laureates who were male 303 | 304 | > db.laureates.find({"gender":"male"}) 305 | 306 | { "_id" : ObjectId("597e202bcd8724f48de485d4"), "id" : "1", "firstname" : "Wilhelm Conrad", "surname" : "Röntgen", "born" : "1845-03-27", "died" : "1923-02-10", "bornCountry" : "Prussia (now Germany)", "bornCountryCode" : "DE", "bornCity" : "Lennep (now Remscheid)", "diedCountry" : "Germany", "diedCountryCode" : "DE", "diedCity" : "Munich", "gender" : "male", "prizes" : [ { "year" : "1901", "category" : "physics", "share" : "1", "motivation" : "\"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him\"", "affiliations" : [ { "name" : "Munich University", "city" : "Munich", "country" : "Germany" } ] } ] } 307 | 308 | { "_id" : ObjectId("597e202bcd8724f48de485d5"), "id" : "2", "firstname" : "Hendrik Antoon", "surname" : "Lorentz", "born" : "1853-07-18", "died" : "1928-02-04", "bornCountry" : "the Netherlands", "bornCountryCode" : "NL", "bornCity" : "Arnhem", "diedCountry" : "the Netherlands", "diedCountryCode" : "NL", "gender" : "male", "prizes" : [ { "year" : "1902", "category" : "physics", "share" : "2", "motivation" : "\"in recognition of the extraordinary service they rendered by their researches into the influence of magnetism upon radiation phenomena\"", "affiliations" : [ { "name" : "Leiden University", "city" : "Leiden", "country" : "the Netherlands" } ] } ] } 309 | ... 310 | 311 | # Query a field - Find all Nobel Laureates who were born in the US and received a Nobel Prize in Physics 312 | # Note that here we have a nested field (category is under prizes as shown). Hence, we will use the dot notation. 313 | 314 | > db.laureates.find({"bornCountryCode":"US", "prizes.category":"physics", "bornCity": /Chicago/}) 315 | 316 | { "_id" : ObjectId("597e202bcd8724f48de48638"), "id" : "103", "firstname" : "Ben Roy", "surname" : "Mottelson", "born" : "1926-07-09", "died" : "0000-00-00", "bornCountry" : "USA", "bornCountryCode" : "US", "bornCity" : "Chicago, IL", "gender" : "male", "prizes" : [ { "year" : "1975", "category" : "physics", "share" : "3", "motivation" : "\"for the discovery of the connection between collective motion and particle motion in atomic nuclei and the development of the theory of the structure of the atomic nucleus based on this connection\"", "affiliations" : [ { "name" : "Nordita", "city" : "Copenhagen", "country" : "Denmark" } ] } ] } 317 | ... 
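# (Aside, not part of the original session: when several conditions must hold within the same element of the prizes array, rather than being satisfied by different array elements, $elemMatch can be used in place of plain dot notation. Example query only; output omitted. The year value is a string, matching how the data is stored.)
> db.laureates.find({ "prizes": { $elemMatch: { "category": "physics", "year": "1921" } } })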
318 | 319 | # Using Comparison Operators 320 | 321 | # Find Nobel Laureates born in either India or Egypt 322 | 323 | > db.laureates.find ( { bornCountryCode: { $in: ["IN","EG"] } } ) 324 | 325 | { "_id" : ObjectId("597e202bcd8724f48de485f7"), "id" : "37", "firstname" : "Sir Chandrasekhara Venkata", "surname" : "Raman", "born" : "1888-11-07", "died" : "1970-11-21", "bornCountry" : "India", "bornCountryCode" : "IN", "bornCity" : "Tiruchirappalli", "diedCountry" : "India", "diedCountryCode" : "IN", "diedCity" : "Bangalore", "gender" : "male", "prizes" : [ { "year" : "1930", "category" : "physics", "share" : "1", "motivation" : "\"for his work on the scattering of light and for the discovery of the effect named after him\"", "affiliations" : [ { "name" : "Calcutta University", "city" : "Calcutta", "country" : "India" } ] } ] } 326 | 327 | { "_id" : ObjectId("597e202bcd8724f48de486b5"), "id" : "230", "firstname" : "Dorothy Crowfoot", "surname" : "Hodgkin", "born" : "1910-05-12", "died" : "1994-07-29", "bornCountry" : "Egypt", "bornCountryCode" : "EG", "bornCity" : "Cairo", "diedCountry" : "United Kingdom", "diedCountryCode" : "GB", "diedCity" : "Shipston-on-Stour", "gender" : "female", "prizes" : [ { "year" : "1964", "category" : "chemistry", "share" : "1", "motivation" : "\"for her determinations by X-ray techniques of the structures of important biochemical substances\"", "affiliations" : [ { "name" : "University of Oxford, Royal Society", "city" : "Oxford", "country" : "United Kingdom" } ] } ] } 328 | ... 329 | 330 | # Using Multiple Comparison Operators 331 | 332 | # Find Nobel laureates who were born in either US or China and won prize in either Physics or Chemistry 333 | 334 | db.laureates.find( { 335 | $and : [ 336 | { $or : [ { bornCountryCode : "US" }, { bornCountryCode : "CN" } ] }, 337 | { $or : [ { "prizes.category" : "physics" }, { "prizes.category" : "chemistry" } ] } 338 | ] 339 | } ) 340 | 341 | 342 | > db.laureates.find( { 343 | ... $and : [ 344 | ... { $or : [ { bornCountryCode : "US" }, { bornCountryCode : "CN" } ] }, 345 | ... { $or : [ { "prizes.category" : "physics" }, { "prizes.category" : "chemistry" } ] } 346 | ... ] 347 | ... } ) 348 | 349 | { "_id" : ObjectId("597e202bcd8724f48de485ee"), "id" : "28", "firstname" : "Robert Andrews", "surname" : "Millikan", "born" : "1868-03-22", "died" : "1953-12-19", "bornCountry" : "USA", "bornCountryCode" : "US", "bornCity" : "Morrison, IL", "diedCountry" : "USA", "diedCountryCode" : "US", "diedCity" : "San Marino, CA", "gender" : "male", "prizes" : [ { "year" : "1923", "category" : "physics", "share" : "1", "motivation" : "\"for his work on the elementary charge of electricity and on the photoelectric effect\"", "affiliations" : [ { "name" : "California Institute of Technology (Caltech)", "city" : "Pasadena, CA", "country" : "USA" } ] } ] } 350 | ... 351 | 352 | # To count and aggregate total prizes by year 353 | 354 | db.laureates.aggregate( 355 | {$group: {_id: '$prizes.year', totalPrizes: {$sum: 1}}}, 356 | {$sort: {totalPrizes: -1}} 357 | ); 358 | 359 | > db.laureates.aggregate( 360 | ... {$group: {_id: '$prizes.year', totalPrizes: {$sum: 1}}}, 361 | ... {$sort: {totalPrizes: -1}} 362 | ... 
); 363 | { "_id" : [ "2001" ], "totalPrizes" : 15 } 364 | { "_id" : [ "2014" ], "totalPrizes" : 13 } 365 | { "_id" : [ "2002" ], "totalPrizes" : 13 } 366 | { "_id" : [ "2000" ], "totalPrizes" : 13 } 367 | { "_id" : [ "2005" ], "totalPrizes" : 13 } 368 | { "_id" : [ "2011" ], "totalPrizes" : 13 } 369 | { "_id" : [ "2013" ], "totalPrizes" : 13 } 370 | { "_id" : [ "2009" ], "totalPrizes" : 13 } 371 | { "_id" : [ "1996" ], "totalPrizes" : 13 } 372 | { "_id" : [ "2008" ], "totalPrizes" : 12 } 373 | 374 | 375 | # To count and aggregate total prizes by country of birth 376 | 377 | db.laureates.aggregate( 378 | {$group: {_id: '$bornCountry', totalPrizes: {$sum: 1}}}, 379 | {$sort: {totalPrizes: -1}} 380 | ); 381 | 382 | > db.laureates.aggregate( 383 | ... {$group: {_id: '$bornCountry', totalPrizes: {$sum: 1}}}, 384 | ... {$sort: {totalPrizes: -1}} 385 | ... ); 386 | { "_id" : "USA", "totalPrizes" : 257 } 387 | { "_id" : "United Kingdom", "totalPrizes" : 84 } 388 | { "_id" : "Germany", "totalPrizes" : 61 } 389 | { "_id" : "France", "totalPrizes" : 51 } 390 | { "_id" : "Sweden", "totalPrizes" : 29 } 391 | { "_id" : null, "totalPrizes" : 29 } 392 | { "_id" : "Japan", "totalPrizes" : 24 } 393 | 394 | 395 | 396 | 397 | # Using Regular Expressions: Find count of nobel laureates by country of birth whose prize was related to 'radiation' 398 | 399 | db.laureates.aggregate( 400 | {$match : { "prizes.motivation" : /radiation/ }}, 401 | {$group: {_id: '$bornCountry', totalPrizes: {$sum: 1}}}, 402 | {$sort: {totalPrizes: -1}} 403 | ); 404 | 405 | > db.laureates.aggregate( 406 | ... {$match : { "prizes.motivation" : /radiation/ }}, 407 | ... {$group: {_id: '$bornCountry', totalPrizes: {$sum: 1}}}, 408 | ... {$sort: {totalPrizes: -1}} 409 | ... ); 410 | { "_id" : "USA", "totalPrizes" : 4 } 411 | { "_id" : "Germany", "totalPrizes" : 2 } 412 | { "_id" : "the Netherlands", "totalPrizes" : 2 } 413 | { "_id" : "United Kingdom", "totalPrizes" : 2 } 414 | { "_id" : "France", "totalPrizes" : 1 } 415 | { "_id" : "Prussia (now Russia)", "totalPrizes" : 1 } 416 | 417 | 418 | #### Result: We see that the highest number of prizes (in which radiation was mentioned as a key-word) was the US 419 | 420 | # Left Join 421 | 422 | db.laureates.aggregate( 423 | {$lookup: { from: "country", localField: "bornCountryCode", foreignField: "countryCode", as: "countryInfo" }}) 424 | 425 | > db.laureates.aggregate( 426 | ... 
{$lookup: { from: "country", localField: "bornCountryCode", foreignField: "countryCode", as: "countryInfo" }}) 427 | 428 | { "_id" : ObjectId("597e202bcd8724f48de485d4"), "id" : "1", "firstname" : "Wilhelm Conrad", "surname" : "Röntgen", "born" : "1845-03-27", "died" : "1923-02-10", "bornCountry" : "Prussia (now Germany)", "bornCountryCode" : "DE", "bornCity" : "Lennep (now Remscheid)", "diedCountry" : "Germany", "diedCountryCode" : "DE", "diedCity" : "Munich", "gender" : "male", "prizes" : [ { "year" : "1901", "category" : "physics", "share" : "1", "motivation" : "\"in recognition of the extraordinary services he has rendered by the discovery of the remarkable rays subsequently named after him\"", "affiliations" : [ { "name" : "Munich University", "city" : "Munich", "country" : "Germany" } ] } ], "countryInfo" : [ { "_id" : ObjectId("597e2f2bcd8724f48de489aa"), "continent" : "EU", "capital" : "Berlin", "languages" : "de", "geonameId" : 2921044, "south" : 47.2701236047002, "isoAlpha3" : "DEU", "north" : 55.0583836008072, "fipsCode" : "GM", "population" : "81802257", "east" : 15.0418156516163, "isoNumeric" : "276", "areaInSqKm" : "357021.0", "countryCode" : "DE", "west" : 5.8663152683722, "countryName" : "Germany", "continentName" : "Europe", "currencyCode" : "EUR" } ] } 429 | 430 | 431 | 432 | # Aggregations along with left join: Number of Nobel laureates by continent 433 | 434 | db.laureates.aggregate( 435 | {$lookup: { from: "country", localField: "bornCountryCode", foreignField: "countryCode", as: "countryInfo" }}, 436 | {$group: {_id: '$countryInfo.continent', totalPrizes: {$sum: 1}}}, 437 | {$sort: {totalPrizes: -1}} 438 | ); 439 | 440 | 441 | > db.laureates.aggregate( 442 | ... {$lookup: { from: "country", localField: "bornCountryCode", foreignField: "countryCode", as: "countryInfo" }}, 443 | ... {$group: {_id: '$countryInfo.continent', totalPrizes: {$sum: 1}}}, 444 | ... {$sort: {totalPrizes: -1}} 445 | ... 
); 446 | { "_id" : [ "EU" ], "totalPrizes" : 478 } 447 | { "_id" : [ "NA" ], "totalPrizes" : 285 } 448 | { "_id" : [ "AS" ], "totalPrizes" : 67 } 449 | { "_id" : [ ], "totalPrizes" : 29 } 450 | { "_id" : [ "AF" ], "totalPrizes" : 25 } 451 | { "_id" : [ "OC" ], "totalPrizes" : 15 } 452 | { "_id" : [ "SA" ], "totalPrizes" : 11 } 453 | 454 | 455 | -------------------------------------------------------------------------------- /Chapter05/cms.q: -------------------------------------------------------------------------------- 1 | system "p 5001" system "l /home/packt/cms" /firstCap: Takes a string (sym) input and capitalizes the first letter of each word separated by a blank space firstCap:{" " sv {@[x;0;upper]} each (" " vs string x) except enlist ""} /VARIABLES AND HELPER TABLES /alldata: Aggregates data from the primary cms database alldata: distinct `company`product xasc update showCompany:`$firstCap each company, showProduct:`$firstCap each product from ungroup select distinct product by company from cms where not null product /minDate: First month minDate:exec date from select min date from cms where month=min month /maxDate: Last month maxDate:exec date from select max date from cms where month=max month /companyStateCity: Cleans and normalises the company names (capitalisations, etc) companyStateCity:select asc upper distinct state, asc `$firstCap each distinct city by company from cms /FUNCTIONS /getShowProduct: Function to get product list from company name getShowProduct:{$[(`$"Select All") in x;raze exec showProduct from alldata;exec showProduct from alldata where showCompany in x]} /getShowState: Function to get state list from company name getShowState:{$[(`$"Select All") in x;raze exec state from companyStateCity;exec state from companyStateCity where company = exec first company from alldata where showCompany in x]} /getShowCity: Function to get city list from company name getShowCity:{$[(`$"Select All") in x;raze exec city from companyStateCity;exec city from companyStateCity where company = exec first company from alldata where showCompany in x]} /getShowInfo: Generic Function for Product, State and City getShowInfo:{y:`$"|" vs y;:asc distinct raze raze $[x~`product;getShowProduct each y;x~`state;getShowState each y;x~`city;getShowCity each y;""]} /Example: Run this after loading the entire script after removing the comment mark (/) from the beginning /getShowInfo[`state;"Abb Con-cise Optical Group Llc|Select All|Abbott Laboratories"] /Convert encoded URL into a Q dictionary decodeJSON:{.j.k .h.uh x} /Convert atoms to list ensym:{$[0>type x;enlist x;x]} /Date functions withinDates:{enlist (within;`date;"D"$x[`date])} withinMonths:{enlist (within;`month;`month$"D"$x[`date])} /Helper function to remove null keys delNullDict:{kx!x kx:where {not x~0n} each x} /If showdata=enlist 1, /Function to process the data for displaying results only getData:{"x is the dictionary from web";d:`$dx:lower delNullDict x; enz:`$delete showData,date,columns from dx; ?[`cms;(withinMonths x),(withinDates x),{(in;x 0;enlist 1_x)} each ((key enz),'value enz);0b;(dc)!dc:ensym `$x`columns]} /Aggregation Function aggDict:(`$("Total Payment";"Number of Payments";"Minimum Payment";"Maximum Payment";"Average Payment"))!((sum;`payment);(#:;`i);(min;`payment);(max;`payment);(avg;`payment)) /Function to aggregate the data getDataGroups:{[aggDict;x] "x is the dictionary from web";d:`$dx:lower delNullDict x; enz:`$delete showData,date,columns,aggVars,aggData from dx; ?[`cms;(withinMonths x),(withinDates x),{(in;x 
0;enlist 1_x)} each ((key enz),'value enz);xv!xv:ensym `$x`aggVars;xa!aggDict xa:ensym `$x`aggData]}[aggDict;] /Generic Function to create error messages errtable:{tab:([]Time:enlist `$string .z.Z;Alert:enlist x);(tab;"Missing Fields")} /Validation for input initialValidation:{$[0n~x[`company];:errtable `$"Company must be selected";(`aggVars in key x) and ((0=count x[`aggVars]) or 0n~x[`aggData]);:errtable `$"Both Metric and Aggregate Data field should be selected when using Aggregate Data option";x]} /Special Handling for some variables, in this case month specialHandling:{0N!x;$[`month in cols x; update `$string month from x;x]} /Normalise Columns columnFix:{(`$firstCap each cols x) xcol x} /Use comma separator for numeric values commaFmt: {((x<0)#"-"),(reverse","sv 3 cut reverse string floor a),1_string(a:abs x)mod 1} /Wrapper for show data and aggregate data options getRes:{0N!x;.Q.gc[];st:.z.t;x:decodeJSON x; if [not x ~ ix:initialValidation x;:ix]; res:$[`aggData in key x;getDataGroups x;getData x];res:specialHandling res; res:columnFix res;ccms:count cms; cres:count res; en:.z.t; .Q.gc[];:(res;`$(string en),": Processed ",(commaFmt ccms)," records in ",(string en - st)," seconds. Returned result with ",(commaFmt cres)," rows.\n")} -------------------------------------------------------------------------------- /Chapter05/installing_kdb.txt: -------------------------------------------------------------------------------- 1 | $ cd Downloads/ # cd to the folder where you have downloaded the zip file $ unzip linuxx86.zip Archive: linuxx86.zip inflating: q/README.txt inflating: q/l32/q inflating: q/q.q inflating: q/q.k inflating: q/s.k inflating: q/trade.q inflating: q/sp.q $ mv ~/Downloads/q ~/ $ cd ~/q $ cd l32 $ ./q KDB+ 3.5 2017.06.15 Copyright (C) 1993-2017 Kx Systems l32/ 1()core 3830MB cloudera quickstart.cloudera 10.0.2.15 NONEXPIRE Welcome to kdb+ 32bit edition For support please see http://groups.google.com/d/forum/personal-kdbplus Tutorials can be found at http://code.kx.com/wiki/Tutorials To exit, type \\ To remove this startup msg, edit q.q q)\\ /NOTE THAT YOU MAY NEED TO INSTALL THE FOLLOWING IF YOU GET AN ERROR MESSAGE STATING THAT THE FILE q CANNOT BE FOUND: sudo dpkg --add-architecture i386 sudo apt-get update sudo apt-get install libc6:i386 libncurses5:i386 libstdc++6:i386 -------------------------------------------------------------------------------- /Chapter05/installing_r_packages.txt: -------------------------------------------------------------------------------- 1 | install.packages(c("shiny","shinydashboard","data.table", "DT","rjson","jsonlite","shinyjs","devtools")) library(devtools) devtools::install_github('kxsystems/rkdb', quiet=TRUE) -------------------------------------------------------------------------------- /Chapter05/packt.css: -------------------------------------------------------------------------------- 1 | .shiny-text-output, .shiny-bount-output { margin: 1px; font-weight: bold; } .main-header .logo { height: 20px; font-size: 14px; font-weight: bold; line-height: 20px; } .main-header .sidebar-toggle { padding: 0px; } .main-header .navbar { min-height: 0px !important; } .left-side, .main-sidebar { padding-top: 15px !important; } .form-group { margin-bottom: 2px; } .selectize-input { min-height: 0px !important; padding-top: 1px !important; padding-bottom: 1px !important; padding-left: 12px !important; padding-right: 12px !important; } .sidebar { height: 99vh; overflow-y: auto; } section.sidebar .shiny-input-container { padding: 5px 15px 0px 12px; 
} .btn { padding: 1px; margin-left: 15px; color:#636363; background-color:#e0f3f8; border-color:#e0f3f8; } .btn.focus, .btn:focus, .btn:hover { color: #4575b4; background-color:#fff; border-color:#fff; } pre { display: inline-table; width: 100%; padding: 2px; margin: 0 0 5px; font-size: 12px; line-height: 1.42857143; color: rgb(51, 52, 53); word-break: break-all; word-wrap: break-word; background-color: rgba(10, 9, 9, 0.06); border: 1px rgba(10, 9, 9, 0.06); /* border-radius: 4px */ } .skin-red .sidebar a { color: #fff; } .sidebar { color: #e0f3f8; background-color:#4575b4; border-color:#4575b4; } -------------------------------------------------------------------------------- /Chapter06/.ipynb_checkpoints/Packt_Notebook-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " ## SPARK TUTORIAL\n", 8 | " \n", 9 | " This tutorial has been reproduced with permission of Databricks Inc., a leading provider of Spark-based solutions.\n", 10 | " \n", 11 | "[Databricks](http://www.databricks.com)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "# Section 1A\n", 23 | "# This is a Python cell. You can run normal Python code here...\n", 24 | "print 'The sum of 1 and 1 is {0}'.format(1+1)\n", 25 | "\n", 26 | "# Here is another Python cell, this time with a variable (x) declaration and an if statement:\n", 27 | "x = 42\n", 28 | "if x > 40:\n", 29 | " print 'The sum of 1 and 2 is {0}'.format(1+2)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Section 1B\n", 41 | "\n", 42 | "# This cell relies on x being defined already.\n", 43 | "# If we didn't run the cells from part (1a) this code would fail.\n", 44 | "print x * 2" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# Section 1C\n", 56 | "\n", 57 | "# Import the regular expression library\n", 58 | "import re\n", 59 | "m = re.search('(?<=abc)def', 'abcdef')\n", 60 | "m.group(0)\n", 61 | "\n", 62 | "# Import the datetime library\n", 63 | "import datetime\n", 64 | "print 'This was last run on: {0}'.format(datetime.datetime.now())\n", 65 | "\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# Section 2A\n", 77 | "# Display the type of the Spark Context sc\n", 78 | "type(sc)\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# Section 2B\n", 90 | "# List sc's attributes\n", 91 | "dir(sc)\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "# Section 2C\n", 103 | "\n", 104 | "# Use help to obtain more detailed information\n", 105 | "help(sc)\n", 106 | "\n", 107 | "# After reading the help we've decided we want to use sc.version to see what version of Spark we are running\n", 108 | "sc.version\n", 109 | "\n", 110 | "# Help can be used on any Python object\n", 111 | "help(map)\n", 112 | "\n" 113 | ] 114 | }, 115 | 
{ 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "# Section 3A\n", 124 | "data = xrange(1, 10001)\n", 125 | "\n", 126 | "# Data is just a normal Python list\n", 127 | "# Obtain data's first element\n", 128 | "data[0]\n", 129 | "\n", 130 | "# We can check the size of the list using the len() function\n", 131 | "len(data)\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# Section 3B\n", 143 | "# Parallelize data using 8 partitions\n", 144 | "# This operation is a transformation of data into an RDD\n", 145 | "# Spark uses lazy evaluation, so no Spark jobs are run at this point\n", 146 | "xrangeRDD = sc.parallelize(data, 8)\n", 147 | "\n", 148 | "# Let's view help on parallelize\n", 149 | "help(sc.parallelize)\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Section 3B Continued\n", 161 | "# Let's see what type sc.parallelize() returned\n", 162 | "print 'type of xrangeRDD: {0}'.format(type(xrangeRDD))\n", 163 | "\n", 164 | "# How about if we use a range\n", 165 | "dataRange = range(1, 10001)\n", 166 | "rangeRDD = sc.parallelize(dataRange, 8)\n", 167 | "print 'type of dataRangeRDD: {0}'.format(type(rangeRDD))\n", 168 | "\n", 169 | "# Each RDD gets a unique ID\n", 170 | "print 'xrangeRDD id: {0}'.format(xrangeRDD.id())\n", 171 | "print 'rangeRDD id: {0}'.format(rangeRDD.id())\n", 172 | "\n", 173 | "# We can name each newly created RDD using the setName() method\n", 174 | "xrangeRDD.setName('My first RDD')\n", 175 | "\n", 176 | "# Let's view the lineage (the set of transformations) of the RDD using toDebugString()\n", 177 | "print xrangeRDD.toDebugString()\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# Section 3B Continued\n", 189 | "# Let's use help to see what methods we can call on this RDD\n", 190 | "help(xrangeRDD)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# Section 3B Continued\n", 202 | "\n", 203 | "# Let's see how many partitions the RDD will be split into by using the getNumPartitions()\n", 204 | "xrangeRDD.getNumPartitions()\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "# Section 3C\n", 216 | "# Create sub function to subtract 1\n", 217 | "def sub(value):\n", 218 | " \"\"\"\"Subtracts one from `value`.\n", 219 | "\n", 220 | " Args:\n", 221 | " value (int): A number.\n", 222 | "\n", 223 | " Returns:\n", 224 | " int: `value` minus one.\n", 225 | " \"\"\"\n", 226 | " return (value - 1)\n", 227 | "\n", 228 | "# Transform xrangeRDD through map transformation using sub function\n", 229 | "# Because map is a transformation and Spark uses lazy evaluation, no jobs, stages,\n", 230 | "# or tasks will be launched when we run this code.\n", 231 | "subRDD = xrangeRDD.map(sub)\n", 232 | "\n", 233 | "# Let's see the RDD transformation hierarchy\n", 234 | "print subRDD.toDebugString()\n", 235 | "\n" 236 | ] 237 | }, 238 | { 239 
| "cell_type": "code", 240 | "execution_count": 13, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# Section 3D\n", 247 | "# Let's collect the data\n", 248 | "print subRDD.collect()\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 14, 254 | "metadata": { 255 | "collapsed": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "# Section 3D Continued\n", 260 | "print xrangeRDD.count()\n", 261 | "print subRDD.count()\n" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 15, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "# Section 3E\n", 273 | "# Define a function to filter a single value\n", 274 | "def ten(value):\n", 275 | " \"\"\"Return whether value is below ten.\n", 276 | "\n", 277 | " Args:\n", 278 | " value (int): A number.\n", 279 | "\n", 280 | " Returns:\n", 281 | " bool: Whether `value` is less than ten.\n", 282 | " \"\"\"\n", 283 | " if (value < 10):\n", 284 | " return True\n", 285 | " else:\n", 286 | " return False\n", 287 | "# The ten function could also be written concisely as: def ten(value): return value < 10\n", 288 | "\n", 289 | "# Pass the function ten to the filter transformation\n", 290 | "# Filter is a transformation so no tasks are run\n", 291 | "filteredRDD = subRDD.filter(ten)\n", 292 | "\n", 293 | "# View the results using collect()\n", 294 | "# Collect is an action and triggers the filter transformation to run\n", 295 | "print filteredRDD.collect()\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "# Section 4\n", 307 | "\n", 308 | "lambdaRDD = subRDD.filter(lambda x: x < 10)\n", 309 | "lambdaRDD.collect()\n", 310 | "\n", 311 | "# Let's collect the even values less than 10\n", 312 | "evenRDD = lambdaRDD.filter(lambda x: x % 2 == 0)\n", 313 | "evenRDD.collect()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 17, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "# Section 5A\n", 325 | "\n", 326 | "# Let's get the first element\n", 327 | "print filteredRDD.first()\n", 328 | "# The first 4\n", 329 | "print filteredRDD.take(4)\n", 330 | "# Note that it is ok to take more elements than the RDD has\n", 331 | "print filteredRDD.take(12)\n", 332 | "\n", 333 | "# Retrieve the three smallest elements\n", 334 | "print filteredRDD.takeOrdered(3)\n", 335 | "# Retrieve the five largest elements\n", 336 | "print filteredRDD.top(5)\n", 337 | "\n", 338 | "# Pass a lambda function to takeOrdered to reverse the order\n", 339 | "filteredRDD.takeOrdered(4, lambda s: -s)\n", 340 | "\n", 341 | "# Obtain Python's add function\n", 342 | "from operator import add\n", 343 | "# Efficiently sum the RDD using reduce\n", 344 | "print filteredRDD.reduce(add)\n", 345 | "# Sum using reduce with a lambda function\n", 346 | "print filteredRDD.reduce(lambda a, b: a + b)\n", 347 | "# Note that subtraction is not both associative and commutative\n", 348 | "print filteredRDD.reduce(lambda a, b: a - b)\n", 349 | "print filteredRDD.repartition(4).reduce(lambda a, b: a - b)\n", 350 | "# While addition is\n", 351 | "print filteredRDD.repartition(4).reduce(lambda a, b: a + b)\n" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 18, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | 
"source": [ 362 | "# Section 5B\n", 363 | "# takeSample reusing elements\n", 364 | "print filteredRDD.takeSample(withReplacement=True, num=6)\n", 365 | "# takeSample without reuse\n", 366 | "print filteredRDD.takeSample(withReplacement=False, num=6)\n", 367 | "\n", 368 | "# Set seed for predictability\n", 369 | "print filteredRDD.takeSample(withReplacement=False, num=6, seed=500)\n", 370 | "# Try reruning this cell and the cell above -- the results from this cell will remain constant\n", 371 | "# Use ctrl-enter to run without moving to the next cell\n", 372 | "\n", 373 | "# Create new base RDD to show countByValue\n", 374 | "repetitiveRDD = sc.parallelize([1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 3, 3, 4, 5, 4, 6])\n", 375 | "print repetitiveRDD.countByValue()\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 19, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "# Section 6A\n", 387 | "\n", 388 | "# Let's create a new base RDD to work from\n", 389 | "wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']\n", 390 | "wordsRDD = sc.parallelize(wordsList, 4)\n", 391 | "\n", 392 | "# Use map\n", 393 | "singularAndPluralWordsRDDMap = wordsRDD.map(lambda x: (x, x + 's'))\n", 394 | "# Use flatMap\n", 395 | "singularAndPluralWordsRDD = wordsRDD.flatMap(lambda x: (x, x + 's'))\n", 396 | "\n", 397 | "# View the results\n", 398 | "print singularAndPluralWordsRDDMap.collect()\n", 399 | "print singularAndPluralWordsRDD.collect()\n", 400 | "# View the number of elements in the RDD\n", 401 | "print singularAndPluralWordsRDDMap.count()\n", 402 | "print singularAndPluralWordsRDD.count()\n", 403 | "\n", 404 | "simpleRDD = sc.parallelize([2, 3, 4])\n", 405 | "print simpleRDD.map(lambda x: range(1, x)).collect()\n", 406 | "print simpleRDD.flatMap(lambda x: range(1, x)).collect()\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 20, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "# Section 6B\n", 418 | "\n", 419 | "pairRDD = sc.parallelize([('a', 1), ('a', 2), ('b', 1)])\n", 420 | "# mapValues only used to improve format for printing\n", 421 | "print pairRDD.groupByKey().mapValues(lambda x: list(x)).collect()\n", 422 | "\n", 423 | "# Different ways to sum by key\n", 424 | "print pairRDD.groupByKey().map(lambda (k, v): (k, sum(v))).collect()\n", 425 | "# Using mapValues, which is recommended when they key doesn't change\n", 426 | "print pairRDD.groupByKey().mapValues(lambda x: sum(x)).collect()\n", 427 | "# reduceByKey is more efficient / scalable\n", 428 | "print pairRDD.reduceByKey(add).collect()\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 21, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "# Section 6C\n", 440 | "\n", 441 | "# mapPartitions takes a function that takes an iterator and returns an iterator\n", 442 | "print wordsRDD.collect()\n", 443 | "itemsRDD = wordsRDD.mapPartitions(lambda iterator: [','.join(iterator)])\n", 444 | "print itemsRDD.collect()\n", 445 | "\n", 446 | "itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: [(index, list(iterator))])\n", 447 | "# We can see that three of the (partitions) workers have one element and the fourth worker has two\n", 448 | "# elements, although things may not bode well for the rat...\n", 449 | "print itemsByPartRDD.collect()\n", 450 | "# Rerun without returning a list (acts more like flatMap)\n", 451 | 
"itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: (index, list(iterator)))\n", 452 | "print itemsByPartRDD.collect()\n" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 22, 458 | "metadata": { 459 | "collapsed": true 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "# Section 7A\n", 464 | "# Name the RDD\n", 465 | "filteredRDD.setName('My Filtered RDD')\n", 466 | "# Cache the RDD\n", 467 | "filteredRDD.cache()\n", 468 | "# Is it cached\n", 469 | "print filteredRDD.is_cached\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 23, 475 | "metadata": { 476 | "collapsed": true 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "# Section 7B\n", 481 | "# Note that toDebugString also provides storage information\n", 482 | "print filteredRDD.toDebugString()\n", 483 | "\n", 484 | "# If we are done with the RDD we can unpersist it so that its memory can be reclaimed\n", 485 | "filteredRDD.unpersist()\n", 486 | "# Storage level for a non cached RDD\n", 487 | "print filteredRDD.getStorageLevel()\n", 488 | "filteredRDD.cache()\n", 489 | "# Storage level for a cached RDD\n", 490 | "print filteredRDD.getStorageLevel()" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 24, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "# Section 8A\n", 502 | "\n", 503 | "def brokenTen(value):\n", 504 | " \"\"\"Incorrect implementation of the ten function.\n", 505 | "\n", 506 | " Note:\n", 507 | " The `if` statement checks an undefined variable `val` instead of `value`.\n", 508 | "\n", 509 | " Args:\n", 510 | " value (int): A number.\n", 511 | "\n", 512 | " Returns:\n", 513 | " bool: Whether `value` is less than ten.\n", 514 | "\n", 515 | " Raises:\n", 516 | " NameError: The function references `val`, which is not available in the local or global\n", 517 | " namespace, so a `NameError` is raised.\n", 518 | " \"\"\"\n", 519 | " if (value < 10):\n", 520 | " return True\n", 521 | " else:\n", 522 | " return False\n", 523 | "\n", 524 | "brokenRDD = subRDD.filter(brokenTen)\n", 525 | "\n", 526 | "# Now we'll see the error\n", 527 | "brokenRDD.collect()\n" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 25, 533 | "metadata": { 534 | "collapsed": true 535 | }, 536 | "outputs": [], 537 | "source": [ 538 | "# Section 8C\n", 539 | "# Cleaner code through lambda use\n", 540 | "subRDD.filter(lambda x: x < 10).collect()\n", 541 | "\n", 542 | "# Even better by moving our chain of operators into a single line.\n", 543 | "sc.parallelize(data).map(lambda y: y - 1).filter(lambda x: x < 10).collect()\n" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 26, 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "# Section 8D\n", 555 | "(sc\n", 556 | " .parallelize(data)\n", 557 | " .map(lambda y: y - 1)\n", 558 | " .filter(lambda x: x < 10)\n", 559 | " .collect())\n" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 27, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 2", 575 | "language": "python", 576 | "name": "python2" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 2 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 
586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython2", 588 | "version": "2.7.13" 589 | }, 590 | "name": "Packt_Notebook", 591 | "notebookId": 2573898132801625 592 | }, 593 | "nbformat": 4, 594 | "nbformat_minor": 0 595 | } 596 | -------------------------------------------------------------------------------- /Chapter06/Packt_Notebook.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter06/Packt_Notebook.dbc -------------------------------------------------------------------------------- /Chapter06/Packt_Notebook.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"markdown","source":["#![Spark Logo](http://spark-mooc.github.io/web-assets/images/ta_Spark-logo-small.png) + ![Python Logo](http://spark-mooc.github.io/web-assets/images/python-logo-master-v3-TM-flattened_small.png)\n# ** Basic Introduction to a few Spark Commands **\n\nThis notebook is based on tutorials conducted by [Databricks](https://databricks.com). The tutorial will be conducted using the Databricks' Community Edition of Spark available for sign up [here](https://databricks.com/try-databricks). Databricks is a leading provider of the commercial and enterprise supported version of Spark.\n\nIn this lab, we will introduce a few basic commands used in Spark. Users are encouraged to try out more extensive Spark tutorials and notebooks that are available on the web for more detailed examples.\n\nDocumentation for [Spark's Python API](https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.sql)."],"metadata":{}},{"cell_type":"code","source":["# The SparkContext/SparkSession is the entry point for all Spark operations\n# sc = the SparkContext = the execution environment of Spark, only 1 per JVM\n# Note that SparkSession is now the entry point (from Spark v2.0)\n# This tutorial uses SparkContext (was used prior to Spark 2.0)\n\nfrom pyspark import SparkContext\n# sc = SparkContext(appName = \"some_application_name\") # You'd normally run this, but in this case, it has already been created in the Databricks' environment"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["quote = \"To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take Arms against a Sea of troubles, And by opposing end them: to die, to sleep No more; and by a sleep, to say we end the heart-ache, and the thousand natural shocks that Flesh is heir to? 'Tis a consummation devoutly to be wished. 
To die, to sleep, To sleep, perchance to Dream; aye, there's the rub, for in that sleep of death, what dreams may come, when we have shuffled off this mortal coil, must give us pause.\""],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["sparkdata = sc.parallelize(quote.split(' '))"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["print \"sparkdata = \", sparkdata\nprint \"sparkdata.collect = \", sparkdata.collect\nprint \"sparkdata.collect() = \", sparkdata.collect()[1:10]"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["# A simple transformation - map\n\ndef mapword(word):\n return (word,1)\n\nprint sparkdata.map(mapword) # Nothing has happened here\nprint sparkdata.map(mapword).collect()[1:10] # collect causes the DAG to execute"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["# Another Transformation\n\ndef charsmorethan2(tuple1):\n if len(tuple1[0])>2:\n return tuple1\n pass\n\nrdd3 = sparkdata.map(mapword).filter(lambda x: charsmorethan2(x))\n# Multiple Transformations in 1 statement, nothing is happening yet\n\nrdd3.collect()[1:10] # The DAG gets executed. Note that since we didn't remove punctuation marks ... 'be,', etc are also included"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["# With Tables, a general example\n\ncms = sc.parallelize([[1,\"Dr. A\",12.50,\"Yale\"],[2,\"Dr. B\",5.10,\"Duke\"],[3,\"Dr. C\",200.34,\"Mt. Sinai\"],[4,\"Dr. D\",5.67,\"Duke\"],[1,\"Dr. E\",52.50,\"Yale\"]])"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["def findPayment(data):\n return data[2]\n\nprint \"Payments = \", cms.map(findPayment).collect()\nprint \"Mean = \", cms.map(findPayment).mean() # Mean is an action"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["# Creating a DataFrame (familiar to Python programmers)\n\ncms_df = sqlContext.createDataFrame(cms, [\"ID\",\"Name\",\"Payment\",\"Hosp\"])\nprint cms_df.show()\nprint cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment'))\nprint cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment')).collect()\nprint\nprint \"Converting to a Pandas DataFrame\"\nprint \"--------------------------------\"\npd_df = cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment')).toPandas()\nprint type(pd_df)\nprint\nprint pd_df\n\n\n"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["wordsList = ['to','be','or','not','to','be']\nwordsRDD = sc.parallelize(wordsList, 3) # Splits into 2 groups\n# Print out the type of wordsRDD\nprint type(wordsRDD)"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["# Glom coallesces all elements within each partition into a list\nprint wordsRDD.glom().take(2) # Take is an action, here we are 'take'-ing the first 2 elements of the wordsRDD\nprint wordsRDD.glom().collect() # Collect"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["# An example with changing the case of words\n\n# One way of completing the function\ndef makeUpperCase(word):\n return word.upper()\n\nprint makeUpperCase('cat')\n"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"code","source":["upperRDD = wordsRDD.map(makeUpperCase)\nprint 
upperRDD.collect()"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"code","source":["upperLambdaRDD = wordsRDD.map(lambda word: word.upper())\nprint upperLambdaRDD.collect()"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"code","source":["# Pair RDDs\nwordPairs = wordsRDD.map(lambda word: (word, 1))\nprint wordPairs.collect()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"markdown","source":["#### Part 2: Counting with pair RDDs \nThere are multiple ways of performing group-by operations in Spark\nOne such method is groupByKey()\n\n** Using groupByKey() **\n\nThis method creates a key-value pair whereby each key (in this case word) is assigned a value of 1 for our wordcount operation. It then combines all keys into a single list. This can be quite memory intensive, especially if the dataset is large."],"metadata":{}},{"cell_type":"code","source":["# Using groupByKey\nwordsGrouped = wordPairs.groupByKey()\nfor key, value in wordsGrouped.collect():\n print '{0}: {1}'.format(key, list(value))"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"code","source":["# Summation of the key values (to get the word count)\nwordCountsGrouped = wordsGrouped.map(lambda (k,v): (k, sum(v)))\nprint wordCountsGrouped.collect()"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"markdown","source":["** (2c) Counting using reduceByKey **\n\nreduceByKey creates a new pair RDD. It then iteratively applies a function first to each key (i.e., within the key values) and then across all the keys, i.e., in other words it applies the given function iteratively."],"metadata":{}},{"cell_type":"code","source":["wordCounts = wordPairs.reduceByKey(lambda a,b: a+b)\nprint wordCounts.collect()"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"markdown","source":["** Combining all of the above into a single statement **"],"metadata":{}},{"cell_type":"code","source":["wordCountsCollected = (wordsRDD\n .map(lambda word: (word, 1))\n .reduceByKey(lambda a,b: a+b)\n .collect())\nprint wordCountsCollected"],"metadata":{},"outputs":[],"execution_count":23},{"cell_type":"markdown","source":["This tutorial has provided a basic overview of Spark and introduced the Databricks community edition where users can upload and execute their own Spark notebooks. There are various in-depth tutorials on the web and also at Databricks on Spark and users are encouraged to peruse them if interested in learning further about Spark."],"metadata":{}}],"metadata":{"name":"Packt_Notebook","notebookId":538254486733003},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Chapter06/Packt_Notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC #![Spark Logo](http://spark-mooc.github.io/web-assets/images/ta_Spark-logo-small.png) + ![Python Logo](http://spark-mooc.github.io/web-assets/images/python-logo-master-v3-TM-flattened_small.png) 4 | # MAGIC # ** Basic Introduction to a few Spark Commands ** 5 | # MAGIC 6 | # MAGIC This notebook is based on tutorials conducted by [Databricks](https://databricks.com). The tutorial will be conducted using the Databricks' Community Edition of Spark available for sign up [here](https://databricks.com/try-databricks). Databricks is a leading provider of the commercial and enterprise supported version of Spark. 
7 | # MAGIC 8 | # MAGIC In this lab, we will introduce a few basic commands used in Spark. Users are encouraged to try out more extensive Spark tutorials and notebooks that are available on the web for more detailed examples. 9 | # MAGIC 10 | # MAGIC Documentation for [Spark's Python API](https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.sql). 11 | 12 | # COMMAND ---------- 13 | 14 | # The SparkContext/SparkSession is the entry point for all Spark operations 15 | # sc = the SparkContext = the execution environment of Spark, only 1 per JVM 16 | # Note that SparkSession is now the entry point (from Spark v2.0) 17 | # This tutorial uses SparkContext (was used prior to Spark 2.0) 18 | 19 | from pyspark import SparkContext 20 | # sc = SparkContext(appName = "some_application_name") # You'd normally run this, but in this case, it has already been created in the Databricks' environment 21 | 22 | # COMMAND ---------- 23 | 24 | quote = "To be, or not to be, that is the question: Whether 'tis nobler in the mind to suffer The slings and arrows of outrageous fortune, Or to take Arms against a Sea of troubles, And by opposing end them: to die, to sleep No more; and by a sleep, to say we end the heart-ache, and the thousand natural shocks that Flesh is heir to? 'Tis a consummation devoutly to be wished. To die, to sleep, To sleep, perchance to Dream; aye, there's the rub, for in that sleep of death, what dreams may come, when we have shuffled off this mortal coil, must give us pause." 25 | 26 | # COMMAND ---------- 27 | 28 | sparkdata = sc.parallelize(quote.split(' ')) 29 | 30 | # COMMAND ---------- 31 | 32 | print "sparkdata = ", sparkdata 33 | print "sparkdata.collect = ", sparkdata.collect 34 | print "sparkdata.collect() = ", sparkdata.collect()[1:10] 35 | 36 | # COMMAND ---------- 37 | 38 | # A simple transformation - map 39 | 40 | def mapword(word): 41 | return (word,1) 42 | 43 | print sparkdata.map(mapword) # Nothing has happened here 44 | print sparkdata.map(mapword).collect()[1:10] # collect causes the DAG to execute 45 | 46 | # COMMAND ---------- 47 | 48 | # Another Transformation 49 | 50 | def charsmorethan2(tuple1): 51 | if len(tuple1[0])>2: 52 | return tuple1 53 | pass 54 | 55 | rdd3 = sparkdata.map(mapword).filter(lambda x: charsmorethan2(x)) 56 | # Multiple Transformations in 1 statement, nothing is happening yet 57 | 58 | rdd3.collect()[1:10] # The DAG gets executed. Note that since we didn't remove punctuation marks ... 'be,', etc are also included 59 | 60 | # COMMAND ---------- 61 | 62 | # With Tables, a general example 63 | 64 | cms = sc.parallelize([[1,"Dr. A",12.50,"Yale"],[2,"Dr. B",5.10,"Duke"],[3,"Dr. C",200.34,"Mt. Sinai"],[4,"Dr. D",5.67,"Duke"],[1,"Dr. 
E",52.50,"Yale"]]) 65 | 66 | # COMMAND ---------- 67 | 68 | def findPayment(data): 69 | return data[2] 70 | 71 | print "Payments = ", cms.map(findPayment).collect() 72 | print "Mean = ", cms.map(findPayment).mean() # Mean is an action 73 | 74 | # COMMAND ---------- 75 | 76 | # Creating a DataFrame (familiar to Python programmers) 77 | 78 | cms_df = sqlContext.createDataFrame(cms, ["ID","Name","Payment","Hosp"]) 79 | print cms_df.show() 80 | print cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment')) 81 | print cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment')).collect() 82 | print 83 | print "Converting to a Pandas DataFrame" 84 | print "--------------------------------" 85 | pd_df = cms_df.groupby('Hosp').agg(func.avg('Payment'), func.max('Payment'),func.min('Payment')).toPandas() 86 | print type(pd_df) 87 | print 88 | print pd_df 89 | 90 | 91 | 92 | 93 | # COMMAND ---------- 94 | 95 | wordsList = ['to','be','or','not','to','be'] 96 | wordsRDD = sc.parallelize(wordsList, 3) # Splits into 2 groups 97 | # Print out the type of wordsRDD 98 | print type(wordsRDD) 99 | 100 | # COMMAND ---------- 101 | 102 | # Glom coallesces all elements within each partition into a list 103 | print wordsRDD.glom().take(2) # Take is an action, here we are 'take'-ing the first 2 elements of the wordsRDD 104 | print wordsRDD.glom().collect() # Collect 105 | 106 | # COMMAND ---------- 107 | 108 | # An example with changing the case of words 109 | 110 | # One way of completing the function 111 | def makeUpperCase(word): 112 | return word.upper() 113 | 114 | print makeUpperCase('cat') 115 | 116 | 117 | # COMMAND ---------- 118 | 119 | upperRDD = wordsRDD.map(makeUpperCase) 120 | print upperRDD.collect() 121 | 122 | # COMMAND ---------- 123 | 124 | upperLambdaRDD = wordsRDD.map(lambda word: word.upper()) 125 | print upperLambdaRDD.collect() 126 | 127 | # COMMAND ---------- 128 | 129 | # Pair RDDs 130 | wordPairs = wordsRDD.map(lambda word: (word, 1)) 131 | print wordPairs.collect() 132 | 133 | # COMMAND ---------- 134 | 135 | # MAGIC %md 136 | # MAGIC #### Part 2: Counting with pair RDDs 137 | # MAGIC There are multiple ways of performing group-by operations in Spark 138 | # MAGIC One such method is groupByKey() 139 | # MAGIC 140 | # MAGIC ** Using groupByKey() ** 141 | # MAGIC 142 | # MAGIC This method creates a key-value pair whereby each key (in this case word) is assigned a value of 1 for our wordcount operation. It then combines all keys into a single list. This can be quite memory intensive, especially if the dataset is large. 143 | 144 | # COMMAND ---------- 145 | 146 | # Using groupByKey 147 | wordsGrouped = wordPairs.groupByKey() 148 | for key, value in wordsGrouped.collect(): 149 | print '{0}: {1}'.format(key, list(value)) 150 | 151 | # COMMAND ---------- 152 | 153 | # Summation of the key values (to get the word count) 154 | wordCountsGrouped = wordsGrouped.map(lambda (k,v): (k, sum(v))) 155 | print wordCountsGrouped.collect() 156 | 157 | # COMMAND ---------- 158 | 159 | # MAGIC %md 160 | # MAGIC ** (2c) Counting using reduceByKey ** 161 | # MAGIC 162 | # MAGIC reduceByKey creates a new pair RDD. It then iteratively applies a function first to each key (i.e., within the key values) and then across all the keys, i.e., in other words it applies the given function iteratively. 
163 | 164 | # COMMAND ---------- 165 | 166 | wordCounts = wordPairs.reduceByKey(lambda a,b: a+b) 167 | print wordCounts.collect() 168 | 169 | # COMMAND ---------- 170 | 171 | # MAGIC %md 172 | # MAGIC ** Combining all of the above into a single statement ** 173 | 174 | # COMMAND ---------- 175 | 176 | wordCountsCollected = (wordsRDD 177 | .map(lambda word: (word, 1)) 178 | .reduceByKey(lambda a,b: a+b) 179 | .collect()) 180 | print wordCountsCollected 181 | 182 | # COMMAND ---------- 183 | 184 | # MAGIC %md 185 | # MAGIC 186 | # MAGIC This tutorial has provided a basic overview of Spark and introduced the Databricks community edition where users can upload and execute their own Spark notebooks. There are various in-depth tutorials on the web and also at Databricks on Spark and users are encouraged to peruse them if interested in learning further about Spark. 187 | -------------------------------------------------------------------------------- /Chapter06/old/Packt_Notebook.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter06/old/Packt_Notebook.dbc -------------------------------------------------------------------------------- /Chapter06/old/Packt_Notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | " ## SPARK TUTORIAL\n", 8 | " \n", 9 | " This tutorial has been reproduced with permission of Databricks Inc., a leading provider of Spark-based solutions.\n", 10 | " \n", 11 | "[Databricks](http://www.databricks.com)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "# Section 1A\n", 23 | "# This is a Python cell. 
You can run normal Python code here...\n", 24 | "print 'The sum of 1 and 1 is {0}'.format(1+1)\n", 25 | "\n", 26 | "# Here is another Python cell, this time with a variable (x) declaration and an if statement:\n", 27 | "x = 42\n", 28 | "if x > 40:\n", 29 | " print 'The sum of 1 and 2 is {0}'.format(1+2)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Section 1B\n", 41 | "\n", 42 | "# This cell relies on x being defined already.\n", 43 | "# If we didn't run the cells from part (1a) this code would fail.\n", 44 | "print x * 2" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# Section 1C\n", 56 | "\n", 57 | "# Import the regular expression library\n", 58 | "import re\n", 59 | "m = re.search('(?<=abc)def', 'abcdef')\n", 60 | "m.group(0)\n", 61 | "\n", 62 | "# Import the datetime library\n", 63 | "import datetime\n", 64 | "print 'This was last run on: {0}'.format(datetime.datetime.now())\n", 65 | "\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# Section 2A\n", 77 | "# Display the type of the Spark Context sc\n", 78 | "type(sc)\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 5, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# Section 2B\n", 90 | "# List sc's attributes\n", 91 | "dir(sc)\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "# Section 2C\n", 103 | "\n", 104 | "# Use help to obtain more detailed information\n", 105 | "help(sc)\n", 106 | "\n", 107 | "# After reading the help we've decided we want to use sc.version to see what version of Spark we are running\n", 108 | "sc.version\n", 109 | "\n", 110 | "# Help can be used on any Python object\n", 111 | "help(map)\n", 112 | "\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "# Section 3A\n", 124 | "data = xrange(1, 10001)\n", 125 | "\n", 126 | "# Data is just a normal Python list\n", 127 | "# Obtain data's first element\n", 128 | "data[0]\n", 129 | "\n", 130 | "# We can check the size of the list using the len() function\n", 131 | "len(data)\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "# Section 3B\n", 143 | "# Parallelize data using 8 partitions\n", 144 | "# This operation is a transformation of data into an RDD\n", 145 | "# Spark uses lazy evaluation, so no Spark jobs are run at this point\n", 146 | "xrangeRDD = sc.parallelize(data, 8)\n", 147 | "\n", 148 | "# Let's view help on parallelize\n", 149 | "help(sc.parallelize)\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# Section 3B Continued\n", 161 | "# Let's see what type sc.parallelize() returned\n", 162 | "print 'type of xrangeRDD: {0}'.format(type(xrangeRDD))\n", 163 | "\n", 164 | "# How about if we use a range\n", 165 | "dataRange = 
range(1, 10001)\n", 166 | "rangeRDD = sc.parallelize(dataRange, 8)\n", 167 | "print 'type of dataRangeRDD: {0}'.format(type(rangeRDD))\n", 168 | "\n", 169 | "# Each RDD gets a unique ID\n", 170 | "print 'xrangeRDD id: {0}'.format(xrangeRDD.id())\n", 171 | "print 'rangeRDD id: {0}'.format(rangeRDD.id())\n", 172 | "\n", 173 | "# We can name each newly created RDD using the setName() method\n", 174 | "xrangeRDD.setName('My first RDD')\n", 175 | "\n", 176 | "# Let's view the lineage (the set of transformations) of the RDD using toDebugString()\n", 177 | "print xrangeRDD.toDebugString()\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 10, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "# Section 3B Continued\n", 189 | "# Let's use help to see what methods we can call on this RDD\n", 190 | "help(xrangeRDD)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 11, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# Section 3B Continued\n", 202 | "\n", 203 | "# Let's see how many partitions the RDD will be split into by using the getNumPartitions()\n", 204 | "xrangeRDD.getNumPartitions()\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "# Section 3C\n", 216 | "# Create sub function to subtract 1\n", 217 | "def sub(value):\n", 218 | " \"\"\"\"Subtracts one from `value`.\n", 219 | "\n", 220 | " Args:\n", 221 | " value (int): A number.\n", 222 | "\n", 223 | " Returns:\n", 224 | " int: `value` minus one.\n", 225 | " \"\"\"\n", 226 | " return (value - 1)\n", 227 | "\n", 228 | "# Transform xrangeRDD through map transformation using sub function\n", 229 | "# Because map is a transformation and Spark uses lazy evaluation, no jobs, stages,\n", 230 | "# or tasks will be launched when we run this code.\n", 231 | "subRDD = xrangeRDD.map(sub)\n", 232 | "\n", 233 | "# Let's see the RDD transformation hierarchy\n", 234 | "print subRDD.toDebugString()\n", 235 | "\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 13, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# Section 3D\n", 247 | "# Let's collect the data\n", 248 | "print subRDD.collect()\n" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 14, 254 | "metadata": { 255 | "collapsed": true 256 | }, 257 | "outputs": [], 258 | "source": [ 259 | "# Section 3D Continued\n", 260 | "print xrangeRDD.count()\n", 261 | "print subRDD.count()\n" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 15, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "# Section 3E\n", 273 | "# Define a function to filter a single value\n", 274 | "def ten(value):\n", 275 | " \"\"\"Return whether value is below ten.\n", 276 | "\n", 277 | " Args:\n", 278 | " value (int): A number.\n", 279 | "\n", 280 | " Returns:\n", 281 | " bool: Whether `value` is less than ten.\n", 282 | " \"\"\"\n", 283 | " if (value < 10):\n", 284 | " return True\n", 285 | " else:\n", 286 | " return False\n", 287 | "# The ten function could also be written concisely as: def ten(value): return value < 10\n", 288 | "\n", 289 | "# Pass the function ten to the filter transformation\n", 290 | "# Filter is a transformation so no tasks are run\n", 291 | 
"filteredRDD = subRDD.filter(ten)\n", 292 | "\n", 293 | "# View the results using collect()\n", 294 | "# Collect is an action and triggers the filter transformation to run\n", 295 | "print filteredRDD.collect()\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "metadata": { 302 | "collapsed": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "# Section 4\n", 307 | "\n", 308 | "lambdaRDD = subRDD.filter(lambda x: x < 10)\n", 309 | "lambdaRDD.collect()\n", 310 | "\n", 311 | "# Let's collect the even values less than 10\n", 312 | "evenRDD = lambdaRDD.filter(lambda x: x % 2 == 0)\n", 313 | "evenRDD.collect()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 17, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "# Section 5A\n", 325 | "\n", 326 | "# Let's get the first element\n", 327 | "print filteredRDD.first()\n", 328 | "# The first 4\n", 329 | "print filteredRDD.take(4)\n", 330 | "# Note that it is ok to take more elements than the RDD has\n", 331 | "print filteredRDD.take(12)\n", 332 | "\n", 333 | "# Retrieve the three smallest elements\n", 334 | "print filteredRDD.takeOrdered(3)\n", 335 | "# Retrieve the five largest elements\n", 336 | "print filteredRDD.top(5)\n", 337 | "\n", 338 | "# Pass a lambda function to takeOrdered to reverse the order\n", 339 | "filteredRDD.takeOrdered(4, lambda s: -s)\n", 340 | "\n", 341 | "# Obtain Python's add function\n", 342 | "from operator import add\n", 343 | "# Efficiently sum the RDD using reduce\n", 344 | "print filteredRDD.reduce(add)\n", 345 | "# Sum using reduce with a lambda function\n", 346 | "print filteredRDD.reduce(lambda a, b: a + b)\n", 347 | "# Note that subtraction is not both associative and commutative\n", 348 | "print filteredRDD.reduce(lambda a, b: a - b)\n", 349 | "print filteredRDD.repartition(4).reduce(lambda a, b: a - b)\n", 350 | "# While addition is\n", 351 | "print filteredRDD.repartition(4).reduce(lambda a, b: a + b)\n" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 18, 357 | "metadata": { 358 | "collapsed": true 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "# Section 5B\n", 363 | "# takeSample reusing elements\n", 364 | "print filteredRDD.takeSample(withReplacement=True, num=6)\n", 365 | "# takeSample without reuse\n", 366 | "print filteredRDD.takeSample(withReplacement=False, num=6)\n", 367 | "\n", 368 | "# Set seed for predictability\n", 369 | "print filteredRDD.takeSample(withReplacement=False, num=6, seed=500)\n", 370 | "# Try reruning this cell and the cell above -- the results from this cell will remain constant\n", 371 | "# Use ctrl-enter to run without moving to the next cell\n", 372 | "\n", 373 | "# Create new base RDD to show countByValue\n", 374 | "repetitiveRDD = sc.parallelize([1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 3, 3, 4, 5, 4, 6])\n", 375 | "print repetitiveRDD.countByValue()\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 19, 381 | "metadata": { 382 | "collapsed": true 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "# Section 6A\n", 387 | "\n", 388 | "# Let's create a new base RDD to work from\n", 389 | "wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat']\n", 390 | "wordsRDD = sc.parallelize(wordsList, 4)\n", 391 | "\n", 392 | "# Use map\n", 393 | "singularAndPluralWordsRDDMap = wordsRDD.map(lambda x: (x, x + 's'))\n", 394 | "# Use flatMap\n", 395 | "singularAndPluralWordsRDD = wordsRDD.flatMap(lambda x: (x, x + 
's'))\n", 396 | "\n", 397 | "# View the results\n", 398 | "print singularAndPluralWordsRDDMap.collect()\n", 399 | "print singularAndPluralWordsRDD.collect()\n", 400 | "# View the number of elements in the RDD\n", 401 | "print singularAndPluralWordsRDDMap.count()\n", 402 | "print singularAndPluralWordsRDD.count()\n", 403 | "\n", 404 | "simpleRDD = sc.parallelize([2, 3, 4])\n", 405 | "print simpleRDD.map(lambda x: range(1, x)).collect()\n", 406 | "print simpleRDD.flatMap(lambda x: range(1, x)).collect()\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 20, 412 | "metadata": { 413 | "collapsed": true 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "# Section 6B\n", 418 | "\n", 419 | "pairRDD = sc.parallelize([('a', 1), ('a', 2), ('b', 1)])\n", 420 | "# mapValues only used to improve format for printing\n", 421 | "print pairRDD.groupByKey().mapValues(lambda x: list(x)).collect()\n", 422 | "\n", 423 | "# Different ways to sum by key\n", 424 | "print pairRDD.groupByKey().map(lambda (k, v): (k, sum(v))).collect()\n", 425 | "# Using mapValues, which is recommended when they key doesn't change\n", 426 | "print pairRDD.groupByKey().mapValues(lambda x: sum(x)).collect()\n", 427 | "# reduceByKey is more efficient / scalable\n", 428 | "print pairRDD.reduceByKey(add).collect()\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 21, 434 | "metadata": { 435 | "collapsed": true 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "# Section 6C\n", 440 | "\n", 441 | "# mapPartitions takes a function that takes an iterator and returns an iterator\n", 442 | "print wordsRDD.collect()\n", 443 | "itemsRDD = wordsRDD.mapPartitions(lambda iterator: [','.join(iterator)])\n", 444 | "print itemsRDD.collect()\n", 445 | "\n", 446 | "itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: [(index, list(iterator))])\n", 447 | "# We can see that three of the (partitions) workers have one element and the fourth worker has two\n", 448 | "# elements, although things may not bode well for the rat...\n", 449 | "print itemsByPartRDD.collect()\n", 450 | "# Rerun without returning a list (acts more like flatMap)\n", 451 | "itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: (index, list(iterator)))\n", 452 | "print itemsByPartRDD.collect()\n" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 22, 458 | "metadata": { 459 | "collapsed": true 460 | }, 461 | "outputs": [], 462 | "source": [ 463 | "# Section 7A\n", 464 | "# Name the RDD\n", 465 | "filteredRDD.setName('My Filtered RDD')\n", 466 | "# Cache the RDD\n", 467 | "filteredRDD.cache()\n", 468 | "# Is it cached\n", 469 | "print filteredRDD.is_cached\n" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 23, 475 | "metadata": { 476 | "collapsed": true 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "# Section 7B\n", 481 | "# Note that toDebugString also provides storage information\n", 482 | "print filteredRDD.toDebugString()\n", 483 | "\n", 484 | "# If we are done with the RDD we can unpersist it so that its memory can be reclaimed\n", 485 | "filteredRDD.unpersist()\n", 486 | "# Storage level for a non cached RDD\n", 487 | "print filteredRDD.getStorageLevel()\n", 488 | "filteredRDD.cache()\n", 489 | "# Storage level for a cached RDD\n", 490 | "print filteredRDD.getStorageLevel()" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 24, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | 
"outputs": [], 500 | "source": [ 501 | "# Section 8A\n", 502 | "\n", 503 | "def brokenTen(value):\n", 504 | " \"\"\"Incorrect implementation of the ten function.\n", 505 | "\n", 506 | " Note:\n", 507 | " The `if` statement checks an undefined variable `val` instead of `value`.\n", 508 | "\n", 509 | " Args:\n", 510 | " value (int): A number.\n", 511 | "\n", 512 | " Returns:\n", 513 | " bool: Whether `value` is less than ten.\n", 514 | "\n", 515 | " Raises:\n", 516 | " NameError: The function references `val`, which is not available in the local or global\n", 517 | " namespace, so a `NameError` is raised.\n", 518 | " \"\"\"\n", 519 | " if (value < 10):\n", 520 | " return True\n", 521 | " else:\n", 522 | " return False\n", 523 | "\n", 524 | "brokenRDD = subRDD.filter(brokenTen)\n", 525 | "\n", 526 | "# Now we'll see the error\n", 527 | "brokenRDD.collect()\n" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 25, 533 | "metadata": { 534 | "collapsed": true 535 | }, 536 | "outputs": [], 537 | "source": [ 538 | "# Section 8C\n", 539 | "# Cleaner code through lambda use\n", 540 | "subRDD.filter(lambda x: x < 10).collect()\n", 541 | "\n", 542 | "# Even better by moving our chain of operators into a single line.\n", 543 | "sc.parallelize(data).map(lambda y: y - 1).filter(lambda x: x < 10).collect()\n" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 26, 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "outputs": [], 553 | "source": [ 554 | "# Section 8D\n", 555 | "(sc\n", 556 | " .parallelize(data)\n", 557 | " .map(lambda y: y - 1)\n", 558 | " .filter(lambda x: x < 10)\n", 559 | " .collect())\n" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 27, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [] 570 | } 571 | ], 572 | "metadata": { 573 | "kernelspec": { 574 | "display_name": "Python 2", 575 | "language": "python", 576 | "name": "python2" 577 | }, 578 | "language_info": { 579 | "codemirror_mode": { 580 | "name": "ipython", 581 | "version": 2 582 | }, 583 | "file_extension": ".py", 584 | "mimetype": "text/x-python", 585 | "name": "python", 586 | "nbconvert_exporter": "python", 587 | "pygments_lexer": "ipython2", 588 | "version": "2.7.13" 589 | }, 590 | "name": "Packt_Notebook", 591 | "notebookId": 2573898132801625 592 | }, 593 | "nbformat": 4, 594 | "nbformat_minor": 0 595 | } 596 | -------------------------------------------------------------------------------- /Chapter06/old/Packt_Notebook.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # This tutorial has been reproduced with permission of Databricks Inc., a leading 3 | # provider of Spark-based solutions. Databricks (http://www.databricks.com) 4 | 5 | # Section 1A 6 | # This is a Python cell. You can run normal Python code here... 7 | print 'The sum of 1 and 1 is {0}'.format(1+1) 8 | 9 | # Here is another Python cell, this time with a variable (x) declaration and an if statement: 10 | x = 42 11 | if x > 40: 12 | print 'The sum of 1 and 2 is {0}'.format(1+2) 13 | 14 | # COMMAND ---------- 15 | 16 | # Section 1B 17 | 18 | # This cell relies on x being defined already. 19 | # If we didn't run the cells from part (1a) this code would fail. 
20 | print x * 2 21 | 22 | # COMMAND ---------- 23 | 24 | # Section 1C 25 | 26 | # Import the regular expression library 27 | import re 28 | m = re.search('(?<=abc)def', 'abcdef') 29 | m.group(0) 30 | 31 | # Import the datetime library 32 | import datetime 33 | print 'This was last run on: {0}'.format(datetime.datetime.now()) 34 | 35 | 36 | 37 | # COMMAND ---------- 38 | 39 | # Section 2A 40 | # Display the type of the Spark Context sc 41 | type(sc) 42 | 43 | 44 | # COMMAND ---------- 45 | 46 | # Section 2B 47 | # List sc's attributes 48 | dir(sc) 49 | 50 | 51 | # COMMAND ---------- 52 | 53 | # Section 2C 54 | 55 | # Use help to obtain more detailed information 56 | help(sc) 57 | 58 | # After reading the help we've decided we want to use sc.version to see what version of Spark we are running 59 | sc.version 60 | 61 | # Help can be used on any Python object 62 | help(map) 63 | 64 | 65 | 66 | # COMMAND ---------- 67 | 68 | # Section 3A 69 | data = xrange(1, 10001) 70 | 71 | # Data is just a normal Python list 72 | # Obtain data's first element 73 | data[0] 74 | 75 | # We can check the size of the list using the len() function 76 | len(data) 77 | 78 | 79 | # COMMAND ---------- 80 | 81 | # Section 3B 82 | # Parallelize data using 8 partitions 83 | # This operation is a transformation of data into an RDD 84 | # Spark uses lazy evaluation, so no Spark jobs are run at this point 85 | xrangeRDD = sc.parallelize(data, 8) 86 | 87 | # Let's view help on parallelize 88 | help(sc.parallelize) 89 | 90 | 91 | # COMMAND ---------- 92 | 93 | # Section 3B Continued 94 | # Let's see what type sc.parallelize() returned 95 | print 'type of xrangeRDD: {0}'.format(type(xrangeRDD)) 96 | 97 | # How about if we use a range 98 | dataRange = range(1, 10001) 99 | rangeRDD = sc.parallelize(dataRange, 8) 100 | print 'type of dataRangeRDD: {0}'.format(type(rangeRDD)) 101 | 102 | # Each RDD gets a unique ID 103 | print 'xrangeRDD id: {0}'.format(xrangeRDD.id()) 104 | print 'rangeRDD id: {0}'.format(rangeRDD.id()) 105 | 106 | # We can name each newly created RDD using the setName() method 107 | xrangeRDD.setName('My first RDD') 108 | 109 | # Let's view the lineage (the set of transformations) of the RDD using toDebugString() 110 | print xrangeRDD.toDebugString() 111 | 112 | 113 | # COMMAND ---------- 114 | 115 | # Section 3B Continued 116 | # Let's use help to see what methods we can call on this RDD 117 | help(xrangeRDD) 118 | 119 | 120 | # COMMAND ---------- 121 | 122 | # Section 3B Continued 123 | 124 | # Let's see how many partitions the RDD will be split into by using the getNumPartitions() 125 | xrangeRDD.getNumPartitions() 126 | 127 | 128 | # COMMAND ---------- 129 | 130 | # Section 3C 131 | # Create sub function to subtract 1 132 | def sub(value): 133 | """"Subtracts one from `value`. 134 | 135 | Args: 136 | value (int): A number. 137 | 138 | Returns: 139 | int: `value` minus one. 140 | """ 141 | return (value - 1) 142 | 143 | # Transform xrangeRDD through map transformation using sub function 144 | # Because map is a transformation and Spark uses lazy evaluation, no jobs, stages, 145 | # or tasks will be launched when we run this code. 
146 | subRDD = xrangeRDD.map(sub) 147 | 148 | # Let's see the RDD transformation hierarchy 149 | print subRDD.toDebugString() 150 | 151 | 152 | 153 | # COMMAND ---------- 154 | 155 | # Section 3D 156 | # Let's collect the data 157 | print subRDD.collect() 158 | 159 | 160 | # COMMAND ---------- 161 | 162 | # Section 3D Continued 163 | print xrangeRDD.count() 164 | print subRDD.count() 165 | 166 | 167 | # COMMAND ---------- 168 | 169 | # Section 3E 170 | # Define a function to filter a single value 171 | def ten(value): 172 | """Return whether value is below ten. 173 | 174 | Args: 175 | value (int): A number. 176 | 177 | Returns: 178 | bool: Whether `value` is less than ten. 179 | """ 180 | if (value < 10): 181 | return True 182 | else: 183 | return False 184 | # The ten function could also be written concisely as: def ten(value): return value < 10 185 | 186 | # Pass the function ten to the filter transformation 187 | # Filter is a transformation so no tasks are run 188 | filteredRDD = subRDD.filter(ten) 189 | 190 | # View the results using collect() 191 | # Collect is an action and triggers the filter transformation to run 192 | print filteredRDD.collect() 193 | 194 | 195 | # COMMAND ---------- 196 | 197 | # Section 4 198 | 199 | lambdaRDD = subRDD.filter(lambda x: x < 10) 200 | lambdaRDD.collect() 201 | 202 | # Let's collect the even values less than 10 203 | evenRDD = lambdaRDD.filter(lambda x: x % 2 == 0) 204 | evenRDD.collect() 205 | 206 | # COMMAND ---------- 207 | 208 | # Section 5A 209 | 210 | # Let's get the first element 211 | print filteredRDD.first() 212 | # The first 4 213 | print filteredRDD.take(4) 214 | # Note that it is ok to take more elements than the RDD has 215 | print filteredRDD.take(12) 216 | 217 | # Retrieve the three smallest elements 218 | print filteredRDD.takeOrdered(3) 219 | # Retrieve the five largest elements 220 | print filteredRDD.top(5) 221 | 222 | # Pass a lambda function to takeOrdered to reverse the order 223 | filteredRDD.takeOrdered(4, lambda s: -s) 224 | 225 | # Obtain Python's add function 226 | from operator import add 227 | # Efficiently sum the RDD using reduce 228 | print filteredRDD.reduce(add) 229 | # Sum using reduce with a lambda function 230 | print filteredRDD.reduce(lambda a, b: a + b) 231 | # Note that subtraction is not both associative and commutative 232 | print filteredRDD.reduce(lambda a, b: a - b) 233 | print filteredRDD.repartition(4).reduce(lambda a, b: a - b) 234 | # While addition is 235 | print filteredRDD.repartition(4).reduce(lambda a, b: a + b) 236 | 237 | 238 | # COMMAND ---------- 239 | 240 | # Section 5B 241 | # takeSample reusing elements 242 | print filteredRDD.takeSample(withReplacement=True, num=6) 243 | # takeSample without reuse 244 | print filteredRDD.takeSample(withReplacement=False, num=6) 245 | 246 | # Set seed for predictability 247 | print filteredRDD.takeSample(withReplacement=False, num=6, seed=500) 248 | # Try reruning this cell and the cell above -- the results from this cell will remain constant 249 | # Use ctrl-enter to run without moving to the next cell 250 | 251 | # Create new base RDD to show countByValue 252 | repetitiveRDD = sc.parallelize([1, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 3, 3, 4, 5, 4, 6]) 253 | print repetitiveRDD.countByValue() 254 | 255 | 256 | # COMMAND ---------- 257 | 258 | # Section 6A 259 | 260 | # Let's create a new base RDD to work from 261 | wordsList = ['cat', 'elephant', 'rat', 'rat', 'cat'] 262 | wordsRDD = sc.parallelize(wordsList, 4) 263 | 264 | # Use map 265 | 
singularAndPluralWordsRDDMap = wordsRDD.map(lambda x: (x, x + 's')) 266 | # Use flatMap 267 | singularAndPluralWordsRDD = wordsRDD.flatMap(lambda x: (x, x + 's')) 268 | 269 | # View the results 270 | print singularAndPluralWordsRDDMap.collect() 271 | print singularAndPluralWordsRDD.collect() 272 | # View the number of elements in the RDD 273 | print singularAndPluralWordsRDDMap.count() 274 | print singularAndPluralWordsRDD.count() 275 | 276 | simpleRDD = sc.parallelize([2, 3, 4]) 277 | print simpleRDD.map(lambda x: range(1, x)).collect() 278 | print simpleRDD.flatMap(lambda x: range(1, x)).collect() 279 | 280 | 281 | # COMMAND ---------- 282 | 283 | # Section 6B 284 | 285 | pairRDD = sc.parallelize([('a', 1), ('a', 2), ('b', 1)]) 286 | # mapValues only used to improve format for printing 287 | print pairRDD.groupByKey().mapValues(lambda x: list(x)).collect() 288 | 289 | # Different ways to sum by key 290 | print pairRDD.groupByKey().map(lambda (k, v): (k, sum(v))).collect() 291 | # Using mapValues, which is recommended when they key doesn't change 292 | print pairRDD.groupByKey().mapValues(lambda x: sum(x)).collect() 293 | # reduceByKey is more efficient / scalable 294 | print pairRDD.reduceByKey(add).collect() 295 | 296 | 297 | # COMMAND ---------- 298 | 299 | # Section 6C 300 | 301 | # mapPartitions takes a function that takes an iterator and returns an iterator 302 | print wordsRDD.collect() 303 | itemsRDD = wordsRDD.mapPartitions(lambda iterator: [','.join(iterator)]) 304 | print itemsRDD.collect() 305 | 306 | itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: [(index, list(iterator))]) 307 | # We can see that three of the (partitions) workers have one element and the fourth worker has two 308 | # elements, although things may not bode well for the rat... 309 | print itemsByPartRDD.collect() 310 | # Rerun without returning a list (acts more like flatMap) 311 | itemsByPartRDD = wordsRDD.mapPartitionsWithIndex(lambda index, iterator: (index, list(iterator))) 312 | print itemsByPartRDD.collect() 313 | 314 | 315 | # COMMAND ---------- 316 | 317 | # Section 7A 318 | # Name the RDD 319 | filteredRDD.setName('My Filtered RDD') 320 | # Cache the RDD 321 | filteredRDD.cache() 322 | # Is it cached 323 | print filteredRDD.is_cached 324 | 325 | 326 | # COMMAND ---------- 327 | 328 | # Section 7B 329 | # Note that toDebugString also provides storage information 330 | print filteredRDD.toDebugString() 331 | 332 | # If we are done with the RDD we can unpersist it so that its memory can be reclaimed 333 | filteredRDD.unpersist() 334 | # Storage level for a non cached RDD 335 | print filteredRDD.getStorageLevel() 336 | filteredRDD.cache() 337 | # Storage level for a cached RDD 338 | print filteredRDD.getStorageLevel() 339 | 340 | # COMMAND ---------- 341 | 342 | # Section 8A 343 | 344 | def brokenTen(value): 345 | """Incorrect implementation of the ten function. 346 | 347 | Note: 348 | The `if` statement checks an undefined variable `val` instead of `value`. 349 | 350 | Args: 351 | value (int): A number. 352 | 353 | Returns: 354 | bool: Whether `value` is less than ten. 355 | 356 | Raises: 357 | NameError: The function references `val`, which is not available in the local or global 358 | namespace, so a `NameError` is raised. 
359 | """ 360 | if (value < 10): 361 | return True 362 | else: 363 | return False 364 | 365 | brokenRDD = subRDD.filter(brokenTen) 366 | 367 | # Now we'll see the error 368 | brokenRDD.collect() 369 | 370 | 371 | # COMMAND ---------- 372 | 373 | # Section 8C 374 | # Cleaner code through lambda use 375 | subRDD.filter(lambda x: x < 10).collect() 376 | 377 | # Even better by moving our chain of operators into a single line. 378 | sc.parallelize(data).map(lambda y: y - 1).filter(lambda x: x < 10).collect() 379 | 380 | 381 | # COMMAND ---------- 382 | 383 | # Section 8D 384 | (sc 385 | .parallelize(data) 386 | .map(lambda y: y - 1) 387 | .filter(lambda x: x < 10) 388 | .collect()) 389 | 390 | 391 | # COMMAND ---------- 392 | 393 | 394 | -------------------------------------------------------------------------------- /Chapter07/.Rhistory: -------------------------------------------------------------------------------- 1 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 2 | diab_train <- diab[training_index,] 3 | diab_test <- diab[-training_index,] 4 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 5 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 6 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 7 | varImp(rf_model) 8 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 9 | head(predictions) 10 | cf <- confusionMatrix(predictions, diab_test$diabetes) 11 | cf 12 | plot(rf_model) 13 | fourfoldplot(cf$table) 14 | library(doMC) 15 | registerDoMC(cores = 8) 16 | data("PimaIndiansDiabetes2",package = 'mlbench') 17 | diab <- PimaIndiansDiabetes 18 | diab <- knnImputation(diab) 19 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 20 | diab_train <- diab[training_index,] 21 | diab_test <- diab[-training_index,] 22 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 23 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 24 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 25 | varImp(rf_model) 26 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 27 | head(predictions) 28 | cf <- confusionMatrix(predictions, diab_test$diabetes) 29 | cf 30 | plot(rf_model) 31 | fourfoldplot(cf$table) 32 | library(doMC) 33 | registerDoMC(cores = 8) 34 | data("PimaIndiansDiabetes2",package = 'mlbench') 35 | diab <- PimaIndiansDiabetes 36 | diab <- knnImputation(diab) 37 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 38 | diab_train <- diab[training_index,] 39 | diab_test <- diab[-training_index,] 40 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 41 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 42 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 43 | varImp(rf_model) 44 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 45 | head(predictions) 46 | cf <- confusionMatrix(predictions, diab_test$diabetes) 47 | cf 48 | plot(rf_model) 49 | fourfoldplot(cf$table) 50 | library(doMC) 51 | registerDoMC(cores = 8) 52 | data("PimaIndiansDiabetes2",package = 'mlbench') 53 | set.seed(1) 54 | diab <- 
PimaIndiansDiabetes2 55 | diab <- knnImputation(diab) 56 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 57 | diab_train <- diab[training_index,] 58 | diab_test <- diab[-training_index,] 59 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 60 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 61 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 62 | varImp(rf_model) 63 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 64 | head(predictions) 65 | cf <- confusionMatrix(predictions, diab_test$diabetes) 66 | cf 67 | plot(rf_model) 68 | fourfoldplot(cf$table) 69 | library(doMC) 70 | registerDoMC(cores = 8) 71 | data("PimaIndiansDiabetes2",package = 'mlbench') 72 | set.seed(1) 73 | diab <- PimaIndiansDiabetes2 74 | diab <- knnImputation(diab) 75 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 76 | diab_train <- diab[training_index,] 77 | diab_test <- diab[-training_index,] 78 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 79 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 80 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 81 | varImp(rf_model) 82 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 83 | head(predictions) 84 | cf <- confusionMatrix(predictions, diab_test$diabetes) 85 | cf 86 | plot(rf_model) 87 | fourfoldplot(cf$table) 88 | library(doMC) 89 | registerDoMC(cores = 8) 90 | data("PimaIndiansDiabetes2",package = 'mlbench') 91 | set.seed(1) 92 | diab <- PimaIndiansDiabetes 93 | diab <- knnImputation(diab) 94 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 95 | diab_train <- diab[training_index,] 96 | diab_test <- diab[-training_index,] 97 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 98 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 99 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 100 | varImp(rf_model) 101 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 102 | head(predictions) 103 | cf <- confusionMatrix(predictions, diab_test$diabetes) 104 | cf 105 | plot(rf_model) 106 | fourfoldplot(cf$table) 107 | library(doMC) 108 | registerDoMC(cores = 8) 109 | data("PimaIndiansDiabetes2",package = 'mlbench') 110 | set.seed(1) 111 | diab <- PimaIndiansDiabetes 112 | diab <- knnImputation(diab) 113 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 114 | diab_train <- diab[training_index,] 115 | diab_test <- diab[-training_index,] 116 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 117 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 118 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 15, metric = "ROC") 119 | varImp(rf_model) 120 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 121 | head(predictions) 122 | cf <- confusionMatrix(predictions, diab_test$diabetes) 123 | cf 124 | plot(rf_model) 125 | fourfoldplot(cf$table) 126 | library(doMC) 127 | registerDoMC(cores = 
8) 128 | data("PimaIndiansDiabetes2",package = 'mlbench') 129 | set.seed(100) 130 | diab <- PimaIndiansDiabetes 131 | diab <- knnImputation(diab) 132 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 133 | diab_train <- diab[training_index,] 134 | diab_test <- diab[-training_index,] 135 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 136 | rf_model <- train(diabetes ~ ., data = diab_train, method = "glmnet", 137 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 15, metric = "ROC") 138 | varImp(rf_model) 139 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 140 | head(predictions) 141 | cf <- confusionMatrix(predictions, diab_test$diabetes) 142 | cf 143 | plot(rf_model) 144 | fourfoldplot(cf$table) 145 | library(doMC) 146 | registerDoMC(cores = 8) 147 | data("PimaIndiansDiabetes2",package = 'mlbench') 148 | set.seed(100) 149 | diab <- PimaIndiansDiabetes 150 | diab <- knnImputation(diab) 151 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 152 | diab_train <- diab[training_index,] 153 | diab_test <- diab[-training_index,] 154 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 155 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rf", 156 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 15, metric = "ROC") 157 | varImp(rf_model) 158 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 159 | head(predictions) 160 | cf <- confusionMatrix(predictions, diab_test$diabetes) 161 | cf 162 | plot(rf_model) 163 | fourfoldplot(cf$table) 164 | library(doMC) 165 | registerDoMC(cores = 8) 166 | data("PimaIndiansDiabetes2",package = 'mlbench') 167 | set.seed(100) 168 | diab <- PimaIndiansDiabetes2 169 | diab <- knnImputation(diab) 170 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 171 | diab_train <- diab[training_index,] 172 | diab_test <- diab[-training_index,] 173 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 174 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rf", 175 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 176 | varImp(rf_model) 177 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 178 | head(predictions) 179 | cf <- confusionMatrix(predictions, diab_test$diabetes) 180 | cf 181 | plot(rf_model) 182 | fourfoldplot(cf$table) 183 | plot(rf_model) 184 | plot(rf_model, plotType = "level") 185 | library(doMC) 186 | registerDoMC(cores = 8) 187 | data("PimaIndiansDiabetes2",package = 'mlbench') 188 | set.seed(100) 189 | diab <- PimaIndiansDiabetes2 190 | diab <- knnImputation(diab) 191 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 192 | diab_train <- diab[training_index,] 193 | diab_test <- diab[-training_index,] 194 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 195 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rpart", 196 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 197 | varImp(rf_model) 198 | predictions <- predict(rf_model, 
diab_test[,-ncol(diab_test)]) 199 | head(predictions) 200 | cf <- confusionMatrix(predictions, diab_test$diabetes) 201 | cf 202 | plot(rf_model) 203 | fourfoldplot(cf$table) 204 | library(doMC) 205 | registerDoMC(cores = 8) 206 | data("PimaIndiansDiabetes2",package = 'mlbench') 207 | set.seed(100) 208 | diab <- PimaIndiansDiabetes2 209 | diab <- knnImputation(diab) 210 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 211 | diab_train <- diab[training_index,] 212 | diab_test <- diab[-training_index,] 213 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 214 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rpart", 215 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "ROC") 216 | varImp(rf_model) 217 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 218 | head(predictions) 219 | cf <- confusionMatrix(predictions, diab_test$diabetes) 220 | cf 221 | plot(rf_model) 222 | fourfoldplot(cf$table) 223 | ?train 224 | rf_model <- train(diabetes ~ ., data = diab_train, method = "xgboost", 225 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 226 | library(doMC) 227 | registerDoMC(cores = 8) 228 | data("PimaIndiansDiabetes2",package = 'mlbench') 229 | set.seed(100) 230 | diab <- PimaIndiansDiabetes2 231 | diab <- knnImputation(diab) 232 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 233 | diab_train <- diab[training_index,] 234 | diab_test <- diab[-training_index,] 235 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 236 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rpart", 237 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 238 | varImp(rf_model) 239 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 240 | head(predictions) 241 | cf <- confusionMatrix(predictions, diab_test$diabetes) 242 | cf 243 | plot(rf_model) 244 | fourfoldplot(cf$table) 245 | library(doMC) 246 | registerDoMC(cores = 8) 247 | data("PimaIndiansDiabetes2",package = 'mlbench') 248 | set.seed(105) 249 | diab <- PimaIndiansDiabetes2 250 | diab <- knnImputation(diab) 251 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 252 | diab_train <- diab[training_index,] 253 | diab_test <- diab[-training_index,] 254 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 255 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rpart", 256 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 257 | varImp(rf_model) 258 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 259 | head(predictions) 260 | cf <- confusionMatrix(predictions, diab_test$diabetes) 261 | cf 262 | plot(rf_model) 263 | fourfoldplot(cf$table) 264 | library(doMC) 265 | registerDoMC(cores = 8) 266 | data("PimaIndiansDiabetes2",package = 'mlbench') 267 | set.seed(105) 268 | diab <- PimaIndiansDiabetes2 269 | diab <- knnImputation(diab) 270 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 271 | diab_train <- diab[training_index,] 272 | diab_test <- diab[-training_index,] 273 | diab_control <- trainControl("repeatedcv", 
number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 274 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rf", 275 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 276 | varImp(rf_model) 277 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 278 | head(predictions) 279 | cf <- confusionMatrix(predictions, diab_test$diabetes) 280 | cf 281 | plot(rf_model) 282 | fourfoldplot(cf$table) 283 | library(doMC) 284 | registerDoMC(cores = 8) 285 | data("PimaIndiansDiabetes2",package = 'mlbench') 286 | set.seed(105) 287 | diab <- PimaIndiansDiabetes2 288 | diab <- knnImputation(diab) 289 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 290 | diab_train <- diab[training_index,] 291 | diab_test <- diab[-training_index,] 292 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 293 | rf_model <- train(diabetes ~ ., data = diab_train, method = "gbm", 294 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 295 | varImp(rf_model) 296 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 297 | head(predictions) 298 | cf <- confusionMatrix(predictions, diab_test$diabetes) 299 | cf 300 | plot(rf_model) 301 | fourfoldplot(cf$table) 302 | rf_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 303 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 304 | varImp(rf_model) 305 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 306 | head(predictions) 307 | cf <- confusionMatrix(predictions, diab_test$diabetes) 308 | cf 309 | plot(rf_model) 310 | fourfoldplot(cf$table) 311 | plot(rf_model) 312 | library(doMC) 313 | registerDoMC(cores = 8) 314 | data("PimaIndiansDiabetes2",package = 'mlbench') 315 | set.seed(100) 316 | diab <- PimaIndiansDiabetes2 317 | diab <- knnImputation(diab) 318 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 319 | diab_train <- diab[training_index,] 320 | diab_test <- diab[-training_index,] 321 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE, summaryFunction = twoClassSummary) 322 | rf_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 323 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "ROC") 324 | varImp(rf_model) 325 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 326 | head(predictions) 327 | cf <- confusionMatrix(predictions, diab_test$diabetes) 328 | cf 329 | plot(rf_model) 330 | fourfoldplot(cf$table) 331 | library(doMC) 332 | registerDoMC(cores = 8) 333 | data("PimaIndiansDiabetes2",package = 'mlbench') 334 | set.seed(100) 335 | diab <- PimaIndiansDiabetes2 336 | diab <- knnImputation(diab) 337 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 338 | diab_train <- diab[training_index,] 339 | diab_test <- diab[-training_index,] 340 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "grid", classProbs = TRUE) 341 | rf_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 342 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "Accuracy") 343 | varImp(rf_model) 344 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 345 | head(predictions) 346 | cf <- 
confusionMatrix(predictions, diab_test$diabetes) 347 | cf 348 | plot(rf_model) 349 | fourfoldplot(cf$table) 350 | library(doMC) 351 | registerDoMC(cores = 8) 352 | data("PimaIndiansDiabetes2",package = 'mlbench') 353 | set.seed(100) 354 | diab <- PimaIndiansDiabetes2 355 | diab <- knnImputation(diab) 356 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 357 | diab_train <- diab[training_index,] 358 | diab_test <- diab[-training_index,] 359 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 360 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 361 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "Accuracy") 362 | varImp(nn_model) 363 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 364 | head(predictions) 365 | cf <- confusionMatrix(predictions, diab_test$diabetes) 366 | cf 367 | plot(nn_model) 368 | fourfoldplot(cf$table) 369 | library(doMC) 370 | registerDoMC(cores = 8) 371 | data("PimaIndiansDiabetes2",package = 'mlbench') 372 | set.seed(100) 373 | diab <- PimaIndiansDiabetes2 374 | diab <- knnImputation(diab) 375 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 376 | diab_train <- diab[training_index,] 377 | diab_test <- diab[-training_index,] 378 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 379 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 380 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 7, metric = "Accuracy") 381 | varImp(nn_model) 382 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 383 | head(predictions) 384 | cf <- confusionMatrix(predictions, diab_test$diabetes) 385 | cf 386 | plot(nn_model) 387 | fourfoldplot(cf$table) 388 | library(doMC) 389 | registerDoMC(cores = 8) 390 | data("PimaIndiansDiabetes2",package = 'mlbench') 391 | set.seed(100) 392 | diab <- PimaIndiansDiabetes2 393 | diab <- knnImputation(diab) 394 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 395 | diab_train <- diab[training_index,] 396 | diab_test <- diab[-training_index,] 397 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 398 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 399 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 400 | varImp(nn_model) 401 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 402 | head(predictions) 403 | cf <- confusionMatrix(predictions, diab_test$diabetes) 404 | cf 405 | plot(nn_model) 406 | fourfoldplot(cf$table) 407 | library(doMC) 408 | registerDoMC(cores = 8) 409 | data("PimaIndiansDiabetes2",package = 'mlbench') 410 | set.seed(100) 411 | diab <- PimaIndiansDiabetes2 412 | diab <- knnImputation(diab) 413 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 414 | diab_train <- diab[training_index,] 415 | diab_test <- diab[-training_index,] 416 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 417 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 418 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 15, metric = "Accuracy") 419 | varImp(nn_model) 420 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 421 | 
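# (Editorial aside, not part of the recorded console history: the commands above refit an
# almost identical pipeline while only swapping the `method` argument -- glmnet, rf, rpart,
# gbm, nnet. A more compact way to compare such candidates, sketched below under the same
# assumptions as this history (caret loaded, diab_train and diab_control defined as above),
# is caret's resamples(). The object names glmnet_fit and rf_fit are placeholders
# introduced only for this illustration.)
# set.seed(100)
# glmnet_fit <- train(diabetes ~ ., data = diab_train, method = "glmnet",
#                     preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10)
# set.seed(100)
# rf_fit <- train(diabetes ~ ., data = diab_train, method = "rf",
#                 preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10)
# model_comparison <- resamples(list(glmnet = glmnet_fit, rf = rf_fit))
# summary(model_comparison)  # resampled Accuracy/Kappa for each model, side by side
# bwplot(model_comparison)   # box-and-whisker comparison of the resampling distributions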
head(predictions) 422 | cf <- confusionMatrix(predictions, diab_test$diabetes) 423 | cf 424 | plot(nn_model) 425 | fourfoldplot(cf$table) 426 | library(doMC) 427 | registerDoMC(cores = 8) 428 | data("PimaIndiansDiabetes2",package = 'mlbench') 429 | set.seed(100) 430 | diab <- PimaIndiansDiabetes2 431 | diab <- knnImputation(diab) 432 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 433 | diab_train <- diab[training_index,] 434 | diab_test <- diab[-training_index,] 435 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 436 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 437 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 438 | varImp(nn_model) 439 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 440 | head(predictions) 441 | cf <- confusionMatrix(predictions, diab_test$diabetes) 442 | cf 443 | plot(nn_model) 444 | fourfoldplot(cf$table) 445 | library(doMC) 446 | registerDoMC(cores = 8) 447 | data("PimaIndiansDiabetes2",package = 'mlbench') 448 | set.seed(100) 449 | diab <- PimaIndiansDiabetes 450 | diab <- knnImputation(diab) 451 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 452 | diab_train <- diab[training_index,] 453 | diab_test <- diab[-training_index,] 454 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 455 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 456 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 457 | varImp(nn_model) 458 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 459 | head(predictions) 460 | cf <- confusionMatrix(predictions, diab_test$diabetes) 461 | cf 462 | plot(nn_model) 463 | fourfoldplot(cf$table) 464 | diab <- PimaIndiansDiabetes2 465 | length(is.na(diab)) 466 | sum(is.na(diab)) 467 | is.na(diab) 468 | anyNA(diab) 469 | is.na(data.frame(a=c(1,NA),b=c(NA,NA))) 470 | sum(is.na(data.frame(a=c(1,NA),b=c(NA,NA)))) 471 | sum(is.na(diab)) 472 | library(doMC) 473 | registerDoMC(cores = 8) 474 | data("PimaIndiansDiabetes2",package = 'mlbench') 475 | set.seed(100) 476 | diab <- PimaIndiansDiabetes2 477 | diab <- knnImputation(diab) 478 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 479 | diab_train <- diab[training_index,] 480 | diab_test <- diab[-training_index,] 481 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 482 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 483 | trControl = diab_control, tuneLength = 10, metric = "Accuracy") 484 | varImp(nn_model) 485 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 486 | head(predictions) 487 | cf <- confusionMatrix(predictions, diab_test$diabetes) 488 | cf 489 | plot(nn_model) 490 | fourfoldplot(cf$table) 491 | library(doMC) 492 | registerDoMC(cores = 8) 493 | data("PimaIndiansDiabetes2",package = 'mlbench') 494 | set.seed(100) 495 | diab <- PimaIndiansDiabetes2 496 | diab <- knnImputation(diab) 497 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 498 | diab_train <- diab[training_index,] 499 | diab_test <- diab[-training_index,] 500 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 501 | nn_model <- train(diabetes ~ ., 
data = diab_train, method = "nnet", 502 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 503 | varImp(nn_model) 504 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 505 | head(predictions) 506 | cf <- confusionMatrix(predictions, diab_test$diabetes) 507 | cf 508 | plot(nn_model) 509 | fourfoldplot(cf$table) 510 | cf 511 | plot(nn_model) 512 | fourfoldplot(cf$table) 513 | -------------------------------------------------------------------------------- /Chapter07/chapter7_R_code.txt: -------------------------------------------------------------------------------- 1 | 2 | # Chapter 7 3 | 4 | # Centering & Scaling 5 | # --- 6 | scores <- c(45,66,66,55,55,52,61,64,65,68); scale(scores) 7 | 8 | 9 | # nearZeroVar 10 | # --- 11 | library(caret) repeated <- c(rep(100,9999),10) # 9999 values are 100 and the last value is 10 random <- sample(100,10000,T) # 10,000 random values from 1 – 100 data <- data.frame(random = random, repeated = repeated) nearZeroVar(data) # [1] 2 names(data)[nearZeroVar(data)] # [1] "repeated" 12 | 13 | 14 | # Correlations 15 | # --- 16 | install.packages("mlbench") install.packages("corrplot") library(corrplot) library(mlbench) diab <- PimaIndiansDiabetes # To produce a correlogram corrplot(cor(diab[,-ncol(diab)]), method="color", type="upper") # To get the actual numbers corrplot(cor(diab[,-ncol(diab)]), method="number", type="upper") correlated_columns <- findCorrelation(cor(diab[,-ncol(diab)]), cutoff = 0.5) correlated_columns 17 | 18 | 19 | 20 | # Data Sampling 21 | # --- 22 | library(mlbench) library(caret) diab <- PimaIndiansDiabetes diabsim <- diab diabrows <- nrow(diabsim) negrows <- floor(.95 * diabrows) posrows <- (diabrows - negrows) negrows # [1] 729 posrows # [1] 39 diabsim$diabetes[1:729] <- as.factor("neg") diabsim$diabetes[-c(1:729)] <- as.factor("pos") table(diabsim$diabetes) # neg pos # 729 39 # We observe that in this simulated dataset, we have 729 occurrences of the negative outcome and 39 occurrences of the positive outcome # Method 1: Upsampling, i.e., increasing the number of observations marked as ‘pos’ (i.e., positive) upsampled_simdata <- upSample(diabsim[,-ncol(diabsim)], diabsim$diabetes) table(upsampled_simdata$Class) # neg pos # 729 729 # NOTE THAT THE OUTCOME COLUMN IS CALLED ‘Class’ and not ‘diabetes’ # This is because the outcome is passed to upSample as a separate vector # We can always rename the column to revert to the original name # Method 2: Downsampling, i.e., reducing the number of observations marked as ‘neg’ (i.e., negative) downsampled_simdata <- downSample(diabsim[,-ncol(diabsim)], diabsim$diabetes) table(downsampled_simdata$Class) # neg pos # 39 39 23 | 24 | 25 | # SMOTE 26 | # --- 27 | 28 | # Method 3: SMOTE # The function SMOTE is available in the R Package DMwR # In order to use it, we first need to install DMwR as follows install.packages("DMwR") # Once the package has been installed, we will create a synthetic # Dataset in which we will increase the number of ‘pos’ records # Let us check once again the distribution of neg/pos in the dataset table(diabsim$diabetes) # neg pos # 729 39 # Using SMOTE we can create synthetic cases of ‘pos’ as follows diabsyn <- SMOTE(diabetes ~ ., diabsim, perc.over = 500, perc.under = 150) # perc.over = 500 means, increase the occurrence of the minority # class by 500%, i.e., 39 + 5*39 = 39 + 195 = 234 # perc.under = 150 means, that for each new record generated for the # Minority class, we will generate 1.5 cases of the majority class # In this case, we created
195 new records (500% of 39) and hence # we will generate 150% of 195 records = 195 * 150% = 195 * 1.5 # = 292.5, or 292 (rounded down) new records # We can verify this by running the table command against the newly # Created synthetic dataset, diabsyn table(diabsyn$diabetes) # neg pos # 292 234 29 | 30 | # ROSE 31 | # --- 32 | install.packages("ROSE") library(ROSE) # Loaded ROSE 0.0-3 set.seed(1) diabsyn2 <- ROSE(diabetes ~ ., data=diabsim) table(diabsyn2$data$diabetes) # neg pos # 395 373 33 | 34 | # Data Imputation 35 | # --- 36 | 37 | library(DMwR) library(caret) diab <- PimaIndiansDiabetes # In the dataset, the column mass represents the body mass index # Of the individuals represented in the corresponding row # mass: Body mass index (weight in kg/(height in m)\^2) # Creating a backup of the diabetes dataframe diabmiss_orig <- diab # Creating a separate dataframe which we will modify diabmiss <- diabmiss_orig # Saving the original values for body mass actual <- diabmiss_orig$mass # Change 91 values of mass to NA in the dataset diabmiss$mass[10:100] <- NA # Number of missing values in mass sum(is.na(diabmiss$mass)) # 91 # View the missing values diabmiss[5:15,] # Test with using the mean, we will set all the missing values # To the mean value for the column diabmiss$mass[is.na(diabmiss$mass)] <- mean(diabmiss$mass,na.rm = TRUE) # Check the values that have been imputed data.frame(actual=actual[10:100], impute_with_mean=diabmiss$mass[10:100]) # Check the Root-Mean-Squared-Error for the entire column # Root Mean Squared Error provides an estimate for the # Difference between the actual and the predicted values # On ‘average’ diabmissdf <- data.frame(actual=actual, impute_with_mean=diabmiss$mass) rmse1 <- RMSE(diabmissdf$impute_with_mean,actual) rmse1 # [1] 3.417476 # We will re-run the exercise using knnImputation (from package DMwR) # Change the value of the records back to NA diabmiss <- diabmiss_orig diabmiss$mass[10:100] <- NA # Perform knnImputation diabknn <- knnImputation(diabmiss,k=25) # Check the RMSE value for the knnImputation method rmse2 <- RMSE(diabknn$mass,actual) rmse2 # [1] 3.093827 # Improvement using the knnImputation methods in percentage terms 100 * (rmse1-rmse2)/rmse1 [1] 22.20689 38 | 39 | 40 | 41 | # createDataPartition 42 | # --- 43 | 44 | diab <- PimaIndiansDiabetes 45 | 46 | # We will use the createDataPartition function from caret to split 47 | # The data. 
The function produces a set of indices using which we 48 | # will create the corresponding training and test sets 49 | 50 | training_index <- createDataPartition(diab$diabetes, p = 0.80, list = FALSE, times = 1) 51 | 52 | # Creating the training set 53 | diab_train <- diab[training_index,] 54 | 55 | # Create the test set 56 | diab_test <- diab[-training_index,] 57 | 58 | # Create the trainControl parameters for the model 59 | diab_control <- trainControl("repeatedcv", number = 3, repeats = 2, classProbs = TRUE, summaryFunction = twoClassSummary) 60 | 61 | # Build the model 62 | rf_model <- train(diabetes ~ ., data = diab_train, method = "rf", 63 | preProc = c("center", "scale"), tuneLength = 5, trControl = diab_control, metric = "ROC") 64 | 65 | # Find the Variable Importance 66 | varImp(rf_model) 67 | 68 | # rf variable importance 69 | # 70 | # Overall 71 | # glucose 100.000 72 | # mass 52.669 73 | # age 39.230 74 | # pedigree 24.885 75 | # pressure 12.619 76 | # pregnant 6.919 77 | # insulin 2.294 78 | # triceps 0.000 79 | 80 | # This indicates that glucose levels, body mass index and age are the top 3 predictors of diabetes. 81 | 82 | # caret also includes several useful plot functions. We can visualize the variable importance using the command: 83 | 84 | plot(varImp(rf_model)) 85 | 86 | 87 | # Cross Validation 88 | # --- 89 | # Create the trainControl parameters for the model # The parameters indicate that a 3-Fold CV would be created # and that the process would be repeated 2 times (repeats) # The class probabilities in each run will be stored # And we’ll use the twoClassSummary* function to measure the model # Performance diab_control <- trainControl("repeatedcv", number = 3, repeats = 2, classProbs = TRUE, summaryFunction = twoClassSummary) # Build the model # We used the train function of caret to build the model # As part of the training process, we specified a tunelength** of 5 # This parameter lets caret select a set of default model parameters # trControl = diab_control indicates that the model will be built # Using the cross-validation method specified in diab_control # Finally preProc = c("center", "scale") indicate that the data # Would be centered and scaled at each pass of the model iteration rf_model <- train(diabetes ~ ., data = diab_train, method = "rf", preProc = c("center", "scale"), tuneLength = 5, trControl = diab_control, metric = "ROC") 90 | 91 | 92 | # Create Model, Get Predictions 93 | # --- 94 | 95 | # Install the R Package e1071, if you haven’t already 96 | # By running install.packages("e1071") 97 | 98 | # Use the predict function and the rf_model that was previously built 99 | # To get the predictions on the test dataset 100 | # Note that we are not including the column diabetes in the test 101 | # dataset by using diab_test[,-ncol(diab_test)] 102 | 103 | predictions <- predict(rf_model, diab_test[,-ncol(diab_test)]) 104 | 105 | # First few records predicted 106 | 107 | head(predictions) 108 | # [1] neg neg pos pos pos pos 109 | Levels: neg pos 110 | 111 | # The confusion matrix allows us to see the number of true positives 112 | # False positives, True negatives and False negatives 113 | 114 | cf <- confusionMatrix(predictions, diab_test$diabetes) 115 | cf 116 | 117 | # Confusion Matrix and Statistics 118 | # 119 | # Reference 120 | # Prediction neg pos 121 | # neg 89 21 122 | # pos 11 32 123 | # 124 | # Accuracy : 0.7908 125 | # 95% CI : (0.7178, 0.8523) 126 | # No Information Rate : 0.6536 127 | # P-Value [Acc > NIR] : 0.0001499 128 | # 129 | # Kappa : 
0.5167 130 | # Mcnemar's Test P-Value : 0.1116118 131 | # 132 | # Sensitivity : 0.8900 133 | # Specificity : 0.6038 134 | # Pos Pred Value : 0.8091 135 | # Neg Pred Value : 0.7442 136 | # Prevalence : 0.6536 137 | # Detection Rate : 0.5817 138 | # Detection Prevalence : 0.7190 139 | # Balanced Accuracy : 0.7469 140 | # 141 | # 'Positive' Class : neg 142 | 143 | # Let us check what the confusion matrix tells us 144 | # This indicates that of the records that were marked negative (neg) 145 | # We predicted 89 of them as negative and 11 as positive (i.e., they 146 | # were negative but we incorrectly classified them as a positive 147 | 148 | # We correctly identified 32 positives but incorrectly classified 149 | # 21 positives as negative 150 | 151 | # 152 | # Reference 153 | # Prediction neg pos 154 | # neg 89 21 155 | # pos 11 32 156 | 157 | # The overall accuracy was 79% 158 | # This can be improved (significantly) by using more 159 | # Accuracy : 0.7908 160 | 161 | # We can plot the model using plot(rf_model) as follows 162 | plot(rf_model) 163 | 164 | 165 | # And finally we can also visualize our confusion matrix using the 166 | # inbuilt fourfoldplot function in R 167 | 168 | fourfoldplot(cf$table) 169 | 170 | 171 | # Tutorial 172 | # --- 173 | library(doMC) 174 | registerDoMC(cores = 8) 175 | 176 | data("PimaIndiansDiabetes2",package = 'mlbench') 177 | set.seed(100) 178 | diab <- PimaIndiansDiabetes2 179 | 180 | diab <- knnImputation(diab) 181 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 182 | 183 | diab_train <- diab[training_index,] 184 | diab_test <- diab[-training_index,] 185 | 186 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 187 | 188 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 189 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 190 | 191 | 192 | varImp(nn_model) 193 | 194 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 195 | head(predictions) 196 | 197 | cf <- confusionMatrix(predictions, diab_test$diabetes) 198 | cf 199 | 200 | plot(nn_model) 201 | fourfoldplot(cf$table) 202 | 203 | 204 | -------------------------------------------------------------------------------- /Chapter07/tutorial.R: -------------------------------------------------------------------------------- 1 | library(doMC) 2 | registerDoMC(cores = 8) 3 | 4 | data("PimaIndiansDiabetes2",package = 'mlbench') 5 | set.seed(100) 6 | diab <- PimaIndiansDiabetes2 7 | 8 | diab <- knnImputation(diab) 9 | training_index <- createDataPartition(diab$diabetes, p = .8, list = FALSE, times = 1) 10 | 11 | diab_train <- diab[training_index,] 12 | diab_test <- diab[-training_index,] 13 | 14 | diab_control <- trainControl("repeatedcv", number = 10, repeats = 3, search = "random", classProbs = TRUE) 15 | 16 | nn_model <- train(diabetes ~ ., data = diab_train, method = "nnet", 17 | preProc = c("center", "scale"), trControl = diab_control, tuneLength = 10, metric = "Accuracy") 18 | 19 | 20 | varImp(nn_model) 21 | 22 | predictions <- predict(nn_model, diab_test[,-ncol(diab_test)]) 23 | head(predictions) 24 | 25 | cf <- confusionMatrix(predictions, diab_test$diabetes) 26 | cf 27 | 28 | plot(nn_model) 29 | fourfoldplot(cf$table) 30 | 31 | 32 | -------------------------------------------------------------------------------- /Chapter07/world_gdp.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter07/world_gdp.csv -------------------------------------------------------------------------------- /Chapter07/world_gdp_per_capita.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter07/world_gdp_per_capita.csv -------------------------------------------------------------------------------- /Chapter07/world_life_expectancy.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter07/world_life_expectancy.csv -------------------------------------------------------------------------------- /Chapter07/world_population.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter07/world_population.csv -------------------------------------------------------------------------------- /Chapter08/Chapter8.R: -------------------------------------------------------------------------------- 1 | # Chapter 8: Packt 2 | # Regression 3 | library("mlbench") 4 | data("PimaIndiansDiabetes") 5 | lm_model <- lm(glucose ~ pressure + triceps + insulin, data=PimaIndiansDiabetes[1:100,]) 6 | plot(lm_model) 7 | 8 | # Decision Trees 9 | install.packages("rpart") 10 | install.packages("rpart.plot") 11 | 12 | library(rpart) 13 | library(rpart.plot) 14 | 15 | rpart_model <- rpart (diabetes ~ glucose + insulin + mass + age, data = PimaIndiansDiabetes) 16 | rpart_model 17 | 18 | # Fast & Frugal Decision Trees 19 | 20 | install.packages("FFTrees") 21 | library(caret) 22 | library(mlbench) 23 | library(FFTrees) 24 | set.seed(123) 25 | 26 | data("PimaIndiansDiabetes") 27 | diab <- PimaIndiansDiabetes 28 | diab$diabetes <- 1 * (diab$diabetes=="pos") 29 | 30 | train_ind <- createDataPartition(diab$diabetes,p=0.8,list=FALSE,times=1) 31 | 32 | training_diab <- diab[train_ind,] 33 | test_diab <- diab[-train_ind,] 34 | 35 | diabetes.fft <- FFTrees(diabetes ~.,data = training_diab,data.test = test_diab) 36 | plot(diabetes.fft) 37 | 38 | # Random Forest 39 | 40 | rf_model1 <- randomForest(diabetes ~ ., data=PimaIndiansDiabetes) 41 | rf_model1 42 | 43 | library(caret) 44 | library(doMC) 45 | 46 | # THE NEXT STEP IS VERY CRITICAL – YOU DO ‘NOT’ NEED TO USE MULTICORE 47 | # NOTE THAT THIS WILL USE ALL THE CORES ON THE MACHINE THAT YOU ARE 48 | # USING TO RUN THE EXERCISE 49 | 50 | # REMOVE THE # MARK FROM THE FRONT OF registerDoMC BEFORE RUNNING 51 | # THE COMMAND 52 | 53 | # registerDoMC(cores = 8) # CHANGE NUMBER OF CORES TO MATCH THE NUMBER OF CORES ON YOUR MACHINE 54 | 55 | rf_model <- train(diabetes ~ ., data=PimaIndiansDiabetes, method="rf") 56 | rf_model 57 | 58 | getTrainPerf(rf_model) 59 | 60 | # Boosting - eXtreme Gradient Boosting 61 | library(caret) 62 | library(xgboost) 63 | 64 | set.seed(123) 65 | train_ind <- sample(nrow(PimaIndiansDiabetes),as.integer(nrow(PimaIndiansDiabetes)*.80)) 66 | 67 | training_diab <- PimaIndiansDiabetes[train_ind,] 68 | test_diab <- PimaIndiansDiabetes[-train_ind,] 69 | 70 | diab_train <- sparse.model.matrix(~.-1, data=training_diab[,-ncol(training_diab)]) 71 | diab_train_dmatrix <- xgb.DMatrix(data = 
diab_train, label=training_diab$diabetes=="pos") 72 | 73 | diab_test <- sparse.model.matrix(~.-1, data=test_diab[,-ncol(test_diab)]) 74 | diab_test_dmatrix <- xgb.DMatrix(data = diab_test, label=test_diab$diabetes=="pos") 75 | 76 | 77 | 78 | param_diab <- list(objective = "binary:logistic", 79 | eval_metric = "error", 80 | booster = "gbtree", 81 | max_depth = 5, 82 | eta = 0.1) 83 | 84 | xgb_model <- xgb.train(data = diab_train_dmatrix, 85 | param_diab, nrounds = 1000, 86 | watchlist = list(train = diab_train_dmatrix, test = diab_test_dmatrix), 87 | print_every_n = 10) 88 | 89 | 90 | predicted <- predict(xgb_model, diab_test_dmatrix) 91 | predicted <- predicted > 0.5 92 | 93 | actual <- test_diab$diabetes == "pos" 94 | confusionMatrix(actual,predicted) 95 | 96 | # Support Vector Machines 97 | 98 | 99 | library(mlbench) 100 | library(caret) 101 | library(e1071) 102 | set.seed(123) 103 | 104 | 105 | data("PimaIndiansDiabetes") 106 | diab <- PimaIndiansDiabetes 107 | 108 | train_ind <- createDataPartition(diab$diabetes,p=0.8,list=FALSE,times=1) 109 | 110 | training_diab <- diab[train_ind,] 111 | test_diab <- diab[-train_ind,] 112 | 113 | svm_model <- svm(diabetes ~ ., data=training_diab) 114 | plot(svm_model,training_diab, glucose ~ mass) 115 | 116 | svm_predicted <- predict(svm_model,test_diab[,-ncol(test_diab)]) 117 | confusionMatrix(svm_predicted,test_diab$diabetes) 118 | 119 | # K-Means 120 | 121 | library(data.table) 122 | library(ggplot2) 123 | library() 124 | 125 | historyData <- fread("history.csv") # Change to your appropriate location 126 | ggplot(historyData,aes(american_history,asian_history)) + geom_point() + geom_jitter() 127 | 128 | historyCluster <- kmeans(historyData,2) # Create 2 clusters 129 | historyData[,cluster:=as.factor(historyCluster$cluster)] 130 | ggplot(historyData, aes(american_history,asian_history,color=cluster)) + geom_point() + geom_jitter() 131 | 132 | # Neural Network 133 | library(mlbench) 134 | library(caret) 135 | set.seed(123) 136 | 137 | data("PimaIndiansDiabetes") 138 | diab <- PimaIndiansDiabetes 139 | 140 | train_ind <- createDataPartition(diab$diabetes,p=0.8,list=FALSE,times=1) 141 | training_diab <- diab[train_ind,] 142 | test_diab <- diab[-train_ind,] 143 | 144 | nnet_grid <- expand.grid(.decay = c(0.5,0.1), .size = c(3,5,7)) 145 | nnet_model <- train(diabetes ~ ., data = training_diab, method = "nnet", metric = "Accuracy", maxit = 500, tuneGrid = nnet_grid) 146 | nnet_predicted <- predict(nnet_model, test_diab) 147 | 148 | confusionMatrix(nnet_predicted,test_diab$diabetes) 149 | 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /Chapter08/Regularisation.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/Regularisation.xlsx -------------------------------------------------------------------------------- /Chapter08/history.csv: -------------------------------------------------------------------------------- 1 | american_history,asian_history 2 | 3,10 3 | 4,7 4 | 4,8 5 | 4,9 6 | 4,10 7 | 5,6 8 | 4,10 9 | 5,10 10 | 5,7 11 | 4,10 12 | 3,8 13 | 3,9 14 | 4,10 15 | 3,9 16 | 3,10 17 | 5,9 18 | 5,8 19 | 5,7 20 | 5,7 21 | 5,6 22 | 3,6 23 | 3,9 24 | 4,7 25 | 5,10 26 | 5,7 27 | 5,9 28 | 4,9 29 | 5,8 30 | 4,8 31 | 5,6 32 | 4,7 33 | 3,7 34 | 5,6 35 | 4,6 36 | 3,7 37 | 3,7 38 | 4,8 39 | 5,7 40 | 3,6 41 | 3,9 42 | 3,10 43 | 4,10 44 | 4,9 45 | 
5,10 46 | 4,10 47 | 3,8 48 | 5,6 49 | 3,7 50 | 4,6 51 | 4,7 52 | 8,2 53 | 8,3 54 | 10,6 55 | 8,3 56 | 10,3 57 | 8,4 58 | 10,3 59 | 10,3 60 | 10,5 61 | 9,6 62 | 10,3 63 | 8,2 64 | 8,2 65 | 8,3 66 | 9,3 67 | 8,6 68 | 2,1 69 | 1,3 70 | 3,3 -------------------------------------------------------------------------------- /Chapter08/rulespackt/app.R: -------------------------------------------------------------------------------- 1 | # Packt: Big Data Analytics 2 | # Chapter 8 Tutorial 3 | 4 | library(shiny) 5 | library(shinydashboard) 6 | library(data.table) 7 | library(DT) 8 | library(shinyjs) 9 | 10 | 11 | cms_factor_dt <- readRDS("~/r/rulespackt/cms_factor_dt.rds") 12 | cms_rules_dt <- readRDS("~/r/rulespackt/cms_rules_dt.rds") 13 | 14 | # Define UI for application that draws a histogram 15 | ui <- dashboardPage (skin="green", 16 | dashboardHeader(title = "Apriori Algorithm"), 17 | dashboardSidebar( 18 | useShinyjs(), 19 | sidebarMenu( 20 | uiOutput("company"), 21 | uiOutput("searchlhs"), 22 | uiOutput("searchrhs"), 23 | uiOutput("support2"), 24 | uiOutput("confidence"), 25 | uiOutput("lift"), 26 | downloadButton('downloadMatchingRules', "Download Rules") 27 | 28 | ) 29 | ),dashboardBody( 30 | tags$head( 31 | tags$link(rel = "stylesheet", type = "text/css", href = "packt2.css"), 32 | tags$link(rel = "stylesheet", type = "text/css", href = "//fonts.googleapis.com/css?family=Fanwood+Text"), 33 | tags$link(rel = "stylesheet", type = "text/css", href = "//fonts.googleapis.com/css?family=Varela"), 34 | tags$link(rel = "stylesheet", type = "text/css", href = "fonts.css"), 35 | 36 | tags$style(type="text/css", "select { max-width: 200px; }"), 37 | tags$style(type="text/css", "textarea { max-width: 185px; }"), 38 | tags$style(type="text/css", ".jslider { max-width: 200px; }"), 39 | tags$style(type='text/css', ".well { max-width: 250px; padding: 10px; font-size: 8px}"), 40 | tags$style(type='text/css', ".span4 { max-width: 250px; }") 41 | 42 | 43 | ), 44 | 45 | fluidRow( 46 | dataTableOutput("result") 47 | 48 | ) 49 | ), 50 | title = "Aprior Algorithm" 51 | ) 52 | 53 | 54 | 55 | # Define server logic required to draw a histogram 56 | server <- function(input, output, session) { 57 | 58 | PLACEHOLDERLIST2 <- list( 59 | placeholder = 'Select All', 60 | onInitialize = I('function() { this.setValue(""); }') 61 | ) 62 | 63 | output$company <- renderUI({ 64 | datasetList <- c("Select All",as.character(unique(sort(cms_factor_dt$company)))) 65 | selectizeInput("company", "Select Company" , 66 | datasetList, multiple = FALSE,options = PLACEHOLDERLIST2,selected="Select All") 67 | }) 68 | 69 | output$searchlhs <- renderUI({ 70 | textInput("searchlhs", "Search LHS", placeholder = "Search") 71 | }) 72 | 73 | output$searchrhs <- renderUI({ 74 | textInput("searchrhs", "Search RHS", placeholder = "Search") 75 | }) 76 | 77 | output$support2 <- renderUI({ 78 | sliderInput("support2", label = 'Support',min=0,max=0.04,value=0.01,step=0.005) 79 | }) 80 | 81 | output$confidence <- renderUI({ 82 | sliderInput("confidence", label = 'Confidence',min=0,max=1,value=0.5) 83 | }) 84 | 85 | output$lift <- renderUI({ 86 | sliderInput("lift", label = 'Lift',min=0,max=10,value=0.8) 87 | }) 88 | 89 | dataInput <- reactive({ 90 | print(input$support2) 91 | print(input$company) 92 | print(identical(input$company,"")) 93 | 94 | temp <- cms_rules_dt[support > input$support2 & confidence > input$confidence & lift > input$lift] 95 | 96 | if(!identical(input$searchlhs,"")){ 97 | searchTerm <- paste0("*",input$searchlhs,"*") 98 | temp <- 
temp[LHS %like% searchTerm] 99 | } 100 | 101 | if(!identical(input$searchrhs,"")){ 102 | searchTerm <- paste0("*",input$searchrhs,"*") 103 | temp <- temp[RHS %like% searchTerm] 104 | } 105 | 106 | if(!identical(input$company,"Select All")){ 107 | # print("HERE") 108 | temp <- temp[grepl(input$company,rules)] 109 | } 110 | temp[,.(LHS,RHS,support,confidence,lift)] 111 | }) 112 | 113 | output$downloadMatchingRules <- downloadHandler( 114 | filename = "Rules.csv", 115 | content = function(file) { 116 | write.csv(dataInput(), file, row.names=FALSE) 117 | } 118 | ) 119 | 120 | output$result <- renderDataTable({ 121 | z = dataInput() 122 | if (nrow(z) == 0) { 123 | z <- data.table("LHS" = '', "RHS"='', "Support"='', "Confidence"='', "Lift" = '') 124 | } 125 | setnames(z, c("LHS", "RHS", "Support", "Confidence", "Lift")) 126 | datatable(z,options = list(scrollX = TRUE)) 127 | }) 128 | 129 | } 130 | 131 | shinyApp(ui = ui, server = server) 132 | -------------------------------------------------------------------------------- /Chapter08/rulespackt/cms_factor_dt.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/cms_factor_dt.rds -------------------------------------------------------------------------------- /Chapter08/rulespackt/cms_rules.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/cms_rules.rds -------------------------------------------------------------------------------- /Chapter08/rulespackt/cms_rules_dt.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/cms_rules_dt.rds -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts.css: -------------------------------------------------------------------------------- 1 | /* rubik-regular - latin */ 2 | /* 3 | Rubik-Light.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 4 | Rubik-LightItalic.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 5 | Rubik-Regular.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 6 | Rubik-Italic.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 7 | Rubik-Medium.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 8 | Rubik-MediumItalic.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 9 | Rubik-Bold.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 10 | Rubik-BoldItalic.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 11 | Rubik-Black.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 12 | Rubik-BlackItalic.ttf: Copyright 2015 The Rubik Project Authors (meir@sadan.com) 13 | */ 14 | 15 | @font-face { 16 | font-family: 'Rubik'; 17 | font-style: normal; 18 | font-weight: 400; 19 | src: url('fonts/rubik/rubik-v7-latin-regular.eot'); /* IE9 Compat Modes */ 20 | src: local('Rubik'), local('Rubik-Regular'), 21 | url('fonts/rubik/rubik-v7-latin-regular.eot?#iefix') format('embedded-opentype'), /* IE6-IE8 */ 22 | url('fonts/rubik/rubik-v7-latin-regular.woff2') format('woff2'), /* Super Modern Browsers */ 23 | 
url('fonts/rubik/rubik-v7-latin-regular.woff') format('woff'), /* Modern Browsers */ 24 | url('fonts/rubik/rubik-v7-latin-regular.ttf') format('truetype'), /* Safari, Android, iOS */ 25 | url('fonts/rubik/rubik-v7-latin-regular.svg#Rubik') format('svg'); /* Legacy iOS */ 26 | } 27 | -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-300italic.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-500italic.woff2 
-------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-700italic.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-900italic.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.eot 
-------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-italic.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.eot -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.ttf -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.woff -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Practical-Big-Data-Analytics/d55338ca9cfa1a64dfe13ef6d8937ac0c0906e20/Chapter08/rulespackt/www/fonts/rubik/rubik-v7-latin-regular.woff2 -------------------------------------------------------------------------------- /Chapter08/rulespackt/www/packt2.css: -------------------------------------------------------------------------------- 1 | 2 | :root { 3 | --alex-blue:#164398; 4 | --rich-black:#01161E; 5 | --mid-green:#124559; 6 | --green-text:#EFF6E0; 7 | --alex-lblue:#FFF; 8 | --alex-licorice:#0E1116; 9 | --alex-oxfordblue:#001C55; 10 | } 11 | 12 | 13 | 14 | .Rubik { 15 | font-family: Rubik; 16 | font-weight: normal; 17 | font-style: normal; 18 | } 19 | 20 | body { 21 | font-family: 'Rubik', 'Source Sans Pro','Helvetica Neue',Helvetica,Arial,sans-serif; 22 | font-weight: 400; 23 | 
overflow-x: hidden; 24 | overflow-y: auto; 25 | } 26 | 27 | 28 | .selectize-control.single .selectize-input:after { 29 | content: ' '; 30 | display: block; 31 | position: absolute; 32 | top: 50%; 33 | right: 17px; 34 | margin-top: -3px; 35 | width: 0; 36 | height: 0; 37 | border-style: solid; 38 | border-width: 5px 5px 0 5px; 39 | border-color: #337ab7 transparent transparent transparent; 40 | } 41 | 42 | .shiny-text-output, .shiny-bound-output { 43 | margin: 1px; 44 | font-weight: 700; 45 | } 46 | 47 | .main-header .logo { 48 | height: 35px; 49 | font-size: 25px; 50 | font-weight: normal; 51 | line-height: 20px; 52 | font-family: "Rubik"; 53 | text-align: left; 54 | vertical-align:bottom; 55 | padding-top:10px; 56 | padding-left:5px; 57 | } 58 | 59 | .main-header { 60 | border-left:2px solid #FFF; 61 | border-bottom:2px solid var(--alex-blue); 62 | } 63 | 64 | .main-header .sidebar-toggle { 65 | padding: 7px; 66 | } 67 | 68 | 69 | .main-header .navbar { 70 | min-height: 15px !important; 71 | } 72 | 73 | .left-side, .main-sidebar { 74 | padding-top: 15px !important; 75 | background-color:blue; 76 | } 77 | 78 | .form-group { 79 | margin-bottom: 2px; 80 | } 81 | 82 | .selectize-input { 83 | min-height: 0px !important; 84 | padding-top: 1px !important; 85 | padding-bottom: 1px !important; 86 | padding-left: 12px !important; 87 | padding-right: 12px !important; 88 | border-radius: 0 !important; 89 | } 90 | 91 | .selectize-control { 92 | margin-bottom: 1px; 93 | } 94 | 95 | section.sidebar .shiny-input-container { 96 | padding: 4px 5px 4px 5px; 97 | } 98 | 99 | 100 | 101 | .sidebar { 102 | height: 99vh; 103 | overflow-y: auto; 104 | } 105 | 106 | section.sidebar .shiny-input-container { 107 | padding: 5px 15px 0px 12px; 108 | } 109 | 110 | .btn { 111 | padding: 2px; 112 | margin: 5px; 113 | color:var(--alex-lblue); 114 | background-color:var(--alex-blue); 115 | border-color:var(--alex-blue); 116 | border-radius:0; 117 | } 118 | 119 | 120 | .label { 121 | margin-left:5px; 122 | } 123 | 124 | .nav { 125 | margin-left:5px; 126 | } 127 | 128 | #manualQuery { 129 | margin-left: 3px; 130 | margin-top:-4px; 131 | } 132 | 133 | .btn.focus, .btn:focus, .btn:hover { 134 | color: #FFF; 135 | background-color:#337ab7; 136 | border-color:#337ab7; 137 | } 138 | 139 | pre { 140 | display: inline-table; 141 | width: 100%; 142 | padding: 2px; 143 | margin: 0 0 5px; 144 | font-size: 12px; 145 | line-height: 1.42857143; 146 | color: rgb(51, 52, 53); 147 | word-break: break-all; 148 | word-wrap: break-word; 149 | background-color: rgba(10, 9, 9, 0.06); 150 | border: 1px solid rgba(10, 9, 9, 0.06); 151 | /* border-radius: 4px */ 152 | } 153 | 154 | .skin-red .sidebar a { 155 | color: #fff; 156 | } 157 | 158 | .sidebar { 159 | color: #241623; 160 | background-color:#FFFFFF; 161 | border-color:#337ab7; 162 | border:1px; 163 | font-family:"Varela"; 164 | padding-top:20px; 165 | padding-left:5px; 166 | } 167 | 168 | .skin-green .main-header .logo { 169 | background-color: #FFF; 170 | /*background-image: url(kdblogo.png);*/ 171 | background-size:170px; 172 | background-repeat: no-repeat; 173 | border-bottom: 0 solid transparent; 174 | font-weight:normal; 175 | font-size:1.5em; 176 | color: #337ab7; 177 | padding-bottom:1px; 178 | } 179 | 180 | 181 | .skin-green .main-header .navbar { 182 | background-color: #f4f4f4; 183 | 184 | } 185 | 186 | 187 | .skin-green .main-header .navbar .sidebar-toggle { 188 | color: #337ab7; 189 | } 190 | 191 | .skin-green:hover .main-header:hover .navbar:hover { 192 | background-color: #f4f4f4;
193 | 194 | } 195 | 196 | 197 | .skin-green .sidebar a:hover { 198 | color: #FFF; 199 | font-family:"Rubik"; 200 | font-size:14px; 201 | 202 | } 203 | 204 | .skin-green .sidebar a { 205 | color: #FFF; 206 | font-family:"Rubik"; 207 | font-size:14px; 208 | 209 | } 210 | 211 | 212 | .skin-green .sidebar-menu>li.active>a, .skin-green .sidebar-menu>li:hover>a { 213 | color: #fff; 214 | background: #337ab7; 215 | border-left-color: #337ab7; 216 | 217 | } 218 | 219 | .input-group .form-control { 220 | visibility: hidden; 221 | } 222 | 223 | .form-control { 224 | color: var(--alex-oxfordblue); 225 | font-family:"Rubik"; 226 | 227 | font-size:inherit; 228 | height:inherit; 229 | border-color:var(--alex-blue); 230 | } 231 | 232 | .nav>li>a { 233 | position: relative; 234 | display: block; 235 | padding: 4px 6px; 236 | } 237 | 238 | .shiny-file-input-progress { 239 | display: none; 240 | } 241 | 242 | .box-header .box-title, .box-header>.fa, .box-header>.glyphicon, .box-header>.ion { 243 | display: inline-block; 244 | font-family:"Rubik"; 245 | } 246 | 247 | 248 | .content-wrapper { 249 | background-color:#FFFFFF; 250 | border-left: 1px solid rgba(82,149,197, 0.53); 251 | } 252 | 253 | 254 | .box.box-primary { 255 | border-top-color: #337ab7; 256 | } 257 | 258 | section.sidebar .shiny-input-container { 259 | padding: 5px 5px 5px 5px; 260 | } 261 | 262 | element.style { 263 | padding-top: 2px; 264 | padding-bottom: 2px; 265 | } 266 | 267 | .sidebar-menu>li>a { 268 | padding: 5px 5px 5px 15px; 269 | display: block; 270 | border-bottom:1px solid rgba(82,149,197, 0.53); 271 | } 272 | 273 | .skin-green:hover .main-header:hover .logo:hover { 274 | background-color: #FFF; 275 | /*color: #FFF;*/ 276 | } 277 | 278 | 279 | 280 | 281 | .table>tbody>tr>td, .table>tbody>tr>th, .table>tfoot>tr>td, .table>tfoot>tr>th, .table>thead>tr>td, .table>thead>tr>th { 282 | padding: 1px; 283 | line-height: 1.42857143; 284 | vertical-align: top; 285 | border-top: 1px solid #337ab7; 286 | } 287 | 288 | section.sidebar .shiny-bound-input.action-button, section.sidebar .shiny-bound-input.action-link { 289 | margin: 6px 5px 6px 1px; 290 | display: block; 291 | } 292 | 293 | .shiny-bound-output .fa-download { 294 | padding: 5px; 295 | margin-left: 1px; 296 | color:#337ab7; 297 | background-color:#FFF; 298 | border-color:#337ab7; 299 | } 300 | 301 | label { 302 | display: inline-block; 303 | max-width: 100%; 304 | margin-bottom: 5px; 305 | font-weight: 700; 306 | font-family: Rubik; 307 | 308 | } 309 | 310 | table.dataTable { 311 | width: 100%; 312 | margin: 0 auto; 313 | clear: both; 314 | border-collapse: separate; 315 | border-spacing: 0; 316 | font-size: 12px; 317 | line-height: 1; 318 | } 319 | 320 | section.sidebar .shiny-input-container { 321 | padding: 5px 5px 5px 5px; 322 | border-bottom: 1px solid #bbb; 323 | } 324 | 325 | 326 | table.dataTable { 327 | width: 100%; 328 | margin: 0 auto; 329 | clear: both; 330 | border-collapse: separate; 331 | border-spacing: 0; 332 | font-size: 12px; 333 | line-height: 1; 334 | color: var(--alex-oxfordblue); 335 | } 336 | 337 | .btn .btn-default .action-button .shiny-bound-input { 338 | padding-bottom:5px; 339 | } 340 | 341 | 342 | .skin-green .left-side, .skin-green .main-sidebar, .skin-green .wrapper { 343 | background-color: #fff; 344 | } 345 | 346 | 347 | .dataTables_wrapper .dataTables_filter input { 348 | margin: 0.5em; 349 | } 350 | 351 | .dataTables_wrapper .dataTables_scroll { 352 | clear: both; 353 | margin: 5px; 354 | } 355 | 356
| -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Practical Big Data Analytics 5 | This is the code repository for [Practical Big Data Analytics](https://www.packtpub.com/big-data-and-business-intelligence/practical-big-data-analytics?utm_source=github&utm_medium=repository&utm_campaign=9781783554393), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | Big Data Analytics refers to the strategies organizations use to collect, organize, and analyze large amounts of data in order to uncover hidden patterns and insights that cannot be obtained through traditional systems. 8 | 9 | 10 | ## Instructions and Navigation 11 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 12 | 13 | 14 | 15 | The code will look like the following: 16 | ``` 17 | "_id" : ObjectId("597cdbb193acc5c362e7ae97"), 18 | "firstName" : "Nina", 19 | "age" : 53, 20 | "frequentFlyer" : [ 21 | "Delta", 22 | "JetBlue", 23 | "Delta" 24 | ``` 25 | A short query sketch for documents of this shape appears after the prerequisites below. 26 | * A general knowledge of Unix would be very helpful, although it isn't mandatory 27 | * Access to a computer with an internet connection is needed to download the 28 | necessary tools and software used in the exercises 29 | * No prior knowledge of the subject area is assumed 30 | * Installation instructions for all the software and tools are provided in 31 | Chapter 3, The Analytics Toolkit.
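The excerpt above is a trimmed MongoDB document. As a minimal, hedged sketch (not part of the original repository), documents of this shape could be queried from R with the `mongolite` package; the database name, collection name, and connection URL below are made-up placeholders for illustration only:

```
library(mongolite)

# Hypothetical connection details -- point these at wherever the sample data was loaded.
customers <- mongo(collection = "customers", db = "packt", url = "mongodb://localhost")

# Return the name and age of every customer whose frequentFlyer array contains "Delta".
customers$find(
  query  = '{"frequentFlyer": "Delta"}',
  fields = '{"firstName": 1, "age": 1, "_id": 0}'
)
```

Matching a scalar value against an array field is standard MongoDB behaviour, so a query like this would match the sample document even though `frequentFlyer` holds several entries.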
32 | 33 | ## Related Products 34 | * [Ultimate Big Data Application Development](https://www.packtpub.com/big-data-and-business-intelligence/ultimate-big-data-application-development?utm_source=github&utm_medium=repository&utm_campaign=9781788399951) 35 | 36 | * [Artificial Intelligence for Big Data](https://www.packtpub.com/big-data-and-business-intelligence/artificial-intelligence-big-data?utm_source=github&utm_medium=repository&utm_campaign=9781788472173) 37 | 38 | * [Practical Industrial Internet of Things Security](https://www.packtpub.com/business/practical-industrial-internet-things-security?utm_source=github&utm_medium=repository&utm_campaign=9781788832687) 39 | 40 | ### Suggestions and Feedback 41 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 42 | ### Download a free PDF 43 | 44 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
45 | https://packt.link/free-ebook/9781783554393
--------------------------------------------------------------------------------