├── .nojekyll ├── 01_partI_bigdata.Rmd ├── 02_approaches_bigdata.Rmd ├── 03_partI_2domains_bigdata.Rmd ├── 04_partII_software.Rmd ├── 05_partII_hardware.Rmd ├── 06_partII_distributedsystems.Rmd ├── 07_partII_cloudcomputing.Rmd ├── 08_partIII_collection_storage.Rmd ├── 09_partIII_cleaning_transformation.Rmd ├── 0_preface.Rmd ├── 10_partIII_descriptives_aggregation.Rmd ├── 11_partIII_visualization.Rmd ├── 12_partIV_data_analyticsI.Rmd ├── 13_partIV_GPU_ML.Rmd ├── 14_partIV_regression_categorization_spark.Rmd ├── 15_partIV_large_scale_text_analysis.Rmd ├── 16_references.Rmd ├── BigData.Rproj ├── README.md ├── R_code_examples ├── 03_partI_2domains_bigdata.R ├── 04_partII_software.R ├── 05_partII_hardware.R ├── 06_partII_distributedsystems.R ├── 07_partII_cloudcomputing.R ├── 08_partIII_collection_storage.R ├── 09_partIII_cleaning_transformation.R ├── 10_partIII_descriptives_aggregation.R ├── 11_partIII_visualization.R ├── 12_partIV_data_analyticsI.R ├── 13_partIV_GPU_ML.R ├── 14_partIV_regression_categorization_spark.R └── 15_partIV_large_scale_text_analysis.R ├── docs ├── .nojekyll ├── 404.html ├── a.html ├── appendix-a-github.html ├── appendix-b-r-basics.html ├── appendix-c-install-hadoop.html ├── approaches-to-analyzing-big-data.html ├── big-data-cleaning-and-transformation.html ├── big-data-visualization.html ├── bigdata_files │ └── figure-html │ │ ├── unnamed-chunk-178-1.png │ │ ├── unnamed-chunk-190-1.png │ │ ├── unnamed-chunk-193-1.png │ │ ├── unnamed-chunk-194-1.png │ │ ├── unnamed-chunk-196-1.png │ │ ├── unnamed-chunk-198-1.png │ │ ├── unnamed-chunk-199-1.png │ │ ├── unnamed-chunk-200-1.png │ │ ├── unnamed-chunk-201-1.png │ │ ├── unnamed-chunk-202-1.png │ │ ├── unnamed-chunk-203-1.png │ │ ├── unnamed-chunk-204-1.png │ │ ├── unnamed-chunk-205-1.png │ │ ├── unnamed-chunk-206-1.png │ │ ├── unnamed-chunk-207-1.png │ │ ├── unnamed-chunk-208-1.png │ │ ├── unnamed-chunk-209-1.png │ │ ├── unnamed-chunk-210-1.png │ │ ├── unnamed-chunk-211-1.png │ │ ├── unnamed-chunk-221-1.png │ │ ├── unnamed-chunk-222-1.png │ │ ├── unnamed-chunk-223-1.png │ │ ├── unnamed-chunk-224-1.png │ │ ├── unnamed-chunk-225-1.png │ │ ├── unnamed-chunk-226-1.png │ │ ├── unnamed-chunk-227-1.png │ │ ├── unnamed-chunk-228-1.png │ │ ├── unnamed-chunk-229-1.png │ │ ├── unnamed-chunk-27-1.png │ │ ├── unnamed-chunk-275-1.png │ │ ├── unnamed-chunk-31-1.png │ │ ├── unnamed-chunk-318-1.png │ │ ├── unnamed-chunk-326-1.png │ │ ├── unnamed-chunk-33-1.png │ │ └── unnamed-chunk-9-1.png ├── bottlenecks-in-everyday-data-analytics-tasks.html ├── c.html ├── cloud-computing.html ├── data-collection-and-data-storage.html ├── descriptive-statistics-and-aggregation.html ├── distributed-systems.html ├── econometrics-with-gpus.html ├── hardware-computing-resources.html ├── img │ ├── 05_nlp_pipeline.jpg │ ├── II_computing_environment.png │ ├── I_approaches.png │ ├── TPU.png │ ├── aws_emr_ready.png │ ├── aws_rds_create.png │ ├── aws_rds_easycreate.png │ ├── colab_r_gpu.png │ ├── column_v_rowbased.png │ ├── cover_print.jpg │ ├── data_pipeline.png │ ├── distributed_system.jpg │ ├── druiddatasources.png │ ├── druidparse.png │ ├── druidquery.png │ ├── druidstart.png │ ├── ec2_gpu1.png │ ├── ec2_gpu2.png │ ├── ec2_rstudioserver_htop.png │ ├── gpt_SQL_prompt.png │ ├── gpt_sql_response.png │ ├── gpu_cpu.png │ ├── gpu_details.png │ ├── rds_inboundrules.png │ ├── rtx_2080.png │ ├── screenshot_rstudio_server_upload.png │ ├── uluru_comparison.png │ ├── uluru_comparison2.png │ └── virtual_memory.png ├── index.html ├── index.md ├── introduction.html ├── 
large-scale-text-analysis-with-sparklyr.html ├── libs │ ├── anchor-sections │ │ ├── anchor-sections-hash.css │ │ ├── anchor-sections.css │ │ └── anchor-sections.js │ ├── gitbook │ │ ├── css │ │ │ ├── fontawesome │ │ │ │ └── fontawesome-webfont.ttf │ │ │ ├── plugin-bookdown.css │ │ │ ├── plugin-clipboard.css │ │ │ ├── plugin-fontsettings.css │ │ │ ├── plugin-highlight.css │ │ │ ├── plugin-search.css │ │ │ ├── plugin-table.css │ │ │ └── style.css │ │ └── js │ │ │ ├── app.min.js │ │ │ ├── clipboard.min.js │ │ │ ├── jquery.highlight.js │ │ │ ├── plugin-bookdown.js │ │ │ ├── plugin-clipboard.js │ │ │ ├── plugin-fontsettings.js │ │ │ ├── plugin-search.js │ │ │ └── plugin-sharing.js │ └── jquery │ │ └── jquery-3.6.0.min.js ├── p.html ├── reference-keys.txt ├── references.html ├── regression-analysis-and-categorization-with-spark-and-r.html ├── s.html ├── search_index.json ├── software-programming-with-big-data.html ├── style │ ├── krantz.cls │ ├── krantz_new.cls │ ├── style.css │ ├── style_new.css │ └── toc.css ├── the-two-domains-of-big-data-analytics.html └── what-is-big-in-big-data.html ├── img ├── 02_df.png ├── 02_factor.png ├── 02_list.png ├── 02_matrix.png ├── 02_numvec.png ├── 03_script-hardware_w.jpg ├── 03_store-bitbyteword.png ├── 03_virtualmemory.png ├── 05_nlp_pipeline.jpg ├── II_computing_environment.png ├── I_approaches.png ├── Page 1.png ├── TPU.png ├── aws_emr_ready.png ├── aws_rds_create.png ├── aws_rds_easycreate.png ├── colab_r_gpu.png ├── column_v_rowbased.png ├── computing_environment.png ├── cover.jpg ├── cover_new.png ├── cover_new.tiff ├── cover_print.jpg ├── cover_print.png ├── data_pipeline.png ├── distributed_system.jpg ├── druiddatasources.png ├── druidparse.png ├── druidquery.png ├── druidstart.png ├── ec2_gpu1.png ├── ec2_gpu2.png ├── ec2_rstudioserver_htop.png ├── factor.png ├── gpt_SQL_prompt.png ├── gpt_sql_response.png ├── gpu_cpu.png ├── gpu_details.png ├── hadoop.png ├── list.png ├── nvidia_geeforce.png ├── nvidia_gpu.png ├── pipeline.png ├── rds_inboundrules.png ├── rtx_2080.png ├── screenshot_rstudio_server_upload.png ├── spark-stack.png ├── spark_components.jpg ├── uluru_comparison.png ├── uluru_comparison2.png └── virtual_memory.png ├── index.Rmd ├── references ├── bigdata.bib └── packages.bib └── style ├── ioslides.css ├── ioslides_unilu.css ├── ioslides_white.css ├── krantz.cls ├── krantz_new.cls ├── nologo_template.html ├── notes.css ├── notes_hsg.css ├── notes_preamble.tex ├── style.css ├── style_new.css └── toc.css /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/.nojekyll -------------------------------------------------------------------------------- /01_partI_bigdata.Rmd: -------------------------------------------------------------------------------- 1 | \mainmatter 2 | 3 | # (PART) Setting the Scene: Analyzing Big Data {-} 4 | 5 | # Introduction {#s .unnumbered} 6 | 7 | > "Lost in the hoopla about such [Hadoop MapReduce] skills is the embarrassing fact that once upon a time, one could do such computing tasks, and even much more ambitious ones, much more easily than in this fancy new setting! A dataset could fit on a single processor, and the global maximum of the array 'x' could be computed with the six-character code fragment 'max(x)' in, say, Matlab or R." 8 | [@donoho_2017,p.747] 9 | 10 | 11 | This part of the book introduces you to the topic of Big Data analysis from a variety of perspectives. 
The goal of this part is to highlight the various aspects of modern econometrics involved in Big Data Analytics, as well as to clarify the approach and perspective taken in this book. In the first step, we must consider what makes data *big*. As a result, we make a fundamental distinction between data analysis problems that can arise from many observations (rows; *big N*)\index{Big N} and the problems that can arise from many variables (columns; *big P*)\index{Big P}. 12 | 13 | In a second step, this part provides an overview of the four distinct approaches to Big Data Analytics that are most important for the perspective on Big Data taken in this book: a) statistics/econometrics techniques specifically designed to handle Big Data, b) writing more efficient R code, c) more efficiently using available local computing resources, and d) scaling up and scaling out with cloud computing resources. 14 | All of these approaches will be discussed further in the book, and it will be useful to remember the most important conceptual basics underlying these approaches from the overview presented here. 15 | 16 | Finally, this section of the book provides two extensive examples of what problems related to (too) many observations or (too) many variables can mean for practical data analysis, as well as how some of the four approaches (a-d) can help in resolving these problems. 17 | 18 | 19 | # What is *Big* in "Big Data"? 20 | 21 | In this book, we will think of Big Data as data that is (a) difficult to handle and (b) hard to get value from due to its size and complexity. The handling of Big Data is difficult as the data is often gathered from unorthodox sources, providing poorly structured data (e.g., raw text, web pages, images, etc.) as well as because of the infrastructure needed to store and load/process large amounts of data. Then, the issue of statistical computation itself becomes a challenge. Taken together, getting value/insights from Big Data is related to three distinct properties that render its analysis difficult: 22 | 23 | - Handling the *complexity and variety* of sources, structures, and formats of data for analytics purposes is becoming increasingly challenging in the context of empirical economic research and business analytics. On the one hand the ongoing digitization of information and processes boosts the generation and storage of digital data for all kinds of economic and social activity, making such data basically more available for analysis. On the other hand, however, the first order focus of such digitization is typically an end user who directly interacts with the information and is part of these processes, and not the data scientist or data analyst who might be interested in analyzing such data later on. Therefore, the interfaces for systematically collecting such data for analytics purposes are typically not optimal. Moreover, data might come in semi-structured formats such as webpages (i.e., the HyperText Markup Language (HTML))\index{HyperText Markup Language (HTML)}, raw text, or even images – each of which needs a different approach for importing/loading and pre-processing. Anyone who has worked on data analytics projects that build on various types of raw data from various sources knows that a large part of the practical data work deals with how to handle the complexity and variety to get to a useful analytic dataset. 
24 | 25 | - The *big P*\index{Big P} problem: A dataset has close to or even more variables (columns) than observations, which renders the search for a good predictive model with traditional econometric techniques difficult or elusive. For example, suppose you run an e-commerce business that sells hundreds of thousands of products to tens of thousands of customers. You want to figure out from which product category a customer is most likely to buy an item, based on their previous product page visits. That is, you want to (in simple terms) regress an indicator of purchasing from a specific category on indicators for previous product page visits. Given this setup, you would potentially end up with hundreds of thousands of explanatory indicator variables (and potentially even linear combinations of those), while you "only" have tens of thousands of observations (one per user/customer and visit) to estimate your model. These sorts of problems are at the core of the domain of modern predictive econometrics, which shows how machine learning approaches like the lasso estimator\index{Lasso} can be applied to get reasonable estimates from such a predictive model. 26 | 27 | - The *big N*\index{Big N} problem: A dataset has massive numbers of observations (rows) such that it cannot be handled with standard data analytics techniques and/or on a standard desktop computer. For example, suppose you want to segment your e-commerce customers based on the traces they leave on your website's server. Specifically, you plan to use the server log files (when does a customer visit the site, from where, etc.) in combination with purchase records and written product reviews by users. You focus on 50 variables that you measure on a daily basis over five years for all 50,000 users. The resulting dataset has $50,000 \times 365 \times 5=91,250,000$ rows, with 50 variables (at least 50 columns) – over 4.5 billion cells. Such a dataset can easily take up dozens of gigabytes on the hard disk. Hence it will either not fit into the memory of a standard computer to begin with (import fails), or the standard programs to process and analyze the data will likely be very inefficient and take ages to finish when used on such a large dataset. There are both econometric techniques and various specialized software and hardware tools to handle such a situation. 28 | 29 | 30 | After having a close look at the practical data analytics challenges behind both *big P* and *big N* in Chapter 3, most of this book focuses on practical challenges and solutions related to *big N* problems. However, several of the chapters contain code examples that are primarily discussed as a solution to a *big N* problem, but are shown in the context of econometric/machine learning techniques that are broadly used, for example, to find good predictive models (based on many variables, i.e., *big P*). At the same time, many of the topics discussed in this book are in one way or another related to the difficulties of handling various types of structured, semi-structured, and unstructured data. Hence you will get familiar with practical techniques to deal with *complexity and variety* of data as a byproduct.
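A quick back-of-the-envelope sketch in R helps to get a feel for the magnitudes in the *big N* example above; for simplicity, it assumes that every cell is stored as an 8-byte numeric value (actual storage requirements depend on the data types and file format used).

```{r bign-envelope, eval=FALSE}
# dimensions of the hypothetical customer dataset described above
n_users <- 50000
n_days <- 365 * 5
n_vars <- 50

n_rows <- n_users * n_days   # 91,250,000 observations
n_cells <- n_rows * n_vars   # over 4.5 billion cells
n_cells

# rough in-memory size in gigabytes, assuming 8 bytes per (numeric) cell
(n_cells * 8) / 1024^3
```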
31 | 32 | 33 | -------------------------------------------------------------------------------- /02_approaches_bigdata.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Approaches to Analyzing Big Data 4 | 5 | Throughout the book, we consider four approaches to solving challenges related to analyzing big N and big P data. Those approaches should not be understood as mutually exclusive categories; rather they should help us to look at a specific problem from different angles in order to find the most efficient tool/approach to proceed. Figure \@ref(fig:approaches) presents an illustrative overview of the four approaches. 6 | 7 | 8 | 9 | ```{r approaches, echo=FALSE, out.width = "99%", fig.align='center', fig.cap= "(ref:approaches)", purl=FALSE} 10 | include_graphics("img/I_approaches.png") 11 | ``` 12 | 13 | (ref:approaches) Four approaches to/perspectives on solving big N problems in data analytics. 14 | 15 | 16 | 1. *Statistics/econometrics and machine learning*\index{Machine Learning}: During the initial hype surrounding Big Data/Data Science\index{Data Science} about a decade ago, statisticians prominently (and justifiably) pointed out that statistics techniques have always been very useful tools when analyzing "all the data" (the entire population) is too costly.^[David Donoho has nicely summarized this critique in a paper titled ["50 Years of Data Science"](https://doi.org/10.1080/10618600.2017.1384734) (@donoho_2017), which I warmly recommend.] In simple terms, when confronted with the challenge of answering an empirical question based on a *big N* dataset (which is too large to process on a normal computer), one might ask "why not simply take a random sample?" In some situations this might actually be a very reasonable question, and we should be sure to have a good answer for it before we rent a cluster computer with specialized software for distributed computing. After all, statistical inference is there to help us answer empirical questions in situations where collecting data on the entire population would be practically impossible or simply way too costly. In today's world, digital data is abundant in many domains, and the collection is not so much the problem any longer; but our standard data analytics tools are not made to analyze such amounts of data. Depending on the question and data at hand, it might thus make sense to simply use well-established "traditional" statistics/econometrics in order to properly address the empirical question. Note, though, that there are also various situations in which this would not work well. For example, consider online advertising. If you want to figure out which user characteristics make a user significantly more likely to click on a specific type of ad, you likely need hundreds of millions of data points because the expected probability that a specific user clicks on an ad is generally very low. That is, in many practical Big Data Analytics settings, you might expect rather small effects. Consequently, you need to rely on a big N dataset in order to get the statistical power to distinguish an actual effect from a zero effect. However, even then, it might make sense to first look at newer statistical procedures that are specifically made for big N data before renting a cluster computer.
Similarly, traditional statistical/econometric approaches might help to deal with big P data, but they are usually rather inefficient or have rather problematic statistical properties in such situations. However, there are also well-established machine learning approaches to better address these problems. In sum, before focusing on specialized software like Apache Hadoop\index{Apache Hadoop} or Apache Spark\index{Apache Spark} and scaling up hardware resources, make sure to use the adequate statistical tools for a Big Data situation. This can save a lot of time and money. Once you have found the most efficient statistical procedure for the problem at hand, you can focus on how to compute it. 17 | 18 | 2. *Writing efficient code*: No matter how suitable a statistical procedure is theoretically to analyze a large dataset, there are always various ways to implement this procedure in software. Some ways will be less efficient than others. When working with small or moderately sized datasets, you might not even notice whether your data analytics script is written in an efficient way. However, it might get uncomfortable to run your script once you confront it with a large dataset. Hence the question you should ask yourself when taking this perspective is, "Can I write this script in a different way to make it faster (but achieve the same result)?" Before introducing you to specialized R packages to work with large datasets, we thus look at a few important aspects of how to write efficient/fast code in R. 19 | 20 | 3. *Using limited local computing resources more efficiently*: There are several strategies to use the available local computing resources (your PC) more efficiently, and many of those have been around for a while. In simple terms, these strategies are based on the idea of more explicitly telling the computer how to allocate and use the available hardware resources as part of a data analytics task (something that is usually automatically taken care of by the PC's operating system). We will touch upon several of these strategies – such as multi-core processing and the efficient use of virtual memory – and then practically implement these strategies with the help of specialized R packages. Unlike writing more efficient R code, these packages/strategies usually come with an overhead. That is, they help you save time only after a certain threshold. In other words, not using these approaches can be faster if the dataset is not "too big". In addition, there can be trade-offs between using one vs. another hardware component more efficiently. Hence, using these strategies can be tricky, and the best approach might well depend on the specific situation. The aim is thus to make you comfortable with answering the question, "How can I use my local computing environment more efficiently to further speed up this specific analytics task?" 21 | 22 | 4. *Scaling up and scaling out*: once you have properly considered all of the above, but the task still cannot be done in a reasonable amount of time, you will need to either *scale up*\index{Scale Up} or *scale out*\index{Scale Out} the available computing resources. *Scaling up* refers to enlarging your machine (e.g., adding more random access memory) or switching to a more powerful machine altogether. Technically, this can mean literally building an additional hardware device into your PC; today it usually means renting a virtual server in the cloud. 
Instead of using a "bigger machine", *scaling out* means using several machines in concert (cluster computer, distributed systems). While this also has often been done locally (connecting several PCs to a cluster of PCs to combine all their computing power), today this too is usually done in the cloud (due to the much easier setup and maintenance). Practically, a key difference between scaling out and scaling up is that, by and large, scaling up does not require you to get familiar with specialized software. You can simply run the exact same script you tested locally on a larger machine in the cloud. Although most of the tools and services available to scale out your analyses are by now also quite easy to use, you will have to get familiar with some additional software components to really make use of the latter. In addition, in some situations, scaling up might be perfectly sufficient, while in others only scaling out makes sense (particularly if you need massive amounts of memory). In any event, you should be comfortable dealing with the questions, "Does it make sense to scale up or scale out?" and "If yes, how can it be done?" in a given situation.^[Importantly, the perspective on scaling up and scaling out provided in this book is solely focused on Big Data Analytics in the context of economic/business research. There is a large array of practical problems and corresponding solutions/tools to deal with "Big Data Analytics" in the context of application development (e.g. tools related to data streams), which this book does not cover.] 23 | 24 | 25 | Whether one or the other approach is "better" is sometimes a topic hotly debated between academics and/or practitioners with different academic backgrounds. The point of the following chapters is not to argue for one or the other approach, but to make you familiar with these different perspectives in order to make you more comfortable and able to take on large amounts of data for your analytics project. When might one or the other approach/perspective be more useful? This is highly context-dependent. However, as a general rule of thumb, consider the order in which the different approaches have been presented above. 26 | 27 | - First, ask yourself whether there isn't an absolutely trivial solution to your big N problem, such as taking a random sample. I know, this sounds banal, and you would be surprised at how many books and lectures focusing on the data engineering side of big N do not even mention this. But, we should not forget that the entire apparatus of statistical inference is essentially based on this idea.^[Originally, one could argue, the motivation for the development of statistical inference was more related to the practical problem of gathering data on an entire population than to handling a large dataset with observations of the entire population. However, in practice, inferring population properties from a random sample also works for the latter.] There is, however, a well-justified excuse for not simply taking a random sample of a large dataset. Both in academic research and in business data science and business analytics, the decision to be facilitated with data might in any event only have measurable consequences in rather few cases. That is, the effect size of deciding either for A or B is anyway expected to be small, and hence we need sufficient statistical power (large N) to make a meaningful decision.
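The following minimal sketch illustrates both points (a random sample often suffices for aggregate statistics, but small effects require a large N); the click-rate numbers are made up purely for illustration.

```{r sampling-vs-power, eval=FALSE}
# (a) for many aggregate statistics, a random sample is good enough
big_x <- rnorm(10^7, mean = 5)      # stand-in for a big N variable
mean(big_x)                         # "population" mean
mean(sample(big_x, size = 10^4))    # estimate based on a 0.1% random sample

# (b) ...but detecting a tiny effect requires a very large N:
# sample size needed per group to distinguish a click rate of 1.0% from 1.1%
# with 80% power at the 5% significance level
power.prop.test(p1 = 0.010, p2 = 0.011, power = 0.8, sig.level = 0.05)
```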
28 | 29 | - Second, once you know which statistical procedure should be run on which final sample/dataset, be aware of how to write your analytics scripts in the most efficient way. As you will see in Chapter 4, there are a handful of R idiosyncrasies that are worth keeping in mind in this regard. This will make interactive sessions in the early, exploratory phase of a Big Data project much more comfortable. 30 | 31 | - Third, once you have a clearer idea of the bottlenecks in the data preparation and analytics scripts, aim to optimize the usage of the available local computing resources. 32 | 33 | - In almost any organizational structure, be it a university department, a small firm, or a multinational conglomerate, switching from your laptop or desktop computer to a larger computing infrastructure, either locally or in the cloud, means additional administrative and budgetary hurdles (which means money and time spent on something other than interpreting data analysis results). That is, even before setting up the infrastructure and transferring your script and data, you will have to make an effort to scale up or scale out. Therefore, as a general rule of thumb, this option will be considered as a measure of last resort in this book. 34 | 35 | Following this recommended order of consideration, before we focus extensively on the topics of *using local computing resources more efficiently* and *scaling up/out* (in parts II and III of this book, respectively), we need to establish some of the basics regarding what is meant by statistical/econometric solutions for big P and big N problems (in the next chapter), as well as introducing a couple of helpful programming tools and skills for working on computationally intense tasks (in Chapter 4). 36 | -------------------------------------------------------------------------------- /BigData.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 5 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: XeLaTeX 14 | 15 | BuildType: Website 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Analytics 2 | 3 | 4 | This repository contains the source of the [Big Data Analytics book](https://umatter.github.io/BigData/), as well as supplementary online resources. The book is built using [bookdown](https://github.com/rstudio/bookdown). 5 | 6 | ## Supplementary online resources 7 | 8 | ### R code examples 9 | 10 | The [R_code_examples](/R_code_examples) folder contains R-scripts with all code examples and tutorials shown in the book. 11 | 12 | ### Data 13 | 14 | The corresponding sections in the book typically contain detailed instructions of where and how the datasets used in the code examples can be downloaded from the original sources.
15 | 16 | To ensure data availability for the code examples and tutorials in the long run, you will find smaller-scale versions of all key datasets discussed in the book in this S3 bucket: 17 | 18 | https://bda-examples.s3.eu-central-1.amazonaws.com/air_final.sqlite 19 | 20 | https://bda-examples.s3.eu-central-1.amazonaws.com/airline_id.csv 21 | 22 | https://bda-examples.s3.eu-central-1.amazonaws.com/airports.csv 23 | 24 | https://bda-examples.s3.eu-central-1.amazonaws.com/carriers.csv 25 | 26 | https://bda-examples.s3.eu-central-1.amazonaws.com/data_for_tables.dta 27 | 28 | https://bda-examples.s3.eu-central-1.amazonaws.com/economics.csv 29 | 30 | https://bda-examples.s3.eu-central-1.amazonaws.com/flights_sep_oct15.txt 31 | 32 | https://bda-examples.s3.eu-central-1.amazonaws.com/flights.csv 33 | 34 | https://bda-examples.s3.eu-central-1.amazonaws.com/ga.csv 35 | 36 | https://bda-examples.s3.eu-central-1.amazonaws.com/inflation.csv 37 | 38 | https://bda-examples.s3.eu-central-1.amazonaws.com/marketing_data.csv 39 | 40 | https://bda-examples.s3.eu-central-1.amazonaws.com/mydb.sqlite 41 | 42 | https://bda-examples.s3.eu-central-1.amazonaws.com/tlc_trips.csv 43 | 44 | Note that the AWS bucket is configured such that the [requester pays](https://docs.aws.amazon.com/AmazonS3/latest/userguide/RequesterPaysBuckets.html?icmpid=docs_amazons3_console) for requests and transfer costs. 45 | 46 | 47 | ### Installation of dependencies and packages 48 | 49 | Here you will find additional resources and hints regarding the installation of some of the tools used in the book. 50 | 51 | - `gpuR`: The package is no longer available via `install.packages()`. However, you can install it with `devtools::install_github("cdeterman/gpuR")`. For additional installation instructions (in particular regarding dependencies), see the wiki here: https://github.com/cdeterman/gpuR/wiki.
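  As a quick, optional sanity check after installing (assuming a working OpenCL setup), you can verify that the package finds a GPU:

  ```r
  library(gpuR)
  detectGPUs()
  ```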
52 | - Install Apache Spark via `sparklyr`: https://spark.rstudio.com/get-started/ 53 | - Install Tensorflow and Keras via the `tensorflow` and `keras` packages (from within R): https://tensorflow.rstudio.com/install/ 54 | 55 | 56 | -------------------------------------------------------------------------------- /R_code_examples/03_partI_2domains_bigdata.R: -------------------------------------------------------------------------------- 1 | # import/inspect data 2 | ga <- read.csv("data/ga.csv") 3 | head(ga[, c("source", "browser", "city", "purchase")]) 4 | # create model matrix (dummy vars) 5 | mm <- cbind(ga$purchase, 6 | model.matrix(purchase~source, data=ga,)[,-1]) 7 | mm_df <- as.data.frame(mm) 8 | # clean variable names 9 | names(mm_df) <- c("purchase", 10 | gsub("source", "", names(mm_df)[-1])) 11 | # run logit 12 | model1 <- glm(purchase ~ ., 13 | data=mm_df, family=binomial) 14 | 15 | model1_sum <- summary(model1) 16 | # select "significant" variables for final model 17 | pvalues <- model1_sum$coefficients[,"Pr(>|z|)"] 18 | vars <- names(pvalues[which(pvalues<0.05)][-1]) 19 | vars 20 | 21 | # specify and estimate the final model 22 | finalmodel <- glm(purchase ~., 23 | data = mm_df[, c("purchase", vars)], 24 | family = binomial) 25 | 26 | summary(finalmodel)$coef[,c("Estimate", "Pr(>|z|)")] 27 | 28 | # load packages 29 | library(gamlr) 30 | # create the model matrix 31 | mm <- model.matrix(purchase~source, data = ga) 32 | 33 | # create the sparse model matrix 34 | mm_sparse <- sparse.model.matrix(purchase~source, data = ga) 35 | # compare the object's sizes 36 | as.numeric(object.size(mm)/object.size(mm_sparse)) 37 | 38 | # run k-fold cross-validation lasso 39 | cvpurchase <- cv.gamlr(mm_sparse, ga$purchase, family="binomial") 40 | 41 | # load packages 42 | library(PRROC) 43 | # use "best" model for prediction 44 | # (model selection based on average OSS deviance 45 | pred <- predict(cvpurchase$gamlr, mm_sparse, type="response") 46 | # compute tpr, fpr; plot ROC 47 | comparison <- roc.curve(scores.class0 = pred, 48 | weights.class0=ga$purchase, 49 | curve=TRUE) 50 | plot(comparison) 51 | 52 | beta_ols <- 53 | function(X, y) { 54 | # compute cross products and inverse 55 | XXi <- solve(crossprod(X,X)) 56 | Xy <- crossprod(X, y) 57 | return( XXi %*% Xy ) 58 | } 59 | 60 | # set parameter values 61 | n <- 10000000 62 | p <- 4 63 | # generate sample based on Monte Carlo 64 | # generate a design matrix (~ our 'dataset') 65 | # with 4 variables and 10,000 observations 66 | X <- matrix(rnorm(n*p, mean = 10), ncol = p) 67 | # add column for intercept 68 | X <- cbind(rep(1, n), X) 69 | 70 | # MC model 71 | y <- 2 + 1.5*X[,2] + 4*X[,3] - 3.5*X[,4] + 0.5*X[,5] + rnorm(n) 72 | 73 | # apply the OLS estimator 74 | beta_ols(X, y) 75 | 76 | beta_uluru <- 77 | function(X_subs, y_subs, X_rem, y_rem) { 78 | # compute beta_fs 79 | #(this is simply OLS applied to the subsample) 80 | XXi_subs <- solve(crossprod(X_subs, X_subs)) 81 | Xy_subs <- crossprod(X_subs, y_subs) 82 | b_fs <- XXi_subs %*% Xy_subs 83 | # compute \mathbf{R}_{rem} 84 | R_rem <- y_rem - X_rem %*% b_fs 85 | # compute \hat{\beta}_{correct} 86 | b_correct <- 87 | (nrow(X_subs)/(nrow(X_rem))) * 88 | XXi_subs %*% crossprod(X_rem, R_rem) 89 | # beta uluru 90 | return(b_fs + b_correct) 91 | } 92 | 93 | # set size of sub-sample 94 | n_subs <- 1000 95 | # select sub-sample and remainder 96 | n_obs <- nrow(X) 97 | X_subs <- X[1L:n_subs,] 98 | y_subs <- y[1L:n_subs] 99 | X_rem <- X[(n_subs+1L):n_obs,] 100 | y_rem <- y[(n_subs+1L):n_obs] 101 | # 
apply the uluru estimator 102 | beta_uluru(X_subs, y_subs, X_rem, y_rem) 103 | 104 | # define sub-samples 105 | n_subs_sizes <- seq(from = 1000, to = 500000, by=10000) 106 | n_runs <- length(n_subs_sizes) 107 | # compute uluru result, stop time 108 | mc_results <- rep(NA, n_runs) 109 | mc_times <- rep(NA, n_runs) 110 | for (i in 1:n_runs) { 111 | # set size of sub-sample 112 | n_subs <- n_subs_sizes[i] 113 | # select sub-sample and remainder 114 | n_obs <- nrow(X) 115 | X_subs <- X[1L:n_subs,] 116 | y_subs <- y[1L:n_subs] 117 | X_rem <- X[(n_subs+1L):n_obs,] 118 | y_rem <- y[(n_subs+1L):n_obs] 119 | mc_results[i] <- beta_uluru(X_subs, 120 | y_subs, 121 | X_rem, 122 | y_rem)[2] # (1 is the intercept) 123 | mc_times[i] <- system.time(beta_uluru(X_subs, 124 | y_subs, 125 | X_rem, 126 | y_rem))[3] 127 | } 128 | # compute OLS results and OLS time 129 | ols_time <- system.time(beta_ols(X, y)) 130 | ols_res <- beta_ols(X, y)[2] 131 | 132 | # load packages 133 | library(ggplot2) 134 | # prepare data to plot 135 | plotdata <- data.frame(beta1 = mc_results, 136 | time_elapsed = mc_times, 137 | subs_size = n_subs_sizes) 138 | 139 | ggplot(plotdata, aes(x = subs_size, y = time_elapsed)) + 140 | geom_point(color="darkgreen") + 141 | geom_hline(yintercept = ols_time[3], 142 | color = "red", 143 | linewidth = 1) + 144 | theme_minimal() + 145 | ylab("Time elapsed") + 146 | xlab("Subsample size") 147 | 148 | 149 | 150 | ggplot(plotdata, aes(x = subs_size, y = beta1)) + 151 | geom_hline(yintercept = ols_res, 152 | color = "red", 153 | linewidth = 1) + 154 | geom_hline(yintercept = 1.5, 155 | color = "green", 156 | linewidth = 1) + 157 | geom_point(color="darkgreen") + 158 | theme_minimal() + 159 | ylab("Estimated coefficient") + 160 | xlab("Subsample size") 161 | -------------------------------------------------------------------------------- /R_code_examples/04_partII_software.R: -------------------------------------------------------------------------------- 1 | # how much time does it take to run this loop? 2 | system.time(for (i in 1:100) {i + 5}) 3 | 4 | # load package 5 | library(microbenchmark) 6 | # how much time does it take to run this loop (exactly)? 7 | microbenchmark(for (i in 1:100) {i + 5}) 8 | 9 | hello <- "Hello, World!" 10 | object.size(hello) 11 | 12 | # initialize a large string vector containing letters 13 | large_string <- rep(LETTERS[1:20], 1000^2) 14 | head(large_string) 15 | 16 | # store the same information as a factor in a new variable 17 | large_factor <- as.factor(large_string) 18 | 19 | # is one bigger than the other? 
20 | object.size(large_string) - object.size(large_factor) 21 | 22 | # load package 23 | library(pryr) 24 | 25 | # initialize a vector with 1000 (pseudo)-random numbers 26 | mem_change( 27 | thousand_numbers <- runif(1000) 28 | ) 29 | 30 | 31 | 32 | # initialize a vector with 1M (pseudo)-random numbers 33 | mem_change( 34 | a_million_numbers <- runif(1000^2) 35 | ) 36 | 37 | # load packages 38 | library(bench) 39 | 40 | # initialize variables 41 | x <- 1:10000 42 | z <- 1.5 43 | 44 | # approach I: loop 45 | multiplication <- 46 | function(x,z) { 47 | result <- c() 48 | for (i in 1:length(x)) {result <- c(result, x[i]*z)} 49 | return(result) 50 | } 51 | result <- multiplication(x,z) 52 | head(result) 53 | 54 | # approach II: "R-style" 55 | result2 <- x * z 56 | head(result2) 57 | 58 | # comparison 59 | benchmarking <- 60 | mark( 61 | result <- multiplication(x,z), 62 | result2 <- x * z, 63 | min_iterations = 50 64 | ) 65 | benchmarking[, 4:9] 66 | 67 | 68 | plot(benchmarking, type = "boxplot") 69 | 70 | # load package 71 | library(profvis) 72 | 73 | # analyze performance of several lines of code 74 | profvis({ 75 | x <- 1:10000 76 | z <- 1.5 77 | 78 | # approach I: loop 79 | multiplication <- 80 | function(x,z) { 81 | result <- c() 82 | for (i in 1:length(x)) {result <- c(result, x[i]*z)} 83 | return(result) 84 | } 85 | result <- multiplication(x,z) 86 | 87 | # approach II: "R-style" 88 | result2 <- x * z 89 | head(result2) 90 | }) 91 | 92 | # naïve implementation 93 | sqrt_vector <- 94 | function(x) { 95 | output <- c() 96 | for (i in 1:length(x)) { 97 | output <- c(output, x[i]^(1/2)) 98 | } 99 | 100 | return(output) 101 | } 102 | 103 | # implementation with pre-allocation of memory 104 | sqrt_vector_faster <- 105 | function(x) { 106 | output <- rep(NA, length(x)) 107 | for (i in 1:length(x)) { 108 | output[i] <- x[i]^(1/2) 109 | } 110 | 111 | return(output) 112 | } 113 | 114 | 115 | # the different sizes of the vectors we will put into the two functions 116 | input_sizes <- seq(from = 100, to = 10000, by = 100) 117 | # create the input vectors 118 | inputs <- sapply(input_sizes, rnorm) 119 | 120 | # compute outputs for each of the functions 121 | output_slower <- 122 | sapply(inputs, 123 | function(x){ system.time(sqrt_vector(x))["elapsed"] 124 | } 125 | ) 126 | output_faster <- 127 | sapply(inputs, 128 | function(x){ system.time(sqrt_vector_faster(x))["elapsed"] 129 | } 130 | ) 131 | 132 | # load packages 133 | library(ggplot2) 134 | 135 | # initialize data frame for plot 136 | plotdata <- data.frame(time_elapsed = c(output_slower, output_faster), 137 | input_size = c(input_sizes, input_sizes), 138 | Implementation= c(rep("sqrt_vector", 139 | length(output_slower)), 140 | rep("sqrt_vector_faster", 141 | length(output_faster)))) 142 | 143 | # plot 144 | ggplot(plotdata, aes(x=input_size, y= time_elapsed)) + 145 | geom_point(aes(colour=Implementation)) + 146 | theme_minimal(base_size = 18) + 147 | theme(legend.position = "bottom") + 148 | ylab("Time elapsed (in seconds)") + 149 | xlab("No. 
of elements processed") 150 | 151 | 152 | # implementation with vectorization 153 | sqrt_vector_fastest <- 154 | function(x) { 155 | output <- x^(1/2) 156 | return(output) 157 | } 158 | 159 | # speed test 160 | output_fastest <- 161 | sapply(inputs, 162 | function(x){ system.time(sqrt_vector_fastest(x))["elapsed"] 163 | } 164 | ) 165 | 166 | # load packages 167 | library(ggplot2) 168 | 169 | # initialize data frame for plot 170 | plotdata <- data.frame(time_elapsed = c(output_faster, output_fastest), 171 | input_size = c(input_sizes, input_sizes), 172 | Implementation= c(rep("sqrt_vector_faster", 173 | length(output_faster)), 174 | rep("sqrt_vector_fastest", 175 | length(output_fastest)))) 176 | 177 | # plot 178 | ggplot(plotdata, aes(x=time_elapsed, y=Implementation)) + 179 | geom_boxplot(aes(colour=Implementation), 180 | show.legend = FALSE) + 181 | theme_minimal(base_size = 18) + 182 | xlab("Time elapsed (in seconds)") 183 | 184 | 185 | 186 | 187 | # load packages 188 | library(data.table) 189 | 190 | # get a list of all file-paths 191 | textfiles <- list.files("data/twitter_texts", full.names = TRUE) 192 | 193 | 194 | # prepare loop 195 | all_texts <- list() 196 | n_files <- length(textfiles) 197 | length(all_texts) <- n_files 198 | # read all files listed in textfiles 199 | for (i in 1:n_files) { 200 | all_texts[[i]] <- fread(textfiles[i]) 201 | } 202 | 203 | 204 | # combine all in one data.table 205 | twitter_text <- rbindlist(all_texts) 206 | # check result 207 | dim(twitter_text) 208 | 209 | 210 | # use lapply instead of loop 211 | all_texts <- lapply(textfiles, fread) 212 | # combine all in one data.table 213 | twitter_text <- rbindlist(all_texts) 214 | # check result 215 | dim(twitter_text) 216 | 217 | 218 | # initialize the import function 219 | import_file <- 220 | function(x) { 221 | parsed_x <- fread(x) 222 | return(parsed_x) 223 | } 224 | 225 | # 'vectorize' it 226 | import_files <- Vectorize(import_file, SIMPLIFY = FALSE) 227 | 228 | # Apply the vectorized function 229 | all_texts <- import_files(textfiles) 230 | twitter_text <- rbindlist(all_texts) 231 | # check the result 232 | dim(twitter_text) 233 | 234 | a <- runif(10000) 235 | 236 | b <- a 237 | 238 | object_size(a) 239 | mem_change(c <- a) 240 | 241 | # load packages 242 | library(lobstr) 243 | 244 | # check memory addresses of objects 245 | obj_addr(a) 246 | obj_addr(b) 247 | 248 | # check the first element's value 249 | a[1] 250 | b[1] 251 | 252 | # modify a, check memory change 253 | mem_change(a[1] <- 0) 254 | 255 | # check memory addresses 256 | obj_addr(a) 257 | obj_addr(b) 258 | 259 | 260 | mem_change(d <- runif(10000)) 261 | mem_change(d[1] <- 0) 262 | 263 | mem_change(large_vector <- runif(10^8)) 264 | mem_change(rm(large_vector)) 265 | 266 | import_file 267 | 268 | sum 269 | 270 | # import data 271 | econ <- read.csv("data/economics.csv") 272 | 273 | # filter 274 | econ2 <- econ["1968-01-01"<=econ$date,] 275 | 276 | # compute yearly averages (basic R approach) 277 | econ2$year <- lubridate::year(econ2$date) 278 | years <- unique(econ2$year) 279 | averages <- 280 | sapply(years, FUN = function(x){ 281 | mean(econ2[econ2$year==x,"unemploy"]) 282 | }) 283 | output <- data.frame(year=years, average_unemploy=averages) 284 | 285 | # inspect the first few lines of the result 286 | head(output) 287 | 288 | 289 | 290 | 291 | SELECT 292 | 293 | strftime('%Y', `date`) AS year, 294 | 295 | AVG(unemploy) AS average_unemploy 296 | 297 | FROM econ 298 | 299 | WHERE "1968-01-01"<=`date` 300 | 301 | GROUP BY year LIMIT 6; 
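# A minimal sketch (assuming the DBI and RSQLite packages are installed) of how
# the SQL query shown above could be run from R against an in-memory SQLite
# database; the connection name `con_demo` is only an illustrative placeholder.
library(DBI)
library(RSQLite)
con_demo <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con_demo, "econ", econ)  # 'econ' as imported above
sql_aggregation <-
  "SELECT strftime('%Y', `date`) AS year,
          AVG(unemploy) AS average_unemploy
   FROM econ
   WHERE '1968-01-01' <= `date`
   GROUP BY year
   LIMIT 6;"
dbGetQuery(con_demo, sql_aggregation)
dbDisconnect(con_demo)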
302 | 303 | 304 | 305 | 306 | groupby 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | select_example 331 | 332 | 333 | 334 | simple_query 335 | 336 | 337 | 338 | 339 | 340 | # import data 341 | econ <- read.csv("data/economics.csv") 342 | inflation <- read.csv("data/inflation.csv") 343 | 344 | # prepare variable to match observations 345 | econ$year <- lubridate::year(econ$date) 346 | inflation$year <- lubridate::year(inflation$date) 347 | 348 | # create final output 349 | years <- unique(econ$year) 350 | averages <- sapply(years, FUN = function(x) { 351 | mean(econ[econ$year==x,"unemploy"]/econ[econ$year==x,"pop"])*100 352 | 353 | } ) 354 | unemp <- data.frame(year=years, 355 | average_unemp_percent=averages) 356 | # combine via the year column 357 | # keep all rows of econ 358 | output<- merge(unemp, inflation[, c("year", "inflation_percent")], by="year") 359 | # inspect output 360 | head(output) 361 | 362 | 363 | SELECT 364 | 365 | strftime('%Y', econ.date) AS year, 366 | 367 | AVG(unemploy/pop)*100 AS average_unemp_percent, 368 | 369 | inflation_percent 370 | 371 | FROM econ INNER JOIN inflation ON year = strftime('%Y', inflation.date) 372 | 373 | GROUP BY year 374 | 375 | 376 | innerjoin_example[1:6,] 377 | 378 | dbDisconnect(con) 379 | 380 | # replace "YOUR-API-KEY" with 381 | # your actual key 382 | Sys.setenv(OPENAI_API_KEY = "YOUR-API-KEY") 383 | # open chat window 384 | gptstudio:::chat_gpt_addin() 385 | 386 | select date, 387 | 388 | unemploy from econ 389 | 390 | where unemploy > 15000 391 | 392 | order by date; 393 | 394 | -------------------------------------------------------------------------------- /R_code_examples/05_partII_hardware.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | library(data.table) 3 | 4 | # load example data from basic R installation 5 | data("LifeCycleSavings") 6 | 7 | # write data to normal csv file and check size 8 | fwrite(LifeCycleSavings, file="lcs.csv") 9 | file.size("lcs.csv") 10 | 11 | # write data to a GZIPped (compressed) csv file and check size 12 | fwrite(LifeCycleSavings, file="lcs.csv.gz") 13 | file.size("lcs.csv.gz") 14 | 15 | # read/import the compressed data 16 | lcs <- data.table::fread("lcs.csv.gz") 17 | 18 | # common ZIP compression (independent of data.table package) 19 | write.csv(LifeCycleSavings, file="lcs.csv") 20 | file.size("lcs.csv") 21 | zip(zipfile = "lcs.csv.zip", files = "lcs.csv") 22 | file.size("lcs.csv.zip") 23 | 24 | # unzip/decompress and read/import data 25 | lcs_path <- unzip("lcs.csv.zip") 26 | lcs <- read.csv(lcs_path) 27 | 28 | 29 | 30 | 31 | 32 | # you can download the dataset from 33 | # https://www.kaggle.com/jackdaoud/marketing-data? 
34 | # select=marketing_data.csv 35 | 36 | # PREPARATION ----------------------------- 37 | # packages 38 | library(stringr) 39 | 40 | # import data 41 | marketing <- read.csv("data/marketing_data.csv") 42 | # clean/prepare data 43 | marketing$Income <- as.numeric(gsub("[[:punct:]]", 44 | "", 45 | marketing$Income)) 46 | marketing$days_customer <- 47 | as.Date(Sys.Date())- 48 | as.Date(marketing$Dt_Customer, "%m/%d/%y") 49 | marketing$Dt_Customer <- NULL 50 | 51 | # all sets of independent vars 52 | indep <- names(marketing)[ c(2:19, 27,28)] 53 | combinations_list <- lapply(1:length(indep), 54 | function(x) combn(indep, x, 55 | simplify = FALSE)) 56 | combinations_list <- unlist(combinations_list, 57 | recursive = FALSE) 58 | models <- lapply(combinations_list, 59 | function(x) paste("Response ~", 60 | paste(x, collapse="+"))) 61 | 62 | # COMPUTE REGRESSIONS -------------------------- 63 | N <- 10 # N <- length(models) for all 64 | pseudo_Rsq <- list() 65 | length(pseudo_Rsq) <- N 66 | for (i in 1:N) { 67 | # fit the logit model via maximum likelihood 68 | fit <- glm(models[[i]], 69 | data=marketing, 70 | family = binomial()) 71 | # compute the proportion of deviance explained by 72 | # the independent vars (~R^2) 73 | pseudo_Rsq[[i]] <- 1-(fit$deviance/fit$null.deviance) 74 | } 75 | 76 | # SELECT THE WINNER --------------- 77 | models[[which.max(pseudo_Rsq)]] 78 | 79 | 80 | # COMPUTE REGRESSIONS -------------------------- 81 | N <- 10 # N <- length(models) for all 82 | run_reg <- 83 | function(model, data, family){ 84 | # fit the logit model via maximum likelihood 85 | fit <- glm(model, data=data, family = family) 86 | # compute and return the proportion of deviance explained by 87 | # the independent vars (~R^2) 88 | return(1-(fit$deviance/fit$null.deviance)) 89 | } 90 | 91 | pseudo_Rsq_list <-lapply(models[1:N], run_reg, data=marketing, family=binomial() ) 92 | pseudo_Rsq <- unlist(pseudo_Rsq_list) 93 | 94 | # SELECT THE WINNER --------------- 95 | models[[which.max(pseudo_Rsq)]] 96 | 97 | 98 | 99 | # SET UP ------------------ 100 | 101 | # load packages 102 | library(future) 103 | library(future.apply) 104 | # instruct the package to resolve 105 | # futures in parallel (via a SOCK cluster) 106 | plan(multisession) 107 | 108 | # COMPUTE REGRESSIONS -------------------------- 109 | N <- 10 # N <- length(models) for all 110 | pseudo_Rsq_list <- future_lapply(models[1:N], 111 | run_reg, 112 | data=marketing, 113 | family=binomial() ) 114 | pseudo_Rsq <- unlist(pseudo_Rsq_list) 115 | 116 | # SELECT THE WINNER --------------- 117 | models[[which.max(pseudo_Rsq)]] 118 | 119 | 120 | # COMPUTE REGRESSIONS IN PARALLEL (MULTI-CORE) -------------------------- 121 | 122 | # packages for parallel processing 123 | library(parallel) 124 | library(doSNOW) 125 | 126 | # get the number of cores available 127 | ncores <- parallel::detectCores() 128 | # set cores for parallel processing 129 | ctemp <- makeCluster(ncores) 130 | registerDoSNOW(ctemp) 131 | 132 | # prepare loop 133 | N <- 10000 # N <- length(models) for all 134 | # run loop in parallel 135 | pseudo_Rsq <- 136 | foreach ( i = 1:N, .combine = c) %dopar% { 137 | # fit the logit model via maximum likelihood 138 | fit <- glm(models[[i]], 139 | data=marketing, 140 | family = binomial()) 141 | # compute the proportion of deviance explained by 142 | # the independent vars (~R^2) 143 | return(1-(fit$deviance/fit$null.deviance)) 144 | } 145 | 146 | # SELECT THE WINNER --------------- 147 | models[[which.max(pseudo_Rsq)]] 148 | 149 | 150 | # COMPUTE 
REGRESSIONS IN PARALLEL (MULTI-CORE) --------------- 151 | 152 | 153 | # prepare parallel lapply (based on forking, 154 | # here clearly faster than foreach) 155 | N <- 10000 # N <- length(models) for all 156 | # run parallel lapply 157 | pseudo_Rsq <- mclapply(1:N, 158 | mc.cores = ncores, 159 | FUN = function(i){ 160 | # fit the logit model 161 | fit <- glm(models[[i]], 162 | data=marketing, 163 | family = binomial()) 164 | # compute the proportion of deviance 165 | # explained by the independent vars (~R^2) 166 | return(1-(fit$deviance/fit$null.deviance)) 167 | }) 168 | 169 | # SELECT THE WINNER, SHOW FINAL OUTPUT --------------- 170 | 171 | best_model <- models[[which.max(pseudo_Rsq)]] 172 | best_model 173 | 174 | 175 | 176 | 177 | 178 | 179 | # load package 180 | library(bench) 181 | library(gpuR) 182 | 183 | 184 | # initialize dataset with pseudo-random numbers 185 | N <- 10000 # number of observations 186 | P <- 100 # number of variables 187 | X <- matrix(rnorm(N * P, 0, 1), nrow = N, ncol =P) 188 | 189 | 190 | # prepare GPU-specific objects/settings 191 | # transfer matrix to GPU (matrix stored in GPU memory) 192 | vclX <- vclMatrix(X, type = "float") 193 | 194 | # compare three approaches 195 | gpu_cpu <- bench::mark( 196 | 197 | # compute with CPU 198 | cpu <-t(X) %*% X, 199 | 200 | # GPU version, in GPU memory 201 | # (vclMatrix formation is a memory transfer) 202 | gpu <- t(vclX) %*% vclX, 203 | 204 | check = FALSE, memory = FALSE, min_iterations = 200) 205 | 206 | plot(gpu_cpu, type = "boxplot") 207 | 208 | include_graphics("img/gpu_cpu.png") 209 | -------------------------------------------------------------------------------- /R_code_examples/06_partII_distributedsystems.R: -------------------------------------------------------------------------------- 1 | # initialize the input text (for simplicity as one text string) 2 | input_text <- 3 | "Apple Orange Mango 4 | Orange Grapes Plum 5 | Apple Plum Mango 6 | Apple Apple Plum" 7 | 8 | 9 | # Mapper splits input into lines 10 | lines <- as.list(strsplit(input_text, "\n")[[1]]) 11 | lines[1:2] 12 | 13 | 14 | # Mapper splits lines into key–value pairs 15 | map_fun <- 16 | function(x){ 17 | 18 | # remove special characters 19 | x_clean <- gsub("[[:punct:]]", "", x) 20 | # split line into words 21 | keys <- unlist(strsplit(x_clean, " ")) 22 | # initialize key–value pairs 23 | key_values <- rep(1, length(keys)) 24 | names(key_values) <- keys 25 | 26 | return(key_values) 27 | } 28 | 29 | kv_pairs <- Map(map_fun, lines) 30 | 31 | # look at the result 32 | kv_pairs[1:2] 33 | 34 | # order and shuffle 35 | kv_pairs <- unlist(kv_pairs) 36 | keys <- unique(names(kv_pairs)) 37 | keys <- keys[order(keys)] 38 | shuffled <- lapply(keys, 39 | function(x) kv_pairs[x == names(kv_pairs)]) 40 | shuffled[1:2] 41 | 42 | sums <- lapply(shuffled, Reduce, f=sum) 43 | names(sums) <- keys 44 | sums[1:2] 45 | 46 | # create directory for input files (typically text files) 47 | 48 | mkdir ~/input 49 | 50 | 51 | echo "Apple Orange Mango 52 | 53 | Orange Grapes Plum 54 | 55 | Apple Plum Mango 56 | 57 | Apple Apple Plum" >> ~/input/text.txt 58 | 59 | 60 | 61 | 62 | # run mapreduce word count 63 | 64 | /usr/local/hadoop/bin/hadoop jar \ 65 | 66 | /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.10.1.jar \ 67 | 68 | wordcount 69 | 70 | ~/input ~/wc_example 71 | 72 | 73 | cat ~/wc_example/* 74 | 75 | 76 | 77 | 78 | # might have to switch to java version 8 first 79 | 80 | sudo update-alternatives --config java 81 | 82 | 83 | 84 | 85 | $ 
SPARK-HOME/bin/sparkR 86 | 87 | 88 | # to install use 89 | # devtools::install_github("cran/SparkR") 90 | # load packages 91 | library(SparkR) 92 | # start session 93 | sparkR.session() 94 | 95 | 96 | # install.packages("SparkR") 97 | # or, if temporarily not available on CRAN: 98 | #if (!require('devtools')) install.packages('devtools') 99 | #devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') # replace x.x with the version of your spark installation 100 | 101 | # load packages 102 | library(SparkR) 103 | 104 | # start session 105 | sparkR.session(sparkHome = "/home/umatter/.cache/spark/spark-3.1.2-bin-hadoop2.7") 106 | 107 | 108 | 109 | # Import data and create a SparkDataFrame 110 | # (a distributed collection of data, RDD) 111 | flights <- read.df("data/flights.csv", source = "csv", header="true") 112 | 113 | # inspect the object 114 | class(flights) 115 | dim(flights) 116 | 117 | 118 | flights$dep_delay <- cast(flights$dep_delay, "double") 119 | flights$dep_time <- cast(flights$dep_time, "double") 120 | flights$arr_time <- cast(flights$arr_time, "double") 121 | flights$arr_delay <- cast(flights$arr_delay, "double") 122 | flights$air_time <- cast(flights$air_time, "double") 123 | flights$distance <- cast(flights$distance, "double") 124 | 125 | # filter 126 | long_flights <- select(flights, "carrier", "year", "arr_delay", "distance") 127 | long_flights <- filter(long_flights, long_flights$distance >= 1000) 128 | head(long_flights) 129 | 130 | # aggregation: mean delay per carrier 131 | long_flights_delays<- summarize(groupBy(long_flights, long_flights$carrier), 132 | avg_delay = mean(long_flights$arr_delay)) 133 | head(long_flights_delays) 134 | 135 | # Convert result back into native R object 136 | delays <- collect(long_flights_delays) 137 | class(delays) 138 | delays 139 | 140 | cd SPARK-HOME 141 | 142 | 143 | 144 | 145 | $ bin/spark-sql 146 | 147 | 148 | {"name":"Michael", "salary":3000} 149 | 150 | {"name":"Andy", "salary":4500} 151 | 152 | {"name":"Justin", "salary":3500} 153 | 154 | {"name":"Berta", "salary":4000} 155 | 156 | 157 | 158 | 159 | SELECT * 160 | 161 | FROM json.`examples/src/main/resources/employees.json` 162 | 163 | ; 164 | 165 | 166 | 167 | 168 | 169 | 170 | SELECT * 171 | 172 | FROM json.`examples/src/main/resources/employees.json` 173 | 174 | WHERE salary <4000 175 | 176 | ; 177 | 178 | 179 | 180 | 181 | 182 | 183 | SELECT AVG(salary) AS mean_salary 184 | 185 | FROM json.`examples/src/main/resources/employees.json`; 186 | 187 | 188 | 189 | 190 | # to install use 191 | # devtools::install_github("cran/SparkR") 192 | # load packages 193 | library(SparkR) 194 | # start session 195 | sparkR.session() 196 | # read data 197 | flights <- read.df("data/flights.csv", source = "csv", header="true") 198 | 199 | 200 | # register the data frame as a table 201 | createOrReplaceTempView(flights, "flights" ) 202 | 203 | # now run SQL queries on it 204 | query <- 205 | "SELECT DISTINCT carrier, 206 | year, 207 | arr_delay, 208 | distance 209 | FROM flights 210 | WHERE 1000 <= distance" 211 | 212 | long_flights2 <- sql(query) 213 | head(long_flights2) 214 | 215 | -------------------------------------------------------------------------------- /R_code_examples/07_partII_cloudcomputing.R: -------------------------------------------------------------------------------- 1 | # install packages for parallelization 2 | install.packages("parallel", "doSNOW", "stringr") 3 | 4 | # load packages 5 | library(parallel) 6 | library(doSNOW) 7 | 8 | # verify no. 
of cores available 9 | n_cores <- detectCores() 10 | n_cores 11 | 12 | 13 | 14 | # PREPARATION ----------------------------- 15 | 16 | # packages 17 | library(stringr) 18 | 19 | # import data 20 | marketing <- read.csv("data/marketing_data.csv") 21 | # clean/prepare data 22 | marketing$Income <- as.numeric(gsub("[[:punct:]]", "", marketing$Income)) 23 | marketing$days_customer <- as.Date(Sys.Date())- 24 | as.Date(marketing$Dt_Customer, "%m/%d/%y") 25 | marketing$Dt_Customer <- NULL 26 | 27 | # all sets of independent vars 28 | indep <- names(marketing)[ c(2:19, 27,28)] 29 | combinations_list <- lapply(1:length(indep), 30 | function(x) combn(indep, x, simplify = FALSE)) 31 | combinations_list <- unlist(combinations_list, recursive = FALSE) 32 | models <- lapply(combinations_list, 33 | function(x) paste("Response ~", paste(x, collapse="+"))) 34 | 35 | # set cores for parallel processing 36 | # ctemp <- makeCluster(ncores) 37 | # registerDoSNOW(ctemp) 38 | 39 | # prepare loop 40 | N <- 10 # just for illustration, the actual code is N <- length(models) 41 | # run loop in parallel 42 | pseudo_Rsq <- 43 | foreach ( i = 1:N, .combine = c) %dopar% { 44 | # fit the logit model via maximum likelihood 45 | fit <- glm(models[[i]], data=marketing, family = binomial()) 46 | # compute the proportion of deviance explained 47 | #by the independent vars (~R^2) 48 | return(1-(fit$deviance/fit$null.deviance)) 49 | } 50 | 51 | 52 | # set cores for parallel processing 53 | ctemp <- makeCluster(ncores) 54 | registerDoSNOW(ctemp) 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | library(knitr) 66 | hook_output = knit_hooks$get('output') 67 | knit_hooks$set(output = function(x, options) { 68 | # this hook is used only when the linewidth option is not NULL 69 | if (!is.null(n <- options$linewidth)) { 70 | x = xfun::split_lines(x) 71 | # any lines wider than n should be wrapped 72 | if (any(nchar(x) > n)) x = strwrap(x, width = n) 73 | x = paste(x, collapse = '\n') 74 | } 75 | hook_output(x, options) 76 | }) 77 | 78 | aws emr create-cluster \ 79 | 80 | --release-label emr-6.1.0 \ 81 | 82 | --applications Name=Hadoop Name=Spark Name=Hive Name=Pig \ 83 | 84 | Name=Tez Name=Ganglia \ 85 | 86 | --name "EMR 6.1 RStudio + sparklyr" \ 87 | 88 | --service-role EMR_DefaultRole \ 89 | 90 | --instance-groups InstanceGroupType=MASTER,InstanceCount=1,\ 91 | 92 | InstanceType=m3.2xlarge,InstanceGroupType=CORE,\ 93 | 94 | InstanceCount=2,InstanceType=m3.2xlarge \ 95 | 96 | --bootstrap-action \ 97 | 98 | Path='s3://aws-bigdata-blog/artifacts/ 99 | 100 | aws-blog-emr-rstudio-sparklyr/rstudio_sparklyr_emr6.sh',\ 101 | 102 | Name="Install RStudio" --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,\ 103 | 104 | KeyName="sparklyr" 105 | 106 | --configurations '[{"Classification":"spark", 107 | 108 | "Properties":{"maximizeResourceAllocation":"true"}}]' \ 109 | 110 | --region us-east-1 111 | 112 | 113 | 114 | 115 | # load packages 116 | library(sparklyr) 117 | # connect rstudio session to cluster 118 | sc <- spark_connect(master = "yarn") 119 | 120 | -------------------------------------------------------------------------------- /R_code_examples/09_partIII_cleaning_transformation.R: -------------------------------------------------------------------------------- 1 | fs::file_size("data/flights.csv") 2 | 3 | if (dir.exists("ff_files")){ 4 | unlink("ff_files", recursive = TRUE, force = TRUE) 5 | } 6 | 7 | 8 | 9 | # SET UP -------------- 10 | # install.packages(c("ff", "ffbase")) 11 | # you might have to install the ffbase package 
directly from GitHub: 12 | # devtools::install_github("edwindj/ffbase", subdir="pkg") 13 | # load packages 14 | library(ff) 15 | library(ffbase) 16 | library(data.table) # for comparison 17 | 18 | 19 | # create directory for ff chunks, and assign directory to ff 20 | system("mkdir ff_files") 21 | options(fftempdir = "ff_files") 22 | 23 | 24 | # usual in-memory csv import 25 | flights_dt <- fread("data/flights.csv") 26 | 27 | # out-of-memory approach 28 | flights <- 29 | read.table.ffdf(file="data/flights.csv", 30 | sep=",", 31 | VERBOSE=TRUE, 32 | header=TRUE, 33 | next.rows=100000, 34 | colClasses=NA) 35 | 36 | # compare object sizes 37 | object.size(flights) # out-of-memory approach 38 | object.size(flights_dt) # common data.table 39 | 40 | # show the files in the directory keeping the chunks 41 | head(list.files("ff_files")) 42 | 43 | 44 | 45 | # SET UP ---------------- 46 | 47 | # load packages 48 | library(bigmemory) 49 | library(biganalytics) 50 | 51 | # import the data 52 | flights <- read.big.matrix("data/flights.csv", 53 | type="integer", 54 | header=TRUE, 55 | backingfile="flights.bin", 56 | descriptorfile="flights.desc") 57 | 58 | object.size(flights) 59 | str(flights) 60 | 61 | 62 | # SET UP ---------------- 63 | 64 | # load packages 65 | library(arrow) 66 | 67 | # import the data 68 | flights <- read_csv_arrow("data/flights.csv", 69 | as_data_frame = FALSE) 70 | 71 | summary(flights) 72 | object.size(flights) 73 | 74 | 75 | SET UP ------------------------ 76 | 77 | # create and set directory for ff files 78 | system("mkdir ff_files") 79 | options(fftempdir = "ff_files") 80 | 81 | # load packages 82 | library(ff) 83 | library(ffbase) 84 | library(pryr) 85 | 86 | # fix vars 87 | FLIGHTS_DATA <- "data/flights_sep_oct15.txt" 88 | AIRLINES_DATA <- "data/airline_id.csv" 89 | 90 | 91 | 92 | # DATA IMPORT ------------------ 93 | 94 | # check memory used 95 | mem_used() 96 | 97 | # 1. Upload flights_sep_oct15.txt and airline_id.csv files from flat files. 98 | 99 | system.time(flights.ff <- read.table.ffdf(file=FLIGHTS_DATA, 100 | sep=",", 101 | VERBOSE=TRUE, 102 | header=TRUE, 103 | next.rows=100000, 104 | colClasses=NA)) 105 | 106 | system.time(airlines.ff <- read.csv.ffdf(file= AIRLINES_DATA, 107 | VERBOSE=TRUE, 108 | header=TRUE, 109 | next.rows=100000, 110 | colClasses=NA)) 111 | 112 | # check memory used 113 | mem_used() 114 | 115 | 116 | # Using read.table() 117 | system.time(flights.table <- read.table(FLIGHTS_DATA, 118 | sep=",", 119 | header=TRUE)) 120 | system.time(airlines.table <- read.csv(AIRLINES_DATA, 121 | header = TRUE)) 122 | # check the memory used 123 | mem_used() 124 | 125 | 126 | # 2. Inspect the ff_files objects. 127 | For flights.ff object: 128 | class(flights.ff) 129 | dim(flights.ff) 130 | For airlines.ff object: 131 | class(airlines.ff) 132 | dim(airlines.ff) 133 | 134 | 135 | # step 1: 136 | # Rename "Code" variable from airlines.ff 137 | # to "AIRLINE_ID" and "Description" into "AIRLINE_NM". 
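# (Optional sanity check before renaming, purely illustrative: this assumes
#  airlines.ff still carries the two original columns, "Code" and "Description",
#  as imported above; adjust the renaming below if your source file differs.)
names(airlines.ff)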
138 | names(airlines.ff) <- c("AIRLINE_ID", "AIRLINE_NM") 139 | names(airlines.ff) 140 | str(airlines.ff[1:20,]) 141 | 142 | # merge of ff_files objects 143 | mem_change(flights.data.ff <- merge.ffdf(flights.ff, 144 | airlines.ff, 145 | by="AIRLINE_ID")) 146 | #The new object is only 551.2 KB in size 147 | class(flights.data.ff) 148 | dim(flights.data.ff) 149 | names(flights.data.ff) 150 | 151 | ##For flights.table: 152 | names(airlines.table) <- c("AIRLINE_ID", "AIRLINE_NM") 153 | names(airlines.table) 154 | str(airlines.table[1:20,]) 155 | 156 | # check memory usage of merge in RAM 157 | mem_change(flights.data.table <- merge(flights.table, 158 | airlines.table, 159 | by="AIRLINE_ID")) 160 | #The new object is already 105.7 MB in size 161 | #A rapid spike in RAM use when processing 162 | 163 | mem_used() 164 | 165 | # Subset the ff_files object flights.data.ff: 166 | subs1.ff <- 167 | subset.ffdf(flights.data.ff, 168 | CANCELLED == 1, 169 | select = c(FL_DATE, 170 | AIRLINE_ID, 171 | ORIGIN_CITY_NAME, 172 | ORIGIN_STATE_NM, 173 | DEST_CITY_NAME, 174 | DEST_STATE_NM, 175 | CANCELLATION_CODE)) 176 | 177 | dim(subs1.ff) 178 | mem_used() 179 | 180 | 181 | # Save a newly created ff_files object to a data file: 182 | # (7 files (one for each column) created in the ffdb directory) 183 | save.ffdf(subs1.ff, overwrite = TRUE) 184 | 185 | 186 | # Loading previously saved ff_files files: 187 | rm(subs1.ff) 188 | #gc() 189 | load.ffdf("ffdb") 190 | # check the class and structure of the loaded data 191 | class(subs1.ff) 192 | dim(subs1.ff) 193 | dimnames(subs1.ff) 194 | 195 | # Export subs1.ff into CSV and TXT files: 196 | write.csv.ffdf(subs1.ff, "subset1.csv") 197 | 198 | 199 | 200 | # SET UP ---------------- 201 | 202 | # load packages 203 | library(arrow) 204 | library(dplyr) 205 | library(pryr) # for profiling 206 | 207 | # fix vars 208 | FLIGHTS_DATA <- "data/flights_sep_oct15.txt" 209 | AIRLINES_DATA <- "data/airline_id.csv" 210 | 211 | # import the data 212 | flights <- read_csv_arrow(FLIGHTS_DATA, 213 | as_data_frame = FALSE) 214 | airlines <- read_csv_arrow(AIRLINES_DATA, 215 | as_data_frame = FALSE) 216 | 217 | class(flights) 218 | class(airlines) 219 | object_size(flights) 220 | object_size(airlines) 221 | 222 | # step 1: 223 | # Rename "Code" variable from airlines.ff to "AIRLINE_ID" 224 | # and "Description" into "AIRLINE_NM". 
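# (Note: unlike the ff workflow above, `airlines` here is an Arrow Table,
#  because read_csv_arrow() was called with as_data_frame = FALSE. Renaming via
#  names()<- works the same way as for a data frame, and the dplyr verbs used
#  further below are evaluated lazily until collect() or compute() is called.)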
225 | names(airlines) <- c("AIRLINE_ID", "AIRLINE_NM") 226 | names(airlines) 227 | 228 | # merge the two datasets via Arrow 229 | flights.data.ar <- inner_join(airlines, flights, by="AIRLINE_ID") 230 | object_size(flights.data.ar) 231 | 232 | 233 | # Subset the ff_files object flights.data.ff: 234 | subs1.ar <- 235 | flights.data.ar %>% 236 | filter(CANCELLED == 1) %>% 237 | select(FL_DATE, 238 | AIRLINE_ID, 239 | ORIGIN_CITY_NAME, 240 | ORIGIN_STATE_NM, 241 | DEST_CITY_NAME, 242 | DEST_STATE_NM, 243 | CANCELLATION_CODE) 244 | 245 | object_size(subs1.ar) 246 | 247 | mem_change(subs1.ar.df <- collect(subs1.ar)) 248 | class(subs1.ar.df) 249 | object_size(subs1.ar.df) 250 | 251 | subs1.ar %>% 252 | compute() %>% 253 | write_csv_arrow(file="data/subs1.ar.csv") 254 | -------------------------------------------------------------------------------- /R_code_examples/10_partIII_descriptives_aggregation.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | library(ff) 3 | library(ffbase) 4 | 5 | # set up the ff directory (for data file chunks) 6 | if (!dir.exists("fftaxi")){ 7 | system("mkdir fftaxi") 8 | } 9 | options(fftempdir = "fftaxi") 10 | 11 | # import the first one million observations 12 | taxi <- read.table.ffdf(file = "data/tlc_trips.csv", 13 | sep = ",", 14 | header = TRUE, 15 | next.rows = 100000, 16 | # colClasses= col_classes, 17 | nrows = 1000000 18 | ) 19 | 20 | 21 | # inspect the factor levels 22 | levels(taxi$Payment_Type) 23 | # recode them 24 | levels(taxi$Payment_Type) <- tolower(levels(taxi$Payment_Type)) 25 | taxi$Payment_Type <- ff(taxi$Payment_Type, 26 | levels = unique(levels(taxi$Payment_Type)), 27 | ramclass = "factor") 28 | # check result 29 | levels(taxi$Payment_Type) 30 | 31 | 32 | 33 | # load packages 34 | library(doBy) 35 | 36 | # split-apply-combine procedure on data file chunks 37 | tip_pcategory <- ffdfdply(taxi, 38 | split = taxi$Payment_Type, 39 | BATCHBYTES = 100000000, 40 | FUN = function(x) { 41 | summaryBy(Tip_Amt~Payment_Type, 42 | data = x, 43 | FUN = mean, 44 | na.rm = TRUE)}) 45 | 46 | as.data.frame(tip_pcategory) 47 | 48 | # add additional column with the share of tip 49 | taxi$percent_tip <- (taxi$Tip_Amt/taxi$Total_Amt)*100 50 | 51 | # recompute the aggregate stats 52 | tip_pcategory <- ffdfdply(taxi, 53 | split = taxi$Payment_Type, 54 | BATCHBYTES = 100000000, 55 | FUN = function(x) { 56 | # note the difference here 57 | summaryBy(percent_tip~Payment_Type, 58 | data = x, 59 | FUN = mean, 60 | na.rm = TRUE)}) 61 | # show result as data frame 62 | as.data.frame(tip_pcategory) 63 | 64 | table.ff(taxi$Payment_Type) 65 | 66 | # select the subset of observations only containing trips paid by 67 | # credit card or cash 68 | taxi_sub <- subset.ffdf(taxi, Payment_Type=="credit" | Payment_Type == "cash") 69 | taxi_sub$Payment_Type <- ff(taxi_sub$Payment_Type, 70 | levels = c("credit", "cash"), 71 | ramclass = "factor") 72 | 73 | # compute the cross tabulation 74 | crosstab <- table.ff(taxi_sub$Passenger_Count, 75 | taxi_sub$Payment_Type 76 | ) 77 | # add names to the margins 78 | names(dimnames(crosstab)) <- c("Passenger count", "Payment type") 79 | # show result 80 | crosstab 81 | 82 | # install.packages(vcd) 83 | # load package for mosaic plot 84 | library(vcd) 85 | 86 | # generate a mosaic plot 87 | mosaic(crosstab, shade = TRUE) 88 | 89 | # load packages 90 | library(arrow) 91 | library(dplyr) 92 | 93 | # read the CSV file 94 | taxi <- read_csv_arrow("data/tlc_trips.csv", 95 | as_data_frame = FALSE) 96 
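# (Note: with as_data_frame = FALSE the taxi data are held as an Arrow Table
#  outside R's memory-managed heap; the mutate/group_by/summarize chain below
#  is only executed once collect() is called. A purely illustrative check of
#  how little of the data sits in RAM:
#  format(object.size(taxi), units = "MB"))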
| 97 | 98 | 99 | # clean the categorical variable; aggregate by group 100 | taxi <- 101 | taxi %>% 102 | mutate(Payment_Type = tolower(Payment_Type)) 103 | 104 | taxi_summary <- 105 | taxi %>% 106 | mutate(percent_tip = (Tip_Amt/Total_Amt)*100 ) %>% 107 | group_by(Payment_Type) %>% 108 | summarize(avg_percent_tip = mean(percent_tip)) %>% 109 | collect() 110 | 111 | library(tidyr) 112 | 113 | # compute the frequencies; pull result into R 114 | ct <- taxi %>% 115 | filter(Payment_Type %in% c("credit", "cash")) %>% 116 | group_by(Passenger_Count, Payment_Type) %>% 117 | summarize(n=n())%>% 118 | collect() 119 | 120 | # present as cross-tabulation 121 | pivot_wider(data=ct, 122 | names_from="Passenger_Count", 123 | values_from = "n") 124 | 125 | 126 | # load packages 127 | library(data.table) 128 | 129 | # import data into RAM (needs around 200MB) 130 | taxi <- fread("data/tlc_trips.csv", 131 | nrows = 1000000) 132 | 133 | 134 | # clean the factor levels 135 | taxi$Payment_Type <- tolower(taxi$Payment_Type) 136 | taxi$Payment_Type <- factor(taxi$Payment_Type, 137 | levels = unique(taxi$Payment_Type)) 138 | 139 | 140 | taxi[, mean(Tip_Amt/Total_Amt)] 141 | 142 | taxi[, .(percent_tip = mean((Tip_Amt/Total_Amt)*100)), by = Payment_Type] 143 | 144 | dcast(taxi[Payment_Type %in% c("credit", "cash")], 145 | Passenger_Count~Payment_Type, 146 | fun.aggregate = length, 147 | value.var = "vendor_name") 148 | 149 | # housekeeping 150 | #gc() 151 | system("rm -r fftaxi") 152 | -------------------------------------------------------------------------------- /R_code_examples/12_partIV_data_analyticsI.R: -------------------------------------------------------------------------------- 1 | # SET UP ------------------ 2 | # load packages 3 | library(foreign) 4 | library(data.table) 5 | library(lmtest) 6 | # fix vars 7 | DATA_PATH <- "data/data_for_tables.dta" 8 | 9 | # import data 10 | cm <- as.data.table(read.dta(DATA_PATH)) 11 | # keep only clean obs 12 | cm <- cm[!(is.na(yes) 13 | |is.na(pctsumyessameparty) 14 | |is.na(pctsumyessameschool) 15 | |is.na(pctsumyessamestate))] 16 | 17 | 18 | # pooled model (no FE) 19 | model0 <- yes ~ 20 | pctsumyessameschool + 21 | pctsumyessamestate + 22 | pctsumyessameparty 23 | 24 | dim(model.matrix(model0, data=cm)) 25 | 26 | model1 <- 27 | yes ~ pctsumyessameschool + 28 | pctsumyessamestate + 29 | pctsumyessameparty + 30 | factor(congress) + 31 | factor(id) -1 32 | mm1 <- model.matrix(model1, data=cm) 33 | dim(mm1) 34 | 35 | 36 | # fit specification (1) 37 | runtime <- system.time(fit1 <- lm(data = cm, formula = model1)) 38 | coeftest(fit1)[2:4,] 39 | # median amount of time needed for estimation 40 | runtime[3] 41 | 42 | # illustration of within transformation for the senator fixed effects 43 | cm_within <- 44 | with(cm, data.table(yes = yes - ave(yes, id), 45 | pctsumyessameschool = pctsumyessameschool - 46 | ave(pctsumyessameschool, id), 47 | pctsumyessamestate = pctsumyessamestate - 48 | ave(pctsumyessamestate, id), 49 | pctsumyessameparty = pctsumyessameparty - 50 | ave(pctsumyessameparty, id) 51 | )) 52 | 53 | # comparison of dummy fixed effects estimator and within estimator 54 | dummy_time <- system.time(fit_dummy <- 55 | lm(yes ~ pctsumyessameschool + 56 | pctsumyessamestate + 57 | pctsumyessameparty + 58 | factor(id) -1, data = cm 59 | )) 60 | within_time <- system.time(fit_within <- 61 | lm(yes ~ pctsumyessameschool + 62 | pctsumyessamestate + 63 | pctsumyessameparty -1, 64 | data = cm_within)) 65 | # computation time comparison 66 | 
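# (The ratio computed below is the elapsed time of the within estimator
#  relative to the dummy-variable estimator; a value well below 1 indicates
#  that the within transformation absorbs the senator fixed effects much
#  faster than estimating all the dummies explicitly.)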
as.numeric(within_time[3])/as.numeric(dummy_time[3]) 67 | 68 | # comparison of estimates 69 | coeftest(fit_dummy)[1:3,] 70 | coeftest(fit_within) 71 | 72 | 73 | library(lfe) 74 | 75 | # model and clustered SE specifications 76 | model1 <- yes ~ pctsumyessameschool + 77 | pctsumyessamestate + 78 | pctsumyessameparty |congress+id|0|id 79 | model2 <- yes ~ pctsumyessameschool + 80 | pctsumyessamestate + 81 | pctsumyessameparty |congress_session_votenumber+id|0|id 82 | 83 | # estimation 84 | fit1 <- felm(model1, data=cm) 85 | fit2 <- felm(model2, data=cm) 86 | 87 | stargazer::stargazer(fit1,fit2, 88 | type="text", 89 | dep.var.labels = "Vote (yes/no)", 90 | covariate.labels = c("School Connected Votes", 91 | "State Votes", 92 | "Party Votes"), 93 | keep.stat = c("adj.rsq", "n")) 94 | 95 | # read dataset into R 96 | economics <- read.csv("data/economics.csv") 97 | # have a look at the data 98 | head(economics, 2) 99 | # create a 'large' dataset out of this 100 | for (i in 1:3) { 101 | economics <- rbind(economics, economics) 102 | } 103 | dim(economics) 104 | 105 | 106 | # Naïve approach (ignorant of R) 107 | deflator <- 1.05 # define deflator 108 | # iterate through each observation 109 | pce_real <- c() 110 | n_obs <- length(economics$pce) 111 | for (i in 1:n_obs) { 112 | pce_real <- c(pce_real, economics$pce[i]/deflator) 113 | } 114 | 115 | # look at the result 116 | head(pce_real, 2) 117 | 118 | 119 | 120 | # Naïve approach (ignorant of R) 121 | deflator <- 1.05 # define deflator 122 | # iterate through each observation 123 | pce_real <- list() 124 | n_obs <- length(economics$pce) 125 | time_elapsed <- 126 | system.time( 127 | for (i in 1:n_obs) { 128 | pce_real <- c(pce_real, economics$pce[i]/deflator) 129 | }) 130 | 131 | time_elapsed 132 | 133 | 134 | 135 | time_per_row <- time_elapsed[3]/n_obs 136 | time_per_row 137 | 138 | 139 | # in seconds 140 | (time_per_row*100^4) 141 | # in minutes 142 | (time_per_row*100^4)/60 143 | # in hours 144 | (time_per_row*100^4)/60^2 145 | 146 | 147 | # Improve memory allocation (still somewhat ignorant of R) 148 | deflator <- 1.05 # define deflator 149 | n_obs <- length(economics$pce) 150 | # allocate memory beforehand 151 | # Initialize the vector to the right size 152 | pce_real <- rep(NA, n_obs) 153 | # iterate through each observation 154 | time_elapsed <- 155 | system.time( 156 | for (i in 1:n_obs) { 157 | pce_real[i] <- economics$pce[i]/deflator 158 | }) 159 | 160 | 161 | 162 | 163 | time_per_row <- time_elapsed[3]/n_obs 164 | time_per_row 165 | 166 | 167 | # in seconds 168 | (time_per_row*100^4) 169 | # in minutes 170 | (time_per_row*100^4)/60 171 | # in hours 172 | (time_per_row*100^4)/60^2 173 | 174 | 175 | # Do it 'the R way' 176 | deflator <- 1.05 # define deflator 177 | # Exploit R's vectorization 178 | time_elapsed <- 179 | system.time( 180 | pce_real <- economics$pce/deflator 181 | ) 182 | # same result 183 | head(pce_real, 2) 184 | 185 | 186 | library(microbenchmark) 187 | # measure elapsed time in microseconds (avg.) 
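# (Note: microbenchmark() evaluates the expression repeatedly, 100 times by
#  default, and summary() reports timings in the unit shown in its output; the
#  division by 10^6 further below assumes that this unit is microseconds. If in
#  doubt, you can pin the unit explicitly, e.g. summary(..., unit = "us"),
#  provided your version of the package supports the unit argument.)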
188 | time_elapsed <- 189 | summary(microbenchmark(pce_real <- economics$pce/deflator))$mean 190 | # per row (in sec) 191 | time_per_row <- (time_elapsed/n_obs)/10^6 192 | 193 | 194 | # in seconds 195 | (time_per_row*100^4) 196 | # in minutes 197 | (time_per_row*100^4)/60 198 | # in hours 199 | (time_per_row*100^4)/60^2 200 | 201 | 202 | url <- 203 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 204 | stopdata <- data.table::fread(url) 205 | 206 | url <- 207 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 208 | stopdata <- data.table::fread(url) 209 | 210 | # remove incomplete obs 211 | stopdata <- na.omit(stopdata) 212 | # code dependent var 213 | stopdata$vsearch <- 0 214 | stopdata$vsearch[stopdata$vehicleSearch=="YES"] <- 1 215 | # code explanatory var 216 | stopdata$white <- 0 217 | stopdata$white[stopdata$race=="White"] <- 1 218 | 219 | model <- vsearch ~ white + factor(policePrecinct) 220 | 221 | fit <- lm(model, stopdata) 222 | summary(fit) 223 | 224 | # load packages 225 | library(data.table) 226 | # set the 'seed' for random numbers (makes the example reproducible) 227 | set.seed(2) 228 | 229 | # set number of bootstrap iterations 230 | B <- 10 231 | # get selection of precincts 232 | precincts <- unique(stopdata$policePrecinct) 233 | # container for coefficients 234 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 235 | # draw bootstrap samples, estimate model for each sample 236 | for (i in 1:B) { 237 | 238 | # draw sample of precincts (cluster level) 239 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 240 | # get observations 241 | bs_i <- 242 | lapply(precincts_i, function(x){ 243 | stopdata[stopdata$policePrecinct==x,] 244 | } ) 245 | bs_i <- rbindlist(bs_i) 246 | 247 | # estimate model and record coefficients 248 | boot_coefs[i,] <- coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 249 | } 250 | 251 | se_boot <- apply(boot_coefs, 252 | MARGIN = 2, 253 | FUN = sd) 254 | se_boot 255 | 256 | # load packages for parallel processing 257 | library(doSNOW) 258 | # get the number of cores available 259 | ncores <- parallel::detectCores() 260 | # set cores for parallel processing 261 | ctemp <- makeCluster(ncores) # 262 | registerDoSNOW(ctemp) 263 | 264 | 265 | # set number of bootstrap iterations 266 | B <- 10 267 | # get selection of precincts 268 | precincts <- unique(stopdata$policePrecinct) 269 | # container for coefficients 270 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 271 | 272 | # bootstrapping in parallel 273 | boot_coefs <- 274 | foreach(i = 1:B, .combine = rbind, .packages="data.table") %dopar% { 275 | # draw sample of precincts (cluster level) 276 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 277 | # get observations 278 | bs_i <- lapply(precincts_i, function(x) { 279 | stopdata[stopdata$policePrecinct==x,] 280 | }) 281 | bs_i <- rbindlist(bs_i) 282 | # estimate model and record coefficients 283 | coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 284 | } 285 | # be a good citizen and stop the snow clusters 286 | stopCluster(cl = ctemp) 287 | 288 | 289 | se_boot <- apply(boot_coefs, 290 | MARGIN = 2, 291 | FUN = sd) 292 | se_boot 293 | 294 | 295 | # install packages 296 | install.packages("data.table") 297 | install.packages("doSNOW") 298 | # load packages 299 | library(data.table) 300 | 301 | # fetch the data 302 | url <- 303 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 304 | stopdata <- read.csv(url) 305 | # remove incomplete obs 306 | 
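# (Note on the bootstrap used further below: whole precincts, i.e. clusters,
#  are resampled with replacement rather than individual stops; this preserves
#  the within-precinct dependence that motivates clustered standard errors.)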
stopdata <- na.omit(stopdata) 307 | # code dependent var 308 | stopdata$vsearch <- 0 309 | stopdata$vsearch[stopdata$vehicleSearch=="YES"] <- 1 310 | # code explanatory var 311 | stopdata$white <- 0 312 | stopdata$white[stopdata$race=="White"] <- 1 313 | 314 | # model fit 315 | model <- vsearch ~ white + factor(policePrecinct) 316 | fit <- lm(model, stopdata) 317 | summary(fit) 318 | # bootstrapping: normal approach 319 | # set the 'seed' for random numbers (makes the example reproducible) 320 | set.seed(2) 321 | # set number of bootstrap iterations 322 | B <- 50 323 | # get selection of precincts 324 | precincts <- unique(stopdata$policePrecinct) 325 | # container for coefficients 326 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 327 | # draw bootstrap samples, estimate model for each sample 328 | for (i in 1:B) { 329 | # draw sample of precincts (cluster level) 330 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 331 | # get observations 332 | bs_i <- 333 | lapply(precincts_i, function(x){ 334 | stopdata[stopdata$policePrecinct==x,]}) 335 | bs_i <- rbindlist(bs_i) 336 | # estimate model and record coefficients 337 | boot_coefs[i,] <- coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 338 | } 339 | 340 | se_boot <- apply(boot_coefs, 341 | MARGIN = 2, 342 | FUN = sd) 343 | se_boot 344 | 345 | 346 | # bootstrapping: parallel approaach 347 | # install.packages("doSNOW", "parallel") 348 | # load packages for parallel processing 349 | library(doSNOW) 350 | # set cores for parallel processing 351 | ncores <- parallel::detectCores() 352 | ctemp <- makeCluster(ncores) 353 | registerDoSNOW(ctemp) 354 | # set number of bootstrap iterations 355 | B <- 50 356 | # get selection of precincts 357 | precincts <- unique(stopdata$policePrecinct) 358 | # container for coefficients 359 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 360 | 361 | # bootstrapping in parallel 362 | boot_coefs <- 363 | foreach(i = 1:B, .combine = rbind, .packages="data.table") %dopar% { 364 | # draw sample of precincts (cluster level) 365 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 366 | # get observations 367 | bs_i <- lapply(precincts_i, function(x){ 368 | stopdata[stopdata$policePrecinct==x,]) 369 | } 370 | bs_i <- rbindlist(bs_i) 371 | 372 | # estimate model and record coefficients 373 | coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 374 | } 375 | 376 | # be a good citizen and stop the snow clusters 377 | stopCluster(cl = ctemp) 378 | # compute the bootstrapped standard errors 379 | se_boot <- apply(boot_coefs, 380 | MARGIN = 2, 381 | FUN = sd) 382 | -------------------------------------------------------------------------------- /R_code_examples/13_partIV_GPU_ML.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | # set parameter values 3 | n <- 100000 4 | p <- 4 5 | # generate a design matrix (~ our 'dataset') 6 | # with p variables and n observations 7 | X <- matrix(rnorm(n*p, mean = 10), ncol = p) 8 | # add column for intercept 9 | #X <- cbind(rep(1, n), X) 10 | 11 | # MC model 12 | y <- 1.5*X[,1] + 4*X[,2] - 3.5*X[,3] + 0.5*X[,4] + rnorm(n) 13 | 14 | 15 | 16 | beta_ols_gpu <- 17 | function(X, y, gpu_memory=FALSE) { 18 | require(gpuR) 19 | 20 | if (!gpu_memory){ 21 | # point GPU to matrix (matrix stored in non-GPU memory) 22 | vclX <- vclMatrix(X, type = "float") 23 | vcly <- vclVector(y, type = "float") 24 | # compute cross products and inverse 25 | XXi <- solve(crossprod(vclX,vclX)) 26 | Xy <- crossprod(vclX, vcly) 27 | } 
else { 28 | # point GPU to matrix (matrix stored in non-GPU memory) 29 | gpuX <- gpuMatrix(X, type = "float") 30 | gpuy <- gpuVector(y, type = "float") 31 | # compute cross products and inverse 32 | XXi <- solve(crossprod(gpuX,gpuX)) 33 | Xy <- t(gpuX) %*% gpuy 34 | } 35 | beta_hat <- as.vector(XXi %*% Xy) 36 | return(beta_hat) 37 | } 38 | 39 | 40 | beta_ols_gpu(X,y) 41 | 42 | beta_ols_gpu(X,y, gpu_memory = TRUE) 43 | 44 | if (Sys.info()["sysname"]=="Darwin"){ # run on macOS machine 45 | 46 | use_python("/Users/umatter/opt/anaconda3/bin/python") # IMPORTANT: keras/tensorflow is set up to run in this environment on this machine! 47 | } 48 | 49 | 50 | # load packages 51 | library(keras) 52 | library(tibble) 53 | library(ggplot2) 54 | library(tfdatasets) 55 | # load data 56 | boston_housing <- dataset_boston_housing() 57 | str(boston_housing) 58 | 59 | # assign training and test data/labels 60 | c(train_data, train_labels) %<-% boston_housing$train 61 | c(test_data, test_labels) %<-% boston_housing$test 62 | 63 | 64 | library(dplyr) 65 | 66 | column_names <- c('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 67 | 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT') 68 | 69 | train_df <- train_data %>% 70 | as_tibble(.name_repair = "minimal") %>% 71 | setNames(column_names) %>% 72 | mutate(label = train_labels) 73 | 74 | test_df <- test_data %>% 75 | as_tibble(.name_repair = "minimal") %>% 76 | setNames(column_names) %>% 77 | mutate(label = test_labels) 78 | 79 | # check training data dimensions and content 80 | dim(train_df) 81 | head(train_df) 82 | 83 | spec <- feature_spec(train_df, label ~ . ) %>% 84 | step_numeric_column(all_numeric(), normalizer_fn = scaler_standard()) %>% 85 | fit() 86 | 87 | # Create the model 88 | # model specification 89 | input <- layer_input_from_dataset(train_df %>% select(-label)) 90 | 91 | output <- input %>% 92 | layer_dense_features(dense_features(spec)) %>% 93 | layer_dense(units = 64, activation = "relu") %>% 94 | layer_dense(units = 64, activation = "relu") %>% 95 | layer_dense(units = 1) 96 | 97 | model <- keras_model(input, output) 98 | 99 | 100 | # compile the model 101 | model %>% 102 | compile( 103 | loss = "mse", 104 | optimizer = optimizer_rmsprop(), 105 | metrics = list("mean_absolute_error") 106 | ) 107 | 108 | # get a summary of the model 109 | model 110 | 111 | # Set max. number of epochs 112 | epochs <- 500 113 | 114 | # Fit the model and store training stats 115 | history <- model %>% fit( 116 | x = train_df %>% select(-label), 117 | y = train_df$label, 118 | epochs = epochs, 119 | validation_split = 0.2, 120 | verbose = 0 121 | ) 122 | plot(history) 123 | -------------------------------------------------------------------------------- /R_code_examples/14_partIV_regression_categorization_spark.R: -------------------------------------------------------------------------------- 1 | # flights_r <- collect(flights) # very slow! 2 | flights_r <- data.table::fread("data/flights.csv", nrows = 300) 3 | 4 | # specify the linear model 5 | model1 <- arr_delay ~ dep_delay + distance 6 | # fit the model with OLS 7 | fit1 <- lm(model1, flights_r) 8 | # compute t-tests etc. 
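# (Optional, purely illustrative extra: confidence intervals for the same
#  local OLS fit, using base R; the summary() call right below reports the
#  coefficient t-tests themselves.)
confint(fit1)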
9 | summary(fit1) 10 | 11 | library(sparklyr) 12 | 13 | # connect with default configuration 14 | sc <- spark_connect(master="local") 15 | 16 | 17 | # load data to spark 18 | flights_spark <- copy_to(sc, flights_r, "flights_spark") 19 | # fit the model 20 | fit1_spark <- ml_linear_regression(flights_spark, formula = model1) 21 | # compute summary stats 22 | summary(fit1_spark) 23 | 24 | 25 | # fit the model 26 | spark_apply(flights_spark, 27 | function(df){ 28 | broom::tidy(lm(arr_delay ~ dep_delay + distance, df))}, 29 | names = c("term", 30 | "estimate", 31 | "std.error", 32 | "statistic", 33 | "p.value") 34 | ) 35 | 36 | library(tidymodels) 37 | library(parsnip) 38 | 39 | # simple local linear regression example from above 40 | # via tidymodels/parsnip 41 | fit1 <- fit(linear_reg(engine="lm"), model1, data=flights_r) 42 | tidy(fit1) 43 | 44 | 45 | 46 | # run the same on Spark 47 | fit1_spark <- fit(linear_reg(engine="spark"), model1, data=flights_spark) 48 | tidy(fit1_spark) 49 | 50 | # load into R, select variables of interest, remove missing 51 | titanic_r <- read.csv("data/titanic3.csv") 52 | titanic_r <- na.omit(titanic_r[, c("survived", 53 | "pclass", 54 | "sex", 55 | "age", 56 | "sibsp", 57 | "parch")]) 58 | titanic_r$survived <- ifelse(titanic_r$survived==1, "yes", "no") 59 | 60 | library(rsample) 61 | 62 | # split into training and test set 63 | titanic_r <- initial_split(titanic_r) 64 | ti_training <- training(titanic_r) 65 | ti_testing <- testing(titanic_r) 66 | 67 | # load data to spark 68 | ti_training_spark <- copy_to(sc, ti_training, "ti_training_spark") 69 | ti_testing_spark <- copy_to(sc, ti_testing, "ti_testing_spark") 70 | 71 | # models to be used 72 | models <- list(logit=logistic_reg(engine="spark", mode = "classification"), 73 | btree=boost_tree(engine = "spark", mode = "classification"), 74 | rforest=rand_forest(engine = "spark", mode = "classification")) 75 | # train/fit the models 76 | fits <- lapply(models, fit, formula=survived~., data=ti_training_spark) 77 | 78 | 79 | # run predictions 80 | predictions <- lapply(fits, predict, new_data=ti_testing_spark) 81 | # fetch predictions from Spark, format, add actual outcomes 82 | pred_outcomes <- 83 | lapply(1:length(predictions), function(i){ 84 | x_r <- collect(predictions[[i]]) # load into local R environment 85 | x_r$pred_class <- as.factor(x_r$pred_class) # format for predictions 86 | x_r$survived <- as.factor(ti_testing$survived) # add true outcomes 87 | return(x_r) 88 | 89 | }) 90 | 91 | 92 | acc <- lapply(pred_outcomes, accuracy, truth="survived", estimate="pred_class") 93 | acc <- bind_rows(acc) 94 | acc$model <- names(fits) 95 | acc[order(acc$.estimate, decreasing = TRUE),] 96 | 97 | tidy(fits[["btree"]]) 98 | 99 | tidy(fits[["rforest"]]) 100 | 101 | spark_disconnect(sc) 102 | 103 | # load packages 104 | library(sparklyr) 105 | library(dplyr) 106 | 107 | # fix vars 108 | INPUT_DATA <- "data/ga.csv" 109 | 110 | 111 | # import to local R session, prepare raw data 112 | ga <- na.omit(read.csv(INPUT_DATA)) 113 | #ga$purchase <- as.factor(ifelse(ga$purchase==1, "yes", "no")) 114 | # connect to, and copy the data to the local cluster 115 | sc <- spark_connect(master = "local") 116 | ga_spark <- copy_to(sc, ga, "ga_spark", overwrite = TRUE) 117 | 118 | 119 | # ml pipeline 120 | ga_pipeline <- 121 | ml_pipeline(sc) %>% 122 | ft_string_indexer(input_col="city", 123 | output_col="city_output", 124 | handle_invalid = "skip") %>% 125 | ft_string_indexer(input_col="country", 126 | output_col="country_output", 127 | 
handle_invalid = "skip") %>% 128 | ft_string_indexer(input_col="source", 129 | output_col="source_output", 130 | handle_invalid = "skip") %>% 131 | ft_string_indexer(input_col="browser", 132 | output_col="browser_output", 133 | handle_invalid = "skip") %>% 134 | ft_r_formula(purchase ~ .) %>% 135 | ml_logistic_regression(elastic_net_param = list(alpha=1)) 136 | 137 | 138 | # specify the hyperparameter grid 139 | # (parameter values to be considered in optimization) 140 | ga_params <- list(logistic_regression=list(max_iter=80)) 141 | 142 | # create the cross-validator object 143 | set.seed(1) 144 | cv_lasso <- ml_cross_validator(sc, 145 | estimator=ga_pipeline, 146 | estimator_param_maps = ga_params, 147 | ml_binary_classification_evaluator(sc), 148 | num_folds = 30, 149 | parallelism = 8) 150 | 151 | # train/fit the model 152 | cv_lasso_fit <- ml_fit(cv_lasso, ga_spark) 153 | # note: this takes several minutes to run on a local machine (1 node, 8 cores) 154 | 155 | 156 | # pipeline summary 157 | # cv_lasso_fit 158 | # average performance 159 | cv_lasso_fit$avg_metrics_df 160 | 161 | 162 | # save the entire pipeline/fit 163 | ml_save( 164 | cv_lasso_fit, 165 | "ga_cv_lasso_fit", 166 | overwrite = TRUE 167 | ) 168 | 169 | 170 | -------------------------------------------------------------------------------- /R_code_examples/15_partIV_large_scale_text_analysis.R: -------------------------------------------------------------------------------- 1 | # install additional packages 2 | # install.packages("gutenbergr") # download book from Project Gutenberg 3 | # install.packages("dplyr") # for the data preparatory steps 4 | 5 | # load packages 6 | library(sparklyr) 7 | library(gutenbergr) 8 | library(dplyr) 9 | 10 | # fix vars 11 | TELL <- "https://www.gutenberg.org/cache/epub/6788/pg6788.txt" 12 | 13 | 14 | # connect rstudio session to cluster 15 | sc <- spark_connect(master = "yarn") 16 | 17 | 18 | # install additional packages 19 | # install.packages("gutenbergr") # to download book texts from Project Gutenberg 20 | # install.packages("dplyr") # for the data preparatory steps 21 | # load packages 22 | library(sparklyr) 23 | library(gutenbergr) 24 | library(dplyr) 25 | # fix vars 26 | TELL <- "https://www.gutenberg.org/cache/epub/6788/pg6788.txt" 27 | # connect rstudio session to cluster 28 | conf <- spark_config() 29 | conf$`sparklyr.shell.driver-memory` <- "8g" 30 | sc <- spark_connect(master = "local", 31 | config = conf) 32 | 33 | 34 | # Data gathering and preparation 35 | # fetch Schiller's Tell, load to cluster 36 | tmp_file <- tempfile() 37 | download.file(TELL, tmp_file) 38 | raw_text <- readLines(tmp_file) 39 | tell <- data.frame(raw_text=raw_text) 40 | tell_spark <- copy_to(sc, tell, 41 | "tell_spark", 42 | overwrite = TRUE) 43 | 44 | 45 | # data cleaning 46 | tell_spark <- filter(tell_spark, raw_text!="") 47 | tell_spark <- select(tell_spark, raw_text) 48 | tell_spark <- mutate(tell_spark, 49 | raw_text = regexp_replace(raw_text, "[^0-9a-zA-Z]+", " ")) 50 | 51 | 52 | 53 | # split into words 54 | tell_spark <- ft_tokenizer(tell_spark, 55 | input_col = "raw_text", 56 | output_col = "words") 57 | 58 | 59 | 60 | # remove stop-words 61 | tell_spark <- ft_stop_words_remover(tell_spark, 62 | input_col = "words", 63 | output_col = "words_wo_stop") 64 | 65 | 66 | # unnest words, combine in one row 67 | all_tell_words <- mutate(tell_spark, 68 | word = explode(words_wo_stop)) 69 | 70 | # final cleaning 71 | all_tell_words <- select(all_tell_words, word) 72 | all_tell_words <- 
filter(all_tell_words, 2 ", 85 | PATH, 86 | " && unzip ", 87 | PATH)) 88 | # move the speeches files 89 | system("mkdir data/text/ && mkdir data/text/speeches") 90 | system("mv hein-daily/speeches* data/text/speeches/") 91 | # move the speaker files 92 | system("mkdir data/text/speakers") 93 | system("mv hein-daily/*SpeakerMap.txt data/text/speakers/") 94 | 95 | 96 | # download and unzip procedural phrases data 97 | URL_P <- "https://stacks.stanford.edu/file/druid:md374tz9962/vocabulary.zip" 98 | PATH_P <- "data/vocabulary.zip" 99 | system(paste0("curl ", 100 | URL_P, 101 | " > ", 102 | PATH_P, 103 | " && unzip ", 104 | PATH_P)) 105 | # move the procedural vocab file 106 | system("mv vocabulary/vocab.txt data/text/") 107 | 108 | # SET UP ---------------- 109 | 110 | # load packages 111 | library(sparklyr) 112 | library(dplyr) 113 | # fix vars 114 | INPUT_PATH_SPEECHES <- "data/text/speeches/" 115 | INPUT_PATH_SPEAKERS <- "data/text/speakers/" 116 | 117 | # configuration of local spark cluster 118 | conf <- spark_config() 119 | conf$`sparklyr.shell.driver-memory` <- "16g" 120 | # connect rstudio session to cluster 121 | sc <- spark_connect(master = "local", 122 | config = conf) 123 | 124 | 125 | # LOAD TEXT DATA -------------------- 126 | 127 | # load data 128 | speeches <- spark_read_csv(sc, 129 | name = "speeches", 130 | path = INPUT_PATH_SPEECHES, 131 | delimiter = "|") 132 | speakers <- spark_read_csv(sc, 133 | name = "speakers", 134 | path = INPUT_PATH_SPEAKERS, 135 | delimiter = "|") 136 | 137 | 138 | # JOIN -------------------- 139 | speeches <- 140 | inner_join(speeches, 141 | speakers, 142 | by="speech_id") %>% 143 | filter(party %in% c("R", "D"), chamber=="H") %>% 144 | mutate(congress=substr(speech_id, 1,3)) %>% 145 | select(speech_id, speech, party, congress) 146 | 147 | 148 | # CLEANING ---------------- 149 | # clean text: numbers, letters (bill IDs, etc. 150 | speeches <- 151 | mutate(speeches, speech = tolower(speech)) %>% 152 | mutate(speech = regexp_replace(speech, 153 | "[_\"\'():;,.!?\\-]", 154 | "")) %>% 155 | mutate(speech = regexp_replace(speech, "\\\\(.+\\\\)", " ")) %>% 156 | mutate(speech = regexp_replace(speech, "[0-9]+", " ")) %>% 157 | mutate(speech = regexp_replace(speech, "<[a-z]+>", " ")) %>% 158 | mutate(speech = regexp_replace(speech, "<\\w+>", " ")) %>% 159 | mutate(speech = regexp_replace(speech, "_", " ")) %>% 160 | mutate(speech = trimws(speech)) 161 | 162 | 163 | # TOKENIZATION, STOPWORDS REMOVAL, NGRAMS ---------------- 164 | 165 | # stopwords list 166 | stop <- readLines("http://snowball.tartarus.org/algorithms/english/stop.txt") 167 | stop <- trimws(gsub("\\|.*", "", stop)) 168 | stop <- stop[stop!=""] 169 | 170 | # clean text: numbers, letters (bill IDs, etc. 
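# (Purely illustrative check of the stop-words list built above.)
length(stop)
head(stop)
# (Note: ft_ngram() with n = 2 below emits each bigram as its two words joined
#  by a single space, which is what the n_words computation relies on.)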
171 | bigrams <- 172 | ft_tokenizer(speeches, "speech", "words") %>% 173 | ft_stop_words_remover("words", "words_wo_stop", 174 | stop_words = stop ) %>% 175 | ft_ngram("words_wo_stop", "bigram_list", n=2) %>% 176 | mutate(bigram=explode(bigram_list)) %>% 177 | mutate(bigram=trim(bigram)) %>% 178 | mutate(n_words=as.numeric(length(bigram) - 179 | length(replace(bigram, ' ', '')) + 1)) %>% 180 | filter(3% 181 | select(party, congress, bigram) 182 | 183 | 184 | 185 | # load the procedural phrases list 186 | valid_vocab <- spark_read_csv(sc, 187 | path="data/text/vocab.txt", 188 | name = "valid_vocab", 189 | delimiter = "|", 190 | header = FALSE) 191 | # remove corresponding bigrams via anti-join 192 | bigrams <- inner_join(bigrams, valid_vocab, by= c("bigram"="V1")) 193 | 194 | # BIGRAM COUNT PER PARTY --------------- 195 | bigram_count <- 196 | count(bigrams, party, bigram, congress) %>% 197 | compute("bigram_count") 198 | 199 | # FIND MOST PARTISAN BIGRAMS ------------ 200 | 201 | # compute frequencies and chi-squared values 202 | freqs <- 203 | bigram_count %>% 204 | group_by(party, congress) %>% 205 | mutate(total=sum(n), f_npl=total-n) 206 | freqs_d <- 207 | filter(freqs, party=="D") %>% 208 | rename(f_pld=n, f_npld=f_npl) %>% 209 | select(bigram, congress, f_pld, f_npld) 210 | freqs_r <- 211 | filter(freqs, party=="R") %>% 212 | rename(f_plr=n, f_nplr=f_npl) %>% 213 | select(bigram, congress, f_plr, f_nplr) 214 | 215 | pol_bigrams <- 216 | inner_join(freqs_d, freqs_r, by=c("bigram", "congress")) %>% 217 | group_by(bigram, congress) %>% 218 | mutate(x2=((f_plr*f_npld-f_pld*f_nplr)^2)/ 219 | ((f_plr + f_pld)*(f_plr + f_nplr)* 220 | (f_pld + f_npld)*(f_nplr + f_npld))) %>% 221 | select(bigram, congress, x2, f_pld, f_plr) %>% 222 | compute("pol_bigrams") 223 | 224 | 225 | # create output data frame 226 | output <- pol_bigrams %>% 227 | group_by(congress) %>% 228 | arrange(desc(x2)) %>% 229 | sdf_with_sequential_id(id="index") %>% 230 | filter(index<=2000) %>% 231 | mutate(Party=ifelse(f_pld% 232 | select(bigram, congress, Party, x2) %>% 233 | collect() 234 | 235 | # disconnect from cluster 236 | spark_disconnect(sc) 237 | 238 | # packages to prepare and plot 239 | library(data.table) 240 | library(ggplot2) 241 | # select top ten per congress, clean 242 | output <- as.data.table(output) 243 | topten <- output[order(congress, x2, decreasing = TRUE), 244 | rank:=1:.N, by=list(congress)][rank %in% (1:5)] 245 | topten[, congress:=gsub("990", "99", congress)] 246 | topten[, congress:=gsub("980", "98", congress)] 247 | topten[, congress:=gsub("970", "97", congress)] 248 | 249 | # plot a visualization of the most partisan terms 250 | ggplot(topten, mapping=aes(x=as.integer(congress), y=log(x2), color=Party)) + 251 | geom_text(aes(label=bigram), nudge_y = 1)+ 252 | ylab("Partisanship score (Ln of Chisq. 
value)") + 253 | xlab("Congress") + 254 | scale_color_manual(values=c("D"="blue", "R"="red"), name="Party") + 255 | guides(color=guide_legend(title.position="top")) + 256 | scale_x_continuous(breaks=as.integer(unique(topten$congress))) + 257 | theme_minimal() + 258 | theme(axis.text.x = element_text(angle = 90, hjust = 1), 259 | axis.text.y = element_text(hjust = 1), 260 | panel.grid.major = element_blank(), 261 | panel.grid.minor = element_blank(), 262 | panel.background = element_blank()) 263 | 264 | 265 | 266 | # load packages 267 | library(dplyr) 268 | library(sparklyr) 269 | library(sparknlp) 270 | library(sparklyr.nested) 271 | 272 | # configuration of local spark cluster 273 | conf <- spark_config() 274 | conf$`sparklyr.shell.driver-memory` <- "16g" 275 | # connect rstudio session to cluster 276 | sc <- spark_connect(master = "local", 277 | config = conf) 278 | 279 | # LOAD -------------------- 280 | 281 | # load speeches 282 | INPUT_PATH_SPEECHES <- "data/text/speeches/" 283 | speeches <- 284 | spark_read_csv(sc, 285 | name = "speeches", 286 | path = INPUT_PATH_SPEECHES, 287 | delimiter = "|", 288 | overwrite = TRUE) %>% 289 | sample_n(10000, replace = FALSE) %>% 290 | compute("speeches") 291 | 292 | 293 | # load the nlp pipeline for sentiment analysis 294 | pipeline <- nlp_pretrained_pipeline(sc, "analyze_sentiment", "en") 295 | 296 | speeches_a <- 297 | nlp_annotate(pipeline, 298 | target = speeches, 299 | column = "speech") 300 | 301 | # extract sentiment coding per speech 302 | sentiments <- 303 | speeches_a %>% 304 | sdf_select(speech_id, sentiments=sentiment.result) %>% 305 | sdf_explode(sentiments) %>% 306 | mutate(pos = as.integer(sentiments=="positive"), 307 | neg = as.integer(sentiments=="negative")) %>% 308 | select(speech_id, pos, neg) 309 | 310 | 311 | # aggregate and download to R environment ----- 312 | sentiments_aggr <- 313 | sentiments %>% 314 | select(speech_id, pos, neg) %>% 315 | group_by(speech_id) %>% 316 | mutate(rel_pos = sum(pos)/(sum(pos) + sum(neg))) %>% 317 | filter(0% 318 | select(speech_id, rel_pos) %>% 319 | sdf_distinct(name = "sentiments_aggr") %>% 320 | collect() 321 | 322 | # disconnect from cluster 323 | spark_disconnect(sc) 324 | 325 | # clean 326 | library(data.table) 327 | sa <- as.data.table(sentiments_aggr) 328 | sa[, congress:=substr(speech_id, 1,3)] 329 | sa[, congress:=gsub("990", "99", congress)] 330 | sa[, congress:=gsub("980", "98", congress)] 331 | sa[, congress:=gsub("970", "97", congress)] 332 | 333 | # visualize results 334 | library(ggplot2) 335 | ggplot(sa, aes(x=as.integer(congress), 336 | y=rel_pos, 337 | group=congress)) + 338 | geom_boxplot() + 339 | ylab("Share of sentences with positive tone") + 340 | xlab("Congress") + 341 | theme_minimal() 342 | 343 | 344 | system.time( 345 | speeches_a <- 346 | nlp_annotate(pipeline, 347 | target = speeches, 348 | column = "speech") 349 | ) 350 | 351 | system.time( 352 | speeches_a <- 353 | nlp_annotate(pipeline, 354 | target = speeches, 355 | column = "speech") %>% 356 | compute(name= "speeches_a") 357 | ) 358 | 359 | # disconnect from cluster 360 | spark_disconnect(sc) 361 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/.nojekyll -------------------------------------------------------------------------------- 
/docs/bigdata_files/figure-html/unnamed-chunk-178-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-178-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-190-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-190-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-193-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-193-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-194-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-194-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-196-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-196-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-198-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-198-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-199-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-199-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-200-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-200-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-201-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-201-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-202-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-202-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-203-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-203-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-204-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-204-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-205-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-205-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-206-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-206-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-207-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-207-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-208-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-208-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-209-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-209-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-210-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-210-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-211-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-211-1.png -------------------------------------------------------------------------------- 
/docs/bigdata_files/figure-html/unnamed-chunk-221-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-221-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-222-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-222-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-223-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-223-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-224-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-224-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-225-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-225-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-226-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-226-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-227-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-227-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-228-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-228-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-229-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-229-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-275-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-275-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-31-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-31-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-318-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-318-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-326-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-326-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-33-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-33-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /docs/img/05_nlp_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/05_nlp_pipeline.jpg -------------------------------------------------------------------------------- /docs/img/II_computing_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/II_computing_environment.png -------------------------------------------------------------------------------- /docs/img/I_approaches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/I_approaches.png -------------------------------------------------------------------------------- /docs/img/TPU.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/TPU.png -------------------------------------------------------------------------------- /docs/img/aws_emr_ready.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_emr_ready.png -------------------------------------------------------------------------------- /docs/img/aws_rds_create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_rds_create.png -------------------------------------------------------------------------------- /docs/img/aws_rds_easycreate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_rds_easycreate.png -------------------------------------------------------------------------------- /docs/img/colab_r_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/colab_r_gpu.png -------------------------------------------------------------------------------- /docs/img/column_v_rowbased.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/column_v_rowbased.png -------------------------------------------------------------------------------- /docs/img/cover_print.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/cover_print.jpg -------------------------------------------------------------------------------- /docs/img/data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/data_pipeline.png -------------------------------------------------------------------------------- /docs/img/distributed_system.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/distributed_system.jpg -------------------------------------------------------------------------------- /docs/img/druiddatasources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druiddatasources.png -------------------------------------------------------------------------------- /docs/img/druidparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidparse.png -------------------------------------------------------------------------------- /docs/img/druidquery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidquery.png 
-------------------------------------------------------------------------------- /docs/img/druidstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidstart.png -------------------------------------------------------------------------------- /docs/img/ec2_gpu1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_gpu1.png -------------------------------------------------------------------------------- /docs/img/ec2_gpu2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_gpu2.png -------------------------------------------------------------------------------- /docs/img/ec2_rstudioserver_htop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_rstudioserver_htop.png -------------------------------------------------------------------------------- /docs/img/gpt_SQL_prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpt_SQL_prompt.png -------------------------------------------------------------------------------- /docs/img/gpt_sql_response.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpt_sql_response.png -------------------------------------------------------------------------------- /docs/img/gpu_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpu_cpu.png -------------------------------------------------------------------------------- /docs/img/gpu_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpu_details.png -------------------------------------------------------------------------------- /docs/img/rds_inboundrules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/rds_inboundrules.png -------------------------------------------------------------------------------- /docs/img/rtx_2080.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/rtx_2080.png -------------------------------------------------------------------------------- /docs/img/screenshot_rstudio_server_upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/screenshot_rstudio_server_upload.png -------------------------------------------------------------------------------- /docs/img/uluru_comparison.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/uluru_comparison.png -------------------------------------------------------------------------------- /docs/img/uluru_comparison2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/uluru_comparison2.png -------------------------------------------------------------------------------- /docs/img/virtual_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/virtual_memory.png -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections-hash.css: -------------------------------------------------------------------------------- 1 | /* Styles for section anchors */ 2 | a.anchor-section::before {content: '#';font-size: 80%;} 3 | -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections.css: -------------------------------------------------------------------------------- 1 | /* Styles for section anchors */ 2 | a.anchor-section {margin-left: 10px; visibility: hidden; color: inherit;} 3 | .hasAnchor:hover a.anchor-section {visibility: visible;} 4 | ul > li > .anchor-section {display: none;} 5 | -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', function () { 2 | // If section divs is used, we need to put the anchor in the child header 3 | const headers = document.querySelectorAll("div.hasAnchor.section[class*='level'] > :first-child") 4 | 5 | headers.forEach(function (x) { 6 | // Add to the header node 7 | if (!x.classList.contains('hasAnchor')) x.classList.add('hasAnchor') 8 | // Remove from the section or div created by Pandoc 9 | x.parentElement.classList.remove('hasAnchor') 10 | }) 11 | }) 12 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/fontawesome/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/libs/gitbook/css/fontawesome/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-bookdown.css: -------------------------------------------------------------------------------- 1 | .book .book-header h1 { 2 | padding-left: 20px; 3 | padding-right: 20px; 4 | } 5 | .book .book-header.fixed { 6 | position: fixed; 7 | right: 0; 8 | top: 0; 9 | left: 0; 10 | border-bottom: 1px solid rgba(0,0,0,.07); 11 | } 12 | span.search-highlight { 13 | background-color: #ffff88; 14 | } 15 | @media (min-width: 600px) { 16 | .book.with-summary .book-header.fixed { 17 | left: 300px; 18 | } 19 | } 20 | @media (max-width: 1240px) { 21 | .book .book-body.fixed { 22 | top: 50px; 23 | } 24 | .book .book-body.fixed .body-inner { 25 | top: auto; 26 | } 27 | } 28 | @media (max-width: 600px) { 29 | .book.with-summary .book-header.fixed { 30 | left: calc(100% - 60px); 
31 | min-width: 300px; 32 | } 33 | .book.with-summary .book-body { 34 | transform: none; 35 | left: calc(100% - 60px); 36 | min-width: 300px; 37 | } 38 | .book .book-body.fixed { 39 | top: 0; 40 | } 41 | } 42 | 43 | .book .book-body.fixed .body-inner { 44 | top: 50px; 45 | } 46 | .book .book-body .page-wrapper .page-inner section.normal sub, .book .book-body .page-wrapper .page-inner section.normal sup { 47 | font-size: 85%; 48 | } 49 | 50 | @media print { 51 | .book .book-summary, .book .book-body .book-header, .fa { 52 | display: none !important; 53 | } 54 | .book .book-body.fixed { 55 | left: 0px; 56 | } 57 | .book .book-body,.book .book-body .body-inner, .book.with-summary { 58 | overflow: visible !important; 59 | } 60 | } 61 | .kable_wrapper { 62 | border-spacing: 20px 0; 63 | border-collapse: separate; 64 | border: none; 65 | margin: auto; 66 | } 67 | .kable_wrapper > tbody > tr > td { 68 | vertical-align: top; 69 | } 70 | .book .book-body .page-wrapper .page-inner section.normal table tr.header { 71 | border-top-width: 2px; 72 | } 73 | .book .book-body .page-wrapper .page-inner section.normal table tr:last-child td { 74 | border-bottom-width: 2px; 75 | } 76 | .book .book-body .page-wrapper .page-inner section.normal table td, .book .book-body .page-wrapper .page-inner section.normal table th { 77 | border-left: none; 78 | border-right: none; 79 | } 80 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr, .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr > td { 81 | border-top: none; 82 | } 83 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr:last-child > td { 84 | border-bottom: none; 85 | } 86 | 87 | div.theorem, div.lemma, div.corollary, div.proposition, div.conjecture { 88 | font-style: italic; 89 | } 90 | span.theorem, span.lemma, span.corollary, span.proposition, span.conjecture { 91 | font-style: normal; 92 | } 93 | div.proof>*:last-child:after { 94 | content: "\25a2"; 95 | float: right; 96 | } 97 | .header-section-number { 98 | padding-right: .5em; 99 | } 100 | #header .multi-author { 101 | margin: 0.5em 0 -0.5em 0; 102 | } 103 | #header .date { 104 | margin-top: 1.5em; 105 | } 106 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-clipboard.css: -------------------------------------------------------------------------------- 1 | div.sourceCode { 2 | position: relative; 3 | } 4 | 5 | .copy-to-clipboard-button { 6 | position: absolute; 7 | right: 0; 8 | top: 0; 9 | visibility: hidden; 10 | } 11 | 12 | .copy-to-clipboard-button:focus { 13 | outline: 0; 14 | } 15 | 16 | div.sourceCode:hover > .copy-to-clipboard-button { 17 | visibility: visible; 18 | } 19 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-fontsettings.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Theme 1 3 | */ 4 | .color-theme-1 .dropdown-menu { 5 | background-color: #111111; 6 | border-color: #7e888b; 7 | } 8 | .color-theme-1 .dropdown-menu .dropdown-caret .caret-inner { 9 | border-bottom: 9px solid #111111; 10 | } 11 | .color-theme-1 .dropdown-menu .buttons { 12 | border-color: #7e888b; 13 | } 14 | .color-theme-1 .dropdown-menu .button { 15 | color: #afa790; 16 | } 17 | .color-theme-1 .dropdown-menu .button:hover { 18 | color: #73553c; 19 | } 20 | /* 21 | * Theme 2 22 | */ 23 | .color-theme-2 .dropdown-menu { 24 | 
background-color: #2d3143; 25 | border-color: #272a3a; 26 | } 27 | .color-theme-2 .dropdown-menu .dropdown-caret .caret-inner { 28 | border-bottom: 9px solid #2d3143; 29 | } 30 | .color-theme-2 .dropdown-menu .buttons { 31 | border-color: #272a3a; 32 | } 33 | .color-theme-2 .dropdown-menu .button { 34 | color: #62677f; 35 | } 36 | .color-theme-2 .dropdown-menu .button:hover { 37 | color: #f4f4f5; 38 | } 39 | .book .book-header .font-settings .font-enlarge { 40 | line-height: 30px; 41 | font-size: 1.4em; 42 | } 43 | .book .book-header .font-settings .font-reduce { 44 | line-height: 30px; 45 | font-size: 1em; 46 | } 47 | 48 | /* sidebar transition background */ 49 | div.book.color-theme-1 { 50 | background: #f3eacb; 51 | } 52 | .book.color-theme-1 .book-body { 53 | color: #704214; 54 | background: #f3eacb; 55 | } 56 | .book.color-theme-1 .book-body .page-wrapper .page-inner section { 57 | background: #f3eacb; 58 | } 59 | 60 | /* sidebar transition background */ 61 | div.book.color-theme-2 { 62 | background: #1c1f2b; 63 | } 64 | 65 | .book.color-theme-2 .book-body { 66 | color: #bdcadb; 67 | background: #1c1f2b; 68 | } 69 | .book.color-theme-2 .book-body .page-wrapper .page-inner section { 70 | background: #1c1f2b; 71 | } 72 | .book.font-size-0 .book-body .page-inner section { 73 | font-size: 1.2rem; 74 | } 75 | .book.font-size-1 .book-body .page-inner section { 76 | font-size: 1.4rem; 77 | } 78 | .book.font-size-2 .book-body .page-inner section { 79 | font-size: 1.6rem; 80 | } 81 | .book.font-size-3 .book-body .page-inner section { 82 | font-size: 2.2rem; 83 | } 84 | .book.font-size-4 .book-body .page-inner section { 85 | font-size: 4rem; 86 | } 87 | .book.font-family-0 { 88 | font-family: Georgia, serif; 89 | } 90 | .book.font-family-1 { 91 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 92 | } 93 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal { 94 | color: #704214; 95 | } 96 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal a { 97 | color: inherit; 98 | } 99 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h1, 100 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h2, 101 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h3, 102 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h4, 103 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h5, 104 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h6 { 105 | color: inherit; 106 | } 107 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h1, 108 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h2 { 109 | border-color: inherit; 110 | } 111 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h6 { 112 | color: inherit; 113 | } 114 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal hr { 115 | background-color: inherit; 116 | } 117 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal blockquote { 118 | border-color: #c4b29f; 119 | opacity: 0.9; 120 | } 121 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal pre, 122 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal code { 123 | background: #fdf6e3; 124 | color: #657b83; 125 | border-color: #f8df9c; 126 | } 127 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal .highlight { 128 | background-color: inherit; 129 | } 130 
| .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table th, 131 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table td { 132 | border-color: #f5d06c; 133 | } 134 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table tr { 135 | color: inherit; 136 | background-color: #fdf6e3; 137 | border-color: #444444; 138 | } 139 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n) { 140 | background-color: #fbeecb; 141 | } 142 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal { 143 | color: #bdcadb; 144 | } 145 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal a { 146 | color: #3eb1d0; 147 | } 148 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h1, 149 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h2, 150 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h3, 151 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h4, 152 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h5, 153 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h6 { 154 | color: #fffffa; 155 | } 156 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h1, 157 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h2 { 158 | border-color: #373b4e; 159 | } 160 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h6 { 161 | color: #373b4e; 162 | } 163 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal hr { 164 | background-color: #373b4e; 165 | } 166 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal blockquote { 167 | border-color: #373b4e; 168 | } 169 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal pre, 170 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal code { 171 | color: #9dbed8; 172 | background: #2d3143; 173 | border-color: #2d3143; 174 | } 175 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal .highlight { 176 | background-color: #282a39; 177 | } 178 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table th, 179 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table td { 180 | border-color: #3b3f54; 181 | } 182 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table tr { 183 | color: #b6c2d2; 184 | background-color: #2d3143; 185 | border-color: #3b3f54; 186 | } 187 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n) { 188 | background-color: #35394b; 189 | } 190 | .book.color-theme-1 .book-header { 191 | color: #afa790; 192 | background: transparent; 193 | } 194 | .book.color-theme-1 .book-header .btn { 195 | color: #afa790; 196 | } 197 | .book.color-theme-1 .book-header .btn:hover { 198 | color: #73553c; 199 | background: none; 200 | } 201 | .book.color-theme-1 .book-header h1 { 202 | color: #704214; 203 | } 204 | .book.color-theme-2 .book-header { 205 | color: #7e888b; 206 | background: transparent; 207 | } 208 | .book.color-theme-2 .book-header .btn { 209 | color: #3b3f54; 210 | } 211 | .book.color-theme-2 .book-header .btn:hover { 212 | color: #fffff5; 213 | background: none; 214 | } 215 | .book.color-theme-2 .book-header h1 { 216 | color: #bdcadb; 217 | } 218 | .book.color-theme-1 .book-body .navigation { 219 | color: 
#afa790; 220 | } 221 | .book.color-theme-1 .book-body .navigation:hover { 222 | color: #73553c; 223 | } 224 | .book.color-theme-2 .book-body .navigation { 225 | color: #383f52; 226 | } 227 | .book.color-theme-2 .book-body .navigation:hover { 228 | color: #fffff5; 229 | } 230 | /* 231 | * Theme 1 232 | */ 233 | .book.color-theme-1 .book-summary { 234 | color: #afa790; 235 | background: #111111; 236 | border-right: 1px solid rgba(0, 0, 0, 0.07); 237 | } 238 | .book.color-theme-1 .book-summary .book-search { 239 | background: transparent; 240 | } 241 | .book.color-theme-1 .book-summary .book-search input, 242 | .book.color-theme-1 .book-summary .book-search input:focus { 243 | border: 1px solid transparent; 244 | } 245 | .book.color-theme-1 .book-summary ul.summary li.divider { 246 | background: #7e888b; 247 | box-shadow: none; 248 | } 249 | .book.color-theme-1 .book-summary ul.summary li i.fa-check { 250 | color: #33cc33; 251 | } 252 | .book.color-theme-1 .book-summary ul.summary li.done > a { 253 | color: #877f6a; 254 | } 255 | .book.color-theme-1 .book-summary ul.summary li a, 256 | .book.color-theme-1 .book-summary ul.summary li span { 257 | color: #877f6a; 258 | background: transparent; 259 | font-weight: normal; 260 | } 261 | .book.color-theme-1 .book-summary ul.summary li.active > a, 262 | .book.color-theme-1 .book-summary ul.summary li a:hover { 263 | color: #704214; 264 | background: transparent; 265 | font-weight: normal; 266 | } 267 | /* 268 | * Theme 2 269 | */ 270 | .book.color-theme-2 .book-summary { 271 | color: #bcc1d2; 272 | background: #2d3143; 273 | border-right: none; 274 | } 275 | .book.color-theme-2 .book-summary .book-search { 276 | background: transparent; 277 | } 278 | .book.color-theme-2 .book-summary .book-search input, 279 | .book.color-theme-2 .book-summary .book-search input:focus { 280 | border: 1px solid transparent; 281 | } 282 | .book.color-theme-2 .book-summary ul.summary li.divider { 283 | background: #272a3a; 284 | box-shadow: none; 285 | } 286 | .book.color-theme-2 .book-summary ul.summary li i.fa-check { 287 | color: #33cc33; 288 | } 289 | .book.color-theme-2 .book-summary ul.summary li.done > a { 290 | color: #62687f; 291 | } 292 | .book.color-theme-2 .book-summary ul.summary li a, 293 | .book.color-theme-2 .book-summary ul.summary li span { 294 | color: #c1c6d7; 295 | background: transparent; 296 | font-weight: 600; 297 | } 298 | .book.color-theme-2 .book-summary ul.summary li.active > a, 299 | .book.color-theme-2 .book-summary ul.summary li a:hover { 300 | color: #f4f4f5; 301 | background: #252737; 302 | font-weight: 600; 303 | } 304 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-search.css: -------------------------------------------------------------------------------- 1 | .book .book-summary .book-search { 2 | padding: 6px; 3 | background: transparent; 4 | position: absolute; 5 | top: -50px; 6 | left: 0px; 7 | right: 0px; 8 | transition: top 0.5s ease; 9 | } 10 | .book .book-summary .book-search input, 11 | .book .book-summary .book-search input:focus, 12 | .book .book-summary .book-search input:hover { 13 | width: 100%; 14 | background: transparent; 15 | border: 1px solid #ccc; 16 | box-shadow: none; 17 | outline: none; 18 | line-height: 22px; 19 | padding: 7px 4px; 20 | color: inherit; 21 | box-sizing: border-box; 22 | } 23 | .book.with-search .book-summary .book-search { 24 | top: 0px; 25 | } 26 | .book.with-search .book-summary ul.summary { 27 | top: 50px; 28 | } 29 | 
.with-search .summary li[data-level] a[href*=".html#"] { 30 | display: none; 31 | } 32 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-table.css: -------------------------------------------------------------------------------- 1 | .book .book-body .page-wrapper .page-inner section.normal table{display:table;width:100%;border-collapse:collapse;border-spacing:0;overflow:auto}.book .book-body .page-wrapper .page-inner section.normal table td,.book .book-body .page-wrapper .page-inner section.normal table th{padding:6px 13px;border:1px solid #ddd}.book .book-body .page-wrapper .page-inner section.normal table tr{background-color:#fff;border-top:1px solid #ccc}.book .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n){background-color:#f8f8f8}.book .book-body .page-wrapper .page-inner section.normal table th{font-weight:700} 2 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/clipboard.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * clipboard.js v2.0.4 3 | * https://zenorocha.github.io/clipboard.js 4 | * 5 | * Licensed MIT © Zeno Rocha 6 | */ 7 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n indicates arrow keys):', 82 | '/: navigate to previous/next page', 83 | 's: Toggle sidebar']; 84 | if (config.search !== false) info.push('f: Toggle search input ' + 85 | '(use //Enter in the search input to navigate through search matches; ' + 86 | 'press Esc to cancel search)'); 87 | if (config.info !== false) gitbook.toolbar.createButton({ 88 | icon: 'fa fa-info', 89 | label: 'Information about the toolbar', 90 | position: 'left', 91 | onClick: function(e) { 92 | e.preventDefault(); 93 | window.alert(info.join('\n\n')); 94 | } 95 | }); 96 | 97 | // highlight the current section in TOC 98 | var href = window.location.pathname; 99 | href = href.substr(href.lastIndexOf('/') + 1); 100 | // accentuated characters need to be decoded (#819) 101 | href = decodeURIComponent(href); 102 | if (href === '') href = 'index.html'; 103 | var li = $('a[href^="' + href + 
location.hash + '"]').parent('li.chapter').first(); 104 | var summary = $('ul.summary'), chaps = summary.find('li.chapter'); 105 | if (li.length === 0) li = chaps.first(); 106 | li.addClass('active'); 107 | chaps.on('click', function(e) { 108 | chaps.removeClass('active'); 109 | $(this).addClass('active'); 110 | gs.set('tocScrollTop', summary.scrollTop()); 111 | }); 112 | 113 | var toc = config.toc; 114 | // collapse TOC items that are not for the current chapter 115 | if (toc && toc.collapse) (function() { 116 | var type = toc.collapse; 117 | if (type === 'none') return; 118 | if (type !== 'section' && type !== 'subsection') return; 119 | // sections under chapters 120 | var toc_sub = summary.children('li[data-level]').children('ul'); 121 | if (type === 'section') { 122 | toc_sub.hide() 123 | .parent().has(li).children('ul').show(); 124 | } else { 125 | toc_sub.children('li').children('ul').hide() 126 | .parent().has(li).children('ul').show(); 127 | } 128 | li.children('ul').show(); 129 | var toc_sub2 = toc_sub.children('li'); 130 | if (type === 'section') toc_sub2.children('ul').hide(); 131 | summary.children('li[data-level]').find('a') 132 | .on('click.bookdown', function(e) { 133 | if (href === $(this).attr('href').replace(/#.*/, '')) 134 | $(this).parent('li').children('ul').toggle(); 135 | }); 136 | })(); 137 | 138 | // add tooltips to the <a>'s that are truncated 139 | $('a').each(function(i, el) { 140 | if (el.offsetWidth >= el.scrollWidth) return; 141 | if (typeof el.title === 'undefined') return; 142 | el.title = el.text; 143 | }); 144 | 145 | // restore TOC scroll position 146 | var pos = gs.get('tocScrollTop'); 147 | if (typeof pos !== 'undefined') summary.scrollTop(pos); 148 | 149 | // highlight the TOC item that has same text as the heading in view as scrolling 150 | if (toc && toc.scroll_highlight !== false && li.length > 0) (function() { 151 | // scroll the current TOC item into viewport 152 | var ht = $(window).height(), rect = li[0].getBoundingClientRect(); 153 | if (rect.top >= ht || rect.top <= 0 || rect.bottom <= 0) { 154 | summary.scrollTop(li[0].offsetTop); 155 | } 156 | // current chapter TOC items 157 | var items = $('a[href^="' + href + '"]').parent('li.chapter'), 158 | m = items.length; 159 | if (m === 0) { 160 | items = summary.find('li.chapter'); 161 | m = items.length; 162 | } 163 | if (m === 0) return; 164 | // all section titles on current page 165 | var hs = bookInner.find('.page-inner').find('h1,h2,h3'), n = hs.length, 166 | ts = hs.map(function(i, el) { return $(el).text(); }); 167 | if (n === 0) return; 168 | var scrollHandler = function(e) { 169 | var ht = $(window).height(); 170 | clearTimeout($.data(this, 'scrollTimer')); 171 | $.data(this, 'scrollTimer', setTimeout(function() { 172 | // find the first visible title in the viewport 173 | for (var i = 0; i < n; i++) { 174 | var rect = hs[i].getBoundingClientRect(); 175 | if (rect.top >= 0 && rect.bottom <= ht) break; 176 | } 177 | if (i === n) return; 178 | items.removeClass('active'); 179 | for (var j = 0; j < m; j++) { 180 | if (items.eq(j).children('a').first().text() === ts[i]) break; 181 | } 182 | if (j === m) j = 0; // highlight the chapter title 183 | // search bottom-up for a visible TOC item to highlight; if an item is 184 | // hidden, we check if its parent is visible, and so on 185 | while (j > 0 && items.eq(j).is(':hidden')) j--; 186 | items.eq(j).addClass('active'); 187 | }, 250)); 188 | }; 189 | bookInner.on('scroll.bookdown', scrollHandler); 190 | bookBody.on('scroll.bookdown',
scrollHandler); 191 | })(); 192 | 193 | // do not refresh the page if the TOC item points to the current page 194 | $('a[href="' + href + '"]').parent('li.chapter').children('a') 195 | .on('click', function(e) { 196 | bookInner.scrollTop(0); 197 | bookBody.scrollTop(0); 198 | return false; 199 | }); 200 | 201 | var toolbar = config.toolbar; 202 | if (!toolbar || toolbar.position !== 'static') { 203 | var bookHeader = $('.book-header'); 204 | bookBody.addClass('fixed'); 205 | bookHeader.addClass('fixed') 206 | .css('background-color', bookBody.css('background-color')) 207 | .on('click.bookdown', function(e) { 208 | // the theme may have changed after user clicks the theme button 209 | bookHeader.css('background-color', bookBody.css('background-color')); 210 | }); 211 | } 212 | 213 | }); 214 | 215 | gitbook.events.bind("page.change", function(e) { 216 | // store TOC scroll position 217 | var summary = $('ul.summary'); 218 | gs.set('tocScrollTop', summary.scrollTop()); 219 | }); 220 | 221 | var bookBody = $('.book-body'), bookInner = bookBody.find('.body-inner'); 222 | var chapterTitle = function() { 223 | return bookInner.find('.page-inner').find('h1,h2').first().text(); 224 | }; 225 | var saveScrollPos = function(e) { 226 | // save scroll position before page is reloaded 227 | gs.set('bodyScrollTop', { 228 | body: bookBody.scrollTop(), 229 | inner: bookInner.scrollTop(), 230 | focused: document.hasFocus(), 231 | title: chapterTitle() 232 | }); 233 | }; 234 | $(document).on('servr:reload', saveScrollPos); 235 | 236 | // check if the page is loaded in an iframe (e.g. the RStudio preview window) 237 | var inIFrame = function() { 238 | var inIframe = true; 239 | try { inIframe = window.self !== window.top; } catch (e) {} 240 | return inIframe; 241 | }; 242 | if (inIFrame()) { 243 | $(window).on('blur unload', saveScrollPos); 244 | } 245 | 246 | $(function(e) { 247 | var pos = gs.get('bodyScrollTop'); 248 | if (pos) { 249 | if (pos.title === chapterTitle()) { 250 | if (pos.body !== 0) bookBody.scrollTop(pos.body); 251 | if (pos.inner !== 0) bookInner.scrollTop(pos.inner); 252 | } 253 | } 254 | if ((pos && pos.focused) || !inIFrame()) bookInner.find('.page-wrapper').focus(); 255 | // clear book body scroll position 256 | gs.remove('bodyScrollTop'); 257 | }); 258 | 259 | }); 260 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-clipboard.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "jQuery"], function(gitbook, $) { 2 | 3 | var copyButton = ''; 4 | var clipboard; 5 | 6 | gitbook.events.bind("page.change", function() { 7 | 8 | if (!ClipboardJS.isSupported()) return; 9 | 10 | // the page.change event is thrown twice: before and after the page changes 11 | if (clipboard) { 12 | // clipboard is already defined but we are on the same page 13 | if (clipboard._prevPage === window.location.pathname) return; 14 | // clipboard is already defined and url path change 15 | // we can deduct that we are before page changes 16 | clipboard.destroy(); // destroy the previous events listeners 17 | clipboard = undefined; // reset the clipboard object 18 | return; 19 | } 20 | 21 | $(copyButton).prependTo("div.sourceCode"); 22 | 23 | clipboard = new ClipboardJS(".copy-to-clipboard-button", { 24 | text: function(trigger) { 25 | return trigger.parentNode.textContent; 26 | } 27 | }); 28 | 29 | clipboard._prevPage = window.location.pathname 30 | 31 | }); 32 | 33 | }); 34 | 
-------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-fontsettings.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var fontState; 3 | 4 | var THEMES = { 5 | "white": 0, 6 | "sepia": 1, 7 | "night": 2 8 | }; 9 | 10 | var FAMILY = { 11 | "serif": 0, 12 | "sans": 1 13 | }; 14 | 15 | // Save current font settings 16 | function saveFontSettings() { 17 | gitbook.storage.set("fontState", fontState); 18 | update(); 19 | } 20 | 21 | // Increase font size 22 | function enlargeFontSize(e) { 23 | e.preventDefault(); 24 | if (fontState.size >= 4) return; 25 | 26 | fontState.size++; 27 | saveFontSettings(); 28 | }; 29 | 30 | // Decrease font size 31 | function reduceFontSize(e) { 32 | e.preventDefault(); 33 | if (fontState.size <= 0) return; 34 | 35 | fontState.size--; 36 | saveFontSettings(); 37 | }; 38 | 39 | // Change font family 40 | function changeFontFamily(index, e) { 41 | e.preventDefault(); 42 | 43 | fontState.family = index; 44 | saveFontSettings(); 45 | }; 46 | 47 | // Change type of color 48 | function changeColorTheme(index, e) { 49 | e.preventDefault(); 50 | 51 | var $book = $(".book"); 52 | 53 | if (fontState.theme !== 0) 54 | $book.removeClass("color-theme-"+fontState.theme); 55 | 56 | fontState.theme = index; 57 | if (fontState.theme !== 0) 58 | $book.addClass("color-theme-"+fontState.theme); 59 | 60 | saveFontSettings(); 61 | }; 62 | 63 | function update() { 64 | var $book = gitbook.state.$book; 65 | 66 | $(".font-settings .font-family-list li").removeClass("active"); 67 | $(".font-settings .font-family-list li:nth-child("+(fontState.family+1)+")").addClass("active"); 68 | 69 | $book[0].className = $book[0].className.replace(/\bfont-\S+/g, ''); 70 | $book.addClass("font-size-"+fontState.size); 71 | $book.addClass("font-family-"+fontState.family); 72 | 73 | if(fontState.theme !== 0) { 74 | $book[0].className = $book[0].className.replace(/\bcolor-theme-\S+/g, ''); 75 | $book.addClass("color-theme-"+fontState.theme); 76 | } 77 | }; 78 | 79 | function init(config) { 80 | var $bookBody, $book; 81 | 82 | //Find DOM elements. 
83 | $book = gitbook.state.$book; 84 | $bookBody = $book.find(".book-body"); 85 | 86 | // Instantiate font state object 87 | fontState = gitbook.storage.get("fontState", { 88 | size: config.size || 2, 89 | family: FAMILY[config.family || "sans"], 90 | theme: THEMES[config.theme || "white"] 91 | }); 92 | 93 | update(); 94 | }; 95 | 96 | 97 | gitbook.events.bind("start", function(e, config) { 98 | var opts = config.fontsettings; 99 | if (!opts) return; 100 | 101 | // Create buttons in toolbar 102 | gitbook.toolbar.createButton({ 103 | icon: 'fa fa-font', 104 | label: 'Font Settings', 105 | className: 'font-settings', 106 | dropdown: [ 107 | [ 108 | { 109 | text: 'A', 110 | className: 'font-reduce', 111 | onClick: reduceFontSize 112 | }, 113 | { 114 | text: 'A', 115 | className: 'font-enlarge', 116 | onClick: enlargeFontSize 117 | } 118 | ], 119 | [ 120 | { 121 | text: 'Serif', 122 | onClick: _.partial(changeFontFamily, 0) 123 | }, 124 | { 125 | text: 'Sans', 126 | onClick: _.partial(changeFontFamily, 1) 127 | } 128 | ], 129 | [ 130 | { 131 | text: 'White', 132 | onClick: _.partial(changeColorTheme, 0) 133 | }, 134 | { 135 | text: 'Sepia', 136 | onClick: _.partial(changeColorTheme, 1) 137 | }, 138 | { 139 | text: 'Night', 140 | onClick: _.partial(changeColorTheme, 2) 141 | } 142 | ] 143 | ] 144 | }); 145 | 146 | 147 | // Init current settings 148 | init(opts); 149 | }); 150 | }); 151 | 152 | 153 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-search.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var index = null; 3 | var fuse = null; 4 | var _search = {engine: 'lunr', opts: {}}; 5 | var $searchInput, $searchLabel, $searchForm; 6 | var $highlighted = [], hi, hiOpts = { className: 'search-highlight' }; 7 | var collapse = false, toc_visible = []; 8 | 9 | function init(config) { 10 | // Instantiate search settings 11 | _search = gitbook.storage.get("search", { 12 | engine: config.search.engine || 'lunr', 13 | opts: config.search.options || {}, 14 | }); 15 | }; 16 | 17 | // Save current search settings 18 | function saveSearchSettings() { 19 | gitbook.storage.set("search", _search); 20 | } 21 | 22 | // Use a specific index 23 | function loadIndex(data) { 24 | // [Yihui] In bookdown, I use a character matrix to store the chapter 25 | // content, and the index is dynamically built on the client side. 26 | // Gitbook prebuilds the index data instead: https://github.com/GitbookIO/plugin-search 27 | // We can certainly do that via R packages V8 and jsonlite, but let's 28 | // see how slow it really is before improving it. On the other hand, 29 | // lunr cannot handle non-English text very well, e.g. the default 30 | // tokenizer cannot deal with Chinese text, so we may want to replace 31 | // lunr with a dumb simple text matching approach. 
32 | if (_search.engine === 'lunr') { 33 | index = lunr(function () { 34 | this.ref('url'); 35 | this.field('title', { boost: 10 }); 36 | this.field('body'); 37 | }); 38 | data.map(function(item) { 39 | index.add({ 40 | url: item[0], 41 | title: item[1], 42 | body: item[2] 43 | }); 44 | }); 45 | return; 46 | } 47 | fuse = new Fuse(data.map((_data => { 48 | return { 49 | url: _data[0], 50 | title: _data[1], 51 | body: _data[2] 52 | }; 53 | })), Object.assign( 54 | { 55 | includeScore: true, 56 | threshold: 0.1, 57 | ignoreLocation: true, 58 | keys: ["title", "body"] 59 | }, 60 | _search.opts 61 | )); 62 | } 63 | 64 | // Fetch the search index 65 | function fetchIndex() { 66 | return $.getJSON(gitbook.state.basePath+"/search_index.json") 67 | .then(loadIndex); // [Yihui] we need to use this object later 68 | } 69 | 70 | // Search for a term and return results 71 | function search(q) { 72 | let results = []; 73 | switch (_search.engine) { 74 | case 'fuse': 75 | if (!fuse) return; 76 | results = fuse.search(q).map(function(result) { 77 | var parts = result.item.url.split('#'); 78 | return { 79 | path: parts[0], 80 | hash: parts[1] 81 | }; 82 | }); 83 | break; 84 | case 'lunr': 85 | default: 86 | if (!index) return; 87 | results = _.chain(index.search(q)).map(function(result) { 88 | var parts = result.ref.split("#"); 89 | return { 90 | path: parts[0], 91 | hash: parts[1] 92 | }; 93 | }) 94 | .value(); 95 | } 96 | 97 | // [Yihui] Highlight the search keyword on current page 98 | $highlighted = $('.page-inner') 99 | .unhighlight(hiOpts).highlight(q, hiOpts).find('span.search-highlight'); 100 | scrollToHighlighted(0); 101 | 102 | return results; 103 | } 104 | 105 | // [Yihui] Scroll the chapter body to the i-th highlighted string 106 | function scrollToHighlighted(d) { 107 | var n = $highlighted.length; 108 | hi = hi === undefined ? 0 : hi + d; 109 | // navignate to the previous/next page in the search results if reached the top/bottom 110 | var b = hi < 0; 111 | if (d !== 0 && (b || hi >= n)) { 112 | var path = currentPath(), n2 = toc_visible.length; 113 | if (n2 === 0) return; 114 | for (var i = b ? 0 : n2; (b && i < n2) || (!b && i >= 0); i += b ? 1 : -1) { 115 | if (toc_visible.eq(i).data('path') === path) break; 116 | } 117 | i += b ? -1 : 1; 118 | if (i < 0) i = n2 - 1; 119 | if (i >= n2) i = 0; 120 | var lnk = toc_visible.eq(i).find('a[href$=".html"]'); 121 | if (lnk.length) lnk[0].click(); 122 | return; 123 | } 124 | if (n === 0) return; 125 | var $p = $highlighted.eq(hi); 126 | $p[0].scrollIntoView(); 127 | $highlighted.css('background-color', ''); 128 | // an orange background color on the current item and removed later 129 | $p.css('background-color', 'orange'); 130 | setTimeout(function() { 131 | $p.css('background-color', ''); 132 | }, 2000); 133 | } 134 | 135 | function currentPath() { 136 | var href = window.location.pathname; 137 | href = href.substr(href.lastIndexOf('/') + 1); 138 | return href === '' ? 'index.html' : href; 139 | } 140 | 141 | // Create search form 142 | function createForm(value) { 143 | if ($searchForm) $searchForm.remove(); 144 | if ($searchLabel) $searchLabel.remove(); 145 | if ($searchInput) $searchInput.remove(); 146 | 147 | $searchForm = $('
', { 148 | 'class': 'book-search', 149 | 'role': 'search' 150 | }); 151 | 152 | $searchLabel = $('