├── .nojekyll ├── 01_partI_bigdata.Rmd ├── 02_approaches_bigdata.Rmd ├── 03_partI_2domains_bigdata.Rmd ├── 04_partII_software.Rmd ├── 05_partII_hardware.Rmd ├── 06_partII_distributedsystems.Rmd ├── 07_partII_cloudcomputing.Rmd ├── 08_partIII_collection_storage.Rmd ├── 09_partIII_cleaning_transformation.Rmd ├── 0_preface.Rmd ├── 10_partIII_descriptives_aggregation.Rmd ├── 11_partIII_visualization.Rmd ├── 12_partIV_data_analyticsI.Rmd ├── 13_partIV_GPU_ML.Rmd ├── 14_partIV_regression_categorization_spark.Rmd ├── 15_partIV_large_scale_text_analysis.Rmd ├── 16_references.Rmd ├── BigData.Rproj ├── README.md ├── R_code_examples ├── 03_partI_2domains_bigdata.R ├── 04_partII_software.R ├── 05_partII_hardware.R ├── 06_partII_distributedsystems.R ├── 07_partII_cloudcomputing.R ├── 08_partIII_collection_storage.R ├── 09_partIII_cleaning_transformation.R ├── 10_partIII_descriptives_aggregation.R ├── 11_partIII_visualization.R ├── 12_partIV_data_analyticsI.R ├── 13_partIV_GPU_ML.R ├── 14_partIV_regression_categorization_spark.R └── 15_partIV_large_scale_text_analysis.R ├── docs ├── .nojekyll ├── 404.html ├── a.html ├── appendix-a-github.html ├── appendix-b-r-basics.html ├── appendix-c-install-hadoop.html ├── approaches-to-analyzing-big-data.html ├── big-data-cleaning-and-transformation.html ├── big-data-visualization.html ├── bigdata_files │ └── figure-html │ │ ├── unnamed-chunk-178-1.png │ │ ├── unnamed-chunk-190-1.png │ │ ├── unnamed-chunk-193-1.png │ │ ├── unnamed-chunk-194-1.png │ │ ├── unnamed-chunk-196-1.png │ │ ├── unnamed-chunk-198-1.png │ │ ├── unnamed-chunk-199-1.png │ │ ├── unnamed-chunk-200-1.png │ │ ├── unnamed-chunk-201-1.png │ │ ├── unnamed-chunk-202-1.png │ │ ├── unnamed-chunk-203-1.png │ │ ├── unnamed-chunk-204-1.png │ │ ├── unnamed-chunk-205-1.png │ │ ├── unnamed-chunk-206-1.png │ │ ├── unnamed-chunk-207-1.png │ │ ├── unnamed-chunk-208-1.png │ │ ├── unnamed-chunk-209-1.png │ │ ├── unnamed-chunk-210-1.png │ │ ├── unnamed-chunk-211-1.png │ │ ├── unnamed-chunk-221-1.png │ │ ├── unnamed-chunk-222-1.png │ │ ├── unnamed-chunk-223-1.png │ │ ├── unnamed-chunk-224-1.png │ │ ├── unnamed-chunk-225-1.png │ │ ├── unnamed-chunk-226-1.png │ │ ├── unnamed-chunk-227-1.png │ │ ├── unnamed-chunk-228-1.png │ │ ├── unnamed-chunk-229-1.png │ │ ├── unnamed-chunk-27-1.png │ │ ├── unnamed-chunk-275-1.png │ │ ├── unnamed-chunk-31-1.png │ │ ├── unnamed-chunk-318-1.png │ │ ├── unnamed-chunk-326-1.png │ │ ├── unnamed-chunk-33-1.png │ │ └── unnamed-chunk-9-1.png ├── bottlenecks-in-everyday-data-analytics-tasks.html ├── c.html ├── cloud-computing.html ├── data-collection-and-data-storage.html ├── descriptive-statistics-and-aggregation.html ├── distributed-systems.html ├── econometrics-with-gpus.html ├── hardware-computing-resources.html ├── img │ ├── 05_nlp_pipeline.jpg │ ├── II_computing_environment.png │ ├── I_approaches.png │ ├── TPU.png │ ├── aws_emr_ready.png │ ├── aws_rds_create.png │ ├── aws_rds_easycreate.png │ ├── colab_r_gpu.png │ ├── column_v_rowbased.png │ ├── cover_print.jpg │ ├── data_pipeline.png │ ├── distributed_system.jpg │ ├── druiddatasources.png │ ├── druidparse.png │ ├── druidquery.png │ ├── druidstart.png │ ├── ec2_gpu1.png │ ├── ec2_gpu2.png │ ├── ec2_rstudioserver_htop.png │ ├── gpt_SQL_prompt.png │ ├── gpt_sql_response.png │ ├── gpu_cpu.png │ ├── gpu_details.png │ ├── rds_inboundrules.png │ ├── rtx_2080.png │ ├── screenshot_rstudio_server_upload.png │ ├── uluru_comparison.png │ ├── uluru_comparison2.png │ └── virtual_memory.png ├── index.html ├── index.md ├── introduction.html ├── 
large-scale-text-analysis-with-sparklyr.html ├── libs │ ├── anchor-sections │ │ ├── anchor-sections-hash.css │ │ ├── anchor-sections.css │ │ └── anchor-sections.js │ ├── gitbook │ │ ├── css │ │ │ ├── fontawesome │ │ │ │ └── fontawesome-webfont.ttf │ │ │ ├── plugin-bookdown.css │ │ │ ├── plugin-clipboard.css │ │ │ ├── plugin-fontsettings.css │ │ │ ├── plugin-highlight.css │ │ │ ├── plugin-search.css │ │ │ ├── plugin-table.css │ │ │ └── style.css │ │ └── js │ │ │ ├── app.min.js │ │ │ ├── clipboard.min.js │ │ │ ├── jquery.highlight.js │ │ │ ├── plugin-bookdown.js │ │ │ ├── plugin-clipboard.js │ │ │ ├── plugin-fontsettings.js │ │ │ ├── plugin-search.js │ │ │ └── plugin-sharing.js │ └── jquery │ │ └── jquery-3.6.0.min.js ├── p.html ├── reference-keys.txt ├── references.html ├── regression-analysis-and-categorization-with-spark-and-r.html ├── s.html ├── search_index.json ├── software-programming-with-big-data.html ├── style │ ├── krantz.cls │ ├── krantz_new.cls │ ├── style.css │ ├── style_new.css │ └── toc.css ├── the-two-domains-of-big-data-analytics.html └── what-is-big-in-big-data.html ├── img ├── 02_df.png ├── 02_factor.png ├── 02_list.png ├── 02_matrix.png ├── 02_numvec.png ├── 03_script-hardware_w.jpg ├── 03_store-bitbyteword.png ├── 03_virtualmemory.png ├── 05_nlp_pipeline.jpg ├── II_computing_environment.png ├── I_approaches.png ├── Page 1.png ├── TPU.png ├── aws_emr_ready.png ├── aws_rds_create.png ├── aws_rds_easycreate.png ├── colab_r_gpu.png ├── column_v_rowbased.png ├── computing_environment.png ├── cover.jpg ├── cover_new.png ├── cover_new.tiff ├── cover_print.jpg ├── cover_print.png ├── data_pipeline.png ├── distributed_system.jpg ├── druiddatasources.png ├── druidparse.png ├── druidquery.png ├── druidstart.png ├── ec2_gpu1.png ├── ec2_gpu2.png ├── ec2_rstudioserver_htop.png ├── factor.png ├── gpt_SQL_prompt.png ├── gpt_sql_response.png ├── gpu_cpu.png ├── gpu_details.png ├── hadoop.png ├── list.png ├── nvidia_geeforce.png ├── nvidia_gpu.png ├── pipeline.png ├── rds_inboundrules.png ├── rtx_2080.png ├── screenshot_rstudio_server_upload.png ├── spark-stack.png ├── spark_components.jpg ├── uluru_comparison.png ├── uluru_comparison2.png └── virtual_memory.png ├── index.Rmd ├── references ├── bigdata.bib └── packages.bib └── style ├── ioslides.css ├── ioslides_unilu.css ├── ioslides_white.css ├── krantz.cls ├── krantz_new.cls ├── nologo_template.html ├── notes.css ├── notes_hsg.css ├── notes_preamble.tex ├── style.css ├── style_new.css └── toc.css /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/.nojekyll -------------------------------------------------------------------------------- /01_partI_bigdata.Rmd: -------------------------------------------------------------------------------- 1 | \mainmatter 2 | 3 | # (PART) Setting the Scene: Analyzing Big Data {-} 4 | 5 | # Introduction {#s .unnumbered} 6 | 7 | > "Lost in the hoopla about such [Hadoop MapReduce] skills is the embarrassing fact that once upon a time, one could do such computing tasks, and even much more ambitious ones, much more easily than in this fancy new setting! A dataset could fit on a single processor, and the global maximum of the array 'x' could be computed with the six-character code fragment 'max(x)' in, say, Matlab or R." 8 | [@donoho_2017,p.747] 9 | 10 | 11 | This part of the book introduces you to the topic of Big Data analysis from a variety of perspectives. 
The goal of this part is to highlight the various aspects of modern econometrics involved in Big Data Analytics, as well as to clarify the approach and perspective taken in this book. In the first step, we must consider what makes data *big*. As a result, we make a fundamental distinction between data analysis problems that can arise from many observations (rows; *big N*)\index{Big N} and the problems that can arise from many variables (columns; *big P*)\index{Big P}. 12 | 13 | In a second step, this part provides an overview of the four distinct approaches to Big Data Analytics that are most important for the perspective on Big Data taken in this book: a) statistics/econometrics techniques specifically designed to handle Big Data, b) writing more efficient R code, c) more efficiently using available local computing resources, and d) scaling up and scaling out with cloud computing resources. 14 | All of these approaches will be discussed further in the book, and it will be useful to remember the most important conceptual basics underlying these approaches from the overview presented here. 15 | 16 | Finally, this section of the book provides two extensive examples of what problems related to (too) many observations or (too) many variables can mean for practical data analysis, as well as how some of the four approaches (a-d) can help in resolving these problems. 17 | 18 | 19 | # What is *Big* in "Big Data"? 20 | 21 | In this book, we will think of Big Data as data that is (a) difficult to handle and (b) hard to get value from due to its size and complexity. The handling of Big Data is difficult as the data is often gathered from unorthodox sources, providing poorly structured data (e.g., raw text, web pages, images, etc.) as well as because of the infrastructure needed to store and load/process large amounts of data. Then, the issue of statistical computation itself becomes a challenge. Taken together, getting value/insights from Big Data is related to three distinct properties that render its analysis difficult: 22 | 23 | - Handling the *complexity and variety* of sources, structures, and formats of data for analytics purposes is becoming increasingly challenging in the context of empirical economic research and business analytics. On the one hand the ongoing digitization of information and processes boosts the generation and storage of digital data for all kinds of economic and social activity, making such data basically more available for analysis. On the other hand, however, the first order focus of such digitization is typically an end user who directly interacts with the information and is part of these processes, and not the data scientist or data analyst who might be interested in analyzing such data later on. Therefore, the interfaces for systematically collecting such data for analytics purposes are typically not optimal. Moreover, data might come in semi-structured formats such as webpages (i.e., the HyperText Markup Language (HTML))\index{HyperText Markup Language (HTML)}, raw text, or even images – each of which needs a different approach for importing/loading and pre-processing. Anyone who has worked on data analytics projects that build on various types of raw data from various sources knows that a large part of the practical data work deals with how to handle the complexity and variety to get to a useful analytic dataset. 
24 | 25 | - The *big P*\index{Big P} problem: A dataset has close to or even more variables (columns) than observations, which renders the search for a good predictive model with traditional econometric techniques difficult or elusive. For example, suppose you run an e-commerce business that sells hundreds of thousands of products to tens of thousands of customers. You want to figure out from which product category a customer is most likely to buy an item, based on their previous product page visits. That is, you want to (in simple terms) regress an indicator of purchasing from a specific category on indicators for previous product page visits. Given this setup, you would potentially end up with hundreds of thousands of explanatory indicator variables (and potentially even linear combinations of those), while you "only" have tens of thousands of observations (one per user/customer and visit) to estimate your model. These sorts of problems are at the core of the domain of modern predictive econometrics, which shows how machine learning approaches like the lasso estimator\index{Lasso} can be applied to get reasonable estimates from such a predictive model. 26 | 27 | - The *big N*\index{Big N} problem: A dataset has massive numbers of observations (rows) such that it cannot be handled with standard data analytics techniques and/or on a standard desktop computer. For example, suppose you want to segment your e-commerce customers based on the traces they leave on your website's server. Specifically, you plan to use the server log files (when does a customer visit the site, from where, etc.) in combination with purchase records and written product reviews by users. You focus on 50 variables that you measure on a daily basis over five years for all 50,000 users. The resulting dataset has $50,000 \times 365 \times 5=91,250,000$ rows, with 50 variables (at least 50 columns) – over 4.5 billion cells. Such a dataset can easily take up dozens of gigabytes on the hard disk. Hence it will either not fit into the memory of a standard computer to begin with (import fails), or the standard programs to process and analyze the data will likely be very inefficient and take ages to finish when used on such a large dataset. There are both econometric techniques and various specialized software and hardware tools to handle such a situation. 28 | 29 | 30 | After having a close look at the practical data analytics challenges behind both *big P* and *big N* in Chapter 3, most of this book focuses on practical challenges and solutions related to *big N* problems. However, several of the chapters contain code examples that are primarily discussed as a solution to a *big N* problem, but are shown in the context of econometric/machine learning techniques that are broadly used, for example, to find good predictive models (based on many variables, i.e., *big P*). At the same time, many of the topics discussed in this book are in one way or another related to the difficulties of handling various types of structured, semi-structured, and unstructured data. Hence you will get familiar with practical techniques to deal with *complexity and variety* of data as a byproduct.
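A quick back-of-the-envelope sketch in R helps to get a feel for the magnitudes in the *big N* example above; for simplicity, it assumes that every cell is stored as an 8-byte numeric value (actual storage requirements depend on the data types and file format used).

```{r bign-envelope, eval=FALSE}
# dimensions of the hypothetical customer dataset described above
n_users <- 50000
n_days <- 365 * 5
n_vars <- 50

n_rows <- n_users * n_days   # 91,250,000 observations
n_cells <- n_rows * n_vars   # over 4.5 billion cells
n_cells

# rough in-memory size in gigabytes, assuming 8 bytes per (numeric) cell
(n_cells * 8) / 1024^3
```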
31 | 32 | 33 | -------------------------------------------------------------------------------- /02_approaches_bigdata.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Approaches to Analyzing Big Data 4 | 5 | Throughout the book, we consider four approaches to solving challenges related to analyzing big N and big P data. Those approaches should not be understood as mutually exclusive categories; rather they should help us to look at a specific problem from different angles in order to find the most efficient tool/approach to proceed. Figure \@ref(fig:approaches) presents an illustrative overview of the four approaches. 6 | 7 | 8 | 9 | ```{r approaches, echo=FALSE, out.width = "99%", fig.align='center', fig.cap= "(ref:approaches)", purl=FALSE} 10 | include_graphics("img/I_approaches.png") 11 | ``` 12 | 13 | (ref:approaches) Four approaches to/perspectives on solving big N problems in data analytics. 14 | 15 | 16 | 1. *Statistics/econometrics and machine learning*\index{Machine Learning}: During the initial hype surrounding Big Data/Data Science\index{Data Science} about a decade ago, statisticians prominently (and justifiably) pointed out that statistics techniques have always been very useful tools when analyzing "all the data" (the entire population) is too costly.^[David Donoho has nicely summarized this critique in a paper titled ["50 Years of Data Science"](https://doi.org/10.1080/10618600.2017.1384734) (@donoho_2017), which I warmly recommend.] In simple terms, when confronted with the challenge of answering an empirical question based on a *big N* dataset (which is too large to process on a normal computer), one might ask "why not simply take a random sample?" In some situations this might actually be a very reasonable question, and we should be sure to have a good answer for it before we rent a cluster computer with specialized software for distributed computing. After all, statistical inference is there to help us answer empirical questions in situations where collecting data on the entire population would be practically impossible or simply way too costly. In today's world, digital data is abundant in many domains, and the collection is not so much the problem any longer; but our standard data analytics tools are not made to analyze such amounts of data. Depending on the question and data at hand, it might thus make sense to simply use well-established "traditional" statistics/econometrics in order to properly address the empirical question. Note, though, that there are also various situations in which this would not work well. For example, consider online advertising. If you want to figure out which user characteristics make a user significantly more likely to click on a specific type of ad, you likely need hundreds of millions of data points because the expected probability that a specific user clicks on an ad is generally very low. That is, in many practical Big Data Analytics settings, you might expect rather small effects. Consequently, you need to rely on a big N dataset in order to get the statistical power to distinguish an actual effect from a zero effect. However, even then, it might make sense to first look at newer statistical procedures that are specifically made for big N data before renting a cluster computer.
Similarly, traditional statistical/econometric approaches might help to deal with big P data, but they are usually rather inefficient or have rather problematic statistical properties in such situations. However, there are also well-established machine learning approaches to better address these problems. In sum, before focusing on specialized software like Apache Hadoop\index{Apache Hadoop} or Apache Spark\index{Apache Spark} and scaling up hardware resources, make sure to use the adequate statistical tools for a Big Data situation. This can save a lot of time and money. Once you have found the most efficient statistical procedure for the problem at hand, you can focus on how to compute it. 17 | 18 | 2. *Writing efficient code*: No matter how suitable a statistical procedure is theoretically to analyze a large dataset, there are always various ways to implement this procedure in software. Some ways will be less efficient than others. When working with small or moderately sized datasets, you might not even notice whether your data analytics script is written in an efficient way. However, it might get uncomfortable to run your script once you confront it with a large dataset. Hence the question you should ask yourself when taking this perspective is, "Can I write this script in a different way to make it faster (but achieve the same result)?" Before introducing you to specialized R packages to work with large datasets, we thus look at a few important aspects of how to write efficient/fast code in R. 19 | 20 | 3. *Using limited local computing resources more efficiently*: There are several strategies to use the available local computing resources (your PC) more efficiently, and many of those have been around for a while. In simple terms, these strategies are based on the idea of more explicitly telling the computer how to allocate and use the available hardware resources as part of a data analytics task (something that is usually automatically taken care of by the PC's operating system). We will touch upon several of these strategies – such as multi-core processing and the efficient use of virtual memory – and then practically implement these strategies with the help of specialized R packages. Unlike writing more efficient R code, these packages/strategies usually come with an overhead. That is, they help you save time only after a certain threshold. In other words, not using these approaches can be faster if the dataset is not "too big". In addition, there can be trade-offs between using one vs. another hardware component more efficiently. Hence, using these strategies can be tricky, and the best approach might well depend on the specific situation. The aim is thus to make you comfortable with answering the question, "How can I use my local computing environment more efficiently to further speed up this specific analytics task?" 21 | 22 | 4. *Scaling up and scaling out*: once you have properly considered all of the above, but the task still cannot be done in a reasonable amount of time, you will need to either *scale up*\index{Scale Up} or *scale out*\index{Scale Out} the available computing resources. *Scaling up* refers to enlarging your machine (e.g., adding more random access memory) or switching to a more powerful machine altogether. Technically, this can mean literally building an additional hardware device into your PC; today it usually means renting a virtual server in the cloud. 
Instead of using a "bigger machine", *scaling out* means using several machines in concert (cluster computer, distributed systems). While this also has often been done locally (connecting several PCs to a cluster of PCs to combine all their computing power), today this too is usually done in the cloud (due to the much easier setup and maintenance). Practically, a key difference between scaling out and scaling up is that, by and large, scaling up does not require you to get familiar with specialized software. You can simply run the exact same script you tested locally on a larger machine in the cloud. Although most of the tools and services available to scale out your analyses are by now also quite easy to use, you will have to get familiar with some additional software components to really make use of the latter. In addition, in some situations, scaling up might be perfectly sufficient, while in others only scaling out makes sense (particularly if you need massive amounts of memory). In any event, you should be comfortable dealing with the questions, "Does it make sense to scale up or scale out?" and "If yes, how can it be done?" in a given situation.^[Importantly, the perspective on scaling up and scaling out provided in this book is solely focused on Big Data Analytics in the context of economic/business research. There is a large array of practical problems and corresponding solutions/tools to deal with "Big Data Analytics" in the context of application development (e.g. tools related to data streams), which this book does not cover.] 23 | 24 | 25 | Whether one or the other approach is "better" is sometimes a topic hotly debated between academics and/or practitioners with different academic backgrounds. The point of the following chapters is not to argue for one or the other approach, but to make you familiar with these different perspectives in order to make you more comfortable and able to take on large amounts of data for your analytics project. When might one or the other approach/perspective be more useful? This is highly context-dependent. However, as a general rule of thumb, consider the order in which the different approaches have been presented above. 26 | 27 | - First, ask yourself whether there isn't an absolutely trivial solution to your big N problem, such as taking a random sample. I know, this sounds banal, and you would be surprised at how many books and lectures focusing on the data engineering side of big N do not even mention this. But, we should not forget that the entire apparatus of statistical inference is essentially based on this idea.^[Originally, one could argue, the motivation for the development of statistical inference was more related to the practical problem of gathering data on an entire population than to handling a large dataset with observations of the entire population. However, in practice, inferring population properties from a random sample also works for the latter.] There is, however, a well-justified excuse for not simply taking a random sample of a large dataset. Both in academic research and in business data science and business analytics, the decision to be facilitated with data might in any event only have measurable consequences in rather few cases. That is, the effect size of deciding either for A or B is anyway expected to be small, and hence we need sufficient statistical power (large N) to make a meaningful decision.
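The following minimal sketch illustrates both points (a random sample often suffices for aggregate statistics, but small effects require a large N); the click-rate numbers are made up purely for illustration.

```{r sampling-vs-power, eval=FALSE}
# (a) for many aggregate statistics, a random sample is good enough
big_x <- rnorm(10^7, mean = 5)      # stand-in for a big N variable
mean(big_x)                         # "population" mean
mean(sample(big_x, size = 10^4))    # estimate based on a 0.1% random sample

# (b) ...but detecting a tiny effect requires a very large N:
# sample size needed per group to distinguish a click rate of 1.0% from 1.1%
# with 80% power at the 5% significance level
power.prop.test(p1 = 0.010, p2 = 0.011, power = 0.8, sig.level = 0.05)
```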
28 | 29 | - Second, once you know which statistical procedure should be run on which final sample/dataset, be aware of how to write your analytics scripts in the most efficient way. As you will see in Chapter 4, there are a handful of R idiosyncrasies that are worth keeping in mind in this regard. This will make interactive sessions in the early, exploratory phase of a Big Data project much more comfortable. 30 | 31 | - Third, once you have a clearer idea of the bottlenecks in the data preparation and analytics scripts, aim to optimize the usage of the available local computing resources. 32 | 33 | - In almost any organizational structure, be it a university department, a small firm, or a multinational conglomerate, switching from your laptop or desktop computer to a larger computing infrastructure, either locally or in the cloud, means additional administrative and budgetary hurdles (which means money and time spent on something other than interpreting data analysis results). That is, even before setting up the infrastructure and transferring your script and data, you will have to make an effort to scale up or scale out. Therefore, as a general rule of thumb, this option will be considered as a measure of last resort in this book. 34 | 35 | Following this recommended order of consideration, before we focus extensively on the topics of *using local computing resources more efficiently* and *scaling up/out* (in parts II and III of this book, respectively), we need to establish some of the basics regarding what is meant by statistical/econometric solutions for big P and big N problems (in the next chapter), as well as introducing a couple of helpful programming tools and skills for working on computationally intense tasks (in Chapter 4). 36 | -------------------------------------------------------------------------------- /BigData.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 5 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: XeLaTeX 14 | 15 | BuildType: Website 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Big Data Analytics 2 | 3 | 4 | This repository contains the source of the [Big Data Analytics book](https://umatter.github.io/BigData/), as well as supplementary online resources. The book is built using [bookdown](https://github.com/rstudio/bookdown). 5 | 6 | ## Supplementary online resources 7 | 8 | ### R code examples 9 | 10 | The [R_code_examples](/R_code_examples) folder contains R-scripts with all code examples and tutorials shown in the book. 11 | 12 | ### Data 13 | 14 | The corresponding sections in the book typically contain detailed instructions of where and how the datasets used in the code examples can be downloaded from the original sources.
15 | 16 | To ensure data availability for the code examples and tutorials in the long run, you will find smaller-scale versions of all key datasets discussed in the book in this S3 bucket: 17 | 18 | https://bda-examples.s3.eu-central-1.amazonaws.com/air_final.sqlite 19 | 20 | https://bda-examples.s3.eu-central-1.amazonaws.com/airline_id.csv 21 | 22 | https://bda-examples.s3.eu-central-1.amazonaws.com/airports.csv 23 | 24 | https://bda-examples.s3.eu-central-1.amazonaws.com/carriers.csv 25 | 26 | https://bda-examples.s3.eu-central-1.amazonaws.com/data_for_tables.dta 27 | 28 | https://bda-examples.s3.eu-central-1.amazonaws.com/economics.csv 29 | 30 | https://bda-examples.s3.eu-central-1.amazonaws.com/flights_sep_oct15.txt 31 | 32 | https://bda-examples.s3.eu-central-1.amazonaws.com/flights.csv 33 | 34 | https://bda-examples.s3.eu-central-1.amazonaws.com/ga.csv 35 | 36 | https://bda-examples.s3.eu-central-1.amazonaws.com/inflation.csv 37 | 38 | https://bda-examples.s3.eu-central-1.amazonaws.com/marketing_data.csv 39 | 40 | https://bda-examples.s3.eu-central-1.amazonaws.com/mydb.sqlite 41 | 42 | https://bda-examples.s3.eu-central-1.amazonaws.com/tlc_trips.csv 43 | 44 | Note that the AWS bucket is configured such that the [requester pays](https://docs.aws.amazon.com/AmazonS3/latest/userguide/RequesterPaysBuckets.html?icmpid=docs_amazons3_console) for requests and transfer costs. 45 | 46 | 47 | ### Installation of dependencies and packages 48 | 49 | Here you will find additional resources and hints regarding the installation of some of the tools used in the book. 50 | 51 | - `gpuR`: The package is no longer available via `install.packages()`. However, you can install it with `devtools::install_github("cdeterman/gpuR")`. For additional installation instructions (in particular regarding dependencies), see the wiki here: https://github.com/cdeterman/gpuR/wiki.
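  As a quick, optional sanity check after installing (assuming a working OpenCL setup), you can verify that the package finds a GPU:

  ```r
  library(gpuR)
  detectGPUs()
  ```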
52 | - Install Apache Spark via `sparklyr`: https://spark.rstudio.com/get-started/ 53 | - Install Tensorflow and Keras via the `tensorflow` and `keras` packages (from within R): https://tensorflow.rstudio.com/install/ 54 | 55 | 56 | -------------------------------------------------------------------------------- /R_code_examples/03_partI_2domains_bigdata.R: -------------------------------------------------------------------------------- 1 | # import/inspect data 2 | ga <- read.csv("data/ga.csv") 3 | head(ga[, c("source", "browser", "city", "purchase")]) 4 | # create model matrix (dummy vars) 5 | mm <- cbind(ga$purchase, 6 | model.matrix(purchase~source, data=ga,)[,-1]) 7 | mm_df <- as.data.frame(mm) 8 | # clean variable names 9 | names(mm_df) <- c("purchase", 10 | gsub("source", "", names(mm_df)[-1])) 11 | # run logit 12 | model1 <- glm(purchase ~ ., 13 | data=mm_df, family=binomial) 14 | 15 | model1_sum <- summary(model1) 16 | # select "significant" variables for final model 17 | pvalues <- model1_sum$coefficients[,"Pr(>|z|)"] 18 | vars <- names(pvalues[which(pvalues<0.05)][-1]) 19 | vars 20 | 21 | # specify and estimate the final model 22 | finalmodel <- glm(purchase ~., 23 | data = mm_df[, c("purchase", vars)], 24 | family = binomial) 25 | 26 | summary(finalmodel)$coef[,c("Estimate", "Pr(>|z|)")] 27 | 28 | # load packages 29 | library(gamlr) 30 | # create the model matrix 31 | mm <- model.matrix(purchase~source, data = ga) 32 | 33 | # create the sparse model matrix 34 | mm_sparse <- sparse.model.matrix(purchase~source, data = ga) 35 | # compare the object's sizes 36 | as.numeric(object.size(mm)/object.size(mm_sparse)) 37 | 38 | # run k-fold cross-validation lasso 39 | cvpurchase <- cv.gamlr(mm_sparse, ga$purchase, family="binomial") 40 | 41 | # load packages 42 | library(PRROC) 43 | # use "best" model for prediction 44 | # (model selection based on average OSS deviance 45 | pred <- predict(cvpurchase$gamlr, mm_sparse, type="response") 46 | # compute tpr, fpr; plot ROC 47 | comparison <- roc.curve(scores.class0 = pred, 48 | weights.class0=ga$purchase, 49 | curve=TRUE) 50 | plot(comparison) 51 | 52 | beta_ols <- 53 | function(X, y) { 54 | # compute cross products and inverse 55 | XXi <- solve(crossprod(X,X)) 56 | Xy <- crossprod(X, y) 57 | return( XXi %*% Xy ) 58 | } 59 | 60 | # set parameter values 61 | n <- 10000000 62 | p <- 4 63 | # generate sample based on Monte Carlo 64 | # generate a design matrix (~ our 'dataset') 65 | # with 4 variables and 10,000 observations 66 | X <- matrix(rnorm(n*p, mean = 10), ncol = p) 67 | # add column for intercept 68 | X <- cbind(rep(1, n), X) 69 | 70 | # MC model 71 | y <- 2 + 1.5*X[,2] + 4*X[,3] - 3.5*X[,4] + 0.5*X[,5] + rnorm(n) 72 | 73 | # apply the OLS estimator 74 | beta_ols(X, y) 75 | 76 | beta_uluru <- 77 | function(X_subs, y_subs, X_rem, y_rem) { 78 | # compute beta_fs 79 | #(this is simply OLS applied to the subsample) 80 | XXi_subs <- solve(crossprod(X_subs, X_subs)) 81 | Xy_subs <- crossprod(X_subs, y_subs) 82 | b_fs <- XXi_subs %*% Xy_subs 83 | # compute \mathbf{R}_{rem} 84 | R_rem <- y_rem - X_rem %*% b_fs 85 | # compute \hat{\beta}_{correct} 86 | b_correct <- 87 | (nrow(X_subs)/(nrow(X_rem))) * 88 | XXi_subs %*% crossprod(X_rem, R_rem) 89 | # beta uluru 90 | return(b_fs + b_correct) 91 | } 92 | 93 | # set size of sub-sample 94 | n_subs <- 1000 95 | # select sub-sample and remainder 96 | n_obs <- nrow(X) 97 | X_subs <- X[1L:n_subs,] 98 | y_subs <- y[1L:n_subs] 99 | X_rem <- X[(n_subs+1L):n_obs,] 100 | y_rem <- y[(n_subs+1L):n_obs] 101 | # 
apply the uluru estimator 102 | beta_uluru(X_subs, y_subs, X_rem, y_rem) 103 | 104 | # define sub-samples 105 | n_subs_sizes <- seq(from = 1000, to = 500000, by=10000) 106 | n_runs <- length(n_subs_sizes) 107 | # compute uluru result, stop time 108 | mc_results <- rep(NA, n_runs) 109 | mc_times <- rep(NA, n_runs) 110 | for (i in 1:n_runs) { 111 | # set size of sub-sample 112 | n_subs <- n_subs_sizes[i] 113 | # select sub-sample and remainder 114 | n_obs <- nrow(X) 115 | X_subs <- X[1L:n_subs,] 116 | y_subs <- y[1L:n_subs] 117 | X_rem <- X[(n_subs+1L):n_obs,] 118 | y_rem <- y[(n_subs+1L):n_obs] 119 | mc_results[i] <- beta_uluru(X_subs, 120 | y_subs, 121 | X_rem, 122 | y_rem)[2] # (1 is the intercept) 123 | mc_times[i] <- system.time(beta_uluru(X_subs, 124 | y_subs, 125 | X_rem, 126 | y_rem))[3] 127 | } 128 | # compute OLS results and OLS time 129 | ols_time <- system.time(beta_ols(X, y)) 130 | ols_res <- beta_ols(X, y)[2] 131 | 132 | # load packages 133 | library(ggplot2) 134 | # prepare data to plot 135 | plotdata <- data.frame(beta1 = mc_results, 136 | time_elapsed = mc_times, 137 | subs_size = n_subs_sizes) 138 | 139 | ggplot(plotdata, aes(x = subs_size, y = time_elapsed)) + 140 | geom_point(color="darkgreen") + 141 | geom_hline(yintercept = ols_time[3], 142 | color = "red", 143 | linewidth = 1) + 144 | theme_minimal() + 145 | ylab("Time elapsed") + 146 | xlab("Subsample size") 147 | 148 | 149 | 150 | ggplot(plotdata, aes(x = subs_size, y = beta1)) + 151 | geom_hline(yintercept = ols_res, 152 | color = "red", 153 | linewidth = 1) + 154 | geom_hline(yintercept = 1.5, 155 | color = "green", 156 | linewidth = 1) + 157 | geom_point(color="darkgreen") + 158 | theme_minimal() + 159 | ylab("Estimated coefficient") + 160 | xlab("Subsample size") 161 | -------------------------------------------------------------------------------- /R_code_examples/04_partII_software.R: -------------------------------------------------------------------------------- 1 | # how much time does it take to run this loop? 2 | system.time(for (i in 1:100) {i + 5}) 3 | 4 | # load package 5 | library(microbenchmark) 6 | # how much time does it take to run this loop (exactly)? 7 | microbenchmark(for (i in 1:100) {i + 5}) 8 | 9 | hello <- "Hello, World!" 10 | object.size(hello) 11 | 12 | # initialize a large string vector containing letters 13 | large_string <- rep(LETTERS[1:20], 1000^2) 14 | head(large_string) 15 | 16 | # store the same information as a factor in a new variable 17 | large_factor <- as.factor(large_string) 18 | 19 | # is one bigger than the other? 
20 | object.size(large_string) - object.size(large_factor) 21 | 22 | # load package 23 | library(pryr) 24 | 25 | # initialize a vector with 1000 (pseudo)-random numbers 26 | mem_change( 27 | thousand_numbers <- runif(1000) 28 | ) 29 | 30 | 31 | 32 | # initialize a vector with 1M (pseudo)-random numbers 33 | mem_change( 34 | a_million_numbers <- runif(1000^2) 35 | ) 36 | 37 | # load packages 38 | library(bench) 39 | 40 | # initialize variables 41 | x <- 1:10000 42 | z <- 1.5 43 | 44 | # approach I: loop 45 | multiplication <- 46 | function(x,z) { 47 | result <- c() 48 | for (i in 1:length(x)) {result <- c(result, x[i]*z)} 49 | return(result) 50 | } 51 | result <- multiplication(x,z) 52 | head(result) 53 | 54 | # approach II: "R-style" 55 | result2 <- x * z 56 | head(result2) 57 | 58 | # comparison 59 | benchmarking <- 60 | mark( 61 | result <- multiplication(x,z), 62 | result2 <- x * z, 63 | min_iterations = 50 64 | ) 65 | benchmarking[, 4:9] 66 | 67 | 68 | plot(benchmarking, type = "boxplot") 69 | 70 | # load package 71 | library(profvis) 72 | 73 | # analyze performance of several lines of code 74 | profvis({ 75 | x <- 1:10000 76 | z <- 1.5 77 | 78 | # approach I: loop 79 | multiplication <- 80 | function(x,z) { 81 | result <- c() 82 | for (i in 1:length(x)) {result <- c(result, x[i]*z)} 83 | return(result) 84 | } 85 | result <- multiplication(x,z) 86 | 87 | # approach II: "R-style" 88 | result2 <- x * z 89 | head(result2) 90 | }) 91 | 92 | # naïve implementation 93 | sqrt_vector <- 94 | function(x) { 95 | output <- c() 96 | for (i in 1:length(x)) { 97 | output <- c(output, x[i]^(1/2)) 98 | } 99 | 100 | return(output) 101 | } 102 | 103 | # implementation with pre-allocation of memory 104 | sqrt_vector_faster <- 105 | function(x) { 106 | output <- rep(NA, length(x)) 107 | for (i in 1:length(x)) { 108 | output[i] <- x[i]^(1/2) 109 | } 110 | 111 | return(output) 112 | } 113 | 114 | 115 | # the different sizes of the vectors we will put into the two functions 116 | input_sizes <- seq(from = 100, to = 10000, by = 100) 117 | # create the input vectors 118 | inputs <- sapply(input_sizes, rnorm) 119 | 120 | # compute outputs for each of the functions 121 | output_slower <- 122 | sapply(inputs, 123 | function(x){ system.time(sqrt_vector(x))["elapsed"] 124 | } 125 | ) 126 | output_faster <- 127 | sapply(inputs, 128 | function(x){ system.time(sqrt_vector_faster(x))["elapsed"] 129 | } 130 | ) 131 | 132 | # load packages 133 | library(ggplot2) 134 | 135 | # initialize data frame for plot 136 | plotdata <- data.frame(time_elapsed = c(output_slower, output_faster), 137 | input_size = c(input_sizes, input_sizes), 138 | Implementation= c(rep("sqrt_vector", 139 | length(output_slower)), 140 | rep("sqrt_vector_faster", 141 | length(output_faster)))) 142 | 143 | # plot 144 | ggplot(plotdata, aes(x=input_size, y= time_elapsed)) + 145 | geom_point(aes(colour=Implementation)) + 146 | theme_minimal(base_size = 18) + 147 | theme(legend.position = "bottom") + 148 | ylab("Time elapsed (in seconds)") + 149 | xlab("No. 
of elements processed") 150 | 151 | 152 | # implementation with vectorization 153 | sqrt_vector_fastest <- 154 | function(x) { 155 | output <- x^(1/2) 156 | return(output) 157 | } 158 | 159 | # speed test 160 | output_fastest <- 161 | sapply(inputs, 162 | function(x){ system.time(sqrt_vector_fastest(x))["elapsed"] 163 | } 164 | ) 165 | 166 | # load packages 167 | library(ggplot2) 168 | 169 | # initialize data frame for plot 170 | plotdata <- data.frame(time_elapsed = c(output_faster, output_fastest), 171 | input_size = c(input_sizes, input_sizes), 172 | Implementation= c(rep("sqrt_vector_faster", 173 | length(output_faster)), 174 | rep("sqrt_vector_fastest", 175 | length(output_fastest)))) 176 | 177 | # plot 178 | ggplot(plotdata, aes(x=time_elapsed, y=Implementation)) + 179 | geom_boxplot(aes(colour=Implementation), 180 | show.legend = FALSE) + 181 | theme_minimal(base_size = 18) + 182 | xlab("Time elapsed (in seconds)") 183 | 184 | 185 | 186 | 187 | # load packages 188 | library(data.table) 189 | 190 | # get a list of all file-paths 191 | textfiles <- list.files("data/twitter_texts", full.names = TRUE) 192 | 193 | 194 | # prepare loop 195 | all_texts <- list() 196 | n_files <- length(textfiles) 197 | length(all_texts) <- n_files 198 | # read all files listed in textfiles 199 | for (i in 1:n_files) { 200 | all_texts[[i]] <- fread(textfiles[i]) 201 | } 202 | 203 | 204 | # combine all in one data.table 205 | twitter_text <- rbindlist(all_texts) 206 | # check result 207 | dim(twitter_text) 208 | 209 | 210 | # use lapply instead of loop 211 | all_texts <- lapply(textfiles, fread) 212 | # combine all in one data.table 213 | twitter_text <- rbindlist(all_texts) 214 | # check result 215 | dim(twitter_text) 216 | 217 | 218 | # initialize the import function 219 | import_file <- 220 | function(x) { 221 | parsed_x <- fread(x) 222 | return(parsed_x) 223 | } 224 | 225 | # 'vectorize' it 226 | import_files <- Vectorize(import_file, SIMPLIFY = FALSE) 227 | 228 | # Apply the vectorized function 229 | all_texts <- import_files(textfiles) 230 | twitter_text <- rbindlist(all_texts) 231 | # check the result 232 | dim(twitter_text) 233 | 234 | a <- runif(10000) 235 | 236 | b <- a 237 | 238 | object_size(a) 239 | mem_change(c <- a) 240 | 241 | # load packages 242 | library(lobstr) 243 | 244 | # check memory addresses of objects 245 | obj_addr(a) 246 | obj_addr(b) 247 | 248 | # check the first element's value 249 | a[1] 250 | b[1] 251 | 252 | # modify a, check memory change 253 | mem_change(a[1] <- 0) 254 | 255 | # check memory addresses 256 | obj_addr(a) 257 | obj_addr(b) 258 | 259 | 260 | mem_change(d <- runif(10000)) 261 | mem_change(d[1] <- 0) 262 | 263 | mem_change(large_vector <- runif(10^8)) 264 | mem_change(rm(large_vector)) 265 | 266 | import_file 267 | 268 | sum 269 | 270 | # import data 271 | econ <- read.csv("data/economics.csv") 272 | 273 | # filter 274 | econ2 <- econ["1968-01-01"<=econ$date,] 275 | 276 | # compute yearly averages (basic R approach) 277 | econ2$year <- lubridate::year(econ2$date) 278 | years <- unique(econ2$year) 279 | averages <- 280 | sapply(years, FUN = function(x){ 281 | mean(econ2[econ2$year==x,"unemploy"]) 282 | }) 283 | output <- data.frame(year=years, average_unemploy=averages) 284 | 285 | # inspect the first few lines of the result 286 | head(output) 287 | 288 | 289 | 290 | 291 | SELECT 292 | 293 | strftime('%Y', `date`) AS year, 294 | 295 | AVG(unemploy) AS average_unemploy 296 | 297 | FROM econ 298 | 299 | WHERE "1968-01-01"<=`date` 300 | 301 | GROUP BY year LIMIT 6; 
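# A minimal sketch (assuming the DBI and RSQLite packages are installed) of how
# the SQL query shown above could be run from R against an in-memory SQLite
# database; the connection name `con_demo` is only an illustrative placeholder.
library(DBI)
library(RSQLite)
con_demo <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con_demo, "econ", econ)  # 'econ' as imported above
sql_aggregation <-
  "SELECT strftime('%Y', `date`) AS year,
          AVG(unemploy) AS average_unemploy
   FROM econ
   WHERE '1968-01-01' <= `date`
   GROUP BY year
   LIMIT 6;"
dbGetQuery(con_demo, sql_aggregation)
dbDisconnect(con_demo)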
302 | 303 | 304 | 305 | 306 | groupby 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | select_example 331 | 332 | 333 | 334 | simple_query 335 | 336 | 337 | 338 | 339 | 340 | # import data 341 | econ <- read.csv("data/economics.csv") 342 | inflation <- read.csv("data/inflation.csv") 343 | 344 | # prepare variable to match observations 345 | econ$year <- lubridate::year(econ$date) 346 | inflation$year <- lubridate::year(inflation$date) 347 | 348 | # create final output 349 | years <- unique(econ$year) 350 | averages <- sapply(years, FUN = function(x) { 351 | mean(econ[econ$year==x,"unemploy"]/econ[econ$year==x,"pop"])*100 352 | 353 | } ) 354 | unemp <- data.frame(year=years, 355 | average_unemp_percent=averages) 356 | # combine via the year column 357 | # keep all rows of econ 358 | output<- merge(unemp, inflation[, c("year", "inflation_percent")], by="year") 359 | # inspect output 360 | head(output) 361 | 362 | 363 | SELECT 364 | 365 | strftime('%Y', econ.date) AS year, 366 | 367 | AVG(unemploy/pop)*100 AS average_unemp_percent, 368 | 369 | inflation_percent 370 | 371 | FROM econ INNER JOIN inflation ON year = strftime('%Y', inflation.date) 372 | 373 | GROUP BY year 374 | 375 | 376 | innerjoin_example[1:6,] 377 | 378 | dbDisconnect(con) 379 | 380 | # replace "YOUR-API-KEY" with 381 | # your actual key 382 | Sys.setenv(OPENAI_API_KEY = "YOUR-API-KEY") 383 | # open chat window 384 | gptstudio:::chat_gpt_addin() 385 | 386 | select date, 387 | 388 | unemploy from econ 389 | 390 | where unemploy > 15000 391 | 392 | order by date; 393 | 394 | -------------------------------------------------------------------------------- /R_code_examples/05_partII_hardware.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | library(data.table) 3 | 4 | # load example data from basic R installation 5 | data("LifeCycleSavings") 6 | 7 | # write data to normal csv file and check size 8 | fwrite(LifeCycleSavings, file="lcs.csv") 9 | file.size("lcs.csv") 10 | 11 | # write data to a GZIPped (compressed) csv file and check size 12 | fwrite(LifeCycleSavings, file="lcs.csv.gz") 13 | file.size("lcs.csv.gz") 14 | 15 | # read/import the compressed data 16 | lcs <- data.table::fread("lcs.csv.gz") 17 | 18 | # common ZIP compression (independent of data.table package) 19 | write.csv(LifeCycleSavings, file="lcs.csv") 20 | file.size("lcs.csv") 21 | zip(zipfile = "lcs.csv.zip", files = "lcs.csv") 22 | file.size("lcs.csv.zip") 23 | 24 | # unzip/decompress and read/import data 25 | lcs_path <- unzip("lcs.csv.zip") 26 | lcs <- read.csv(lcs_path) 27 | 28 | 29 | 30 | 31 | 32 | # you can download the dataset from 33 | # https://www.kaggle.com/jackdaoud/marketing-data? 
34 | # select=marketing_data.csv 35 | 36 | # PREPARATION ----------------------------- 37 | # packages 38 | library(stringr) 39 | 40 | # import data 41 | marketing <- read.csv("data/marketing_data.csv") 42 | # clean/prepare data 43 | marketing$Income <- as.numeric(gsub("[[:punct:]]", 44 | "", 45 | marketing$Income)) 46 | marketing$days_customer <- 47 | as.Date(Sys.Date())- 48 | as.Date(marketing$Dt_Customer, "%m/%d/%y") 49 | marketing$Dt_Customer <- NULL 50 | 51 | # all sets of independent vars 52 | indep <- names(marketing)[ c(2:19, 27,28)] 53 | combinations_list <- lapply(1:length(indep), 54 | function(x) combn(indep, x, 55 | simplify = FALSE)) 56 | combinations_list <- unlist(combinations_list, 57 | recursive = FALSE) 58 | models <- lapply(combinations_list, 59 | function(x) paste("Response ~", 60 | paste(x, collapse="+"))) 61 | 62 | # COMPUTE REGRESSIONS -------------------------- 63 | N <- 10 # N <- length(models) for all 64 | pseudo_Rsq <- list() 65 | length(pseudo_Rsq) <- N 66 | for (i in 1:N) { 67 | # fit the logit model via maximum likelihood 68 | fit <- glm(models[[i]], 69 | data=marketing, 70 | family = binomial()) 71 | # compute the proportion of deviance explained by 72 | # the independent vars (~R^2) 73 | pseudo_Rsq[[i]] <- 1-(fit$deviance/fit$null.deviance) 74 | } 75 | 76 | # SELECT THE WINNER --------------- 77 | models[[which.max(pseudo_Rsq)]] 78 | 79 | 80 | # COMPUTE REGRESSIONS -------------------------- 81 | N <- 10 # N <- length(models) for all 82 | run_reg <- 83 | function(model, data, family){ 84 | # fit the logit model via maximum likelihood 85 | fit <- glm(model, data=data, family = family) 86 | # compute and return the proportion of deviance explained by 87 | # the independent vars (~R^2) 88 | return(1-(fit$deviance/fit$null.deviance)) 89 | } 90 | 91 | pseudo_Rsq_list <-lapply(models[1:N], run_reg, data=marketing, family=binomial() ) 92 | pseudo_Rsq <- unlist(pseudo_Rsq_list) 93 | 94 | # SELECT THE WINNER --------------- 95 | models[[which.max(pseudo_Rsq)]] 96 | 97 | 98 | 99 | # SET UP ------------------ 100 | 101 | # load packages 102 | library(future) 103 | library(future.apply) 104 | # instruct the package to resolve 105 | # futures in parallel (via a SOCK cluster) 106 | plan(multisession) 107 | 108 | # COMPUTE REGRESSIONS -------------------------- 109 | N <- 10 # N <- length(models) for all 110 | pseudo_Rsq_list <- future_lapply(models[1:N], 111 | run_reg, 112 | data=marketing, 113 | family=binomial() ) 114 | pseudo_Rsq <- unlist(pseudo_Rsq_list) 115 | 116 | # SELECT THE WINNER --------------- 117 | models[[which.max(pseudo_Rsq)]] 118 | 119 | 120 | # COMPUTE REGRESSIONS IN PARALLEL (MULTI-CORE) -------------------------- 121 | 122 | # packages for parallel processing 123 | library(parallel) 124 | library(doSNOW) 125 | 126 | # get the number of cores available 127 | ncores <- parallel::detectCores() 128 | # set cores for parallel processing 129 | ctemp <- makeCluster(ncores) 130 | registerDoSNOW(ctemp) 131 | 132 | # prepare loop 133 | N <- 10000 # N <- length(models) for all 134 | # run loop in parallel 135 | pseudo_Rsq <- 136 | foreach ( i = 1:N, .combine = c) %dopar% { 137 | # fit the logit model via maximum likelihood 138 | fit <- glm(models[[i]], 139 | data=marketing, 140 | family = binomial()) 141 | # compute the proportion of deviance explained by 142 | # the independent vars (~R^2) 143 | return(1-(fit$deviance/fit$null.deviance)) 144 | } 145 | 146 | # SELECT THE WINNER --------------- 147 | models[[which.max(pseudo_Rsq)]] 148 | 149 | 150 | # COMPUTE 
REGRESSIONS IN PARALLEL (MULTI-CORE) --------------- 151 | 152 | 153 | # prepare parallel lapply (based on forking, 154 | # here clearly faster than foreach) 155 | N <- 10000 # N <- length(models) for all 156 | # run parallel lapply 157 | pseudo_Rsq <- mclapply(1:N, 158 | mc.cores = ncores, 159 | FUN = function(i){ 160 | # fit the logit model 161 | fit <- glm(models[[i]], 162 | data=marketing, 163 | family = binomial()) 164 | # compute the proportion of deviance 165 | # explained by the independent vars (~R^2) 166 | return(1-(fit$deviance/fit$null.deviance)) 167 | }) 168 | 169 | # SELECT THE WINNER, SHOW FINAL OUTPUT --------------- 170 | 171 | best_model <- models[[which.max(pseudo_Rsq)]] 172 | best_model 173 | 174 | 175 | 176 | 177 | 178 | 179 | # load package 180 | library(bench) 181 | library(gpuR) 182 | 183 | 184 | # initialize dataset with pseudo-random numbers 185 | N <- 10000 # number of observations 186 | P <- 100 # number of variables 187 | X <- matrix(rnorm(N * P, 0, 1), nrow = N, ncol =P) 188 | 189 | 190 | # prepare GPU-specific objects/settings 191 | # transfer matrix to GPU (matrix stored in GPU memory) 192 | vclX <- vclMatrix(X, type = "float") 193 | 194 | # compare three approaches 195 | gpu_cpu <- bench::mark( 196 | 197 | # compute with CPU 198 | cpu <-t(X) %*% X, 199 | 200 | # GPU version, in GPU memory 201 | # (vclMatrix formation is a memory transfer) 202 | gpu <- t(vclX) %*% vclX, 203 | 204 | check = FALSE, memory = FALSE, min_iterations = 200) 205 | 206 | plot(gpu_cpu, type = "boxplot") 207 | 208 | include_graphics("img/gpu_cpu.png") 209 | -------------------------------------------------------------------------------- /R_code_examples/06_partII_distributedsystems.R: -------------------------------------------------------------------------------- 1 | # initialize the input text (for simplicity as one text string) 2 | input_text <- 3 | "Apple Orange Mango 4 | Orange Grapes Plum 5 | Apple Plum Mango 6 | Apple Apple Plum" 7 | 8 | 9 | # Mapper splits input into lines 10 | lines <- as.list(strsplit(input_text, "\n")[[1]]) 11 | lines[1:2] 12 | 13 | 14 | # Mapper splits lines into key–value pairs 15 | map_fun <- 16 | function(x){ 17 | 18 | # remove special characters 19 | x_clean <- gsub("[[:punct:]]", "", x) 20 | # split line into words 21 | keys <- unlist(strsplit(x_clean, " ")) 22 | # initialize key–value pairs 23 | key_values <- rep(1, length(keys)) 24 | names(key_values) <- keys 25 | 26 | return(key_values) 27 | } 28 | 29 | kv_pairs <- Map(map_fun, lines) 30 | 31 | # look at the result 32 | kv_pairs[1:2] 33 | 34 | # order and shuffle 35 | kv_pairs <- unlist(kv_pairs) 36 | keys <- unique(names(kv_pairs)) 37 | keys <- keys[order(keys)] 38 | shuffled <- lapply(keys, 39 | function(x) kv_pairs[x == names(kv_pairs)]) 40 | shuffled[1:2] 41 | 42 | sums <- lapply(shuffled, Reduce, f=sum) 43 | names(sums) <- keys 44 | sums[1:2] 45 | 46 | # create directory for input files (typically text files) 47 | 48 | mkdir ~/input 49 | 50 | 51 | echo "Apple Orange Mango 52 | 53 | Orange Grapes Plum 54 | 55 | Apple Plum Mango 56 | 57 | Apple Apple Plum" >> ~/input/text.txt 58 | 59 | 60 | 61 | 62 | # run mapreduce word count 63 | 64 | /usr/local/hadoop/bin/hadoop jar \ 65 | 66 | /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.10.1.jar \ 67 | 68 | wordcount 69 | 70 | ~/input ~/wc_example 71 | 72 | 73 | cat ~/wc_example/* 74 | 75 | 76 | 77 | 78 | # might have to switch to java version 8 first 79 | 80 | sudo update-alternatives --config java 81 | 82 | 83 | 84 | 85 | $ 
SPARK-HOME/bin/sparkR 86 | 87 | 88 | # to install use 89 | # devtools::install_github("cran/SparkR") 90 | # load packages 91 | library(SparkR) 92 | # start session 93 | sparkR.session() 94 | 95 | 96 | # install.packages("SparkR") 97 | # or, if temporarily not available on CRAN: 98 | #if (!require('devtools')) install.packages('devtools') 99 | #devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') # replace x.x with the version of your spark installation 100 | 101 | # load packages 102 | library(SparkR) 103 | 104 | # start session 105 | sparkR.session(sparkHome = "/home/umatter/.cache/spark/spark-3.1.2-bin-hadoop2.7") 106 | 107 | 108 | 109 | # Import data and create a SparkDataFrame 110 | # (a distributed collection of data, RDD) 111 | flights <- read.df("data/flights.csv", source = "csv", header="true") 112 | 113 | # inspect the object 114 | class(flights) 115 | dim(flights) 116 | 117 | 118 | flights$dep_delay <- cast(flights$dep_delay, "double") 119 | flights$dep_time <- cast(flights$dep_time, "double") 120 | flights$arr_time <- cast(flights$arr_time, "double") 121 | flights$arr_delay <- cast(flights$arr_delay, "double") 122 | flights$air_time <- cast(flights$air_time, "double") 123 | flights$distance <- cast(flights$distance, "double") 124 | 125 | # filter 126 | long_flights <- select(flights, "carrier", "year", "arr_delay", "distance") 127 | long_flights <- filter(long_flights, long_flights$distance >= 1000) 128 | head(long_flights) 129 | 130 | # aggregation: mean delay per carrier 131 | long_flights_delays<- summarize(groupBy(long_flights, long_flights$carrier), 132 | avg_delay = mean(long_flights$arr_delay)) 133 | head(long_flights_delays) 134 | 135 | # Convert result back into native R object 136 | delays <- collect(long_flights_delays) 137 | class(delays) 138 | delays 139 | 140 | cd SPARK-HOME 141 | 142 | 143 | 144 | 145 | $ bin/spark-sql 146 | 147 | 148 | {"name":"Michael", "salary":3000} 149 | 150 | {"name":"Andy", "salary":4500} 151 | 152 | {"name":"Justin", "salary":3500} 153 | 154 | {"name":"Berta", "salary":4000} 155 | 156 | 157 | 158 | 159 | SELECT * 160 | 161 | FROM json.`examples/src/main/resources/employees.json` 162 | 163 | ; 164 | 165 | 166 | 167 | 168 | 169 | 170 | SELECT * 171 | 172 | FROM json.`examples/src/main/resources/employees.json` 173 | 174 | WHERE salary <4000 175 | 176 | ; 177 | 178 | 179 | 180 | 181 | 182 | 183 | SELECT AVG(salary) AS mean_salary 184 | 185 | FROM json.`examples/src/main/resources/employees.json`; 186 | 187 | 188 | 189 | 190 | # to install use 191 | # devtools::install_github("cran/SparkR") 192 | # load packages 193 | library(SparkR) 194 | # start session 195 | sparkR.session() 196 | # read data 197 | flights <- read.df("data/flights.csv", source = "csv", header="true") 198 | 199 | 200 | # register the data frame as a table 201 | createOrReplaceTempView(flights, "flights" ) 202 | 203 | # now run SQL queries on it 204 | query <- 205 | "SELECT DISTINCT carrier, 206 | year, 207 | arr_delay, 208 | distance 209 | FROM flights 210 | WHERE 1000 <= distance" 211 | 212 | long_flights2 <- sql(query) 213 | head(long_flights2) 214 | 215 | -------------------------------------------------------------------------------- /R_code_examples/07_partII_cloudcomputing.R: -------------------------------------------------------------------------------- 1 | # install packages for parallelization 2 | install.packages("parallel", "doSNOW", "stringr") 3 | 4 | # load packages 5 | library(parallel) 6 | library(doSNOW) 7 | 8 | # verify no. 
of cores available 9 | n_cores <- detectCores() 10 | n_cores 11 | 12 | 13 | 14 | # PREPARATION ----------------------------- 15 | 16 | # packages 17 | library(stringr) 18 | 19 | # import data 20 | marketing <- read.csv("data/marketing_data.csv") 21 | # clean/prepare data 22 | marketing$Income <- as.numeric(gsub("[[:punct:]]", "", marketing$Income)) 23 | marketing$days_customer <- as.Date(Sys.Date())- 24 | as.Date(marketing$Dt_Customer, "%m/%d/%y") 25 | marketing$Dt_Customer <- NULL 26 | 27 | # all sets of independent vars 28 | indep <- names(marketing)[ c(2:19, 27,28)] 29 | combinations_list <- lapply(1:length(indep), 30 | function(x) combn(indep, x, simplify = FALSE)) 31 | combinations_list <- unlist(combinations_list, recursive = FALSE) 32 | models <- lapply(combinations_list, 33 | function(x) paste("Response ~", paste(x, collapse="+"))) 34 | 35 | # set cores for parallel processing 36 | # ctemp <- makeCluster(ncores) 37 | # registerDoSNOW(ctemp) 38 | 39 | # prepare loop 40 | N <- 10 # just for illustration, the actual code is N <- length(models) 41 | # run loop in parallel 42 | pseudo_Rsq <- 43 | foreach ( i = 1:N, .combine = c) %dopar% { 44 | # fit the logit model via maximum likelihood 45 | fit <- glm(models[[i]], data=marketing, family = binomial()) 46 | # compute the proportion of deviance explained 47 | #by the independent vars (~R^2) 48 | return(1-(fit$deviance/fit$null.deviance)) 49 | } 50 | 51 | 52 | # set cores for parallel processing 53 | ctemp <- makeCluster(ncores) 54 | registerDoSNOW(ctemp) 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | library(knitr) 66 | hook_output = knit_hooks$get('output') 67 | knit_hooks$set(output = function(x, options) { 68 | # this hook is used only when the linewidth option is not NULL 69 | if (!is.null(n <- options$linewidth)) { 70 | x = xfun::split_lines(x) 71 | # any lines wider than n should be wrapped 72 | if (any(nchar(x) > n)) x = strwrap(x, width = n) 73 | x = paste(x, collapse = '\n') 74 | } 75 | hook_output(x, options) 76 | }) 77 | 78 | aws emr create-cluster \ 79 | 80 | --release-label emr-6.1.0 \ 81 | 82 | --applications Name=Hadoop Name=Spark Name=Hive Name=Pig \ 83 | 84 | Name=Tez Name=Ganglia \ 85 | 86 | --name "EMR 6.1 RStudio + sparklyr" \ 87 | 88 | --service-role EMR_DefaultRole \ 89 | 90 | --instance-groups InstanceGroupType=MASTER,InstanceCount=1,\ 91 | 92 | InstanceType=m3.2xlarge,InstanceGroupType=CORE,\ 93 | 94 | InstanceCount=2,InstanceType=m3.2xlarge \ 95 | 96 | --bootstrap-action \ 97 | 98 | Path='s3://aws-bigdata-blog/artifacts/ 99 | 100 | aws-blog-emr-rstudio-sparklyr/rstudio_sparklyr_emr6.sh',\ 101 | 102 | Name="Install RStudio" --ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,\ 103 | 104 | KeyName="sparklyr" 105 | 106 | --configurations '[{"Classification":"spark", 107 | 108 | "Properties":{"maximizeResourceAllocation":"true"}}]' \ 109 | 110 | --region us-east-1 111 | 112 | 113 | 114 | 115 | # load packages 116 | library(sparklyr) 117 | # connect rstudio session to cluster 118 | sc <- spark_connect(master = "yarn") 119 | 120 | -------------------------------------------------------------------------------- /R_code_examples/09_partIII_cleaning_transformation.R: -------------------------------------------------------------------------------- 1 | fs::file_size("data/flights.csv") 2 | 3 | if (dir.exists("ff_files")){ 4 | unlink("ff_files", recursive = TRUE, force = TRUE) 5 | } 6 | 7 | 8 | 9 | # SET UP -------------- 10 | # install.packages(c("ff", "ffbase")) 11 | # you might have to install the ffbase package 
directly from GitHub: 12 | # devtools::install_github("edwindj/ffbase", subdir="pkg") 13 | # load packages 14 | library(ff) 15 | library(ffbase) 16 | library(data.table) # for comparison 17 | 18 | 19 | # create directory for ff chunks, and assign directory to ff 20 | system("mkdir ff_files") 21 | options(fftempdir = "ff_files") 22 | 23 | 24 | # usual in-memory csv import 25 | flights_dt <- fread("data/flights.csv") 26 | 27 | # out-of-memory approach 28 | flights <- 29 | read.table.ffdf(file="data/flights.csv", 30 | sep=",", 31 | VERBOSE=TRUE, 32 | header=TRUE, 33 | next.rows=100000, 34 | colClasses=NA) 35 | 36 | # compare object sizes 37 | object.size(flights) # out-of-memory approach 38 | object.size(flights_dt) # common data.table 39 | 40 | # show the files in the directory keeping the chunks 41 | head(list.files("ff_files")) 42 | 43 | 44 | 45 | # SET UP ---------------- 46 | 47 | # load packages 48 | library(bigmemory) 49 | library(biganalytics) 50 | 51 | # import the data 52 | flights <- read.big.matrix("data/flights.csv", 53 | type="integer", 54 | header=TRUE, 55 | backingfile="flights.bin", 56 | descriptorfile="flights.desc") 57 | 58 | object.size(flights) 59 | str(flights) 60 | 61 | 62 | # SET UP ---------------- 63 | 64 | # load packages 65 | library(arrow) 66 | 67 | # import the data 68 | flights <- read_csv_arrow("data/flights.csv", 69 | as_data_frame = FALSE) 70 | 71 | summary(flights) 72 | object.size(flights) 73 | 74 | 75 | SET UP ------------------------ 76 | 77 | # create and set directory for ff files 78 | system("mkdir ff_files") 79 | options(fftempdir = "ff_files") 80 | 81 | # load packages 82 | library(ff) 83 | library(ffbase) 84 | library(pryr) 85 | 86 | # fix vars 87 | FLIGHTS_DATA <- "data/flights_sep_oct15.txt" 88 | AIRLINES_DATA <- "data/airline_id.csv" 89 | 90 | 91 | 92 | # DATA IMPORT ------------------ 93 | 94 | # check memory used 95 | mem_used() 96 | 97 | # 1. Upload flights_sep_oct15.txt and airline_id.csv files from flat files. 98 | 99 | system.time(flights.ff <- read.table.ffdf(file=FLIGHTS_DATA, 100 | sep=",", 101 | VERBOSE=TRUE, 102 | header=TRUE, 103 | next.rows=100000, 104 | colClasses=NA)) 105 | 106 | system.time(airlines.ff <- read.csv.ffdf(file= AIRLINES_DATA, 107 | VERBOSE=TRUE, 108 | header=TRUE, 109 | next.rows=100000, 110 | colClasses=NA)) 111 | 112 | # check memory used 113 | mem_used() 114 | 115 | 116 | # Using read.table() 117 | system.time(flights.table <- read.table(FLIGHTS_DATA, 118 | sep=",", 119 | header=TRUE)) 120 | system.time(airlines.table <- read.csv(AIRLINES_DATA, 121 | header = TRUE)) 122 | # check the memory used 123 | mem_used() 124 | 125 | 126 | # 2. Inspect the ff_files objects. 127 | For flights.ff object: 128 | class(flights.ff) 129 | dim(flights.ff) 130 | For airlines.ff object: 131 | class(airlines.ff) 132 | dim(airlines.ff) 133 | 134 | 135 | # step 1: 136 | # Rename "Code" variable from airlines.ff 137 | # to "AIRLINE_ID" and "Description" into "AIRLINE_NM". 
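# (Optional sanity check before renaming, purely illustrative: this assumes
#  airlines.ff still carries the two original columns, "Code" and "Description",
#  as imported above; adjust the renaming below if your source file differs.)
names(airlines.ff)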
138 | names(airlines.ff) <- c("AIRLINE_ID", "AIRLINE_NM") 139 | names(airlines.ff) 140 | str(airlines.ff[1:20,]) 141 | 142 | # merge of ff_files objects 143 | mem_change(flights.data.ff <- merge.ffdf(flights.ff, 144 | airlines.ff, 145 | by="AIRLINE_ID")) 146 | #The new object is only 551.2 KB in size 147 | class(flights.data.ff) 148 | dim(flights.data.ff) 149 | names(flights.data.ff) 150 | 151 | ##For flights.table: 152 | names(airlines.table) <- c("AIRLINE_ID", "AIRLINE_NM") 153 | names(airlines.table) 154 | str(airlines.table[1:20,]) 155 | 156 | # check memory usage of merge in RAM 157 | mem_change(flights.data.table <- merge(flights.table, 158 | airlines.table, 159 | by="AIRLINE_ID")) 160 | #The new object is already 105.7 MB in size 161 | #A rapid spike in RAM use when processing 162 | 163 | mem_used() 164 | 165 | # Subset the ff_files object flights.data.ff: 166 | subs1.ff <- 167 | subset.ffdf(flights.data.ff, 168 | CANCELLED == 1, 169 | select = c(FL_DATE, 170 | AIRLINE_ID, 171 | ORIGIN_CITY_NAME, 172 | ORIGIN_STATE_NM, 173 | DEST_CITY_NAME, 174 | DEST_STATE_NM, 175 | CANCELLATION_CODE)) 176 | 177 | dim(subs1.ff) 178 | mem_used() 179 | 180 | 181 | # Save a newly created ff_files object to a data file: 182 | # (7 files (one for each column) created in the ffdb directory) 183 | save.ffdf(subs1.ff, overwrite = TRUE) 184 | 185 | 186 | # Loading previously saved ff_files files: 187 | rm(subs1.ff) 188 | #gc() 189 | load.ffdf("ffdb") 190 | # check the class and structure of the loaded data 191 | class(subs1.ff) 192 | dim(subs1.ff) 193 | dimnames(subs1.ff) 194 | 195 | # Export subs1.ff into CSV and TXT files: 196 | write.csv.ffdf(subs1.ff, "subset1.csv") 197 | 198 | 199 | 200 | # SET UP ---------------- 201 | 202 | # load packages 203 | library(arrow) 204 | library(dplyr) 205 | library(pryr) # for profiling 206 | 207 | # fix vars 208 | FLIGHTS_DATA <- "data/flights_sep_oct15.txt" 209 | AIRLINES_DATA <- "data/airline_id.csv" 210 | 211 | # import the data 212 | flights <- read_csv_arrow(FLIGHTS_DATA, 213 | as_data_frame = FALSE) 214 | airlines <- read_csv_arrow(AIRLINES_DATA, 215 | as_data_frame = FALSE) 216 | 217 | class(flights) 218 | class(airlines) 219 | object_size(flights) 220 | object_size(airlines) 221 | 222 | # step 1: 223 | # Rename "Code" variable from airlines.ff to "AIRLINE_ID" 224 | # and "Description" into "AIRLINE_NM". 
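# (Note: unlike the ff workflow above, `airlines` here is an Arrow Table,
#  because read_csv_arrow() was called with as_data_frame = FALSE. Renaming via
#  names()<- works the same way as for a data frame, and the dplyr verbs used
#  further below are evaluated lazily until collect() or compute() is called.)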
225 | names(airlines) <- c("AIRLINE_ID", "AIRLINE_NM") 226 | names(airlines) 227 | 228 | # merge the two datasets via Arrow 229 | flights.data.ar <- inner_join(airlines, flights, by="AIRLINE_ID") 230 | object_size(flights.data.ar) 231 | 232 | 233 | # Subset the ff_files object flights.data.ff: 234 | subs1.ar <- 235 | flights.data.ar %>% 236 | filter(CANCELLED == 1) %>% 237 | select(FL_DATE, 238 | AIRLINE_ID, 239 | ORIGIN_CITY_NAME, 240 | ORIGIN_STATE_NM, 241 | DEST_CITY_NAME, 242 | DEST_STATE_NM, 243 | CANCELLATION_CODE) 244 | 245 | object_size(subs1.ar) 246 | 247 | mem_change(subs1.ar.df <- collect(subs1.ar)) 248 | class(subs1.ar.df) 249 | object_size(subs1.ar.df) 250 | 251 | subs1.ar %>% 252 | compute() %>% 253 | write_csv_arrow(file="data/subs1.ar.csv") 254 | -------------------------------------------------------------------------------- /R_code_examples/10_partIII_descriptives_aggregation.R: -------------------------------------------------------------------------------- 1 | # load packages 2 | library(ff) 3 | library(ffbase) 4 | 5 | # set up the ff directory (for data file chunks) 6 | if (!dir.exists("fftaxi")){ 7 | system("mkdir fftaxi") 8 | } 9 | options(fftempdir = "fftaxi") 10 | 11 | # import the first one million observations 12 | taxi <- read.table.ffdf(file = "data/tlc_trips.csv", 13 | sep = ",", 14 | header = TRUE, 15 | next.rows = 100000, 16 | # colClasses= col_classes, 17 | nrows = 1000000 18 | ) 19 | 20 | 21 | # inspect the factor levels 22 | levels(taxi$Payment_Type) 23 | # recode them 24 | levels(taxi$Payment_Type) <- tolower(levels(taxi$Payment_Type)) 25 | taxi$Payment_Type <- ff(taxi$Payment_Type, 26 | levels = unique(levels(taxi$Payment_Type)), 27 | ramclass = "factor") 28 | # check result 29 | levels(taxi$Payment_Type) 30 | 31 | 32 | 33 | # load packages 34 | library(doBy) 35 | 36 | # split-apply-combine procedure on data file chunks 37 | tip_pcategory <- ffdfdply(taxi, 38 | split = taxi$Payment_Type, 39 | BATCHBYTES = 100000000, 40 | FUN = function(x) { 41 | summaryBy(Tip_Amt~Payment_Type, 42 | data = x, 43 | FUN = mean, 44 | na.rm = TRUE)}) 45 | 46 | as.data.frame(tip_pcategory) 47 | 48 | # add additional column with the share of tip 49 | taxi$percent_tip <- (taxi$Tip_Amt/taxi$Total_Amt)*100 50 | 51 | # recompute the aggregate stats 52 | tip_pcategory <- ffdfdply(taxi, 53 | split = taxi$Payment_Type, 54 | BATCHBYTES = 100000000, 55 | FUN = function(x) { 56 | # note the difference here 57 | summaryBy(percent_tip~Payment_Type, 58 | data = x, 59 | FUN = mean, 60 | na.rm = TRUE)}) 61 | # show result as data frame 62 | as.data.frame(tip_pcategory) 63 | 64 | table.ff(taxi$Payment_Type) 65 | 66 | # select the subset of observations only containing trips paid by 67 | # credit card or cash 68 | taxi_sub <- subset.ffdf(taxi, Payment_Type=="credit" | Payment_Type == "cash") 69 | taxi_sub$Payment_Type <- ff(taxi_sub$Payment_Type, 70 | levels = c("credit", "cash"), 71 | ramclass = "factor") 72 | 73 | # compute the cross tabulation 74 | crosstab <- table.ff(taxi_sub$Passenger_Count, 75 | taxi_sub$Payment_Type 76 | ) 77 | # add names to the margins 78 | names(dimnames(crosstab)) <- c("Passenger count", "Payment type") 79 | # show result 80 | crosstab 81 | 82 | # install.packages(vcd) 83 | # load package for mosaic plot 84 | library(vcd) 85 | 86 | # generate a mosaic plot 87 | mosaic(crosstab, shade = TRUE) 88 | 89 | # load packages 90 | library(arrow) 91 | library(dplyr) 92 | 93 | # read the CSV file 94 | taxi <- read_csv_arrow("data/tlc_trips.csv", 95 | as_data_frame = FALSE) 96 
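# (Note: with as_data_frame = FALSE the taxi data are held as an Arrow Table
#  outside R's memory-managed heap; the mutate/group_by/summarize chain below
#  is only executed once collect() is called. A purely illustrative check of
#  how little of the data sits in RAM:
#  format(object.size(taxi), units = "MB"))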
| 97 | 98 | 99 | # clean the categorical variable; aggregate by group 100 | taxi <- 101 | taxi %>% 102 | mutate(Payment_Type = tolower(Payment_Type)) 103 | 104 | taxi_summary <- 105 | taxi %>% 106 | mutate(percent_tip = (Tip_Amt/Total_Amt)*100 ) %>% 107 | group_by(Payment_Type) %>% 108 | summarize(avg_percent_tip = mean(percent_tip)) %>% 109 | collect() 110 | 111 | library(tidyr) 112 | 113 | # compute the frequencies; pull result into R 114 | ct <- taxi %>% 115 | filter(Payment_Type %in% c("credit", "cash")) %>% 116 | group_by(Passenger_Count, Payment_Type) %>% 117 | summarize(n=n())%>% 118 | collect() 119 | 120 | # present as cross-tabulation 121 | pivot_wider(data=ct, 122 | names_from="Passenger_Count", 123 | values_from = "n") 124 | 125 | 126 | # load packages 127 | library(data.table) 128 | 129 | # import data into RAM (needs around 200MB) 130 | taxi <- fread("data/tlc_trips.csv", 131 | nrows = 1000000) 132 | 133 | 134 | # clean the factor levels 135 | taxi$Payment_Type <- tolower(taxi$Payment_Type) 136 | taxi$Payment_Type <- factor(taxi$Payment_Type, 137 | levels = unique(taxi$Payment_Type)) 138 | 139 | 140 | taxi[, mean(Tip_Amt/Total_Amt)] 141 | 142 | taxi[, .(percent_tip = mean((Tip_Amt/Total_Amt)*100)), by = Payment_Type] 143 | 144 | dcast(taxi[Payment_Type %in% c("credit", "cash")], 145 | Passenger_Count~Payment_Type, 146 | fun.aggregate = length, 147 | value.var = "vendor_name") 148 | 149 | # housekeeping 150 | #gc() 151 | system("rm -r fftaxi") 152 | -------------------------------------------------------------------------------- /R_code_examples/12_partIV_data_analyticsI.R: -------------------------------------------------------------------------------- 1 | # SET UP ------------------ 2 | # load packages 3 | library(foreign) 4 | library(data.table) 5 | library(lmtest) 6 | # fix vars 7 | DATA_PATH <- "data/data_for_tables.dta" 8 | 9 | # import data 10 | cm <- as.data.table(read.dta(DATA_PATH)) 11 | # keep only clean obs 12 | cm <- cm[!(is.na(yes) 13 | |is.na(pctsumyessameparty) 14 | |is.na(pctsumyessameschool) 15 | |is.na(pctsumyessamestate))] 16 | 17 | 18 | # pooled model (no FE) 19 | model0 <- yes ~ 20 | pctsumyessameschool + 21 | pctsumyessamestate + 22 | pctsumyessameparty 23 | 24 | dim(model.matrix(model0, data=cm)) 25 | 26 | model1 <- 27 | yes ~ pctsumyessameschool + 28 | pctsumyessamestate + 29 | pctsumyessameparty + 30 | factor(congress) + 31 | factor(id) -1 32 | mm1 <- model.matrix(model1, data=cm) 33 | dim(mm1) 34 | 35 | 36 | # fit specification (1) 37 | runtime <- system.time(fit1 <- lm(data = cm, formula = model1)) 38 | coeftest(fit1)[2:4,] 39 | # median amount of time needed for estimation 40 | runtime[3] 41 | 42 | # illustration of within transformation for the senator fixed effects 43 | cm_within <- 44 | with(cm, data.table(yes = yes - ave(yes, id), 45 | pctsumyessameschool = pctsumyessameschool - 46 | ave(pctsumyessameschool, id), 47 | pctsumyessamestate = pctsumyessamestate - 48 | ave(pctsumyessamestate, id), 49 | pctsumyessameparty = pctsumyessameparty - 50 | ave(pctsumyessameparty, id) 51 | )) 52 | 53 | # comparison of dummy fixed effects estimator and within estimator 54 | dummy_time <- system.time(fit_dummy <- 55 | lm(yes ~ pctsumyessameschool + 56 | pctsumyessamestate + 57 | pctsumyessameparty + 58 | factor(id) -1, data = cm 59 | )) 60 | within_time <- system.time(fit_within <- 61 | lm(yes ~ pctsumyessameschool + 62 | pctsumyessamestate + 63 | pctsumyessameparty -1, 64 | data = cm_within)) 65 | # computation time comparison 66 | 
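# (The ratio computed below is the elapsed time of the within estimator
#  relative to the dummy-variable estimator; a value well below 1 indicates
#  that the within transformation absorbs the senator fixed effects much
#  faster than estimating all the dummies explicitly.)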
as.numeric(within_time[3])/as.numeric(dummy_time[3]) 67 | 68 | # comparison of estimates 69 | coeftest(fit_dummy)[1:3,] 70 | coeftest(fit_within) 71 | 72 | 73 | library(lfe) 74 | 75 | # model and clustered SE specifications 76 | model1 <- yes ~ pctsumyessameschool + 77 | pctsumyessamestate + 78 | pctsumyessameparty |congress+id|0|id 79 | model2 <- yes ~ pctsumyessameschool + 80 | pctsumyessamestate + 81 | pctsumyessameparty |congress_session_votenumber+id|0|id 82 | 83 | # estimation 84 | fit1 <- felm(model1, data=cm) 85 | fit2 <- felm(model2, data=cm) 86 | 87 | stargazer::stargazer(fit1,fit2, 88 | type="text", 89 | dep.var.labels = "Vote (yes/no)", 90 | covariate.labels = c("School Connected Votes", 91 | "State Votes", 92 | "Party Votes"), 93 | keep.stat = c("adj.rsq", "n")) 94 | 95 | # read dataset into R 96 | economics <- read.csv("data/economics.csv") 97 | # have a look at the data 98 | head(economics, 2) 99 | # create a 'large' dataset out of this 100 | for (i in 1:3) { 101 | economics <- rbind(economics, economics) 102 | } 103 | dim(economics) 104 | 105 | 106 | # Naïve approach (ignorant of R) 107 | deflator <- 1.05 # define deflator 108 | # iterate through each observation 109 | pce_real <- c() 110 | n_obs <- length(economics$pce) 111 | for (i in 1:n_obs) { 112 | pce_real <- c(pce_real, economics$pce[i]/deflator) 113 | } 114 | 115 | # look at the result 116 | head(pce_real, 2) 117 | 118 | 119 | 120 | # Naïve approach (ignorant of R) 121 | deflator <- 1.05 # define deflator 122 | # iterate through each observation 123 | pce_real <- list() 124 | n_obs <- length(economics$pce) 125 | time_elapsed <- 126 | system.time( 127 | for (i in 1:n_obs) { 128 | pce_real <- c(pce_real, economics$pce[i]/deflator) 129 | }) 130 | 131 | time_elapsed 132 | 133 | 134 | 135 | time_per_row <- time_elapsed[3]/n_obs 136 | time_per_row 137 | 138 | 139 | # in seconds 140 | (time_per_row*100^4) 141 | # in minutes 142 | (time_per_row*100^4)/60 143 | # in hours 144 | (time_per_row*100^4)/60^2 145 | 146 | 147 | # Improve memory allocation (still somewhat ignorant of R) 148 | deflator <- 1.05 # define deflator 149 | n_obs <- length(economics$pce) 150 | # allocate memory beforehand 151 | # Initialize the vector to the right size 152 | pce_real <- rep(NA, n_obs) 153 | # iterate through each observation 154 | time_elapsed <- 155 | system.time( 156 | for (i in 1:n_obs) { 157 | pce_real[i] <- economics$pce[i]/deflator 158 | }) 159 | 160 | 161 | 162 | 163 | time_per_row <- time_elapsed[3]/n_obs 164 | time_per_row 165 | 166 | 167 | # in seconds 168 | (time_per_row*100^4) 169 | # in minutes 170 | (time_per_row*100^4)/60 171 | # in hours 172 | (time_per_row*100^4)/60^2 173 | 174 | 175 | # Do it 'the R way' 176 | deflator <- 1.05 # define deflator 177 | # Exploit R's vectorization 178 | time_elapsed <- 179 | system.time( 180 | pce_real <- economics$pce/deflator 181 | ) 182 | # same result 183 | head(pce_real, 2) 184 | 185 | 186 | library(microbenchmark) 187 | # measure elapsed time in microseconds (avg.) 
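# (Note: microbenchmark() evaluates the expression repeatedly, 100 times by
#  default, and summary() reports timings in the unit shown in its output; the
#  division by 10^6 further below assumes that this unit is microseconds. If in
#  doubt, you can pin the unit explicitly, e.g. summary(..., unit = "us"),
#  provided your version of the package supports the unit argument.)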
188 | time_elapsed <- 189 | summary(microbenchmark(pce_real <- economics$pce/deflator))$mean 190 | # per row (in sec) 191 | time_per_row <- (time_elapsed/n_obs)/10^6 192 | 193 | 194 | # in seconds 195 | (time_per_row*100^4) 196 | # in minutes 197 | (time_per_row*100^4)/60 198 | # in hours 199 | (time_per_row*100^4)/60^2 200 | 201 | 202 | url <- 203 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 204 | stopdata <- data.table::fread(url) 205 | 206 | url <- 207 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 208 | stopdata <- data.table::fread(url) 209 | 210 | # remove incomplete obs 211 | stopdata <- na.omit(stopdata) 212 | # code dependent var 213 | stopdata$vsearch <- 0 214 | stopdata$vsearch[stopdata$vehicleSearch=="YES"] <- 1 215 | # code explanatory var 216 | stopdata$white <- 0 217 | stopdata$white[stopdata$race=="White"] <- 1 218 | 219 | model <- vsearch ~ white + factor(policePrecinct) 220 | 221 | fit <- lm(model, stopdata) 222 | summary(fit) 223 | 224 | # load packages 225 | library(data.table) 226 | # set the 'seed' for random numbers (makes the example reproducible) 227 | set.seed(2) 228 | 229 | # set number of bootstrap iterations 230 | B <- 10 231 | # get selection of precincts 232 | precincts <- unique(stopdata$policePrecinct) 233 | # container for coefficients 234 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 235 | # draw bootstrap samples, estimate model for each sample 236 | for (i in 1:B) { 237 | 238 | # draw sample of precincts (cluster level) 239 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 240 | # get observations 241 | bs_i <- 242 | lapply(precincts_i, function(x){ 243 | stopdata[stopdata$policePrecinct==x,] 244 | } ) 245 | bs_i <- rbindlist(bs_i) 246 | 247 | # estimate model and record coefficients 248 | boot_coefs[i,] <- coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 249 | } 250 | 251 | se_boot <- apply(boot_coefs, 252 | MARGIN = 2, 253 | FUN = sd) 254 | se_boot 255 | 256 | # load packages for parallel processing 257 | library(doSNOW) 258 | # get the number of cores available 259 | ncores <- parallel::detectCores() 260 | # set cores for parallel processing 261 | ctemp <- makeCluster(ncores) # 262 | registerDoSNOW(ctemp) 263 | 264 | 265 | # set number of bootstrap iterations 266 | B <- 10 267 | # get selection of precincts 268 | precincts <- unique(stopdata$policePrecinct) 269 | # container for coefficients 270 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 271 | 272 | # bootstrapping in parallel 273 | boot_coefs <- 274 | foreach(i = 1:B, .combine = rbind, .packages="data.table") %dopar% { 275 | # draw sample of precincts (cluster level) 276 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 277 | # get observations 278 | bs_i <- lapply(precincts_i, function(x) { 279 | stopdata[stopdata$policePrecinct==x,] 280 | }) 281 | bs_i <- rbindlist(bs_i) 282 | # estimate model and record coefficients 283 | coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 284 | } 285 | # be a good citizen and stop the snow clusters 286 | stopCluster(cl = ctemp) 287 | 288 | 289 | se_boot <- apply(boot_coefs, 290 | MARGIN = 2, 291 | FUN = sd) 292 | se_boot 293 | 294 | 295 | # install packages 296 | install.packages("data.table") 297 | install.packages("doSNOW") 298 | # load packages 299 | library(data.table) 300 | 301 | # fetch the data 302 | url <- 303 | "https://vincentarelbundock.github.io/Rdatasets/csv/carData/MplsStops.csv" 304 | stopdata <- read.csv(url) 305 | # remove incomplete obs 306 | 
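# (Note on the bootstrap used further below: whole precincts, i.e. clusters,
#  are resampled with replacement rather than individual stops; this preserves
#  the within-precinct dependence that motivates clustered standard errors.)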
stopdata <- na.omit(stopdata) 307 | # code dependent var 308 | stopdata$vsearch <- 0 309 | stopdata$vsearch[stopdata$vehicleSearch=="YES"] <- 1 310 | # code explanatory var 311 | stopdata$white <- 0 312 | stopdata$white[stopdata$race=="White"] <- 1 313 | 314 | # model fit 315 | model <- vsearch ~ white + factor(policePrecinct) 316 | fit <- lm(model, stopdata) 317 | summary(fit) 318 | # bootstrapping: normal approach 319 | # set the 'seed' for random numbers (makes the example reproducible) 320 | set.seed(2) 321 | # set number of bootstrap iterations 322 | B <- 50 323 | # get selection of precincts 324 | precincts <- unique(stopdata$policePrecinct) 325 | # container for coefficients 326 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 327 | # draw bootstrap samples, estimate model for each sample 328 | for (i in 1:B) { 329 | # draw sample of precincts (cluster level) 330 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 331 | # get observations 332 | bs_i <- 333 | lapply(precincts_i, function(x){ 334 | stopdata[stopdata$policePrecinct==x,]}) 335 | bs_i <- rbindlist(bs_i) 336 | # estimate model and record coefficients 337 | boot_coefs[i,] <- coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 338 | } 339 | 340 | se_boot <- apply(boot_coefs, 341 | MARGIN = 2, 342 | FUN = sd) 343 | se_boot 344 | 345 | 346 | # bootstrapping: parallel approaach 347 | # install.packages("doSNOW", "parallel") 348 | # load packages for parallel processing 349 | library(doSNOW) 350 | # set cores for parallel processing 351 | ncores <- parallel::detectCores() 352 | ctemp <- makeCluster(ncores) 353 | registerDoSNOW(ctemp) 354 | # set number of bootstrap iterations 355 | B <- 50 356 | # get selection of precincts 357 | precincts <- unique(stopdata$policePrecinct) 358 | # container for coefficients 359 | boot_coefs <- matrix(NA, nrow = B, ncol = 2) 360 | 361 | # bootstrapping in parallel 362 | boot_coefs <- 363 | foreach(i = 1:B, .combine = rbind, .packages="data.table") %dopar% { 364 | # draw sample of precincts (cluster level) 365 | precincts_i <- base::sample(precincts, size = 5, replace = TRUE) 366 | # get observations 367 | bs_i <- lapply(precincts_i, function(x){ 368 | stopdata[stopdata$policePrecinct==x,]) 369 | } 370 | bs_i <- rbindlist(bs_i) 371 | 372 | # estimate model and record coefficients 373 | coef(lm(model, bs_i))[1:2] # ignore FE-coefficients 374 | } 375 | 376 | # be a good citizen and stop the snow clusters 377 | stopCluster(cl = ctemp) 378 | # compute the bootstrapped standard errors 379 | se_boot <- apply(boot_coefs, 380 | MARGIN = 2, 381 | FUN = sd) 382 | -------------------------------------------------------------------------------- /R_code_examples/13_partIV_GPU_ML.R: -------------------------------------------------------------------------------- 1 | set.seed(1) 2 | # set parameter values 3 | n <- 100000 4 | p <- 4 5 | # generate a design matrix (~ our 'dataset') 6 | # with p variables and n observations 7 | X <- matrix(rnorm(n*p, mean = 10), ncol = p) 8 | # add column for intercept 9 | #X <- cbind(rep(1, n), X) 10 | 11 | # MC model 12 | y <- 1.5*X[,1] + 4*X[,2] - 3.5*X[,3] + 0.5*X[,4] + rnorm(n) 13 | 14 | 15 | 16 | beta_ols_gpu <- 17 | function(X, y, gpu_memory=FALSE) { 18 | require(gpuR) 19 | 20 | if (!gpu_memory){ 21 | # point GPU to matrix (matrix stored in non-GPU memory) 22 | vclX <- vclMatrix(X, type = "float") 23 | vcly <- vclVector(y, type = "float") 24 | # compute cross products and inverse 25 | XXi <- solve(crossprod(vclX,vclX)) 26 | Xy <- crossprod(vclX, vcly) 27 | } 
else { 28 | # point GPU to matrix (matrix stored in non-GPU memory) 29 | gpuX <- gpuMatrix(X, type = "float") 30 | gpuy <- gpuVector(y, type = "float") 31 | # compute cross products and inverse 32 | XXi <- solve(crossprod(gpuX,gpuX)) 33 | Xy <- t(gpuX) %*% gpuy 34 | } 35 | beta_hat <- as.vector(XXi %*% Xy) 36 | return(beta_hat) 37 | } 38 | 39 | 40 | beta_ols_gpu(X,y) 41 | 42 | beta_ols_gpu(X,y, gpu_memory = TRUE) 43 | 44 | if (Sys.info()["sysname"]=="Darwin"){ # run on macOS machine 45 | 46 | use_python("/Users/umatter/opt/anaconda3/bin/python") # IMPORTANT: keras/tensorflow is set up to run in this environment on this machine! 47 | } 48 | 49 | 50 | # load packages 51 | library(keras) 52 | library(tibble) 53 | library(ggplot2) 54 | library(tfdatasets) 55 | # load data 56 | boston_housing <- dataset_boston_housing() 57 | str(boston_housing) 58 | 59 | # assign training and test data/labels 60 | c(train_data, train_labels) %<-% boston_housing$train 61 | c(test_data, test_labels) %<-% boston_housing$test 62 | 63 | 64 | library(dplyr) 65 | 66 | column_names <- c('CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 67 | 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT') 68 | 69 | train_df <- train_data %>% 70 | as_tibble(.name_repair = "minimal") %>% 71 | setNames(column_names) %>% 72 | mutate(label = train_labels) 73 | 74 | test_df <- test_data %>% 75 | as_tibble(.name_repair = "minimal") %>% 76 | setNames(column_names) %>% 77 | mutate(label = test_labels) 78 | 79 | # check training data dimensions and content 80 | dim(train_df) 81 | head(train_df) 82 | 83 | spec <- feature_spec(train_df, label ~ . ) %>% 84 | step_numeric_column(all_numeric(), normalizer_fn = scaler_standard()) %>% 85 | fit() 86 | 87 | # Create the model 88 | # model specification 89 | input <- layer_input_from_dataset(train_df %>% select(-label)) 90 | 91 | output <- input %>% 92 | layer_dense_features(dense_features(spec)) %>% 93 | layer_dense(units = 64, activation = "relu") %>% 94 | layer_dense(units = 64, activation = "relu") %>% 95 | layer_dense(units = 1) 96 | 97 | model <- keras_model(input, output) 98 | 99 | 100 | # compile the model 101 | model %>% 102 | compile( 103 | loss = "mse", 104 | optimizer = optimizer_rmsprop(), 105 | metrics = list("mean_absolute_error") 106 | ) 107 | 108 | # get a summary of the model 109 | model 110 | 111 | # Set max. number of epochs 112 | epochs <- 500 113 | 114 | # Fit the model and store training stats 115 | history <- model %>% fit( 116 | x = train_df %>% select(-label), 117 | y = train_df$label, 118 | epochs = epochs, 119 | validation_split = 0.2, 120 | verbose = 0 121 | ) 122 | plot(history) 123 | -------------------------------------------------------------------------------- /R_code_examples/14_partIV_regression_categorization_spark.R: -------------------------------------------------------------------------------- 1 | # flights_r <- collect(flights) # very slow! 2 | flights_r <- data.table::fread("data/flights.csv", nrows = 300) 3 | 4 | # specify the linear model 5 | model1 <- arr_delay ~ dep_delay + distance 6 | # fit the model with OLS 7 | fit1 <- lm(model1, flights_r) 8 | # compute t-tests etc. 
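# (Optional, purely illustrative extra: confidence intervals for the same
#  local OLS fit, using base R; the summary() call right below reports the
#  coefficient t-tests themselves.)
confint(fit1)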
9 | summary(fit1) 10 | 11 | library(sparklyr) 12 | 13 | # connect with default configuration 14 | sc <- spark_connect(master="local") 15 | 16 | 17 | # load data to spark 18 | flights_spark <- copy_to(sc, flights_r, "flights_spark") 19 | # fit the model 20 | fit1_spark <- ml_linear_regression(flights_spark, formula = model1) 21 | # compute summary stats 22 | summary(fit1_spark) 23 | 24 | 25 | # fit the model 26 | spark_apply(flights_spark, 27 | function(df){ 28 | broom::tidy(lm(arr_delay ~ dep_delay + distance, df))}, 29 | names = c("term", 30 | "estimate", 31 | "std.error", 32 | "statistic", 33 | "p.value") 34 | ) 35 | 36 | library(tidymodels) 37 | library(parsnip) 38 | 39 | # simple local linear regression example from above 40 | # via tidymodels/parsnip 41 | fit1 <- fit(linear_reg(engine="lm"), model1, data=flights_r) 42 | tidy(fit1) 43 | 44 | 45 | 46 | # run the same on Spark 47 | fit1_spark <- fit(linear_reg(engine="spark"), model1, data=flights_spark) 48 | tidy(fit1_spark) 49 | 50 | # load into R, select variables of interest, remove missing 51 | titanic_r <- read.csv("data/titanic3.csv") 52 | titanic_r <- na.omit(titanic_r[, c("survived", 53 | "pclass", 54 | "sex", 55 | "age", 56 | "sibsp", 57 | "parch")]) 58 | titanic_r$survived <- ifelse(titanic_r$survived==1, "yes", "no") 59 | 60 | library(rsample) 61 | 62 | # split into training and test set 63 | titanic_r <- initial_split(titanic_r) 64 | ti_training <- training(titanic_r) 65 | ti_testing <- testing(titanic_r) 66 | 67 | # load data to spark 68 | ti_training_spark <- copy_to(sc, ti_training, "ti_training_spark") 69 | ti_testing_spark <- copy_to(sc, ti_testing, "ti_testing_spark") 70 | 71 | # models to be used 72 | models <- list(logit=logistic_reg(engine="spark", mode = "classification"), 73 | btree=boost_tree(engine = "spark", mode = "classification"), 74 | rforest=rand_forest(engine = "spark", mode = "classification")) 75 | # train/fit the models 76 | fits <- lapply(models, fit, formula=survived~., data=ti_training_spark) 77 | 78 | 79 | # run predictions 80 | predictions <- lapply(fits, predict, new_data=ti_testing_spark) 81 | # fetch predictions from Spark, format, add actual outcomes 82 | pred_outcomes <- 83 | lapply(1:length(predictions), function(i){ 84 | x_r <- collect(predictions[[i]]) # load into local R environment 85 | x_r$pred_class <- as.factor(x_r$pred_class) # format for predictions 86 | x_r$survived <- as.factor(ti_testing$survived) # add true outcomes 87 | return(x_r) 88 | 89 | }) 90 | 91 | 92 | acc <- lapply(pred_outcomes, accuracy, truth="survived", estimate="pred_class") 93 | acc <- bind_rows(acc) 94 | acc$model <- names(fits) 95 | acc[order(acc$.estimate, decreasing = TRUE),] 96 | 97 | tidy(fits[["btree"]]) 98 | 99 | tidy(fits[["rforest"]]) 100 | 101 | spark_disconnect(sc) 102 | 103 | # load packages 104 | library(sparklyr) 105 | library(dplyr) 106 | 107 | # fix vars 108 | INPUT_DATA <- "data/ga.csv" 109 | 110 | 111 | # import to local R session, prepare raw data 112 | ga <- na.omit(read.csv(INPUT_DATA)) 113 | #ga$purchase <- as.factor(ifelse(ga$purchase==1, "yes", "no")) 114 | # connect to, and copy the data to the local cluster 115 | sc <- spark_connect(master = "local") 116 | ga_spark <- copy_to(sc, ga, "ga_spark", overwrite = TRUE) 117 | 118 | 119 | # ml pipeline 120 | ga_pipeline <- 121 | ml_pipeline(sc) %>% 122 | ft_string_indexer(input_col="city", 123 | output_col="city_output", 124 | handle_invalid = "skip") %>% 125 | ft_string_indexer(input_col="country", 126 | output_col="country_output", 127 | 
handle_invalid = "skip") %>% 128 | ft_string_indexer(input_col="source", 129 | output_col="source_output", 130 | handle_invalid = "skip") %>% 131 | ft_string_indexer(input_col="browser", 132 | output_col="browser_output", 133 | handle_invalid = "skip") %>% 134 | ft_r_formula(purchase ~ .) %>% 135 | ml_logistic_regression(elastic_net_param = list(alpha=1)) 136 | 137 | 138 | # specify the hyperparameter grid 139 | # (parameter values to be considered in optimization) 140 | ga_params <- list(logistic_regression=list(max_iter=80)) 141 | 142 | # create the cross-validator object 143 | set.seed(1) 144 | cv_lasso <- ml_cross_validator(sc, 145 | estimator=ga_pipeline, 146 | estimator_param_maps = ga_params, 147 | ml_binary_classification_evaluator(sc), 148 | num_folds = 30, 149 | parallelism = 8) 150 | 151 | # train/fit the model 152 | cv_lasso_fit <- ml_fit(cv_lasso, ga_spark) 153 | # note: this takes several minutes to run on a local machine (1 node, 8 cores) 154 | 155 | 156 | # pipeline summary 157 | # cv_lasso_fit 158 | # average performance 159 | cv_lasso_fit$avg_metrics_df 160 | 161 | 162 | # save the entire pipeline/fit 163 | ml_save( 164 | cv_lasso_fit, 165 | "ga_cv_lasso_fit", 166 | overwrite = TRUE 167 | ) 168 | 169 | 170 | -------------------------------------------------------------------------------- /R_code_examples/15_partIV_large_scale_text_analysis.R: -------------------------------------------------------------------------------- 1 | # install additional packages 2 | # install.packages("gutenbergr") # download book from Project Gutenberg 3 | # install.packages("dplyr") # for the data preparatory steps 4 | 5 | # load packages 6 | library(sparklyr) 7 | library(gutenbergr) 8 | library(dplyr) 9 | 10 | # fix vars 11 | TELL <- "https://www.gutenberg.org/cache/epub/6788/pg6788.txt" 12 | 13 | 14 | # connect rstudio session to cluster 15 | sc <- spark_connect(master = "yarn") 16 | 17 | 18 | # install additional packages 19 | # install.packages("gutenbergr") # to download book texts from Project Gutenberg 20 | # install.packages("dplyr") # for the data preparatory steps 21 | # load packages 22 | library(sparklyr) 23 | library(gutenbergr) 24 | library(dplyr) 25 | # fix vars 26 | TELL <- "https://www.gutenberg.org/cache/epub/6788/pg6788.txt" 27 | # connect rstudio session to cluster 28 | conf <- spark_config() 29 | conf$`sparklyr.shell.driver-memory` <- "8g" 30 | sc <- spark_connect(master = "local", 31 | config = conf) 32 | 33 | 34 | # Data gathering and preparation 35 | # fetch Schiller's Tell, load to cluster 36 | tmp_file <- tempfile() 37 | download.file(TELL, tmp_file) 38 | raw_text <- readLines(tmp_file) 39 | tell <- data.frame(raw_text=raw_text) 40 | tell_spark <- copy_to(sc, tell, 41 | "tell_spark", 42 | overwrite = TRUE) 43 | 44 | 45 | # data cleaning 46 | tell_spark <- filter(tell_spark, raw_text!="") 47 | tell_spark <- select(tell_spark, raw_text) 48 | tell_spark <- mutate(tell_spark, 49 | raw_text = regexp_replace(raw_text, "[^0-9a-zA-Z]+", " ")) 50 | 51 | 52 | 53 | # split into words 54 | tell_spark <- ft_tokenizer(tell_spark, 55 | input_col = "raw_text", 56 | output_col = "words") 57 | 58 | 59 | 60 | # remove stop-words 61 | tell_spark <- ft_stop_words_remover(tell_spark, 62 | input_col = "words", 63 | output_col = "words_wo_stop") 64 | 65 | 66 | # unnest words, combine in one row 67 | all_tell_words <- mutate(tell_spark, 68 | word = explode(words_wo_stop)) 69 | 70 | # final cleaning 71 | all_tell_words <- select(all_tell_words, word) 72 | all_tell_words <- 
filter(all_tell_words, 2 ", 85 | PATH, 86 | " && unzip ", 87 | PATH)) 88 | # move the speeches files 89 | system("mkdir data/text/ && mkdir data/text/speeches") 90 | system("mv hein-daily/speeches* data/text/speeches/") 91 | # move the speaker files 92 | system("mkdir data/text/speakers") 93 | system("mv hein-daily/*SpeakerMap.txt data/text/speakers/") 94 | 95 | 96 | # download and unzip procedural phrases data 97 | URL_P <- "https://stacks.stanford.edu/file/druid:md374tz9962/vocabulary.zip" 98 | PATH_P <- "data/vocabulary.zip" 99 | system(paste0("curl ", 100 | URL_P, 101 | " > ", 102 | PATH_P, 103 | " && unzip ", 104 | PATH_P)) 105 | # move the procedural vocab file 106 | system("mv vocabulary/vocab.txt data/text/") 107 | 108 | # SET UP ---------------- 109 | 110 | # load packages 111 | library(sparklyr) 112 | library(dplyr) 113 | # fix vars 114 | INPUT_PATH_SPEECHES <- "data/text/speeches/" 115 | INPUT_PATH_SPEAKERS <- "data/text/speakers/" 116 | 117 | # configuration of local spark cluster 118 | conf <- spark_config() 119 | conf$`sparklyr.shell.driver-memory` <- "16g" 120 | # connect rstudio session to cluster 121 | sc <- spark_connect(master = "local", 122 | config = conf) 123 | 124 | 125 | # LOAD TEXT DATA -------------------- 126 | 127 | # load data 128 | speeches <- spark_read_csv(sc, 129 | name = "speeches", 130 | path = INPUT_PATH_SPEECHES, 131 | delimiter = "|") 132 | speakers <- spark_read_csv(sc, 133 | name = "speakers", 134 | path = INPUT_PATH_SPEAKERS, 135 | delimiter = "|") 136 | 137 | 138 | # JOIN -------------------- 139 | speeches <- 140 | inner_join(speeches, 141 | speakers, 142 | by="speech_id") %>% 143 | filter(party %in% c("R", "D"), chamber=="H") %>% 144 | mutate(congress=substr(speech_id, 1,3)) %>% 145 | select(speech_id, speech, party, congress) 146 | 147 | 148 | # CLEANING ---------------- 149 | # clean text: numbers, letters (bill IDs, etc. 150 | speeches <- 151 | mutate(speeches, speech = tolower(speech)) %>% 152 | mutate(speech = regexp_replace(speech, 153 | "[_\"\'():;,.!?\\-]", 154 | "")) %>% 155 | mutate(speech = regexp_replace(speech, "\\\\(.+\\\\)", " ")) %>% 156 | mutate(speech = regexp_replace(speech, "[0-9]+", " ")) %>% 157 | mutate(speech = regexp_replace(speech, "<[a-z]+>", " ")) %>% 158 | mutate(speech = regexp_replace(speech, "<\\w+>", " ")) %>% 159 | mutate(speech = regexp_replace(speech, "_", " ")) %>% 160 | mutate(speech = trimws(speech)) 161 | 162 | 163 | # TOKENIZATION, STOPWORDS REMOVAL, NGRAMS ---------------- 164 | 165 | # stopwords list 166 | stop <- readLines("http://snowball.tartarus.org/algorithms/english/stop.txt") 167 | stop <- trimws(gsub("\\|.*", "", stop)) 168 | stop <- stop[stop!=""] 169 | 170 | # clean text: numbers, letters (bill IDs, etc. 
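# (Purely illustrative check of the stop-words list built above.)
length(stop)
head(stop)
# (Note: ft_ngram() with n = 2 below emits each bigram as its two words joined
#  by a single space, which is what the n_words computation relies on.)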
171 | bigrams <- 172 | ft_tokenizer(speeches, "speech", "words") %>% 173 | ft_stop_words_remover("words", "words_wo_stop", 174 | stop_words = stop ) %>% 175 | ft_ngram("words_wo_stop", "bigram_list", n=2) %>% 176 | mutate(bigram=explode(bigram_list)) %>% 177 | mutate(bigram=trim(bigram)) %>% 178 | mutate(n_words=as.numeric(length(bigram) - 179 | length(replace(bigram, ' ', '')) + 1)) %>% 180 | filter(3% 181 | select(party, congress, bigram) 182 | 183 | 184 | 185 | # load the procedural phrases list 186 | valid_vocab <- spark_read_csv(sc, 187 | path="data/text/vocab.txt", 188 | name = "valid_vocab", 189 | delimiter = "|", 190 | header = FALSE) 191 | # remove corresponding bigrams via anti-join 192 | bigrams <- inner_join(bigrams, valid_vocab, by= c("bigram"="V1")) 193 | 194 | # BIGRAM COUNT PER PARTY --------------- 195 | bigram_count <- 196 | count(bigrams, party, bigram, congress) %>% 197 | compute("bigram_count") 198 | 199 | # FIND MOST PARTISAN BIGRAMS ------------ 200 | 201 | # compute frequencies and chi-squared values 202 | freqs <- 203 | bigram_count %>% 204 | group_by(party, congress) %>% 205 | mutate(total=sum(n), f_npl=total-n) 206 | freqs_d <- 207 | filter(freqs, party=="D") %>% 208 | rename(f_pld=n, f_npld=f_npl) %>% 209 | select(bigram, congress, f_pld, f_npld) 210 | freqs_r <- 211 | filter(freqs, party=="R") %>% 212 | rename(f_plr=n, f_nplr=f_npl) %>% 213 | select(bigram, congress, f_plr, f_nplr) 214 | 215 | pol_bigrams <- 216 | inner_join(freqs_d, freqs_r, by=c("bigram", "congress")) %>% 217 | group_by(bigram, congress) %>% 218 | mutate(x2=((f_plr*f_npld-f_pld*f_nplr)^2)/ 219 | ((f_plr + f_pld)*(f_plr + f_nplr)* 220 | (f_pld + f_npld)*(f_nplr + f_npld))) %>% 221 | select(bigram, congress, x2, f_pld, f_plr) %>% 222 | compute("pol_bigrams") 223 | 224 | 225 | # create output data frame 226 | output <- pol_bigrams %>% 227 | group_by(congress) %>% 228 | arrange(desc(x2)) %>% 229 | sdf_with_sequential_id(id="index") %>% 230 | filter(index<=2000) %>% 231 | mutate(Party=ifelse(f_pld% 232 | select(bigram, congress, Party, x2) %>% 233 | collect() 234 | 235 | # disconnect from cluster 236 | spark_disconnect(sc) 237 | 238 | # packages to prepare and plot 239 | library(data.table) 240 | library(ggplot2) 241 | # select top ten per congress, clean 242 | output <- as.data.table(output) 243 | topten <- output[order(congress, x2, decreasing = TRUE), 244 | rank:=1:.N, by=list(congress)][rank %in% (1:5)] 245 | topten[, congress:=gsub("990", "99", congress)] 246 | topten[, congress:=gsub("980", "98", congress)] 247 | topten[, congress:=gsub("970", "97", congress)] 248 | 249 | # plot a visualization of the most partisan terms 250 | ggplot(topten, mapping=aes(x=as.integer(congress), y=log(x2), color=Party)) + 251 | geom_text(aes(label=bigram), nudge_y = 1)+ 252 | ylab("Partisanship score (Ln of Chisq. 
value)") + 253 | xlab("Congress") + 254 | scale_color_manual(values=c("D"="blue", "R"="red"), name="Party") + 255 | guides(color=guide_legend(title.position="top")) + 256 | scale_x_continuous(breaks=as.integer(unique(topten$congress))) + 257 | theme_minimal() + 258 | theme(axis.text.x = element_text(angle = 90, hjust = 1), 259 | axis.text.y = element_text(hjust = 1), 260 | panel.grid.major = element_blank(), 261 | panel.grid.minor = element_blank(), 262 | panel.background = element_blank()) 263 | 264 | 265 | 266 | # load packages 267 | library(dplyr) 268 | library(sparklyr) 269 | library(sparknlp) 270 | library(sparklyr.nested) 271 | 272 | # configuration of local spark cluster 273 | conf <- spark_config() 274 | conf$`sparklyr.shell.driver-memory` <- "16g" 275 | # connect rstudio session to cluster 276 | sc <- spark_connect(master = "local", 277 | config = conf) 278 | 279 | # LOAD -------------------- 280 | 281 | # load speeches 282 | INPUT_PATH_SPEECHES <- "data/text/speeches/" 283 | speeches <- 284 | spark_read_csv(sc, 285 | name = "speeches", 286 | path = INPUT_PATH_SPEECHES, 287 | delimiter = "|", 288 | overwrite = TRUE) %>% 289 | sample_n(10000, replace = FALSE) %>% 290 | compute("speeches") 291 | 292 | 293 | # load the nlp pipeline for sentiment analysis 294 | pipeline <- nlp_pretrained_pipeline(sc, "analyze_sentiment", "en") 295 | 296 | speeches_a <- 297 | nlp_annotate(pipeline, 298 | target = speeches, 299 | column = "speech") 300 | 301 | # extract sentiment coding per speech 302 | sentiments <- 303 | speeches_a %>% 304 | sdf_select(speech_id, sentiments=sentiment.result) %>% 305 | sdf_explode(sentiments) %>% 306 | mutate(pos = as.integer(sentiments=="positive"), 307 | neg = as.integer(sentiments=="negative")) %>% 308 | select(speech_id, pos, neg) 309 | 310 | 311 | # aggregate and download to R environment ----- 312 | sentiments_aggr <- 313 | sentiments %>% 314 | select(speech_id, pos, neg) %>% 315 | group_by(speech_id) %>% 316 | mutate(rel_pos = sum(pos)/(sum(pos) + sum(neg))) %>% 317 | filter(0% 318 | select(speech_id, rel_pos) %>% 319 | sdf_distinct(name = "sentiments_aggr") %>% 320 | collect() 321 | 322 | # disconnect from cluster 323 | spark_disconnect(sc) 324 | 325 | # clean 326 | library(data.table) 327 | sa <- as.data.table(sentiments_aggr) 328 | sa[, congress:=substr(speech_id, 1,3)] 329 | sa[, congress:=gsub("990", "99", congress)] 330 | sa[, congress:=gsub("980", "98", congress)] 331 | sa[, congress:=gsub("970", "97", congress)] 332 | 333 | # visualize results 334 | library(ggplot2) 335 | ggplot(sa, aes(x=as.integer(congress), 336 | y=rel_pos, 337 | group=congress)) + 338 | geom_boxplot() + 339 | ylab("Share of sentences with positive tone") + 340 | xlab("Congress") + 341 | theme_minimal() 342 | 343 | 344 | system.time( 345 | speeches_a <- 346 | nlp_annotate(pipeline, 347 | target = speeches, 348 | column = "speech") 349 | ) 350 | 351 | system.time( 352 | speeches_a <- 353 | nlp_annotate(pipeline, 354 | target = speeches, 355 | column = "speech") %>% 356 | compute(name= "speeches_a") 357 | ) 358 | 359 | # disconnect from cluster 360 | spark_disconnect(sc) 361 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/.nojekyll -------------------------------------------------------------------------------- 
/docs/bigdata_files/figure-html/unnamed-chunk-178-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-178-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-190-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-190-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-193-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-193-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-194-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-194-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-196-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-196-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-198-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-198-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-199-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-199-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-200-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-200-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-201-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-201-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-202-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-202-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-203-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-203-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-204-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-204-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-205-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-205-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-206-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-206-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-207-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-207-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-208-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-208-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-209-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-209-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-210-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-210-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-211-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-211-1.png -------------------------------------------------------------------------------- 
/docs/bigdata_files/figure-html/unnamed-chunk-221-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-221-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-222-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-222-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-223-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-223-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-224-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-224-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-225-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-225-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-226-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-226-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-227-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-227-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-228-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-228-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-229-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-229-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-275-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-275-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-31-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-31-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-318-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-318-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-326-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-326-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-33-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-33-1.png -------------------------------------------------------------------------------- /docs/bigdata_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/bigdata_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /docs/img/05_nlp_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/05_nlp_pipeline.jpg -------------------------------------------------------------------------------- /docs/img/II_computing_environment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/II_computing_environment.png -------------------------------------------------------------------------------- /docs/img/I_approaches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/I_approaches.png -------------------------------------------------------------------------------- /docs/img/TPU.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/TPU.png -------------------------------------------------------------------------------- /docs/img/aws_emr_ready.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_emr_ready.png -------------------------------------------------------------------------------- /docs/img/aws_rds_create.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_rds_create.png -------------------------------------------------------------------------------- /docs/img/aws_rds_easycreate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/aws_rds_easycreate.png -------------------------------------------------------------------------------- /docs/img/colab_r_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/colab_r_gpu.png -------------------------------------------------------------------------------- /docs/img/column_v_rowbased.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/column_v_rowbased.png -------------------------------------------------------------------------------- /docs/img/cover_print.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/cover_print.jpg -------------------------------------------------------------------------------- /docs/img/data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/data_pipeline.png -------------------------------------------------------------------------------- /docs/img/distributed_system.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/distributed_system.jpg -------------------------------------------------------------------------------- /docs/img/druiddatasources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druiddatasources.png -------------------------------------------------------------------------------- /docs/img/druidparse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidparse.png -------------------------------------------------------------------------------- /docs/img/druidquery.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidquery.png 
-------------------------------------------------------------------------------- /docs/img/druidstart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/druidstart.png -------------------------------------------------------------------------------- /docs/img/ec2_gpu1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_gpu1.png -------------------------------------------------------------------------------- /docs/img/ec2_gpu2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_gpu2.png -------------------------------------------------------------------------------- /docs/img/ec2_rstudioserver_htop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/ec2_rstudioserver_htop.png -------------------------------------------------------------------------------- /docs/img/gpt_SQL_prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpt_SQL_prompt.png -------------------------------------------------------------------------------- /docs/img/gpt_sql_response.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpt_sql_response.png -------------------------------------------------------------------------------- /docs/img/gpu_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpu_cpu.png -------------------------------------------------------------------------------- /docs/img/gpu_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/gpu_details.png -------------------------------------------------------------------------------- /docs/img/rds_inboundrules.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/rds_inboundrules.png -------------------------------------------------------------------------------- /docs/img/rtx_2080.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/rtx_2080.png -------------------------------------------------------------------------------- /docs/img/screenshot_rstudio_server_upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/screenshot_rstudio_server_upload.png -------------------------------------------------------------------------------- /docs/img/uluru_comparison.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/uluru_comparison.png -------------------------------------------------------------------------------- /docs/img/uluru_comparison2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/uluru_comparison2.png -------------------------------------------------------------------------------- /docs/img/virtual_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/img/virtual_memory.png -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections-hash.css: -------------------------------------------------------------------------------- 1 | /* Styles for section anchors */ 2 | a.anchor-section::before {content: '#';font-size: 80%;} 3 | -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections.css: -------------------------------------------------------------------------------- 1 | /* Styles for section anchors */ 2 | a.anchor-section {margin-left: 10px; visibility: hidden; color: inherit;} 3 | .hasAnchor:hover a.anchor-section {visibility: visible;} 4 | ul > li > .anchor-section {display: none;} 5 | -------------------------------------------------------------------------------- /docs/libs/anchor-sections/anchor-sections.js: -------------------------------------------------------------------------------- 1 | document.addEventListener('DOMContentLoaded', function () { 2 | // If section divs is used, we need to put the anchor in the child header 3 | const headers = document.querySelectorAll("div.hasAnchor.section[class*='level'] > :first-child") 4 | 5 | headers.forEach(function (x) { 6 | // Add to the header node 7 | if (!x.classList.contains('hasAnchor')) x.classList.add('hasAnchor') 8 | // Remove from the section or div created by Pandoc 9 | x.parentElement.classList.remove('hasAnchor') 10 | }) 11 | }) 12 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/fontawesome/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/umatter/BigData/fbe55123144cab51fd082448149ecf5ccd859815/docs/libs/gitbook/css/fontawesome/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-bookdown.css: -------------------------------------------------------------------------------- 1 | .book .book-header h1 { 2 | padding-left: 20px; 3 | padding-right: 20px; 4 | } 5 | .book .book-header.fixed { 6 | position: fixed; 7 | right: 0; 8 | top: 0; 9 | left: 0; 10 | border-bottom: 1px solid rgba(0,0,0,.07); 11 | } 12 | span.search-highlight { 13 | background-color: #ffff88; 14 | } 15 | @media (min-width: 600px) { 16 | .book.with-summary .book-header.fixed { 17 | left: 300px; 18 | } 19 | } 20 | @media (max-width: 1240px) { 21 | .book .book-body.fixed { 22 | top: 50px; 23 | } 24 | .book .book-body.fixed .body-inner { 25 | top: auto; 26 | } 27 | } 28 | @media (max-width: 600px) { 29 | .book.with-summary .book-header.fixed { 30 | left: calc(100% - 60px); 
31 | min-width: 300px; 32 | } 33 | .book.with-summary .book-body { 34 | transform: none; 35 | left: calc(100% - 60px); 36 | min-width: 300px; 37 | } 38 | .book .book-body.fixed { 39 | top: 0; 40 | } 41 | } 42 | 43 | .book .book-body.fixed .body-inner { 44 | top: 50px; 45 | } 46 | .book .book-body .page-wrapper .page-inner section.normal sub, .book .book-body .page-wrapper .page-inner section.normal sup { 47 | font-size: 85%; 48 | } 49 | 50 | @media print { 51 | .book .book-summary, .book .book-body .book-header, .fa { 52 | display: none !important; 53 | } 54 | .book .book-body.fixed { 55 | left: 0px; 56 | } 57 | .book .book-body,.book .book-body .body-inner, .book.with-summary { 58 | overflow: visible !important; 59 | } 60 | } 61 | .kable_wrapper { 62 | border-spacing: 20px 0; 63 | border-collapse: separate; 64 | border: none; 65 | margin: auto; 66 | } 67 | .kable_wrapper > tbody > tr > td { 68 | vertical-align: top; 69 | } 70 | .book .book-body .page-wrapper .page-inner section.normal table tr.header { 71 | border-top-width: 2px; 72 | } 73 | .book .book-body .page-wrapper .page-inner section.normal table tr:last-child td { 74 | border-bottom-width: 2px; 75 | } 76 | .book .book-body .page-wrapper .page-inner section.normal table td, .book .book-body .page-wrapper .page-inner section.normal table th { 77 | border-left: none; 78 | border-right: none; 79 | } 80 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr, .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr > td { 81 | border-top: none; 82 | } 83 | .book .book-body .page-wrapper .page-inner section.normal table.kable_wrapper > tbody > tr:last-child > td { 84 | border-bottom: none; 85 | } 86 | 87 | div.theorem, div.lemma, div.corollary, div.proposition, div.conjecture { 88 | font-style: italic; 89 | } 90 | span.theorem, span.lemma, span.corollary, span.proposition, span.conjecture { 91 | font-style: normal; 92 | } 93 | div.proof>*:last-child:after { 94 | content: "\25a2"; 95 | float: right; 96 | } 97 | .header-section-number { 98 | padding-right: .5em; 99 | } 100 | #header .multi-author { 101 | margin: 0.5em 0 -0.5em 0; 102 | } 103 | #header .date { 104 | margin-top: 1.5em; 105 | } 106 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-clipboard.css: -------------------------------------------------------------------------------- 1 | div.sourceCode { 2 | position: relative; 3 | } 4 | 5 | .copy-to-clipboard-button { 6 | position: absolute; 7 | right: 0; 8 | top: 0; 9 | visibility: hidden; 10 | } 11 | 12 | .copy-to-clipboard-button:focus { 13 | outline: 0; 14 | } 15 | 16 | div.sourceCode:hover > .copy-to-clipboard-button { 17 | visibility: visible; 18 | } 19 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-fontsettings.css: -------------------------------------------------------------------------------- 1 | /* 2 | * Theme 1 3 | */ 4 | .color-theme-1 .dropdown-menu { 5 | background-color: #111111; 6 | border-color: #7e888b; 7 | } 8 | .color-theme-1 .dropdown-menu .dropdown-caret .caret-inner { 9 | border-bottom: 9px solid #111111; 10 | } 11 | .color-theme-1 .dropdown-menu .buttons { 12 | border-color: #7e888b; 13 | } 14 | .color-theme-1 .dropdown-menu .button { 15 | color: #afa790; 16 | } 17 | .color-theme-1 .dropdown-menu .button:hover { 18 | color: #73553c; 19 | } 20 | /* 21 | * Theme 2 22 | */ 23 | .color-theme-2 .dropdown-menu { 24 | 
background-color: #2d3143; 25 | border-color: #272a3a; 26 | } 27 | .color-theme-2 .dropdown-menu .dropdown-caret .caret-inner { 28 | border-bottom: 9px solid #2d3143; 29 | } 30 | .color-theme-2 .dropdown-menu .buttons { 31 | border-color: #272a3a; 32 | } 33 | .color-theme-2 .dropdown-menu .button { 34 | color: #62677f; 35 | } 36 | .color-theme-2 .dropdown-menu .button:hover { 37 | color: #f4f4f5; 38 | } 39 | .book .book-header .font-settings .font-enlarge { 40 | line-height: 30px; 41 | font-size: 1.4em; 42 | } 43 | .book .book-header .font-settings .font-reduce { 44 | line-height: 30px; 45 | font-size: 1em; 46 | } 47 | 48 | /* sidebar transition background */ 49 | div.book.color-theme-1 { 50 | background: #f3eacb; 51 | } 52 | .book.color-theme-1 .book-body { 53 | color: #704214; 54 | background: #f3eacb; 55 | } 56 | .book.color-theme-1 .book-body .page-wrapper .page-inner section { 57 | background: #f3eacb; 58 | } 59 | 60 | /* sidebar transition background */ 61 | div.book.color-theme-2 { 62 | background: #1c1f2b; 63 | } 64 | 65 | .book.color-theme-2 .book-body { 66 | color: #bdcadb; 67 | background: #1c1f2b; 68 | } 69 | .book.color-theme-2 .book-body .page-wrapper .page-inner section { 70 | background: #1c1f2b; 71 | } 72 | .book.font-size-0 .book-body .page-inner section { 73 | font-size: 1.2rem; 74 | } 75 | .book.font-size-1 .book-body .page-inner section { 76 | font-size: 1.4rem; 77 | } 78 | .book.font-size-2 .book-body .page-inner section { 79 | font-size: 1.6rem; 80 | } 81 | .book.font-size-3 .book-body .page-inner section { 82 | font-size: 2.2rem; 83 | } 84 | .book.font-size-4 .book-body .page-inner section { 85 | font-size: 4rem; 86 | } 87 | .book.font-family-0 { 88 | font-family: Georgia, serif; 89 | } 90 | .book.font-family-1 { 91 | font-family: "Helvetica Neue", Helvetica, Arial, sans-serif; 92 | } 93 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal { 94 | color: #704214; 95 | } 96 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal a { 97 | color: inherit; 98 | } 99 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h1, 100 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h2, 101 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h3, 102 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h4, 103 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h5, 104 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h6 { 105 | color: inherit; 106 | } 107 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h1, 108 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h2 { 109 | border-color: inherit; 110 | } 111 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal h6 { 112 | color: inherit; 113 | } 114 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal hr { 115 | background-color: inherit; 116 | } 117 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal blockquote { 118 | border-color: #c4b29f; 119 | opacity: 0.9; 120 | } 121 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal pre, 122 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal code { 123 | background: #fdf6e3; 124 | color: #657b83; 125 | border-color: #f8df9c; 126 | } 127 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal .highlight { 128 | background-color: inherit; 129 | } 130 
| .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table th, 131 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table td { 132 | border-color: #f5d06c; 133 | } 134 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table tr { 135 | color: inherit; 136 | background-color: #fdf6e3; 137 | border-color: #444444; 138 | } 139 | .book.color-theme-1 .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n) { 140 | background-color: #fbeecb; 141 | } 142 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal { 143 | color: #bdcadb; 144 | } 145 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal a { 146 | color: #3eb1d0; 147 | } 148 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h1, 149 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h2, 150 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h3, 151 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h4, 152 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h5, 153 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h6 { 154 | color: #fffffa; 155 | } 156 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h1, 157 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h2 { 158 | border-color: #373b4e; 159 | } 160 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal h6 { 161 | color: #373b4e; 162 | } 163 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal hr { 164 | background-color: #373b4e; 165 | } 166 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal blockquote { 167 | border-color: #373b4e; 168 | } 169 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal pre, 170 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal code { 171 | color: #9dbed8; 172 | background: #2d3143; 173 | border-color: #2d3143; 174 | } 175 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal .highlight { 176 | background-color: #282a39; 177 | } 178 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table th, 179 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table td { 180 | border-color: #3b3f54; 181 | } 182 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table tr { 183 | color: #b6c2d2; 184 | background-color: #2d3143; 185 | border-color: #3b3f54; 186 | } 187 | .book.color-theme-2 .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n) { 188 | background-color: #35394b; 189 | } 190 | .book.color-theme-1 .book-header { 191 | color: #afa790; 192 | background: transparent; 193 | } 194 | .book.color-theme-1 .book-header .btn { 195 | color: #afa790; 196 | } 197 | .book.color-theme-1 .book-header .btn:hover { 198 | color: #73553c; 199 | background: none; 200 | } 201 | .book.color-theme-1 .book-header h1 { 202 | color: #704214; 203 | } 204 | .book.color-theme-2 .book-header { 205 | color: #7e888b; 206 | background: transparent; 207 | } 208 | .book.color-theme-2 .book-header .btn { 209 | color: #3b3f54; 210 | } 211 | .book.color-theme-2 .book-header .btn:hover { 212 | color: #fffff5; 213 | background: none; 214 | } 215 | .book.color-theme-2 .book-header h1 { 216 | color: #bdcadb; 217 | } 218 | .book.color-theme-1 .book-body .navigation { 219 | color: 
#afa790; 220 | } 221 | .book.color-theme-1 .book-body .navigation:hover { 222 | color: #73553c; 223 | } 224 | .book.color-theme-2 .book-body .navigation { 225 | color: #383f52; 226 | } 227 | .book.color-theme-2 .book-body .navigation:hover { 228 | color: #fffff5; 229 | } 230 | /* 231 | * Theme 1 232 | */ 233 | .book.color-theme-1 .book-summary { 234 | color: #afa790; 235 | background: #111111; 236 | border-right: 1px solid rgba(0, 0, 0, 0.07); 237 | } 238 | .book.color-theme-1 .book-summary .book-search { 239 | background: transparent; 240 | } 241 | .book.color-theme-1 .book-summary .book-search input, 242 | .book.color-theme-1 .book-summary .book-search input:focus { 243 | border: 1px solid transparent; 244 | } 245 | .book.color-theme-1 .book-summary ul.summary li.divider { 246 | background: #7e888b; 247 | box-shadow: none; 248 | } 249 | .book.color-theme-1 .book-summary ul.summary li i.fa-check { 250 | color: #33cc33; 251 | } 252 | .book.color-theme-1 .book-summary ul.summary li.done > a { 253 | color: #877f6a; 254 | } 255 | .book.color-theme-1 .book-summary ul.summary li a, 256 | .book.color-theme-1 .book-summary ul.summary li span { 257 | color: #877f6a; 258 | background: transparent; 259 | font-weight: normal; 260 | } 261 | .book.color-theme-1 .book-summary ul.summary li.active > a, 262 | .book.color-theme-1 .book-summary ul.summary li a:hover { 263 | color: #704214; 264 | background: transparent; 265 | font-weight: normal; 266 | } 267 | /* 268 | * Theme 2 269 | */ 270 | .book.color-theme-2 .book-summary { 271 | color: #bcc1d2; 272 | background: #2d3143; 273 | border-right: none; 274 | } 275 | .book.color-theme-2 .book-summary .book-search { 276 | background: transparent; 277 | } 278 | .book.color-theme-2 .book-summary .book-search input, 279 | .book.color-theme-2 .book-summary .book-search input:focus { 280 | border: 1px solid transparent; 281 | } 282 | .book.color-theme-2 .book-summary ul.summary li.divider { 283 | background: #272a3a; 284 | box-shadow: none; 285 | } 286 | .book.color-theme-2 .book-summary ul.summary li i.fa-check { 287 | color: #33cc33; 288 | } 289 | .book.color-theme-2 .book-summary ul.summary li.done > a { 290 | color: #62687f; 291 | } 292 | .book.color-theme-2 .book-summary ul.summary li a, 293 | .book.color-theme-2 .book-summary ul.summary li span { 294 | color: #c1c6d7; 295 | background: transparent; 296 | font-weight: 600; 297 | } 298 | .book.color-theme-2 .book-summary ul.summary li.active > a, 299 | .book.color-theme-2 .book-summary ul.summary li a:hover { 300 | color: #f4f4f5; 301 | background: #252737; 302 | font-weight: 600; 303 | } 304 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-search.css: -------------------------------------------------------------------------------- 1 | .book .book-summary .book-search { 2 | padding: 6px; 3 | background: transparent; 4 | position: absolute; 5 | top: -50px; 6 | left: 0px; 7 | right: 0px; 8 | transition: top 0.5s ease; 9 | } 10 | .book .book-summary .book-search input, 11 | .book .book-summary .book-search input:focus, 12 | .book .book-summary .book-search input:hover { 13 | width: 100%; 14 | background: transparent; 15 | border: 1px solid #ccc; 16 | box-shadow: none; 17 | outline: none; 18 | line-height: 22px; 19 | padding: 7px 4px; 20 | color: inherit; 21 | box-sizing: border-box; 22 | } 23 | .book.with-search .book-summary .book-search { 24 | top: 0px; 25 | } 26 | .book.with-search .book-summary ul.summary { 27 | top: 50px; 28 | } 29 | 
.with-search .summary li[data-level] a[href*=".html#"] { 30 | display: none; 31 | } 32 | -------------------------------------------------------------------------------- /docs/libs/gitbook/css/plugin-table.css: -------------------------------------------------------------------------------- 1 | .book .book-body .page-wrapper .page-inner section.normal table{display:table;width:100%;border-collapse:collapse;border-spacing:0;overflow:auto}.book .book-body .page-wrapper .page-inner section.normal table td,.book .book-body .page-wrapper .page-inner section.normal table th{padding:6px 13px;border:1px solid #ddd}.book .book-body .page-wrapper .page-inner section.normal table tr{background-color:#fff;border-top:1px solid #ccc}.book .book-body .page-wrapper .page-inner section.normal table tr:nth-child(2n){background-color:#f8f8f8}.book .book-body .page-wrapper .page-inner section.normal table th{font-weight:700} 2 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/clipboard.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * clipboard.js v2.0.4 3 | * https://zenorocha.github.io/clipboard.js 4 | * 5 | * Licensed MIT © Zeno Rocha 6 | */ 7 | !function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return function(n){var o={};function r(t){if(o[t])return o[t].exports;var e=o[t]={i:t,l:!1,exports:{}};return n[t].call(e.exports,e,e.exports,r),e.l=!0,e.exports}return r.m=n,r.c=o,r.d=function(t,e,n){r.o(t,e)||Object.defineProperty(t,e,{enumerable:!0,get:n})},r.r=function(t){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(t,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(t,"__esModule",{value:!0})},r.t=function(e,t){if(1&t&&(e=r(e)),8&t)return e;if(4&t&&"object"==typeof e&&e&&e.__esModule)return e;var n=Object.create(null);if(r.r(n),Object.defineProperty(n,"default",{enumerable:!0,value:e}),2&t&&"string"!=typeof e)for(var o in e)r.d(n,o,function(t){return e[t]}.bind(null,o));return n},r.n=function(t){var e=t&&t.__esModule?function(){return t.default}:function(){return t};return r.d(e,"a",e),e},r.o=function(t,e){return Object.prototype.hasOwnProperty.call(t,e)},r.p="",r(r.s=0)}([function(t,e,n){"use strict";var r="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t},i=function(){function o(t,e){for(var n=0;n indicates arrow keys):', 82 | '/: navigate to previous/next page', 83 | 's: Toggle sidebar']; 84 | if (config.search !== false) info.push('f: Toggle search input ' + 85 | '(use //Enter in the search input to navigate through search matches; ' + 86 | 'press Esc to cancel search)'); 87 | if (config.info !== false) gitbook.toolbar.createButton({ 88 | icon: 'fa fa-info', 89 | label: 'Information about the toolbar', 90 | position: 'left', 91 | onClick: function(e) { 92 | e.preventDefault(); 93 | window.alert(info.join('\n\n')); 94 | } 95 | }); 96 | 97 | // highlight the current section in TOC 98 | var href = window.location.pathname; 99 | href = href.substr(href.lastIndexOf('/') + 1); 100 | // accentuated characters need to be decoded (#819) 101 | href = decodeURIComponent(href); 102 | if (href === '') href = 'index.html'; 103 | var li = $('a[href^="' + href + 
location.hash + '"]').parent('li.chapter').first(); 104 | var summary = $('ul.summary'), chaps = summary.find('li.chapter'); 105 | if (li.length === 0) li = chaps.first(); 106 | li.addClass('active'); 107 | chaps.on('click', function(e) { 108 | chaps.removeClass('active'); 109 | $(this).addClass('active'); 110 | gs.set('tocScrollTop', summary.scrollTop()); 111 | }); 112 | 113 | var toc = config.toc; 114 | // collapse TOC items that are not for the current chapter 115 | if (toc && toc.collapse) (function() { 116 | var type = toc.collapse; 117 | if (type === 'none') return; 118 | if (type !== 'section' && type !== 'subsection') return; 119 | // sections under chapters 120 | var toc_sub = summary.children('li[data-level]').children('ul'); 121 | if (type === 'section') { 122 | toc_sub.hide() 123 | .parent().has(li).children('ul').show(); 124 | } else { 125 | toc_sub.children('li').children('ul').hide() 126 | .parent().has(li).children('ul').show(); 127 | } 128 | li.children('ul').show(); 129 | var toc_sub2 = toc_sub.children('li'); 130 | if (type === 'section') toc_sub2.children('ul').hide(); 131 | summary.children('li[data-level]').find('a') 132 | .on('click.bookdown', function(e) { 133 | if (href === $(this).attr('href').replace(/#.*/, '')) 134 | $(this).parent('li').children('ul').toggle(); 135 | }); 136 | })(); 137 | 138 | // add tooltips to the <a>'s that are truncated 139 | $('a').each(function(i, el) { 140 | if (el.offsetWidth >= el.scrollWidth) return; 141 | if (typeof el.title === 'undefined') return; 142 | el.title = el.text; 143 | }); 144 | 145 | // restore TOC scroll position 146 | var pos = gs.get('tocScrollTop'); 147 | if (typeof pos !== 'undefined') summary.scrollTop(pos); 148 | 149 | // highlight the TOC item that has same text as the heading in view as scrolling 150 | if (toc && toc.scroll_highlight !== false && li.length > 0) (function() { 151 | // scroll the current TOC item into viewport 152 | var ht = $(window).height(), rect = li[0].getBoundingClientRect(); 153 | if (rect.top >= ht || rect.top <= 0 || rect.bottom <= 0) { 154 | summary.scrollTop(li[0].offsetTop); 155 | } 156 | // current chapter TOC items 157 | var items = $('a[href^="' + href + '"]').parent('li.chapter'), 158 | m = items.length; 159 | if (m === 0) { 160 | items = summary.find('li.chapter'); 161 | m = items.length; 162 | } 163 | if (m === 0) return; 164 | // all section titles on current page 165 | var hs = bookInner.find('.page-inner').find('h1,h2,h3'), n = hs.length, 166 | ts = hs.map(function(i, el) { return $(el).text(); }); 167 | if (n === 0) return; 168 | var scrollHandler = function(e) { 169 | var ht = $(window).height(); 170 | clearTimeout($.data(this, 'scrollTimer')); 171 | $.data(this, 'scrollTimer', setTimeout(function() { 172 | // find the first visible title in the viewport 173 | for (var i = 0; i < n; i++) { 174 | var rect = hs[i].getBoundingClientRect(); 175 | if (rect.top >= 0 && rect.bottom <= ht) break; 176 | } 177 | if (i === n) return; 178 | items.removeClass('active'); 179 | for (var j = 0; j < m; j++) { 180 | if (items.eq(j).children('a').first().text() === ts[i]) break; 181 | } 182 | if (j === m) j = 0; // highlight the chapter title 183 | // search bottom-up for a visible TOC item to highlight; if an item is 184 | // hidden, we check if its parent is visible, and so on 185 | while (j > 0 && items.eq(j).is(':hidden')) j--; 186 | items.eq(j).addClass('active'); 187 | }, 250)); 188 | }; 189 | bookInner.on('scroll.bookdown', scrollHandler); 190 | bookBody.on('scroll.bookdown',
scrollHandler); 191 | })(); 192 | 193 | // do not refresh the page if the TOC item points to the current page 194 | $('a[href="' + href + '"]').parent('li.chapter').children('a') 195 | .on('click', function(e) { 196 | bookInner.scrollTop(0); 197 | bookBody.scrollTop(0); 198 | return false; 199 | }); 200 | 201 | var toolbar = config.toolbar; 202 | if (!toolbar || toolbar.position !== 'static') { 203 | var bookHeader = $('.book-header'); 204 | bookBody.addClass('fixed'); 205 | bookHeader.addClass('fixed') 206 | .css('background-color', bookBody.css('background-color')) 207 | .on('click.bookdown', function(e) { 208 | // the theme may have changed after user clicks the theme button 209 | bookHeader.css('background-color', bookBody.css('background-color')); 210 | }); 211 | } 212 | 213 | }); 214 | 215 | gitbook.events.bind("page.change", function(e) { 216 | // store TOC scroll position 217 | var summary = $('ul.summary'); 218 | gs.set('tocScrollTop', summary.scrollTop()); 219 | }); 220 | 221 | var bookBody = $('.book-body'), bookInner = bookBody.find('.body-inner'); 222 | var chapterTitle = function() { 223 | return bookInner.find('.page-inner').find('h1,h2').first().text(); 224 | }; 225 | var saveScrollPos = function(e) { 226 | // save scroll position before page is reloaded 227 | gs.set('bodyScrollTop', { 228 | body: bookBody.scrollTop(), 229 | inner: bookInner.scrollTop(), 230 | focused: document.hasFocus(), 231 | title: chapterTitle() 232 | }); 233 | }; 234 | $(document).on('servr:reload', saveScrollPos); 235 | 236 | // check if the page is loaded in an iframe (e.g. the RStudio preview window) 237 | var inIFrame = function() { 238 | var inIframe = true; 239 | try { inIframe = window.self !== window.top; } catch (e) {} 240 | return inIframe; 241 | }; 242 | if (inIFrame()) { 243 | $(window).on('blur unload', saveScrollPos); 244 | } 245 | 246 | $(function(e) { 247 | var pos = gs.get('bodyScrollTop'); 248 | if (pos) { 249 | if (pos.title === chapterTitle()) { 250 | if (pos.body !== 0) bookBody.scrollTop(pos.body); 251 | if (pos.inner !== 0) bookInner.scrollTop(pos.inner); 252 | } 253 | } 254 | if ((pos && pos.focused) || !inIFrame()) bookInner.find('.page-wrapper').focus(); 255 | // clear book body scroll position 256 | gs.remove('bodyScrollTop'); 257 | }); 258 | 259 | }); 260 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-clipboard.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "jQuery"], function(gitbook, $) { 2 | 3 | var copyButton = ''; 4 | var clipboard; 5 | 6 | gitbook.events.bind("page.change", function() { 7 | 8 | if (!ClipboardJS.isSupported()) return; 9 | 10 | // the page.change event is thrown twice: before and after the page changes 11 | if (clipboard) { 12 | // clipboard is already defined but we are on the same page 13 | if (clipboard._prevPage === window.location.pathname) return; 14 | // clipboard is already defined and url path change 15 | // we can deduct that we are before page changes 16 | clipboard.destroy(); // destroy the previous events listeners 17 | clipboard = undefined; // reset the clipboard object 18 | return; 19 | } 20 | 21 | $(copyButton).prependTo("div.sourceCode"); 22 | 23 | clipboard = new ClipboardJS(".copy-to-clipboard-button", { 24 | text: function(trigger) { 25 | return trigger.parentNode.textContent; 26 | } 27 | }); 28 | 29 | clipboard._prevPage = window.location.pathname 30 | 31 | }); 32 | 33 | }); 34 | 
-------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-fontsettings.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var fontState; 3 | 4 | var THEMES = { 5 | "white": 0, 6 | "sepia": 1, 7 | "night": 2 8 | }; 9 | 10 | var FAMILY = { 11 | "serif": 0, 12 | "sans": 1 13 | }; 14 | 15 | // Save current font settings 16 | function saveFontSettings() { 17 | gitbook.storage.set("fontState", fontState); 18 | update(); 19 | } 20 | 21 | // Increase font size 22 | function enlargeFontSize(e) { 23 | e.preventDefault(); 24 | if (fontState.size >= 4) return; 25 | 26 | fontState.size++; 27 | saveFontSettings(); 28 | }; 29 | 30 | // Decrease font size 31 | function reduceFontSize(e) { 32 | e.preventDefault(); 33 | if (fontState.size <= 0) return; 34 | 35 | fontState.size--; 36 | saveFontSettings(); 37 | }; 38 | 39 | // Change font family 40 | function changeFontFamily(index, e) { 41 | e.preventDefault(); 42 | 43 | fontState.family = index; 44 | saveFontSettings(); 45 | }; 46 | 47 | // Change type of color 48 | function changeColorTheme(index, e) { 49 | e.preventDefault(); 50 | 51 | var $book = $(".book"); 52 | 53 | if (fontState.theme !== 0) 54 | $book.removeClass("color-theme-"+fontState.theme); 55 | 56 | fontState.theme = index; 57 | if (fontState.theme !== 0) 58 | $book.addClass("color-theme-"+fontState.theme); 59 | 60 | saveFontSettings(); 61 | }; 62 | 63 | function update() { 64 | var $book = gitbook.state.$book; 65 | 66 | $(".font-settings .font-family-list li").removeClass("active"); 67 | $(".font-settings .font-family-list li:nth-child("+(fontState.family+1)+")").addClass("active"); 68 | 69 | $book[0].className = $book[0].className.replace(/\bfont-\S+/g, ''); 70 | $book.addClass("font-size-"+fontState.size); 71 | $book.addClass("font-family-"+fontState.family); 72 | 73 | if(fontState.theme !== 0) { 74 | $book[0].className = $book[0].className.replace(/\bcolor-theme-\S+/g, ''); 75 | $book.addClass("color-theme-"+fontState.theme); 76 | } 77 | }; 78 | 79 | function init(config) { 80 | var $bookBody, $book; 81 | 82 | //Find DOM elements. 
83 | $book = gitbook.state.$book; 84 | $bookBody = $book.find(".book-body"); 85 | 86 | // Instantiate font state object 87 | fontState = gitbook.storage.get("fontState", { 88 | size: config.size || 2, 89 | family: FAMILY[config.family || "sans"], 90 | theme: THEMES[config.theme || "white"] 91 | }); 92 | 93 | update(); 94 | }; 95 | 96 | 97 | gitbook.events.bind("start", function(e, config) { 98 | var opts = config.fontsettings; 99 | if (!opts) return; 100 | 101 | // Create buttons in toolbar 102 | gitbook.toolbar.createButton({ 103 | icon: 'fa fa-font', 104 | label: 'Font Settings', 105 | className: 'font-settings', 106 | dropdown: [ 107 | [ 108 | { 109 | text: 'A', 110 | className: 'font-reduce', 111 | onClick: reduceFontSize 112 | }, 113 | { 114 | text: 'A', 115 | className: 'font-enlarge', 116 | onClick: enlargeFontSize 117 | } 118 | ], 119 | [ 120 | { 121 | text: 'Serif', 122 | onClick: _.partial(changeFontFamily, 0) 123 | }, 124 | { 125 | text: 'Sans', 126 | onClick: _.partial(changeFontFamily, 1) 127 | } 128 | ], 129 | [ 130 | { 131 | text: 'White', 132 | onClick: _.partial(changeColorTheme, 0) 133 | }, 134 | { 135 | text: 'Sepia', 136 | onClick: _.partial(changeColorTheme, 1) 137 | }, 138 | { 139 | text: 'Night', 140 | onClick: _.partial(changeColorTheme, 2) 141 | } 142 | ] 143 | ] 144 | }); 145 | 146 | 147 | // Init current settings 148 | init(opts); 149 | }); 150 | }); 151 | 152 | 153 | -------------------------------------------------------------------------------- /docs/libs/gitbook/js/plugin-search.js: -------------------------------------------------------------------------------- 1 | gitbook.require(["gitbook", "lodash", "jQuery"], function(gitbook, _, $) { 2 | var index = null; 3 | var fuse = null; 4 | var _search = {engine: 'lunr', opts: {}}; 5 | var $searchInput, $searchLabel, $searchForm; 6 | var $highlighted = [], hi, hiOpts = { className: 'search-highlight' }; 7 | var collapse = false, toc_visible = []; 8 | 9 | function init(config) { 10 | // Instantiate search settings 11 | _search = gitbook.storage.get("search", { 12 | engine: config.search.engine || 'lunr', 13 | opts: config.search.options || {}, 14 | }); 15 | }; 16 | 17 | // Save current search settings 18 | function saveSearchSettings() { 19 | gitbook.storage.set("search", _search); 20 | } 21 | 22 | // Use a specific index 23 | function loadIndex(data) { 24 | // [Yihui] In bookdown, I use a character matrix to store the chapter 25 | // content, and the index is dynamically built on the client side. 26 | // Gitbook prebuilds the index data instead: https://github.com/GitbookIO/plugin-search 27 | // We can certainly do that via R packages V8 and jsonlite, but let's 28 | // see how slow it really is before improving it. On the other hand, 29 | // lunr cannot handle non-English text very well, e.g. the default 30 | // tokenizer cannot deal with Chinese text, so we may want to replace 31 | // lunr with a dumb simple text matching approach. 
32 | if (_search.engine === 'lunr') { 33 | index = lunr(function () { 34 | this.ref('url'); 35 | this.field('title', { boost: 10 }); 36 | this.field('body'); 37 | }); 38 | data.map(function(item) { 39 | index.add({ 40 | url: item[0], 41 | title: item[1], 42 | body: item[2] 43 | }); 44 | }); 45 | return; 46 | } 47 | fuse = new Fuse(data.map((_data => { 48 | return { 49 | url: _data[0], 50 | title: _data[1], 51 | body: _data[2] 52 | }; 53 | })), Object.assign( 54 | { 55 | includeScore: true, 56 | threshold: 0.1, 57 | ignoreLocation: true, 58 | keys: ["title", "body"] 59 | }, 60 | _search.opts 61 | )); 62 | } 63 | 64 | // Fetch the search index 65 | function fetchIndex() { 66 | return $.getJSON(gitbook.state.basePath+"/search_index.json") 67 | .then(loadIndex); // [Yihui] we need to use this object later 68 | } 69 | 70 | // Search for a term and return results 71 | function search(q) { 72 | let results = []; 73 | switch (_search.engine) { 74 | case 'fuse': 75 | if (!fuse) return; 76 | results = fuse.search(q).map(function(result) { 77 | var parts = result.item.url.split('#'); 78 | return { 79 | path: parts[0], 80 | hash: parts[1] 81 | }; 82 | }); 83 | break; 84 | case 'lunr': 85 | default: 86 | if (!index) return; 87 | results = _.chain(index.search(q)).map(function(result) { 88 | var parts = result.ref.split("#"); 89 | return { 90 | path: parts[0], 91 | hash: parts[1] 92 | }; 93 | }) 94 | .value(); 95 | } 96 | 97 | // [Yihui] Highlight the search keyword on current page 98 | $highlighted = $('.page-inner') 99 | .unhighlight(hiOpts).highlight(q, hiOpts).find('span.search-highlight'); 100 | scrollToHighlighted(0); 101 | 102 | return results; 103 | } 104 | 105 | // [Yihui] Scroll the chapter body to the i-th highlighted string 106 | function scrollToHighlighted(d) { 107 | var n = $highlighted.length; 108 | hi = hi === undefined ? 0 : hi + d; 109 | // navignate to the previous/next page in the search results if reached the top/bottom 110 | var b = hi < 0; 111 | if (d !== 0 && (b || hi >= n)) { 112 | var path = currentPath(), n2 = toc_visible.length; 113 | if (n2 === 0) return; 114 | for (var i = b ? 0 : n2; (b && i < n2) || (!b && i >= 0); i += b ? 1 : -1) { 115 | if (toc_visible.eq(i).data('path') === path) break; 116 | } 117 | i += b ? -1 : 1; 118 | if (i < 0) i = n2 - 1; 119 | if (i >= n2) i = 0; 120 | var lnk = toc_visible.eq(i).find('a[href$=".html"]'); 121 | if (lnk.length) lnk[0].click(); 122 | return; 123 | } 124 | if (n === 0) return; 125 | var $p = $highlighted.eq(hi); 126 | $p[0].scrollIntoView(); 127 | $highlighted.css('background-color', ''); 128 | // an orange background color on the current item and removed later 129 | $p.css('background-color', 'orange'); 130 | setTimeout(function() { 131 | $p.css('background-color', ''); 132 | }, 2000); 133 | } 134 | 135 | function currentPath() { 136 | var href = window.location.pathname; 137 | href = href.substr(href.lastIndexOf('/') + 1); 138 | return href === '' ? 'index.html' : href; 139 | } 140 | 141 | // Create search form 142 | function createForm(value) { 143 | if ($searchForm) $searchForm.remove(); 144 | if ($searchLabel) $searchLabel.remove(); 145 | if ($searchInput) $searchInput.remove(); 146 | 147 | $searchForm = $('
', { 148 | 'class': 'book-search', 149 | 'role': 'search' 150 | }); 151 | 152 | $searchLabel = $('