├── .Rprofile ├── .github └── workflows │ └── publish.yml ├── .gitignore ├── README.md ├── _publish.yml ├── _quarto.yml ├── content ├── chapter01.qmd ├── chapter02.qmd ├── chapter03.qmd ├── chapter04.qmd ├── chapter05.qmd ├── chapter06.qmd ├── chapter07.qmd ├── chapter08.qmd ├── chapter09.qmd ├── chapter10.qmd ├── chapter11.qmd ├── chapter12-note-text.qmd ├── chapter12.qmd ├── chapter13.qmd ├── chapter14.qmd ├── chapter15.qmd ├── chapter16.qmd ├── common_setup.qmd ├── img │ ├── ch04_colab.png │ ├── ch04_github.png │ ├── ch04_notebooks.png │ ├── ch07_figjoins.png │ ├── ch09_cnn_cropped.png │ ├── ch09_matrix.png │ ├── ch09_overfitting.png │ ├── ch09_roccurve.png │ ├── ch15_fashion.png │ ├── ch15_location.png │ ├── ch15_numbers.png │ ├── ch15_pixel.png │ ├── ch15_refugees.png │ ├── ch15_yolo.png │ ├── ch3_r_studio.png │ ├── ch4_stackover.png │ ├── ch6_csv-in-editor.png │ ├── favicon.png │ ├── fig_decisiontree.png │ ├── fig_hiddenlayers.png │ ├── fig_perceptron.png │ ├── fig_sigmoid.png │ ├── jupyter.png │ ├── lda.png │ ├── messy.png │ ├── ssh.png │ ├── vmazure.png │ └── vmopennebula.png ├── references.bib └── references.qmd ├── index.qmd ├── renv.lock ├── renv ├── .gitignore ├── activate.R ├── install.R └── settings.dcf └── requirements.txt /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | push: 4 | branches: main 5 | 6 | name: Quarto Publish 7 | 8 | jobs: 9 | build-deploy: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | steps: 14 | - name: Check out repository 15 | uses: actions/checkout@v2 16 | 17 | - name: Set up Quarto 18 | uses: quarto-dev/quarto-actions/setup@v2 19 | 20 | - name: Install R 21 | uses: r-lib/actions/setup-r@v2 22 | with: 23 | r-version: '4.2.0' 24 | 25 | - name: Additional apt packages 26 | uses: awalsh128/cache-apt-pkgs-action@latest 27 | with: 28 | packages: libcurl4-openssl-dev libpoppler-cpp-dev libmagick++-dev 29 | version: 1.0 30 | 31 | - name: Install R Dependencies 32 | uses: r-lib/actions/setup-renv@v2 33 | with: 34 | cache-version: 1 35 | 36 | - name: Setup reticulate 37 | shell: Rscript {0} 38 | run: | 39 | install.packages('reticulate') 40 | 41 | - name: Render and Publish 42 | uses: quarto-dev/quarto-actions/publish@v2 43 | with: 44 | target: netlify 45 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_freeze/ 2 | env 3 | /.quarto/ 4 | 5 | # Cache files from various chapters 6 | /content/*.udpipe 7 | /content/glove*.txt 8 | /content/reviewdata.pickle.bz2 9 | /content/reviewdata.rds 10 | /content/w2v_320d_trimmed 11 | /content/mnist_784.pickle 12 | 13 | /env/ 14 | 15 | /content/*_cache/ 16 | /content/*_files/ 17 | /tikz*.log 18 | /.RData 19 | /.Rhistory 20 | 21 | /_book/ 22 | /.vscode/ 23 | .Rproj.user 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Quarto book for Computational Analysis of Communication 2 | 3 | This is a quarto re-render of the original latex+ipynb sources of the CAC book. 
4 | 5 | It is now considered the 'canonical' source of the book, and we are using this version to publish the [cssbook.net](https://cssbook.net) open access version and to update it for a new edition. 6 | 7 | There is a [GitHub action](https://github.com/vanatteveldt/cssbook/actions) that automatically updates [cssbook.net](https://cssbook.net) after each commit, so you can make small fixes directly on GitHub and/or work locally and push your changes. 8 | 9 | # Setup 10 | 11 | Steps needed to render the book: 12 | - Install Quarto 13 | - Clone this repository 14 | - Activate the `renv` R virtual environment, e.g. by running `renv/install.R` 15 | 16 | On my system, the following works for these steps: 17 | 18 | ``` 19 | # Install quarto 20 | wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.2.313/quarto-1.2.313-linux-amd64.deb 21 | sudo apt install ./quarto-1.2.313-linux-amd64.deb 22 | 23 | # Install prerequisites for R packages (might be incomplete, please add if you find more requirements) 24 | sudo apt install gfortran cmake liblapack-dev libgsl-dev libpng-dev libpoppler-cpp-dev libmagick++-dev 25 | 26 | # Clone the repository 27 | git clone git@github.com:vanatteveldt/cssbook 28 | cd cssbook 29 | 30 | # Activate the renv 31 | Rscript renv/install.R 32 | ``` 33 | 34 | # Render the book 35 | 36 | ``` 37 | quarto render 38 | ``` 39 | 40 | # A note on caching 41 | 42 | Rendering the book for the first time will take a long time. 43 | After this, content is both *frozen* at the chapter level and *cached* at the chunk level where sensible. 44 | 45 | Note that knitr caching for Python does not preserve global variables, so Python chunks that create objects used in another chunk should not be cached. 46 | For R, objects are cached, so this is possible.
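As an illustration of that rule, here is a minimal sketch of two chunks in a chapter (hypothetical chunk labels; `cache` is the knitr chunk-level option referred to above):

````
```{r load-data, cache=TRUE}
# R chunk: safe to cache, because cached R objects are restored for later chunks
tw <- read.csv("https://cssbook.net/d/covid.csv")
```

```{python clean-data}
# Python chunk: deliberately left uncached, because it creates an object (tw)
# that later Python chunks use, and that object would not survive caching
import pandas as pd
tw = pd.read_csv("https://cssbook.net/d/covid.csv")
```
````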
47 | 48 | -------------------------------------------------------------------------------- /_publish.yml: -------------------------------------------------------------------------------- 1 | - source: project 2 | netlify: 3 | - id: d941c9c2-6028-431e-bd4d-abdfcb5aad2e 4 | url: 'https://statuesque-pony-87679c.netlify.app' 5 | -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: book 3 | 4 | book: 5 | title: "Computational Analysis of Communication" 6 | author: "Wouter van Atteveldt, Damian Trilling & Carlos Arcila" 7 | date: "2022-03-11" 8 | favicon: img/favicon.png 9 | chapters: 10 | - index.qmd 11 | - content/chapter01.qmd 12 | - content/chapter02.qmd 13 | - content/chapter03.qmd 14 | - content/chapter04.qmd 15 | - content/chapter05.qmd 16 | - content/chapter06.qmd 17 | - content/chapter07.qmd 18 | - content/chapter08.qmd 19 | - content/chapter09.qmd 20 | - content/chapter10.qmd 21 | - content/chapter11.qmd 22 | - content/chapter12.qmd 23 | - content/chapter13.qmd 24 | - content/chapter14.qmd 25 | - content/chapter15.qmd 26 | - content/chapter16.qmd 27 | - content/references.qmd 28 | twitter-card: 29 | creator: "@vanatteveldt" 30 | image: content/img/cover.jpg 31 | 32 | bibliography: content/references.bib 33 | 34 | format: 35 | html: 36 | theme: cosmo 37 | include-after-body: 38 | text: | 39 | 48 | 49 | editor: visual 50 | 51 | engine: knitr 52 | execute: 53 | warning: false 54 | freeze: auto -------------------------------------------------------------------------------- /content/chapter01.qmd: -------------------------------------------------------------------------------- 1 | # Introduction {#sec-chap-introduction} 2 | 3 | ```{python} 4 | #| echo: false 5 | import warnings; warnings.filterwarnings('ignore') 6 | ``` 7 | 8 | **Abstract.** 9 | This chapter explains how the methods outlined in this book are 10 | situated within the methodological and epistemological frameworks used 11 | by social scientists. It argues why the use of Python and R is 12 | fundamental for the computational analysis of communication. Finally, 13 | it shows how this book can be used by students and scholars. 14 | 15 | **Keywords.** computational social science, Python, R 16 | 17 | **Objectives:** 18 | 19 | - Understand the role of computational analysis in the social sciences 20 | - Understand the choice between Python and/or R 21 | - Know how to read this book 22 | 23 | ## The Role of Computational Analysis in the Social Sciences {#sec-ccs} 24 | 25 | The use of computers is nothing new in the social sciences. In fact, 26 | one could argue that some disciplines within the social sciences have 27 | even been early adopters of computational approaches. Take the 28 | gathering and analyzing of large-scale survey data, dating back to 29 | the use of the Hollerith Machine in the 1890 US census. Long before 30 | every scholar had a personal computer on their desk, social scientists 31 | were using punch cards and mainframe computers to deal with such 32 | data. If we think of the analysis of *communication* 33 | more specifically, we already see attempts to automate content analysis 34 | in the 1960's [see, e.g. @Scharkow2017]. 35 | 36 | However, something has profoundly changed in recent decades. The amount 37 | and type of data we can collect as well as the computational power we 38 | have access to have increased dramatically. 
In particular, digital 39 | traces that we leave when communicating online, from access logs to 40 | comments we place, have required new approaches [e.g., @Trilling2017b]. At the same time, better computational 41 | facilities now allow us to ask questions we could not answer before. 42 | 43 | @Gonzalez-Bailon2017, for instance, argued that the 44 | computational analysis of communication now allows us to test theories 45 | that were formulated a century ago, such as Tarde's theory of 46 | social imitation. @Salganik2019 tells an impressive 47 | methodological story of continuity in showing how new digital 48 | research methods build on and relate to established methods such as 49 | surveys and experiments, while offering new possibilities by observing 50 | behavior in new ways. 51 | 52 | A frequent misunderstanding, then, about computational approaches is 53 | that they would somehow be a-theoretical. This is probably fueled by 54 | clichés coined during the "Big Data"-hype in the 2010's, such as the 55 | infamous saying that in the age of Big Data, correlation is enough [@Mayer2013]; 56 | but one could not be more wrong: as the work of Kitchin shows [@Kitchin2014;@Kitchin2014data], computational approaches can 57 | be well situated within existing epistemologies. 58 | For the field to advance, computational and theoretical work should be symbiotic, with each informing the other 59 | and with neither claiming superiority [@margolin19]. 60 | Thus, the computational 61 | scientists' toolbox includes both more data-driven and more 62 | theory-driven techniques; some are more bottom-up and inductive, 63 | others are more top-down and deductive. What matters here, and what is 64 | often overlooked, is in which stage of the research process they are 65 | employed. In other words, both inductive and deductive approaches as 66 | they are distinguished in more traditional social-science textbooks 67 | [e.g., @Bryman2012] have their equivalent in the computational 68 | social sciences. 69 | 70 | Therefore, we suggest that the data collection and data 71 | analysis process is thought of as a pipeline. To test, for instance, a theoretically 72 | grounded hypothesis about personalization in the news, we could 73 | imagine a pipeline that starts with scraping online news, proceeds 74 | with some natural-language processing techniques such as Named Entity 75 | Recognition, and finally tests whether the mentioning of persons has 76 | an influence on the placement of the stories. We can distinguish here 77 | between parts of the pipeline that are just necessary but not 78 | inherently interesting to us, and parts of the pipeline that answer a 79 | genuinely interesting question. In this example, the inner workings of 80 | the Named Entity Recognition step are not genuinely interesting for us 81 | -- we just need to do it to answer our question. 82 | We do care about how well it works and especially which biases it may have that could affect our substantive outcomes, 83 | but we are not really evaluating any theory on Named Entity Recognition here. 84 | We are, however, answering a theoretically 85 | interesting question when we look at the pipeline as a whole, 86 | that is, when we apply the tools in order to tackle a social scientific problem. 
87 | Of course, what is genuinely interesting depends on one's discipline: For a 88 | computational linguist, the inner workings of the Named Entity Recognition 89 | may actually be the interesting part, and our research question just one 90 | possible "downstream task". 91 | 92 | This distinction is also sometimes referred to as "building a better 93 | mousetrap" versus "understanding". For instance, @Breiman2001 94 | remarked: "My attitude toward new and/or complicated methods is 95 | pragmatic. Prove that you've got a better mousetrap and I'll buy 96 | it. But the proof had better be concrete and convincing." 97 | (p. 230). 98 | In contrast, many social scientists are using statistical 99 | models to test theories and to understand social processes: they want 100 | to specifically understand how $x$ relates to $y$, even if $y$ may be better 101 | predicted by another (theoretically uninteresting) variable. 102 | 103 | This book is to some extent about both building mousetraps and understanding. When you 104 | are building a supervised machine learning classifier to determine the 105 | topic of each text in a large collection of news articles or 106 | parliamentary speeches, you are building a (better) mousetrap. But as 107 | a social scientist, your work does not stop there. You need to use 108 | the mousetrap to answer some theoretically interesting question. 109 | 110 | Actually, we expect that the contents of this book will provide a background that helps you to face the current research challenges in both academia and industry. On the one hand, the emerging field of Computational Social Science has become one of the most promising areas of knowledge and many universities and research institutes are looking for scholars with this profile. On the other hand, it is widely known that nowadays the computational skills will increase your job opportunities in private companies, public organizations, or NGOs, given the growing interest in data-driven solutions. 111 | 112 | When planning this book, we needed to make a couple of tough 113 | choices. We aimed to at least give an introduction to all techniques 114 | that students and scholars who want to computationally analyze 115 | communication will probably be confronted with. Of course, specific -- 116 | technical -- literature on techniques such as, for instance, machine 117 | learning can cover the subject in more depth, and the interested student may indeed 118 | want to dive into one or several of the techniques we cover more 119 | deeply. Our goal here is to offer enough working knowledge to apply 120 | these techniques and to know what to look for. While trying to cover 121 | the breadth of the field without sacrificing too much depth when 122 | covering each technique, we still needed to draw some boundaries. One 123 | technique that some readers may miss is agent-based modeling 124 | (ABM). Arguably, such simulation techniques are an important technique 125 | in the computational social sciences more broadly 126 | [@cioffi-revilla2014], and they have recently been applied to the 127 | analysis of communication as well 128 | [@Waldherr2014;@Wettstein2020]. Nevertheless, when reviewing the 129 | curricula of current courses teaching the computational analysis of 130 | communication, we found that simulation approaches do not seem to be at the core of 131 | such analyses (yet). 
Instead, when looking at the use of computational 132 | techniques in fields such as journalism studies 133 | [e.g., @Boumans2016], media studies [e.g., @Rieder2017], or 134 | the text-as-data movement [@Grimmer2013], we see a core of 135 | techniques that are used over and over again, and that we have therefore 136 | included in our book. In particular, besides general data analysis and visualization techniques, 137 | these are techniques for 138 | gathering data such as web scraping or the use of API's; techniques 139 | for dealing with text such as natural language processing and 140 | different ways to turn text into numbers; supervised and unsupervised 141 | machine learning techniques; and network analysis. 142 | 143 | ## Why Python and/or R? {#sec-whypythonr} 144 | By far most work in the computational social sciences is done using 145 | Python and/or R. Sure, for some specific tasks there are standalone 146 | programs that are occasionally used; and there are some useful applications 147 | written in other languages such as C or Java. But we believe it is 148 | fair to say that it is very hard to delve into the computational analysis 149 | of communication without learning at least either Python or R, and 150 | preferably both of them. 151 | There are very few tasks that you cannot do with at least one of them. 152 | 153 | Some people have strong beliefs as to which language is "better" -- we 154 | do not subscribe to that view. Most techniques that are relevant to us can be 155 | done in either language, and personal preference is a big factor. R 156 | started out as a statistical programming environment, and that 157 | heritage is still visible, for instance in the strong emphasis on 158 | vectors, factors, et cetera, or the possibility to estimate complex 159 | statistical models in just one line of code. Python started out as a 160 | general-purpose programming language, which means that some of the things we 161 | do feel a bit more `low-level' -- Python abstracts away less of the 162 | underlying programming concepts than R does. This sometimes gives us 163 | more flexibility -- at the cost of being more wordy. 164 | In recent years, however, Python and R have been 165 | growing closer to each other: with modules like *pandas* and 166 | *statsmodels*, Python now has R-like functionality handling data 167 | frames and estimating common statistical models on them; and with 168 | packages such as *quanteda*, handling of text -- traditionally a 169 | strong domain of Python -- has become more accessible in R. 170 | 171 | This is the main reason why we decided to write this "bi-lingual" 172 | book. We wanted to teach techniques for the computational analysis of 173 | communication, without enforcing a specific implementation. We hope 174 | that the reader will learn from our book, say, how to transform a text 175 | into features and how to choose an appropriate machine learning model, 176 | but will find it of less importance in which language this happens. 177 | 178 | However, sometimes, there are good reasons to choose one language above 179 | the other. For instance, many machine learning models in the popular *caret* package in R under the 180 | hood create a dense matrix, which severely limits the number of 181 | documents and features one can use; also, some complex web scraping 182 | tasks are maybe easier to realize in Python. 
On the other hand, R's 183 | data wrangling and visualization techniques in the *tidyverse* 184 | environment are known for their user-friendliness and quality. In the 185 | rare cases where we believe that R or Python is clearly superior for a 186 | given task, we indicate this; for the rest, we believe that it is up to 187 | the reader to choose. 188 | 189 | ## How to use this book {#sec-howtouse} 190 | 191 | This book differs from more technically oriented books on the one hand 192 | and more conceptual books on the other hand. We do cover the technical 193 | background that is necessary to understand what is going on, but we 194 | keep both computer science concepts and mathematical concepts to a 195 | minimum. For instance, if we had written a more technical book about 196 | programming in Python, we would have introduced rather early and in 197 | detail concepts such as classes, inheritance, and instances of 198 | classes. Instead, we decided to provide such information only as 199 | additional background where necessary, and to focus, rather pragmatically, 200 | on the application of techniques for the computational analysis of 201 | communication. Vice versa, if we had written a more conceptual book on 202 | new methods in our field, we would have given more emphasis to 203 | epistemological aspects, and had skipped the programming examples, 204 | which are now at the core of this book. 205 | 206 | We do not expect much prior knowledge from the readers of this 207 | book. Sure, some affinity with computers helps, but there is no strict 208 | requirement on what you need to know. Also in terms of statistics, it 209 | helps if you have heard of concepts such as correlation or 210 | regression analysis, but even if your knowledge here is rather 211 | limited, you should be able to follow along. 212 | 213 | This also means that you may be able to skip chapters. For instance, 214 | if you already work with R and/or Python, you may not need our 215 | detailed instructions at the beginning. Still, the book follows a 216 | logical order in which chapters build on previous ones. For instance, 217 | when explaining supervised machine learning on textual data, we expect 218 | you to be familiar with previous chapters that deal with machine 219 | learning in general, or with the handling of textual data. 220 | 221 | This book is designed in such a way that it can be used as a text book 222 | for introductory courses on the computational analysis of 223 | communications. Often, such courses will be on the graduate level, 224 | but it is equally possible to use this book in an undergraduate 225 | course; maybe skipping some parts that may go too deep. All code 226 | examples are not only printed in this book, but also available 227 | online. Students as well as social-scientists who want to brush up 228 | their skillset should therefore also be able to use this book for 229 | self-study, without a formal course around it. Lastly, this book can 230 | also be a reference for readers asking themselves: "How do I 231 | do this again?". In particular, if the main language you work in is 232 | R, you can look up how to do similar things in Python and vice versa. 233 | 234 | ::: {.callout-note icon=false collapse=true} 235 | ## Code examples 236 | 237 | Regardless of the context in which you use this book, one thing is for sure: 238 | The only way to learn computational analysis methods is by practicing and playing around. 
239 | For this reason, the code examples are probably the most important part of the book. 240 | Where possible, the examples use real world data that is freely available on the Internet. 241 | To make sure that the examples still work in five years' time, 242 | we generally provide a copy of this data on the book website, 243 | but we also provide a link to the original source. 244 | 245 | One thing to note is that to avoid unnecessary repetition 246 | the examples are sometimes designed to continue on earlier 247 | snippets from that chapter. 248 | So, if you seem to be missing a data set, or if some package is not imported yet, 249 | make sure you run all the code examples from that chapter. 250 | 251 | Note that although it is possible to copy-paste the code from the website accompanying this book[^1], 252 | we would actually recommend typing the examples yourself. 253 | That way, you are more conscious about the commands you are using and you are adding them to your `muscle memory'. 254 | 255 | Finally, realize that the code examples in this book are just examples. 256 | There's often more ways to do something, and our way is not necessarily the only good (let alone the best) way. 257 | So, after you get an example to work, spend some time to play around with it: 258 | try different options, maybe try it on your own data, or try to achieve the same result in a different way. 259 | The most important thing to remember is: you can't break anything! 260 | So just go ahead, have fun, and if nothing works anymore you can always start over from the code example from the book. 261 | ::: 262 | 263 | ## Installing R and Python {#sec-installing} 264 | 265 | R and Python are the most popular programming languages that data 266 | scientists and computational scholars have adopted to conduct their 267 | work. While many develop a preference 268 | for one or the other language, the chances are good that you 269 | will ultimately switch back and forth between them, depending on 270 | the specific task at hand and the project you are involved in. 271 | 272 | Before you can start with analyzing data and communication in Python or R, 273 | you need to install interpreters for these languages (i.e., programs that can read code in these languages and execute it) on your computer. 274 | Interpreters for both Python and R are open source and completely free to download and use. 275 | Although there are various web-based services on which you can run code for both languages 276 | (such as Google Colab or RStudio Cloud), 277 | it is generally better to install an interpreter on your own computer. 278 | 279 | After installing Python or R, you can execute code in these languages, but you also want a nice 280 | `Integrated Development Environment (IDE)` to develop your data analysis scripts. 281 | For R we recommend RStudio, which is free to install and is currently the most popular environment for working with R. 282 | For Python we recommend starting with JupyterLab or JupyterNotebook, which is a browser-based environment for writing and running Python code. 283 | All of these tools are available and well documented for Windows, MacOS, and Linux. 284 | After explaining how to install R and Python, there is a very important section on installing packages. 285 | If you plan to only use either R or Python (for now), feel free to skip the part about the other language. 
286 | 287 | If you are writing longer Python programs (as opposed to, for instance, short data analysis scripts) you probably want to install a full-blown IDE as well. 288 | We recommend PyCharm[^2] for this, which has a free version that has everything you need, and the premium version is also free for students and academic or open source developers. 289 | See their website for download and installation instructions. 290 | 291 | ::: {.callout-note icon=false collapse=true} 292 | ## Anaconda 293 | . An alternative to installing 294 | R, Python, and optional libraries separately and as you need them 295 | (which we will explain later in this chapter) is to install the so-called 296 | Anaconda distribution, one of the most used and extensive platforms 297 | to perform data science. Anaconda is free and open-source, and is 298 | conceived to run Python and R code for data analysis and machine 299 | learning. Installing the complete Anaconda Distribution on your 300 | computer[^3] 301 | provides you with everything that you need to follow the examples in 302 | this book and includes development environments such as Spyder, 303 | Jupyter, and RStudio. It also includes a large set of pre-installed 304 | packages often used in data science and its own package manager, 305 | *conda*, which will help you to install and update other 306 | libraries or dependencies. In short, Anaconda bundles almost all the 307 | important software to perform computational analysis of 308 | communication. 309 | 310 | So, should you install Anaconda, or should you 311 | install all software separately as outlined in this chapter? It 312 | depends. On the pro side, by downloading Anaconda you have everything installed at once and do 313 | not have to worry about dependencies (e.g., Windows users usually 314 | do not have a C compiler installed, but some packages may need 315 | it). On the con side, it is huge and also installs many 316 | things you do not need, so you essentially get a non-standard 317 | installation, in which programs and packages are stored in different 318 | locations than those you (or your computer) may expect. Nowadays, as almost all computers 319 | actually already *have* some version of Python installed (even though you may 320 | not know it), you also end up in a possibly confusing situation 321 | where it may be unclear which version you are actually running, or 322 | for which version you installed a package. 323 | For this reason, our recommendation is to not use Anaconda unless 324 | it is already installed or you have a specific reason to do so 325 | (for example, if your professor requires you to use it). 326 | ::: 327 | 328 | ### Installing R and RStudio 329 | 330 | Firstly, we will install R and its most popular IDE RStudio, and we 331 | will learn how to install additional packages and how to run a 332 | script. R is an object-based programming language 333 | orientated to statistical computing that can be used for most of the 334 | stages of computational analysis of communication. If you are 335 | completely new to R, but familiar with other popular 336 | statistical packages in social sciences (such as SPSS or STATA), you 337 | will find that you can perform in R many already-known statistical 338 | operations. If you are not familiar with other statistical packages, 339 | do not panic, we will guide you from the very beginning. 
Unlike 340 | much traditional software that requires just one complete and initial 341 | installation, when working with R, we will first install the raw 342 | programming language and then we will continue to install additional 343 | components during our journey. It might sound cumbersome, but 344 | in fact it will make your work more powerful and flexible, since you 345 | will be able to choose the best way to interact with R and especially 346 | you will select the packages that are suitable for your project. 347 | 348 | Now, let us install R. 349 | The easiest way is to go to the RStudio CRAN page at [cran.rstudio.com/](https://cran.rstudio.com/). 350 | [^4] 351 | Click on the link for installing R for your operating system, and 352 | install the latest version. 353 | If you use Linux, you may want to install R via your package manager. 354 | For Ubuntu linux, it is best to follow the instructions on [cran.r-project.org/bin/linux/ubuntu/](https://cran.r-project.org/bin/linux/ubuntu/). 355 | 356 | After installing R, let us immediately install RStudio Desktop (the free version). 357 | Go to [rstudio.com/products/rstudio/download/#download](https://rstudio.com/products/rstudio/download/#download) and download and run the installer for your computer. 358 | If you open RStudio you should get a screen similar to @fig-rstudio. 359 | If this is the first time you open RStudio you probably won't see the top left pane (the scripts), 360 | you can create that pane by creating a new *R Script* via the *file* menu or with the green plus icon in the top left corner. 361 | 362 | ![RStudio Desktop.](img/ch3_r_studio.png){#fig-rstudio} 363 | 364 | Of the four panes in RStudio, 365 | you will probably spend most time in the top left pane, where you can view and edit your analysis *scripts*. 366 | A script is simply a list of commands that the computer should execute one after the other, 367 | for example: open your data, do some computations, and make a nice graph. 368 | 369 | To run a line of code, you can place your cursor anywhere on that line and click the *Run* icon or 370 | press control+Enter. 371 | To try that, type the following into your newly opened script: 372 | 373 | `print("Hello world")` 374 | 375 | Now, place your cursor on that line and press Run (or control+Enter). 376 | What happens is that the line is copied to the *Console* in the bottom left corner 377 | and executed. 378 | So, the results of your commands (and any error messages) will be shown in this console view. 379 | 380 | In contrast to most traditional programming languages, 381 | the easiest way to run R code is line by line. 382 | You can simply place your cursor on the first line, 383 | and repeatedly press control+Enter, which executes a line and then places the cursor on the next line. 384 | You can also select multiple lines (or part of a line) to execute those commands together, 385 | but in general it is easier to check that everything is going as planned if you run the code line by line. 386 | 387 | You can also write commands directly in the console and execute them (by pressing Enter). 388 | This can be useful for trying things out or to run things that only need to be run once, 389 | but in general we would strongly recommend typing all your commands in a script and then executing them. 390 | That way, the script serves as a log of the commands you used to analyze your data, 391 | so you (or a colleague) can read and understand how you did the analyses. 
392 | 393 | ::: {.callout-note icon=false collapse=true} 394 | ## RStudio Projects 395 | 396 | A very good idea to organize your data and code is to work with RStudio Projects. 397 | In fact, we would recommend you to now create a new empty project for the examples in this book. 398 | To do this, click on the *Project* button in the top right and select "New Project". 399 | Then, select New Directory and New Project and enter a name for this project 400 | and a parent folder for the project if you don't want it in your Documents. 401 | Using a project means that the scripts and data files for your project are all in the same location 402 | and you don't need to mess around with specifying the locations of files 403 | (which will probably be different for someone else or on a different computer). 404 | Moreover, RStudio remembers which files you were editing for each project, 405 | so if you are working on multiple projects, it's very easy to switch between them. 406 | We recommend creating a project now for the book (and/or for any projects you are working on), 407 | and always switching to a project when you open RStudio 408 | ::: 409 | 410 | On the right side of the RStudio workspace you will find two additional 411 | windows. In the top right pane there are two or more tabs: 412 | *environment* and *history*, and depending on additional 413 | packages you may have installed there may be some more. In 414 | *environment* you can manage your workspace (the set of elements 415 | you need to deploy for data analysis) and have a list of the objects 416 | you have uploaded to it. You may also import datasets with this tool. 417 | In the *history* tab you 418 | have an inventory of code executions, which you can save to a file, or 419 | move directly to console or to an R document. 420 | 421 | Note that in the environment you can save and load your "workspace" (all data in the computer memory). 422 | However, relying on this functionality is often not a good idea: it 423 | will only save the state of your current session, whereas you most 424 | likely will want to save your R syntax file and/or your data instead. 425 | If you have your raw input data (e.g., as a csv file, see @sec-chap-filetodata) 426 | and your analysis script, you can always 427 | reproduce what you have been doing. If you only have a snapshot of 428 | your workspace, you know the state in which you arrived, but cannot 429 | necessarily reproduce (or change) how you got there. 430 | 431 | In the bottom right pane there are five additional useful tabs. 432 | In *files* you can explore 433 | your computer and manage all the files you may use for the project, 434 | including importing datasets. In *plots*, *help* and 435 | *viewer*, you can visualize the outputs, figures, documentation 436 | and general outcomes, respectively, that you have executed in your 437 | script. Finally, the tab for *packages* will be of great 438 | utility since it will let you install or update packages from CRAN or 439 | even from a file saved on your computer with a friendly interface. 440 | 441 | ### Installing Python and Jupyter Notebook 442 | 443 | Python is an object-orientated programming language 444 | and it is probably the favorite language of computational and data 445 | scientists in all disciplines around the world. 446 | There are different releases of Python, but the biggest difference used to be between Python 2 and Python 3. 
447 | Fortunately, you will probably never need to install or use Python 2, and in fact, since January 2020 it is no longer supported. 448 | Thus, you can just use any recent Python 3 version for this book. 449 | When browsing through questions on online fora such as Stackoverflow or reading other people's code on Github (we will talk about that in @sec-chap-worldcode), you still may come across legacy code in Python 2. Such code usually does not run directly in a Python 3 interpreter, but in most cases, only minor adaptions are necessary to make it work. 450 | 451 | We will install and run Python and Jupyter Notebook using a `terminal` or command line interface. 452 | This is a tool that is installed on all computers that allows you to enter commands to the computer directly. 453 | First, create a project folder for this book using the File Explorer (Windows) or Finder (MacOS). 454 | Then, on Windows you can shift + Right click that folder and select "Open command Window here". 455 | On MacOS, after navigating to the folder you just created, you click on "Finder" in the menu at the top of the screen, then on "Services", then on "New Terminal at Folder." 456 | In both cases, this should open a new window (usually black or gray) that allows you to type commands. 457 | 458 | Note that on most computers, Python is already installed by default. 459 | You can check this by typing the following command in your terminal: 460 | 461 | ``` 462 | python3 --version 463 | ``` 464 | 465 | On some versions of Windows, you may need to use `py` instead of `python3`: 466 | ``` 467 | py --version 468 | ``` 469 | 470 | In either case, the output of this command should be something like `Python 3.8.5`. 471 | If `python --version` also returns this version, you are free to use either command 472 | (but on older systems `python` can still refer to Python 2, so make sure that you are using Python 3 for this book!). 473 | 474 | If Python is not installed on your system, go to [www.python.org/downloads/windows/](https://www.python.org/downloads/windows/) or [www.python.org/downloads/mac-osx/](https://www.python.org/downloads/mac-osx/) and download and install the latest stable release (which at the time of writing is `3.9.0`). 475 | [^5] 476 | After installing it, open a terminal again and run the command above to verify that it is installed correctly. 477 | 478 | Included in any recent Python install is `pip`, the program that you will use for installing Python packages. 479 | You can check that pip is installed correctly by typing the following command on your terminal: 480 | 481 | ``` 482 | pip3 --version 483 | ``` 484 | 485 | Which should report something like `pip 20.0.2 from ... (python 3.8)`. 486 | Again, if `pip` reports the same version you can also use it instead of pip3. 487 | On some systems `pip3` will not work, so use `pip` in that case 488 | (but make sure to check that it points to Python 3). 489 | 490 | **Installing Jupyter Notebook.** 491 | Next, we will install Jupyter Notebook, which you can use to run all the examples in this book 492 | and is a great environment for developing Python data analysis scripts. 493 | Jupyter Notebooks (in IDE JupyterLab if you installed that), 494 | are run as a web application 495 | that allows you to create documents that contain code and inline text fragments. 
496 | One of the nicest things about 497 | the Jupyter Notebook is that the code is inserted in fields (so-called "cells") that you 498 | can run one by one, getting its respective output, which when added to the 499 | narrative text, will make your script more clean and 500 | reproducible. You can also add formatted text blocks (using a simple formatting language called `Markdown`) 501 | to explain to the reader what you are doing. In @sec-practices, we will address 502 | notebooks again as a good practice for a computational scientist. 503 | 504 | You can install Jupyter notebook directly using pip using the following command 505 | (executed in a terminal): 506 | 507 | ``` 508 | pip3 install jupyter-notebook 509 | ``` 510 | 511 | Now, you can run Jupyter by executing the following command on the terminal: 512 | 513 | ``` 514 | jupyter notebook 515 | ``` 516 | 517 | This will print some useful information, including the URL at which you can access the notebook. 518 | However, it should also directly open this in a browser (e.g. Chrome) so you can directly start working. 519 | In your browser you should see the Jupyter main screen similar to the middle window in @fig-jupyter. 520 | Create a new notebook by clicking on the *New* button in the top right and selecting Python 3. 521 | This should open a window similar to the bottom window in @fig-jupyter. 522 | 523 | ![Jupyter Notebook.](img/jupyter.png){#fig-jupyter} 524 | 525 | In Jupyter, code is entered into cells. 526 | First, type `print("Hello World")` into the empty cell next to the `In [ ]:` prompt. 527 | Then, click the Run button or press control+Enter. This should execute your command and display 528 | the text `"Hello World"` in the output area right below the input cell. 529 | Note that you can create more cells using the plus icon or with the insert menu. 530 | You can also set the cell type via the Cell menu: select code for analysis scripts (which is the default), 531 | or Markdown for text fragments, which can be used to explain the code and/or interpret the results. 532 | 533 | ## Installing Third-Party Packages {#sec-installingpackages} 534 | 535 | The `print` function used above is automatically included when you start R or Python. 536 | Many functions, however, are included in separate `packages` (also known as `libraries` or `modules`), which are 537 | generally collections of commands for a certain task or activity. 538 | 539 | Although both R and Python come pre-installed with many useful `packages`, 540 | one of the great things of both languages is that they have a very active community that continuously develops, improves, and publishes new packages. 541 | Throughout this book, we will be using such third-party packages for a variety of tasks, from data wrangling and visualization to text analysis. 542 | For example, we will use the R package *tidyverse* and the Python packages *pandas* for data wrangling. 
543 | 544 | To install these packages on your computer, run the following commands: 545 | (Note: if you are using Anaconda, replace `pip3 install` with `conda install`) 546 | 547 | ::: {.panel-tabset} 548 | ## Installing a package from Jupyter 549 | ```{python None-None} 550 | #| eval: false 551 | !pip3 install pandas 552 | # (On some systems, !pip install pandas) 553 | ``` 554 | ## Installing a package in R 555 | ```{r None-None1} 556 | #| eval: false 557 | install.packages("tidyverse") 558 | ``` 559 | ::: 560 | 561 | These commands will automatically fetch the package from the right repository[^6] and install them on your computer. This can take a while, especially for large packages such as tidyverse. 562 | Fortunately, this only needs to be done once. 563 | Every time you use a package, you also need to *activate* it using the `import` (Python) or `library` (R) command. 564 | 565 | In general, whenever you get an error `No module named 'pandas'` (Python) or `there is no package called ‘tidyverse’`, 566 | you can just install the package with that name using the code listed above. 567 | If you get an error such as `name 'pandas' is not defined` (Python) or `object 'ggplot' not found` (R), 568 | it is quite possible you forgot to activate the package that includes that function. 569 | 570 | ::: {.callout-note icon=false collapse=true} 571 | ## Packages used in each chapter 572 | 573 | Some packages, like the *tidyverse* (R) and *pandas* (Python) packages for data handling are used in almost every chapter. 574 | Many chapters also introduce specific packages such as *igraph*/*networkx* for network analysis in @sec-chap-network. 575 | To make it easy to keep track of the packages needed for each chapter, 576 | every chapter that includes code in this book starts with a note like this that gives an overview of the main packages introduced in that chapter. 577 | It also includes the code needed to install these packages, which of course is only needed if you didn't install these packages before. 578 | Note again that if you are using Anaconda for Python, 579 | you should replace `!pip3 install` with `!conda install` in that code. On some systems, you may need to use `!pip install` instead of `!pip3 install`. 580 | 581 | These notes also includes a code block to import all the packages used for that chapter, 582 | which you need to run every time you use examples from that chapter. 583 | ::: 584 | 585 | [^1]: https://cssbook.net 586 | 587 | [^2]: [www.jetbrains.com/pycharm/](https://www.jetbrains.com/pycharm/) 588 | 589 | [^3]: [www.anaconda.com/distribution/\#download-section](https://www.anaconda.com/distribution/\#download-section) 590 | 591 | [^4]: `CRAN`, short for Comprehensive R Archive Network, is a network 592 | of websites on which R itself and various R packages are hosted. 593 | 594 | [^5]: For linux, install python3 and pip using your package manager. For example, on ubuntu you can run `sudo apt install python3-pip` 595 | 596 | [^6]: Similar to the App Store or Play Store, both R and Python have a centralized repository for third party packages. For R, this is the Comprehensive R Archive Network (CRAN) encountered earlier, 597 | while for Python this is the Python Package Index (PyPI) accessed by `pip`. Normally, all packages in these repositories are open source and safe to install. 
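As a quick recap of the distinction made above between *installing* a package (done only once) and *activating* it (done in every session), here is a minimal Python sketch using the *pandas* package installed earlier:

```python
# Install once, from a terminal (or with a leading ! in a Jupyter cell):
#   pip3 install pandas
# Activate ("import") in every script or notebook that uses the package:
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
print(df.mean())
```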
598 | 599 | -------------------------------------------------------------------------------- /content/chapter02.qmd: -------------------------------------------------------------------------------- 1 | # Getting started: Fun with data and visualizations {#sec-chap-fundata} 2 | 3 | ```{python} 4 | #| echo: false 5 | import warnings; warnings.filterwarnings('ignore') 6 | ``` 7 | 8 | **Abstract.** This chapter is a lightning tour of some of the cool (and informative) things you can do with R and Python. Starting from a dataset of tweets about COVID-19, we show how you can analyze this data using text analysis, network analysis, and using geographic information. The goal of this chapter is not to teach you all these techniques in detail, rather, each of the examples showcases a possibility and guides you to the chapter where it will be explained in more detail. So don't worry too much about understanding every line of code, but relax and enjoy the ride! 9 | 10 | **Keywords.** basics of programming, data analysis 11 | 12 | **Objectives:** 13 | 14 | - Get an overview of the possibilities of R and Python for data analysis and visualization 15 | - Understand how different aspects of data gathering, cleaning, and analysis work together 16 | - Have fun with data and visualizations! 17 | 18 | ::: {.callout-note icon="false" collapse="true"} 19 | ## Packages used in this chapter 20 | 21 | Since this chapter showcases a wide variety of possibilities, it relies on quite a number of third party packages. If needed, you can install these packages with the code below (see Section [-@sec-installing] for more details): 22 | 23 | ::: panel-tabset 24 | ## Python code 25 | 26 | ```{python chapter02install-python} 27 | #| eval: false 28 | !pip3 install pandas matplotlib geopandas 29 | !pip3 install descartes shifterator 30 | !pip3 install wordcloud gensim nltk networkx 31 | ``` 32 | 33 | ## R code 34 | 35 | ```{r chapter02install-r} 36 | #| eval: false 37 | install.packages(c("tidyverse", "igraph","maps", 38 | "quanteda", "quanteda.textplots", 39 | "quanteda.textstats", "topicmodels")) 40 | ``` 41 | ::: 42 | 43 | After installing, you need to import (activate) the packages every session: 44 | 45 | ::: panel-tabset 46 | ## Python code 47 | 48 | ```{python chapter02library-python} 49 | import re 50 | import pandas as pd 51 | import matplotlib.pyplot as plt 52 | from collections import Counter, defaultdict 53 | from wordcloud import WordCloud 54 | from gensim import corpora, models 55 | import geopandas as gpd 56 | import shifterator as sh 57 | import nltk 58 | from nltk.corpus import stopwords 59 | import networkx as nx 60 | 61 | ``` 62 | 63 | ## R code 64 | 65 | ```{r chapter02library-r} 66 | library(tidyverse) 67 | library(lubridate) 68 | library(quanteda) 69 | library(quanteda.textplots) 70 | library(quanteda.textstats) 71 | library(topicmodels) 72 | library(igraph) 73 | library(maps) 74 | ``` 75 | ::: 76 | ::: 77 | 78 | ## Fun With Tweets {#sec-funtweets} 79 | 80 | The goal of this chapter is to showcase how you can use R or Python to quickly and easily run some impressive analyses of real world data. For this purpose, we will be using a dataset of tweets about the COVID pandemic that is engulfing much of the world at the time this book is written. Of course, tweets are probably only representative for what is said on Twitter, but the data are (semi-)public and rich, containing text, location, and network characteristics. 
This makes them ideal for exploring the many ways in which we can analyze and visualize information with Python and R. 81 | 82 | Example [-@exm-funtweets] shows how you can read this dataset into memory using a single command. Note that this does not retrieve the tweets from Twitter itself, but rather downloads our cached version of the tweets. In Chapter [-@sec-chap-scraping] we will show how you can download tweets and location data yourself, but to make sure we can get down to business immediately we will start from this cached version. 83 | 84 | ::: {.callout-note appearance="simple" icon="false"} 85 | ::: {#exm-funtweets} 86 | Retrieving cached tweets about COVID 87 | 88 | ::: panel-tabset 89 | ## Python code 90 | 91 | ```{python tweets-python} 92 | tw = pd.read_csv("https://cssbook.net/d/covid.csv") 93 | tw.head() 94 | 95 | ``` 96 | 97 | ## R code 98 | 99 | ```{r tweets-r} 100 | tw = read_csv("https://cssbook.net/d/covid.csv") 101 | head(tw) 102 | ``` 103 | ::: 104 | ::: 105 | ::: 106 | 107 | As you can see, the dataset contains almost 10000 tweets, listing their sender, their location and language, the text, the number of retweets, and whether it was a reply. You can read the start of the three most retweeted messages, which contain one (political) tweet from India and two seemingly political and factual tweets from the United States. 108 | 109 | **My first bar plot.** Before diving into the textual, network, and geographic data in the dataset, let's first make a simple visualization of the date on which the tweets were posted. Example [-@exm-funtime] does this in two steps: first, the number of tweets per hour is counted with an aggregation command. Next, a bar plot is made of this calculated value with some options to make it look relatively clean and professional. If you want to play around with this, you can for example try to plot the number of tweets per language, or create a line plot instead of a bar plot. For more information on visualization, please see Chapter [-@sec-chap-eda]. See Chapter [-@sec-chap-datawrangling] for an in-depth explanation of the aggregation command. 110 | 111 | ::: {.callout-note appearance="simple" icon="false"} 112 | ::: {#exm-funtime} 113 | Barplot of tweets over time 114 | 115 | ::: panel-tabset 116 | ## Python code 117 | 118 | ```{python funtime-python} 119 | #| results: hide 120 | tw.index = pd.DatetimeIndex(tw["created_at"]) 121 | tw["status_id"].groupby(pd.Grouper(freq="H")).count().plot(kind="bar") 122 | # (note the use of \ to split a long line) 123 | 124 | ``` 125 | 126 | ## R code 127 | 128 | ```{r funtime-r} 129 | tweets_per_hour = tw %>% 130 | mutate(hour=round_date(created_at, "hour")) %>% 131 | group_by(hour) %>% summarize(n=n()) 132 | ggplot(tweets_per_hour, aes(x=hour, y=n)) + 133 | geom_col() + theme_classic() + 134 | xlab("Time") + ylab("# of tweets") + 135 | ggtitle("Number of COVID tweets over time") 136 | ``` 137 | ::: 138 | ::: 139 | ::: 140 | 141 | ## Fun With Textual Data {#sec-funtext} 142 | 143 | **Corpus Analysis.** Next, we can analyze which hashtags are most frequently used in this dataset. Example [-@exm-funcloud] does this by creating a *document-term matrix* using the package *quanteda* (in R) or by manually counting the words using a defaultdict (in Python). The code shows a number of steps that are made to create the final results, each of which represent researcher choices about which data to keep and which to discard as noise. 
In this case, we select English tweets, convert text to lower case, remove stop words, and keep only words that start with #, while dropping words starting with `#corona` and `#covid`. To play around with this example, see if you can adjust the code to e.g. include all words or only at-mentions instead of the hashtags and make a different selection of tweets, for example Spanish language tweets or only popular (retweeted) tweets. Please see Chapter [-@sec-chap-dtm] if you want to learn more about corpus analysis, and see Chapter [-@sec-chap-datawrangling] for more information on how to select subsets of your data. 144 | 145 | ::: {.callout-note appearance="simple" icon="false"} 146 | ::: {#exm-funcloud} 147 | My First Tag Cloud 148 | 149 | ::: panel-tabset 150 | ## Python code 151 | 152 | ```{python funcloud-python} 153 | #| results: hide 154 | freq = defaultdict(int) 155 | for tweet in tw["text"]: 156 | for tag in re.findall("#\w+", tweet.lower()): 157 | if not re.search("#covid|#corona", tag): 158 | freq[tag] += 1 159 | wc = WordCloud().generate_from_frequencies(freq) 160 | plt.imshow(wc, interpolation="bilinear") 161 | plt.axis("off") 162 | plt.show() 163 | ``` 164 | 165 | ## R code 166 | 167 | ```{r funcloud-r} 168 | dtm_tags = filter(tw, lang=="en") %>% 169 | corpus() %>% tokens() %>% 170 | dfm(tolower = T) %>% 171 | dfm_select(pattern = "#*") %>% 172 | dfm_remove(c("#corona*", "#covid*")) 173 | textplot_wordcloud(dtm_tags, max_words=100) 174 | ``` 175 | ::: 176 | ::: 177 | ::: 178 | 179 | **Topic Model.** Where a word cloud (or tag cloud) shows which words occur most frequently, a `topic model` analysis shows which words co-occur in the same documents. Using the most common topic modeling algorithm, Latent Dirichlet Allocation or LDA, Example [-@exm-funlda] explores the tweets by automatically clustering the tags selected earlier into 10 *topics*. Topic modeling is non-deterministic -- if you run it again you can get slightly different topics, and topics are swapped around randomly as the topic numbers have no special meaning. By setting the computer's *random seed* you can ensure that if you run it again you get the same results. As you can see, some topics seem easily interpretable (such as topic 2 about social distancing, and topic 8 on health care), it is always recommended that you inspect the clustered documents and edge cases in addition to the top words (or tags) as shown here. You can play around with this example by using a different selection of words (modifying the code in Example [-@exm-funcloud]) or changing the number of topics. You can also change (or remove) the random seed and see how running the same model multiple times will give different results. See @sec-unsupervised for more information about fitting, interpreting, and validating topic models. 
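To make the input of such a topic model more concrete before turning to the example below, here is a tiny self-contained sketch (with two made-up "documents" rather than the tweets) of the bag-of-words representation that LDA works on, using the same *gensim* calls as the Python example that follows:

```python
from gensim import corpora

# two toy documents, already tokenized into hashtags
docs = [["#lockdown", "#stayhome"], ["#lockdown", "#vaccine"]]
voca = corpora.Dictionary(docs)        # assigns an integer id to every tag
bow = [voca.doc2bow(d) for d in docs]  # per document: a list of (id, count) pairs
print(voca.token2id)
print(bow)
```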
180 | 181 | ::: {.callout-note appearance="simple" icon="false"} 182 | ::: {#exm-funlda} 183 | Topic Model of the COVID tags 184 | 185 | ::: panel-tabset 186 | ## Python code 187 | 188 | ```{python funlda-python} 189 | tags = [ 190 | [tag.lower() for tag in re.findall("#\w+", tweet)] for tweet in tw["text"] 191 | ] 192 | voca = corpora.Dictionary(tags) 193 | corpus = [voca.doc2bow(doc) for doc in tags] 194 | m = models.LdaModel( 195 | corpus, num_topics=10, id2word=voca, distributed=False, random_state=123 196 | ) 197 | for topic, words in m.print_topics(num_words=3): 198 | print(f"{topic}: {words}") 199 | 200 | ``` 201 | 202 | ## R code 203 | 204 | ```{r funlda-r} 205 | set.seed(1) 206 | m = convert(dtm_tags, to="topicmodel") %>% 207 | LDA(10, method="Gibbs") 208 | terms(m, 5) 209 | ``` 210 | ::: 211 | ::: 212 | ::: 213 | 214 | ## Fun With Visualizing Geographic Information {#sec-fungeo} 215 | 216 | For the final set of examples, we will use the location information contained in the Twitter data. This information is based on what Twitter users enter into their profile, and as such it is incomplete and noisy, with many users giving a nonsensical location such as "Ethereally here" or not filling in any location at all. However, if we assume that the users who do enter a proper location (such as Lahore or Florida in the top tweets displayed above) mostly give their real location, we can use it to map where most tweets are coming from. 217 | 218 | The first step in this analysis is to resolve a name such as "Lahore, Pakistan" to its geographical coordinates (in this case, about 31 degrees north and 74 degrees east). This is called geocoding, and both Google Maps and OpenStreetMap can be used to perform this automatically. As with the tweets themselves, we will use a cached version of the geocoding results here so we can proceed directly. Please see https://cssbook.net/datasets for the code that was used to create this file so you can play around with it as well; a minimal sketch of such a geocoding call is also shown below. 219 | 220 | Example [-@exm-funmap] shows how this data can be used to create a map of Twitter activity. First, the cached user data is retrieved, showing the correct location for Lahore but also illustrating the noisiness of the data with the location "Un peu partout". Next, this data is `joined` to the Twitter data, so the coordinates are filled in where known. Finally, we plot this information on a map, showing tweets with more retweets as larger dots. See Chapter [-@sec-chap-eda] for more information on visualization.
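The geocoding itself is not part of this chapter's examples, but to give an impression of what such a call looks like, here is a minimal sketch. It uses the third-party *geopy* package (not used elsewhere in this book) to query OpenStreetMap's Nominatim service; treat it as an illustration under those assumptions rather than the exact code used to build the cached file:

```python
# pip3 install geopy
from geopy.geocoders import Nominatim

# Nominatim asks for a descriptive user_agent; "cssbook-example" is just a placeholder
geolocator = Nominatim(user_agent="cssbook-example")
location = geolocator.geocode("Lahore, Pakistan")
if location is not None:
    print(location.latitude, location.longitude)  # roughly 31 N, 74 E
```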
221 | 222 | ::: {.callout-note appearance="simple" icon="false"} 223 | ::: {#exm-funmap} 224 | Location of COVID tweets 225 | 226 | ::: panel-tabset 227 | ## Python code 228 | 229 | ```{python funmap-python} 230 | #| results: hide 231 | url = "https://cssbook.net/d/covid_users.csv" 232 | users = pd.read_csv(url) 233 | tw2 = tw.merge(users, on="screen_name", how="left") 234 | world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) 235 | gdf = gpd.GeoDataFrame(tw2, geometry=gpd.points_from_xy(tw2.long, tw2.lat)) 236 | ax = world.plot(color="white", edgecolor="black", figsize=(10, 10)) 237 | gdf.plot(ax=ax, color="red", alpha=0.2, markersize=tw["retweet_count"]) 238 | plt.show() 239 | 240 | ``` 241 | 242 | ## R code 243 | 244 | ```{r funmap-r} 245 | url = "https://cssbook.net/d/covid_users.csv" 246 | users = read_csv(url) 247 | tw2 = left_join(tw, users) 248 | ggplot(mapping=aes(x=long, y=lat)) + 249 | geom_polygon(aes(group=group), 250 | data=map_data("world"), 251 | fill="lightgray", colour = "white") + 252 | geom_point(aes(size=retweet_count, 253 | alpha=retweet_count), 254 | data=tw2, color="red") + 255 | theme_void() + theme(aspect.ratio=1) + 256 | guides(alpha=FALSE, size=FALSE) + 257 | ggtitle("Location of COVID tweets", 258 | "Size indicates number of retweets") 259 | ``` 260 | ::: 261 | ::: 262 | ::: 263 | 264 | **Combining textual and structured information.** Since we know the location of a subset of our tweet's users, we can differentiate between e.g. American, European, and Asian tweets. Example [-@exm-funcompare] creates a very rough identification of North American tweets, and uses that to compute the relative frequency of words in those tweets compared to the rest. Not surprisingly, those tweets are much more about American politics, locations, and institutions. The other tweets talk about UK politics but also use a variety of names to refer to the pandemic. To play around with this, see if you can isolate e.g. Asian or South American tweets, or compare Spanish tweets from different locations. 265 | 266 | ::: {.callout-note appearance="simple" icon="false"} 267 | ::: {#exm-funcompare} 268 | Corpus comparison: North American tweets vs. the rest 269 | 270 | ::: panel-tabset 271 | ## Python code 272 | 273 | ```{python funcompare-python} 274 | #| results: hide 275 | nltk.download("stopwords") 276 | cn = gdf.query("lang=='en'&(long<-60 & lat>25)") 277 | cn = Counter(cn["text"].str.cat().lower().split()) 278 | cr = gdf.query("lang=='en' & (long>-60 | lat<25)") 279 | cr = Counter(cr["text"].str.cat().lower().split()) 280 | for k in stopwords.words("english"): 281 | del cn[k] 282 | del cr[k] 283 | key = sh.ProportionShift(type2freq_1=cn, type2freq_2=cr) 284 | # WvA: It looks like shifterator is not working anymore? 285 | # key.get_shift_graph().plot() 286 | 287 | ``` 288 | 289 | ## R code 290 | 291 | ```{r funcompare-r} 292 | dfm = tw2 %>% mutate(northamerica=ifelse( 293 | long < -60 & lat > 25,"N. America","Rest"))%>% 294 | filter(lang=="en") %>% 295 | corpus(docid_field="status_id") %>% 296 | tokens(remove_punct=T) %>% 297 | tokens_group(northamerica) %>% 298 | dfm(tolower=T) %>% 299 | dfm_remove(stopwords("en")) %>% 300 | dfm_select(min_nchar=4) 301 | key = textstat_keyness(dfm, target="N. 
America") 302 | textplot_keyness(key, margin=0.2) + 303 | ggtitle("Words preferred by North Americans", 304 | "(Only English-language tweets)") + 305 | theme_void() 306 | 307 | ``` 308 | ::: 309 | ::: 310 | ::: 311 | 312 | ## Fun With Networks {#sec-funnet} 313 | 314 | Twitter, of course, is a social network as well as a microblogging service: users are connected to other users because they follow each other and retweet and like each others' tweets. Using the `reply_to_screen_name` column, we can inspect the reply network contained in the COVID tweet dataset. Example [-@exm-fungraph] first uses the data summarization commands from tidyverse (R) and pandas (Python) to create a data frame of connections or `edges` listing how often each user replies to each other user. The second code block shows how the *igraph* (R) and *networkx* (Python) packages are used to convert this edge list into a graph. From this graph, we select only the largest connected component and use a clustering algorithm to analyze which nodes (users) form cohesive subnetworks. Finally, a number of options are used to set the color and size of the edges, nodes, and labels, and the resulting network is plotted. As you can see, the central node is Donald Trump, who is replied to by a large number of users, some of whom are in turn replied to by other users. You can play around with different settings for the plot options, or try to filter e.g. only tweets in a certain language. You could also easily compute social network metrics such as centrality on this network, and/or export the network for further analysis in specialized social network analysis software. See Chapter [-@sec-chap-network] for more information on network analysis, and Chapter [-@sec-chap-datawrangling] for the summarization commands used to create the edge list. 315 | 316 | ::: {.callout-note appearance="simple" icon="false"} 317 | ::: {#exm-fungraph} 318 | Reply network in the COVID tweets.
319 | 320 | ::: panel-tabset 321 | ## Python code 322 | 323 | ```{python fungraph-python} 324 | edges = tw2[["screen_name", "reply_to_screen_name"]] 325 | edges = edges.dropna().rename( 326 | {"screen_name": "from", "reply_to_screen_name": "to"}, axis="columns" 327 | ) 328 | edges.groupby(["from", "to"]).size().head() 329 | 330 | ``` 331 | 332 | ## R code 333 | 334 | ```{r fungraph-r} 335 | edges = tw2 %>% 336 | select(from=screen_name, 337 | to=reply_to_screen_name) %>% 338 | filter(to != "") %>% 339 | group_by(to, from) %>% 340 | summarize(n=n()) 341 | head(edges) 342 | ``` 343 | ::: 344 | 345 | ::: panel-tabset 346 | ## Python code 347 | 348 | ```{python fungraphb-python} 349 | #| results: hide 350 | g1 = nx.Graph() 351 | g1.add_edges_from(edges.to_numpy()) 352 | largest = max(nx.connected_components(g1), key=len) 353 | g2 = g1.subgraph(largest) 354 | 355 | pos = nx.spring_layout(g2) 356 | plt.figure(figsize=(20, 20)) 357 | axes_info = plt.axis("off") 358 | sizes = [s * 1e4 for s in nx.centrality.degree_centrality(g2).values()] 359 | nx.draw_networkx_nodes(g2, pos, node_size=sizes) 360 | edge_info = nx.draw_networkx_labels(g2, pos) 361 | nx.draw_networkx_edges(g2, pos) 362 | plt.show() 363 | 364 | ``` 365 | 366 | ## R code 367 | 368 | ```{r fungraphb-r} 369 | # create igraph and select largest component 370 | g = graph_from_data_frame(edges) 371 | components <- decompose.graph(g) 372 | largest = which.max(sapply(components, gsize)) 373 | g2 = components[[largest]] 374 | # Color nodes by cluster 375 | clusters = cluster_spinglass(g2) 376 | V(g2)$color = clusters$membership 377 | V(g2)$frame.color = V(g2)$color 378 | # Set node (user) and edge (arrow) size 379 | V(g2)$size = degree(g2)^.5 380 | V(g2)$label.cex = V(g2)$size/3 381 | V(g2)$label = ifelse(degree(g2)<=1,"",V(g2)$name) 382 | E(g2)$width = E(g2)$n 383 | E(g2)$arrow.size= E(g2)$width/10 384 | plot(g2) 385 | ``` 386 | ::: 387 | ::: 388 | ::: 389 | 390 | **Geographic networks.** In the final example of this chapter, we will combine the geographic and network information to show which regions of the world interact with each other. For this, in Example [-@exm-fungeonet] we join the user information to the edges data frame created above twice: once for the sender, once for the replied-to user. Then, we adapt the earlier code for plotting the map by adding a line for each node in the network. As you can see, users in the main regions (US, EU, India) mostly interact with each other, with almost all regions also interacting with the US. 
391 | 392 | ::: {.callout-note appearance="simple" icon="false"} 393 | ::: {#exm-fungeonet} 394 | Reply Network of Tweets 395 | 396 | ::: panel-tabset 397 | ## Python code 398 | 399 | ```{python fungeonet-python} 400 | #| results: hide 401 | u = users.drop(["location"], axis=1) 402 | uf = u.rename( 403 | {"screen_name": "from", "lat": "lat_from", "long": "long_from"}, axis=1 404 | ) 405 | ut = u.rename({"screen_name": "to", "lat": "lat_to", "long": "long_to"}, axis=1) 406 | edges = edges.merge(uf).merge(ut).query("long_to!=long_from & lat_to!=lat_from") 407 | 408 | world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) 409 | g_to = gpd.GeoDataFrame( 410 | edges.copy(), geometry=gpd.points_from_xy(edges.long_to, edges.lat_to) 411 | ) 412 | g_from = gpd.GeoDataFrame( 413 | edges.copy(), geometry=gpd.points_from_xy(edges.long_from, edges.lat_from) 414 | ) 415 | 416 | ax = world.plot(color="white", edgecolor="black", figsize=(10, 10)) 417 | g_from.plot(ax=ax, color="red", alpha=0.2) 418 | g_to.plot(ax=ax, color="blue", alpha=0.2) 419 | 420 | e = g_from.join(g_to, lsuffix="_from", rsuffix="_to") 421 | e = e[["geometry_from", "geometry_to"]] 422 | px = lambda point: point.x 423 | py = lambda point: point.y 424 | 425 | # WVA: This code no longer works but gives 426 | # UnsupportedOperationException: getX called on empty Point 427 | # x_values = list(zip(e["geometry_from"].map(px), 428 | # e["geometry_to"].map(px))) 429 | # y_values = list(zip(e["geometry_from"].map(py), 430 | # e["geometry_to"].map(py))) 431 | # plt.plot(x_values, y_values, linewidth = 1, 432 | # linestyle = "-", color = "green", alpha=.3) 433 | # plt.show() 434 | 435 | ``` 436 | 437 | ## R code 438 | 439 | ```{r fungeonet-r} 440 | edges2 = edges %>% 441 | inner_join(users, by=c("from"="screen_name"))%>% 442 | inner_join(users, by=c("to"="screen_name"), 443 | suffix=c("", ".to")) %>% 444 | filter(lat != lat.to | long != long.to ) 445 | ggplot(mapping=aes(x = long, y = lat)) + 446 | geom_polygon(aes(group=group),map_data("world"), 447 | fill="lightgray", colour = "white") + 448 | geom_point(aes(size=retweet_count, 449 | alpha=retweet_count), data=tw2, color="red")+ 450 | geom_curve(aes(xend=long.to,yend=lat.to,size=n), 451 | edges2, curvature=.1, alpha=.5) + 452 | theme_void() + guides(alpha=FALSE, size=FALSE) + 453 | ggtitle("Retweet network of COVID tweets", 454 | "Bubble size indicates total no. of retweets") 455 | ``` 456 | ::: 457 | ::: 458 | ::: 459 | -------------------------------------------------------------------------------- /content/chapter04.qmd: -------------------------------------------------------------------------------- 1 | 2 | # How to write code {#sec-chap-worldcode} 3 | 4 | ```{python} 5 | #| echo: false 6 | import warnings; warnings.filterwarnings('ignore') 7 | ``` 8 | 9 | **Abstract.** 10 | Programming is no longer a solitary activity, and almost all questions, problems, and error messages have been encountered and solved before. This chapter explains the most common forms of collaboration and sources of outside help, as well as outlining best practice on how to write and share code yourself. 
11 | 12 | **Keywords.** package, library, errors, computational hygiene, notebooks 13 | 14 | **Objectives:** 15 | 16 | - Understand the importance of re-using code when programming 17 | - Help beginning coders avoid getting stuck 18 | - Explain "computational hygiene" and show best practices in R and Python to write and share code 19 | 20 | In Chapter [-@sec-chap-programmingconcepts], you have learned how to write 21 | your first lines of code. You created objects of different types, 22 | used and wrote some functions, and explored the major control structures. 23 | You are probably eager to write your first longer piece of code and 24 | produce some interesting data processing or analysis script. In this 25 | chapter, we prepare you to do this with as little frustration as possible. 26 | You will be introduced to some best practices so that you can implement 27 | them right from the start, and to some tools that will make your life easier. 28 | 29 | First, we will answer the question "how do you avoid reinventing 30 | the wheel": when is it appropriate to simply use someone else's existing code, and 31 | when do you need to write your own code from scratch? And is there a middle ground? 32 | Second, we will discuss how to turn error messages -- which you will inevitably 33 | see a lot -- from a frustrating annoyance into a helpful tool. 34 | Finally, we will discuss some best practices when writing code. 35 | 36 | ## Re-using Code: How Not to Re-Invent the Wheel {#sec-code} 37 | 38 | Just as in any human language, programming languages also consist of a vocabulary, syntax rules, and expressions. Using the proper words and grammar, you can build from scratch any idea your imagination allows. That's a wonderful thing! But, let's be honest: the language itself, the expressions, ideas, and all the abstract constructs seldom come originally from you. And in fact, that's great as well: otherwise, you'd have to deeply think of every element before talking and expressing any thought. Instead, you use pre-existing rules, ideas, perceptions, and many different narratives to create your own messages to interact with the world. It's the same with coding: you never start from scratch. 39 | 40 | Of course you *can* code anything you want from the very beginning, even just using 0's and 1's! 41 | When reading through the previous chapters, maybe you even started to think that complex operations will be exhausting and will take a really long time. After all, from the basic operations we did to a useful statistical model seems like a long way to go. 42 | 43 | Luckily, this is not the case. 44 | There is almost no project in which computational scientists, data analysts, or developers do not re-use earlier code in order to achieve their goals more quickly and efficiently. 45 | The more common a task is, the greater the chance that you do not have to re-invent the wheel. 46 | Of course, you have to give credit where credit is due, but it is not uncommon to paste code snippets from others into your own code and adapt them. This is especially true for standard operations, for which there are only so many ways to achieve the desired result. 47 | 48 | There are different ways to re-use earlier code. One is to copy and adapt raw lines of code written by someone else or by yourself in the past. In fact, there are many online repositories such as GitHub or BitBucket that contain many programs and well-documented code examples (see Section [-@sec-practices]). 
When conducting computational analyses, you will spend a significant part of your time in such repositories trying to understand what others have done and figuring out how you can use it in your own work. Of course, make sure that the license of the code allows you to use it in the way you want. Also, give credit where credit is due: at the very least, place a comment with a link in your code to indicate what inspired you. 49 | 50 | Another way is to build or import functions that summarize many lines of code into a simpler command, as we explained in Section [-@sec-functions]. Functions are indeed powerful ways of reusing code, since you do not have to write the same code over and over again if you need it in multiple places. Packages are probably the most elegant approach to recycle the work done by other colleagues. In Section [-@sec-installing] you already learned how to install a package, and you probably noticed how easy it is to bring many pre-built functionalities onto your workspace. You can also write and publish your own package in the future to help your colleagues to write less code and to be more efficient in their daily job (see also Section [-@sec-publishingsource])! 51 | 52 | Many questions can arise here: what to re-use? When to use a function written by someone else instead of writing the code yourself? Which scripts and sources are trustworthy? Which is the best package to choose? How many packages should we use within the same project? Should we care about package versions? And must we know every package that is released in our field? There are of course multiple answers to these questions and it will be probably a matter of practice how to obtain the most appropriate ones. In general, we can say that one premise is to re-use and share code as much as you can. This idea is limited by constraints of quality, availability, parsimony, updates, and expertise. In other words, when recycling code we should think of the reputation of the source, the difficulty of accessing it, the risk of having an excessive and messy number of inputs, the need to share the last developments with your colleagues, and the fact that you will never be able to know everything. 53 | 54 | Let's take an example. Imagine you want to compute the Levenshtein distance between two 55 | strings. That's a pretty straightforward metric that answers the question: "How many 56 | edits (removing/changing/inserting characters) do I need to apply to transform string1 into string2?" 57 | It can be used for plagiarism detection, but may be interesting for us to determine, for instance, 58 | whether a newspaper copied some content from somewhere else, even if small changes have been 59 | applied. You could now try to write some code to calculate that (and we are sure you could 60 | do that if you invested some time in it!), but it is such a common problem that it 61 | has been solved multiple times before. You could, for instance, look up some 62 | functions that are known to solve the problem and copy-paste them into your code. You can find 63 | a large number of different implementations for both Python and R here: 64 | [en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein\_distance](https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein\_distance). 65 | You can then choose and copy-paste the one which is most appropriate for you. One that is very fast, because 66 | you want to compare a huge set of strings? One that is easy to understand? 
One that uses 67 | only a few lines of code to not distract the reader? 68 | Alternatively, if you look for available packages for Python and R, you see that there are 69 | multiple packages that you can install with `install.packages` (R) or `pip` (Python) 70 | and then import. If you go for that route, you don't need 71 | to care about the internal workings and can "abstract away" and outsource the problem -- on the other 72 | hand, the users of your code now have one more dependency to install before they can 73 | use your code. 74 | 75 | In the case of package selection, we understand it can be quite overwhelming, 76 | with so many different packages from different contributors. 77 | In fact, sometimes the same task, such as topic modeling, 78 | can be done using multiple different packages. 79 | So, how to find and choose the best package? 80 | Besides resources like this book, the most important guide is probably the community around you: 81 | using packages that a lot of other people also use means that the package is probably well maintained and documented, 82 | and that there is a community to ask for help if needed. 83 | Since all packages on Pypi and CRAN are free to download and install, however, you can also shop around and see what the various packages do. 84 | When comparing different packages, it is always good to check their documentation and their GitHub page: 85 | packages that are well documented and that are updated frequently are often a good choice. 86 | 87 | For example, the authors of this book had several intensive discussions of which packages to mention and use in the proposed exercises, an issue that became complex given the variety of topics addressed in this book. In the case of text analysis, a library such as `NLTK` for Python was incredibly popular among computational analysts until a few years ago becoming a package of reference in the field, but it has -- at least for some applications -- been overpassed by friendly and sophisticated new packages for natural language processing like `SpaCy`. So, which should we have included in this book? The one which is well-known (with excellent documentation by the way) and still used by thousands of practitioners and students around the world, or the one which is penetrating the market because of its easiness and advantages? Moreover, when choosing the second option, are we sure a more trendy package is going to be stable in time or is it going to be superseded by a different one in just few months? 88 | 89 | There isn't the one golden way of how to re-use code and packages, but this dynamic scenario also depicts an exciting and provocative field that forces us to keep ourselves up to date. 90 | 91 | ## Understanding Errors and Getting Help {#sec-errors} 92 | 93 | Even though re-using code makes writing programs easier and less error-prone, every programmer makes mistakes. 94 | Programming can be a frustrating endeavor, and you will encounter error messages, bugs, and problems that you don’t know how to fix. This section shows how error messages are useful rather than scary and lists the main error messages encountered in the beginning. It explains how to search for help within the R/Python documentation, how to use online resources, and how to formulate questions to your instructor or community so you get a useful answer. 95 | 96 | If you tried out some of the concepts in Chapter [-@sec-chap-programmingconcepts], you have probably already come across some typical or basic errors in programming. 
Maybe you tried to call a function from a library that you forgot to load before, or maybe you tried to multiply a string with a float. 97 | There are thousands of errors that you will encounter, and there is no exhaustive list of them, so you won't find a complete structured catalogue to solve your problems when coding. This might seem a rough road for any scientist but in fact you will get used to finding the answers by different means. 98 | 99 | ### Error Messages 100 | 101 | There are two common strategies to *avoid getting stuck* and move on with your task: one is to understand the *type* of error you are getting, and the other is to know *where* to go to obtain valuable help. We would add a third one: be patient and do not despair! 102 | 103 | Both R and Python produce warning or error messages when something is wrong in your code. Beginning computational researchers may sometimes feel afraid, confused, or even frustrated when they get such a *painful* message (we have all felt this) and some then would become so anxious that they don't pay enough attention to the text of the error message thinking it will not be helpful to solve the problem and blaming themselves for not being a perfect programmer. But the more you code, the more you realize that getting these error messages is just part of the routine and that it is very useful to carefully read the warning instead of skipping it. 104 | 105 | In most cases, the error message in your console will tell you exactly where the problem is: a specific line or operation within your code. With this information in many cases you will quickly identify what the problem is about and you will know how to solve it. One of the most common causes for errors is just very silly typos! 106 | 107 | Next to the location (the line number), the error message will also tell you more about the problem. For example, when trying to multiply the float object `a` by the string object `b` you will get "Error in a * b : non-numeric argument to binary operator" in R or "TypeError: can't multiply sequence by non-int of type 'float' " in Python. As intimidating as this language may sound in the first place, if you re-read it, you will realize that it, in fact, explains exactly what went wrong. This helps you to understand what you did wrong and enable you to fix it. 108 | 109 | If you get a warning error that you don't understand or get an incorrect result in your code you have three options to get more information: use the `help` commands to know more about any object or function (`help(object)` in both R and Python); read the documentation of base R, base Python or of any individual package (there are plenty of them online!); and look at the wonderful community of worldwide coders, read what they have discussed so far or even pose a question to *challenge* their minds. 110 | 111 | Let's consider this third option. Imagine you read the text of an error message and you feel you don't understand it. 112 | It may be because the wording is too complex or because it just gives an "error code" 113 | (i.e. "error 401 - Unauthorized" when trying to connect to the Twitter API). 114 | If your first thought is to try searching for it in Google, then this is completely correct: 115 | it might take you to code documentation, or better to an online discussion in sites such as Stack Overflow, 116 | which is a useful question and answer website for coders (see Figure [-@fig-stackover]). 
117 | It is very likely that some colleagues have already posed a question about the meaning of that error 118 | and others have already provided an answer explaining what it means and, especially, *how to* fix it. 119 | 120 | ![An online discussion on Stack Overflow about a warning message](img/ch4_stackover.png){#fig-stackover} 121 | 122 | Depending on the complexity and novelty of your problem you might find a helpful answer in a few minutes or it might take you hours. Never get desperate if you visit many discussions without understanding everything directly: you may have to come back to some of them after reading the others. Moreover, some answers will include the exact code you need (ready for copy-and-paste), code to be adapted (e.g. changing the names of your variables), or pseudocode (an informal description of the code). In all of these cases you will be responsible for making sense of the huge (and sometimes messy) amount of sources you will come across. 123 | 124 | It is of course possible that you don't find what you need in previous discussions. In that case you can create your own question and wait for someone to reply. If you decide to do this, take some advice into account. First, be sure that the answer is not elsewhere within the same website (a first answer could be just a link to a previous post!). Second, don't worry that your question is silly or too basic: you will find all kinds of coders in the community, from those who are taking their first steps to those who are very advanced. Third, be clear, specific, and focus on what you need to solve. This is probably the most important advice, since other coders need to understand what you need in a few words (rather than philosophical discussions or elaborate rationales) so they can decide to spend some minutes of their time and help you. It is very common practice to copy the warning message or the code you are having trouble with into your question, because peers can then even fix it themselves and give you the solution right away. Do not worry if your post keeps receiving replies after you have gotten what you needed: the thread might also help others in the future! 125 | 126 | ### Debugging Strategies 127 | 128 | It's not always straightforward to understand what is going 129 | wrong. Maybe your script does not even produce an error 130 | message, but just produces some unexpected result. 131 | 132 | Of course, every program is different and there is not one 133 | way to solve every issue, but there are some simple strategies 134 | that help you debug your code. The underlying core 135 | principle is to better understand what exactly is happening. 136 | 137 | - Print more. For example, if you have a for-loop, just add a print statement to the loop that prints the current value that is processed, or some other information that helps you understand exactly what data are processed, or what intermediate result is achieved at which step. There are more advanced tools for keeping track of values, such as the so-called *debuggers* in advanced IDEs or the *logging* module in Python, but a couple of extra print functions can serve the same purpose. 138 | - Keep track of which code blocks have been executed how often. For instance, maybe you have some *if* statement, but the condition is simply never True, so that the whole code block is never executed. You can create an integer with value 0 at the beginning of the code, and then increment it by one within the code block.
If you print it afterwards, you know how often the block has been visited. 139 | - Cut it down. Remove (comment out) everything that is not strictly necessary and see whether you can make a simplified version of your code run, before you extend it. 140 | - Add consistency checks. For instance, if from a theoretical point of view, two lists need to have the same length, check it with the length function; similarly, if you know that an object must have a specific value (e.g., because you know the result), check this assumption. 141 | 142 | Finally, when you know that some typical errors may arise and you don't want your script to stop or crash, you can add an `exception` in your code. Suppose for example that you are building a function to connect to an API (see Section [-@sec-apis]). There might be many reasons for getting an error, such as an Internet connection problem, a server issue, or a missing document. You might decide to skip the error and continue the next lines or you could even give more detailed instructions of what to do (i.e. wait five minutes and try again). The inclusion of these exceptions are in fact a good practice and will help your code to be more robust and stable. 143 | 144 | Let's make Example [-@exm-if1] from the previous chapter more robust so that it does not fail if an invalid headline is passed. For instance, in Python, the object `None` has no defined length; and in R, it is illegal to calculate the number of characters in a factor. It is a good idea to think about how you want to deal with this: either you want your script to just fail (and clean up the data), or you may want to deal with the error in some way. Especially if you have little control over the input data and/or if the process you are dealing with takes a long time, you may want to handle these errors rather than having your script fail. In Example [-@exm-tryexcept], we show how to use such a try/except construction: you indicate which code block you want to try (e.g., run as normal); and in the next block, you indicate what should happen if that results in an error. 145 | 146 | Note that using *try ... except* statements like this is fairly common in Python code, 147 | in R it is not needed as frequently. 148 | In many cases where a Python function like `int` raises an exception 149 | if the input cannot be converted to an integer, the R function `as.numeric` just returns a missing value. 150 | Thus, in R you normally only encounter these statements when using external resources, 151 | for example when using an API or scraping a web page. See Chapter [-@sec-chap-scraping] for more details on these topics. 152 | 153 | ::: {.callout-note appearance="simple" icon=false} 154 | 155 | ::: {#exm-tryexcept} 156 | Error handling. 
157 | 158 | ::: {.panel-tabset} 159 | ## Python code 160 | ```{python tryexcept-python} 161 | headlines = ( 162 | "US condemns terrorist attacks", 163 | None, 164 | "Venezuelan president is dismissed", 165 | ) 166 | 167 | for x in headlines: 168 | try: 169 | # Getting len of None will raise an error 170 | if len(x) < 40: 171 | print(x) 172 | except: 173 | print(f"{x} is not a valid headline") 174 | 175 | ``` 176 | ## R code 177 | ```{r tryexcept-r} 178 | headlines = list("US condemns terrorist attacks", 179 | NA, "Venezuelan president is dismissed") 180 | 181 | for (x in headlines){ 182 | tryCatch( 183 | # Getting nchar of NA will raise an error 184 | if (nchar(x)<40) print(x), 185 | error=function(error_condition) { 186 | print(paste(x, "is not a valid headline")) 187 | } 188 | ) 189 | } 190 | ``` 191 | ::: 192 | ::: 193 | ::: 194 | 195 | ## Best Practice: Beautiful Code, GitHub, and Notebooks {#sec-practices} 196 | 197 | This section gives a brief explanation of "computational hygiene": how to structure your code so you can understand it later, the importance of naming and documentation, the use of versioning and online repositories such as GitHub, and the use of literate programming (such as through the use of RMarkdown or Jupyter notebooks) to explain, share, and publish code. 198 | 199 | Coding is more than learning the basic rules and creating a message. If you want to use code to communicate ideas and to work with peers you have to take care of many content and shape details in order to guarantee the comprehension and reproducibility of the scripts. It even applies to the code you write for "private use" because it is highly likely that you will forget your original thoughts from one day to another, or that you later realize that you need to share it with someone else to ask for help. Thus instead of writing personal, hidden and *illegible* code without adopting any social conventions, you should dedicate some extra effort to make your scripts easy and ready to share. 200 | 201 | The first step of the computational hygiene is within the code itself. Every time you create an object, a variable, or a function, you have to take many apparently unimportant decisions such as giving a name, separating words, lines, or blocks, and including comments. These decisions are personal, but should mostly depend on social conventions in order to be useful. As you may imagine, there are many of these conventions for general programming and specially for specific languages. To mention just a few, you can find an "official" style guide for Python[^1] or Google's R style guide[^2]. Some of these guides are extensive (they cover every detail) and some are more general or abstract. You do not have to see them as a "bible" that needs to be strictly adhered to in each and every situation, but they offer very good guidance for best practice. In fact, even when you find them useful it is true that you will probably learn more of these practices from reading good examples, and especially from interacting with a specific community and its rules. 202 | 203 | We will mention some general guidelines that apply for both R and Python. If it is the first time you are venturing into the world of code you will find this advice useful, but if you are a more advanced learner you will probably get more specific knowledge in the more detailed sources for each language and community. 
204 | 205 | In the case of naming, we encourage you to use meaningful names or standard abbreviations for objects, using lower-case or mixed-case (remember both Python and R are case-sensitive!), avoiding special characters and operators (such as &, @ or %), and not exceeding 32 characters. You normally begin with a letter[^3] (an upper-case letter when defining a class), followed by other letters or numbers, and using underscores to separate the words if necessary (e.g. `data_2020` or `Filter_Text`). Some suggest that variable names should be nouns and function names should be verbs, which seems logical if you think of the nature of these objects. 206 | 207 | When writing code, please also take into consideration white space and indentation, because you should use them to give proper structure to the code by creating the block statements. In the case of R, also pay attention to the use of curly braces: the convention is that the opening curly brace begins after some code and is always followed by a new line; and the closing curly brace is on its own line except if there are more instructions in the block. Do not write very long lines (more than 80 characters) to help your code fit the screen and avoid lateral scrolling. Good separation of words, lines and blocks will make your script more readable! 208 | 209 | Now, if you want to make your code highly understandable and shareable, you have to include *documentation*. 210 | This is probably a very basic dimension of coding but unfortunately some authors forget to take the few minutes it takes to describe what their script does (and why), making your journey more difficult. 211 | An essential good practice in coding is to include enough information to clarify your code when it is not clear by itself. 212 | You can do this in different ways (even by writing a separate codebook), but the most natural and straightforward manner is to include some *comments* in the code. 213 | These comments should be included both at the beginning of the script, to give an overview or introduction to the code, 214 | and within the script (on separate lines or at the end of a line) to give specific guidance. 215 | In many cases, you will need to read your code later (for example when you need to revise an article or analysis), and a short time spent documenting your code will save you a lot of time later. 216 | 217 | R and Python use the hash sign `#` to create these comments. The comment will always begin after the hash. If the first character in your line is a `#`, all the text included will be considered a comment; but if you have already written some code in a line and include a `#` after the code, the initial code will be executed and you will always see the comment by its side. You will normally combine these two ways of documenting your script. 218 | As a rule of thumb, insert a comment if the code itself is not obvious, 219 | and explain the choices and intentions of the code. 220 | So, if a line says `df = df - 1`, a comment like *Decrease df by one* is not very useful (as that is obvious from the code), but a comment like *Remove one degree of freedom since we estimated the mean* does help, as it makes it clear why we are subtracting one from the `df` object. 221 | 222 | Additionally, Python and R encourage the use of so-called `docstrings`: 223 | In Python, place a string surrounded by triple quotation marks at the start of a function; in R, place a comment `#'` right above the function.
224 | [^4] 225 | In this documentation, you can explain what the function does and what parameters it requires. 226 | The nice thing is that if properly used, docstrings are automatically displayed in help functions and in automatically generated documentation. 227 | 228 | Another way to make your code more beautiful and, crucially, easier to re-use by others and yourself is to make your code as generic as possible. 229 | For instance, imagine you need to calculate the sum of the lengths of two texts, "Good morning!" and "Goodbye!". 230 | You could just write `x = 13 + 8`. But what if the strings change in the future? And how to remember what `13 + 8` was supposed to mean? 231 | Instead of using such *hardcoded* values, you can therefore write it better as `x = len("Good morning!") + len("Goodbye!")` (for R, replace `len` by `nchar`). 232 | But the strings themselves are still hardcoded, so you can create these strings and assign them the names `s1` and `s2` first, and then just calculate `x = len(s1) + len(s2)`. In practice, these types of generalization often involve the use of functions (Section [-@sec-functions]) and loops (Section [-@sec-loops]). So, don't use hard-coded values or "magic numbers": `circumference=6.28*r` is much less clear than `PI=3.14; circumference=2*PI*r`. 233 | 234 | Moreover, you must be aware that your code is *dynamic* and it will normally evolve over time. For example, you may have different files (.py or .R) containing different versions of your script, though this is normally inefficient and chaotic. In order to have more powerful control of versions and to track changes, coders usually use online repositories to host their scripts for private use and especially to share them. There are many of these sites, but we believe that GitHub[^5] is the most well-known and is preferred by data scientists (Figure [-@fig-github] shows the repository we used to write this book). 235 | 236 | ![The online repository GitHub.](img/ch04_github.png){#fig-github} 237 | 238 | Once you upload (or *commit and push*) your code to GitHub, you can access it from anywhere, and will be able to track the historical changes, which in practice will allow you to have multiple versions in the very same place. You will decide if you make the code public or keep it private, and who to invite to edit the repository. When working collaboratively, it will feel like editing a *wiki* of code, while having a *webpage* for your project and a *network of friends* (followers) will feel similar to social media. You can work locally or even from a web interface, and then synchronize the changes. When you allow colleagues to download (or *clone*) your repository you are making a good contribution to the community of developers and you can also monitor your impact. In addition to code, you can also upload other kinds of files, including notebooks, and organize them in folders, just as you have on your own computer. 239 | 240 | One widespread good practice when sharing code is the use of *literate programming*, which is an elegant, practical, and pedagogic way to document and execute your code. We have already mentioned in this section the importance of including documentation within your code (i.e. using the `#` sign and docstrings), but you also have the opportunity to extend this documentation (with formatted texts, images and even equations!)
and put everything together to present in a logical structure everything necessary to understand the code and to run the executable lines step by step. 241 | 242 | There are different approaches to implement this literate programming in web and local environments, but the standard in R and Python is the use of *notebooks*. In a notebook you can alternate a text processor with an executable cell to place formatted text between blocks of code. By doing this you can include complete documentation of your scripts, and even more important you can execute each cell one step at a time (loading the results in memory while the notebook is open). This last point allows you avoid the risk of executing the whole script at once, and also gives you more control of the intermediate outputs produced in your code. Once you get used to notebooks, you will probably never write code for data analysis in a basic editor again! 243 | 244 | The usual tool in R is the R Markdown Notebook, and in Python the Jupyter Notebook (see figure [-@fig-notebooks]), but in practice you can also deploy Python in Markdown and R in Jupyter. Both tools can help you with similar tasks to organize your script, though their internal technical procedures are quite different. We have chosen Jupyter to develop the examples in this book because it is a web-based interactive tool. Moreover, there are several services such as Google Colab[^6] (Figure [-@fig-colab]), that allow you to remotely run these notebooks online without installing anything on your computer, making the code highly reproducible. 245 | 246 | ![Markdown (left) and Jupyter (right) Notebooks](img/ch04_notebooks.png){#fig-notebooks} 247 | 248 | So far you have seen many of the possibilities that the world of code offers you from a technical and collaboration perspective. We will come back to ethical and normative considerations throughout the book, in particular in Section [-@sec-ethicallegalpractical] and Section [-@sec-ethics]. 249 | 250 | ![Jupyter notebook in Google Colab](img/ch04_colab.png){#fig-colab} 251 | 252 | [^1]: https://www.python.org/dev/peps/pep-0008/ 253 | 254 | [^2]: https://google.github.io/styleguide/Rguide.html 255 | 256 | [^3]: An exception are so-called private identifiers -- identifiers that are not supposed to be directly addressed. They conventionally begin with an underscore. 257 | 258 | [^4]: For more information, see [www.python.org/dev/peps/pep-0257/\#what-is-a-docstring](https://www.python.org/dev/peps/pep-0257/\#what-is-a-docstring) and [cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html](https://cran.r-project.org/web/packages/roxygen2/vignettes/roxygen2.html),respectively 259 | 260 | [^5]: https://github.com/ 261 | 262 | [^6]: https://colab.research.google.com 263 | 264 | -------------------------------------------------------------------------------- /content/chapter05.qmd: -------------------------------------------------------------------------------- 1 | # From file to data frame and back {#sec-chap-filetodata} 2 | 3 | ```{python} 4 | #| echo: false 5 | import warnings; warnings.filterwarnings('ignore') 6 | ``` 7 | 8 | **Abstract.** 9 | This chapter teaches you the basics of file handling, such as different file formats and encodings. It introduces csv files, json files, plain text files, and binary file formats. We discuss different approaches to organizing data in files, and how to write data frames to and read them from these files. Finally, we provide guidance for retrieving example datasets. 
10 | 11 | **Keywords.** file formats, encodings, reading and writing files, data frames, datasets 12 | 13 | **Objectives:** 14 | 15 | - Know how to handle different encodings and dialects 16 | - Make an informed choice for a file format 17 | - Know how to access existing datasets 18 | 19 | ::: {.callout-note icon=false collapse=true} 20 | ## Packages used in this chapter 21 | 22 | This chapter relies mostly on the *pandas* (Python) and *tidyverse* (R) functionality to read and write files. Additionally, *haven* is used to read data from other tools such as SPSS. 23 | Finally, we show how to use existing data from packages such as *sotu* (R) and *nltk* and *scikit-learn* (Python). 24 | If needed, you can install these packages with the code below 25 | (see Section [-@sec-installing] for more details): 26 | 27 | ::: {.panel-tabset} 28 | ## Python code 29 | ```{python chapter05install-python} 30 | #| eval: false 31 | !pip3 install pandas nltk scikit-learn 32 | ``` 33 | ## R code 34 | ```{r chapter05install-r} 35 | #| eval: false 36 | install.packages(c("sotu", "haven", "tidyverse", 37 | "glue", "jsonlite")) 38 | ``` 39 | ::: 40 | After installing, you need to import (activate) the packages every session: 41 | 42 | ::: {.panel-tabset} 43 | ## Python code 44 | ```{python chapter05library-python} 45 | import json 46 | import urllib 47 | import pandas as pd 48 | import nltk 49 | from nltk.corpus import state_union 50 | 51 | nltk.download("punkt") 52 | from sklearn.datasets import fetch_20newsgroups 53 | 54 | ``` 55 | ## R code 56 | ```{r chapter05library-r} 57 | library(tidyverse) 58 | library(haven) 59 | library(sotu) 60 | library(glue) 61 | library(jsonlite) 62 | ``` 63 | ::: 64 | ::: 65 | 66 | ## Why and When Do We Use Data Frames? {#sec-dataframes} 67 | 68 | In Section [-@sec-datatypes], we introduced basic data types: strings (which contain text), integers (which contain whole numbers, or numbers without anything "behind the dot"), floats (floating point numbers; numbers with decimals), and bools (boolean values, True or False). 69 | We also learned that a series of multiple values (e.g., multiple integers, multiple strings) can be stored in what we call a vector (R) or a list (Python). 70 | 71 | In most social-scientific applications, however, we do not deal with isolated series of values. We rather want to link multiple values to each other. One way to achieve this is by the use of dictionaries (see Section [-@sec-datatypes]). 72 | Such data structures are really useful for nested data: 73 | For example, if we do not want to only store people's ages, but also their addresses, 74 | we could store a dict within a dict. 75 | 76 | In fact, as we will see later in this chapter, much of the data used by computational social scientists comes in such a format. 77 | For instance, data about an online product can contain many reviews which in turn have various pieces of information on the review author. 78 | 79 | But ultimately, for many social-scientific analyses, a tabular data format is preferred. 80 | We are used to thinking of observations (cases) as rows with columns containing information or measurements about these observations (e.g., age, gender, days per week of newspaper reading, ...). It also simplifies how we can run many statistical analyses later on. 81 | 82 | We could simply construct a list of lists to achieve such a tabular data format. 
83 | In fact, this list-of-lists technique is often used to store tabular data or matrices, and you will probably encounter it in some examples in this book or elsewhere. The list-of-lists approach is very low-level, though: if we wanted, for instance, to insert a column or a row at a specific place, writing the code to do so could be cumbersome. There are also no things like column headers, and no consistency checks: nothing would warn us if one row actually contained more "columns" than another, which should not be the case in a rectangular table. 84 | 85 | To make our lives easier, we can therefore use a data structure called a data frame. 86 | Data frames can be generated from list-of-list structures, from dictionaries, and many others. 87 | One way of doing this is shown in Example [-@exm-createdataframe], but very often, you'd rather read data from a file or an online resource directly into a data frame (see Section [-@sec-reading]). 88 | 89 | ::: {.callout-note appearance="simple" icon=false} 90 | 91 | ::: {#exm-createdataframe} 92 | Creating a data frame from other data structures 93 | 94 | ::: {.panel-tabset} 95 | ## Python code 96 | ```{python createdataframe-python} 97 | # Create two lists that will be columns 98 | list1 = ["Anna", "Peter", "Sarah", "Kees"] 99 | list2 = [40, 33, 40, 77] 100 | 101 | # or we could have a list of lists instead 102 | mytable = [["Anna", 40], ["Peter", 33], ["Sarah", 40], ["Kees", 77]] 103 | 104 | # Convert a list of lists to a dataframe 105 | df = pd.DataFrame(mytable) 106 | 107 | # Or create the data frame directly from the two lists 108 | df2 = pd.DataFrame.from_records(zip(list1, list2)) 109 | 110 | # No. of rows, no. of columns, and shape 111 | print(f"{len(df)} rows x {len(df.columns)} cols") 112 | print(f"Its shape is {df.shape}") 113 | 114 | print("Element-wise equality of df and df2:") 115 | print(df == df2) 116 | 117 | ``` 118 | ## R code 119 | ```{r createdataframe-r} 120 | # Create two vectors that will be columns 121 | vector1 <- c("Anna","Peter","Sarah","Kees") 122 | vector2 <- c(40,33,40,77) 123 | 124 | # Create an array of four rows and two columns 125 | myarray <- array(c(vector1,vector2), dim=c(4,2)) 126 | 127 | # Convert an array to a dataframe 128 | df1=data.frame(myarray) 129 | 130 | # Or create the data frame directly from vectors 131 | df2=data.frame(vector1, vector2) 132 | 133 | # No. of rows, no. of columns, and dimension 134 | print(glue("{nrow(df1)} rows x {ncol(df1)} cols")) 135 | print(dim(df1)) 136 | 137 | print("Element-wise equality of df1 and df2:") 138 | print(df1 == df2) 139 | ``` 140 | ::: 141 | ::: 142 | ::: 143 | 144 | In this book, we use data frames a lot, because they are very convenient for handling tabular data, and because they provide a lot of useful functionalities, instead of requiring us to re-invent the wheel all the time. In the next section, we will discuss some of them. 145 | 146 | Of course, there are some situations when data frames are *not* a good choice to organize your data: 147 | - Your data are one-dimensional. Think, for example, of resources like a list of stopwords, or a list of texts without any meta-information. 148 | - Your data do not have a tabular structure. Think, for example, of deeply nested data, network data or of very messy data. 149 | - Your data are so large that you cannot (or do not want to) load them into memory. For instance, if you want to process the text of all articles on Wikipedia, you probably want to process them one-by-one instead of loading all articles at the same time (a minimal sketch of such one-by-one processing is shown directly below).
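To make the last point a bit more concrete, here is a minimal sketch of such one-by-one processing in Python. It is only an illustration under our own assumptions: the file name `articles.txt` and the one-document-per-line layout are hypothetical, and the book's own file-handling examples follow in the next sections.

```python
# Minimal sketch of processing documents one at a time, without a data frame.
# Assumption (for illustration only): a file articles.txt with one document per line.
def iter_documents(path):
    with open(path, encoding="utf-8") as f:
        for line in f:
            # yield documents lazily, so only one line is in memory at a time
            yield line.strip()

# Example usage: count documents without ever loading them all at once
n_docs = sum(1 for _ in iter_documents("articles.txt"))
```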
150 | 151 | Therefore, you will come across (and we will introduce you to) examples in which we do *not* use data frames to organize our data. 152 | But in most cases we will, because they make our life easier: 153 | once we have constructed our data frame, we have a range of handy functions at our disposal that allow us to select rows or columns, add new rows or columns, apply functions to them, and so on. 154 | We will discuss these in Chapter [-@sec-chap-datawrangling]. 155 | 156 | But how do we -- toy examples like those in Example [-@exm-createdataframe] aside -- get data into and out of data frames? 157 | 158 | ## Reading and Saving Data {#sec-reading} 159 | 160 | ### The Role of Files 161 | 162 | In statistical software like SPSS or Stata, or in all typical office applications for that matter, you *open* a file, do some work on it, and then *save* the changes to the same file once you are done. You basically "work on that file". 163 | 164 | That's not how your typical workflow in R or Python looks. 165 | Here, you work on one or multiple data frames (or some other data structures). 166 | That means that you might start by *reading* the contents of some file into a data frame, 167 | but once that is done, there is no link between the data frame and that file any more. 168 | Once your work is done, you can save your data frame to a file, of course, 169 | but it is a good practice not to overwrite your input file, so that you can always go back to where you started. 170 | A typical workflow would look like this: 171 | 
172 | - Read raw data from file `myrawdata.csv` into data frame `df`
- Do some operations and analyses on `df`
- Save `df` to file `myfinaldata.csv`

173 | Note that the last step is not even necessary, but may be handy if running the script takes very long, or if you want to re-distribute the resulting file. 174 | 175 | The format in which we read files into a data frame and the format to which we save our final data frame by no means need to be identical. We can, for example, read data created by someone else in Stata's proprietary 176 | `.dta` format into a data frame and later save it to a .csv table. 177 | 178 | While we sometimes do not have a choice about the format in which we get our input data, we have a range of options regarding our output data. We usually prefer formats that are *open* and *interoperable* for this, which ensures that they can be used by as many people as possible, and that they are not tied to any specific (proprietary) software tool which might not be available to everyone and can be discontinued in the future. 179 | 180 | The most common file formats that are relevant to us are listed in Table [-@tbl-fileformats]. `txt` files are particularly useful for long texts (think of one file containing one newspaper article or even a whole book), but they are bad for storing associated metadata. `csv` files are the default choice for tabular data, and `json` files allow us to store nested data in a dictionary-like format. 181 | 182 | For the sake of completeness, we also listed the native Python and R formats pickle, RDS, and RDA.
Because of their lack of interoperability, they are not very suitable for long-term storage or for sharing data, but they can have a place in a workflow as an intermediate step to solve the issue that none of the other formats are able to store all properties of a data frame (e.g., the csv file cannot store whether a given column in an R data frame is to be understood as containing strings such as "man", "woman", "non-binary" or a factor with the three levels man, woman, non-binary). If it is important to store an object (such as a data frame) exactly as-it-is, we can use these formats. One of the rare instances where we use these formats is in Example [-@exm-reuse], where we store machine learning models for later reuse. 183 | 184 | | Format | Used for? | Open? | Interoperable? | 185 | |-|-|-|-| 186 | | txt | plain text | yes | yes | 187 | | csv | tabular data | yes | yes | 188 | | json | nested data, key-value pairs | yes | yes | 189 | | pickle | Python objects | yes | no | 190 | | RDS/RDA | R objects | yes | no | 191 | : Commonly used file formats and their properties {#tbl-fileformats} 192 | 193 | ### Encodings and Dialects {#sec-encodings} 194 | 195 | Plain `txt` files, `csv` files, and `json` files are all files that are based on text. Unlike binary file formats, you can read them in any text editor. Try it yourself to understand what is going on under the hood. 196 | 197 | Download a csv file (such as [cssbook.net/d/gun-polls.csv](https://cssbook.net/d/gun-polls.csv)) 198 | and open it in a text editor of your choice. Some people swear that their preferred editor is the best (google to learn about the vi versus emacs war for some entertainment), but if you have no strong feelings, then Notepad++, Atom, or Sublime may be good choices that you may want to look into. 199 | 200 | As you will see (Figure [-@fig-csv-in-editor]), a csv file internally just looks like a bunch of text in which each line represents a row and in which the columns are separated by a comma (hence the name comma separated values (csv)). 201 | Looking at the data in a text editor is a very good way to find out what happens if reading your files into a data frame does not work as expected -- which can happen more frequently than you would expect. 202 | 203 | Mostly due to historical reasons, not every text-based file (which, as we have seen, includes csv files) is internally stored in the same way. 204 | For a long time, it was common to *encode* text in such a way that one character mapped to one byte. That was easy from a programming perspective (after all, the $n$th character of a text can be directly read from and written to the $n$th byte of a file) and was also storage-efficient. But given that a byte consists of 8 bits, that means that there are only 256 possible characters. All letters in the alphabet in uppercase, again in lowercase, numbers, punctuation, some control characters -- and you are out of characters. Due to this limitation, there were different encodings or codepages for different languages that told a program which value should be interpreted as which character. 205 | 206 | We all know the phenomenon of garbled special characters, like German umlauts or Scandinavian characters like ø, å, or œ being displayed as something completely different. This happens when files are read with a different encoding than the encoding that was used for creating them. 207 | 208 | In principle, this issue has been solved due to the advent of Unicode.
Unicode allows all characters from all scripts to be handled, including emoticons, Korean and Chinese characters, and so on. The most popular encoding for Unicode characters is called UTF-8, and it has been around for decades. 209 | 210 | To avoid any data loss, it is advisable to make sure that your whole workflow uses UTF-8 files. Most modern applications support UTF-8, even though some still by default use a different encoding (e.g., "Windows-1252") to store data. As Figure [-@fig-csv-in-editor] illustrates, you can use a text editor to find out what encoding your data has, and many editors also offer an option to change the encoding. However, you cannot recover what has already been lost: if at one point you saved your data with an encoding that only allows 256 different characters, any character outside of that set is gone for good. 211 | 212 | ![A csv file opened in a text editor, illustrating that the columns are separated by commas, and showing the encoding and the line endings.](img/ch6_csv-in-editor.png){#fig-csv-in-editor} 213 | 214 | As we will show in the practical code examples below, you can also force Python and R to use a specific encoding, which can come in handy if your data arrives in a legacy encoding. 215 | 216 | Related to the different encodings a file can have, but less problematic, are different conventions of how a *line ending* is denoted. 217 | Windows-based programs have been using a Carriage Return followed by a Line Feed (denoted as `\r\n`), 218 | very old versions of MacOS used a Carriage Return only (`\r`), and newer versions of MacOS as well as Linux use a Line Feed only (`\n`). 219 | In our field, the Linux (or Unix) style line endings have become most dominant, 220 | and Python 3 even automatically converts Windows style line endings to Unix style line endings when reading a file -- even on Windows itself. 221 | 222 | A third difference is the use of so-called *byte-order markers* (BOM). In essence, a BOM consists of a few additional bytes added to the beginning of a text file to indicate that it is a UTF-encoded file and to indicate in which order the bytes are to be read (the so-called endianness). While informative, this can cause trouble if your program does not expect these bytes to be there. In that case, you might either want to remove them or explicitly specify the encoding. For instance, you can pass an argument such as `encoding="utf-8-sig"` to Python's `open` command, or `fileEncoding="UTF-8-BOM"` to R's `scan` command. 223 | 224 | In short, the most standard form in which you probably want to encode your data is UTF-8 with Linux-style line endings and without a byte-order marker. 225 | 226 | In the case of reading and writing csv files, we thus need to know the encoding, and potentially also the line ending conventions and the presence of a byte-order marker. However, there are also some additional variations that we need to consider. There is no single definition of what a csv file needs to look like, and there are multiple dialects that are widely used. They mainly differ in two aspects: the delimiter that is chosen, and the quoting and/or escaping of values. 227 | 228 | First, even though csv stands for comma separated values, one could use other characters instead of a comma to separate the columns. In fact, because many countries use a comma instead of a dot as a decimal separator (\$10.30 versus 10,30€), in these countries a semicolon (;) is used instead of a comma as the column delimiter.
229 | To avoid any possible confusion, others use a tab character (`\t`) to separate columns. 230 | Sometimes, such files are then called tab-separated files, and instead of .csv, 231 | they may have a file extension such as `.tsv`, `.tab`, or even `.txt`. 232 | However, this does not change how you read them -- you just need to know whether your columns are separated by 233 | `,`, `;`, or `\t`. 234 | 235 | Second, there may be different ways to deal with strings as values in a csv file. For instance, it may be that a specific value contains the same character that is also used as a delimiter. These cases are usually resolved by either putting all strings into quotes, putting only strings that contain such ambiguities in quotes, or by prepending the ambiguous character with a specific escape character. Most likely, all of this is just handled automatically under the hood, but in case of problems, you might want to look into this and check the documentation of the packages you are using to see how to specify which strategy should be used. 236 | 237 | Let's get practical and try out reading and writing files into a data frame (Example [-@exm-readfiles]). 238 | 239 | ::: {.callout-note appearance="simple" icon=false} 240 | 241 | ::: {#exm-readfiles} 242 | Reading files into a data frame 243 | 244 | ::: {.panel-tabset} 245 | ## Python code 246 | ```{python readfiles-python} 247 | url = "https://cssbook.net/d/media.csv" 248 | # Directly read a csv file from internet 249 | df = pd.read_csv(url) 250 | 251 | # We can also explicitly specify delimiter etc. 252 | df = pd.read_csv(url, delimiter=",") 253 | # Note: use help(pd.read_csv) to see all options 254 | 255 | # Save dataframe to a csv: 256 | df.to_csv("mynewcsvfile.csv") 257 | 258 | ``` 259 | ## R code 260 | ```{r readfiles-r} 261 | url = "https://cssbook.net/d/media.csv" 262 | # Directly read a csv file from internet 263 | df = read_csv(url) 264 | 265 | # We can also explicitly specify delimiter etc. 266 | df = read_delim(url, delim = ",") 267 | # Note: use ?read_csv to see all options 268 | 269 | # Save dataframe to a csv: 270 | write_csv(df,"mynewcsvfile.csv") 271 | ``` 272 | ::: 273 | ::: 274 | ::: 275 | 276 | Of course, we can read more than just csv files. In the Python 277 | example, you can use tab completion to get an overview of all file 278 | formats Python supports: type `pd.read` and then press the TAB key to 279 | get a list of all supported files. For instance, you could 280 | `pd.read_excel('test.xlsx')`, `df3 = pd.read_stata('test.dta')`, or 281 | `df4 = pd.read_json('test.json')`. Similarly, for R, you can hit TAB 282 | after typing `haven::` to get an overview of functions such as 283 | `read_spss`. 284 | 285 | ### File handling beyond data frames 286 | Data frames are a very useful data structure for organizing and analyzing data, and will occur in many examples in this book. 287 | However, not all things that we might want to read from a file need to go into a data frame. 288 | Imagine we have a list of words that we later want to remove from some texts (so-called stopwords, see Chapter [-@sec-chap-protext]). 289 | We could make a list (or vector) of such words directly in our code. 290 | But if we have more than a couple of such words, it is easier and more readable to keep them in an external file.
We could create a file `stopwords.txt` in a text editor with one such word per line: 291 | 292 | ``` 293 | and 294 | or 295 | a 296 | an 297 | ``` 298 | 299 | If you do not wish to create this list yourself, you could also 300 | download one from [cssbook.net/d/stopwords.txt](https://cssbook.net/d/stopwords.txt) and save it 301 | in the same directory as your Python or R script. 302 | 303 | Then, you can read this file into a vector or list (see Example [-@exm-readingstopwords]). 304 | 305 | ::: {.callout-note appearance="simple" icon=false} 306 | 307 | ::: {#exm-readingstopwords} 308 | Reading files without data frames 309 | 310 | ::: {.panel-tabset} 311 | ## Python code 312 | ```{python readingstopwords-python} 313 | # Define stopword list in the code itself 314 | stopwords = ["and", "or", "a", "an", "the"] 315 | 316 | # Better idea: Download stopwords file and read it 317 | url = "https://cssbook.net/d/stopwords.txt" 318 | urllib.request.urlretrieve(url, "stopwords.txt") 319 | with open("stopwords.txt") as f: 320 |     stopwords = [w.strip() for w in f] 321 | stopwords 322 | 323 | ``` 324 | ## R code 325 | ```{r readingstopwords-r} 326 | # Define stopword list in the code itself 327 | stopwords = c("and", "or", "a", "an", "the") 328 | 329 | # Better idea: Download stopwords file and read it 330 | url = "https://cssbook.net/d/stopwords.txt" 331 | download.file(url, "stopwords.txt") 332 | stopwords = scan("stopwords.txt", what="string") 333 | stopwords 334 | ``` 335 | ::: 336 | ::: 337 | ::: 338 | 339 | ::: {.callout-note appearance="simple" icon=false} 340 | 341 | ::: {#exm-extendedfilehandling} 342 | More examples for reading from and writing to files. 343 | 344 | ::: {.panel-tabset} 345 | ## Python code 346 | ```{python extendedfilehandling-python} 347 | # Modify the stopword list and save it: 348 | stopwords += ["somenewstopword", "andanotherone"] 349 | with open("newstopwords.txt", mode="w") as f: 350 |     f.writelines([w + "\n" for w in stopwords])  # writelines adds no newlines itself 351 | 352 | # Use json to read/write dictionaries 353 | somedict = {"label": "Report", "entries": [1, 2, 3, 4]} 354 | 355 | with open("test.json", mode="w") as f: 356 |     json.dump(somedict, f) 357 | 358 | with open("test.json", mode="r") as f: 359 |     d = json.load(f) 360 | print(d) 361 | 362 | ``` 363 | ## R code 364 | ```{r extendedfilehandling-r} 365 | # Modify the stopword list and save it: 366 | stopwords = c(stopwords, 367 |   "somenewstopword", "andanotherone") 368 | fileConn<-file("newstopwords.txt") 369 | writeLines(stopwords, fileConn) 370 | close(fileConn) 371 | 372 | # Use json to read/write named lists 373 | somedict = list(label="Report", 374 |   entries=c(1,2,3,4)) 375 | 376 | write_json(somedict, "/tmp/x.json", auto_unbox=T) 377 | 378 | d=read_json("/tmp/x.json", simplifyVector = T) 379 | print(d) 380 | ``` 381 | ::: 382 | ::: 383 | ::: 384 | 385 | Example [-@exm-extendedfilehandling] provides you with some more elaborate code examples that allow us to dig a bit deeper into the general way of handling files. 386 | 387 | In the Python example, we can open a file and assign a handle to it that allows us to refer to it (the name of the handle is arbitrary, let's just call it `f` here). 388 | Then, we can use a for loop to iterate over all lines in the file and add each line to a list. 389 | 390 | The `mode = 'r'` specifies that we want to read from the file. `mode = 'w'` would open the file for writing, create it if necessary, and immediately delete all content that may have been in there if the file already existed (!).
391 | Note that the `.strip()` is necessary to remove the line ending itself, and also any possible whitespace at the beginning or end of a line. 392 | If we want to save our stopwords, we can do this in a similar way: we first open the file (this time, for writing), and then use the file handle's methods to write to it. 393 | We are not limited to plain text files here. For instance, we can use the same approach to read json files into a Python dict or to store a Python dict into a json file. 394 | 395 | We could also combine this with a for loop that goes over all files in a directory. 396 | Imagine we have a folder full of positive movie reviews, and another one full of negative movie reviews that we want to use to train a machine learning classifier (see Section [-@sec-supervised]). 397 | Let's further assume that all these reviews are saved as `.txt` files. 398 | We can iterate over all of them, as shown in Example [-@exm-reviewdata]. If you want to read text files into a data frame in R, the *readtext* package may be interesting for you. 399 | 400 | ## Data from online sources {#sec-gathering} 401 | 402 | Much of the data that is interesting to those analyzing communication is 403 | nowadays gathered online. In Chapter [-@sec-chap-scraping], you will 404 | learn how to use APIs to retrieve data from web services, and how to 405 | write your own web scraper to automatically download large numbers of 406 | web pages and extract relevant information. For instance, you might 407 | want to retrieve customer reviews from a website or articles from news 408 | sites. 409 | 410 | In this section, however, we will focus on how to re-use existing 411 | datasets that others have made available online. For instance, the 412 | open science movement has led to more and more datasets being shared 413 | openly using repositories such as Dataverse, Figshare, or 414 | others. Re-using existing data can be a very good idea for several reasons: 415 | first, to confirm (or not) the conclusions drawn by others; second, to 416 | avoid wasting resources by re-collecting very similar or even identical 417 | data all over again; and third, because gathering a large, 418 | high-quality dataset might just not be feasible with your means. This 419 | is especially true when you need annotated (i.e., hand-coded) data for 420 | supervised machine learning purposes (Chapter [-@sec-chap-introsml]). 421 | 422 | We can distinguish between two types of existing online datasets: 423 | datasets that are inherently interesting, and so-called toy datasets. 424 | 425 | Toy datasets may include made-up data, but often, they contain real data. 426 | However, they are not (or no longer) analyzed to gain scientific insights, 427 | as they may be too small, outdated, or already analyzed 428 | over and over again. These provide a great way, though, to learn and 429 | explore new techniques: after all, the results and the characteristics 430 | of the data are already known. Hence, such toy datasets are often 431 | even included in R and Python packages. Some of them are really 432 | well-known in teaching (e.g., the iris dataset containing measurements 433 | of some flowers; or the titanic dataset containing statistics on 434 | survival rates of passengers on the Titanic; MNIST for image classification; or the MPG dataset on car fuel consumption). Many of these are included 435 | in packages like *scikit-learn*, *seaborn*, or *ggplot2* -- and you can have a look at their documentation.
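To give an impression of how little effort this takes, here is a minimal Python sketch (not part of the book's own examples) that loads two of these bundled toy datasets; it assumes that *scikit-learn* and *seaborn* are installed, and the exact datasets that ship with your versions may differ:

```python
# Minimal sketch: loading bundled toy datasets into pandas data frames
from sklearn.datasets import load_iris
import seaborn as sns

iris = load_iris(as_frame=True)   # returns a Bunch object; .frame holds a pandas data frame
print(iris.frame.head())

mpg = sns.load_dataset("mpg")     # fetches seaborn's example dataset as a pandas data frame
print(mpg.head())
```

Because such datasets are well documented and their properties are known, they are ideal for quickly trying out a new technique before applying it to your own data.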
436 | 437 | For instance, the 20 Newsgroups dataset contains $18846$ posts from 438 | newsgroups plus the groups where they were posted 439 | (Example [-@exm-20newsgroups]). This can be an interesting resource for 440 | practicing with natural language processing, unsupervised, and 441 | supervised machine learning. Other interesting resources are 442 | collections of political speeches, such as the state-of-the-union 443 | speeches from the US, which are available in multiple packages 444 | (Example [-@exm-sotudata]). 445 | Other interesting datasets with large collections of textual data may 446 | be the Financial News dataset compiled by @Chen2017 or the 447 | political news dataset compiled by @Horne2018. 448 | 449 | ::: {.callout-note appearance="simple" icon=false} 450 | 451 | ::: {#exm-20newsgroups} 452 | In Python, scikit-learn has a convenience function to automatically download the 20 newsgroup dataset and clean it up. In R, you can download the raw version (there are multiple copies floating around on the internet) and perform the cleaning yourself. 453 | 454 | ::: {.panel-tabset} 455 | ## Python code 456 | ```{python 20newsgroups-python} 457 | # Note: use fetch_20newsgroups? for more options 458 | d = fetch_20newsgroups(remove=("headers", "footers", "quotes")) 459 | df = pd.DataFrame(zip(d["data"], [d["target_names"][i] for i in d["target"]]))  # map numeric targets to group names 460 | df.head() 461 | 462 | ``` 463 | ## R code 464 | ```{r 20newsgroups-r} 465 | url = "https://cssbook.net/d/20_newsgroups.csv" 466 | d = read_csv(url) 467 | head(d) 468 | ``` 469 | ::: 470 | ::: 471 | ::: 472 | 473 | ::: {.callout-note appearance="simple" icon=false} 474 | 475 | ::: {#exm-sotudata} 476 | A collection of US state-of-the-union speeches is available in multiple packages in various forms. 477 | 478 | ::: {.panel-tabset} 479 | ## Python code 480 | ```{python sotudata-python} 481 | # Note: download is only needed once... 482 | nltk.download("state_union") 483 | sentences = state_union.sents() 484 | print(f"There are {len(sentences)} sentences.") 485 | 486 | ``` 487 | ## R code 488 | ```{r sotudata-r} 489 | speeches = sotu_meta 490 | # show only first 50 characters 491 | speeches %>% 492 |   mutate(text = substr(sotu_text,0,50)) %>% 493 |   head() 494 | ``` 495 | ::: 496 | ::: 497 | ::: 498 | 499 | There are also some more generic resources that you may want to consider for finding 500 | more datasets to play around with. On [datasetsearch.research.google.com](https://datasetsearch.research.google.com), 501 | you can search for datasets of all kinds, both really interesting ones and toy datasets. 502 | Another great resource is [kaggle.com](https://kaggle.com), a site that hosts data 503 | science competitions. 504 | 505 | ```{bash cleanup} 506 | #| echo: false 507 | rm -f mynewcsvfile.csv newstopwords.txt stopwords.txt test.json 508 | ``` 509 | -------------------------------------------------------------------------------- /content/chapter12-note-text.qmd: -------------------------------------------------------------------------------- 1 | ::: {.callout-note icon=false collapse=true} 2 | ## Do you care about the children? 3 | 4 | Regardless of whether you use XPATHS or CSS Selectors to specify which part of the page you are interested in, it is often the case that there are other elements within it. 5 | Depending on whether you want to also retrieve the text of these elements or not, you have to use different approaches.
6 | The code examples below show some of these differences. 7 | 8 | Appending `/text()` to the XPATH gives you exactly the text that is in the element itself, including line-breaks that happen to be in the source code. 9 | In Python, the same information is also present in the `.text` property of the elements (but without the line-breaks): 10 | 11 | ::: {.panel-tabset} 12 | 13 | ## Python code 14 | ```{python htmlparse2-python1} 15 | print(tree.xpath("//div[@class='restaurant']/text()")) 16 | print([e.text for e in tree.xpath("//div[@class='restaurant']")]) 17 | ``` 18 | ## R code 19 | ```{r htmlparse2-r1} 20 | page %>% html_nodes(xpath="//div[@class='restaurant']/text()") 21 | ``` 22 | ::: 23 | 24 | You can also use the `.text_content()` method (in Python) or the `html_text` function (in R) to access the full text of an element, including its children: 25 | 26 | ::: {.panel-tabset} 27 | 28 | ## Python code 29 | ```{python htmlparse2-python2} 30 | print([e.text_content() for e in tree.xpath("//div[@class='restaurant']")]) 31 | print([e.text_content() for e in tree.getroot().cssselect(".restaurant")]) 32 | ``` 33 | ## R code 34 | ```{r htmlparse2-r2} 35 | page %>% html_nodes(xpath="//div[@class='restaurant']") %>% html_text() 36 | page %>% html_nodes(".restaurant") %>% html_text() 37 | ``` 38 | ::: 39 | And you can do the same but using CSS rather than XPATH: 40 | 41 | ::: {.panel-tabset} 42 | 43 | ## Python code 44 | ```{python htmlparse2-python3} 45 | print([e.text_content() for e in tree.getroot().cssselect(".restaurant")]) 46 | ``` 47 | ## R code 48 | ```{r htmlparse2-r3} 49 | page %>% html_nodes(".restaurant") %>% html_text() 50 | ``` 51 | ::: 52 | ::: -------------------------------------------------------------------------------- /content/chapter15.qmd: -------------------------------------------------------------------------------- 1 | # Scaling up and distributing {#sec-chap-scalingup} 2 | 3 | {{< include common_setup.qmd >}} 4 | ```{bash} 5 | #| echo: false 6 | # Remove the database in case it exists to avoid 'table X already exists' error 7 | rm -f mydb.db mydb.sqlite 8 | ``` 9 | 10 | **Abstract.** 11 | Throughout this book, we have been working with examples that consist of 12 | code to conduct one specific analysis of data sets of modest size. 13 | But at some point, you may want to scale up. You may want others 14 | to be able to apply your code to their data; and you may want to be able to also 15 | use your own analyses on larger and more complex datasets. Or you may 16 | need to run analyses that your own computer cannot deal with. 17 | This chapter deals with such steps and points you to some techniques that become increasingly useful 18 | the larger your projects get. 19 | 20 | **Keywords.** databases, cloud computing, containerization, source code, version control 21 | 22 | **Objectives:** 23 | 24 | - Be able to scale up your analyses 25 | - Know when to use databases 26 | - Know when to use cloud computing 27 | - Know about distributing source code and containers. 28 | 29 | ::: {.callout-note icon=false collapse=true} 30 | 31 | In this chapter, we provide a brief overview of techniques for scaling up computational analyses. In particular, we introduce SQL and noSQL databases, cloud computing platforms, version control systems, and Docker containers. 32 | ::: 33 | 34 | ## Storing Data in SQL and noSQL Databases {#sec-databases} 35 | 36 | ### When to Use a Database {#sec-whendb} 37 | 38 | In this book, we have so far stored our data in files.
In fact, before 39 | covering the wide range of methods for computational analysis, we 40 | discussed some basics of file handling 41 | (Chapter [-@sec-chap-filetodata]). Probably, you did not experience any major 42 | trouble here (apart from occasional struggles with non-standard 43 | encodings, or confusion about the delimiters in a csv file). On the 44 | other hand, the examples we used were still modest in size: usually, 45 | you were dealing with a handful of csv files; except for huge image classification datasets, the maximum you had to 46 | deal with were the 50000 text files from the IMDB movie review 47 | dataset. 48 | 49 | In particular, when loading your data into a data frame, you copied all 50 | the data from your disk into memory[^1]. 51 | But what if you want to scale up your analyses a bit 52 | [@Trilling2018b]? Maybe you want to build up a larger 53 | data collection, maybe even share it with multiple team members, search 54 | and filter your data, or collect it over a larger timespan? An 55 | example may illustrate the problems that can arise. 56 | 57 | Imagine you do some web scraping (Chapter [-@sec-chap-scraping]) that goes beyond 58 | a few thousand texts. Maybe you want to visit relevant news sites on a 59 | regular basis (say, once an hour) and retrieve everything that's 60 | new. How do you store your data then? You could append everything to a 61 | huge csv file, but this file would quickly grow so large that you 62 | cannot load it into memory any more. Besides, you may run the risk of 63 | corrupting the file if something goes wrong in one of your attempts to 64 | extend the file. Or you could also write each article to a new, separate file. 65 | That's maybe more failsafe, but you would need to design a good way 66 | to organize the data. In particular, devising a method to search 67 | and find relevant files would be a whole project in itself. 68 | 69 | Luckily, you can outsource all these problems to a database that you can 70 | install on your own computer or possibly on a server (in that case, make 71 | sure that it is properly secured!). In the example, the scraper, which 72 | is running once an hour, just sends the scraped data to the database 73 | instead of to a file, and the database will take care of storing it. 74 | Once you want to retrieve a subset of your articles for analysis, 75 | you can send a query to the database and read from it. Both Python and 76 | R offer integration for multiple commonly used databases. It is even 77 | possible to directly get the results of such a database query in the 78 | form of a data frame. 79 | 80 | We can distinguish two main categories of databases 81 | that are most relevant to us [see also @Gunther2018]: 82 | relational databases (or SQL-databases) and noSQL-databases. Strictly 83 | speaking, SQL ("structured query language") is a query language for 84 | databases, but it is so widespread that it is used almost synonymously 85 | for relational databases. Even though they have already been around for 86 | 50 years [@Codd1970], relational databases are still very 87 | powerful and very widely used. They consist of multiple tables that 88 | are linked by shared columns (keys). For instance, you could imagine a 89 | table with the orders placed in a webshop that has a column 90 | `customer-id`, and a different table with addresses, billing 91 | information, and names for each `customer-id`.
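To make the idea of tables linked by a shared key more concrete, here is a minimal sketch using Python's built-in *sqlite3* module; the table layout, column names such as `customer_id`, and the data are invented for illustration and are not part of the book's own examples:

```python
import sqlite3

conn = sqlite3.connect(":memory:")  # throwaway in-memory database, just for illustration
conn.executescript("""
CREATE TABLE customers (customer_id INTEGER PRIMARY KEY, name TEXT, address TEXT);
CREATE TABLE orders (order_id INTEGER PRIMARY KEY, customer_id INTEGER, product TEXT);
INSERT INTO customers VALUES (1, 'Alice', 'Main Street 1'), (2, 'Bob', 'Side Road 2');
INSERT INTO orders VALUES (10, 1, 'book'), (11, 1, 'laptop'), (12, 2, 'phone');
""")

# join the two tables on the shared key to find out where each order should go
sql = """SELECT o.order_id, o.product, c.name, c.address
         FROM orders AS o JOIN customers AS c ON o.customer_id = c.customer_id"""
for row in conn.execute(sql):
    print(row)
conn.close()
```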
Using filter and join 92 | operations (like in Chapter [-@sec-chap-datawrangling], but on the database directly), one can then easily retrieve 93 | information on where the order has to be shipped. A big advantage of 94 | such a relational database is that, if a customer places 100 orders, 95 | we do not need to store their address 100 times, but only once, which 96 | is not only more efficient in terms of storage, but also prevents 97 | inconsistencies in the data. 98 | 99 | In contrast to SQL databases, noSQL databases are not based on tables, 100 | but use concepts such as "documents" or key-value pairs, very much 101 | like Python dictionaries or JSON files. These types of databases are 102 | particularly interesting when your data are less well-structured. If 103 | not all of your cases have the same variables, or if the content is not 104 | well-defined (let's say, you don't know exactly in what format the date 105 | of publication on a news site will be written), or if the data structure 106 | may change over time, then it is hard or impossible to come up with a 107 | good table structure for an SQL database. Therefore, in many "big data" 108 | contexts, noSQL databases are used, as they -- depending on your 109 | configuration -- will happily accept almost any kind of content you dump 110 | in them. This comes, of course, at the expense of giving up advantages 111 | of SQL databases, such as the avoidance of inconsistencies. But often, 112 | you may want to store your data first and clean up later, rather 113 | than risking that data collection fails because you enforced an overly strict 114 | structure. Also, there are many noSQL databases that are very fast at 115 | searching full text -- something that SQL databases, in general, are 116 | not optimized for. 117 | 118 | Despite all of these differences, both SQL and noSQL databases can play 119 | the same role in the computational analysis of communication. They both 120 | help you to focus on data collection and data analysis without needing 121 | to devise an ingenious way to store your data. They both allow for much 122 | more efficient searching and filtering than you could design on your own. 123 | All of this becomes especially interesting when your dataset grows too 124 | large to fit in memory, but also when your data are continuously changing, 125 | for instance because new data are added while scraping. 126 | 127 | ### Choosing the Right Database {#sec-rightdb} 128 | 129 | Choosing the right database is not always easy, and has many consequences 130 | for the way you may conduct your analyses. As @Gunther2018 explain, 131 | this is not a purely technical choice, but impacts your social-scientific 132 | workflow. Do you want to enforce a specific structure from the very 133 | start, or do you want to collect everything first and clean up 134 | later? What is your trade-off between avoiding any inconsistency and risking 135 | throwing away too much raw information? 136 | 137 | Acknowledging that there are often many different valid choices, and at 138 | the risk of oversimplifying matters, we will try to give some guidance 139 | on which database to choose by offering some guiding questions. 140 | 141 | **How is your data structured?** 142 | Ask yourself: can I organize my data 143 | in a set of relational tables? For instance, think of television 144 | viewing data: there may be a table that gives information on when the 145 | television set was switched on and which channel was watched and by 146 | which user id.
A second table can be used to associate personal 147 | characteristics such as age and gender with the user id. And a third 148 | table may be used to map the time stamps to details about a specific 149 | program aired at the time. If your data looks like this, ask 150 | yourself: can I determine the columns and the data types for each 151 | column in advance? If so, then a SQL database such as *MySQL*, 152 | *PostgreSQL*, or *MariaDB* is probably what you are looking 153 | for. If, on the other hand, you cannot determine such a structure *a 154 | priori*, if you believe that the structure of your information will 155 | change over time, or if it is very messy, then you may need a more 156 | flexible noSQL approach, such as *MongoDB* or 157 | *ElasticSearch*. 158 | 159 | **How important is full-text searching for you?** 160 | SQL databases can handle numeric datatypes as well as text datatypes, but they are usually not optimized for the latter. They handle short strings (such as usernames, addresses, and so on) just fine, but if you are interested in full-text searching, they are not the right tool for the job. This is particularly true if you want to be able to do fuzzy searches where, for instance, documents containing the plural of a word that you searched for as singular are also found. Collections of, for instance, news articles, tweets, transcripts of speeches, or other documents are much better accessed in a database such as *ElasticSearch*. 161 | 162 | **How flexible does it need to be?** 163 | In relational databases, it is relatively hard to change the structure afterwards. In contrast, a noSQL database has no problem whatsoever with adding a new document that contains keys that did not exist before. There is no assumption that all documents contain the same keys. Therefore, if it is hard to tell in advance which "columns" or "keys" may represent your data best, you should steer clear of SQL databases. In particular, if you plan to gradually extend your data and re-use it over a longer timeline, potentially even across multiple teams, the flexibility of a noSQL database may be a game changer. 164 | 165 | ### A Brief Example Using SQLite {#sec-sqlite} 166 | 167 | Installing a database server such as *mysql*, *mariadb* (an 168 | open-source fork of mysql), *MongoDB*, or *Elasticsearch* is 169 | not really difficult (in fact, it may already come pre-packaged 170 | with your operating system), but the exact configuration and setup may 171 | differ widely depending on your computer and your needs. Most 172 | importantly, especially if you store sensitive data in your database, 173 | you will need to think about authentication, roles, etc. --- all 174 | beyond the scope of this book. 175 | 176 | Luckily, there is a compromise between storing your data in the files 177 | that you need to manage yourself and setting up a database server, 178 | locally or remotely. The library *SQLite* offers a self-contained 179 | database engine -- essentially, it allows you to store a whole 180 | database in one file and interact with it using the SQL query language. 181 | Both R and Python offer multiple ways of directly interacting with 182 | sqlite files (Example [-@exm-sqlite]). This gives you access to some great 183 | functionality straight away: after all, you can issue (almost) any SQL command 184 | now, including (and maybe most importantly) commands for filtering, 185 | joining, and aggregating data.
Or you could consider immediately writing 186 | each datapoint you get from an API or a webscraper (Chapter [-@sec-chap-scraping]) 187 | to the database, without risking losing any data if connections time out or scraping 188 | fails halfway. 189 | 190 | ::: {.callout-note appearance="simple" icon=false} 191 | 192 | ::: {#exm-sqlite} 193 | SQLite offers you database functionality without setting up a database server such as mysql. 194 | 195 | ::: {.panel-tabset} 196 | ## Python code 197 | ```{python sqlite-python} 198 | import pandas as pd 199 | import sqlite3 200 | 201 | # Load a dataframe 202 | url = "https://cssbook.net/d/gun-polls.csv" 203 | d = pd.read_csv(url) 204 | 205 | # connecting to a SQLite database 206 | conn = sqlite3.connect("mydb.db") 207 | # store the df as table "gunpolls" in the database 208 | d.to_sql("gunpolls", con=conn) 209 | 210 | # run a query on the SQLite database 211 | sql = """SELECT support, pollster 212 |          FROM gunpolls LIMIT 5;""" 213 | d2 = pd.read_sql_query(sql, conn) 214 | # close connection 215 | conn.close() 216 | d2 217 | 218 | ``` 219 | ## R code 220 | ```{r sqlite-r} 221 | library(tidyverse) 222 | library(RSQLite) 223 | 224 | # Load a dataframe 225 | url = "https://cssbook.net/d/gun-polls.csv" 226 | d = read_csv(url) 227 | 228 | # connecting to a SQLite database 229 | mydb = dbConnect(RSQLite::SQLite(), "mydb.sqlite") 230 | # store the df as table "gunpolls" in the database 231 | dbWriteTable(mydb, "gunpolls", d) 232 | 233 | # run a query on the SQLite database 234 | sql = "SELECT support, pollster 235 |        FROM gunpolls LIMIT 5;" 236 | d2 = dbGetQuery(mydb, sql) 237 | d2 238 | # close connection 239 | dbDisconnect(mydb) 240 | 241 | ``` 242 | ::: 243 | ::: 244 | ::: 245 | 246 | Of course, *SQLite* cannot give you the same performance as a "real" mysql (or similar) installation could offer. Therefore, if your project grows bigger, or if you have a lot of read- 247 | or 248 | write-operations per second, then you may have to switch at some 249 | point. But as you can see in Example [-@exm-sqlite], Python and R do not 250 | really care about the back end: all you need to do is to change the 251 | connection `conn` such that it points to your new database instead of 252 | the sqlite file. 253 | 254 | ## Using Cloud Computing {#sec-cloudcomputing} 255 | 256 | Throughout this book, we assumed that all tasks can actually be 257 | performed on your own computer. And often, that is indeed the best 258 | thing to do: you normally want to maintain a local copy of your data 259 | anyway, and it may be the safest bet for ethical and legal reasons -- 260 | when working with sensitive data, you need to know what you are doing 261 | before transferring them somewhere else. 262 | 263 | However, once you scale up your project, problems may arise (see @Trilling2018b): 264 | - Multiple people need to work on the same data 265 | - Your dataset is too large to fit on your disk 266 | - You do not have enough RAM or processing power 267 | - Running a process simply takes too long (e.g., training a model 268 | for several days) or needs to be run in continuous intervals (e.g., 269 | scraping news articles once an hour) and you need your computer for 270 | other things. 271 | 272 | This is the point where you need to start moving your project to some 273 | remote server instead. Broadly speaking, we can consider four 274 | scenarios: 275 | - A cloud service that just lets you run code. Here, you can just 276 | submit your code and have it run.
You do not have full control, you 277 | cannot set up your own system, but you also do not have to do any 278 | administration. 279 | - A dedicated server. You (or your university) could buy a 280 | dedicated, physical server to run computational social science 281 | analyses. On the bright side, this gives you full control, but it is 282 | also not very flexible: after all, you make a larger investment 283 | once, and if it turns out that you need more (or less) resources, 284 | then it might be too late to change. 285 | - A virtual machine (VM) on a cloud computing platform. For most 286 | practical purposes, you can do the same as in the previous option, 287 | with the crucial difference that you rent the resources. If you need 288 | more, you just rent more; and when you are done, you just stop the 289 | machine. 290 | - A set of machines to run complex tasks using parallel computing. With large amounts of information (think about image or video data) and sophisticated modeling (such as deep learning) you may need to distribute the computation among several different computers at the same time. 291 | 292 | An example of the first option is Google Colab. While it makes it easy to share and run notebooks, 293 | the free tier we used so far does not necessarily solve any of the 294 | scalability issues discussed. However, Google Colab also has a paid Pro 295 | version, in which you can use additional hardware (such as GPUs, TPUs, or extra memory) 296 | that you may not have on your own computer. This makes 297 | it an attractive solution for enabling projects (e.g., involving 298 | resource-intensive neural networks) that otherwise would not be possible. 299 | 300 | However, this is often not enough. For instance, you may want to run 301 | a database (Section [-@sec-databases]) or define a so-called *cron* job, 302 | which runs a specific script (e.g., a web scraper) at defined intervals. 303 | Here, options 2 and 3 come into play -- for most 304 | beginners, most realistically option 3. 305 | 306 | There are different providers for setting up VMs in the cloud, the 307 | most well-known probably being Amazon Web Services (AWS) and 308 | Microsoft Azure. Some universities or (national) research infrastructure 309 | providers offer high-performance computing in the cloud as well. 310 | While the specific way to set up a virtual machine of your own on 311 | such an infrastructure varies, the processes are roughly similar: 312 | you select the technical specifications such as the number of CPU 313 | cores and the amount of memory you need, attach some storage, and 314 | select a disk image with an operating system, virtually always some 315 | Linux distribution ([@fig-createvm], [@fig-createvm2]). 316 | After a couple of minutes, your machine is ready to use. 317 | 318 | ![Creating a Virtual Machine on Microsoft Azure](img/vmazure.png){#fig-createvm} 319 | 320 | ![Creating a Virtual Machine on a university cloud computing platform using OpenNebula](img/vmopennebula.png){#fig-createvm2} 321 | 322 | While setting up such a machine is easy, some knowledge is required 323 | for the responsible and safe administration of the machine, in 324 | particular to prevent unauthorized access. 325 | 326 | Imagine you have a script `myscript.py` that takes a couple of days to 327 | run. You can then use the tool `scp` to copy it to your new virtual 328 | machine, log on to your virtual machine using `ssh`, and then -- now 329 | on your virtual machine!
-- run the script using a tool such as 330 | `nohup` or `screen` that will start your script and keep it running 331 | (Figure [-@fig-ssh]). You can safely log out again, and your 332 | virtual machine in the cloud will keep on doing its work. The only 333 | thing you need to do is collect your results once your script is done, 334 | even if that's a couple of weeks later. Or you may want to add your 335 | script to the crontab (Google it!), which will automatically run 336 | it at set intervals. 337 | 338 | ![Running a script on a virtual machine. Note that the first two commands are issued on the local machine ('damian-thinkpad') and the next command on the remote machine ('packer-ubuntu-16').](img/ssh.png){#fig-ssh} 339 | 340 | You may want to have some extra luxury, though. Popular things to 341 | set up are databases (Section [-@sec-databases]) and *JupyterHub*, which 342 | allows users such as your colleagues to connect through their 343 | web browser with your server and run their own Jupyter Notebooks 344 | on the machine. Do not forget to properly encrypt all connections, 345 | for instance using *letsencrypt*. 346 | 347 | Finally, option 4 must be selected when the scale of your data and the complexity of the tasks are such that they cannot be handled by a single server or virtual machine. For example, building a classification model by training a complex and deep convolutional neural network with millions of images and updating this model constantly may require the use of different computers at the same time. Actually, in modern computers with multiple cores or processors you normally run parallel computing within a single machine. But when working at scale you will probably need to set up an infrastructure of different computers, such as a *grid* or a *cluster*. 348 | 349 | Cloud services (e.g., AWS, Microsoft Azure, etc.) or scientific infrastructures (e.g., supercomputers) offer the possibility to set up these architectures remotely. For instance, in a computer cluster you can configure a group of virtual computers, where one will act as a *main* node and the others as *workers*. With this logic, the main node can distribute the storage and analysis of data among the workers and then combine the results: see, for example, the *MapReduce* or the *Resilient Distributed Dataset* (RDD) approaches used by the open-source software *Apache Hadoop* and *Apache Spark*, respectively. For a specific example of parallel computing in the computational analysis of communication, you can take a look at the implementation of distributed supervised sentiment analysis, in which one of the authors of this book deployed supervised text classification in *Apache Spark* and connected this infrastructure with real-time analysis of tweets using *Apache Kafka* in order to perform streaming analytics [@calderon2019distributed]. 350 | 351 | These architectures for parallel processing will significantly increase your computation capacity for big data problems, but the initial implementation will cost time and (most of the time) money. You should therefore consider in advance whether a simpler solution (such as a single but powerful machine) would suffice before implementing a more complex infrastructure for your analysis. 352 | 353 | ## Publishing Your Source {#sec-publishingsource} 354 | 355 | Already in Section [-@sec-practices], we briefly introduced the idea of 356 | version control protocols such as *git*, and the most well-known 357 | online git repository *GitHub*.
358 | There are others, such as *Bitbucket* 359 | and the question of which one you use is not really of importance for our 360 | argument here. Already for small projects, it is a good idea to use 361 | version control so that you can always go back to earlier versions, 362 | but as soon as you start working with multiple people on one project, 363 | it becomes indispensable. 364 | 365 | In particular, it is possible to work on multiple *branches*, 366 | different versions of the code that can later be merged again. In this 367 | way, it is possible to develop new features without interfering with 368 | the main version of the code. There are plenty of git tutorials 369 | available online, and we highly recommended using git from the 370 | beginning of a specific project on -- be it your bachelor, master or doctoral thesis, a paper, or a tool that you want to create. 371 | 372 | In the computational analysis of communication, it is becoming more and 373 | more the norm to publish all your source code together with an 374 | article, even though it is important to keep in mind ethical and legal 375 | restrictions (@VanAtteveldt2019). Using a version control 376 | platform like GitHub from the beginning makes this easy: when 377 | publishing your paper, the only thing you have to do is to set access 378 | of your repository to "public" (in case it was private before), add 379 | a `README.md` file (in case you have not done so earlier), and 380 | preferably, get a persistent identifier, a `doi` for your 381 | code (see [guides.github.com/activities/citable-code/](https://guides.github.com/activities/citable-code/)). 382 | And don't forget to add a license to your code, such as MIT, GPL, or 383 | Apache. All of these have specific implications on what others can or 384 | cannot do with your code (e.g., whether it can be used for commercial 385 | purposes or whether derivatives need to be published under the same license as well). Whatever you choose here, it is important that you make a choice, as otherwise, it may not be (legally) possible to use 386 | your code at all. 387 | If your code pertains to a specific paper, then we suggest you organize 388 | your repository as a so-called "research compendium", integrating 389 | both your code and your data. 390 | @compendium provide a template and tools for easily creating one 391 | [^2]. 392 | 393 | In virtually all instances, your code will rely on libraries written 394 | by others, which are available free of charge. Therefore, 395 | it only seems fair to "give back" and make sure that any code that 396 | you wrote and that can be useful to others, is also available to them. 397 | 398 | Just like in the case of a research compendium for a specific paper, 399 | publishing source code for more generic reuse also begins with a 400 | GitHub repository. In fact, both R (with *devtools*) and Python 401 | (via *pip*) can install packages directly from GitHub. In order 402 | to make sure that your code can be installed as a package, you 403 | need to follow specific instructions on how to name files, how to 404 | structure your directory, and so on (see [packaging.python.org/tutorials/packaging-projects/](https://packaging.python.org/tutorials/packaging-projects/) 405 | and [r-pkgs.had.co.nz/](http://r-pkgs.had.co.nz/)). 406 | 407 | Regardless of these specific technical instructions, you can make 408 | sure from the outset, though, that your code is easily re-usable. 
409 | The checklist below can help making your code publishable from the 410 | outset. 411 | 412 | - Do not hard-code values. Rather than using `"myoutputfile.csv"` or `50` within your script, create constants like `OUTPUTFILE="myoutputfile"` and `NUMBER_OF_TOPICS=50` at the beginning of your script and use these variables instead of the values later on. Even better, let the user provide these arguments as command line arguments or via a configuration file. 413 | - Use functions. Rather than writing large scripts that are executed from the first line to the last in that order, structure the code in different functions that fulfill one specific task each, and can hence be reused. If you find yourself copy-pasting code, then most likely, you can write a function instead. 414 | - Document your code. Use docstrings (Python) or comments (R) to make clear what each function does. 415 | 416 | ## Distributing Your Software as Container {#sec-container} 417 | 418 | When publishing your software, you can think of multiple user 419 | groups. Some may be interested in building on and further developing 420 | your code. Some may not care about your code at all and just want your 421 | software to run. And many others will be somewhere in between. 422 | 423 | *Only* publishing your source code (Section [-@sec-publishingsource]) may 424 | be a burden for those who want your code to "just run" once your 425 | code becomes more complex and has more dependencies. Imagine a 426 | scenario where your software requires a specific version of 427 | Python or R and/or some very specific (or maybe incompatible) libraries 428 | that you do not want to force the user to install. 429 | 430 | And maybe your prospective user does not even know any R or Python. 431 | 432 | For such cases, so-called containers are the solution, with as most 433 | prominent platform *Docker*. You can envision a container as a 434 | minimalistic virtual machine that includes everything to run your 435 | software. To the outside, none of that is visible -- just a network 436 | port to connect to, or a command line to interact with, depending on 437 | your choices. 438 | 439 | Software that is containerized using Docker is distributed as a 440 | so-called *Docker image*. You can build such an image yourself, 441 | but it can also be distributed by pushing it to a so-called registry, 442 | such as the *Docker Hub*. If you publish your software this 443 | way, the end user has to do nothing other than installing Docker and 444 | running the command `docker run nameofyourimage` -- it will even be 445 | downloaded automatically if necessary. There are also GUI versions 446 | of Docker available, which lowers the threshold for some end user 447 | groups even more. 448 | 449 | Let's illustrate the typical workflow with a toy example. Imagine 450 | you wrote the following script, `myscript.py`: 451 | 452 | ```{python} 453 | #| eval: false 454 | import numpy as np 455 | from random import randint 456 | a = randint(0,10) 457 | print(f"exp({a}) = {np.exp(a)}") 458 | ``` 459 | 460 | You think that this is an awesome program (after all, it calculates 461 | $e$ to the power of a random integer!), and others should be able 462 | to use it. And you don't want to bother them with setting up Python, 463 | installing *numpy*, and then running the script. In fact, they do 464 | not even need to *know* that it's a Python program. You 465 | could have written it as well in R, or any other language -- for 466 | the user, that will make no difference at all. 
467 | 468 | What would a Docker image that runs this code need to contain? Not 469 | much: first some basic operating system (usually, a tiny Linux distribution), 470 | Python, numpy, and the script itself. 471 | 472 | To create such a Docker image, you create a file named `Dockerfile` 473 | in the same directory as your script with the following content: 474 | 475 | ``` 476 | FROM python:3 477 | ADD myscript.py / 478 | RUN pip install numpy 479 | CMD [ "python", "./myscript.py" ] 480 | ``` 481 | 482 | The first line tells Docker to build your new image by starting 483 | from an existing image that already contains an operating system 484 | and Python3. You could also start from scratch here, but this 485 | makes your life much easier. The next line adds your script to the 486 | image, and then we run `pip install numpy` within the image. 487 | The last line just specifies which command with which parameters 488 | needs to be executed when the image is run -- in our case 489 | `python ./myscript.py`. 490 | 491 | To create the image, you run `docker build -t dockertest .` (naming 492 | the image "dockertest"). After that, you can run 493 | it using `docker run dockertest` -- and, if you want to, publish it. 494 | 495 | Easy, right? 496 | 497 | But when does it make sense to use Docker? Not in our toy example, 498 | of course. While the original code is only a couple of bytes, it now 499 | got bloated to hundreds of megabytes. But there are plenty of 500 | scenarios where this makes a lot of sense. 501 | 502 | - To "abstract away" the inner workings of your code. Rather than giving potentially complicated instructions how to run your code, which dependencies to install, etc., you can just provide users with the Docker image, in which everything is already taken care of. 503 | - To ensure that users get the same results. Though it doesn't form a huge problem on a daily basis for most computational scientists, different versions of different libraries on different systems may occasionally produce slightly different results. The container ensures that the code is run using the same software setup. 504 | - To avoid interfering with existing installations. Already our toy example had a dependency, *numpy*, but often, dependencies can be more complex and a program we write may need very specific libraries, or even some other software beyond Python or R libraries. Distributing the source code alone means forcing the user to also install these; and there are many good reasons why people may be reluctant to do so. It may be incompatible with other software on their computer, there may be security concerns, or it just may be too much work. But if it runs inside of the Docker container, many of these problems disappear. 505 | 506 | In short, the Docker image is rarely the *only* way in which you distribute your source code. But already adding a Dockerfile to your GitHub repository so that users can build a Docker container can offer another and maybe better way of running your software to your audience. 507 | 508 | [^1]: In fact, this is 509 | sometimes a reason to avoid data frames: for instance, it is possible 510 | to use a generator that reads data line-by-line from a file and 511 | yields them to *scikit-learn*. In this way, only *one* row 512 | of data is in your memory at the same time (see Section [-@sec-functions]). 
513 | 514 | [^2]: See [compendium.ccs.amsterdam]([https://compendium.ccs.amsterdam]) 515 | 516 | ```{bash} 517 | #| echo: false 518 | rm -f mydb.db mydb.sqlite 519 | ``` 520 | -------------------------------------------------------------------------------- /content/chapter16.qmd: -------------------------------------------------------------------------------- 1 | # Where to go next {#sec-chap-wherenext} 2 | 3 | **Abstract.** 4 | This chapter summarizes the main learning goals of the book, and outlines possible next steps. Special attention is paid to an ethical application of computational methods, as well as to the importance of open and transparent science. 5 | 6 | **Keywords.** summary, open science, ethics 7 | 8 | **Objectives:** 9 | 10 | - Reflect on the learning goals of the book 11 | - Point out avenues for future study 12 | - Highlight ethical considerations for applying the techniques covered in the book 13 | - Relate the techniques covered in the book to Open Science practices 14 | 15 | ::: {.callout-note icon=false collapse=true} 16 | 17 | This concluding chapter provides a broad overview of what was covered so far, and what interesting avenues there are to explore next. It gives pointers to resources to learn more about topics such as programming, statistical modeling or deep learning. It also discusses considerations regarding ethics and open science. 18 | ::: 19 | 20 | ## How Far Have We Come? {#sec-howfarcome} 21 | In this book, we introduced you to the computational analysis of communication. In Chapter [-@sec-chap-introduction], we tried to convince you that the computational analysis of communication is a worthwhile endeavor -- and we also highlighted that there is much more to the subject than this book can cover. 22 | So here we are now. Maybe you skipped some chapters, maybe you did some additional reading or followed some online tutorials, and maybe you completed your first small project that involved some of techniques we covered. Time to recap. 23 | 24 | You now have some knowledge of programming. We hope that this has opened new doors for you, and allows you to use a wealth of libraries, tutorials, and tools that may make your life easier, your research more productive, and your analyses better. 25 | 26 | You have learned how to handle new types of data. Not only traditional tabular datasets, but also textual data, semi-structured data, and to some extent network data and images. 27 | 28 | You can apply machine-learning frameworks. You know about both unsupervised and supervised approaches, and can decide how they can be useful for finding answers to your research questions. 29 | 30 | Finally, you have got at least a first impression of some cool techniques like neural networks and services such as databases, containers, and cloud computing. We hope that being aware of them will help you to make an informed decision whether they may be good tools to dive into for your upcoming projects. 31 | 32 | ## Where To Go Next? {#sec-wheretogo} 33 | But what should you learn next? 34 | 35 | Most importantly, we cannot stress enough that it should be the research question that is the guide, not the method. You shouldn't use the newest neural network module just because it's cool, when counting occurrences of a simple regular expression does the trick. But this also applies the other way around: if a new method performs much better than an old one, you should learn it! 
For too long, for instance, people have relied on simple bag-of-words sentiment analyses with off-the-shelf dictionaries, simply because they were easy to use -- despite better alternatives being available. 36 | 37 | Having said that, we will nevertheless try to give some general recommendations for what to learn next. 38 | 39 | **Become better at programming.** In this book, we tried to find a compromise between teaching the programming concepts necessary to apply our methods on the one hand, and not getting overly technical on the other hand. After all, for many social scientists, programming is a means to an end, not a goal in itself. But as you progress, a deeper understanding of some programming concepts will make it easier for you to tailor everything according to your needs, and will -- again -- open new doors. There are countless books and online tutorials on "Programming in [Language of your choice]". In fact, in this "bilingual" book we have shown you how to program with R and Python (the most used languages by data scientists), but there are other programming languages that might also deserve your attention (e.g., Java, Scala, or Julia) if you become a computational scientist. 40 | 41 | **Learn how to write libraries.** A very specific yet widely applicable skill we'd encourage you to learn is writing your own packages ("modules" or "libraries"). One of the nice things about computational analyses is that they are very much compatible with an Open Science approach. Sharing what you have done is much easier if everything you did is already documented in some code that you can share. But you can go one step further: of course it is nice if people can exactly reproduce your analysis, but wouldn't it be even nicer if they could also use your code to run analyses using their own data? 42 | If you thought about a great way to compute some statistic, why not make it easy for others to do the same? 43 | Consider writing (and documenting!) your code in a general way and then publishing it on CRAN or PyPI so others can easily install and use it. 44 | 45 | **Get inspiration for new types of studies.** Try to think a bit out of the box and beyond classical surveys, experiments, and content analyses to design new studies. Books like *Bit by Bit* [@Salganik2019] may help you with this. You can also take a look at other scientific disciplines, such as computational biology, which has reinvented its methods, questions, and hypotheses. Keep in mind that computational methods have an impact on the theoretical and empirical discussions of communication processes, which in turn will call for novel types of studies. Emerging scientific fields such as Computational Communication Science, Computational Social Sciences, and Digital Humanities show how theory and methods can develop hand in hand. 46 | 47 | **Get a deeper understanding of deep learning.** For many tasks in the computational analysis of communication, classical machine learning approaches (like regression or support vector machines) work just fine. In fact, there is no need to always jump on the bandwagon of the newest technique. If a simple logistic regression achieves an F1-score of 88.1, and the fanciest neural network achieves 88.5 -- would it be worth the extra effort and the loss of explainability? It depends on your use case, but probably not. Nevertheless, by now, we can be fairly certain that neural networks and deep learning are here to stay.
We could only give a limited introduction in this book, but state-of-the-art analysis of text and especially of visual material can no longer do without it. Even though you may not train such models yourself all the time, and may instead use, for instance, pre-trained word embeddings or packages like *spacy* that have been trained using neural networks, it seems worthwhile to understand these techniques better. Here, too, many online tutorials exist for frameworks such as `keras` or `tensorflow`, as well as thorough books that provide a sound understanding of the underlying models [@goldberg2017]. 48 | 49 | **Learn more about statistical models.** Not everything in the computational analysis of communication is machine learning. We used the analogy of the mouse trap (where we only care about the performance, not the underlying mechanism) versus better prior understanding, and argued that often, we may use machine learning as a "mouse trap" to enrich our data -- even if we are ultimately interested in explaining some other process. For instance, we may want to use machine learning as one step in a workflow to predict the topic of social media messages, and then use a conventional statistical approach to understand which factors explain how often a message has been shared. Such data, though, often have different characteristics than data you may encounter in surveys or experiments. In this case, for instance, the number of shares is a so-called count variable: it can take only non-negative integers, and thus has a lower bound (0) but no upper bound. That is very different from normally distributed data and requires regression models such as negative binomial regression. That's not difficult to do, but worth reading up on. Similarly, multilevel modeling will often be appropriate for the data you work with. Being familiar with this and other techniques (such as mediation and moderation analysis, or even structural equation modeling) will allow you to make better choices. On a different note, you may want to familiarize yourself with Bayesian statistics -- a framework that is very different from the so-called frequentist approach that you probably know from your statistics courses. 50 | 51 | And, last but not least: have fun! At least for us, that is one of the most important parts: don't forget to enjoy the skills you gained, and create projects that you enjoy! 52 | 53 | ## Open, Transparent, and Ethical Computational Science {#sec-ethics} 54 | 55 | We started this book by reflecting on what we are actually doing when conducting computational analyses of communication.
56 | One of the things we highlighted in Chapter [-@sec-chap-introduction] was our use of open-source tools, in particular Python and R
57 | and the wealth of open-source libraries that extend them. Hopefully, you have also realized not only how much your work could
58 | therefore build on the work of others, but also how many of the resources you used were created as a community effort. 59 | 60 | Now that you have acquired the knowledge it takes to conduct computational research on communication, it is time to reflect
61 | on how to give back to the community, and how to contribute to an open research environment. At the same time, it is
62 | not as simple as "just putting everything online" -- after all, researchers often work with sensitive data. We therefore
63 | conclude this book with a short discussion on open, transparent, and ethical computational science.
64 | 65 | **Transparent and Open Science.**
66 | In the wake of the so-called reproducibility crisis, the call for transparent and open science has become louder and louder
67 | in recent years. The public, funders, and journals increasingly ask for access to the data and analysis scripts that underlie
68 | published research. Of course, publishing your data and code is not a panacea for all problems, but it is a step towards
69 | better science from at least two perspectives [@VanAtteveldt2019]: first, it allows others to reproduce your work, enhancing its credibility
70 | (and the credibility of the field as a whole). Second, it allows others to build on your work without reinventing the wheel. 71 | 72 | So, how can you contribute to this? Most importantly, as we advised in Section [-@sec-practices]: use a version control system
73 | and share your code on a site like github.com. We also discussed code-sharing possibilities in Section [-@sec-publishingsource]. Finally, you can find a template for organizing your code and data so that your
74 | research is easy to reproduce at [github.com/ccs-amsterdam/compendium](https://github.com/ccs-amsterdam/compendium). 75 | 76 | **The privacy--transparency trade-off.** While the sharing of code is not particularly controversial, the sharing of
77 | data sometimes is. In particular, you may deal with data that contain personally identifiable information. On the one
78 | hand, you should share your data to make sure that your work can be reproduced -- on the other hand, it would be ethically
79 | (and depending on your jurisdiction, potentially also legally) wrong to share personal data about individuals.
80 | As @boyd2012 write: "Just because it is accessible does not make it ethical." Hence, the
81 | situation is not always black or white, and some techniques exist to find a balance between the two: you can remove
82 | (or hash) information such as usernames, you can aggregate your data, or you can add artificial noise (a minimal sketch of hashing usernames is included at the end of this chapter). Ideally, you should
83 | integrate legal, ethical, and technical considerations to make an informed decision on how to find a balance such that
84 | transparency is maximized while privacy risks are minimized. A growing body of literature explores the different possibilities [@Breuer2020]. 85 | 86 | **Other Ethical Challenges in Computational Analyses.**
87 | Lastly, there are also other ethical challenges that go beyond the use of privacy-sensitive data. Many tools we
88 | use give us great power, and with that comes great responsibility. For instance, as we highlighted in
89 | Section [-@sec-ethicallegalpractical], every time we scrape a website, we cause some costs somewhere. These may be
90 | negligible for a single HTTP request, but they can add up. Similarly, computations on a cloud service
91 | have environmental costs. Before starting a large-scale project, we should therefore weigh
92 | the costs or damage we cause against the (scientific) gain that we achieve. 93 | 94 | In the end, though, we firmly believe that as computational scientists, we are well-equipped to contribute to the move towards
95 | more ethical, open, and transparent science. Let's do it!
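To make the hashing option mentioned above concrete, here is a minimal, purely illustrative sketch in R. The data frame `d`, its `username` column, and the salt value are hypothetical and not part of any dataset used in this book; the `digest` package is just one of several ways to compute such hashes.

```{r}
#| eval: false
library(digest)

# Hypothetical example: replace raw usernames by salted SHA-256 hashes before
# sharing the data, so accounts are no longer directly identifiable.
salt <- "replace-with-a-secret-project-specific-string"  # keep this value private
d$user_id <- vapply(
  d$username,
  function(u) digest(paste0(salt, u), algo = "sha256"),
  character(1)
)
d$username <- NULL  # drop the raw identifier from the version you share
```

Note that hashing alone does not guarantee anonymity (rare combinations of the remaining variables may still identify a person), so it should always be combined with the legal and ethical considerations discussed above.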
96 | -------------------------------------------------------------------------------- /content/common_setup.qmd: -------------------------------------------------------------------------------- 1 | ```{python} 2 | #| echo: false 3 | import warnings; warnings.filterwarnings('ignore') 4 | def print(x, *args, **kargs): 5 | x = str(x) 6 | for line in x.split("\n"): 7 | if len(line) > 75: 8 | line = line[:75] + "..." 9 | __builtins__.print(line, *args, **kargs) 10 | ``` 11 | 12 | 13 | ```{r interceptprint} 14 | #| echo: false 15 | print = function(x, ...) { 16 | if (is.list(x) & !is.null(names(x)) ) { 17 | base::print(x[names(x)[1:3]]) 18 | cat("...") 19 | } else { 20 | if (length(x) == 1 && nchar(x) > 70) x = paste0(substr(x, 1, 70), "...") 21 | base::print(x, ...) 22 | } 23 | } 24 | ``` -------------------------------------------------------------------------------- /content/img/ch04_colab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch04_colab.png -------------------------------------------------------------------------------- /content/img/ch04_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch04_github.png -------------------------------------------------------------------------------- /content/img/ch04_notebooks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch04_notebooks.png -------------------------------------------------------------------------------- /content/img/ch07_figjoins.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch07_figjoins.png -------------------------------------------------------------------------------- /content/img/ch09_cnn_cropped.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch09_cnn_cropped.png -------------------------------------------------------------------------------- /content/img/ch09_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch09_matrix.png -------------------------------------------------------------------------------- /content/img/ch09_overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch09_overfitting.png -------------------------------------------------------------------------------- /content/img/ch09_roccurve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch09_roccurve.png -------------------------------------------------------------------------------- /content/img/ch15_fashion.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_fashion.png -------------------------------------------------------------------------------- /content/img/ch15_location.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_location.png -------------------------------------------------------------------------------- /content/img/ch15_numbers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_numbers.png -------------------------------------------------------------------------------- /content/img/ch15_pixel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_pixel.png -------------------------------------------------------------------------------- /content/img/ch15_refugees.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_refugees.png -------------------------------------------------------------------------------- /content/img/ch15_yolo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch15_yolo.png -------------------------------------------------------------------------------- /content/img/ch3_r_studio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch3_r_studio.png -------------------------------------------------------------------------------- /content/img/ch4_stackover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch4_stackover.png -------------------------------------------------------------------------------- /content/img/ch6_csv-in-editor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ch6_csv-in-editor.png -------------------------------------------------------------------------------- /content/img/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/favicon.png -------------------------------------------------------------------------------- /content/img/fig_decisiontree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/fig_decisiontree.png -------------------------------------------------------------------------------- /content/img/fig_hiddenlayers.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/fig_hiddenlayers.png -------------------------------------------------------------------------------- /content/img/fig_perceptron.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/fig_perceptron.png -------------------------------------------------------------------------------- /content/img/fig_sigmoid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/fig_sigmoid.png -------------------------------------------------------------------------------- /content/img/jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/jupyter.png -------------------------------------------------------------------------------- /content/img/lda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/lda.png -------------------------------------------------------------------------------- /content/img/messy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/messy.png -------------------------------------------------------------------------------- /content/img/ssh.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/ssh.png -------------------------------------------------------------------------------- /content/img/vmazure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/vmazure.png -------------------------------------------------------------------------------- /content/img/vmopennebula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanatteveldt/cssbook/2b7f5506ad2932c48fbec6c56678bcd72e42f6c2/content/img/vmopennebula.png -------------------------------------------------------------------------------- /content/references.qmd: -------------------------------------------------------------------------------- 1 | # References {.unnumbered} 2 | 3 | ::: {#refs} 4 | ::: -------------------------------------------------------------------------------- /index.qmd: -------------------------------------------------------------------------------- 1 | This is the online version of the book *Computational Analysis of Communication* published with [Wiley-Blackwell](https://www.wiley.com/en-us/Computational+Analysis+of+Communication-p-9781119680239). 2 | To buy a hard copy or eBook version of the book, please visit your local academic or independent bookstore or use the link above to order. 3 | 4 | ::: {.callout-warning} 5 | This is the draft version of the book, which is actively being updated. 
6 | The open access edition of the published version is available at [cssbook.net](https://cssbook.net)
7 | ::: 8 | 9 | 10 | 11 | :::: {.columns} 12 | 13 | ::: {.column width="50%"}
14 | ![Computational Analysis of Communication](https://media.wiley.com/product_data/coverImage300/39/11196802/1119680239.jpg)
15 | ::: 16 | 17 | ::: {.column width="50%"}
18 | #### Table of Contents {.unnumbered}
19 | - [Introduction](content/chapter01.qmd) 20 | - [Fun with Data](content/chapter02.qmd) 21 | - [Programming Concepts](content/chapter03.qmd) 22 | - [How to write code](content/chapter04.qmd) 23 | - [Files and Data Frames](content/chapter05.qmd) 24 | - [Data Wrangling](content/chapter06.qmd) 25 | - [Exploratory data analysis](content/chapter07.qmd) 26 | - [Machine Learning](content/chapter08.qmd) 27 | - [Processing text](content/chapter09.qmd) 28 | - [Text as data](content/chapter10.qmd) 29 | - [Automatic analysis of text](content/chapter11.qmd) 30 | - [Scraping online data](content/chapter12.qmd) 31 | - [Network Data](content/chapter13.qmd) 32 | - [Multimedia data](content/chapter14.qmd) 33 | - [Scaling up and distributing](content/chapter15.qmd) 34 | - [Where to go next](content/chapter16.qmd) 35 | ::: 36 | :::: 37 | 38 | This website contains the full contents (text, code examples, and figures) of the book and is (and will be) available completely free and open access. We hope that this will make computational techniques accessible (and fun!) to as many students and researchers as possible, regardless of means and institutional support. We also hope that this will make it easy for students and professors to use a subset of chapters without forcing students to buy the whole book. We would really like to thank Wiley-Blackwell for their confidence in making this open access option possible. 39 | 40 | ## Acknowledgements and contributing 41 | 42 | We would like to thank colleagues, friends, and students who provided feedback and input on earlier versions of parts of the manuscript: Dmitry Bogdanov, Andreu Casas, Modesto Escobar, Anne Kroon, Nicolas Mattis, Cecil Meeusen, Jesús Sánchez-Oro, Nel Ruigrok, Susan Vermeer, Mehdi Zamani, and Marthe Möller. Of course, we also want to thank all others whom we might have forgotten to mention here (sorry!) -- please contact us if you feel that your name should be here. 43 | 44 | Our intention is for the online version to be a 'living document' that we will update as tools (or our insights) change, and that will hopefully serve as the basis for a second edition in the future. For that reason, **all feedback is highly appreciated**! 45 | 46 | What can you do to help: 47 | 48 | - Create a [github issue](https://github.com/vanatteveldt/cssbook/issues) if you see any errors, find anything hard to understand, or have any other suggestions or feedback.
49 | - Fix typos or other issues directly by editing the relevant file on [github](https://github.com/vanatteveldt/cssbook/) and creating a pull request 50 | 51 | Any contributors will be acknowledged on this page and in a possible second edition of the book.
52 | If you are willing to contribute in a more substantial way (e.g. rewrite or add an entire chapter), please [get in touch](mailto:wouter.van.atteveldt@vu.nl) with us and we can work something out. 53 | 54 | ## Citing this book 55 | 56 | To cite this book, use: 57 | 58 | Van Atteveldt, W., Trilling, D., & Arcila Calderón, C. (2022). *Computational Analysis of Communication*. Wiley Blackwell.
59 | 60 | Bibtex: 61 | 62 | ``` 63 | @book{vanatteveldt2022computational, 64 | title={Computational Analysis of Communication}, 65 | author={{Van Atteveldt}, Wouter and Trilling, Damian and Arcila Calder{\'o}n, Carlos}, 66 | year={2022}, 67 | publisher={Wiley Blackwell} 68 | } 69 | ``` 70 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | local/ 3 | cellar/ 4 | lock/ 5 | python/ 6 | sandbox/ 7 | staging/ 8 | -------------------------------------------------------------------------------- /renv/activate.R: -------------------------------------------------------------------------------- 1 | 2 | local({ 3 | 4 | # the requested version of renv 5 | version <- "0.16.0" 6 | 7 | # the project directory 8 | project <- getwd() 9 | 10 | # figure out whether the autoloader is enabled 11 | enabled <- local({ 12 | 13 | # first, check config option 14 | override <- getOption("renv.config.autoloader.enabled") 15 | if (!is.null(override)) 16 | return(override) 17 | 18 | # next, check environment variables 19 | # TODO: prefer using the configuration one in the future 20 | envvars <- c( 21 | "RENV_CONFIG_AUTOLOADER_ENABLED", 22 | "RENV_AUTOLOADER_ENABLED", 23 | "RENV_ACTIVATE_PROJECT" 24 | ) 25 | 26 | for (envvar in envvars) { 27 | envval <- Sys.getenv(envvar, unset = NA) 28 | if (!is.na(envval)) 29 | return(tolower(envval) %in% c("true", "t", "1")) 30 | } 31 | 32 | # enable by default 33 | TRUE 34 | 35 | }) 36 | 37 | if (!enabled) 38 | return(FALSE) 39 | 40 | # avoid recursion 41 | if (identical(getOption("renv.autoloader.running"), TRUE)) { 42 | warning("ignoring recursive attempt to run renv autoloader") 43 | return(invisible(TRUE)) 44 | } 45 | 46 | # signal that we're loading renv during R startup 47 | options(renv.autoloader.running = TRUE) 48 | on.exit(options(renv.autoloader.running = NULL), add = TRUE) 49 | 50 | # signal that we've consented to use renv 51 | options(renv.consent = TRUE) 52 | 53 | # load the 'utils' package eagerly -- this ensures that renv shims, which 54 | # mask 'utils' packages, will come first on the search path 55 | library(utils, lib.loc = .Library) 56 | 57 | # unload renv if it's already been loaded 58 | if ("renv" %in% loadedNamespaces()) 59 | unloadNamespace("renv") 60 | 61 | # load bootstrap tools 62 | `%||%` <- function(x, y) { 63 | if (is.environment(x) || length(x)) x else y 64 | } 65 | 66 | bootstrap <- function(version, library) { 67 | 68 | # attempt to download renv 69 | tarball <- tryCatch(renv_bootstrap_download(version), error = identity) 70 | if (inherits(tarball, "error")) 71 | stop("failed to download renv ", version) 72 | 73 | # now attempt to install 74 | status <- tryCatch(renv_bootstrap_install(version, tarball, library), error = identity) 75 | if (inherits(status, "error")) 76 | stop("failed to install renv ", version) 77 | 78 | } 79 | 80 | renv_bootstrap_tests_running <- function() { 81 | getOption("renv.tests.running", default = FALSE) 82 | } 83 | 84 | renv_bootstrap_repos <- function() { 85 | 86 | # check for repos override 87 | repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) 88 | if (!is.na(repos)) 89 | return(repos) 90 | 91 | # check for lockfile repositories 92 | repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) 93 | if (!inherits(repos, "error") && length(repos)) 94 | return(repos) 95 | 96 | # if we're testing, re-use the test repositories 97 | if 
(renv_bootstrap_tests_running()) 98 | return(getOption("renv.tests.repos")) 99 | 100 | # retrieve current repos 101 | repos <- getOption("repos") 102 | 103 | # ensure @CRAN@ entries are resolved 104 | repos[repos == "@CRAN@"] <- getOption( 105 | "renv.repos.cran", 106 | "https://cloud.r-project.org" 107 | ) 108 | 109 | # add in renv.bootstrap.repos if set 110 | default <- c(FALLBACK = "https://cloud.r-project.org") 111 | extra <- getOption("renv.bootstrap.repos", default = default) 112 | repos <- c(repos, extra) 113 | 114 | # remove duplicates that might've snuck in 115 | dupes <- duplicated(repos) | duplicated(names(repos)) 116 | repos[!dupes] 117 | 118 | } 119 | 120 | renv_bootstrap_repos_lockfile <- function() { 121 | 122 | lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") 123 | if (!file.exists(lockpath)) 124 | return(NULL) 125 | 126 | lockfile <- tryCatch(renv_json_read(lockpath), error = identity) 127 | if (inherits(lockfile, "error")) { 128 | warning(lockfile) 129 | return(NULL) 130 | } 131 | 132 | repos <- lockfile$R$Repositories 133 | if (length(repos) == 0) 134 | return(NULL) 135 | 136 | keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) 137 | vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) 138 | names(vals) <- keys 139 | 140 | return(vals) 141 | 142 | } 143 | 144 | renv_bootstrap_download <- function(version) { 145 | 146 | # if the renv version number has 4 components, assume it must 147 | # be retrieved via github 148 | nv <- numeric_version(version) 149 | components <- unclass(nv)[[1]] 150 | 151 | # if this appears to be a development version of 'renv', we'll 152 | # try to restore from github 153 | dev <- length(components) == 4L 154 | 155 | # begin collecting different methods for finding renv 156 | methods <- c( 157 | renv_bootstrap_download_tarball, 158 | if (dev) 159 | renv_bootstrap_download_github 160 | else c( 161 | renv_bootstrap_download_cran_latest, 162 | renv_bootstrap_download_cran_archive 163 | ) 164 | ) 165 | 166 | for (method in methods) { 167 | path <- tryCatch(method(version), error = identity) 168 | if (is.character(path) && file.exists(path)) 169 | return(path) 170 | } 171 | 172 | stop("failed to download renv ", version) 173 | 174 | } 175 | 176 | renv_bootstrap_download_impl <- function(url, destfile) { 177 | 178 | mode <- "wb" 179 | 180 | # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 181 | fixup <- 182 | Sys.info()[["sysname"]] == "Windows" && 183 | substring(url, 1L, 5L) == "file:" 184 | 185 | if (fixup) 186 | mode <- "w+b" 187 | 188 | args <- list( 189 | url = url, 190 | destfile = destfile, 191 | mode = mode, 192 | quiet = TRUE 193 | ) 194 | 195 | if ("headers" %in% names(formals(utils::download.file))) 196 | args$headers <- renv_bootstrap_download_custom_headers(url) 197 | 198 | do.call(utils::download.file, args) 199 | 200 | } 201 | 202 | renv_bootstrap_download_custom_headers <- function(url) { 203 | 204 | headers <- getOption("renv.download.headers") 205 | if (is.null(headers)) 206 | return(character()) 207 | 208 | if (!is.function(headers)) 209 | stopf("'renv.download.headers' is not a function") 210 | 211 | headers <- headers(url) 212 | if (length(headers) == 0L) 213 | return(character()) 214 | 215 | if (is.list(headers)) 216 | headers <- unlist(headers, recursive = FALSE, use.names = TRUE) 217 | 218 | ok <- 219 | is.character(headers) && 220 | is.character(names(headers)) && 221 | all(nzchar(names(headers))) 222 | 223 | if (!ok) 224 | stop("invocation of 'renv.download.headers' did not 
return a named character vector") 225 | 226 | headers 227 | 228 | } 229 | 230 | renv_bootstrap_download_cran_latest <- function(version) { 231 | 232 | spec <- renv_bootstrap_download_cran_latest_find(version) 233 | type <- spec$type 234 | repos <- spec$repos 235 | 236 | message("* Downloading renv ", version, " ... ", appendLF = FALSE) 237 | 238 | baseurl <- utils::contrib.url(repos = repos, type = type) 239 | ext <- if (identical(type, "source")) 240 | ".tar.gz" 241 | else if (Sys.info()[["sysname"]] == "Windows") 242 | ".zip" 243 | else 244 | ".tgz" 245 | name <- sprintf("renv_%s%s", version, ext) 246 | url <- paste(baseurl, name, sep = "/") 247 | 248 | destfile <- file.path(tempdir(), name) 249 | status <- tryCatch( 250 | renv_bootstrap_download_impl(url, destfile), 251 | condition = identity 252 | ) 253 | 254 | if (inherits(status, "condition")) { 255 | message("FAILED") 256 | return(FALSE) 257 | } 258 | 259 | # report success and return 260 | message("OK (downloaded ", type, ")") 261 | destfile 262 | 263 | } 264 | 265 | renv_bootstrap_download_cran_latest_find <- function(version) { 266 | 267 | # check whether binaries are supported on this system 268 | binary <- 269 | getOption("renv.bootstrap.binary", default = TRUE) && 270 | !identical(.Platform$pkgType, "source") && 271 | !identical(getOption("pkgType"), "source") && 272 | Sys.info()[["sysname"]] %in% c("Darwin", "Windows") 273 | 274 | types <- c(if (binary) "binary", "source") 275 | 276 | # iterate over types + repositories 277 | for (type in types) { 278 | for (repos in renv_bootstrap_repos()) { 279 | 280 | # retrieve package database 281 | db <- tryCatch( 282 | as.data.frame( 283 | utils::available.packages(type = type, repos = repos), 284 | stringsAsFactors = FALSE 285 | ), 286 | error = identity 287 | ) 288 | 289 | if (inherits(db, "error")) 290 | next 291 | 292 | # check for compatible entry 293 | entry <- db[db$Package %in% "renv" & db$Version %in% version, ] 294 | if (nrow(entry) == 0) 295 | next 296 | 297 | # found it; return spec to caller 298 | spec <- list(entry = entry, type = type, repos = repos) 299 | return(spec) 300 | 301 | } 302 | } 303 | 304 | # if we got here, we failed to find renv 305 | fmt <- "renv %s is not available from your declared package repositories" 306 | stop(sprintf(fmt, version)) 307 | 308 | } 309 | 310 | renv_bootstrap_download_cran_archive <- function(version) { 311 | 312 | name <- sprintf("renv_%s.tar.gz", version) 313 | repos <- renv_bootstrap_repos() 314 | urls <- file.path(repos, "src/contrib/Archive/renv", name) 315 | destfile <- file.path(tempdir(), name) 316 | 317 | message("* Downloading renv ", version, " ... 
", appendLF = FALSE) 318 | 319 | for (url in urls) { 320 | 321 | status <- tryCatch( 322 | renv_bootstrap_download_impl(url, destfile), 323 | condition = identity 324 | ) 325 | 326 | if (identical(status, 0L)) { 327 | message("OK") 328 | return(destfile) 329 | } 330 | 331 | } 332 | 333 | message("FAILED") 334 | return(FALSE) 335 | 336 | } 337 | 338 | renv_bootstrap_download_tarball <- function(version) { 339 | 340 | # if the user has provided the path to a tarball via 341 | # an environment variable, then use it 342 | tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) 343 | if (is.na(tarball)) 344 | return() 345 | 346 | # allow directories 347 | info <- file.info(tarball, extra_cols = FALSE) 348 | if (identical(info$isdir, TRUE)) { 349 | name <- sprintf("renv_%s.tar.gz", version) 350 | tarball <- file.path(tarball, name) 351 | } 352 | 353 | # bail if it doesn't exist 354 | if (!file.exists(tarball)) { 355 | 356 | # let the user know we weren't able to honour their request 357 | fmt <- "* RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." 358 | msg <- sprintf(fmt, tarball) 359 | warning(msg) 360 | 361 | # bail 362 | return() 363 | 364 | } 365 | 366 | fmt <- "* Bootstrapping with tarball at path '%s'." 367 | msg <- sprintf(fmt, tarball) 368 | message(msg) 369 | 370 | tarball 371 | 372 | } 373 | 374 | renv_bootstrap_download_github <- function(version) { 375 | 376 | enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") 377 | if (!identical(enabled, "TRUE")) 378 | return(FALSE) 379 | 380 | # prepare download options 381 | pat <- Sys.getenv("GITHUB_PAT") 382 | if (nzchar(Sys.which("curl")) && nzchar(pat)) { 383 | fmt <- "--location --fail --header \"Authorization: token %s\"" 384 | extra <- sprintf(fmt, pat) 385 | saved <- options("download.file.method", "download.file.extra") 386 | options(download.file.method = "curl", download.file.extra = extra) 387 | on.exit(do.call(base::options, saved), add = TRUE) 388 | } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { 389 | fmt <- "--header=\"Authorization: token %s\"" 390 | extra <- sprintf(fmt, pat) 391 | saved <- options("download.file.method", "download.file.extra") 392 | options(download.file.method = "wget", download.file.extra = extra) 393 | on.exit(do.call(base::options, saved), add = TRUE) 394 | } 395 | 396 | message("* Downloading renv ", version, " from GitHub ... ", appendLF = FALSE) 397 | 398 | url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) 399 | name <- sprintf("renv_%s.tar.gz", version) 400 | destfile <- file.path(tempdir(), name) 401 | 402 | status <- tryCatch( 403 | renv_bootstrap_download_impl(url, destfile), 404 | condition = identity 405 | ) 406 | 407 | if (!identical(status, 0L)) { 408 | message("FAILED") 409 | return(FALSE) 410 | } 411 | 412 | message("OK") 413 | return(destfile) 414 | 415 | } 416 | 417 | renv_bootstrap_install <- function(version, tarball, library) { 418 | 419 | # attempt to install it into project library 420 | message("* Installing renv ", version, " ... 
", appendLF = FALSE) 421 | dir.create(library, showWarnings = FALSE, recursive = TRUE) 422 | 423 | # invoke using system2 so we can capture and report output 424 | bin <- R.home("bin") 425 | exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" 426 | r <- file.path(bin, exe) 427 | 428 | args <- c( 429 | "--vanilla", "CMD", "INSTALL", "--no-multiarch", 430 | "-l", shQuote(path.expand(library)), 431 | shQuote(path.expand(tarball)) 432 | ) 433 | 434 | output <- system2(r, args, stdout = TRUE, stderr = TRUE) 435 | message("Done!") 436 | 437 | # check for successful install 438 | status <- attr(output, "status") 439 | if (is.numeric(status) && !identical(status, 0L)) { 440 | header <- "Error installing renv:" 441 | lines <- paste(rep.int("=", nchar(header)), collapse = "") 442 | text <- c(header, lines, output) 443 | writeLines(text, con = stderr()) 444 | } 445 | 446 | status 447 | 448 | } 449 | 450 | renv_bootstrap_platform_prefix <- function() { 451 | 452 | # construct version prefix 453 | version <- paste(R.version$major, R.version$minor, sep = ".") 454 | prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") 455 | 456 | # include SVN revision for development versions of R 457 | # (to avoid sharing platform-specific artefacts with released versions of R) 458 | devel <- 459 | identical(R.version[["status"]], "Under development (unstable)") || 460 | identical(R.version[["nickname"]], "Unsuffered Consequences") 461 | 462 | if (devel) 463 | prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") 464 | 465 | # build list of path components 466 | components <- c(prefix, R.version$platform) 467 | 468 | # include prefix if provided by user 469 | prefix <- renv_bootstrap_platform_prefix_impl() 470 | if (!is.na(prefix) && nzchar(prefix)) 471 | components <- c(prefix, components) 472 | 473 | # build prefix 474 | paste(components, collapse = "/") 475 | 476 | } 477 | 478 | renv_bootstrap_platform_prefix_impl <- function() { 479 | 480 | # if an explicit prefix has been supplied, use it 481 | prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) 482 | if (!is.na(prefix)) 483 | return(prefix) 484 | 485 | # if the user has requested an automatic prefix, generate it 486 | auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) 487 | if (auto %in% c("TRUE", "True", "true", "1")) 488 | return(renv_bootstrap_platform_prefix_auto()) 489 | 490 | # empty string on failure 491 | "" 492 | 493 | } 494 | 495 | renv_bootstrap_platform_prefix_auto <- function() { 496 | 497 | prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) 498 | if (inherits(prefix, "error") || prefix %in% "unknown") { 499 | 500 | msg <- paste( 501 | "failed to infer current operating system", 502 | "please file a bug report at https://github.com/rstudio/renv/issues", 503 | sep = "; " 504 | ) 505 | 506 | warning(msg) 507 | 508 | } 509 | 510 | prefix 511 | 512 | } 513 | 514 | renv_bootstrap_platform_os <- function() { 515 | 516 | sysinfo <- Sys.info() 517 | sysname <- sysinfo[["sysname"]] 518 | 519 | # handle Windows + macOS up front 520 | if (sysname == "Windows") 521 | return("windows") 522 | else if (sysname == "Darwin") 523 | return("macos") 524 | 525 | # check for os-release files 526 | for (file in c("/etc/os-release", "/usr/lib/os-release")) 527 | if (file.exists(file)) 528 | return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) 529 | 530 | # check for redhat-release files 531 | if (file.exists("/etc/redhat-release")) 532 | return(renv_bootstrap_platform_os_via_redhat_release()) 533 | 534 | 
"unknown" 535 | 536 | } 537 | 538 | renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { 539 | 540 | # read /etc/os-release 541 | release <- utils::read.table( 542 | file = file, 543 | sep = "=", 544 | quote = c("\"", "'"), 545 | col.names = c("Key", "Value"), 546 | comment.char = "#", 547 | stringsAsFactors = FALSE 548 | ) 549 | 550 | vars <- as.list(release$Value) 551 | names(vars) <- release$Key 552 | 553 | # get os name 554 | os <- tolower(sysinfo[["sysname"]]) 555 | 556 | # read id 557 | id <- "unknown" 558 | for (field in c("ID", "ID_LIKE")) { 559 | if (field %in% names(vars) && nzchar(vars[[field]])) { 560 | id <- vars[[field]] 561 | break 562 | } 563 | } 564 | 565 | # read version 566 | version <- "unknown" 567 | for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { 568 | if (field %in% names(vars) && nzchar(vars[[field]])) { 569 | version <- vars[[field]] 570 | break 571 | } 572 | } 573 | 574 | # join together 575 | paste(c(os, id, version), collapse = "-") 576 | 577 | } 578 | 579 | renv_bootstrap_platform_os_via_redhat_release <- function() { 580 | 581 | # read /etc/redhat-release 582 | contents <- readLines("/etc/redhat-release", warn = FALSE) 583 | 584 | # infer id 585 | id <- if (grepl("centos", contents, ignore.case = TRUE)) 586 | "centos" 587 | else if (grepl("redhat", contents, ignore.case = TRUE)) 588 | "redhat" 589 | else 590 | "unknown" 591 | 592 | # try to find a version component (very hacky) 593 | version <- "unknown" 594 | 595 | parts <- strsplit(contents, "[[:space:]]")[[1L]] 596 | for (part in parts) { 597 | 598 | nv <- tryCatch(numeric_version(part), error = identity) 599 | if (inherits(nv, "error")) 600 | next 601 | 602 | version <- nv[1, 1] 603 | break 604 | 605 | } 606 | 607 | paste(c("linux", id, version), collapse = "-") 608 | 609 | } 610 | 611 | renv_bootstrap_library_root_name <- function(project) { 612 | 613 | # use project name as-is if requested 614 | asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") 615 | if (asis) 616 | return(basename(project)) 617 | 618 | # otherwise, disambiguate based on project's path 619 | id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) 620 | paste(basename(project), id, sep = "-") 621 | 622 | } 623 | 624 | renv_bootstrap_library_root <- function(project) { 625 | 626 | prefix <- renv_bootstrap_profile_prefix() 627 | 628 | path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) 629 | if (!is.na(path)) 630 | return(paste(c(path, prefix), collapse = "/")) 631 | 632 | path <- renv_bootstrap_library_root_impl(project) 633 | if (!is.null(path)) { 634 | name <- renv_bootstrap_library_root_name(project) 635 | return(paste(c(path, prefix, name), collapse = "/")) 636 | } 637 | 638 | renv_bootstrap_paths_renv("library", project = project) 639 | 640 | } 641 | 642 | renv_bootstrap_library_root_impl <- function(project) { 643 | 644 | root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) 645 | if (!is.na(root)) 646 | return(root) 647 | 648 | type <- renv_bootstrap_project_type(project) 649 | if (identical(type, "package")) { 650 | userdir <- renv_bootstrap_user_dir() 651 | return(file.path(userdir, "library")) 652 | } 653 | 654 | } 655 | 656 | renv_bootstrap_validate_version <- function(version) { 657 | 658 | loadedversion <- utils::packageDescription("renv", fields = "Version") 659 | if (version == loadedversion) 660 | return(TRUE) 661 | 662 | # assume four-component versions are from GitHub; three-component 663 | # versions are from CRAN 664 | components <- 
strsplit(loadedversion, "[.-]")[[1]] 665 | remote <- if (length(components) == 4L) 666 | paste("rstudio/renv", loadedversion, sep = "@") 667 | else 668 | paste("renv", loadedversion, sep = "@") 669 | 670 | fmt <- paste( 671 | "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", 672 | "Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", 673 | "Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", 674 | sep = "\n" 675 | ) 676 | 677 | msg <- sprintf(fmt, loadedversion, version, remote) 678 | warning(msg, call. = FALSE) 679 | 680 | FALSE 681 | 682 | } 683 | 684 | renv_bootstrap_hash_text <- function(text) { 685 | 686 | hashfile <- tempfile("renv-hash-") 687 | on.exit(unlink(hashfile), add = TRUE) 688 | 689 | writeLines(text, con = hashfile) 690 | tools::md5sum(hashfile) 691 | 692 | } 693 | 694 | renv_bootstrap_load <- function(project, libpath, version) { 695 | 696 | # try to load renv from the project library 697 | if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) 698 | return(FALSE) 699 | 700 | # warn if the version of renv loaded does not match 701 | renv_bootstrap_validate_version(version) 702 | 703 | # load the project 704 | renv::load(project) 705 | 706 | TRUE 707 | 708 | } 709 | 710 | renv_bootstrap_profile_load <- function(project) { 711 | 712 | # if RENV_PROFILE is already set, just use that 713 | profile <- Sys.getenv("RENV_PROFILE", unset = NA) 714 | if (!is.na(profile) && nzchar(profile)) 715 | return(profile) 716 | 717 | # check for a profile file (nothing to do if it doesn't exist) 718 | path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) 719 | if (!file.exists(path)) 720 | return(NULL) 721 | 722 | # read the profile, and set it if it exists 723 | contents <- readLines(path, warn = FALSE) 724 | if (length(contents) == 0L) 725 | return(NULL) 726 | 727 | # set RENV_PROFILE 728 | profile <- contents[[1L]] 729 | if (!profile %in% c("", "default")) 730 | Sys.setenv(RENV_PROFILE = profile) 731 | 732 | profile 733 | 734 | } 735 | 736 | renv_bootstrap_profile_prefix <- function() { 737 | profile <- renv_bootstrap_profile_get() 738 | if (!is.null(profile)) 739 | return(file.path("profiles", profile, "renv")) 740 | } 741 | 742 | renv_bootstrap_profile_get <- function() { 743 | profile <- Sys.getenv("RENV_PROFILE", unset = "") 744 | renv_bootstrap_profile_normalize(profile) 745 | } 746 | 747 | renv_bootstrap_profile_set <- function(profile) { 748 | profile <- renv_bootstrap_profile_normalize(profile) 749 | if (is.null(profile)) 750 | Sys.unsetenv("RENV_PROFILE") 751 | else 752 | Sys.setenv(RENV_PROFILE = profile) 753 | } 754 | 755 | renv_bootstrap_profile_normalize <- function(profile) { 756 | 757 | if (is.null(profile) || profile %in% c("", "default")) 758 | return(NULL) 759 | 760 | profile 761 | 762 | } 763 | 764 | renv_bootstrap_path_absolute <- function(path) { 765 | 766 | substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( 767 | substr(path, 1L, 1L) %in% c(letters, LETTERS) && 768 | substr(path, 2L, 3L) %in% c(":/", ":\\") 769 | ) 770 | 771 | } 772 | 773 | renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { 774 | renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") 775 | root <- if (renv_bootstrap_path_absolute(renv)) NULL else project 776 | prefix <- if (profile) renv_bootstrap_profile_prefix() 777 | components <- c(root, renv, prefix, ...) 
778 | paste(components, collapse = "/") 779 | } 780 | 781 | renv_bootstrap_project_type <- function(path) { 782 | 783 | descpath <- file.path(path, "DESCRIPTION") 784 | if (!file.exists(descpath)) 785 | return("unknown") 786 | 787 | desc <- tryCatch( 788 | read.dcf(descpath, all = TRUE), 789 | error = identity 790 | ) 791 | 792 | if (inherits(desc, "error")) 793 | return("unknown") 794 | 795 | type <- desc$Type 796 | if (!is.null(type)) 797 | return(tolower(type)) 798 | 799 | package <- desc$Package 800 | if (!is.null(package)) 801 | return("package") 802 | 803 | "unknown" 804 | 805 | } 806 | 807 | renv_bootstrap_user_dir <- function() { 808 | dir <- renv_bootstrap_user_dir_impl() 809 | path.expand(chartr("\\", "/", dir)) 810 | } 811 | 812 | renv_bootstrap_user_dir_impl <- function() { 813 | 814 | # use local override if set 815 | override <- getOption("renv.userdir.override") 816 | if (!is.null(override)) 817 | return(override) 818 | 819 | # use R_user_dir if available 820 | tools <- asNamespace("tools") 821 | if (is.function(tools$R_user_dir)) 822 | return(tools$R_user_dir("renv", "cache")) 823 | 824 | # try using our own backfill for older versions of R 825 | envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") 826 | for (envvar in envvars) { 827 | root <- Sys.getenv(envvar, unset = NA) 828 | if (!is.na(root)) 829 | return(file.path(root, "R/renv")) 830 | } 831 | 832 | # use platform-specific default fallbacks 833 | if (Sys.info()[["sysname"]] == "Windows") 834 | file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") 835 | else if (Sys.info()[["sysname"]] == "Darwin") 836 | "~/Library/Caches/org.R-project.R/R/renv" 837 | else 838 | "~/.cache/R/renv" 839 | 840 | } 841 | 842 | 843 | renv_json_read <- function(file = NULL, text = NULL) { 844 | 845 | # if jsonlite is loaded, use that instead 846 | if ("jsonlite" %in% loadedNamespaces()) 847 | renv_json_read_jsonlite(file, text) 848 | else 849 | renv_json_read_default(file, text) 850 | 851 | } 852 | 853 | renv_json_read_jsonlite <- function(file = NULL, text = NULL) { 854 | text <- paste(text %||% read(file), collapse = "\n") 855 | jsonlite::fromJSON(txt = text, simplifyVector = FALSE) 856 | } 857 | 858 | renv_json_read_default <- function(file = NULL, text = NULL) { 859 | 860 | # find strings in the JSON 861 | text <- paste(text %||% read(file), collapse = "\n") 862 | pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' 863 | locs <- gregexpr(pattern, text, perl = TRUE)[[1]] 864 | 865 | # if any are found, replace them with placeholders 866 | replaced <- text 867 | strings <- character() 868 | replacements <- character() 869 | 870 | if (!identical(c(locs), -1L)) { 871 | 872 | # get the string values 873 | starts <- locs 874 | ends <- locs + attr(locs, "match.length") - 1L 875 | strings <- substring(text, starts, ends) 876 | 877 | # only keep those requiring escaping 878 | strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) 879 | 880 | # compute replacements 881 | replacements <- sprintf('"\032%i\032"', seq_along(strings)) 882 | 883 | # replace the strings 884 | mapply(function(string, replacement) { 885 | replaced <<- sub(string, replacement, replaced, fixed = TRUE) 886 | }, strings, replacements) 887 | 888 | } 889 | 890 | # transform the JSON into something the R parser understands 891 | transformed <- replaced 892 | transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) 893 | transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) 894 | transformed <- gsub("[]}]", ")", transformed, perl = 
TRUE) 895 | transformed <- gsub(":", "=", transformed, fixed = TRUE) 896 | text <- paste(transformed, collapse = "\n") 897 | 898 | # parse it 899 | json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] 900 | 901 | # construct map between source strings, replaced strings 902 | map <- as.character(parse(text = strings)) 903 | names(map) <- as.character(parse(text = replacements)) 904 | 905 | # convert to list 906 | map <- as.list(map) 907 | 908 | # remap strings in object 909 | remapped <- renv_json_remap(json, map) 910 | 911 | # evaluate 912 | eval(remapped, envir = baseenv()) 913 | 914 | } 915 | 916 | renv_json_remap <- function(json, map) { 917 | 918 | # fix names 919 | if (!is.null(names(json))) { 920 | lhs <- match(names(json), names(map), nomatch = 0L) 921 | rhs <- match(names(map), names(json), nomatch = 0L) 922 | names(json)[rhs] <- map[lhs] 923 | } 924 | 925 | # fix values 926 | if (is.character(json)) 927 | return(map[[json]] %||% json) 928 | 929 | # handle true, false, null 930 | if (is.name(json)) { 931 | text <- as.character(json) 932 | if (text == "true") 933 | return(TRUE) 934 | else if (text == "false") 935 | return(FALSE) 936 | else if (text == "null") 937 | return(NULL) 938 | } 939 | 940 | # recurse 941 | if (is.recursive(json)) { 942 | for (i in seq_along(json)) { 943 | json[i] <- list(renv_json_remap(json[[i]], map)) 944 | } 945 | } 946 | 947 | json 948 | 949 | } 950 | 951 | # load the renv profile, if any 952 | renv_bootstrap_profile_load(project) 953 | 954 | # construct path to library root 955 | root <- renv_bootstrap_library_root(project) 956 | 957 | # construct library prefix for platform 958 | prefix <- renv_bootstrap_platform_prefix() 959 | 960 | # construct full libpath 961 | libpath <- file.path(root, prefix) 962 | 963 | # attempt to load 964 | if (renv_bootstrap_load(project, libpath, version)) 965 | return(TRUE) 966 | 967 | # load failed; inform user we're about to bootstrap 968 | prefix <- paste("# Bootstrapping renv", version) 969 | postfix <- paste(rep.int("-", 77L - nchar(prefix)), collapse = "") 970 | header <- paste(prefix, postfix) 971 | message(header) 972 | 973 | # perform bootstrap 974 | bootstrap(version, libpath) 975 | 976 | # exit early if we're just testing bootstrap 977 | if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) 978 | return(TRUE) 979 | 980 | # try again to load 981 | if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { 982 | message("* Successfully installed and loaded renv ", version, ".") 983 | return(renv::load()) 984 | } 985 | 986 | # failed to download or load renv; warn the user 987 | msg <- c( 988 | "Failed to find an renv installation: the project will not be loaded.", 989 | "Use `renv::activate()` to re-initialize the project." 990 | ) 991 | 992 | warning(paste(msg, collapse = "\n"), call. = FALSE) 993 | 994 | }) 995 | -------------------------------------------------------------------------------- /renv/install.R: -------------------------------------------------------------------------------- 1 | install.packages("renv") 2 | renv::restore() 3 | # somehow, reticulate is not included in the snapshot? install manually... 
4 | install.packages("reticulate") 5 | -------------------------------------------------------------------------------- /renv/settings.dcf: -------------------------------------------------------------------------------- 1 | bioconductor.version: 2 | external.libraries: 3 | ignored.packages: 4 | package.dependency.fields: Imports, Depends, LinkingTo 5 | r.version: 6 | snapshot.type: implicit 7 | use.cache: TRUE 8 | vcs.ignore.cellar: TRUE 9 | vcs.ignore.library: TRUE 10 | vcs.ignore.local: TRUE 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.0.0 2 | adjustText==0.8 3 | asttokens==2.4.1 4 | astunparse==1.6.3 5 | async-generator==1.10 6 | attrs==23.1.0 7 | bioinfokit==2.1.3 8 | blinker==1.7.0 9 | blis==0.7.11 10 | cachetools==5.3.2 11 | catalogue==2.0.10 12 | certifi==2023.11.17 13 | charset-normalizer==3.3.2 14 | click==8.1.7 15 | click-plugins==1.1.1 16 | cligj==0.7.2 17 | comm==0.2.0 18 | community==1.0.0b1 19 | confection==0.1.4 20 | conllu==4.5.3 21 | contourpy==1.2.0 22 | cssselect==1.2.0 23 | cycler==0.12.1 24 | cymem==2.0.8 25 | Cython==3.0.6 26 | debugpy==1.8.0 27 | decorator==5.1.1 28 | descartes==1.1.0 29 | dyNET38==2.1 30 | eli5==0.13.0 31 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl 32 | es-core-news-sm==3.1.0 33 | exceptiongroup==1.2.0 34 | executing==2.0.1 35 | fiona==1.9.5 36 | Flask==3.0.0 37 | flatbuffers==23.5.26 38 | fonttools==4.46.0 39 | fst-pso==1.8.1 40 | FuzzyTM==2.0.5 41 | gast==0.5.4 42 | gensim==4.3.2 43 | geopandas==0.14.1 44 | google-auth==2.24.0 45 | google-auth-oauthlib==1.1.0 46 | google-pasta==0.2.0 47 | graphviz==0.20.1 48 | grpcio==1.59.3 49 | h11==0.14.0 50 | h5py==3.10.0 51 | idna==3.6 52 | ipykernel==6.27.1 53 | ipython==8.18.1 54 | itsdangerous==2.1.2 55 | jedi==0.19.1 56 | Jinja2==3.1.2 57 | joblib==1.3.2 58 | jupyter_client==8.6.0 59 | jupyter_core==5.5.0 60 | keras==2.15.0 61 | Keras-Preprocessing==1.1.2 62 | kiwisolver==1.4.5 63 | langcodes==3.3.0 64 | libclang==16.0.6 65 | lxml==4.9.3 66 | Markdown==3.5.1 67 | MarkupSafe==2.1.3 68 | matplotlib==3.8.2 69 | matplotlib-inline==0.1.6 70 | matplotlib-venn==0.11.9 71 | miniful==0.0.6 72 | ml-dtypes==0.2.0 73 | munch==4.0.0 74 | murmurhash==1.0.10 75 | nagisa==0.2.9 76 | nest-asyncio==1.5.8 77 | networkx==3.2.1 78 | nltk==3.8.1 79 | numpy==1.26.2 80 | oauthlib==3.2.2 81 | opt-einsum==3.3.0 82 | outcome==1.3.0.post0 83 | packaging==23.2 84 | pandas==2.1.3 85 | parso==0.8.3 86 | pathy==0.10.3 87 | patsy==0.5.4 88 | pexpect==4.9.0 89 | Pillow==10.1.0 90 | platformdirs==4.0.0 91 | preshed==3.0.9 92 | prompt-toolkit==3.0.41 93 | protobuf==4.23.4 94 | psutil==5.9.6 95 | ptyprocess==0.7.0 96 | pure-eval==0.2.2 97 | pyasn1==0.5.1 98 | pyasn1-modules==0.3.0 99 | pydantic==1.8.2 100 | pyFUME==0.2.25 101 | Pygments==2.17.2 102 | pyparsing==3.1.1 103 | pyproj==3.6.1 104 | PySocks==1.7.1 105 | python-dateutil==2.8.2 106 | python-louvain==0.16 107 | pytz==2023.3.post1 108 | pyzmq==25.1.1 109 | regex==2023.10.3 110 | requests==2.31.0 111 | requests-oauthlib==1.3.1 112 | rsa==4.9 113 | scikit-learn==1.3.2 114 | scipy==1.11.4 115 | seaborn==0.13.0 116 | selenium==4.15.2 117 | shapely==2.0.2 118 | shifterator==0.3.0 119 | simpful==2.11.1 120 | six==1.16.0 121 | smart-open==6.4.0 122 | sniffio==1.3.0 123 | sortedcontainers==2.4.0 124 | spacy==3.1.7 125 | spacy-legacy==3.0.12 126 | 
spacy-loggers==1.0.5 127 | srsly==2.4.8 128 | stack-data==0.6.3 129 | statsmodels==0.14.0 130 | tabulate==0.9.0 131 | tensorboard==2.15.1 132 | tensorboard-data-server==0.7.2 133 | tensorboard-plugin-wit==1.8.1 134 | tensorflow==2.15.0 135 | tensorflow-estimator==2.15.0 136 | tensorflow-io-gcs-filesystem==0.34.0 137 | termcolor==2.4.0 138 | textwrap3==0.9.2 139 | thinc==8.0.17 140 | threadpoolctl==3.2.0 141 | tornado==6.4 142 | tqdm==4.66.1 143 | traitlets==5.14.0 144 | trio==0.23.1 145 | trio-websocket==0.11.1 146 | typer==0.4.2 147 | typing_extensions==4.8.0 148 | tzdata==2023.3 149 | ufal.udpipe==1.3.1.1 150 | urllib3==2.1.0 151 | wasabi==0.10.1 152 | wcwidth==0.2.12 153 | Werkzeug==3.0.1 154 | wordcloud==1.9.2 155 | wrapt==1.14.1 156 | wsproto==1.2.0 157 | xlrd==2.0.1 158 | --------------------------------------------------------------------------------