├── .github
├── FUNDING.yml
└── workflows
│ ├── CompatHelper.yml
│ ├── Format.yml
│ └── CI.yml
├── contents
├── references.md
├── project_management.md
├── appendix.md
├── data_vis_makie_cairo.md
├── data_vis_makie_latex.md
├── data_vis_makie.md
├── dataframes_groupby_combine.md
├── index.md
├── notation.md
├── dataframes_transform.md
├── preface.md
├── dataframes_join.md
├── data_vis_makie_glmakie.md
├── data_vis_makie_themes.md
├── dataframes_select.md
├── dataframes_load_save.md
├── data_vis_makie_layouts.md
├── dataframes.md
├── data_vis_makie_colors.md
├── data_vis_makie_attributes.md
├── dataframes_indexing.md
├── dataframes_performance.md
├── stats_distributions.md
├── stats_vis.md
└── why_julia.md
├── .gitignore
├── pandoc
├── favicon.png
├── favicon_black.png
├── template.tex
└── references.bib
├── images
├── benchmarks.png
├── logoMakie.png
├── animScatters.mp4
├── galtons-board.png
├── language_comparisons.png
├── makiePlottingFunctions.png
├── GLMakiePlottingFunctions.png
├── GLMakiePlottingFunctionsHide.png
└── makiePlottingFunctionsHide.png
├── .devcontainer
├── build.sh
├── devcontainer.json
├── README.md
├── Dockerfile
└── run.sh
├── .JuliaFormatter.toml
├── format
├── Project.toml
└── src
│ ├── juliaformatter.jl
│ ├── FormatJDS.jl
│ └── whitespace.jl
├── src
├── environment.jl
├── bezier.jl
├── showcode_additions.jl
├── ci.jl
├── JDS.jl
├── cover.jl
├── df.jl
├── front-cover.jl
└── stats.jl
├── .gitattributes
├── CITATION.cff
├── config.toml
├── metadata.yml
├── README.md
├── Project.toml
└── LICENSE
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: JuliaDataScience
2 |
--------------------------------------------------------------------------------
/contents/references.md:
--------------------------------------------------------------------------------
1 | # 参考文献 {#sec:references}
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _build/
2 | _gen/
3 | *.log
4 | Manifest.toml
5 | .vscode/
6 | .DS_Store
7 |
--------------------------------------------------------------------------------
/pandoc/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/pandoc/favicon.png
--------------------------------------------------------------------------------
/images/benchmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/benchmarks.png
--------------------------------------------------------------------------------
/images/logoMakie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/logoMakie.png
--------------------------------------------------------------------------------
/.devcontainer/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Build the `jds` Docker image from the directory that contains this script.
4 |
5 | set -e
6 |
7 | # Resolve the build context from the script location instead of the caller's
8 | # working directory, so the build also works when started outside the repository root.
9 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
10 |
11 | docker build -t jds "$SCRIPT_DIR"
12 |
--------------------------------------------------------------------------------
/images/animScatters.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/animScatters.mp4
--------------------------------------------------------------------------------
/images/galtons-board.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/galtons-board.png
--------------------------------------------------------------------------------
/pandoc/favicon_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/pandoc/favicon_black.png
--------------------------------------------------------------------------------
/images/language_comparisons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/language_comparisons.png
--------------------------------------------------------------------------------
/images/makiePlottingFunctions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/makiePlottingFunctions.png
--------------------------------------------------------------------------------
/images/GLMakiePlottingFunctions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/GLMakiePlottingFunctions.png
--------------------------------------------------------------------------------
/images/GLMakiePlottingFunctionsHide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/GLMakiePlottingFunctionsHide.png
--------------------------------------------------------------------------------
/images/makiePlottingFunctionsHide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JuliaCN/JuliaDataScience/HEAD/images/makiePlottingFunctionsHide.png
--------------------------------------------------------------------------------
/.JuliaFormatter.toml:
--------------------------------------------------------------------------------
1 | margin=500
2 | verbose=true
3 | join_lines_based_on_source=true
4 | whitespace_in_kwargs=false
5 | # This is too aggressive.
6 | format_markdown=false
7 |
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 | "extensions": [
3 | "julialang.language-julia",
4 | "ms-vscode.cpptools"
5 | ],
6 |
7 | "dockerFile": "Dockerfile"
8 | }
9 |
--------------------------------------------------------------------------------
/contents/project_management.md:
--------------------------------------------------------------------------------
1 | ## Project Management in Julia {#sec:project_management}
2 |
3 | ```{=comment}
4 | This is a nice overview that we could draw inspiration from: https://opensourc.es/blog/all-about-pkg/
5 | ```
6 |
--------------------------------------------------------------------------------
/format/Project.toml:
--------------------------------------------------------------------------------
1 | name = "FormatJDS"
2 | uuid = "adfa99fd-7844-4562-9f5d-8091726a002b"
3 | authors = ["Jose Storopoli", "Rik Huijzer", "Lazaro Alonso"]
4 | version = "0.1.0"
5 |
6 | [deps]
7 | JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
8 |
9 | [compat]
10 | JuliaFormatter = "0.17"
11 |
--------------------------------------------------------------------------------
/src/environment.jl:
--------------------------------------------------------------------------------
1 | """
2 |     pkg_deps()
3 |
4 | Render the direct dependencies reported by `Pkg.dependencies()` as a code block.
5 |
6 | Entries without a version are skipped. Each remaining dependency becomes one
7 | `name version` line; the lines are sorted, joined with newlines and passed to `code_block`.
8 | """
9 | function pkg_deps()
10 |     entries = String[]
11 |     for info in values(Pkg.dependencies())
12 |         # Keep only direct dependencies that carry a version.
13 |         if info.is_direct_dep && !isnothing(info.version)
14 |             push!(entries, "$(info.name) $(info.version)")
15 |         end
16 |     end
17 |     sort!(entries)
18 |     return code_block(join(entries, '\n'))
19 | end
20 |
--------------------------------------------------------------------------------
/contents/appendix.md:
--------------------------------------------------------------------------------
1 | # 附录 {#sec:appendix}
2 |
3 | ## 库的版本 {#sec:appendix_pkg}
4 |
5 | 本书由 Julia `jl string(VERSION)` 及以下库构建:
6 |
7 | ```jl
8 | JDS.pkg_deps()
9 | ```
10 |
11 | ```jl
12 | let
13 |     # Take one UTC timestamp so that date, hour and minute agree with each other
14 |     # and with the "UTC" label; `HH:MM` zero-pads both fields.
15 |     timestamp = Dates.format(now(Dates.UTC), "yyyy-mm-dd HH:MM")
16 |
17 |     "Build: $timestamp UTC"
18 | end
19 | ```
20 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # LaTeX Stuff
2 | pandoc/references.bib linguist-generated
3 | *.tex linguist-generated
4 |
5 | # Site
6 | _build/* text
7 |
8 | # Serialisation
9 | *.json text
10 | *.toml text
11 | *.xml text
12 | *.yaml text
13 | *.yml text
14 | *.html text
15 |
16 | # Data files
17 | *.csv text
18 |
19 | # Devcontainer files
20 | .devcontainer/* linguist-generated
21 |
--------------------------------------------------------------------------------
/.devcontainer/README.md:
--------------------------------------------------------------------------------
1 | # .devcontainer
2 |
3 | To build the JDS project with Docker (without VS Code), go to the root of the repository and run:
4 |
5 | ```
6 | .devcontainer/build.sh
7 | ```
8 |
9 | When the build is finished, run the project with:
10 |
11 | ```
12 | .devcontainer/run.sh
13 | ```
14 |
15 | And, if you want to use `serve`, use:
16 |
17 | ```
18 | .devcontainer/run.sh SERVE
19 | ```
20 |
21 | and
22 |
23 | ```
24 | serve(; host="0.0.0.0")
25 | ```
26 |
27 | to accept connections coming from outside of the container.
28 |
--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM julia:1.7
2 |
3 | # Extra dependencies for GLMakie.
4 | RUN set -eux; \
5 | apt-get update; \
6 | apt-get install -y --no-install-recommends \
7 | freeglut3-dev \
8 | imagemagick \
9 | libgl1 \
10 | libxcursor-dev \
11 | libxext-dev \
12 | libxi-dev \
13 | libxinerama-dev \
14 | libxrandr-dev \
15 | mesa-utils \
16 | xauth \
17 | xorg-dev \
18 | xvfb \
19 | ; \
20 | rm -rf /var/lib/apt/lists/*
21 |
22 | ENV DISPLAY=:0
23 | ENV JULIA_NUM_THREADS=2
24 |
25 | # Run every container command under a virtual X server via xvfb-run.
26 | # "$@" is quoted so that arguments containing spaces reach the command unsplit;
27 | # the trailing "" fills $0, so the CMD / `docker run` arguments end up in "$@".
28 | ENTRYPOINT ["/bin/sh", "-c", "/usr/bin/xvfb-run -s '-screen 0 1024x768x24' \"$@\"", ""]
29 | CMD ["julia"]
30 |
--------------------------------------------------------------------------------
/.devcontainer/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Start the `jds` container with the current directory mounted at /app.
4 | # Pass SERVE as the first argument to additionally publish port 8006.
5 |
6 | set -e
7 |
8 | # Host directory that is mounted as the Julia depot (/root/.julia) in the container.
9 | DEPOT_PATH="$HOME/.julia-docker"
10 | STARTUP_FILE="$DEPOT_PATH/config/startup.jl"
11 | mkdir -p "$DEPOT_PATH/config"
12 |
13 | # Load Revise on startup only when it is already installed in the depot.
14 | if [ -d "$DEPOT_PATH/packages/Revise" ]; then
15 |   echo 'println("using Revise"); using Revise' > "$STARTUP_FILE"
16 | else
17 |   echo 'println("Revise is not available.")' > "$STARTUP_FILE"
18 | fi
19 |
20 | # Options shared by both modes.
21 | DOCKER_ARGS=(
22 |   -it --rm
23 |   --env GKSwstype=nul
24 |   -v "$DEPOT_PATH":"/root/.julia"
25 |   -v "$PWD":/app -w /app
26 | )
27 |
28 | if [ "$1" == "SERVE" ]; then
29 |   DOCKER_ARGS+=(-p 8006:8006)
30 | fi
31 |
32 | docker run "${DOCKER_ARGS[@]}" jds
33 |
--------------------------------------------------------------------------------
/format/src/juliaformatter.jl:
--------------------------------------------------------------------------------
1 | """
2 |     format()
3 |
4 | Run `format_file` on every source file of the book except `.bib` files, using the
5 | options parsed from `.JuliaFormatter.toml` in `JDS_DIR`.
6 | Trailing whitespace is always removed afterwards.
7 | Return the per-file results of `format_file`.
8 | """
9 | function format()::BitVector
10 |     targets = filter(f -> !endswith(f, ".bib"), source_files())
11 |     options = parse_config(joinpath(JDS_DIR, ".JuliaFormatter.toml"))
12 |     outcomes = format_file.(targets; options...)
13 |     # JuliaFormatter can introduce trailing whitespace.
14 |     remove_trailing_whitespace()
15 |     return outcomes
16 | end
17 |
18 | """
19 |     is_formatted!()
20 |
21 | Return whether all files in the repository are formatted.
22 | This calls `format()`, so the files are rewritten as a side effect.
23 | Intended for use in CI.
24 | """
25 | is_formatted!() = all(format())
26 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # YAML 1.2
2 | ---
3 | authors:
4 | -
5 | family-names: Storopoli
6 | given-names: Jose
7 | orcid: "https://orcid.org/0000-0002-0559-5176"
8 | -
9 | family-names: Huijzer
10 | given-names: Rik
11 | orcid: "https://orcid.org/0000-0001-9445-8466"
12 | -
13 | family-names: Alonso
14 | given-names: Lazaro
15 | orcid: "https://orcid.org/0000-0001-6979-859X"
16 | cff-version: "1.2.0"
17 | date-released: 2021-10-01
18 | identifiers:
19 | -
20 | type: url
21 | value: "https://juliadatascience.io/"
22 | license: "CC-BY-NC-SA-4.0"
23 | message: "If you use this software, please cite it using these metadata."
24 | repository-code: "https://github.com/JuliaDataScience/JuliaDataScience"
25 | title: "Julia Data Science"
26 | version: 1.0.0
27 | ...
28 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_cairo.md:
--------------------------------------------------------------------------------
1 | ## CairoMakie.jl {#sec:cairomakie}
2 |
3 | 我们开始绘制的第一张图是标注了散点的直线:
4 |
5 | ```
6 | using CairoMakie
7 | CairoMakie.activate!()
8 | ```
9 |
10 | ```jl
11 | s = """
12 | CairoMakie.activate!() # hide
13 | fig = scatterlines(1:10, 1:10)
14 | label = "firstplot" # hide
15 | caption = "First plot." # hide
16 | link_attributes = "width=60%" # hide
17 | Options(fig; filename=label, caption, label, link_attributes) # hide
18 | """
19 | sco(s)
20 | ```
21 |
22 | 注意前面的图采用默认输出样式,因此需要使用轴名称和轴标签进一步调整。
23 |
24 | 同时注意每一个像 `scatterlines` 这样的绘图函数都创建了一个 `FigureAxisPlot` 列表,其中包含 `Figure`, `Axis` 和 `plot` 对象。
25 | 这些函数也被称为 `non-mutating` 方法。
26 | 另一方面,`mutating` 方法(例如 `scatterlines!`,注意多了 `!`)仅返回一个 plot 对象,它可以被添加到给定的 `axis` 或 `current_figure()` 中。
27 |
28 | 下一个问题是如何改变颜色或标记的类型?
29 | 这可以通过 `attributes` 实现,将在下一节讨论。
30 |
--------------------------------------------------------------------------------
/format/src/FormatJDS.jl:
--------------------------------------------------------------------------------
1 | module FormatJDS
2 |
3 | using JuliaFormatter: format_file, parse_config
4 |
5 | # Two directory levels above the directory of this file.
6 | const JDS_DIR = dirname(dirname(@__DIR__))
7 |
8 | # File extensions that count as source files of the book.
9 | const SOURCE_FILE_EXTENSIONS = [".jl", ".md", ".bib"]
10 |
11 | export source_files
12 | export project_has_trailing_whitespace, remove_trailing_whitespace
13 | export format, is_formatted!
14 |
15 | """
16 |     is_source_file_extension(file::String)
17 |
18 | Return whether the extension of `file` is listed in `SOURCE_FILE_EXTENSIONS`.
19 | """
20 | is_source_file_extension(file::String) = last(splitext(file)) in SOURCE_FILE_EXTENSIONS
21 |
22 | """
23 |     source_files()
24 |
25 | Return the full paths of the source files located directly inside the
26 | `contents`, `pandoc` and `src` directories of `JDS_DIR` (subdirectories are not searched).
27 | """
28 | function source_files()
29 |     found = String[]
30 |     for subdir in ("contents", "pandoc", "src")
31 |         append!(found, readdir(joinpath(JDS_DIR, subdir); join=true))
32 |     end
33 |     return filter(is_source_file_extension, found)
34 | end
35 |
36 | include("whitespace.jl")
37 | include("juliaformatter.jl")
38 |
39 | end # module
40 |
--------------------------------------------------------------------------------
/.github/workflows/CompatHelper.yml:
--------------------------------------------------------------------------------
1 | name: CompatHelper
2 |
3 | on:
4 | schedule:
5 | - cron: '00 04 * * 6'
6 | workflow_dispatch:
7 |
8 | jobs:
9 | CompatHelper:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: "Add the General registry via Git"
13 | run: |
14 | import Pkg
15 | ENV["JULIA_PKG_SERVER"] = ""
16 | Pkg.Registry.add("General")
17 | shell: julia --color=yes {0}
18 | - name: "Install CompatHelper"
19 | run: |
20 | import Pkg
21 | name = "CompatHelper"
22 | uuid = "aa819f21-2bde-4658-8897-bab36330d9b7"
23 | version = "3"
24 | Pkg.add(; name, uuid, version)
25 | shell: julia --color=yes {0}
26 | - name: "Run CompatHelper"
27 | run: |
28 | using CompatHelper
29 | CompatHelper.main(; entry_type=DropEntry())
30 | shell: julia --color=yes {0}
31 | env:
32 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
33 | COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }}
34 |
--------------------------------------------------------------------------------
/src/bezier.jl:
--------------------------------------------------------------------------------
1 | # 72 evenly spaced parameter values covering [0, 2π].
2 | u = LinRange(0, 2π, 72)
3 | # NOTE(review): presumably axis lengths passed to `ellipse` by code outside this file — confirm.
4 | a, b = 5.0, 2.0
5 |
6 | """
7 |     ellipse(u; a=2, b=3, h=0, k=0)
8 |
9 | Return the `Point2f` at parameter `u` on the ellipse centered at `(h, k)`
10 | with semi-axes `a / 2` (horizontal) and `b / 2` (vertical).
11 | """
12 | function ellipse(u; a=2, b=3, h=0, k=0)
13 |     x = h + a / 2 * cos(u)
14 |     y = k + b / 2 * sin(u)
15 |     return Point2f(x, y)
16 | end
17 |
18 | # Adapted from https://github.com/dronir/Bezier.jl,
19 | # which is no longer in the registry.
20 | """
21 |     BezierCurve
22 |
23 | Planar cubic curve stored as two sets of four polynomial coefficients,
24 | ordered from the `t^3` term down to the constant term.
25 | """
26 | struct BezierCurve
27 |     Xcoef::Vector{Float64}
28 |     Ycoef::Vector{Float64}
29 | end
30 |
31 | """
32 |     BezierCurve(r0::Vector, r1::Vector, t0::Vector, t1::Vector)
33 |
34 | Build the curve that starts at `r0` with derivative `t0` (at `t = 0`) and ends at `r1`
35 | with derivative `t1` (at `t = 1`). Only the first two entries of each vector are used.
36 | """
37 | function BezierCurve(r0::Vector, r1::Vector, t0::Vector, t1::Vector)
38 |     xs = bezier_coefs(r0[1], r1[1], t0[1], t1[1])
39 |     ys = bezier_coefs(r0[2], r1[2], t0[2], t1[2])
40 |     return BezierCurve(xs, ys)
41 | end
42 |
43 | """
44 |     (curve::BezierCurve)(t::Real)
45 |
46 | Evaluate `curve` at parameter `t` and return the point as a 2-element `Vector{Float64}`.
47 | """
48 | function (curve::BezierCurve)(t::Real)
49 |     point = zeros(2)
50 |     # Accumulate the terms from t^3 down to t^0; coefficient `4 - power` belongs to `t^power`.
51 |     for power in 3:-1:0
52 |         idx = 4 - power
53 |         point[1] += curve.Xcoef[idx] * t^power
54 |         point[2] += curve.Ycoef[idx] * t^power
55 |     end
56 |     return point
57 | end
58 |
59 |
60 | """
61 |     bezier_coefs(x0::Real, x1::Real, t0::Real, t1::Real)
62 |
63 | Give the coefficients `[a, b, c, d]` of one curve component `a*t^3 + b*t^2 + c*t + d`
64 | that takes the value `x0` with slope `t0` at `t = 0` and the value `x1` with slope `t1` at `t = 1`.
65 | """
66 | function bezier_coefs(x0::Real, x1::Real, t0::Real, t1::Real)
67 |     cubic = 2(x0 - x1) + t1 + t0
68 |     quadratic = 3(x1 - x0) - t1 - 2t0
69 |     return [cubic, quadratic, t0, x0]
70 | end
71 |
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | [projects]
2 |
3 | [projects.default]
4 | contents = [
5 | "preface",
6 | "why_julia",
7 | "julia_basics",
8 | "dataframes",
9 | "dataframes_load_save",
10 | "dataframes_indexing",
11 | "dataframes_select",
12 | "dataframes_join",
13 | "dataframes_transform",
14 | "dataframes_groupby_combine",
15 | "dataframes_performance",
16 | "data_vis_makie",
17 | "data_vis_makie_cairo",
18 | "data_vis_makie_attributes",
19 | "data_vis_makie_themes",
20 | "data_vis_makie_latex",
21 | "data_vis_makie_colors",
22 | "data_vis_makie_layouts",
23 | "data_vis_makie_glmakie",
24 | #"stats",
25 | #"stats_distributions",
26 | #"stats_vis",
27 | "appendix",
28 | "notation",
29 | #"project_management",
30 | "references"
31 | ]
32 | port = 8006
33 |
34 | # Extra directories to be copied.
35 | extra_directories = [
36 | "images"
37 | ]
38 |
39 | online_url = "https://cn.julialang.org"
40 | online_url_prefix = "/JuliaDataScience"
41 |
42 | output_filename = "juliadatascience"
43 |
--------------------------------------------------------------------------------
/.github/workflows/Format.yml:
--------------------------------------------------------------------------------
1 | name: Format
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | workflow_dispatch:
9 |
10 | jobs:
11 | docs:
12 | name: CheckFormatting
13 | runs-on: ubuntu-20.04
14 | steps:
15 | - uses: actions/checkout@v2
16 |
17 | - shell: julia --project=format --color=yes {0}
18 | run: |
19 | using Pkg
20 | Pkg.instantiate()
21 |
22 | - shell: julia --project=format --color=yes {0}
23 | run: |
24 | using FormatJDS:
25 | project_has_trailing_whitespace,
26 | remove_trailing_whitespace
27 |
28 | exitcode = project_has_trailing_whitespace() ? 1 : 0
29 | println(exitcode == 1 ? "Trailing whitespace found" : "No trailing whitespace found")
30 | if exitcode == 1
31 | # Show where the trailing whitespace occurs.
32 | remove_trailing_whitespace()
33 | run(`git config --global color.diff true`)
34 | run(`git diff`)
35 | end
36 | exit(exitcode)
37 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_latex.md:
--------------------------------------------------------------------------------
1 | ## 使用 LaTeXStrings.jl {#sec:using_latex}
2 |
3 | 通过调用 `LaTeXStrings.jl`,`Makie.jl` 实现了对 LaTeX 的支持:
4 |
5 | ```
6 | using LaTeXStrings
7 | ```
8 |
9 | 一个简单的基础用法例子如下所示 (@fig:latex_strings),其主要包含用于 x-y 标签和图例的 LaTeX 字符串。
10 |
11 | ```jl
12 | @sc LaTeX_Strings()
13 | ```
14 |
15 | ```jl
16 | s = """
17 | CairoMakie.activate!() # hide
18 | with_theme(LaTeX_Strings, publication_theme())
19 | label = "latex_strings" # hide
20 | caption = "Plot with LaTeX strings." # hide
21 | link_attributes = "width=60%" # hide
22 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
23 | """
24 | sco(s)
25 | ```
26 |
27 | 下面是更复杂的例子,图中的`text`是一些等式,并且图例编号随着曲线数增加:
28 |
29 | ```jl
30 | @sco JDS.multiple_lines()
31 | ```
32 |
33 | 但不太好的是,一些曲线的颜色是重复的。
34 | 添加标记和线条类型通常能解决此问题。
35 | 所以让我们使用 [`Cycles`](http://makie.juliaplots.org/stable/documentation/theming/index.html#cycles) 来添加标记和线条类型。
36 | 设置 `covary=true`,使所有元素一起循环:
37 |
38 | ```jl
39 | @sco JDS.multiple_scatters_and_lines()
40 | ```
41 |
42 | 一张出版质量的图如上所示。
43 | 那我们还能做些什么操作?
44 | 答案是还可以为图定义不同的默认颜色或者调色盘。
45 | 在下一节,我们将再次了解如何使用 [`Cycles`](http://makie.juliaplots.org/stable/documentation/theming/index.html#cycles) 以及有关它的更多信息,即通过添加额外的关键字参数就可以实现前面的操作。
46 |
--------------------------------------------------------------------------------
/metadata.yml:
--------------------------------------------------------------------------------
1 | ---
2 | title: Julia Data Science
3 | subtitle: ""
4 | author:
5 | - Jose Storopoli
6 | - Rik Huijzer
7 | - Lazaro Alonso
8 | - 刘贵欣 (中文翻译)
9 | - 田俊 (中文审校)
10 | html-license: 'CC BY-NC-SA 4.0'
11 | pdf-footer: ""
12 | tex-license: Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
13 | lang: en-US
14 | tags: [JuliaLang, Data Science, Data Visualization, Data Manipulation, Data Analysis]
15 | repo: https://github.com/JuliaDataScience/JuliaDataScience
16 | secPrefix: Section
17 | figPrefix: 图
18 | tblPrefix: 表
19 | titlepage: true
20 | bibliography: pandoc/references.bib
21 | allow-subsubsections: true
22 |
23 | # This font appears to not support ZH characters.
24 | mainfont: DejaVu Sans
25 | CJKmainfont: Noto Serif CJK SC
26 |
27 | # Only used in PDF.
28 | titlepage-top: >
29 | \begin{tabular}{l}
30 | Jose Storopoli\\
31 | Universidade Nove de Julho - UNINOVE\\
32 | Brazil\\
33 | \\
34 | Rik Huijzer\\
35 | University of Groningen\\
36 | the Netherlands\\
37 | \\
38 | Lazaro Alonso\\
39 | Max Planck Institute for Biogeochemistry\\
40 | Germany
41 | \end{tabular}
42 |
43 | titlepage-bottom: |
44 | First edition published 2021
45 |
46 | \url{https://juliadatascience.io}
47 |
48 | ISBN: 9798489859165
49 | ---
50 |
--------------------------------------------------------------------------------
/format/src/whitespace.jl:
--------------------------------------------------------------------------------
1 | """
2 |     line_has_trailing_whitespace(line::AbstractString)
3 |
4 | Return whether `line` has trailing whitespace.
5 | Only the space character `' '` is considered.
6 | Assumes that `line` doesn't contain newlines.
7 | """
8 | line_has_trailing_whitespace(line::AbstractString) = endswith(line, ' ')
9 |
10 | """
11 |     file_has_trailing_whitespace(path) -> Bool
12 |
13 | Check for trailing whitespace in a naive but reasonably effective way for the file at `path`.
14 | Return whether the file contains trailing whitespace.
15 |
16 | Not using existing tools for this simple procedure, because it's easier to fine tune this
17 | code to our use-case.
18 | """
19 | function file_has_trailing_whitespace(path)::Bool
20 |     lines = split(read(path, String), '\n')
21 |     # Passing the predicate to `any` stops at the first offending line.
22 |     return any(line_has_trailing_whitespace, lines)
23 | end
24 |
25 | """
26 |     project_has_trailing_whitespace() -> Bool
27 |
28 | Return whether any of the files listed by `source_files()` contains trailing whitespace.
29 | """
30 | function project_has_trailing_whitespace()::Bool
31 |     return any(file_has_trailing_whitespace, source_files())
32 | end
33 |
34 | """
35 |     remove_trailing_whitespace(path)
36 |
37 | Strip trailing spaces from every line of the file at `path`.
38 | The file is only written when its content actually changes.
39 | Return `nothing`.
40 | """
41 | function remove_trailing_whitespace(path)
42 |     sep = '\n'
43 |     text = read(path, String)
44 |     updated_text = join(rstrip.(split(text, sep), ' '), sep)
45 |     # Skip the write for clean files so they are not needlessly touched.
46 |     if updated_text != text
47 |         write(path, updated_text)
48 |     end
49 |     return nothing
50 | end
51 |
52 | """
53 |     remove_trailing_whitespace()
54 |
55 | Strip trailing spaces from all files listed by `source_files()`.
56 | Return `nothing`.
57 | """
58 | function remove_trailing_whitespace()
59 |     foreach(remove_trailing_whitespace, source_files())
60 |     return nothing
61 | end
62 |
--------------------------------------------------------------------------------
/src/showcode_additions.jl:
--------------------------------------------------------------------------------
1 | """
2 |     get_error(expr::String)
3 |
4 | Run `sco(expr)`. If it throws, return the rendered exception text, passed through
5 | `clean_stacktrace` and with its last 8 lines removed. If nothing is thrown, the
6 | result of `sco(expr)` is returned as is.
7 | """
8 | function get_error(expr::String)
9 |     try
10 |         sco(expr)
11 |     catch exc
12 |         # Render the exception being handled together with its backtrace. This replaces
13 |         # `last(Base.catch_stack())`, which yields the same pair but is deprecated.
14 |         stacktrace = sprint(Base.showerror, exc, catch_backtrace())::String
15 |         stacktrace = clean_stacktrace(stacktrace)
16 |         lines = split(stacktrace, '\n')
17 |         # NOTE(review): the 8 dropped lines are presumably frames of the book tooling — confirm.
18 |         lines = lines[1:end-8]
19 |         join(lines, '\n')
20 |     end
21 | end
22 |
23 | """
24 |     trim_last_n_lines(s::String, n::Int)
25 |     trim_last_n_lines(n::Int)
26 |
27 | Remove the last `n` lines of `s` and append a line containing `" ..."` in their place.
28 | The one-argument method returns a function that applies this to a string.
29 | """
30 | function trim_last_n_lines(s::String, n::Int)
31 |     kept = split(s, '\n')[1:end-n]
32 |     return join([kept; " ..."], '\n')
33 | end
34 | trim_last_n_lines(n::Int) = s -> trim_last_n_lines(s, n)
35 |
36 | """
37 |     sce(expr::String; post::Function=identity)
38 |
39 | Show code and error.
40 | The error text obtained from `get_error` is passed through `post` before being
41 | wrapped with `output_block`.
42 | """
43 | function sce(expr::String; post::Function=identity)
44 |     code = code_block(expr)
45 |     err = JDS.get_error(expr)
46 |     err = post(err)
47 |     out = output_block(err)
48 |     """
49 |     $code
50 |     $out
51 |     """
52 | end
53 |
54 | """
55 |     scsob(expr::String)
56 |
57 | Source code string output block.
58 | Abbreviation for `sco(s; process=string, post=output_block)`.
59 | """
60 | function scsob(expr::String)
61 |     sco(expr; process=string, post=output_block)
62 | end
63 |
64 | """
65 |     plainblock(expr)
66 |
67 | Wrap `expr` in a fenced code block tagged `language-plain`.
68 | """
69 | function plainblock(expr)
70 |     return """
71 |     ```language-plain
72 |     $expr
73 |     ```
74 |     """
75 | end
76 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Julia Data Science
6 |
7 |
8 | Open source and open access book for data science in Julia.
9 |
10 |
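11 | [![CI Build](https://github.com/JuliaDataScience/JuliaDataScience/workflows/CI/badge.svg)](https://github.com/JuliaDataScience/JuliaDataScience/actions?query=workflow%3ACI+branch%3Amain)
12 | [![CC BY-NC-SA 4.0][cc-by-nc-sa-shield]][cc-by-nc-sa]
13 | [![Code Style: Blue](https://img.shields.io/badge/code%20style-blue-4495d1.svg)](https://github.com/invenia/BlueStyle)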
14 |
15 | 本书的中文译本提供了 [在线阅读版](https://cn.julialang.org/JuliaDataScience/) 和 [PDF离线阅读版](https://cn.julialang.org/JuliaDataScience/juliadatascience.pdf)。
16 |
17 | 此外,本书的英文版也可以在 [Amazon.com](https://www.amazon.com/dp/B09KMRKQ96/) 上购买阅读。
18 |
19 | ## LICENSE
20 |
21 | This book is licensed under [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International][cc-by-nc-sa].
22 |
23 | [![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]
24 |
25 | [cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/
26 | [cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png
27 | [cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg
28 |
--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | workflow_dispatch:
9 |
10 | jobs:
11 | BuildAndDeploy:
12 | runs-on: ubuntu-20.04
13 | steps:
14 | - uses: actions/checkout@v2
15 | with:
16 | persist-credentials: false
17 |
18 | - uses: julia-actions/setup-julia@v1
19 | with:
20 | version: "1.7"
21 |
22 | - uses: julia-actions/cache@v1
23 |
24 | - name: Install GLMakie dependencies
25 | run: sudo apt-get update && sudo apt-get install -y xorg-dev imagemagick mesa-utils xvfb libgl1 freeglut3-dev libxrandr-dev libxinerama-dev libxcursor-dev libxi-dev libxext-dev
26 |
27 | - name: Install dependencies
28 | run: julia --color=yes --project -e 'using Pkg; Pkg.instantiate();
29 | using Books; Books.install_dependencies()'
30 |
31 | - run: >
32 | DISPLAY=:0 xvfb-run -s '-screen 0 1024x768x24' julia --project -e 'using JDS; JDS.build()'
33 | env:
34 | # Fix for Plots with GR backend.
35 | GKSwstype: nul
36 |
37 | - name: Deploy to secondary branch
38 | if: ${{ github.event_name != 'pull_request' }}
39 | uses: peaceiris/actions-gh-pages@v3
40 | with:
41 | github_token: ${{ secrets.GITHUB_TOKEN }}
42 | force_orphan: true
43 | publish_dir: ./_build/
44 |
--------------------------------------------------------------------------------
/contents/data_vis_makie.md:
--------------------------------------------------------------------------------
1 | # 使用 Makie.jl 做数据可视化 {#sec:DataVisualizationMakie}
2 |
3 | > Maki-e 来源于日语,它指的是一种在漆面上撒金粉和银粉的技术。
4 | > 数据就是我们这个时代的金和银,让我们在屏幕上制作美丽的数据图吧!
5 | >
6 | > _Simon Danisch, `Makie.jl` 创始人_
7 |
8 | [Makie.jl](http://makie.juliaplots.org/stable/index.html) 是高性能,可扩展且跨平台的 Julia 语言绘图系统。
9 | 我们认为,它是最漂亮和最通用的绘图包。
10 |
11 | 与其他绘图包一样,该库的代码分为多个包。
12 | `Makie.jl` 是绘图前端,它定义了所有创建绘图对象需要的函数。
13 | 虽然这些对象存储了绘图所需的全部信息,但还未转换为图片。
14 | 因此,我们需要一个 Makie 后端。
15 | 默认情况下,每一个后端都会重新导出 `Makie.jl` 中的 API,因此只需要安装和加载所需的后端包即可。
16 |
17 | 目前主要有三个后端实现了 Makie 中定义的所有抽象类型的渲染功能。
18 | 第一个后端能够绘制 2D 非交互式的出版物质量级矢量图:`CairoMakie.jl`。
19 | 第二个后端是基于 `GLFW.jl`(支持 GPU)的交互式 2D 和 3D 绘图库:`GLMakie.jl`。
20 | 第三个后端是基于 WebGL 的交互式 2D 和 3D 绘图库 `WGLMakie.jl`,它运行在浏览器中。[查阅 Makie 文档了解更多](http://makie.juliaplots.org/stable/documentation/backends_and_output/)。
21 |
22 | 本书将只介绍一些 `CairoMakie.jl` 和 `GLMakie.jl` 的例子。
23 |
24 | 使用任一绘图后端的方法是 `using` 该后端并调用 `activate!` 函数。
25 | 示例如下:
26 |
27 | ```
28 | using GLMakie
29 | GLMakie.activate!()
30 | ```
31 |
32 | 现在可以开始绘制出版质量级的图。
33 | 但是,在绘图之前,应知道如何保存。
34 | `save` 图片 `fig` 的最简单方法是 `save("filename.png", fig)`。
35 | `CairoMakie.jl` 也支持保存为其他格式,如 `svg` 和 `pdf`。
36 | 通过传递指定的参数可以轻松地改变图片的分辨率。
37 | 对于矢量格式,指定的参数为 `pt_per_unit`。例如:
38 |
39 | ```
40 | save("filename.pdf", fig; pt_per_unit=2)
41 | ```
42 |
43 | 或
44 |
45 | ```
46 | save("filename.pdf", fig; pt_per_unit=0.5)
47 | ```
48 |
49 | 对于 `png`,则指定 `px_per_unit`。
50 | 查阅 [后端 & 输出](https://makie.juliaplots.org/stable/documentation/backends_and_output/) 可获得更多详细信息。
51 |
52 | 另一重要问题是如何可视化输出数据图。
53 | 在使用 `CairoMakie.jl` 时,Julia REPL 不支持显示图片,所以你还需要 IDE(Integrated Development Environment,集成开发环境),例如支持 `png` 或 `svg` 作为输出的 VSCode,Jupyter 或 Pluto。
54 | 另一个包 `GLMakie.jl` 则能够创建交互式窗口,或在调用 `Makie.inline!(true)` 时在行间显示位图。
55 |
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
1 | name = "JDS"
2 | uuid = "6c596d62-2771-44f8-8373-3ec4b616ee9d"
3 | authors = ["Jose Storopoli", "Rik Huijzer", "Lazaro Alonso"]
4 | version = "0.1.0"
5 |
6 | [deps]
7 | Books = "939d5c6b-51ae-42e7-97ca-7564d0d4ad91"
8 | CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
9 | CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
10 | CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
11 | ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
12 | Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
13 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
14 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
15 | Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
16 | Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
17 | FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
18 | GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
19 | GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326"
20 | ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
21 | InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
22 | LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
23 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
24 | Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
25 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
26 | QuartzImageIO = "dca85d43-d64c-5e67-8c65-017450d5d020"
27 | Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
28 | Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
29 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
30 | StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
31 | TestImages = "5e47fb64-e119-507b-a336-dd2b206d9990"
32 | XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
33 |
34 | [compat]
35 | Books = "1"
36 | CSV = "0.10"
37 | CairoMakie = "0.7"
38 | CategoricalArrays = "0.10"
39 | ColorSchemes = "3"
40 | Colors = "0.12"
41 | DataFrames = "1.1"
42 | Distributions = "0.25"
43 | FileIO = "1"
44 | GLMakie = "0.5"
45 | GeometryBasics = "0.4"
46 | ImageMagick = "1"
47 | LaTeXStrings = "1"
48 | Makie = "0.16"
49 | QuartzImageIO = "0.7"
50 | Reexport = "1.1"
51 | StatsBase = "0.33"
52 | TestImages = "1"
53 | XLSX = "0.7"
54 |
--------------------------------------------------------------------------------
/contents/dataframes_groupby_combine.md:
--------------------------------------------------------------------------------
1 | ## Groupby 和 Combine {#sec:groupby_combine}
2 |
3 | 在 R 编程语言中,@wickham2011split 推广了用于数据转换的 split-apply-combine 模式。
4 | 在该模式中,我们先将数据 **split** 成不同组,然后对每一组 **apply** 一个或多个函数,最后 **combine** 每组的结果。
5 | `DataFrames.jl` 完全支持 split-apply-combine 模式。
6 | 本节使用之前的学生成绩数据作为示例。
7 | 假设想获得每个学生的平均成绩:
8 |
9 | ```jl
10 | @sco process=without_caption_label all_grades()
11 | ```
12 |
13 | 按照该模式,先将数据集按照学生名称 **split** 为不同组,其次对每组数据 **apply** 均值函数,最后 **combine** 每组的结果。
14 |
15 | 在 split 步骤中使用的函数为 `groupby`,并将函数的第二个参数列 ID 指定为数据集分割的条件。
16 |
17 | ```jl
18 | s = "groupby(all_grades(), :name)"
19 | sco(s; process=string, post=plainblock)
20 | ```
21 |
22 | `mean` 函数来自 Julia 标准库中的 `Statistics` 模块:
23 |
24 | ```
25 | using Statistics
26 | ```
27 |
28 | 应用此函数时,需调用 `combine` 函数:
29 |
30 | ```jl
31 | s = """
32 | gdf = groupby(all_grades(), :name)
33 | combine(gdf, :grade => mean)
34 | """
35 | sco(s; process=without_caption_label)
36 | ```
37 |
38 | 想象一下,如果没有 `groupby` 和 `combine` 函数,则需按照下文这样做。
39 | 我们必须循环遍历数据以将其分割为多组,然后循环遍历每组以应用函数,**以及** 循环遍历每组以收集最终结果。
40 | 因此,split-apply-combine 模式是值得掌握的技术。
41 |
42 | ### 多个源列 {#sec:groupby_combine_multiple_source}
43 |
44 | 但如果我们想将一个函数应用到多列数据,该如何操作?
45 |
46 | ```jl
47 | s = """
48 | group = [:A, :A, :B, :B]
49 | X = 1:4
50 | Y = 5:8
51 | df = DataFrame(; group, X, Y)
52 | """
53 | sco(s; process=without_caption_label)
54 | ```
55 |
56 | 操作与之前类似:
57 |
58 | ```jl
59 | s = """
60 | gdf = groupby(df, :group)
61 | combine(gdf, [:X, :Y] .=> mean; renamecols=false)
62 | """
63 | sco(s; process=without_caption_label)
64 | ```
65 |
66 | 注意到,我们在右箭头 `=>` 前使用了 `.` 点运算符,这表示 `mean` 函数将应用到多个列 `[:X, :Y]`。
67 |
68 | 要在`combine`中使用组合函数,一种简单的方法是创建一个函数来执行预期的组合变换。
69 | 例如,对于一组数据,在先应用 `mean`后调用 `round` 对值取整(即 `Int` )。
70 |
71 | ```jl
72 | s = """
73 | gdf = groupby(df, :group)
74 | rounded_mean(data_col) = round(Int, mean(data_col))
75 | combine(gdf, [:X, :Y] .=> rounded_mean; renamecols=false)
76 | """
77 | sco(s; process=without_caption_label)
78 | ```
79 |
--------------------------------------------------------------------------------
/contents/index.md:
--------------------------------------------------------------------------------
1 | # Welcome {-}
2 |
3 | ```{=html}
4 |
16 |
17 |
22 | ```
23 |
24 | ```{=comment}
25 | This file is only included on the website.
26 | ```
27 |
28 | Welcome! 这是一本关于 **[Julia](https://julialang.org) 数据科学** 的开放获取书籍,同时源代码开源。
29 | 我们的目标读者是来自应用科学各个领域的研究人员。
30 | 当然,我们也希望能对工业界有用。
31 | 你可以使用键盘上的箭头键(左/右)浏览电子书。
32 |
33 | 此译本主要由 [guixinliu](https://github.com/guixinliu) 完成, [findmyway](https://github.com/findmyway) 提供了审校, 其源码公开在 [GitHub](https://github.com/JuliaCN/JuliaDataScience){target="_blank"},如果你在阅读过程中有任何问题和建议,欢迎前往创建issue。 此外,你也可以下载[PDF版的中文译本](https://cn.julialang.org/JuliaDataScience/juliadatascience.pdf)方便离线阅读。
34 |
35 | 本书的英文版可以在其官网[在线阅读](https://juliadatascience.io/)或者获取[英文PDF版](https://juliadatascience.io/juliadatascience.pdf)离线阅读。此外,本书也同时发布在 [Amazon.com](https://www.amazon.com/dp/B09KMRKQ96/)。
36 |
37 |
38 | ### 引用信息 {-}
39 |
40 | 可使用如下条目引用本书的内容:
41 |
42 | ```plaintext
43 | Storopoli, Huijzer and Alonso (2021). Julia Data Science. https://juliadatascience.io. ISBN: 9798489859165.
44 | ```
45 |
46 | 或以 BibTeX 格式:
47 |
48 | ```plaintext
49 | @book{storopolihuijzeralonso2021juliadatascience,
50 | title = {Julia Data Science},
51 | author = {Jose Storopoli and Rik Huijzer and Lazaro Alonso},
52 | url = {https://juliadatascience.io},
53 | year = {2021},
54 | isbn = {9798489859165}
55 | }
56 | ```
57 |
58 | ### 封面 {-}
59 |
60 | ```jl
61 | let
62 | fig = front_cover()
63 | # Use lazy loading to keep homepage speed high.
64 | link_attributes = """loading="lazy" width=80%"""
65 | # When changing this name, also change the link in README.md.
66 | # This doesn't work for some reason; I need to fix it.
67 | filename = "frontcover"
68 | Options(fig; filename, label=filename)
69 | end
70 | ```
71 |
72 |
--------------------------------------------------------------------------------
/src/ci.jl:
--------------------------------------------------------------------------------
"""
    write_thanks_page()

Write the thanks page for when people sign up for email updates.

The page text is written to "thanks.html" inside `BUILD_DIR`; the path of the
written file is returned.
"""
function write_thanks_page()
    text = """








        Thank you



        You successfully signed up for email updates.






        """
    path = joinpath(BUILD_DIR, "thanks.html")
    write(path, text)
    return path
end
34 |
"""
    build_all(; project="default", extra_head="", fail_on_error=false)

Build the book outputs into `BUILD_DIR`.

Creates `BUILD_DIR`, copies "pandoc/favicon.png" into it when that file exists,
then calls `html` (with `build_sitemap=true`), `write_extra_html_files` and
`pdf` for the given `project`. The result of the `pdf` call is returned.
"""
function build_all(; project="default", extra_head="", fail_on_error=false)
    mkpath(BUILD_DIR)
    favicon = "favicon.png"
    source = joinpath("pandoc", favicon)
    # The favicon is optional: only copy it when it is present.
    isfile(source) && cp(source, joinpath(BUILD_DIR, favicon); force=true)
    html(; project, extra_head, fail_on_error, build_sitemap=true)
    write_extra_html_files(project)
    pdf(; project)
    # docx(; project)
end
48 |
"""
    install_fonts()

Download the NotoSerifCJKsc font archive, unpack it into
`~/.fonts/NotoSerifCJKsc` and refresh the font cache for `~/.fonts`.

Relies on the external programs `wget`, `unzip` and `fc-cache`. The result of
the final `fc-cache` call is returned.
"""
function install_fonts()
    @info "installing font[NotoSerifCJKsc]..."
    fonts_dir = joinpath(homedir(), ".fonts")
    mkpath(fonts_dir)
    url = "https://noto-website-2.storage.googleapis.com/pkgs/NotoSerifCJKsc-hinted.zip"
    # Download into a throw-away directory: the archive no longer lands in the
    # current working directory and is removed even when a later step fails.
    mktempdir() do tmp
        archive = joinpath(tmp, "tmp.zip")
        run(`wget -q -O $archive $url`)
        # `-o` overwrites existing files without prompting, so running the
        # installation a second time does not block waiting for input.
        run(`unzip -o $archive -d $(joinpath(fonts_dir, "NotoSerifCJKsc"))`)
    end
    run(`fc-cache --verbose $fonts_dir`)
end
58 |
"""
    build()

Entry point that is called during CI.

Installs the fonts, writes the thanks page and then runs `gen` and `build_all`,
both with `fail_on_error=true`.
"""
function build()
    println("Building JDS")
    install_fonts()
    write_thanks_page()
    gen(; fail_on_error=true)
    build_all(; fail_on_error=true)
end
72 |
--------------------------------------------------------------------------------
/contents/notation.md:
--------------------------------------------------------------------------------
1 | ## 符号 {#sec:notation}
2 |
3 | 我们尽量保持本书符号的一致性。
4 | 这会使阅读和编写代码更容易。
5 | 我们可以将符号定义为三个部分。
6 |
7 | ### Julia 风格指南 {#sec:julia_style_guide}
8 |
9 | 首先,我们尝试遵循 [Julia 风格指南](https://docs.julialang.org/en/v1/manual/style-guide/) 中的约定惯例。
10 | 更重要的是,要编写函数而不是脚本(也可查阅 @sec:engineering)。
11 | 另外,我们使用与 Julia `base/` 模块一致的命名约定,即:
12 |
13 | - 模块采用驼峰命名法: `module JuliaDataScience`, `struct MyPoint`。
14 | (之所以叫驼峰命名法,是因为单词的首字母大写,如 "iPad" 或 "CamelCase", 这使得单词看起来像驼峰。)
15 | - 函数名全部小写,并用下划线分隔单词。
16 | 不过也允许在命名函数时省略分隔符。
17 | 例如,这些函数名都符合约定: `my_function`, `myfunction` 和 `string2int`。
18 |
19 | 同时,避免在条件语句中使用括号,即写为 `if a == b` 而不是 `if (a == b)`,并且每级缩进使用 4 个空格。
20 |
21 | ### Blue 风格指南 {#sec:blue_style_guide}
22 |
23 | [Blue 风格指南](https://github.com/invenia/BlueStyle) 在默认的 Julia 风格指南基础上增加了更多的约定。
24 | 一些规则可能听起来有点古板,但我们发现这样能提高代码的可读性。
25 |
26 | 根据风格指南,我们具体坚持:
27 |
28 | - 每行代码最多 92 字符(Markdown 文件允许更长的行)。
29 | - 使用 `using` 加载模块,且每行最多加载一个。
30 | - 行尾无空格。
31 | 行尾的空格会使代码更改检查更加困难,因为虽然它们不会修改代码行为,但会显示为更改。
32 | - 避免括号内的多余空格。
33 | 因此,要写为 `string(1, 2)` 而不是 `string( 1 , 2 )`。
34 | - 应避免全局变量。
35 | - 尝试将函数名压缩至一到两个词。
36 | - 使用分号 `;` 来说明参数是否为关键字参数。
37 | 例如,使用 `func(x; y=3)` 而不是 `func(x, y=3)`。
38 | - 避免使用多个空格来对齐对象。
39 | 所以,应该写
40 | ```
41 | a = 1
42 | lorem = 2
43 | ```
44 | 而不是
45 | ```
46 | a = 1
47 | lorem = 2
48 | ```
49 | - 当合适时,我们应在双目运算符两侧增加空格,例如, `1 == 2` 或 `y = x + 1`。
50 | - 缩进三引号和三反引号:
51 | ```
52 | s = """
53 | my long text:
54 | [...]
55 | the end.
56 | """
57 | ```
58 | - 不要省略浮点数中的零(即使 Julia 允许这样做)。
59 | 因此,写为 `1.0` 而不是 `1.` ,写为 `0.1` 而不是 `.1`。
60 | - 在 for 循环中使用 `in`,而不是 `=` 或 `∈` (即使 Julia 允许这样做)。
61 |
62 | ### 我们的补充
63 |
64 | - 在行文时,我们将使用 `M.foo` 引用 `M.foo(3, 4)`,而不是使用 `M.foo(...)` 或 `M.foo()`。
65 | - 当讨论软件包时,如 DataFrames 包,我们每次都会明确地写为 `DataFrames.jl`。
66 | 这使得可以非常容易地定位正在讨论的包。
67 | - 对于文件名, 我们坚持使用 "file.txt",而不是 `file.txt` 或 file.txt,因为这种形式与代码保持一致。
68 | - 对于表中的列,如列 `x`,我们坚持使用 `:x`,因为这种形式与代码保持一致。
69 | - 不要在行内代码使用 Unicode 符号。
70 | 这只是一个 PDF 生成中的 bug,但现在我们必须解决它。
71 | - 每个代码块前面的行以冒号 (:) 结尾,表示此行属于该代码块。
72 |
73 | #### 加载符号
74 |
75 | 在不使用 REPL 时,我们更喜欢显式加载符号,即更喜欢使用 `using A: foo` 而不是 `using A`(另请查阅 @jump2021using)。
76 | 在此上下文中,符号表示对象的标识符。
77 | 例如,即使看起来不正常, 但本质上 `DataFrame`、`π` 和 `CSV` 都是符号。
78 | 在使用诸如 `isdefined` 这样的 Julia 方法时,我们发现了这一点:
79 |
80 | ```jl
81 | scob("isdefined(Main, :π)")
82 | ```
83 |
84 | 除了在使用 `using` 时显式加载符号之外,我们还更喜欢使用 `using A: foo` 而不是 `import A: foo` ,因为后者更容易意外地扩展 `foo`。
85 | 注意这不仅仅是针对 Julia 的建议:
86 | Python 也不鼓励通过 `from <module> import *` 隐式加载符号 [@pep8]。
87 |
88 | 显式加载的重要性与语义版本控制有关。
89 | 结合语义版本控制 (<http://semver.org>) 后,版本号将关系到包是否存在 **破坏性** 更新。
90 | 例如,当包 `A` 的版本号从 `0.2.2` 变化到 `0.2.3`,其进行的是非破坏性更新。
91 | 在这种非破坏性更新下,你不用担心你的包会产生破坏,即抛出错误或改变行为。
92 | 如果包 `A` 从 `0.2` 变化到 `1.0`, 这意味着破坏性更新,然后你预计需要对你的包做一些修改,然后才能使包 `A` 再次正常运行。
93 | **然而**,导出额外符号视为非破坏性更新。
94 | 所以,在隐式加载符号时, **非破坏性更新会破坏你的包**。
95 | 这就是为什么显式加载符号是一种很好的风格实践。
96 |
--------------------------------------------------------------------------------
/src/JDS.jl:
--------------------------------------------------------------------------------
"""
    JDS

Support module for the book sources.

It re-exports the `Books` helpers and the packages used by the examples,
includes the helper source files listed below and exports the helper names
listed at the bottom of the module.
"""
module JDS

import Pkg

using Reexport: @reexport

# Everything loaded inside this block is re-exported, so `using JDS` is enough
# to bring these names into scope.
@reexport begin
    using Books:
        BUILD_DIR,
        @sc,
        @sco,
        Options,
        catch_show,
        clean_stacktrace,
        code_block,
        convert_output,
        gen,
        output_block,
        sc,
        sco,
        scob,
        serve,
        without_caption_label,
        html,
        pdf,
        write_extra_html_files
    using CSV
    using CairoMakie
    using CategoricalArrays
    using ColorSchemes
    using Colors
    using DataFrames:
        ByRow,
        DataFrame,
        DataFrameRow,
        Not,
        antijoin,
        combine,
        crossjoin,
        filter,
        groupby,
        innerjoin,
        leftjoin,
        outerjoin,
        rightjoin,
        select!,
        select,
        semijoin,
        subset,
        transform,
        transform!
    using Dates
    using Distributions
    using Downloads
    using FileIO
    using GLMakie
    using GeometryBasics
    using InteractiveUtils
    using LaTeXStrings
    using LinearAlgebra
    using Random: rand, randn, seed!
    using Statistics
    using StatsBase:
        mad,
        mode
    using TestImages
    using XLSX:
        eachtablerow,
        readxlsx,
        writetable
end # @reexport

# Image attribute string ("width=70%").
const SMALL_IM_ATTR = "width=70%"

# Helper source files.
# NOTE(review): "cover.jl" exists in `src/` but is not included here, and
# "makie.jl" is included but not listed in the repository tree — confirm both.
include("ci.jl")
include("df.jl")
include("environment.jl")
include("showcode_additions.jl")
include("makie.jl")
include("stats.jl")
include("bezier.jl")
include("front-cover.jl")

# Showcode additions.
export sce, scsob, trim_last_n_lines, plainblock

# Makie.
export publication_theme, plot_with_legend_and_colorbar
export LaTeX_Strings, demo_themes, new_cycle_theme, scatters_and_lines
export nested_sub_plot!, add_box_inset, add_axis_inset, peaks

# DataFrames.
export grades_2020, grades_2021, all_grades, grades_array, grade_2020
export convert_output, equals_alice, write_grades_csv, grades_with_commas
export write_grades_xlsx, write_xlsx, salaries, responses, wrong_types
export only_pass, correct_types, fix_age_column, fix_date_column

# Stats.
export more_grades
export statistics_graph, plot_central
export plot_dispersion_std, plot_dispersion_mad, plot_dispersion_iqr
export plot_corr
export plot_normal_lognormal, plot_discrete_continuous
export plot_pmf, plot_pdf, plot_cdf
export calculate_pdf
export anscombe_quartet, plot_anscombe

# Book cover.
export front_cover

end # module
112 |
--------------------------------------------------------------------------------
/contents/dataframes_transform.md:
--------------------------------------------------------------------------------
1 | ## 变量变换 {#sec:df_transform}
2 |
3 | ```{=comment}
4 | We need to cover `ifelse` and `case_when`
5 | ```
6 |
7 | 在 @sec:filter 中,我们使用 `filter` 函数筛选一列或多列数据。
8 | 回忆一下, `filter` 函数使用 `source => f::Function` 这样的语法:`filter(:name => name -> name == "Alice", df)`。
9 |
10 | 在 @sec:select 中, 我们使用 `select` 函数选择一列或多列源数据, 并传入一个或多个目标列 `source => target`。
11 | 同样也有例子帮助回忆: `select(df, :name => :people_names)`。
12 |
13 | 本节将讨论如何 **变换** 变量,即如何 **更改数据**。
14 | `DataFrames.jl` 中对应的语法是 `source => transformation => target`。
15 |
16 | 与之前一样,使用 `grades_2020` 数据集:
17 |
18 | ```jl
19 | @sco process=without_caption_label grades_2020()
20 | ```
21 |
22 | 假设想要 `grades_2020` 中的所有成绩加 1。
23 | 首先,需要定义一个接收向量数据并使所有元素加 1 的函数。
24 | 然后使用 `DataFrames.jl` 中的 `transform` 函数。与其他原生 `DataFrames.jl` 函数一样,按照其语法,它接收 `DataFrame` 作为第一个参数:
25 |
26 | ```jl
27 | s = """
28 | plus_one(grades) = grades .+ 1
29 | transform(grades_2020(), :grade_2020 => plus_one)
30 | """
31 | sco(s; process=without_caption_label)
32 | ```
33 |
34 | 如上, `plus_one` 函数接收了 `:grade_2020` 整列。
35 | 这就是为什么要在加 `+` 运算符前添加 `.` 广播运算符。
36 | 可以查阅 @sec:broadcasting 回顾有关广播的操作。
37 |
38 | 如之前所说, `DataFrames.jl` 总是支持 `source => transformation => target` 这样的短语法。
39 | 所以,如果想在输出中保留 `target` 列的命名,操作如下:
40 |
41 | ```jl
42 | s = """
43 | transform(grades_2020(), :grade_2020 => plus_one => :grade_2020)
44 | """
45 | sco(s; process=without_caption_label)
46 | ```
47 |
48 | 也可以使用关键字参数 `renamecols=false`:
49 |
50 | ```jl
51 | s = """
52 | transform(grades_2020(), :grade_2020 => plus_one; renamecols=false)
53 | """
54 | sco(s; process=without_caption_label)
55 | ```
56 |
57 | 还可以使用 `select` 实现相同的转换,具体如下:
58 |
59 | ```jl
60 | s = """
61 | select(grades_2020(), :, :grade_2020 => plus_one => :grade_2020)
62 | """
63 | sco(s; process=without_caption_label)
64 | ```
65 |
66 | 其中 `:` 表明 "选择所有列" ,正如在 @sec:select 讨论的那样。
67 | 另外,还可以使用 Julia 广播更改 `grade_2020` 列,即直接访问 `df.grade_2020`:
68 |
69 | ```jl
70 | s = """
71 | df = grades_2020()
72 | df.grade_2020 = plus_one.(df.grade_2020)
73 | df
74 | """
75 | sco(s; process=without_caption_label)
76 | ```
77 |
78 | 但是,尽管很容易使用 Julia 原生操作构建最后的例子,**我们仍然强烈建议使用在大多数例子中提到的 `DataFrames.jl` 函数,因为它们更加强大并且更容易与其他代码组织**。
79 |
80 | ### 多条件变换 {#sec:multiple_transform}
81 |
82 | 为了展示如何同时更改两列, 我们使用 @sec:join 中的左合并数据:
83 |
84 | ```jl
85 | s = """
86 | leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
87 | """
88 | sco(s; process=without_caption_label)
89 | ```
90 |
91 | 结合此数据集,我们增加一列来判断每位同学是否至少有一个成绩大于 5.5:
92 |
93 | ```jl
94 | s = """
95 | pass(A, B) = [5.5 < a || 5.5 < b for (a, b) in zip(A, B)]
96 | transform(leftjoined, [:grade_2020, :grade_2021] => pass; renamecols=false)
97 | """
98 | sco(s; process=without_caption_label)
99 | ```
100 |
101 | ```{=comment}
102 | I don't think you have covered vector of symbols as col selector...
103 | You might have to do this in the `dataframes_select.md`
104 | ```
105 |
106 | 可以清理下结果,并将上述逻辑整合到一个函数中,然后最终得到符合标准学生的名单:
107 |
108 | ```jl
109 | @sco only_pass()
110 | ```
111 |
--------------------------------------------------------------------------------
/src/cover.jl:
--------------------------------------------------------------------------------
"""
    cover()

Build the print cover of the book.

Copies the favicon and saves the front cover image into the build directory
(`BUILD_DIR` inside the package directory), writes "cover.tex" next to them and
compiles it there. Returns a Markdown link to "cover.pdf".
"""
function cover()
    filename = "cover.pdf"
    dir = joinpath(pkgdir(JDS), BUILD_DIR)
    # The build directory may not exist yet when this runs before a build.
    mkpath(dir)

    favicon_from = joinpath(pkgdir(JDS), "pandoc", "favicon.png")
    favicon_to = joinpath(dir, "favicon.png")
    cp(favicon_from, favicon_to; force=true)

    fig = front_cover()
    png_path = joinpath(dir, "front_cover.png")
    FileIO.save(png_path, fig; px_per_unit=1)

    # See https://kdp.amazon.com/cover-calculator for details.
    tex = raw"""
        \documentclass[
        coverwidth=11.417in,
        coverheight=16.233in,
        spinewidth=0.685in,
        bleedwidth=0.591in,
        12pt
        ]{bookcover}

        \usepackage[
        ]{geometry}

        \usepackage{graphicx}

        \begin{document}
        \begin{bookcover}
        \bookcovercomponent{color}{bg whole}{black}

        \bookcovercomponent{center}{spine}{
        \rotatebox[origin=c]{-90}{
        \large\textcolor{white}{Storopoli, Huijzer \& Alonso}
        \hspace*{7in}
        \Huge\bfseries\textcolor{white}{Julia Data Science}
        }
        \vspace*{1in}
        % Should be 0.75 or less. Checked with the Amazon Print Previewer.
        \includegraphics[height=0.75\textwidth]{favicon.png}
        }

        \bookcovercomponent{normal}{back}{
        \vspace*{1in}
        \hspace*{0.6in}
        \parbox[c]{0.85\textwidth}{\Large\textcolor{white}{
        There are many programming languages and each and every one of them has its strengths and weaknesses.
        Some languages are very quick, but verbose.
        Other languages are very easy to write in, but slow.
        This is known as the `two-language` problem and the Julia programming language aims at circumventing this problem.
        Even though all three of us come from different fields, we all found the Julia language more effective for our research than languages that we've used before.
        However, compared to other languages, Julia is one of the newest languages around.
        This means that the ecosystem around the language is sometimes difficult to navigate through.
        It's difficult to figure out where to start and how all the different packages fit together.
        That is why we decided to create this book!
        We wanted to make it easier for researchers, and especially our colleagues, to start using this awesome language.
        }}
        }

        \bookcovercomponent{normal}{front}{
        \vspace*{-0.5in}
        \hspace*{0.5in} % At minimum hinge size.
        \includegraphics[width=0.97\textwidth]{front_cover.png}
        }

        \end{bookcover}
        \end{document}
        """
    tex_path = joinpath(dir, "cover.tex")
    write(tex_path, tex)

    # NOTE(review): `tectonic` is not among the names loaded by the visible
    # `JDS` module — presumably it comes from a tectonic binary package; confirm.
    tectonic() do bin
        # Compile inside the build directory so the relative image paths in
        # the LaTeX source ("favicon.png", "front_cover.png") resolve.
        cd(dir) do
            run(`$bin --print $tex_path`)
        end
    end

    return "[PDF](/$(filename))"
end
92 |
--------------------------------------------------------------------------------
/contents/preface.md:
--------------------------------------------------------------------------------
1 | # 前言 {#sec:preface}
2 |
3 | 每一种编程语言都有其优势和劣势。
4 | 某些语言可能非常快,但代码冗长。
5 | 另外一些其它语言可能很容易编写代码,但运行较慢。 这就是所谓的 **两语言问题**,Julia 的目标就是避免此问题。
6 | 尽管我们三位作者来自不同的领域,但我们都发现,与之前使用的编程语言相比,使用 Julia 进行研究更加高效。
7 | 我们将在 @sec:why_julia 讨论一些关于 Julia 的观点。
8 | 不过,与其他语言相比,Julia 还是最新颖的语言之一。
9 | 这意味着有时很难驾驭该语言的生态。
10 | 比如,很难弄清楚从哪里开始,也不明白如何组合不同的软件包。
11 | 这就是我们决定写这本书的原因!
12 | 我们想让研究者,特别是我们的同事,更加容易地开始使用这门超酷的语言。
13 |
14 | 如前面所说,每一门语言都有其优势和劣势。
15 | 我们认为,数据科学无疑是 Julia 的优势。
16 | 同时,我们三个都使用 Julia 作为日常的数据科学工具。
17 | 另外,你可能使用 Julia 研究数据科学!
18 | 这就是为什么这本书聚焦在数据科学上。
19 |
20 | 在本节的下一部分,我们将强调 **数据科学的“数据”部分**,并将讨论为什么目前工业界和学术界一直需要数据技能。
21 | 我们还认为,**将软件工程实践引入数据科学** 将有利于减少与合作者更新和共享代码时的冲突。
22 | 大多数数据分析都是合作的结果,因此软件工程实践能够起到很大的帮助。
23 |
24 | ### 数据无处不在 {#sec:data_everywhere}
25 |
26 | 目前来看,**数据很丰富**,在不久的未来还将产生更多的数据。
27 | 一份 2012 年底的报告总结说,从 2005 年到 2020 年,数字化存储的数据量将增长 300 倍,**从 130 EB^[1 EB = 1,000,000 TB。]增加到 40000 EB**[@gantz2012digital]。
28 | 这个数字相当于 40 万亿 GB,更确切地说,这相当于**地球上的每个人创建了 5.2 TB 的数据!**
29 | 目前,在 2020 年,每人平均 **每秒创建 1.7 MB 的数据** [@domo2018data]。
30 | 一份最近的报告预测,**到 2022 年,近三分之二(65%)的国家 GDP 将实现数字化** [@fitzgerald2020idc]。
31 |
32 | 每份职业都将受到越来越多的数据可用性和数据重要性的影响[@chen2014big; @khan2014big]。
33 | 数据用于沟通交流和构建知识,以及制定决策。
34 | 这也就是为什么数据技能很重要。
35 | 如果能自如地处理数据,那么你就会成为一名有价值的研究人员或专业人士。
36 | 换句话说,你将成为 **具有数字素养的人**。
37 |
38 | ## 什么是数据科学? {#sec:why_data_science}
39 |
40 | 数据科学不仅仅是机器学习和统计学,而且也不全是关于预测。
41 | 它甚至不是一门完全包含 STEM(科学,技术,工程,和数学)所有领域的学科 [@Meng2019Data]。
42 | 但有一件事我们可以非常自信地断言,那就是数据科学始终与 **数据** 有关。
43 | 我们写这本书有两重目标:
44 |
45 | * 专注讨论数据科学的主干: **数据**。
46 | * 使用 **Julia** 编程语言来处理数据。
47 |
48 | 我们将在 @sec:why_julia 章节讨论为什么 Julia 对于数据科学来说是一门相当高效的语言。
49 | 现在将注意力继续转向数据。
50 |
51 | ### 数字素养 {#sec:data_literacy}
52 |
53 | 根据 [维基百科](https://en.wikipedia.org/wiki/Data_literacy),数字素养的正式定义是 **阅读、理解、创建和使用数据进行信息交流的能力**。
54 | 我们也喜欢这个非正式的理解,即作为一个具有数字素养的人,你不会对大量数据感到不知所措,相反地可以使用它来做出正确的决策。
55 | 因此,数字素养可以被视为一种具有高度竞争力的技能。
56 | 本书将讨论 数字素养的两个方面:
57 |
58 | 1. 使用 `DataFrames.jl` **操作数据** (@sec:dataframes)。
59 | 你将在本章学到如何:
60 | 1. 读取 CSV 和 Excel 数据到 Julia 。
61 | 2. 使用 Julia 处理数据,即学习如何回答数据问题。
62 | 3. 使用 `filter` 和 `subset` 筛选数据。
63 | 4. 处理缺失数据。
64 | 5. 连接多个数据源。
65 | 6. 分组和汇总数据。
66 | 7. 从 Julia 导出数据到 CSV 和 Excel 文件。
67 | 2. 使用 `Makie.jl` **可视化数据** (@sec:DataVisualizationMakie)。
68 | 你将在本章学到如何:
69 | 1. 使用不同的 `Makie.jl` 后端绘制数据图。
70 | 2. 将可视化数据图保存为多种格式,例如 PNG 或 PDF。
71 | 3. 使用不同的绘图函数实现多样化的数据可视化。
72 | 4. 结合属性自定义可视化图。
73 | 5. 使用和创建新的绘图主题。
74 | 6. 向图中增加 $\LaTeX$ 元素。
75 | 7. 改变颜色和颜色图。
76 | 8. 创建复杂的图布局。
77 |
78 | ## 软件工程 {#sec:engineering}
79 |
80 | 不像大多数数据科学书籍,这本书将更多地强调 **组织代码**。
81 | 这是因为,我们了解到很多数据科学家仅是将他们的代码放在一个大文件中,然后按顺序运行所有语句。
82 | 你可以想象这种情况:强迫读者从头读到尾,而不允许重新回顾之前的部分或立即跳转到感兴趣的部分。
83 | 这适用于小型和简单的项目。但是,随着项目变得更大或更复杂,这将开始出现更多的新问题。
84 | 例如,对于一本写得很好的书,它应被分为不同标题的章和节,其中包含对书中其他部分的引用。
85 | 与此相对应的软件工程实践是**将代码分解为函数**。
86 | 每个函数都有一项名称和一些内容。
87 | 在代码中的任何地方,你可以使用函数告诉计算机应从此处跳转到另一处,然后在那里继续。
88 | 这使你可以更容易地在项目间重用代码、更新代码、共享代码、以及协作并查看全局。
89 | 因此,使用函数可以**节省时间**。
90 |
91 | 所以,在阅读本书时,你最终要习惯阅读和使用函数。
92 | 拥有软件工程技能的另一个优点是,它使得你可以更容易地阅读正在使用的软件包的源码。当你在调试代码或者想准确地理解正在使用的软件包时,这项技能会变得尤为有用。
93 | 最后,你可以放心,我们没有自己发明这项关于函数的强调。
94 | 在行业中,鼓励开发者 **使用函数而不是注释** 是一种常见的做法。
95 | 这意味着,开发者既不单是为人类编写注释,也不单是为计算机编写代码,而是编写一个既能被人类也能被计算机阅读的函数。
96 |
97 | 此外,我们还努力坚持一致的风格指南。
98 | 编程风格指南为编写代码提供指导;比如,哪里应该有空格,哪些命名应该大写。
99 | 坚持严格的风格指南可能听起来有点古板,有时也确实如此。
100 | 然而,代码风格越一致,就越容易阅读和理解代码。
101 | 要阅读我们的代码,你不需要知道我们的风格指南。
102 | 阅读的时候你就会明白了。
103 | 如果您想了解我们风格指南的详细内容,请查阅 @sec:notation。
104 |
105 | ## 致谢 {#sec:acknowledgements}
106 |
107 | 许多人对这本书有直接或间接的贡献。
108 |
109 | Jose Storopoli 要感谢他的家人,特别是他的妻子,他们在写作和评审过程中给予了支持和爱。
110 | 他也感谢他的同事,特别是 [Fernando Serra](https://orcid.org/0000-0002-8178-7313), [Wonder Alexandre Luz Alves](https://orcid.org/0000-0003-0430-950X) 和 [André Librantz](https://orcid.org/0000-0001-8599-9009), 感谢他们的鼓励。
111 |
112 | Rik Huijzer 首先要感谢他格罗宁根大学的博士导师, [Peter de Jonge](https://www.rug.nl/staff/peter.de.jonge/)、[Ruud den Hartigh](https://www.rug.nl/staff/j.r.den.hartigh/) 和 [Frank Blaauw](https://frankblaauw.nl/) ,感谢他们的支持。
113 | 其次,他要感谢他的父母和女朋友,在撰写这本书的假期、周末和晚上,他们提供了巨大的支持。
114 |
115 | Lazaro Alonso 要感谢他的妻子和女儿们鼓励他参与这个项目。
116 |
--------------------------------------------------------------------------------
/contents/dataframes_join.md:
--------------------------------------------------------------------------------
1 | ## Join {#sec:join}
2 |
3 | 本章主要展示和讨论关于多张表的操作。
4 | 目前为止,我们仅探讨了单张表的操作,接下来将探讨如何合并多张表。
5 | `DataFrames.jl` 通过 `join` 函数合并多张表。
6 | `join` 函数非常强大,但可能需要花些时间才能理解。
7 | 然而,你不需要记住下面所有的 `join` 函数,因为 [`DataFrames.jl` 文档](https://DataFrames.juliadata.org/stable/man/joins/) 和本书将会列出它们。
8 | 但是,必须要知道存在 `join` 操作。
9 | 如果要在某张 `DataFrame` 中遍历所有行并与其他数据行比较,那么可能需要如下这些 `join` 函数。
10 |
11 | @sec:dataframes 给出了 2020 年的成绩 `grades_2020`:
12 |
13 | ```jl
14 | s = "grades_2020()"
15 | sco(s; process=without_caption_label)
16 | ```
17 |
18 | 现在需要将 `grades_2020` 与 2021 年的成绩合并:
19 |
20 | ```jl
21 | s = "grades_2021()"
22 | sco(s; process=without_caption_label)
23 | ```
24 |
25 | 此功能的实现就需要用到 `join`。
26 | `DataFrames.jl` 列出了不少于七种的 `join` 函数,
27 | 这看起来令人生畏,但请坚持,因为它们都很有用,后面将会逐个讨论所有函数。
28 |
29 | ### innerjoin {#sec:innerjoin}
30 |
31 | 首先讨论的是 **`innerjoin`**。
32 | 假设存在两个数据集 `A` 和 `B`, 分别具有列 `A_1, A_2, ..., A_n` 和 `B_1, B_2, ..., B_m` ,**并且** 其中的一列具有相同的名字:`A_1` 和 `B_1` 都是 `:id`。
33 | 然后对 `:id` 使用 `innerjoin`,则将遍历 `A_1` 中的所有元素并且与 `B_1` 中的元素进行比较。
34 | 如果元素 **是相同的**,然后将会把 `A_2, ..., A_n` 和 `B_2, ..., B_m` 的相应信息添加到 `:id` 列后。
35 |
36 | 好吧,如果你没有明白上面的描述,请不要担心。
37 | 请查看如下所示的成绩数据集合并结果:
38 |
39 | ```jl
40 | s = "innerjoin(grades_2020(), grades_2021(); on=:name)"
41 | sco(s; process=without_caption_label)
42 | ```
43 |
44 | 注意只有 "Sally" 和 "Hank" 同时存在于两个数据集中。
45 | `innerjoin` 的名字对应了数学中的 **交集**, 即“存在于 $A$ 的元素,也存在于 $B$,或者说存在于 $B$ 的元素,也存在于 $A$”。
46 |
47 | ### outerjoin {#sec:outerjoin}
48 |
49 | 也许你在想, “aha,如果我们有`inner`,那我们可能也会有 `outer`”。
50 | 是的,你猜对了!
51 |
52 | **`outerjoin`** 没有 `innerjoin` 那么严格,只要在 **至少一个数据集中**发现包含的 `name`,就会将相应的列合并到结果中:
53 |
54 | ```jl
55 | s = "outerjoin(grades_2020(), grades_2021(); on=:name)"
56 | sco(s; process=without_caption_label)
57 | ```
58 |
59 | 因此,当其中一个原始数据集不存在对应的值时,该方法会创建 `missing` 值。
60 |
61 | ### crossjoin {#sec:crossjoin}
62 |
63 | 如果使用 **`crossjoin`** 将会出现更多的 `missing` 值。
64 | 该方法会给出 **行的笛卡尔积**,也就是行的乘法,即对于每一行创建一个与另一张表中所有行的组合:
65 |
66 | ```jl
67 | s = "crossjoin(grades_2020(), grades_2021(); on=:id)"
68 | sce(s; post=trim_last_n_lines(2))
69 | ```
70 |
71 | 呃,出错了。
72 | 因为 `crossjoin` 并不按行考虑元素,所以不需要将 `on` 参数指定为想要合并的列:
73 |
74 | ```jl
75 | s = "crossjoin(grades_2020(), grades_2021())"
76 | sce(s; post=trim_last_n_lines(6))
77 | ```
78 |
79 | 呃,又出错了。
80 | 这是一个 `DataFrame` 和 `join` 中很常见的错误。
81 | 2020 和 2021 年成绩表有一个重复的列名,即 `:name`。
82 | 与之前一样,`DataFrames.jl` 的报错输出给出了一个可能修复此错误的简单建议。
83 | 尝试仅传递 `makeunique=true` 解决此问题:
84 |
85 | ```jl
86 | s = "crossjoin(grades_2020(), grades_2021(); makeunique=true)"
87 | sco(s; process=without_caption_label)
88 | ```
89 |
90 | 所以现在,对于 2020 和 2021 年成绩表中的每个人,新表都存在表示其成绩的一行。
91 | 对于直接的查询,例如“谁的成绩最高?”,笛卡尔积的结果通常不太可行,但对于“统计学” 查询来说具有一定意义。
92 |
93 | ### leftjoin 和 rightjoin {#sec:leftjoin_rightjoin}
94 |
95 | **对数据科学项目更有用的是 `leftjoin` 和 `rightjoin`**。
96 | `leftjoin` 将考虑合并时左侧 `DataFrame` 中的所有元素:
97 |
98 | ```jl
99 | s = "leftjoin(grades_2020(), grades_2021(); on=:name)"
100 | sco(s; process=without_caption_label)
101 | ```
102 |
103 | 此处注意,“Bob” 和 “Alice” 的成绩在 2021 成绩表格中是 **缺失** 的,这就是为什么对应的位置是 `missing` 值。
104 | `rightjoin` 实现了相反的操作:
105 |
106 | ```jl
107 | s = "rightjoin(grades_2020(), grades_2021(); on=:name)"
108 | sco(s; process=without_caption_label)
109 | ```
110 |
111 | 而现在 2020 中的部分成绩是缺失的。
112 |
113 | 注意到, **`leftjoin(A, B) != rightjoin(B, A)`**,因为它们的列顺序不同。
114 | 例如,将下面的输出与之前的输出进行比较:
115 |
116 | ```jl
117 | s = "leftjoin(grades_2021(), grades_2020(); on=:name)"
118 | sco(s; process=without_caption_label)
119 | ```
120 |
121 | ### semijoin 和 antijoin {#sec:semijoin_antijoin}
122 |
123 | 最后讨论 **`semijoin`** 和 **`antijoin`**。
124 |
125 | `semijoin` 比 `innerjoin` 更具有限制性。
126 | 它仅返回 **存在于左侧 `DataFrame` 并同时存在于两张 `DataFrame` 的元素**。
127 | 这看起来像是 `innerjoin` 和 `leftjoin` 的组合。
128 |
129 | ```jl
130 | s = "semijoin(grades_2020(), grades_2021(); on=:name)"
131 | sco(s; process=without_caption_label)
132 | ```
133 |
134 | 与 `semijoin` 相对的是 `antijoin`。
135 | 它仅返回 **存在于左侧 `DataFrame` 但不存在于右侧 `DataFrame` 的元素**。
136 |
137 | ```jl
138 | s = "antijoin(grades_2020(), grades_2021(); on=:name)"
139 | sco(s; process=without_caption_label)
140 | ```
141 |
--------------------------------------------------------------------------------
/src/df.jl:
--------------------------------------------------------------------------------
"""
    grades_ages()

Return a `DataFrame` with the columns `name` and `age` for five students.
"""
function grades_ages()
    name = ["Bob", "Sally", "Bob 2", "Alice", "Hank"]
    age = [17, 18, 17, 20, 19]
    DataFrame(; name, age)
end
6 |
"""
    grades_2020()

Return the 2020 grades as a `DataFrame` with the columns `name` and
`grade_2020`.
"""
function grades_2020()
    name = ["Sally", "Bob", "Alice", "Hank"]
    grade_2020 = [1, 5, 8.5, 4]
    DataFrame(; name, grade_2020)
end
12 |
"""
    grades_2021()

Return the 2021 grades as a `DataFrame` with the columns `name` and
`grade_2021`.
"""
function grades_2021()
    name = ["Bob 2", "Sally", "Hank"]
    grade_2021 = [9.5, 9.5, 6]
    DataFrame(; name, grade_2021)
end
18 |
"""
    all_grades()

Return the 2020 and 2021 grades stacked into one `DataFrame` with the columns
`name` and `grade`.

The grade columns of both years are renamed to `grade`, and the 2021 name
"Bob 2" is replaced by "Bob" before the two tables are concatenated.
"""
function all_grades()
    df1 = grades_2020()
    df1 = select(df1, :name, :grade_2020 => :grade)
    df2 = grades_2021()
    df2 = select(df2, :name, :grade_2021 => :grade)
    rename_bob2(data_col) = replace.(data_col, "Bob 2" => "Bob")
    df2 = transform(df2, :name => rename_bob2 => :name)
    return vcat(df1, df2)
end
28 |
"""
    grades_for_2020()

Return the inner join of `grades_ages()` and `grades_2020()` on `:name`.
"""
function grades_for_2020()
    innerjoin(grades_ages(), grades_2020(); on=:name)
end
32 |
"""
    grades_array()

Return the named tuple `(; name, age, grade_2020)`, where each field is a
plain vector with one entry per student.
"""
function grades_array()
    name = ["Bob", "Sally", "Alice", "Hank"]
    age = [17, 18, 20, 19]
    grade_2020 = [5.0, 1.0, 8.5, 4.0]
    (; name, age, grade_2020)
end
39 |
"""
    second_row()

Return the second entry of each vector from `grades_array()` as the tuple
`(name, age, grade_2020)`.
"""
function second_row()
    name, age, grade_2020 = grades_array()
    i = 2
    row = (name[i], age[i], grade_2020[i])
end
45 |
"""
    names_grades1()

Return the `name` column of `grades_2020()`, accessed as `df.name`.
"""
function names_grades1()
    df = grades_2020()
    df.name
end
50 |
"""
    names_grades2()

Return the `name` column of `grades_2020()`, accessed as `df[!, :name]`.
"""
function names_grades2()
    df = grades_2020()
    df[!, :name]
end
55 |
"""
    grade_2020(i::Int)

Return row `i` of `grades_2020()`.
"""
function grade_2020(i::Int)
    df = grades_2020()
    df[i, :]
end
60 |
"""
    grade_2020(name::String)

Return the 2020 grade of the student called `name`.

The lookup goes through a `Dict` built from the `name` and `grade_2020`
columns of `grades_2020()`.
"""
function grade_2020(name::String)
    df = grades_2020()
    dic = Dict(zip(df.name, df.grade_2020))
    dic[name]
end
66 |
"""
    grades_indexing(df)

Return the first two entries of the `:name` column of `df`.
"""
grades_indexing(df) = df[1:2, :name]
68 |
"""
    grades_2020(names::Vector{Int})

Return the rows of `grades_2020()` selected by the integer row indices in
`names`.
"""
function grades_2020(names::Vector{Int})
    df = grades_2020()
    df[names, :]
end
73 |
"""
    equals_alice(name::String)

Return `true` when `name` is exactly "Alice" and `false` otherwise.
"""
equals_alice(name::String) = name == "Alice"
75 |
"""
    write_grades_csv()

Write `grades_2020()` with `CSV.write` to "grades.csv", a path relative to the
current working directory.
"""
function write_grades_csv()
    path = "grades.csv"
    CSV.write(path, grades_2020())
end
80 |
"""
    grades_with_commas()

Return `grades_2020()` with the name in the third row replaced by "Alice,",
so that the data contains a comma.
"""
function grades_with_commas()
    df = grades_2020()
    df[3, :name] = "Alice,"
    df
end
86 |
"""
    inside_tempdir(f)

Run `f` via `cd(f, dir)`, where `dir` is a newly created temporary directory,
and return the result.
"""
inside_tempdir(f) = cd(f, mktempdir())
"""
    output_block_inside_tempdir(f)

Run `f` through `inside_tempdir` and pass its result to `output_block`.
"""
output_block_inside_tempdir(f) = output_block(inside_tempdir(f))
89 |
"""
    write_xlsx(name, df::DataFrame)

Write `df` with `writetable` to the file called `name` with the extension
".xlsx" appended. The columns of `df` are passed as the data and the column
names of `df` as the header.
"""
function write_xlsx(name, df::DataFrame)
    path = "$name.xlsx"
    data = collect(eachcol(df))
    cols = names(df)
    writetable(path, data, cols)
end
96 |
"""
    write_grades_xlsx()

Write `grades_2020()` to "grades.xlsx" via `write_xlsx` and return that file
name.
"""
function write_grades_xlsx()
    path = "grades"
    write_xlsx(path, grades_2020())
    "$path.xlsx"
end
102 |
"""
    salaries()

Return a `DataFrame` with the columns `names` and `salary`; the salary in the
last row is `missing`.
"""
function salaries()
    names = ["John", "Hank", "Karen", "Zed"]
    salary = [1_900, 2_800, 2_800, missing]
    DataFrame(; names, salary)
end
108 |
"""
    responses()

Return a two-row example `DataFrame` with the columns `id`, `q1`, `q2`, `q3`,
`q4` and `q5`, holding integers, symbols and strings.
"""
function responses()
    id = [1, 2]
    q1 = [28, 61]
    q2 = [:us, :fr]
    q3 = ["F", "B"]
    q4 = ["B", "C"]
    q5 = ["A", "E"]
    DataFrame(; id, q1, q2, q3, q4, q5)
end
118 |
"""
    wrong_types()

Return a `DataFrame` with the columns `id`, `date` and `age` in which the
dates and the age groups are stored as plain strings.
"""
function wrong_types()
    id = 1:4
    date = ["28-01-2018", "03-04-2019", "01-08-2018", "22-11-2020"]
    age = ["adolescent", "adult", "infant", "adult"]
    DataFrame(; id, date, age)
end
125 |
"""
    fix_date_column(df::DataFrame)

Replace the `:date` column of `df` by `Date` values parsed from strings in the
"dd-mm-yyyy" format. `df` is modified in place and returned.

Not using `transform(df, :date => strings2dates; renamecols=false)`, because
it's less readable.
"""
function fix_date_column(df::DataFrame)
    strings2dates(dates::Vector) = Date.(dates, dateformat"dd-mm-yyyy")
    dates = strings2dates(df[!, :date])
    df[!, :date] = dates
    df
end
135 |
"""
    fix_age_column(df)

Replace the `:age` column of `df` by an ordered categorical column with the
levels "infant", "adolescent" and "adult" (in that order). `df` is modified in
place and returned.
"""
function fix_age_column(df)
    levels = ["infant", "adolescent", "adult"]
    ages = categorical(df[!, :age]; levels, ordered=true)
    df[!, :age] = ages
    df
end
142 |
"""
    correct_types()

Return `wrong_types()` after applying `fix_date_column` and then
`fix_age_column` to it.
"""
function correct_types()
    df = wrong_types()
    df = fix_date_column(df)
    df = fix_age_column(df)
end
148 |
"""
    only_pass()

Return the names of the students for whom at least one of the 2020 and 2021
grades is above 5.5.

The grades are combined with a left join of `grades_2020()` and
`grades_2021()` on `:name`; rows whose `:pass` value is `missing` are dropped
by `subset(...; skipmissing=true)`.
"""
function only_pass()
    leftjoined = leftjoin(grades_2020(), grades_2021(); on=:name)
    pass(A, B) = [5.5 < a || 5.5 < b for (a, b) in zip(A, B)]
    leftjoined = transform(leftjoined, [:grade_2020, :grade_2021] => pass => :pass)
    passed = subset(leftjoined, :pass; skipmissing=true)
    return passed.name
end
156 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_glmakie.md:
--------------------------------------------------------------------------------
1 | ## GLMakie.jl {#sec:glmakie}
2 |
3 | `CairoMakie.jl` 满足了所有关于静态 2D 图的需求。
4 | 但除此之外,有时候还需要交互性,特别是在处理 3D 图的时候。
5 | 使用 3D 图可视化数据是 **洞察** 数据的常见做法。
6 | 这就是 `GLMakie.jl` 的用武之地,它使用 [OpenGL](http://www.opengl.org/) 作为添加交互和响应功能的绘图后端。
7 | 与之前一样,一幅简单的图只包括线和点。因此,接下来将从简单图开始。因为已经知道布局如何使用,所以将在例子中应用一些布局。
8 |
9 | ### 散点图和折线图
10 |
11 | 散点图有两种绘制选项,第一种是 `scatter(x, y, z)`,另一种是 `meshscatter(x, y, z)`。
12 | 若使用第一种,标记则不会沿着坐标轴缩放,但在使用第二种时标记会缩放, 这是因为此时它们是三维空间的几何实体。
13 | 例子如下:
14 |
15 | ```
16 | using GLMakie
17 | GLMakie.activate!()
18 | ```
19 |
20 | ```jl
21 | @sco JDS.scatters_in_3D()
22 | ```
23 |
24 | 另请注意,标记可以是不同的几何实体,比如正方形或矩形。另外,也可以为标记设置 `colormap`。
25 | 对于上面位于中间的 3D 图,如果想获得完美的球体,那么只需如右侧图那样添加 `aspect = :data` 参数。
26 | 绘制 `lines` 或 `scatterlines` 也很简单:
27 |
28 | ```jl
29 | @sco JDS.lines_in_3D()
30 | ```
31 |
32 | 在 3D 图中绘制 `surface`, `wireframe` 和 `contour` 是一项容易的工作。
33 |
34 | ### `surface`,`wireframe`,`contour`,`contourf` 和 `contour3d`
35 |
36 | 将使用如下的 `peaks` 函数展示这些例子:
37 |
38 | ```jl
39 | @sc JDS.peaks()
40 | ```
41 |
42 | 不同绘图函数的输出如下:
43 |
44 | ```jl
45 | @sco JDS.plot_peaks_function()
46 | ```
47 |
48 | 但是也可以使用 `heatmap(x, y, z)`,`contour(x, y, z)` 或 `contourf(x, y, z)` 绘图:
49 |
50 | ```jl
51 | @sco JDS.heatmap_contour_and_contourf()
52 | ```
53 |
54 | 另外,只要将`Axis` 更改为 `Axis3`,这些图就会自动位于 x-y 平面:
55 |
56 | ```jl
57 | @sco JDS.heatmap_contour_and_contourf_in_a_3d_plane()
58 | ```
59 |
60 | 将这些绘图函数混合在一起也是非常简单的,如下所示:
61 |
62 | ```
63 | using TestImages
64 | ```
65 |
66 | ```jl
67 | @sco JDS.mixing_surface_contour3d_contour_and_contourf()
68 | ```
69 |
70 | 还不错,对吧?从这里也可以看出,任何的 `heatmap`, `contour`,`contourf` 和 `image` 都可以绘制在任何平面上。
71 |
72 | ### `arrows` 和 `streamplot`
73 |
74 | 当想要知道给定变量的方向时,`arrows` 和 `streamplot` 会变得非常有用。
75 | 参见如下的示例^[此处使用 Julia 标准库中的 `LinearAlgebra`。]:
76 |
77 | ```
78 | using LinearAlgebra
79 | ```
80 |
81 | ```jl
82 | @sco JDS.arrows_and_streamplot_in_3d()
83 | ```
84 |
85 | 另外一些有趣的例子是 `mesh(obj)`,`volume(x, y, z, vals)` 和 `contour(x, y, z, vals)`。
86 |
87 | ### `mesh` 和 `volume`
88 |
89 | 绘制网格在想要画出几何实体时很有用,例如 `Sphere` 或矩形(即 `FRect3D`)这样的几何实体。
90 | 另一种在 3D 空间中可视化的方法是调用 `volume` 和 `contour` 函数,它们通过实现 [光线追踪](https://en.wikipedia.org/wiki/Ray_tracing_(graphics)) 来模拟各种光学效果。
91 | 例子如下:
92 |
93 | ```
94 | using GeometryBasics
95 | ```
96 |
97 | ```jl
98 | @sco JDS.mesh_volume_contour()
99 | ```
100 |
101 | 注意到透明球和立方体绘制在同一个坐标系中。
102 | 截至目前,我们已经包含了 3D 绘图的大多数用例。
103 | 另一个例子是 `?linesegments`。
104 |
105 | 参考之前的例子,可以使用球体和矩形平面创建一些自定义图:
106 |
107 | ```
108 | using GeometryBasics, Colors
109 | ```
110 |
111 | 首先为球体定义一个矩形网格,而且给每个球定义不同的颜色。
112 | 另外,可以将球体和平面混合在一张图里。下面的代码定义了所有必要的数据。
113 |
114 | ```jl
115 | sc("""
116 | seed!(123)
117 | spheresGrid = [Point3f(i,j,k) for i in 1:2:10 for j in 1:2:10 for k in 1:2:10]
118 | colorSphere = [RGBA(i * 0.1, j * 0.1, k * 0.1, 0.75) for i in 1:2:10 for j in 1:2:10 for k in 1:2:10]
119 | spheresPlane = [Point3f(i,j,k) for i in 1:2.5:20 for j in 1:2.5:10 for k in 1:2.5:4]
120 | cmap = get(colorschemes[:plasma], LinRange(0, 1, 50))
121 | colorsPlane = cmap[rand(1:50,50)]
122 | rectMesh = FRect3D(Vec3f(-1, -1, 2.1), Vec3f(22, 11, 0.5))
123 | recmesh = GeometryBasics.mesh(rectMesh)
124 | colors = [RGBA(rand(4)...) for v in recmesh.position]
125 | """)
126 | ```
127 |
128 | 然后可使用如下方式简单地绘图:
129 |
130 | ```jl
131 | @sco JDS.grid_spheres_and_rectangle_as_plate()
132 | ```
133 |
134 | 注意,右侧图中的矩形平面是半透明的,这是因为颜色函数 `RGBA()` 中定义了 `alpha` 参数。
135 | 矩形函数是通用的,因此很容易用来实现 3D 方块,而它又能用于绘制 3D 直方图。
136 | 参见如下的例子,我们将再次使用 `peaks` 函数并增加一些定义:
137 |
138 | ```jl
139 | sc("""
140 | x, y, z = peaks(; n=15)
141 | δx = (x[2] - x[1]) / 2
142 | δy = (y[2] - y[1]) / 2
143 | cbarPal = :Spectral_11
144 | ztmp = (z .- minimum(z)) ./ (maximum(z .- minimum(z)))
145 | cmap = get(colorschemes[cbarPal], ztmp)
146 | cmap2 = reshape(cmap, size(z))
147 | ztmp2 = abs.(z) ./ maximum(abs.(z)) .+ 0.15
148 | """)
149 | ```
150 |
151 | 其中方块的尺寸由 $\delta x, \delta y$ 指定。 `cmap2` 用于指定每个方块的颜色而 `ztmp2` 用于指定每个方块的透明度。如下图所示。
152 |
153 | ```jl
154 | @sco JDS.histogram_or_bars_in_3d()
155 | ```
156 |
157 | 应注意到,也可以在 `mesh` 对象上调用 `lines` 或 `wireframe`。
158 |
159 | ### 填充的线和带
160 |
161 | 在最终的例子中, 我们将展示如何使用 `band`和一些 `linesegments` 填充 3D 图中的曲线:
162 |
163 | ```jl
164 | @sco JDS.filled_line_and_linesegments_in_3D()
165 | ```
166 |
167 | 最后,我们的3D绘图之旅到此结束。
168 | 你可以将我们这里展示的一切结合起来,去创造令人惊叹的 3D 图!
169 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_themes.md:
--------------------------------------------------------------------------------
1 | ## 主题 {#sec:themes}
2 |
3 | 有多种方式可以改变图的整体外观。你可以使用 [预定义主题](http://makie.juliaplots.org/stable/documentation/theming/predefined_themes/index.html) 或自定义的主题。例如,通过 `with_theme(your_plot_function, theme_dark())` 使用预定义的暗色主题。另外,也可以使用 `Theme(kwargs)` 构建你自己的主题或使用 `update_theme!(kwargs)` 更新当前激活的主题。
4 |
5 | 还可以使用 `set_theme!(theme; kwargs...)` 将当前主题改为 `theme`,并且通过 `kwargs` 覆盖或增加一些属性。使用不带参数的 `set_theme!()` 即可重置之前的所有设置,恢复为默认主题。在下面的例子中,我们准备了具有不同样式的测试绘图函数,以便于观察每个主题的大多数属性。
6 |
7 | ```jl
8 | sco(
9 | """
10 | using Random: seed!
11 | seed!(123)
12 | y = cumsum(randn(6, 6), dims=2)
13 | """
14 | )
15 | ```
16 |
17 |
18 | 本例随机生成了一个大小为 `(20,20)` 的矩阵,以便于绘制一张热力图(heatmap)。
19 | 同时本例也指定了 $x$ 和 $y$ 的范围。
20 |
21 | ```jl
22 | sco(
23 | """
24 | using Random: seed!
25 | seed!(13)
26 | xv = yv = LinRange(-3, 0.5, 20)
27 | matrix = randn(20, 20)
28 | matrix[1:6, 1:6] # first 6 rows and columns
29 | """
30 | )
31 | ```
32 |
33 | 因此,新绘图函数如下所示:
34 |
35 | ```jl
36 | @sc demo_themes(y, xv, yv, matrix)
37 | ```
38 |
39 | 注意,`series` 函数的作用是同时绘制多条附带标签的直线图和散点图。另外还绘制了附带 colorbar 的 heatmap。如图所示,有两种暗色主题,一种是 `theme_dark()` ,另一种是 `theme_black()`。
40 |
41 | ```jl
42 | s = """
43 | CairoMakie.activate!() # hide
44 | filenames = ["theme_dark", "theme_black"] # hide
45 | objects = [ # hide
46 | # Don't indent here because it indent the output incorrectly. # hide
47 | with_theme(theme_dark()) do
48 | demo_themes(y, xv, yv, matrix)
49 | end
50 | with_theme(theme_black()) do
51 | demo_themes(y, xv, yv, matrix)
52 | end
53 | ] # hide
54 | link_attributes = "width=60%" # hide
55 | Options(obj, filename, link_attributes) = Options(obj; filename, link_attributes) # hide
56 | Options.(objects, filenames, link_attributes) # hide
57 | """
58 | sco(s)
59 | ```
60 |
61 | 另外有三种白色主题,`theme_ggplot2()`,`theme_minimal()` 和 `theme_light()`。这些主题对于更标准的出版图很有用。
62 |
63 | ```jl
64 | s = """
65 | CairoMakie.activate!() # hide
66 | filenames = ["theme_ggplot2", # hide
67 | "theme_minimal", "theme_light"] # hide
68 | objects = [ # hide
69 | # Don't indent here because it indent the output incorrectly. # hide
70 | with_theme(theme_ggplot2()) do
71 | demo_themes(y, xv, yv, matrix)
72 | end
73 | with_theme(theme_minimal()) do
74 | demo_themes(y, xv, yv, matrix)
75 | end
76 | with_theme(theme_light()) do
77 | demo_themes(y, xv, yv, matrix)
78 | end
79 | ] # hide
80 | link_attributes = "width=60%" # hide
81 | Options(obj, filename, link_attributes) = Options(obj; filename, link_attributes) # hide
82 | Options.(objects, filenames, link_attributes) # hide
83 | """
84 | sco(s)
85 | ```
86 |
87 | 另一种方案是通过使用 `with_theme(your_plot, your_theme())` 创建自定义 `Theme` 。
88 | 例如,以下主题可以作为出版质量图的初级模板:
89 |
90 | ```jl
91 | @sc publication_theme()
92 | ```
93 |
94 | 为简单起见,在接下来的例子中使用它绘制 `scatterlines` 和 `heatmap`。
95 |
96 | ```jl
97 | @sc plot_with_legend_and_colorbar()
98 | ```
99 |
100 | 然后使用前面定义的 `Theme`,其输出如 (@fig:plot_with_legend_and_colorbar) 所示。
101 |
102 | ```jl
103 | s = """
104 | CairoMakie.activate!() # hide
105 | with_theme(plot_with_legend_and_colorbar, publication_theme())
106 | label = "plot_with_legend_and_colorbar" # hide
107 | caption = "Themed plot with Legend and Colorbar." # hide
108 | link_attributes = "width=60%" # hide
109 | Options(current_figure(); filename=label, label, caption, link_attributes) # hide
110 | """
111 | sco(s)
112 | ```
113 |
114 | 如果需要在 `set_theme!(your_theme)`后更改一些设置,那么可以使用 `update_theme!(resolution=(500, 400), fontsize=18)`。
115 | 另一种方法是给 `with_theme` 函数传递额外的参数:
116 |
117 | ```jl
118 | s = """
119 | CairoMakie.activate!() # hide
120 | fig = (resolution=(600, 400), figure_padding=1, backgroundcolor=:grey90)
121 | ax = (; aspect=DataAspect(), xlabel=L"x", ylabel=L"y")
122 | cbar = (; height=Relative(4 / 5))
123 | with_theme(publication_theme(); fig..., Axis=ax, Colorbar=cbar) do
124 | plot_with_legend_and_colorbar()
125 | end
126 | label = "plot_theme_extra_args" # hide
127 | caption = "Theme with extra args." # hide
128 | link_attributes = "width=60%" # hide
129 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
130 | """
131 | sco(s)
132 | ```
133 |
134 | 现在,接下来将讨论如何使用 LaTeX 字符串和自定义主题进行绘图。
135 |
--------------------------------------------------------------------------------
/contents/dataframes_select.md:
--------------------------------------------------------------------------------
1 | ## Select {#sec:select}
2 |
3 | 上节讨论了 **按行选取的 `filter`**, 而本节将讨论 **按列选取的 `select`**。
4 | 然而, `select` 不止能用于按列选取,本节还会讨论更加广泛的用法。
5 | 首先,创建具有多列的数据集:
6 |
7 | ```jl
8 | @sco responses()
9 | ```
10 |
11 | 上述数据表示某问卷中五个问题(`q1`,`q2`,...,`q5`)的答案。
12 | 首先,选取数据集中的一些列。
13 | 照例使用 `Symbol` 指定列:
14 |
15 | ```jl
16 | s = "select(responses(), :id, :q1)"
17 | sco(s, process=without_caption_label)
18 | ```
19 |
20 | 也可以使用字符串:
21 |
22 | ```jl
23 | s = """select(responses(), "id", "q1", "q2")"""
24 | sco(s, process=without_caption_label)
25 | ```
26 |
27 | 如果要选取**除了** 某些列外的所有列,请使用 `Not`:
28 |
29 | ```jl
30 | s = """select(responses(), Not(:q5))"""
31 | sco(s, process=without_caption_label)
32 | ```
33 |
34 | `Not` 也适用于多列:
35 |
36 | ```jl
37 | s = """select(responses(), Not([:q4, :q5]))"""
38 | sco(s, process=without_caption_label)
39 | ```
40 |
41 | 当然也可以将要保留的列参数和 **不** 保留的列参数组合起来:
42 |
43 | ```jl
44 | s = """select(responses(), :q5, Not(:id))"""
45 | sco(s, process=without_caption_label)
46 | ```
47 |
48 | 注意,`q5` 是 `select` 返回的 `DataFrame` 的第一列。
49 | 要实现如上的操作,更聪明的做法是使用 `:`。
50 | 冒号 `:` 可以认为是 **前述条件尚未包含的所有列**。
51 | 例如:
52 |
53 | ```jl
54 | s = """select(responses(), :q5, :)"""
55 | sco(s, process=without_caption_label)
56 | ```
57 |
58 | 或者,把 `q5` 放在第二个位置[^sudete]:
59 |
60 | [^sudete]: 感谢 Sudete 在 Discourse 论坛 (<https://discourse.julialang.org/t/pull-dataframes-columns-to-the-front/60327/5>) 上给予的建议。
61 |
62 | ```jl
63 | s = "select(responses(), 1, :q5, :)"
64 | sco(s, process=without_caption_label)
65 | ```
66 |
67 | > **_NOTE:_**
68 | > 正如你所看到的那样,有多种列选择方法。
69 | > 它们都被称为 [**列选择器**](https://bkamins.github.io/julialang/2021/02/06/colsel.html)。
70 | >
71 | > 可以使用:
72 | >
73 | > * `Symbol`: `select(df, :col)`
74 | >
75 | > * `String`: `select(df, "col")`
76 | >
77 | > * `Integer`: `select(df, 1)`
78 |
79 | 甚至可以使用 `select` 重命名列,语法是 `source => target`:
80 |
81 | ```jl
82 | s = """select(responses(), 1 => "participant", :q1 => "age", :q2 => "nationality")"""
83 | sco(s, process=without_caption_label)
84 | ```
85 |
86 | 另外,还可以使用 "splat" 算符 `...` (请查阅 @sec:splat) 写作如下形式:
87 |
88 | ```jl
89 | s = """
90 | renames = (1 => "participant", :q1 => "age", :q2 => "nationality")
91 | select(responses(), renames...)
92 | """
93 | sco(s, process=without_caption_label)
94 | ```
95 |
96 | ## 类型和缺失值 {#sec:missing_data}
97 |
98 | ```{=comment}
99 | Try to combine with transformations
100 |
101 | categorical
102 | allowmissing
103 | disallowmissing
104 | ```
105 |
106 | 正如在 @sec:load_save 讨论的那样, `CSV.jl` 会尽可能推断每列数据应该使用的类型。
107 | 然而,这并不总是能完美实现。
108 | 本节将说明为什么合适的类型是重要的,以及如何修复错误数据类型。
109 | 为了更清晰地展示类型,接下来将给出 `DataFrame` 的文本输出,而不是格式化打印的表。
110 | 本节将使用如下的数据集:
111 |
112 | ```jl
113 | @sco process=string post=output_block wrong_types()
114 | ```
115 |
116 | 因为日期列的类型并不正确,所以 `sort` 并不能正常工作:
117 |
118 | ```{=comment}
119 | Whoa! You haven't introduced the reader to sorting with `sort` yet.
120 | ```
121 |
122 | ```jl
123 | s = "sort(wrong_types(), :date)"
124 | scsob(s)
125 | ```
126 |
127 | 为了修复此问题,可以使用在 @sec:dates 中提到的 Julia 标准库 `Dates` 模块:
128 |
129 | ```jl
130 | @sco process=string post=output_block fix_date_column(wrong_types())
131 | ```
132 |
133 | 现在,排序的结果与预期相符:
134 |
135 | ```jl
136 | s = """
137 | df = fix_date_column(wrong_types())
138 | sort(df, :date)
139 | """
140 | scsob(s)
141 | ```
142 |
143 | 年龄列存在相似的问题:
144 |
145 | ```jl
146 | s = "sort(wrong_types(), :age)"
147 | scsob(s)
148 | ```
149 |
150 | 这显然不正确,因为婴儿比成年人和青少年更年轻。
151 | 对于此问题和其他分类数据的解决方案是 `CategoricalArrays.jl`:
152 |
153 | ```
154 | using CategoricalArrays
155 | ```
156 |
157 | 可以使用 `CategoricalArrays.jl` 包为分类变量数据添加层级顺序:
158 |
159 | ```jl
160 | @sco process=string post=output_block fix_age_column(wrong_types())
161 | ```
162 |
163 | > **_NOTE:_**
164 | > 此处注意参数 `ordered=true` 将告诉 `CategoricalArrays.jl` 的 `categorical` 函数,分类数据是排好序的。
165 | > 如果没有此参数,任何的大小比较都不能实现。
166 |
167 | 现在可以正确地按年龄排序:
168 |
169 | ```jl
170 | s = """
171 | df = fix_age_column(wrong_types())
172 | sort(df, :age)
173 | """
174 | scsob(s)
175 | ```
176 |
177 | 因为已经定义了一组函数,因此可以通过调用函数来定义修正后的数据:
178 | ```jl
179 | @sco process=string post=output_block correct_types()
180 | ```
181 |
182 | 数据中的年龄是有序的 (`ordered=true`),因此可以正确比较年龄类别:
183 |
184 | ```jl
185 | s = """
186 | df = correct_types()
187 | a = df[1, :age]
188 | b = df[2, :age]
189 | a < b
190 | """
191 | scob(s)
192 | ```
193 |
194 | 如果元素类型为字符串,这将产生错误的比较:
195 |
196 | ```jl
197 | s = "\"infant\" < \"adult\""
198 | scob(s)
199 | ```
200 |
--------------------------------------------------------------------------------
/contents/dataframes_load_save.md:
--------------------------------------------------------------------------------
1 | ## 加载和保存文件 {#sec:load_save}
2 |
3 | 仅在 Julia 程序中使用数据非常有局限性,通常还需要能够加载或保存数据。
4 | 因此,本节主要讨论如何存储文件到硬盘和从硬盘读取文件。
5 | 我们重点关注 CSV 和 Excel 这两类最常见的数据文件格式,分别参见 @sec:csv 和 @sec:excel。
6 |
7 | ### CSV {#sec:csv}
8 |
9 | **C**omma-**s**eparated-**v**alues (CSV) 文件是非常有效的表格存储方式。
10 | CSV 文件相比其他数据存储文件有两点优势。首先,正如名称所指示的那样,它使用逗号`,`来分隔存储值。此首字母缩写词也被用作文件扩展名。因此,请确保使用“.csv”扩展名(例如“myfile.csv”)保存文件。为了演示 CSV 文件的结构,安装 [`CSV.jl`](http://csv.juliadata.org/latest/) 包:
11 |
12 | ```
13 | julia> ]
14 |
15 | pkg> add CSV
16 | ```
17 |
18 | 并且通过以下方式导入:
19 |
20 | ```
21 | using CSV
22 | ```
23 |
24 | 现在可使用之前的数据:
25 |
26 | ```jl
27 | sco("
28 | grades_2020()
29 | "; process=without_caption_label)
30 | ```
31 |
32 | 并在写入后从文件中读取:
33 |
34 | ```jl
35 | @sc write_grades_csv()
36 | ```
37 |
38 | ```jl
39 | sco("""
40 | JDS.output_block_inside_tempdir() do # hide
41 | path = write_grades_csv()
42 | read(path, String)
43 | end # hide
44 | """)
45 | ```
46 |
47 | 上文还能看到 CSV 数据格式的第二个好处:可以使用简单的文本编辑器读取数据。
48 | 这与许多需要专有软件的其他数据格式不同,例如 Excel。
49 |
50 | 这很有效,但是如果我们的数据 **包含逗号 `,`** 作为值怎么办?
51 | 如果我们天真地用逗号写入数据,那么文件将很难转换回表格。
52 | 幸运的是,`CSV.jl` 会自动处理此问题。
53 | 考虑以下带逗号`,`的数据:
54 |
55 | ```jl
56 | @sco grades_with_commas()
57 | ```
58 |
59 | 如果写入文件,将得到:
60 |
61 | ```jl
62 | sco("""
63 | JDS.output_block_inside_tempdir() do # hide
64 | function write_comma_csv()
65 | path = "grades-commas.csv"
66 | CSV.write(path, grades_with_commas())
67 | end
68 | path = write_comma_csv()
69 | read(path, String)
70 | end # hide
71 | """)
72 | ```
73 |
74 | 因此,`CSV.jl` 在包含逗号的值周围添加引号 `"`。
75 | 解决此问题的另一种常见方法是将数据写入 **t**ab-**s**eparated **v**alues (TSV) 文件格式。
76 | 该格式假设数据不包含制表符,这一点在大多数情况下是成立的。
77 |
78 | 另请注意,也可以使用简单的文本编辑器读取 TSV 文件,这些文件使用“.tsv”扩展名。
79 |
80 | ```jl
81 | sco("""
82 | JDS.output_block_inside_tempdir() do # hide
83 | function write_comma_tsv()
84 | path = "grades-comma.tsv"
85 | CSV.write(path, grades_with_commas(); delim='\\t')
86 | end
87 | read(write_comma_tsv(), String)
88 | end # hide
89 | """)
90 | ```
91 |
92 | 像 CSV 和 TSV 这样的文本文件格式还可以使用其他分割符,例如分号“;”,空格“\ ”,甚至是像“π”这样不寻常的字符。
93 |
94 | ```jl
95 | sco("""
96 | JDS.output_block_inside_tempdir() do # hide
97 | function write_space_separated()
98 | path = "grades-space-separated.csv"
99 | CSV.write(path, grades_2020(); delim=' ')
100 | end
101 | read(write_space_separated(), String)
102 | end # hide
103 | """)
104 | ```
105 |
106 | 按照惯例,对于使用特殊分隔符(例如“;”)的文件,最好仍然使用“.csv”扩展名。
107 |
108 | 使用 `CSV.jl` 加载 CSV 文件的方式与此类似。
109 | 您可以使用 `CSV.read` 并指定您想要的输出格式。
110 | 这里指定为`DataFrame`。
111 |
112 | ```jl
113 | sco("""
114 | JDS.inside_tempdir() do # hide
115 | path = write_grades_csv()
116 | CSV.read(path, DataFrame)
117 | end # hide
118 | """; process=without_caption_label)
119 | ```
120 |
121 | 方便地,`CSV.jl`将自动推断列类型:
122 |
123 | ```jl
124 | sco("""
125 | JDS.inside_tempdir() do # hide
126 | path = write_grades_csv()
127 | df = CSV.read(path, DataFrame)
128 | end # hide
129 | """; process=string, post=output_block)
130 | ```
131 |
132 | 它甚至适用于更复杂的数据:
133 |
134 | ```jl
135 | sco("""
136 | JDS.inside_tempdir() do # hide
137 | my_data = \"\"\"
138 | a,b,c,d,e
139 | Kim,2018-02-03,3,4.0,2018-02-03T10:00
140 | \"\"\"
141 | path = "my_data.csv"
142 | write(path, my_data)
143 | df = CSV.read(path, DataFrame)
144 | end # hide
145 | """; process=string, post=output_block)
146 | ```
147 |
148 | 这些CSV基础应该涵盖大多数用例。
149 | 关于更多信息,请参阅 [`CSV.jl` 文档](https://csv.juliadata.org/stable),尤其是 [`CSV.File` 构造函数的 docstring](https://csv.juliadata.org/stable/#CSV.File)。
150 |
151 |
152 | ### Excel {#sec:excel}
153 |
154 | 多个 Julia 包可以读取 Excel 文件。
155 | 本节将只讨论 [`XLSX.jl`](https://github.com/felipenoris/XLSX.jl),因为它是 Julia 生态系统中处理 Excel 数据的最积极维护的包。
156 | 另外一个优点是,`XLSX.jl` 是用纯 Julia 编写的,这使得可以轻松地检查和理解指令背后发生的事情。
157 |
158 | 加载 `XLSX.jl` 的方式是
159 |
160 | ```
161 | using XLSX:
162 | eachtablerow,
163 | readxlsx,
164 | writetable
165 | ```
166 |
167 | 为了写入文件,我们为数据和列名定义一个辅助函数:
168 |
169 | ```jl
170 | @sc write_xlsx("", DataFrame())
171 | ```
172 |
173 | 现在,可以轻松地将成绩写入 Excel 文件:
174 |
175 | ```jl
176 | @sc write_grades_xlsx()
177 | ```
178 |
179 | 当成绩被读取回来时,我们将看到 `XLSX.jl` 将数据放在 `XLSXFile` 类型中,并且可以像访问 `Dict` 一样访问所需的 `sheet`:
180 |
181 | ```jl
182 | sco("""
183 | JDS.inside_tempdir() do # hide
184 | path = write_grades_xlsx()
185 | xf = readxlsx(path)
186 | end # hide
187 | """)
188 | ```
189 |
190 | ```jl
191 | s = """
192 | JDS.inside_tempdir() do # hide
193 | xf = readxlsx(write_grades_xlsx())
194 | sheet = xf["Sheet1"]
195 | eachtablerow(sheet) |> DataFrame
196 | end # hide
197 | """
198 | sco(s; process=without_caption_label)
199 | ```
200 |
201 | 请注意,本节只介绍了`XLSX.jl`的基础知识,但它还提供了更强大的用法和自定义功能。
202 | 有关更多信息和选项,请参阅 [`XLSX.jl` 文档](https://felipenoris.github.io/XLSX.jl/stable/)。
203 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_layouts.md:
--------------------------------------------------------------------------------
1 | ## 布局 {#sec:makie_layouts}
2 |
3 | 一个完整的 **画布/布局** 是由 `Figure` 定义的,创建后将在其中填充各种内容。
4 | 下面将以一个包含 `Axis`,`Legend` 和 `Colorbar` 的简单例子开始。
5 | 在这项任务中, 就像 `Array`/`Matrix` 那样,可以使用 `rows` 和 `columns` 索引 `Figure`。
6 | `Axis` 位于 **第 1 行,第 1 列**, 即为 `fig[1, 1]`。 `Colorbar` 位于 **第 1 行,第 2 列**, 即为 `fig[1, 2]`。
7 | 另外, `Legend` 位于 **第 2 行** 和 **第 1 - 2 列**, 即为 `fig[2, 1:2]`。
8 |
9 | ```jl
10 | @sco JDS.first_layout()
11 | ```
12 |
13 | 这看起来已经不错了,但能变得更好。可以使用以下关键字和方法来解决图的间距问题:
14 |
15 | - `figure_padding=(left, right, bottom, top)`
16 | - `padding=(left, right, bottom, top)`
17 |
18 | 改变 `Legend` 或 `Colorbar` 实际大小的方法为:
19 |
20 | > - `tellheight=true` or `false`
21 | > - `tellwidth=true` or `false`
22 | >
23 | > **将这些设置为 `true` 后则需考虑 `Legend` 或 `Colorbar` 的实际大小(高或宽)。**
24 | > 然后这些内容将会相应地调整大小。
25 |
26 | 可以使用以下方法指定行和列的间距:
27 |
28 | > - `colgap!(fig.layout, col, separation)`
29 | > - `rowgap!(fig.layout, row, separation)`
30 | >
31 | > **列间距** (`colgap!`),如果给定了 `col`,那么间距将只应用在指定的列。
32 | > **行间距** (`rowgap!`),如果给定了 `row`,那么间距将只应用在指定的行。
33 |
34 | 接下来将学习如何将内容放进 **突出部分(protrusion)**,即为 **标题 `x` 和 `y`,或 `ticks` 以及 `label`** 保留的空间。
35 | 实现方法是将位置索引改为 `fig[i, j, protrusion]`, 其中 _`protrusion`_ 可以是 `Left()`, `Right()`,`Bottom()` 和 `Top()`,或者是四个角 `TopLeft()`, `TopRight()`, `BottomRight()`,`BottomLeft()`。
36 | 这些选项将在如下的例子中使用:
37 |
38 | ```jl
39 | @sco JDS.first_layout_fixed()
40 | ```
41 |
42 | 这里在 `TopLeft()`添加标签 `(a)` 可能是不必要的, 因为标签仅在有两个以上的图时有意义。
43 | 在接下来的例子中,我们将继续使用之前的工具和一些新工具,并创建一个更丰富、更复杂的图。
44 |
45 | 可以使用以下函数隐藏图的装饰部分和轴线:
46 |
47 | > - `hidedecorations!(ax; kwargs...)`
48 | > - `hidexdecorations!(ax; kwargs...)`
49 | > - `hideydecorations!(ax; kwargs...)`
50 | > - `hidespines!(ax; kwargs...)`
51 |
52 | 应记住总是可以调用 `help` 查看能够传递的参数,例如:
53 |
54 | ```jl
55 | s = """
56 | help(hidespines!)
57 | """
58 | sco(s)
59 | ```
60 |
61 | 另外,对于 `hidedecorations!` 有:
62 |
63 | ```jl
64 | s = """
65 | help(hidedecorations!)
66 | """
67 | sco(s)
68 | ```
69 |
70 | 对于 **不想隐藏的** 元素,仅需要将它们的值设置为 `false`,即 `hideydecorations!(ax; ticks=false, grid=false)`。
71 |
72 |
73 | 同步 `Axis` 的方式如下:
74 |
75 | > - `linkaxes!`, `linkyaxes!` 和 `linkxaxes!`
76 | >
77 | > 这在需要共享轴时会变得很有用。
78 | > 另一种获得共享轴的方法是设置 `limits!`。
79 |
80 | 使用以下方式可一次性设定`limits`,当然也能单独为每个方向的轴单独设定:
81 |
82 | > - `limits!(ax; l, r, b, t)`,其中 `l` 为左侧, `r` 右侧,`b` 底部, 和 `t` 顶部。
83 | >
84 | > 还能使用 `ylims!(low, high)` 或 `xlims!(low, high)`,甚至可以通过 `ylims!(low=0)` 或 `xlims!(high=1)` 只设定一边。
85 |
86 | 例子如下:
87 |
88 | ```jl
89 | @sco JDS.complex_layout_double_axis()
90 | ```
91 |
92 | 如上所示, `Colorbar` 的方向已经变为水平且它的标签也处在其下方。
93 | 这是因为设定了 `vertical=false` 和 `flipaxis=false`。
94 | 另外,也可以将更多的 `Axis` 添加到 `fig` 里,甚至可以是 `Colorbar` 和 `Legend`,然后再构建布局。
95 |
96 | 另一种常见布局是热力图组成的正方网格:
97 |
98 | ```jl
99 | @sco JDS.squares_layout()
100 | ```
101 |
102 | 上图中每一个标签都位于 **突出部分**,并且每一个 `Axis` 的宽高比都设为 `DataAspect()`。
103 | 图中 `Colorbar` 位于第三列,并从第一行跨到第二行。
104 |
105 | 下例将使用称为 `Mixed()` 的**对齐模式**,这在处理 `Axis` 间的大量空白区域时很有用,而这些空白区域通常是由长标签导致的。
106 | 另外,本例还需要使用 Julia 标准库中的 `Dates` 。
107 |
108 | ```
109 | using Dates
110 | ```
111 |
112 | ```jl
113 | @sco JDS.mixed_mode_layout()
114 | ```
115 |
116 | 如上,参数 `alignmode=Mixed(bottom=0)` 将边界框移动到底部,使其与左侧面板保持对齐。
117 |
118 | 从上图也可以看到 `colsize!` 和 `rowsize!` 如何作用于不同的行和列。
119 | 可以向函数传递一个数字而不是 `Auto()`,但那会固定所有的设置。
120 | 另外,在定义 `Axis` 时也可以设定 `height` 或 `width`,例如 `Axis(fig, height=50)` 将会固定轴的高度。
121 |
122 | ### 嵌套 `Axis` (_subplots_)
123 |
124 | 精准定义一组 `Axis` (_subplots_) 也是可行的, 可以使用一组 `Axis` 构造具有多行多列的图。
125 | 例如,下面展示了一组较复杂的 `Axis`:
126 |
127 | ```jl
128 | @sc nested_sub_plot!(fig)
129 | ```
130 |
131 | 当通过多次调用它来构建更复杂的图时,可以得到:
132 |
133 | ```jl
134 | @sco JDS.main_figure()
135 | ```
136 |
137 | 注意,这里可以调用不同的子图函数。
138 | 另外,每一个 `Axis` 都是 `Figure` 的独立部分。
139 | 因此,当在进行 `rowgap!`或者 `colsize!` 这样的操作时,你需要考虑是对每一个子图单独作用还是对所有的图一起作用。
140 |
141 | 对于组合的 `Axis` (_subplots_) 可以使用 `GridLayout()`, 它能用来构造更复杂的 `Figure`。
142 |
143 | ### 嵌套网格布局
144 |
145 | 可以使用 `GridLayout()` 组合子图,这种方法能够更自由地构建更复杂的图。
146 | 这里再次使用之前的 `nested_sub_plot!`,它定义了三组子图和一个普通的 `Axis`:
147 |
148 | ```jl
149 | @sco JDS.nested_Grid_Layouts()
150 | ```
151 |
152 | 现在,对每一组使用 `rowgap!` 或 `colsize!` 将是可行的,并且 `rowsize!, colsize!` 也能够应用于 `GridLayout()`。
153 |
154 | ### 插图
155 |
156 | 目前,绘制 `inset` 是一项棘手的工作。
157 | 本节展示两种绘制插图的方法,二者都需要先定义辅助函数。
158 | 第一种是定义 `BBox`,它存在于整个 `Figure` 空间:
159 |
160 | ```jl
161 | @sc add_box_inset(fig)
162 | ```
163 |
164 | 然后可以按照如下方式轻松地绘制插图:
165 |
166 | ```jl
167 | @sco JDS.figure_box_inset()
168 | ```
169 |
170 | 其中 `Box` 的尺寸受到 `Figure`中 `resolution` 参数的约束。
171 | 注意,也可以在 `Axis` 外绘制插图。
172 | 另一种绘制插图的方法是,在位置`fig[i, j]`处定义一个新的 `Axis`,并且指定 `width`, `height`, `halign` 和 `valign`。
173 | 如下面的函数例子所示:
174 |
175 | ```jl
176 | @sc add_axis_inset()
177 | ```
178 |
179 | 在下面的例子中,如果总图的大小发生变化,那么将重新缩放灰色背景的 `Axis`。
180 | 同时 **插图** 要受到 `Axis` 位置的约束。
181 |
182 | ```jl
183 | @sco JDS.figure_axis_inset()
184 | ```
185 |
186 | 以上包含了 Makie 中布局选项的大多数常见用例。
187 | 现在,让我们接下来使用 `GLMakie.jl` 绘制一些漂亮的3D示例图。
188 |
--------------------------------------------------------------------------------
/contents/dataframes.md:
--------------------------------------------------------------------------------
1 | # DataFrames.jl {#sec:dataframes}
2 |
3 | 数据通常以表格格式存储。
4 | 在表格格式中,数据由包含行和列的表组成。
5 | 通常,同一列中的数据具有相同的类型,而同一行中的数据可以具有不同的类型。
6 | 实际上,行表示观测量,而列表示变量。
7 | 例如,我们有一个电视节目表,其中包含每个节目的制作国家和我们的个人评分,如 @tbl:TV_shows 所示。
8 |
9 | ```{=comment}
10 | Using a different example from the rest in the chapter to make the text a bit more interesting.
11 | We could even ask the reader to answer the queries described below as exercises.
12 | ```
13 |
14 | ```jl
15 | tv_shows = DataFrame(
16 | name=["Game of Thrones", "The Crown", "Friends", "..."],
17 | country=["United States", "England", "United States", "..."],
18 | rating=[8.2, 7.3, 7.8, "..."]
19 | )
20 | Options(tv_shows; label="TV_shows")
21 | ```
22 |
23 | 此处的省略号表示这是一张非常长的表,但只显示了少数行。
24 | 在分析数据时,我们经常会提出一些关于数据的有趣问题,这也称为 **数据查询**。
25 | 对于大型表格,计算机能够比手工查询更快地回答此类问题。
26 | 一些 **数据查询** 问题的例子如下:
27 |
28 | - 哪个电视节目评分最高?
29 | - 哪些电视节目由美国制作?
30 | - 哪些电视节目由相同的国家制作?
31 |
32 | 但是,作为研究人员,实际的科学往往从多张表格或多个数据源开始。
33 | 例如,如果我们也有其他人的电视节目评分数据 (@tbl:ratings):
34 |
35 | ```jl
36 | ratings = DataFrame(
37 | name=["Game of Thrones", "Friends", "..."],
38 | rating=[7, 6.4, "..."])
39 | Options(ratings; label="ratings")
40 | ```
41 |
42 | 现在则能够提出以下问题:
43 |
44 | - 节目 Game of Thrones 的平均评分是多少?
45 | - 谁对 Friends 给出了最高的评分?
46 | - 哪些节目你评分了,但其他人没有?
47 |
48 | 在本章的其余部分中,我们将展示如何借助 Julia 来轻松地回答这些问题。
49 | 因此,首先说明为什么需要 Julia 包 `DataFrames.jl`。
50 | 下节将展示如何使用此包,最后将展示如何编写快速数据变换的代码 (@sec:df_performance)。
51 |
52 | ```{=comment}
53 | TODO: Add a comparison with Excel to see where Julia is better.
54 | In summary, because it is much easier to structure and reproduce the logic.
55 | (Jose approves)
56 | ```
57 |
58 | 首先查看如下的成绩表 @tbl:grades_for_2020 :
59 |
60 | ```jl
61 | JDS.grades_for_2020()
62 | ```
63 |
64 | 其中 name 列的类型为 `string`, age 列的类型为 `integer`,而 grade 列的类型为 `float`。
65 |
66 | 截至目前,本书只介绍了 Julia 的基础知识。
67 | 这些基础能够处理很多东西,但不能处理表。
68 | 因此,为了说明我们需要更多类型,让我们尝试将表格数据存储在数组中:
69 |
70 | ```jl
71 | @sc JDS.grades_array()
72 | ```
73 |
74 | 现在,数据以列优先形式存储,当想从行获取数据时,这种形式很麻烦:
75 |
76 | ```jl
77 | @sco JDS.second_row()
78 | ```
79 |
80 | 或者,如果想获得 Alice 的成绩,首先需要弄清楚 Alice 所在的行:
81 |
82 | ```jl
83 | scob("""
84 | function row_alice()
85 | names = grades_array().name
86 | i = findfirst(names .== "Alice")
87 | end
88 | row_alice()
89 | """)
90 | ```
91 |
92 | 然后才能得到成绩:
93 |
94 | ```jl
95 | scob("""
96 | function value_alice()
97 | grades = grades_array().grade_2020
98 | i = row_alice()
99 | grades[i]
100 | end
101 | value_alice()
102 | """)
103 | ```
104 |
105 | `DataFrames.jl` 可以很容易地处理此类问题。
106 | 首先使用 `using` 加载 `DataFrames.jl` :
107 |
108 | ```
109 | using DataFrames
110 | ```
111 |
112 | 通过 `DataFrames.jl`,我们可以定义 `DataFrame` 来存储表格数据:
113 |
114 | ```jl
115 | sco("""
116 | names = ["Sally", "Bob", "Alice", "Hank"]
117 | grades = [1, 5, 8.5, 4]
118 | df = DataFrame(; name=names, grade_2020=grades)
119 | without_caption_label(df) # hide
120 | """)
121 | ```
122 |
123 | 即此处返回的变量 `df` 以表格格式存储数据。
124 |
125 | ```{=comment}
126 | Although this section is a duplicate of earlier chapters, I do think it might be a good idea to keep the duplicate.
127 | According to MIT instructor Patrick Winston (https://youtu.be/Unzc731iCUY), convincing someone of something means repeating it a few times.
128 | With this section, people who already understand it, understand it a bit better and people who didn't understand it yet might understand it here.
129 | ```
130 |
131 | > **_NOTE:_**
132 | > 这是可行的,但我们需要立即改变一件事。
133 | > 在本例中,我们在全局作用域定义了变量 `names`、`grades` 和 `df`。
134 | > 这意味着可以从任何位置访问和修改这些变量。
135 | > 如果我们继续像这样写这本书,那么我们会在书结尾时拥有上百个变量,即使变量 `names` 中的数据本应只能通过 `DataFrame` 访问!
136 | > 变量 `names` 和 `grades` 不应该持久地保存!
137 | > 现在,想象一下,我们将会在本书中多次修改 `grades`。
138 | > 如果本书只有 PDF 格式, 那么几乎不可能在最后指出变量的内容。
139 | >
140 | > 可以使用函数轻松地解决此类问题。
141 |
142 | 让我们使用函数完成同样的操作:
143 |
144 | ```jl
145 | @sco grades_2020()
146 | ```
147 |
148 | 注意, `name` 和 `grade_2020` 会在函数返回后销毁,即它们仅在函数中可用。
149 | 这样做还有两个好处。
150 | 首先,读者可以清晰地看到 `name` 和 `grade_2020` 由谁所有:它们属于 2020 成绩表。
151 | 其次,很容易在书中的任何地方确定 `grades_2020()` 的输出。
152 | 例如,可以将数据赋给变量 `df`:
153 |
154 | ```jl
155 | sco("""
156 | df = grades_2020()
157 | """; process=without_caption_label)
158 | ```
159 |
160 | 改变 `df` 的内容:
161 |
162 | ```jl
163 | sco("""
164 | df = DataFrame(name = ["Malice"], grade_2020 = ["10"])
165 | """; process=without_caption_label)
166 | ```
167 |
168 | 而且仍然能够无损恢复数据:
169 |
170 | ```jl
171 | sco("""
172 | df = grades_2020()
173 | """; process=without_caption_label)
174 | ```
175 |
176 | 当然,此处假设没有重新定义函数。
177 | 我们在本书中保证不会这样做,因为这是非常糟糕的做法。
178 | 我们不会 “改变” 函数,而是创建一个具有明确名称的新函数。
179 |
180 | 因此,回到 `DataFrame` 构造器。
181 | 如你所见,创建方法是将向量作为参数传递给 `DataFrame` 构造器。
182 | 你可以给定任何合法的 Julia 向量,并且 **只要向量长度相同**,就能成功构造 `DataFrame`。
183 | 重复的向量、Unicode 符号和任何类型的数字都可以:
184 |
185 | ```jl
186 | sco("""
187 | DataFrame(σ = ["a", "a", "a"], δ = [π, π/2, π/3])
188 | """; process=without_caption_label)
189 | ```
190 |
191 | 通常,您在代码中会创建函数来包装一个或多个作用于 `DataFrame` 的函数。
192 | 例如,可以创建函数来获取一个或多个 `names` 的成绩:
193 |
194 | ```jl
195 | @sco process=without_caption_label JDS.grades_2020([3, 4])
196 | ```
197 |
198 | 使用函数来包装基本功能的这种方式,在编程语言和包中非常常见。
199 | 基本上,你可以把 Julia 和 `DataFrames.jl` 看作基本模块的提供者。
200 | 它们提供了相当 **通用的** 模块,从而你可以在此基础之上实现一些 **特例** ,比如这个成绩例子。
201 | 借助这些基本模块,你可以编写数据分析脚本,控制机器人或任何你想要构造的东西。
202 |
203 | 截至目前,由于必须使用索引,这些例子都非常麻烦。
204 | 下节将介绍如何在 `DataFrames.jl` 中加载和保存数据,以及其它一些强大的基本模块。
205 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_colors.md:
--------------------------------------------------------------------------------
1 | ## 颜色和颜色图(Colormap){#sec:makie_colors}
2 |
3 | 在展示结果时,其中重要的一步是为图选择一组合适的颜色或 colorbar。
4 | `Makie.jl` 支持使用 [Colors.jl](https://github.com/JuliaGraphics/Colors.jl) ,因此你可以使用 [named colors](https://juliagraphics.github.io/Colors.jl/latest/namedcolors/) 而不是传递 `RGB` 或 `RGBA` 值。
5 | 另外,也可以使用 [ColorSchemes.jl](https://github.com/JuliaGraphics/ColorSchemes.jl) 和 [PerceptualColourMaps.jl](https://github.com/peterkovesi/PerceptualColourMaps.jl) 中的颜色图。
6 | 值得了解的是,可以使用 `Reverse(:colormap_name)` 反转颜色图,也可以通过 `color=(:red,0.5)` 和 `colormap=(:viridis, 0.5)` 获得透明的颜色或颜色图。
7 |
8 | 下文介绍不同的用例。 接下来使用新的颜色和颜色栏(Colorbar)调色盘来创建自定义主题。
9 |
10 | 默认情况下, `Makie.jl` 已经预定义一组颜色,可以循环使用该组颜色。
11 | 之前的图因此并未设置任何特定颜色。
12 | 覆盖这些默认颜色的方法是,在绘图函数中调用 `color` 关键字并使用 `Symbol` 或 `String` 指定新的颜色。
13 | 该操作如下所示:
14 |
15 | ```jl
16 | @sco JDS.set_colors_and_cycle()
17 | ```
18 |
19 | 这里通过`color` 关键字指定了上例前两条曲线的颜色。
20 | 其余使用默认的颜色集。
21 | 稍后将学习如何使用自定义颜色循环。
22 |
23 | 关于颜色图,我们已经非常熟悉用于热力图和散点图的 `colormap`。下面展示的是,颜色图也可以像颜色那样通过 `Symbol` 或 `String` 进行指定。
24 | 此外,也可以是 `RGB` 颜色的向量。
25 | 下面是第一个例子,通过 `Symbol`, `String` 和分类值的 `cgrad` 来指定颜色图。
26 | 输入 `?cgrad` 查看更多信息。
27 |
28 | ```jl
29 | scolor = """
30 | CairoMakie.activate!() # hide
31 | figure = (; resolution=(600, 400), font="CMU Serif")
32 | axis = (; xlabel=L"x", ylabel=L"y", aspect=DataAspect())
33 | fig, ax, pltobj = heatmap(rand(20, 20); colorrange=(0, 1),
34 | colormap=Reverse(:viridis), axis=axis, figure=figure)
35 | Colorbar(fig[1, 2], pltobj, label = "Reverse colormap Sequential")
36 | fig
37 | label = "Reverse_colormap_sequential" # hide
38 | caption = "Reverse colormap sequential and colorrange." # hide
39 | link_attributes = "width=60%" # hide
40 | Options(fig; filename=label, label, caption, link_attributes) # hide
41 | """
42 | sco(scolor)
43 | ```
44 |
45 | 当设置 `colorrange` 后,超出此范围的颜色值会被相应地设置为颜色图的第一种和最后一种颜色。
46 | 但是,有时最好自行指定两端的颜色。这可以通过 `highclip` 和 `lowclip` 实现:
47 |
48 | ```
49 | using ColorSchemes
50 | ```
51 |
52 | ```jl
53 | s = """
54 | CairoMakie.activate!() # hide
55 | figure = (; resolution=(600, 400), font="CMU Serif")
56 | axis = (; xlabel=L"x", ylabel=L"y", aspect=DataAspect())
57 | fig, ax, pltobj=heatmap(randn(20, 20); colorrange=(-2, 2),
58 | colormap="diverging_rainbow_bgymr_45_85_c67_n256",
59 | highclip=:black, lowclip=:white, axis=axis, figure=figure)
60 | Colorbar(fig[1, 2], pltobj, label = "Diverging colormap")
61 | fig
62 | label = "diverging_colormap" # hide
63 | caption = "Diverging Colormap with low and high clip." # hide
64 | link_attributes = "width=60%" # hide
65 | Options(fig; filename=label, label, caption, link_attributes) # hide
66 | """
67 | sco(s)
68 | ```
69 |
70 | 另外 `RGB` 向量也是合法的选项。
71 | 在下面的例子中,你可以直接传递自定义颜色图本身(_per se_),或使用 `cgrad` 来创建分类值的 `Colorbar`。
72 |
73 | ```
74 | using Colors, ColorSchemes
75 | ```
76 |
77 | ```jl
78 | scat = """
79 | CairoMakie.activate!() # hide
80 | figure = (; resolution=(600, 400), font="CMU Serif")
81 | axis = (; xlabel=L"x", ylabel=L"y", aspect=DataAspect())
82 | cmap = ColorScheme(range(colorant"red", colorant"green", length=3))
83 | mygrays = ColorScheme([RGB{Float64}(i, i, i) for i in [0.0, 0.5, 1.0]])
84 | fig, ax, pltobj = heatmap(rand(-1:1, 20, 20);
85 | colormap=cgrad(mygrays, 3, categorical=true, rev=true), # cgrad and Symbol, mygrays,
86 | axis=axis, figure=figure)
87 | cbar = Colorbar(fig[1, 2], pltobj, label="Categories")
88 | cbar.ticks = ([-0.66, 0, 0.66], ["-1", "0", "1"])
89 | fig
90 | label = "categorical_colormap" # hide
91 | caption = "Categorical Colormap." # hide
92 | link_attributes = "width=60%" # hide
93 | Options(fig; filename=label, label, caption, link_attributes) # hide
94 | """
95 | sco(scat)
96 | ```
97 |
98 | 最后,分类值的颜色栏标签默认不在每种颜色间居中。
99 | 添加自定义标签可修复此问题,即 `cbar.ticks = (positions, ticks)`。
100 | 最后一种情况是传递颜色的元组给 `colormap`,其中颜色可以通过 `Symbol`, `String` 或它们的混合指定。
101 | 然后将会得到这两组颜色间的插值颜色图。
102 |
103 | 另外,也支持十六进制编码的颜色作为输入。因此作为示范,下例将在热力图上放置一个半透明的标记。
104 |
105 | ```jl
106 | s2color2 = """
107 | CairoMakie.activate!() # hide
108 | figure = (; resolution=(600, 400), font="CMU Serif")
109 | axis = (; xlabel=L"x", ylabel=L"y", aspect=DataAspect())
110 | fig, ax, pltobj = heatmap(rand(20, 20); colorrange=(0, 1),
111 | colormap=(:red, "black"), axis=axis, figure=figure)
112 | scatter!(ax, [11], [11], color=("#C0C0C0", 0.5), markersize=150)
113 | Colorbar(fig[1, 2], pltobj, label="2 colors")
114 | fig
115 | label = "colormap_two_colors" # hide
116 | caption = "Colormap from two colors." # hide
117 | link_attributes = "width=60%" # hide
118 | Options(fig; filename=label, label, caption, link_attributes) # hide
119 | """
120 | sco(s2color2)
121 | ```
122 |
123 | ### 自定义颜色循环
124 |
125 | 可以通过新的颜色循环定义全局 `Theme` ,但通常 **不建议** 这样做。
126 | 更好的做法是定义新的主题并像上节那样使用它。
127 | 定义带有 `:color`, `:linestyle`, `:marker` 属性的新 `cycle` 和默认的 `colormap` 。
128 | 下面为之前的 `publication_theme` 增加一些新的属性。
129 |
130 | ```jl
131 | @sc new_cycle_theme()
132 | ```
133 |
134 | 然后将它应用到绘图函数中,如下所示:
135 |
136 | ```jl
137 | @sc scatters_and_lines()
138 | ```
139 |
140 | ```jl
141 | s = """
142 | CairoMakie.activate!() # hide
143 | with_theme(scatters_and_lines, new_cycle_theme())
144 | label = "custom_cycle" # hide
145 | caption = "Custom theme with new cycle and colormap." # hide
146 | link_attributes = "width=60%" # hide
147 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
148 | """
149 | sco(s)
150 | ```
151 |
152 | 此时,通过颜色,曲线样式,标记和颜色图,你已经能够 **完全控制** 绘图结果。
153 | 下一部分将讨论如何管理和控制 **布局**。
154 |
--------------------------------------------------------------------------------
/contents/data_vis_makie_attributes.md:
--------------------------------------------------------------------------------
1 | ## 属性 {#sec:datavisMakie_attributes}
2 |
3 | 使用 `attributes` 可以创建自定义的图。
4 | 设置属性可以使用多个关键字参数。
5 | 每个 plot 对象的 `attributes` 列表可以通过以下方式查看:
6 |
7 | ```jl
8 | s = """
9 | CairoMakie.activate!() # hide
10 | fig, ax, pltobj = scatterlines(1:10)
11 | pltobj.attributes
12 | """
13 | sco(s)
14 | ```
15 |
16 | 或者调用 `pltobj.attributes.attributes` 返回对象属性的 `Dict`。
17 |
18 | 对于任一给定的绘图函数,都能在 `REPL` 中以 `?lines` 或 `help(lines)` 的形式获取帮助。Julia将输出该函数的相应属性,并简要说明如何使用该函数。
19 | 关于 `lines` 的例子如下:
20 |
21 | ```jl
22 | s = """
23 | help(lines)
24 | """
25 | sco(s)
26 | ```
27 |
28 | 不仅 plot 对象有属性,`Axis` 和 `Figure` 对象也有属性。
29 | 例如,Figure 的属性有 `backgroundcolor`,`resolution`,`font` 和 `fontsize` 以及 `figure_padding`。 其中 `figure_padding` 改变了图像周围的空白区域,如图 (@fig:custom_plot) 中的灰色区域所示。
30 | 它可以使用一个数字指定所有边的留白,或使用四个数的元组依次指定左、右、下、上四边的留白。
31 |
32 | `Axis` 同样有一系列属性,典型的有 `backgroundcolor`, `xgridcolor` 和 `title`。
33 | 使用 `help(Axis)` 可查看所有属性。
34 |
35 | 在接下来这张图里,我们将设置一些属性:
36 |
37 | ```jl
38 | s = """
39 | CairoMakie.activate!() # hide
40 | lines(1:10, (1:10).^2; color=:black, linewidth=2, linestyle=:dash,
41 | figure=(; figure_padding=5, resolution=(600, 400), font="sans",
42 | backgroundcolor=:grey90, fontsize=16),
43 | axis=(; xlabel="x", ylabel="x²", title="title",
44 | xgridstyle=:dash, ygridstyle=:dash))
45 | current_figure()
46 | filename = "custom_plot" # hide
47 | link_attributes = "width=60%" # hide
48 | caption = "Custom plot." # hide
49 | Options(current_figure(); filename, caption, label=filename, link_attributes) # hide
50 | """
51 | sco(s)
52 | ```
53 |
54 | 此例已经包含了大多数用户经常会用到的属性。
55 | 或许在图上加一个 `legend` 会更好,这在有多条曲线时尤为有意义。
56 | 所以,向图上 `append` 另一个 `plot object` 并且通过调用 `axislegend` 添加对应的图例。
57 | 它将收集所有 plot 函数中的 `labels`, 并且图例默认位于图的右上角。
58 | 本例调用了 `position=:ct` 参数,其中 `:ct` 表示图例将位于 `center`和 `top`, 如图 @fig:custom_plot_leg 所示:
59 |
60 | ```jl
61 | s = """
62 | CairoMakie.activate!() # hide
63 | lines(1:10, (1:10).^2; label="x²", linewidth=2, linestyle=nothing,
64 | figure=(; figure_padding=5, resolution=(600, 400), font="sans",
65 | backgroundcolor=:grey90, fontsize=16),
66 | axis=(; xlabel="x", title="title", xgridstyle=:dash,
67 | ygridstyle=:dash))
68 | scatterlines!(1:10, (10:-1:1).^2; label="Reverse(x)²")
69 | axislegend("legend"; position=:ct)
70 | current_figure()
71 | label = "custom_plot_leg" # hide
72 | link_attributes = "width=60%" # hide
73 | caption = "Custom plot legend." # hide
74 | Options(current_figure(); label, filename=label, caption, link_attributes) # hide
75 | """
76 | sco(s)
77 | ```
78 |
79 | 通过组合 `left(l), center(c), right(r)` 和 `bottom(b), center(c), top(t)` 还可以再指定其他位置。
80 | 例如,使用`:lt` 指定为左上角。
81 |
82 | 然而,仅仅为两条曲线编写这么多代码是比较复杂的。
83 | 所以,如果要以相同的样式绘制一组曲线,那么最好指定一个主题。
84 | 使用 `set_theme!()` 可实现该操作,如下所示。
85 |
86 | 使用 `set_theme!(kwargs)`定义的新配置,重新绘制之前的图:
87 |
88 | ```jl
89 | s = """
90 | CairoMakie.activate!() # hide
91 | set_theme!(; resolution=(600, 400),
92 | backgroundcolor=(:orange, 0.5), fontsize=16, font="sans",
93 | Axis=(backgroundcolor=:grey90, xgridstyle=:dash, ygridstyle=:dash),
94 | Legend=(bgcolor=(:red, 0.2), framecolor=:dodgerblue))
95 | lines(1:10, (1:10).^2; label="x²", linewidth=2, linestyle=nothing,
96 | axis=(; xlabel="x", title="title"))
97 | scatterlines!(1:10, (10:-1:1).^2; label="Reverse(x)²")
98 | axislegend("legend"; position=:ct)
99 | current_figure()
100 | set_theme!()
101 | label = "setTheme" # hide
102 | link_attributes = "width=60%" # hide
103 | caption = "Set theme example."
104 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
105 | """
106 | sco(s)
107 | ```
108 |
109 | 倒数第二行的 `set_theme!()` 会将主题重置到 Makie 的默认设置。
110 | 有关 `themes` 的更多内容请转到 @sec:themes。
111 |
112 | 在进入下节前, 值得先看一个例子:将多个参数所组成的 `array` 传递给绘图函数来配置属性。
113 | 例如,使用 `scatter` 绘图函数绘制气泡图。
114 |
115 | 本例随机生成 100 行 3 列的 `array` ,这些数据满足正态分布。
116 | 其中第一列表示 `x` 轴上的位置,第二列表示 `y` 轴上的位置,第三列表示与每一点关联的属性值。
117 | 例如可以用来指定不同的 `color` 或者不同的标记大小。气泡图就可以实现相同的操作。
118 |
119 | ```jl
120 | s = """
121 | using Random: seed!
122 | seed!(28)
123 | xyvals = randn(100, 3)
124 | xyvals[1:5, :]
125 | """
126 | sco(s)
127 | ```
128 |
129 | 对应的图 @fig:bubble 如下所示:
130 |
131 | ```jl
132 | s = """
133 | CairoMakie.activate!() # hide
134 | fig, ax, pltobj = scatter(xyvals[:, 1], xyvals[:, 2]; color=xyvals[:, 3],
135 | label="Bubbles", colormap=:plasma, markersize=15 * abs.(xyvals[:, 3]),
136 | figure=(; resolution=(600, 400)), axis=(; aspect=DataAspect()))
137 | limits!(-3, 3, -3, 3)
138 | Legend(fig[1, 2], ax, valign=:top)
139 | Colorbar(fig[1, 2], pltobj, height=Relative(3 / 4))
140 | fig
141 | label = "bubble" # hide
142 | link_attributes = "width=60%" # hide
143 | caption = "Bubble plot."
144 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
145 | """
146 | sco(s)
147 | ```
148 |
149 | 为了在图上添加 `Legend` 和 `Colorbar`,需将 `FigureAxisPlot` 元组分解为 `fig, ax, pltobj`。
150 | 我们将在 @sec:makie_layouts 讨论有关布局选项的更多细节。
151 |
152 | 通过一些基本且有趣的例子,我们展示了如何使用`Makie.jl`,现在你可能想知道:还能做什么?
153 | `Makie.jl` 都还有哪些绘图函数?
154 | 为了回答此问题,我们制作了一个 _cheat sheet_ 如 @fig:cheat_sheet_cairomakie 所示。
155 | 使用 `CairoMakie.jl` 后端可以轻松绘制这些图。
156 |
157 | {#fig:cheat_sheet_cairomakie}
158 |
159 | @fig:cheat_sheet_glmakie 展示了 `GLMakie.jl` 的 _cheat sheet_,这些函数支持绘制大多数 3D 图。
160 | 这些将在后面的 `GLMakie.jl` 节进一步讨论。
161 |
162 | {#fig:cheat_sheet_glmakie}
163 |
164 | 现在,我们已经大致了解到能做什么。接下来应该掉过头来继续研究基础知识。
165 | 是时候学习如何改变图的整体外观了。
166 |
--------------------------------------------------------------------------------
/contents/dataframes_indexing.md:
--------------------------------------------------------------------------------
1 | ## Index 和 Summarize {#sec:index_summarize}
2 |
3 | 回顾之前定义的 `grades_2020()` 数据集:
4 |
5 | ```jl
6 | sco("grades_2020()"; process=without_caption_label)
7 | ```
8 |
9 | 可以通过 `.` 语法提取 `DataFrame` 中的 `name` 列向量,正如之前 @sec:julia_basics 中 `struct` 的操作那般:
10 |
11 | ```jl
12 | @sco JDS.names_grades1()
13 | ```
14 |
15 | 或者,可以像 `Array` 那样通过 `Symbol` 或特殊字符索引 `DataFrame` 。
16 | **第二个索引是列索引**:
17 |
18 | ```jl
19 | @sco JDS.names_grades2()
20 | ```
21 |
22 | 注意, `df.name` 与 `df[!, :name]` 完全相同, 这可以自行验证:
23 |
24 | ```
25 | julia> df = DataFrame(id=[1]);
26 |
27 | julia> @edit df.name
28 | ```
29 |
30 | 这两种写法都会得到 `:name` 列。
31 | 同样,也存在 `df[:, :name]` 这样的语法,不过它复制了 `:name` 列。
32 | 大多数情况下, `df[!, :name]` 是最佳的做法,因为它更通用,而且没有内存拷贝,对其的所有操作都是 in-place 的。
33 |
34 |
35 | 对于任意 **行**, 例如第二行, 可以使用 **第一个索引作为行索引**:
36 |
37 | ```jl
38 | s = """
39 | df = grades_2020()
40 | df[2, :]
41 | df = DataFrame(df[2, :]) # hide
42 | """
43 | sco(s; process=without_caption_label)
44 | ```
45 |
46 | 或者创建函数来获取某一行 `i`:
47 |
48 | ```jl
49 | @sco process=without_caption_label JDS.grade_2020(2)
50 | ```
51 |
52 | 还可以使用 **切片** (与 `Array` 类似)来仅获取 `names` 列的前两行:
53 |
54 | ```jl
55 | @sco JDS.grades_indexing(grades_2020())
56 | ```
57 |
58 | 如果假设表中的每个名字是唯一的,那么可以编写一个函数来通过 `name` 获取每个人的成绩。
59 | 要实现此操作,需将上表转换为一种 Julia 基本数据结构,即可以实现映射的 `Dict`:
60 |
61 | ```jl
62 | @sco post=output_block grade_2020("Bob")
63 | ```
64 |
65 | 这是可行的,因为 `zip` 会同时遍历 `df.name` 和 `df.grade_2020`,就像 “拉链” 那样:
66 |
67 | ```jl
68 | sco("""
69 | df = grades_2020()
70 | collect(zip(df.name, df.grade_2020))
71 | """)
72 | ```
73 |
74 | 然而, `DataFrame` 转 `Dict` 操作仅在元素唯一的情况下可行。
75 | 一般情况下,上述条件并不成立,所以需要学习如何对 `DataFrame` 进行 `filter` 操作。
76 |
77 | ## Filter 和 Subset {#sec:filter_subset}
78 |
79 | 有两种方式可以选取 `DataFrame` 中的某些行, 一种是 `filter` (@sec:filter) 而另一种是 `subset` (@sec:subset)。
80 |
81 | `DataFrames.jl` 较早地添加了 `filter` 函数, 它更强大且与 Julia `Base` 库的语法保持一致,因此我们先讨论 `filter`。
82 | `subset` 是较新的函数,但它通常更简便。
83 |
84 | ### Filter {#sec:filter}
85 |
86 | 由此开始,接下来将讨论 `DataFrames.jl` 中非常强大的特性。
87 | 在讨论伊始,首先学习一些函数,例如 `select` 和 `filter`。
88 | 但请不要担心!
89 | 可以先松一口气,因为 **`DataFrames.jl` 的总体设计目标就是让用户需学习的函数保持在最低限度[^verbs]**。
90 |
91 | [^verbs]: 这来自于 Bogumił Kamiński (`DataFrames.jl` 的首席开发者和维护者) 在 Discourse () 论坛上的发言。
92 |
93 | 与之前一样,从 `grades_2020` 开始:
94 |
95 | ```jl
96 | sco("grades_2020()"; process=without_caption_label)
97 | ```
98 |
99 | 可以使用 `filter(source => f::Function, df)` 筛选行。
100 | 注意,这个函数与 Julia `Base` 模块中的 `filter(f::Function, V::Vector)` 函数非常相似。
101 | 这是因为 `DataFrames.jl` 使用**多重派发** (参见 @sec:multiple_dispatch) 扩展了 `filter`,以使其能够接收 `DataFrame` 作为参数。
102 |
103 | 从第一印象来看,实际中定义和使用函数 `f` 可能有些困难。
104 | 但请坚持学习,我们的努力会有超高的回报,因为 **这是非常强大的数据筛选方法**。
105 | 如下是一个简单的例子, 创建函数 `equals_alice` 来检查输入是否等于 "Alice":
106 |
107 | ```jl
108 | @sco post=output_block JDS.equals_alice("Bob")
109 | ```
110 |
111 | ```jl
112 | sco("equals_alice(\"Alice\")"; post=output_block)
113 | ```
114 |
115 | 结合该函数,可以将其作为函数 `f` 来筛选出所有 `name` 等于 "Alice" 的行:
116 |
117 | ```jl
118 | s = "filter(:name => equals_alice, grades_2020())"
119 | sco(s; process=without_caption_label)
120 | ```
121 |
122 | 注意这不仅适用于 `DataFrame`,也适用于向量:
123 |
124 | ```jl
125 | s = """filter(equals_alice, ["Alice", "Bob", "Dave"])"""
126 | sco(s)
127 | ```
128 |
129 | 还可以使用 **匿名函数** 缩短代码长度 (请查阅 @sec:function_anonymous):
130 |
131 | ```jl
132 | s = """filter(n -> n == "Alice", ["Alice", "Bob", "Dave"])"""
133 | sco(s)
134 | ```
135 |
136 | 它也可用于 `grades_2020`:
137 |
138 | ```jl
139 | s = """filter(:name => n -> n == "Alice", grades_2020())"""
140 | sco(s; process=without_caption_label)
141 | ```
142 |
143 | 简单来说,上述函数可以理解为 “遍历 `:name` 列的所有元素,对每一个元素 `n`,检查 `n` 是否等于 Alice”。
144 | 可能对于某些人来说,这样的代码些许冗长。
145 | 幸运的是,Julia 已经扩展了 `==` 的**偏函数应用(partial function application)** (译注:指定部分参数的函数)。
146 | 其中的细节不重要 -- 只需知道能像其他函数一样使用 `==`:
147 |
148 | ```jl
149 | sco("""
150 | s = "This is here to workaround a bug in books" # hide
151 | filter(:name => ==("Alice"), grades_2020())
152 | """; process=without_caption_label)
153 | ```
154 |
155 | ### Subset {#sec:subset}
156 |
157 | `subset` 函数的加入使得处理 `missing` 值 (@sec:missing_data) 更加容易。
158 | 与 `filter` 相反, `subset` 对整列进行操作,而不是整行或者单个值。
159 | 如果想使用之前的函数,可以将其包装在 `ByRow` 里:
160 |
161 | ```jl
162 | s = "subset(grades_2020(), :name => ByRow(equals_alice))"
163 | sco(s; process=without_caption_label)
164 | ```
165 |
166 | 另请注意, `DataFrame` 是 `subset(df, args...)` 的第一个参数,而对于 `filter` 来说是第二个参数,即 `filter(f, df)`。
167 | 这是因为, Julia 定义 `filter` 的方式为 `filter(f, V::Vector)`,而 `DataFrames.jl` 在使用多重派发将其扩展到 `DataFrame` 类型时,选择与现有函数形式保持一致。
168 |
169 | > **_NOTE:_**
170 | > `subset` 所属的大多数原生 `DataFrames.jl` 函数都保持着一致的函数签名,即 **将 `DataFrame` 作为第一个参数**。
171 |
172 | 与 `filter` 一样,可以在 `subset` 中使用匿名函数:
173 |
174 | ```jl
175 | s = "subset(grades_2020(), :name => ByRow(name -> name == \"Alice\"))"
176 | sco(s; process=without_caption_label)
177 | ```
178 |
179 | 或者使用 `==` 的偏函数应用:
180 |
181 | ```jl
182 | s = "subset(grades_2020(), :name => ByRow(==(\"Alice\")))"
183 | sco(s; process=without_caption_label)
184 | ```
185 |
186 | 最后展示 `subset` 的真正用处。
187 | 首先,创建一个含有 `missing` 值的数据集:
188 |
189 | ```jl
190 | @sco salaries()
191 | ```
192 |
193 | 这是一种合理的情况:你想算出同事们的工资,但还没算 Zed 的。
194 | 尽管我们不鼓励这么做,但这是一个有趣的例子。
195 | 假设我们想知道谁的工资超过了 2000。
196 | 如果使用 `filter`, 但未考虑 `missing`值,则会失败:
197 |
198 | ```jl
199 | s = "filter(:salary => >(2_000), salaries())"
200 | sce(s, post=trim_last_n_lines(25))
201 | ```
202 |
203 | `subset` 同样会失败,但幸运的是,报错指出一则简单的解决方案:
204 |
205 | ```jl
206 | s = "subset(salaries(), :salary => ByRow(>(2_000)))"
207 | sce(s, post=trim_last_n_lines(25))
208 | ```
209 |
210 | 所以仅需要传递关键字参数 `skipmissing=true`:
211 |
212 | ```jl
213 | s = "subset(salaries(), :salary => ByRow(>(2_000)); skipmissing=true)"
214 | sco(s; process=without_caption_label)
215 | ```
216 |
217 | ```{=comment}
218 | Rik, we need a example of both filter and subset with multiple conditions, as in:
219 |
220 | `filter(row -> row.col1 >= something1 && row.col2 <= something2, df)`
221 |
222 | and:
223 |
224 | `subset(df, :col1 => ByRow(>=(something1)), :col2 => ByRow(<=(something2)))`
225 | ```
226 |
--------------------------------------------------------------------------------
/contents/dataframes_performance.md:
--------------------------------------------------------------------------------
1 | ## 性能 {#sec:df_performance}
2 |
3 | 截至目前,我们还没有尝试让 `DataFrames.jl` 代码变得 **快些**。
4 | 就像 Julia 中的一切, `DataFrames.jl` 实际上也可以变得非常快。
5 | 本节将给出一些性能建议和技巧。
6 |
7 | ### In-place 的操作 {#sec:df_performance_inplace}
8 |
9 | 如在 @sec:function_bang 讨论的那样,如果函数结尾带有叹号 `!`,那么这表明该函数会更改传入的参数。
10 | 在 Julia 高性能代码的语境中,这**表明** 带有 `!`的函数将会原地(in-place)修改我们传入的参数对象。
11 |
12 | 几乎所有的 `DataFrames.jl` 函数都有一个带 `!` 的版本。
13 | 例如, `filter` 有 _in-place_ 的 `filter!`, `select` 有 `select!`, `subset` 有 `subset!` 等等。
14 | 注意,这些函数都 **没有** 返回新的 `DataFrame`,而是直接 **更新** 传入的 `DataFrame` 。
15 | 另外, `DataFrames.jl` (从版本 1.3 开始)支持 in-place 的 `leftjoin` ,即 `leftjoin!` 函数。
16 | 该函数会使用右侧 `DataFrame` 的数据列更新左侧 `DataFrame` 。
17 | 需要注意的是,左表的每一行 **最多** 只能匹配右表中的一行。
18 |
19 | 如果想在代码中获得最高的速度和性能,你绝对应该使用带 `!` 的函数,而不是常规的 `DataFrames.jl` 函数。
20 |
21 | 让我们回到在 @sec:select 开始部分提到的关于 `select` 函数的例子。
22 | 如下是 `responses` 的 `DataFrame`:
23 |
24 | ```jl
25 | sco("responses()"; process=without_caption_label)
26 | ```
27 |
28 | 现在使用 `select` 函数来进行选择,就像之前所做的那样:
29 |
30 | ```jl
31 | s = """
32 | # allocating function # hide
33 | select(responses(), :id, :q1)
34 | """
35 | sco(s, process=without_caption_label)
36 | ```
37 |
38 | 而 _in-place_ 函数如下:
39 |
40 | ```jl
41 | s = """
42 | # non allocarting function # hide
43 | select!(responses(), :id, :q1)
44 | """
45 | sco(s; process=without_caption_label)
46 | ```
47 |
48 | `@allocated` 宏会告诉我们运行过程中分配的内存大小。
49 | 换句话说,**计算机在运行代码时需要在内存中存储多少新信息**。
50 | 让我们看看运行结果:
51 |
52 | ```jl
53 | s = """
54 | # allocation # hide
55 | df = responses()
56 | @allocated select(df, :id, :q1)
57 | """
58 | sco(s; process=string, post=plainblock)
59 | ```
60 |
61 | ```jl
62 | s = """
63 | # non allocation # hide
64 | df = responses()
65 | @allocated select!(df, :id, :q1)
66 | """
67 | sco(s; process=string, post=plainblock)
68 | ```
69 |
70 | 如我们所看到的那样, `select!` 分配的内存小于 `select` 的。
71 | 所以,由于消耗更少的内存,它应该更快。
72 |
73 | ### 复制或者不复制列 {#sec:df_performance_df_copy}
74 |
75 | 有两种 **访问 DataFrame 列**的方式。
76 | 它们的不同之处在于:一种方式是创建列的 “view”,并且没有拷贝;而另一种方式是从原始列复制出一个全新的列。
77 |
78 | 第一种方式通常使用点运算符 `.` + 列名的语法,即 `df.col`。
79 | 这种方式 **不拷贝** 列 `col`。
80 | 实际上,`df.col` 创建了链接到原始列的 "view" ,且没有分配任何内存。
81 | 另外,`df.col` 语法等价于使用 `!` 作为行选择器的 `df[!, :col]`。
82 |
83 | 第二种访问 `DataFrame` 列的方式是 `df[:, :col]`,即使用 `:` 作为行选择器。
84 | 这种方式 **会拷贝** 列 `col`, 所以请注意这将产生非预期的内存分配。
85 |
86 | 与之前一样, 让我们尝试这两种方法来访问 `DataFrame` `responses` 中的列:
87 |
88 | ```jl
89 | s = """
90 | # allocation # hide
91 | df = responses()
92 | @allocated col = df[:, :id]
93 | """
94 | sco(s; process=string, post=plainblock)
95 | ```
96 |
97 |
98 | ```jl
99 | s = """
100 | # non allocation # hide
101 | df = responses()
102 | @allocated col = df[!, :id]
103 | """
104 | sco(s; process=string, post=plainblock)
105 | ```
106 |
107 | 当访问某列而不复制它时,不会进行任何内存分配,代码应该更快。
108 | 所以,如果不需要复制,通常请使用 `df.col` 或 `df[!, :col]` 访问 `DataFrame` 的列, 而不是 `df[:, :col]`。
109 |
110 | ### CSV.read 和 CSV.File {#sec:df_performance_csv_read_file}
111 |
112 | 如果查看过 `CSV.read` 的帮助输出, 你将会发现一个与该函数功能等价的便利函数 `CSV.File`,它们拥有相同的关键字参数。
113 | `CSV.read` 和 `CSV.File` 都可以用来读取 CSV 文件的内容,但它们的默认行为不同。
114 | **在默认情况下,`CSV.read` 不会复制** 输入数据。
115 | 取而代之的是,`CSV.read` 会把所有数据传入第二个参数(称为“槽”)。
116 |
117 | 因此如下所示:
118 |
119 | ```julia
120 | df = CSV.read("file.csv", DataFrame)
121 | ```
122 |
123 | 这将会把 `file.csv` 中的数据传入 `DataFrame` 槽, 然后把 `DataFrame` 类型返回给 `df` 变量。
124 |
125 | 在 `CSV.File` 的例子中,**情况相反:默认情况下,它将会复制 CSV 文件中的每一列**。
126 | 另外,语法上也稍有不同。
127 | 我们需要将 `CSV.File` 返回的所有数据包含在 `DataFrame` 构造器函数:
128 |
129 | ```julia
130 | df = DataFrame(CSV.File("file.csv"))
131 | ```
132 |
133 | 或者,也可以使用 `|>` 管道运算符:
134 |
135 | ```julia
136 | df = CSV.File("file.csv") |> DataFrame
137 | ```
138 |
139 | 如之前所说, `CSV.File` 将会复制 CSV 文件中的每一列。
140 | 因此,如果想要最佳性能,那么肯定应该使用 `CSV.read` 而不是 `CSV.File`。
141 | 这就是为什么 在 @sec:csv 中只讨论 `CSV.read`。
142 |
143 | ### CSV.jl 与多文件 {#sec:df_performance_csv_multiple}
144 |
145 | 现在让我们关注 `CSV.jl`。
146 | 特别要关注将多个 CSV 文件读取到一个 `DataFrame` 的例子。
147 | 从 `CSV.jl` 的 0.9 版本开始, 我们可以提供文件名字符串的向量。
148 | 在此之前,用户需要按顺序读取多个文件,并将它们垂直连接到单个 `DataFrame` 中。
149 | 举个例子,下面的代码将读取多个 CSV 文件,并使用 `vcat` 和 `reduce` 函数将它们垂直连接到单个 `DataFrame` 中:
150 |
151 | ```julia
152 | files = filter(endswith(".csv"), readdir())
153 | df = reduce(vcat, CSV.read(file, DataFrame) for file in files)
154 | ```
155 |
156 | 一个附加的特点是,`reduce` 不能并行化,因为它必须遵循与 `files`向量相同的顺序。
157 |
158 | 若在 `CSV.jl` 使用该功能,则可以简单地向 `CSV.read` 函数传入 `files` 向量:
159 |
160 | ```julia
161 | files = filter(endswith(".csv"), readdir())
162 | df = CSV.read(files, DataFrame)
163 | ```
164 |
165 | `CSV.jl` 将会为每个文件单独指定一个计算机中可用的线程,然后将每个线程的输出延迟连接到 `DataFrame` 中。
166 | 因此,在不使用 `reduce` 函数时,我们获得了 **额外的多线程优点**。
167 |
168 | ### CategoricalArrays.jl 压缩 {#sec:df_performance_categorical_compression}
169 |
170 | 如果需要处理大量的分类值,例如许多代表不同定性数据的文本数据列,那么可能需要使用 `CategoricalArrays.jl` 压缩来处理此类情况。
171 |
172 | 默认情况下, **`CategoricalArrays.jl` 将会使用 32 位 无符号整数 `UInt32` 来表示基础的类别**:
173 |
174 | ```jl
175 | s = """
176 | typeof(categorical(["A", "B", "C"]))
177 | """
178 | sco(s; process=string, post=plainblock)
179 | ```
180 |
181 | 这意味着,`CategoricalArrays.jl` 最多可以在一列中表示 $2^{32}$ 种不同的类别,这是一个非常大的数字 (接近 43 亿)。
182 | 在处理常规数据时,可能永远都不需要如此级别的容量[^bigdata]。
183 | 这就是为什么 `categorical` 有一个 `compress` 参数, 它可以通过接收 `true` 或 `false` 来决定是否压缩基本分类数据。
184 | 如果传入了 **`compress=true`, `CategoricalArrays.jl` 将会尝试把基本分类数据压缩到最小的 `UInt` 表示**。
185 | 例如,之前的 `categorical` 向量可以表示为 8 位无符号整数 `UInt8` (主要是因为这是 Julia 中最小的无符号整型):
186 |
187 | [^bigdata]: 同时注意到常规数据 (最多 10 000 行)不算大数据(超过 100 000 行)。因此,若是主要处理大数据,请你谨慎设定分类值。
188 |
189 | ```jl
190 | s = """
191 | typeof(categorical(["A", "B", "C"]; compress=true))
192 | """
193 | sco(s; process=string, post=plainblock)
194 | ```
195 |
196 | 这些都意味着什么呢?
197 | 假设你有一个超级大的向量。
198 | 例如,1 百万元素的向量,但仅有四种基本类型:A,B,C,或 D。
199 | 如果不想压缩生成的分类向量,那么你将会存储 1 百万 `UInt32` 类型的元素。
200 | 另一方面,如果压缩它,那么将会存储 1 百万 `UInt8` 类型的元素。
201 | 可以使用 `Base.summarysize` 函数获得给定对象的基本大小(以字节为单位)。
202 | 因此,让我们量化一下,在不压缩一百万分类向量时,将需要多少更多的内存:
203 |
204 | ```julia
205 | using Random
206 | ```
207 |
208 | ```jl
209 | s = """
210 | one_mi_vec = rand(["A", "B", "C", "D"], 1_000_000)
211 | Base.summarysize(categorical(one_mi_vec))
212 | """
213 | sco(s; process=string, post=plainblock)
214 | ```
215 |
216 | 4 百万字节,大概是 3.8 MB。
217 | 别误会,相比原始字符串向量,这已经是很大的改进:
218 |
219 | ```jl
220 | s = """
221 | Base.summarysize(one_mi_vec)
222 | """
223 | sco(s; process=string, post=plainblock)
224 | ```
225 |
226 | 通过使用 `CategoricalArrays.jl` 将数据表示为 `UInt32` ,我们节省了 50% 的原始数据大小。
227 |
228 | 现在与压缩选项进行对比:
229 |
230 | ```jl
231 | s = """
232 | Base.summarysize(categorical(one_mi_vec; compress=true))
233 | """
234 | sco(s; process=string, post=plainblock)
235 | ```
236 |
237 | 在不丢失信息的情况下,我们将大小减少到原始未压缩向量大小的25% (四分之一)。
238 | 我们的压缩分类向量现在有100万字节,大约是1.0 MB。
239 |
240 | 因此,只要有提高性能的可能,请考虑在分类数据中使用 `compress=true`。
241 |
--------------------------------------------------------------------------------
/pandoc/template.tex:
--------------------------------------------------------------------------------
1 | \documentclass[
2 | notoc % Suppress Tufte style table of contents.
3 | ]{tufte-book}
4 |
5 | % Required Tufte packages.
6 | \usepackage{changepage} % or chngpage
7 | \usepackage{fancyhdr}
8 | \usepackage{fontenc}
9 | \usepackage{geometry}
10 | \usepackage{hyperref}
11 | \usepackage{natbib}
12 | \usepackage{bibentry}
13 | \usepackage{optparams}
14 | \usepackage{paralist}
15 | \usepackage{placeins}
16 | \usepackage{ragged2e}
17 | \usepackage{setspace}
18 | \usepackage{textcase}
19 | \usepackage{textcomp}
20 | \usepackage{titlesec}
21 | \usepackage{titletoc}
22 | \usepackage{xcolor}
23 | \usepackage{xifthen}
24 |
25 | \geometry{$for(geometry)$$geometry$$sep$,$endfor$}
26 |
27 | % Tufte vs. Pandoc workaround.
28 | % Issue: https://github.com/Tufte-LaTeX/tufte-latex/issues/64.
29 | \renewcommand\allcapsspacing[1]{{\addfontfeature{LetterSpace=15}#1}}
30 | \renewcommand\smallcapsspacing[1]{{\addfontfeature{LetterSpace=10}#1}}
31 |
32 | % \setmainfont{TeX Gyre Pagella}
33 | \usepackage[utf8]{inputenc}
34 | \usepackage[T1]{fontenc}
35 | \setmainfont{texgyrepagella}[
36 | Extension = .otf,
37 | UprightFont = *-regular,
38 | BoldFont = *-bold,
39 | ItalicFont = *-italic,
40 | BoldItalicFont = *-bolditalic,
41 | ]
42 |
43 | \newfontfamily\JuliaMono{JuliaMono}[
44 | UprightFont = *-Regular,
45 | BoldFont = *-Bold
46 | ]
47 | \newfontface\JuliaMonoRegular{JuliaMono-Regular}
48 | \newfontface\JuliaMonoBold{JuliaMono-Bold}
49 |
50 | \setmonofont{JuliaMono-Medium}[
51 | Contextuals=Alternate,
52 | Ligatures=NoCommon
53 | ]
54 |
55 | $if(graphics)$
56 | \usepackage{graphicx}
57 | \makeatletter
58 | \def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
59 | \def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
60 | \makeatother
61 | % Scale images if necessary, so that they will not overflow the page
62 | % margins by default, and it is still possible to overwrite the defaults
63 | % using explicit options in \includegraphics[width, height, ...]{}
64 | \setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
65 | $endif$
66 | $if(links-as-notes)$
67 | \DeclareRobustCommand{\href}[2]{#2\footnote{\url{#1}}}
68 | $endif$
69 |
70 | \usepackage{float}
71 | \floatplacement{figure}{H}
72 |
73 | % Listings Julia syntax definition.
74 | \input{$listings-path$}
75 |
76 | % Unicode support.
77 | \input{$listings-unicode-path$}
78 |
79 | % Used by Pandoc.
80 | \providecommand{\tightlist}{%
81 | \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}
82 | }
83 | \newcommand{\passthrough}[1]{#1}
84 |
85 | \usepackage{longtable}
86 | \usepackage{booktabs}
87 | \usepackage{array}
88 |
89 | % Source: Wandmalfarbe/pandoc-latex-template.
90 | $if(csl-refs)$
91 | \newlength{\cslhangindent}
92 | \setlength{\cslhangindent}{1.5em}
93 | \newlength{\csllabelwidth}
94 | \setlength{\csllabelwidth}{3em}
95 | \newenvironment{CSLReferences}[2] % #1 hanging-ident, #2 entry spacing
96 | {% don't indent paragraphs
97 | \setlength{\parindent}{0pt}
98 | % turn on hanging indent if param 1 is 1
99 | \ifodd #1 \everypar{\setlength{\hangindent}{\cslhangindent}}\ignorespaces\fi
100 | % set entry spacing
101 | \ifnum #2 > 0
102 | \setlength{\parskip}{#2\baselineskip}
103 | \fi
104 | }%
105 | {}
106 | \usepackage{calc}
107 | \newcommand{\CSLBlock}[1]{#1\hfill\break}
108 | \newcommand{\CSLLeftMargin}[1]{\parbox[t]{\csllabelwidth}{#1}}
109 | \newcommand{\CSLRightInline}[1]{\parbox[t]{\linewidth - \csllabelwidth}{#1}\break}
110 | \newcommand{\CSLIndent}[1]{\hspace{\cslhangindent}#1}
111 | $endif$
112 |
113 | \definecolor{linkblue}{HTML}{117af2}
114 | \usepackage{hyperref}
115 | \hypersetup{
116 | colorlinks,
117 | citecolor=linkblue,
118 | linkcolor=linkblue,
119 | urlcolor=linkblue,
120 | linktoc=page, % Avoid Table of Contents being nearly completely blue.
121 | $if(title-meta)$
122 | pdftitle={$title-meta$},
123 | $endif$
124 | $if(author-meta)$
125 | pdfauthor={$author-meta$},
126 | $endif$
127 | $if(lang)$
128 | pdflang={$lang$},
129 | $endif$
130 | $if(subject)$
131 | pdfsubject={$subject$},
132 | $endif$
133 | $if(keywords)$
134 | pdfkeywords={$for(keywords)$$keywords$$sep$, $endfor$},
135 | $endif$
136 | breaklinks=true,
137 | pdfcreator={LaTeX via Pandoc}%
138 | }
139 | \urlstyle{same} % disable monospaced font for URLs
140 |
141 | \title{$title$}
142 | \author{$for(author)$\noindent{$author$}\\[3mm] $endfor$}
143 | \date{$date$}
144 |
145 | % Re-enable section numbering which was disabled by tufte.
146 | \setcounter{secnumdepth}{2}
147 |
148 | % Fix captions for longtable.
149 | % Thanks to David Carlisle at https://tex.stackexchange.com/a/183344/92217.
150 | \makeatletter
151 | \def\LT@makecaption#1#2#3{%
152 | \noalign{\smash{\hbox{\kern\textwidth\rlap{\kern\marginparsep
153 | \parbox[t]{\marginparwidth}{\vspace{12pt}%
154 | \@tufte@caption@font \@tufte@caption@justification \noindent
155 | #1{#2: }\ignorespaces #3}}}}}}
156 | \makeatother
157 |
158 | % Doesn't seem to do anything.
159 | \usepackage{float}
160 | \floatplacement{figure}{H}
161 | \floatplacement{table}{H}
162 |
163 | % Reduce large spacing around sections.
164 | \titlespacing*{\chapter}{0pt}{5pt}{20pt}
165 | \titlespacing*{\section}{0pt}{2.5ex plus 1ex minus .2ex}{1.3ex plus .2ex}
166 | \titlespacing*{\subsection}{0pt}{1.75ex plus 1ex minus .2ex}{1.0ex plus.2ex}
167 |
168 | \titleformat{\chapter}%
169 | [hang]% shape
170 | {\normalfont\huge\itshape}% format applied to label+text
171 | {\huge\thechapter}% label
172 | {1em}% horizontal separation between label and title body
173 | {}% before the title body
174 | []% after the title body
175 |
176 | % Reduce spacing in table of contents.
177 | \usepackage{etoolbox}
178 | \makeatletter
179 | \pretocmd{\chapter}{\addtocontents{toc}{\protect\addvspace{-3\p@}}}{}{}
180 | \pretocmd{\section}{\addtocontents{toc}{\protect\addvspace{-4\p@}}}{}{}
181 | \pretocmd{\subsection}{\addtocontents{toc}{\protect\addvspace{-5\p@}}}{}{}
182 | \makeatother
183 |
184 | % Long texts are harder to read than tables.
185 | % Therefore, we can reduce the font size of the table.
186 | \AtBeginEnvironment{longtable}{\footnotesize}
187 |
188 | % Some space between paragraphs is necessary because code blocks can output single line paragraphs.
189 | \setlength\parskip{1em plus 0.1em minus 0.2em}
190 |
191 | % For justified text.
192 | \usepackage{ragged2e}
193 |
194 | % tufte-book disables subsubsections by default.
195 | % Got this definition back via `\show\subsubsection`.
196 | $if(allow-subsubsections)$
197 | \makeatletter
198 | \renewcommand\subsubsection{%
199 | \@startsection{subsubsection}{3}{\z@ }{-3.25ex\@plus -1ex \@minus -.2ex}{1.5ex \@plus .2ex}{\normalfont \normalsize \bfseries }
200 | }
201 | \makeatother
202 | $endif$
203 |
204 | $if(CJKmainfont)$
205 | \PassOptionsToPackage{space}{xeCJK}
206 | \ifxetex
207 | \usepackage{xeCJK}
208 | \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$}
209 | \fi
210 | \ifluatex
211 | \usepackage[$for(luatexjafontspecoptions)$$luatexjafontspecoptions$$sep$,$endfor$]{luatexja-fontspec}
212 | \setmainjfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$}
213 | \fi
214 | $endif$
215 |
216 | \usepackage{amsfonts}
217 | \usepackage{amssymb}
218 | \usepackage{amsmath}
219 | \usepackage{unicode-math}
220 |
221 | % URL line breaks.
222 | \usepackage{xurl}
223 |
224 | % Probably doesn't hurt.
225 | \usepackage{marginfix}
226 |
227 | $if(disable-cleardoublepage)$
228 | \let\cleardoublepage\clearpage
229 | $endif$
230 |
231 | \begin{document}
232 |
233 | \makeatletter
234 | \thispagestyle{empty}
235 | \vfill
236 | {\Huge\bf
237 | \noindent
238 | \@title
239 | }\\[1in]
240 | {\Large
241 | \noindent
242 | \@author
243 | }
244 | \makeatother
245 |
246 | \makeatletter
247 | \newpage
248 | \thispagestyle{empty}
249 | \vfill
250 | {\noindent
251 | $titlepage-top$
252 | }
253 | \vfill
254 | {\small
255 | $titlepage-bottom$
256 |
257 | $build-info$
258 |
259 | $tex-license$
260 | }
261 | \makeatother
262 |
263 | $if(dedication)$
264 | \newpage
265 | \thispagestyle{empty}
266 | \begin{center}
267 | \vspace*{\fill}
268 | $dedication$
269 | \vspace*{\fill}
270 | \end{center}
271 | \cleardoublepage
272 | $endif$
273 |
274 | % Don't remove this or authors will show up in header of every page.
275 | \frontmatter
276 | \mainmatter
277 | $if(pdf-footer)$
278 | \fancyfoot[C]{$pdf-footer$}
279 | $endif$
280 |
281 | $if(toc)$
282 | \setcounter{tocdepth}{$tocdepth$}
283 | \tableofcontents
284 | $endif$
285 |
286 | % Justify text.
287 | \justifying
288 |
289 | % parindent seems to be set from within another class too.
290 | % It is not useful here because it would also indent lines directly after
291 | % code blocks, which is undesirable most of the time.
292 | \setlength{\parindent}{0pt}
293 |
294 | $body$
295 |
296 | \backmatter
297 |
298 | \end{document}
299 |
300 |
--------------------------------------------------------------------------------
/pandoc/references.bib:
--------------------------------------------------------------------------------
1 | @article{bezanson2017julia,
2 | title = {Julia: {{A}} Fresh Approach to Numerical Computing},
3 | author = {Bezanson, Jeff and Edelman, Alan and Karpinski, Stefan and Shah, Viral B},
4 | year = {2017},
5 | volume = {59},
6 | pages = {65--98},
7 | publisher = {{SIAM}},
8 | journal = {SIAM review},
9 | number = {1}
10 | }
11 |
12 | @article{chen2014big,
13 | title = {Big data: A survey},
14 | author = {Chen, Min and Mao, Shiwen and Liu, Yunhao},
15 | journal = {Mobile networks and applications},
16 | volume = {19},
17 | number = {2},
18 | pages = {171--209},
19 | year = {2014},
20 | publisher = {Springer}
21 | }
22 |
23 | @article{chenRobustBenchmarkingNoisy2016,
24 | title = {Robust Benchmarking in Noisy Environments},
25 | author = {Chen, Jiahao and Revels, Jarrett},
26 | year = {2016},
27 | month = aug,
28 | archiveprefix = {arXiv},
29 | eprint = {1608.04295},
30 | eprinttype = {arxiv},
31 | journal = {arXiv:1608.04295 [cs]},
32 | keywords = {68N30,B.8.1,Computer Science - Performance,D.2.5},
33 | primaryclass = {cs}
34 | }
35 |
36 | @misc{domo2018data,
37 | title = {Data Never Sleeps 6.0},
38 | author = {Domo},
39 | url = {https://www.domo.com/assets/downloads/18_domo_data-never-sleeps-6+verticals.pdf},
40 | year = {2018}
41 | }
42 |
43 | @article{fitzgerald2020idc,
44 | title = {IDC FutureScape: Worldwide Digital Transformation 2021 Predictions},
45 | author = {Fitzgerald, S and Jimenez, D. Z. and Findling, S. and Yorifuji, Y. and Kumar, M. and Wu, L. and Carosella, G. and Ng, S and Parker, R. and Carter, P. and Whalen, M.},
46 | journal = {IDC FutureScape},
47 | year = {2020}
48 | }
49 |
50 | @article{gantz2012digital,
51 | title = {The digital universe in 2020: Big data, bigger digital shadows, and biggest growth in the far east},
52 | author = {Gantz, John and Reinsel, David},
53 | journal = {IDC iView: IDC Analyze the future},
54 | volume = {2007},
55 | number = {2012},
56 | pages = {1--16},
57 | year = {2012}
58 | }
59 |
60 | @misc{johnmyleswhiteJuliaDataDataFramesJl2020,
61 | title = {{{JuliaData}}/{{DataFrames}}.Jl: V0.22.1},
62 | shorttitle = {{{JuliaData}}/{{DataFrames}}.Jl},
63 | author = {John Myles White and Bogumi{\l} Kami{\'n}ski and {powerdistribution} and {Milan Bouchet-Valat} and Sean Garborg and Jacob Quinn and Simon Kornblith and {cjprybol} and Alexey Stukalov and Douglas Bates and Tom Short and Chris DuBois and Harlan Harris and Kevin Squire and Alex Arslan and {pdeffebach} and David Anthoff and Dave Kleinschmidt and Andreas Noack and Viral B. Shah and Alex Mellnik and Takafumi Arakaki and Tanmay Mohapatra and Peter and Stefan Karpinski and Dahua Lin and {timema} and ExpandingMan and Florian Oswald and Lyndon White},
64 | year = {2020},
65 | month = nov,
66 | doi = {10.5281/zenodo.4282946},
67 | howpublished = {Zenodo}
68 | }
69 |
70 | @misc{jump2021using,
71 | title = {JuMP Style Guide},
72 | year = 2021,
73 | url = {https://jump.dev/JuMP.jl/v0.21/developers/style/#using-vs.-import},
74 | note = {Accessed: 2021-09-19}
75 | }
76 |
77 | @article{khan2014big,
78 | title = {Big data: survey, technologies, opportunities, and challenges},
79 | author = {Khan, Nawsher and Yaqoob, Ibrar and Hashem, Ibrahim Abaker Targio and Inayat, Zakira and Mahmoud Ali, Waleed Kamaleldin and Alam, Muhammad and Shiraz, Muhammad and Gani, Abdullah},
80 | journal = {The scientific world journal},
81 | volume = {2014},
82 | year = {2014},
83 | publisher = {Hindawi}
84 | }
85 |
86 | @book{lauwensThinkJuliaHow2019,
87 | title = {Think {{Julia}}: {{How}} to {{Think Like}} a {{Computer Scientist}}},
88 | shorttitle = {Think {{Julia}}},
89 | author = {Lauwens, Ben and Downey, Allen B.},
90 | year = {2019},
91 | month = may,
92 | edition = {1st edition},
93 | publisher = {{O'Reilly Media}},
94 | address = {{Beijing ; Sebastopol, CA}},
95 | isbn = {978-1-4920-4503-8},
96 | language = {English}
97 | }
98 |
99 |
100 | @article{Meng2019Data,
101 | journal = {Harvard Data Science Review},
102 | doi = {10.1162/99608f92.ba20f892},
103 | number = {1},
104 | note = {https://hdsr.mitpress.mit.edu/pub/jhy4g6eg},
105 | title = {Data Science: An Artificial Ecosystem},
106 | url = {https://hdsr.mitpress.mit.edu/pub/jhy4g6eg},
107 | volume = {1},
108 | author = {Meng, Xiao-Li},
109 | date = {2019-07-01},
110 | year = {2019},
111 | month = {7},
112 | day = {1}
113 | }
114 |
115 | @article{perkelJuliaComeSyntax2019,
116 | title = {Julia: Come for the Syntax, Stay for the Speed},
117 | shorttitle = {Julia},
118 | author = {Perkel, Jeffrey M.},
119 | year = {2019},
120 | month = jul,
121 | volume = {572},
122 | pages = {141--142},
123 | publisher = {{Nature Publishing Group}},
124 | doi = {10.1038/d41586-019-02310-3},
125 | copyright = {2021 Nature},
126 | journal = {Nature},
127 | language = {en},
128 | number = {7767}
129 | }
130 |
131 | @techreport{pep8,
132 | author = {Guido {van Rossum} and Barry Warsaw and Nick Coghlan},
133 | title = {Style Guide for {Python} Code},
134 | year = {2001},
135 | type = {PEP},
136 | number = {8},
137 | url = {https://www.python.org/dev/peps/pep-0008/},
138 | }
139 |
140 | @book{senguptaJuliaHighPerformance2019,
141 | title = {Julia {{High Performance}}: {{Optimizations}}, Distributed Computing, Multithreading, and {{GPU}} Programming with {{Julia}} 1.0 and beyond, 2nd {{Edition}}},
142 | shorttitle = {Julia {{High Performance}}},
143 | author = {Sengupta, Avik and Edelman, Alan},
144 | year = {2019},
145 | month = jun,
146 | publisher = {{Packt Publishing}},
147 | isbn = {978-1-78829-811-7},
148 | language = {English}
149 | }
150 |
151 | @misc{simonJuliaPlotsMakieJl2021,
152 | title = {{{JuliaPlots}}/{{Makie}}.Jl: V0.14.1},
153 | shorttitle = {{{JuliaPlots}}/{{Makie}}.Jl},
154 | author = {Simon and {jkrumbiegel} and Singhvi, Anshul and Wang, Anthony and Freyer, Frederic and Vertechi, Pietro and Holy, Tim and Borregaard, Michael Krabbe and Datseris, George and M, Mustafa and Greimel, Fabian and Butterworth, Ian and Foster, Chris and Dehaybe, Henri and Schauer, Moritz and Kilpatrick, Logan and Byrne, Simon and Widmann, David and {kragol} and Weidner, Jan and Sharma, Arsh and {Micluța-C{\^a}mpeanu}, Sebastian and Hatherly, Michael and Herikstad, Roger and Goretkin, Gustavo and TagBot, Julia and {\v S}tih, Vilim and {smldis} and Ponet, Louis},
155 | year = {2021},
156 | month = jun,
157 | doi = {10.5281/zenodo.4983497},
158 | howpublished = {Zenodo}
159 | }
160 |
161 | @misc{storopoli2021bayesianjulia,
162 | author = {Storopoli, Jose},
163 | title = {Bayesian Statistics with Julia and Turing},
164 | url = {https://storopoli.io/Bayesian-Julia},
165 | year = {2021}
166 | }
167 |
168 | @misc{tanmaybakshiBakingKnowledgeMachine2021,
169 | title = {Baking {{Knowledge}} into {{Machine Learning Models}}\textemdash{{Chris Rackauckas}} on {{TechLifeSkills}} w/ {{Tanmay Ep}}.55},
170 | author = {{tanmay bakshi}},
171 | year = {2021},
172 | month = apr,
173 | url = {https://youtu.be/moyPIhvw4Nk}
174 | }
175 |
176 | @misc{tedxtalksProgrammingLanguageHeal2020,
177 | title = {A Programming Language to Heal the Planet Together: {{Julia}} | {{Alan Edelman}} | {{TEDxMIT}}},
178 | shorttitle = {A Programming Language to Heal the Planet Together},
179 | author = {{TEDx Talks}},
180 | year = {2020},
181 | month = feb,
182 | url = {https://youtu.be/qGW0GT1rCvs}
183 | }
184 |
185 | @misc{tombreloffJuliaPlotsPlotsJl2021,
186 | title = {{{JuliaPlots}}/{{Plots}}.Jl: V1.13.2},
187 | shorttitle = {{{JuliaPlots}}/{{Plots}}.Jl},
188 | author = {Tom Breloff and Daniel Schwabeneder and Michael Krabbe Borregaard and Simon Christ and Josef Heinen and Yuval and Andrew Palugniok and Simon and Pietro Vertechi and Zhanibek and Thatcher Chamberlin and {ma-laforge} and Christopher Rackauckas and Oliver Schulz and Sebastian Pfitzner and Takafumi Arakaki and Amin Yahyaabadi and Jack Devine and Sebastian Pech and Patrick Kofod Mogensen and Samuel S. Watson},
189 | year = {2021},
190 | month = apr,
191 | doi = {10.5281/zenodo.4725318},
192 | howpublished = {Zenodo}
193 | }
194 |
195 | @article{wickham2011split,
196 | title={The split-apply-combine strategy for data analysis},
197 | author={Wickham, Hadley},
198 | journal={Journal of statistical software},
199 | volume={40},
200 | number={1},
201 | pages={1--29},
202 | year={2011}
203 | }
204 |
205 | @book{bertsekas2000introduction,
206 | title={Introduction to probability},
207 | author={Bertsekas, Dimitri P and Tsitsiklis, John N},
208 | year={2008},
209 | edition={2nd edition},
210 | publisher={Athena Scientific}
211 | }
212 |
213 | @article{lewandowskiGeneratingRandomCorrelation2009,
214 | title = {Generating Random Correlation Matrices Based on Vines and Extended Onion Method},
215 | author = {Lewandowski, Daniel and Kurowicka, Dorota and Joe, Harry},
216 | date = {2009-10-01},
217 | journaltitle = {Journal of Multivariate Analysis},
218 | shortjournal = {Journal of Multivariate Analysis},
219 | volume = {100},
220 | number = {9},
221 | pages = {1989--2001},
222 | issn = {0047-259X},
223 | doi = {10.1016/j.jmva.2009.04.008},
224 | url = {http://www.sciencedirect.com/science/article/pii/S0047259X09000876},
225 | urldate = {2021-01-27},
226 | langid = {english},
227 | keywords = {Correlation matrix,Dependence vines,Onion method,Partial correlation},
228 | }
229 |
230 | @article{anscombe1973graphs,
231 | title={Graphs in statistical analysis},
232 | author={Anscombe, Francis J},
233 | journal={The american statistician},
234 | volume={27},
235 | number={1},
236 | pages={17--21},
237 | year={1973},
238 | publisher={Taylor \& Francis Group}
239 | }
240 |
--------------------------------------------------------------------------------
/contents/stats_distributions.md:
--------------------------------------------------------------------------------
1 | ## Distributions {#sec:stats_dist}
2 |
3 | Distributions are the thing that makes most of the statistical machinery tick.
4 | We can show how a distribution can arise via a simple example.
5 | Suppose that a ball falls on top of a few rows of pins.
6 | At every pin, the ball can either fall to the left or to the right.
7 | We count a fall to the right as 1 and a fall to the left as 0.
8 | Now, the question is: how many times will the ball fall to the right?
9 |
10 | To simulate this, we can use the `Random` module from Julia's standard library.
11 | We also set a seed to ensure that the outcome of this code is the same for every run:
12 |
13 | ```
14 | using Random
15 | ```
16 |
17 | ```jl
18 | sc("seed!(0)")
19 | ```
20 |
21 | ```jl
22 | s = """
23 | n_rows = 100
24 | x = count(rand(Bool, n_rows))
25 | """
26 | scob(s)
27 | ```
28 |
29 | Okay, so over `jl n_rows` rows, the ball fell `jl x` times to the right.
30 | What if we do the same thing a few more times?
31 |
32 | ```jl
33 | s = """
34 | n_repeats = 800
35 | X = [count(rand(Bool, n_rows)) for i in 1:n_repeats]
36 | X[1:10]
37 | """
38 | sco(s)
39 | ```
40 |
41 | Apparently, the ball doesn't always fall `jl x` times to the right.
42 | We can create a scatter plot for these numbers:
43 |
44 | ```jl
45 | s = """
46 | CairoMakie.activate!() # hide
47 | figure = (; resolution=(600, 400))
48 | xlabel = "number of times fallen to the right"
49 | axis = (; xlabel, ylabel="ball")
50 | scatter(X, 1:n_repeats; figure, axis)
51 | label = "first_distribution" # hide
52 | link_attributes = "width=80%" # hide
53 | caption = "Scatter plot for balls falling on pins." # hide
54 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
55 | """
56 | sco(s)
57 | ```
58 |
59 | To answer our question about how many times the ball will fall to the right, we can look at it from another angle.
60 | We can't answer the question exactly, but we can make a guess based on how many balls have fallen in each region.
61 | For example, when looking at @fig:first_distribution, we can conclude that it is very unlikely that a ball will fall 0 times to the right or that the ball will fall 100 times to the right.
62 | It is more likely that a ball will fall roughly half of the time to the left and half of the time to the right.
63 | In other words, that the ball will fall 50 times to the right.
64 | But, it also clearly isn't the case that the ball always falls 50 times to the right.
65 | A question that we can answer is: how many times could we expect the ball to fall 40 times to the right? What about 60 times?
66 | To answer that, we can divide the range $[0, 100]$ into bins and count how many balls have fallen in that range.
67 | This is called a **hist**ogram:
68 |
69 | ```jl
70 | s = """
71 | figure = (; resolution=(600, 400))
72 | axis = (; xlabel, ylabel="balls per bin")
73 | hist(X; figure, axis)
74 | label = "first_histogram" # hide
75 | link_attributes = "width=80%" # hide
76 | caption = "Histogram for balls falling on pins." # hide
77 | Options(current_figure(); filename=label, caption, label, link_attributes) # hide
78 | """
79 | sco(s)
80 | ```
81 |
82 | And there we have something that is bell-shaped.
83 | Now, the question about how many times a ball will fall to the right can be answered by estimating the distribution for these data points.
84 | Then, the answer will look something like "we expect the falling of the ball to be distributed by a ... distribution with ...."
85 | For example, "we expect the falling of the ball to be distributed by a normal distribution with a mean of 50 and a standard deviation of 5" (see @sec:stats_dist_normal for more information about the normal distribution).
86 |
87 | What we have shown here is inspired by Galton's board (see @fig:galton).
88 | This board was used around 1880 by Sir Francis Galton to demonstrate regression to the mean.
89 | On the board, balls are dropped onto the pins and drop either to the left or the right at various levels.
90 | At the bottom, the balls are aggregated in bins and here a bell-shape arises too.
91 |
92 | ![Galton's board.](images/galtons-board.png){#fig:galton width=60%}
93 |
94 | There are several distributions defined in textbooks[^stats_book] and scientific articles[^stats_articles].
95 | We'll cover a few of them in this section.
96 | In Julia, there is a package that provides a large collection of distributions and related functions.
97 | The package is called [`Distributions.jl`](https://juliastats.org/Distributions.jl/dev/), which we will use to showcase some distributions:
98 |
99 | [^stats_book]: we recommend @bertsekas2000introduction.
100 | [^stats_articles]: one example is the $\operatorname{LKJ}$ distribution [@lewandowskiGeneratingRandomCorrelation2009].
101 |
102 | ```julia
103 | using Distributions
104 | ```
105 |
106 | ### Normal and Non-Normal Distributions {#sec:stats_dist_normal}
107 |
108 | The most commonly arising distribution is the **normal distribution**.
109 | It occurs a lot in nature; for example, the height and weight of humans, and the size of snowflakes.
110 | It has the ubiquitous shape of a bell curve and is also known as the *Gaussian* distribution in honor of the mathematician Carl Friedrich Gauss.
111 |
112 | This distribution is generally used in the social and natural sciences to represent variables whose distributions are not known.
113 | Some phenomena follow a normal distribution, such as human height, human shoe size, or test scores.
114 |
115 | However, beware that *not all phenomena are normally distributed*.
116 | For example, income, financial returns, city size, and social media followers are important phenomena that, generally, do not follow a normal distribution.
117 | Instead, they follow what we will call a non-normal distribution, or, as some also denote, a long-tailed distribution.
118 |
119 | In @fig:plot_normal_lognormal, we can see the comparison between a distribution that follows a normal distribution and a distribution that is long-tailed, i.e. does *not* follow a normal distribution.
120 |
121 | ```jl
122 | fig = plot_normal_lognormal()
123 | caption = "Normal and Non-Normal Distributed Distributions."
124 | label = "plot_normal_lognormal"
125 | Options(fig; filename=label, caption, label)
126 | ```
127 |
128 | ### Discrete and Continuous {#sec:stats_dist_discrete_continuous}
129 |
130 | Regarding distributions, we have mainly two types of distributions: **discrete** and **continuous**.
131 | In @fig:plot_discrete_continuous, we have two distributions.
132 | To the left, a discrete distribution represented by bins of values and their probability as the height of the bar.
133 | To the right, a continuous distribution represented by a continuous curve and the probability of values as the area under the curve.
134 |
135 | ```jl
136 | fig = plot_discrete_continuous()
137 | caption = "Discrete and Continuous Distributions."
138 | label = "plot_discrete_continuous"
139 | Options(fig; filename=label, caption, label)
140 | ```
141 |
142 | A distribution is called **discrete if the set of values that it can take as inputs is either finite or countably infinite**.
143 | Examples of discrete distributions are the toss of a coin, the roll of a dice, or the number of earthquakes per year.
144 |
145 | How we characterize distributions is through the probability values that its inputs can take and we express it as a probability function.
146 | In the case of the discrete distributions, the inputs have a "mass", that is why their characterization is captured by the **probability mass function** (pmf).
147 |
148 | $$ \operatorname{pmf} = P(X = x) $$ {#eq:pmf}
149 |
150 | In @fig:plot_pmf we have the pmf of a 6-sided dice where each one of the six outcomes is equally likely, i.e. they all have equal probability.
151 |
152 | ```jl
153 | fig = plot_pmf()
154 | caption = "Probability Mass Function of a 6-sided Dice."
155 | label = "plot_pmf"
156 | Options(fig; filename=label, caption, label)
157 | ```
158 |
159 | A distribution is called **continuous if the set of values that it can take as inputs is uncountably infinite**.
160 | Examples of continuous distributions are the weight or height of a person, the waiting time for the next bus to arrive, or the infection rate of a contagious disease.
161 |
162 | Analogously, in the continuous case, the inputs have a "density", thus we characterize continuous distributions with the **probability density function** (pdf).
163 |
164 | $$ \operatorname{pdf} = P(a \leq X \leq b) = \int_a^b f(x) dx $$ {#eq:pdf}
165 |
166 | Since a continuous distribution takes uncountably infinitely many values, the probability of any single specific value is always *zero*, so probabilities are only defined over an interval.
167 | That's why we have an integral in @eq:pdf.
168 |
169 | For example, in @fig:plot_pdf, we have a normal distribution with mean 0 and standard deviation 1.
170 | Notice that, since this is a valid probability, the gray shaded area that represents all the possible values integrates to 1.
171 | The red shaded area is the probability, under this distribution, of observing values from 1 to 2 and we need to calculate the total area.
172 | So we integrate the distribution from 1 to 2, this is the pdf and evaluates to `jl calculate_pdf(1, 2)`.
173 |
174 | ```jl
175 | fig = plot_pdf()
176 | caption = "Probability Density Function of a Normal Distribution."
177 | label = "plot_pdf"
178 | Options(fig; filename=label, caption, label)
179 | ```
180 |
181 | ### Cumulative Distribution Function {#sec:stats_dist_cdf}
182 |
183 | Finally, there is one more important distribution function.
184 | The **cumulative distribution function (cdf) provides the probability $P(X \leq x)$**.
185 | In other words, **the cdf is an accumulation of the probability we observe values up to a given value $x$**.
186 | It is defined both for discrete and continuous variables:
187 |
188 | $$ \operatorname{cdf} = P(X \leq x) =
189 | \begin{cases}
190 | \sum_{k \leq x} P(k), & \text{if $X$ is discrete},\\
191 | \int^x_{- \infty} f(t) dt, & \text{if $X$ is continuous.}
192 | \end{cases} $$ {#eq:cdf}
193 |
194 | In @eq:cdf we sum the probabilities of all values less than or equal to $x$ if the distribution $X$ is discrete or we integrate the density from minus infinity to $x$ if the distribution $X$ is continuous.
195 | For example, in @fig:plot_cdf_discrete we have the cdf of a 6-sided dice.
196 | Notice that since the values are equally likely, the cdf becomes a step function that scales linearly with the possible outcome values.
197 |
198 | ```jl
199 | fig = plot_cdf("discrete")
200 | caption = "Cumulative Distribution Function of a Discrete Distribution -- 6-sided Dice."
201 | label = "plot_cdf_discrete"
202 | Options(fig; filename=label, caption, label)
203 | ```
204 |
205 | To contrast, we present the cdf of a normal distribution with mean 0 and standard deviation 1 in @fig:plot_cdf_continuous.
206 | You can see that since the outcomes are *not* equally likely, our cdf scales differently.
207 |
208 | ```jl
209 | fig = plot_cdf("continuous")
210 | caption = "Cumulative Distribution Function of a Continuous Distribution -- Normal Distribution."
211 | label = "plot_cdf_continuous"
212 | Options(fig; filename=label, caption, label)
213 | ```
214 |
--------------------------------------------------------------------------------
/src/front-cover.jl:
--------------------------------------------------------------------------------
1 | const NOTO_SANS_BOLD = assetpath("fonts", "NotoSans-Bold.ttf")  # font asset passed as `font=` to every Label on the cover
2 | const JuliaColors = Colors.JULIA_LOGO_COLORS  # alias; the cover reads its .red, .green and .purple fields
3 | const Set1 = ColorSchemes.Set1_4  # NOTE(review): not referenced in the visible part of this file — confirm it is used elsewhere
4 | seed!(123)  # executed when this file is loaded, before the random arrays below are drawn; presumably Random.seed! — confirm import
5 |
6 | myrand(x, y, z) = rand(Float64)  # the three arguments are ignored; every call draws a fresh uniform value in [0, 1)
7 |
8 | const FRONTX = 1:11  # x index range; the cube grid below iterates FRONTX[1:end-1], i.e. 1:10
9 | const FRONTY = 1:10  # y index range; used in full for j, and as FRONTY[1:end-1] for the top layer's first index
10 | const FRONTZ = 1:11  # z index range; cells use FRONTZ[1:end-1], while FRONTZ[end] (11) positions the top and side layers
11 |
12 | function mypeaks(; n=49)  # MATLAB-style "peaks" surface sampled on an n×n grid over [-3, 3] × [-3, 3]
13 |     grid = LinRange(-3, 3, n)  # the same n sample points serve both axes
14 |     col, row = grid, grid'     # `col` varies along dim 1 (y), `row` along dim 2 (x)
15 |     bump = 3 * (1 .- row) .^ 2 .* exp.(-(row .^ 2) .- (col .+ 1) .^ 2)
16 |     ridge = 10 * (row / 5 .- row .^ 3 .- col .^ 5) .* exp.(-row .^ 2 .- col .^ 2)
17 |     dip = 1 / 3 * exp.(-(row .+ 1) .^ 2 .- col .^ 2)
18 |     return (grid, grid, bump .- ridge .- dip)  # (x samples, y samples, n×n matrix of heights)
19 | end
20 |
21 | arr = [(i, j, k) for i in FRONTX[1:end-1], j in FRONTY, k in FRONTZ[1:end-1]]  # 10×10×10 array of integer (i, j, k) cell coordinates for the main cube
22 | positions = vec(arr)  # flattened cube coordinates
23 | #posRandom = vec([(i,j , k) for i in rand(12:20,5),j in rand(1:10,5), k in rand(1:10,5)])
24 | arrTop = [(i, j, k) for i in FRONTY[1:end-1], j in FRONTY, k in FRONTZ[end]]  # layer at k = FRONTZ[end]; NOTE(review): i runs over FRONTY[1:end-1] (1:9) while `arr` uses FRONTX[1:end-1] (1:10) — confirm this is intended
25 | posTops = vec(arrTop)  # flattened top-layer coordinates
26 | arrSides = [(i, j, k) for i in FRONTZ[end], j in FRONTY, k in FRONTZ[1:end-1]]  # layer at i = FRONTZ[end]
27 | posSides = vec(arrSides)  # flattened side-layer coordinates
28 |
29 | # The value arrays below are drawn from the RNG at load time, in this order, after the `seed!(123)` call above.
30 | vals = [randn() for ix in FRONTX[1:end-1], iy in FRONTY, iz in FRONTZ[1:end-1]]  # one standard-normal draw per cube cell
31 | valsTop = [myrand(ix, iy, iz) for ix in FRONTY[1:end-1], iy in FRONTY, iz in FRONTZ[end]]  # one `rand()` draw per top cell (myrand ignores the indices); same iteration ranges as arrTop
32 | valsSides = [myrand(ix, iy, iz) for ix in FRONTZ[end], iy in FRONTY, iz in FRONTZ[1:end-1]]  # one `rand()` draw per side cell; same iteration ranges as arrSides
33 | colorTop = vec(valsTop[:, end])  # last column of valsTop
34 | colorSides = vec(valsSides[end, :])  # last row of valsSides
35 |
36 |
37 | """
38 | front_cover()
39 |
40 | Return the Julia Data Science book front cover as a Makie figure, built with the CairoMakie backend under a black theme.
41 | """
42 | function front_cover()
43 |     # Probably it will be good to have two versions, one black and one white.
44 |     CairoMakie.activate!()
45 |     with_theme(theme_black(); Axis=(; ygridcolor=:grey70, xgridcolor=:grey70,
46 |             xgridstyle=:dashdot, ygridstyle=:dashdot),
47 |         Axis3=(; xgridcolor=:grey70, ygridcolor=:grey70, zgridcolor=:grey70)) do
48 |
49 |         width = 2016
50 |         height = (10 / 7) * width # Ratio 7 * 10 inch.
51 |         fig = Figure(figure_padding=(50, 15, 5, 5), resolution=(width, height))
52 |         # Layout: a 4-row grid. Column 1 holds the 3-D data cube (row 1) and three 2-D
53 |         # panels derived from the same arrays (rows 2-4); the other cells hold example
54 |         # plots, and `axBubbles` spans the whole grid. Every `rand`/`randn` call below
55 |         # consumes the global RNG in sequence, so adding, removing, or reordering those
56 |         # calls changes the rendered cover.
57 |         # Axis
58 |         ax11 = Axis3(fig[1, 1], perspectiveness=0.5, azimuth=7.19, elevation=0.57,
59 |             xlabel="x label", ylabel="y label", zlabel="z label",
60 |             xgridvisible=false, ygridvisible=false, zgridvisible=false,
61 |             aspect=(1, 1, 1))
62 |         #ax12 = Axis3(fig[1,2]; perspectiveness = 0.5, aspect = (1,1,1)) # empty is ok, that's the idea... Q, how could u plot this kind of data
63 |         ax21 = Axis(fig[2, 1], aspect=AxisAspect(1)) # xgridvisible=false, ygridvisible=false) # we can include this on the theme
64 |         ax31 = Axis(fig[3, 1], aspect=AxisAspect(1)) # xgridvisible=false, ygridvisible=false)
65 |         ax41 = Axis(fig[4, 1], aspect=AxisAspect(1)) # xgridvisible=false, ygridvisible=false)
66 |         ax22 = Axis3(fig[2, 2], perspectiveness=0.5, aspect=(1, 1, 1)) # xgridvisible=false, ygridvisible=false, zgridvisible=false)
67 |         #ax23 = Axis3(fig[2,3]; perspectiveness = 0.5, aspect = (1,1,1)) # empty is ok, that's the idea... Q, how could u plot this kind of data, alternatives
68 |         ax32 = Axis(fig[3, 2], aspect=1) # xgridvisible=false, ygridvisible=false)
69 |         ax33 = Axis(fig[3, 3], aspect=1) # xgridvisible=false, ygridvisible=false)
70 |         ax42 = Axis(fig[4, 2], aspect=1) # xgridvisible=false, ygridvisible=false)
71 |         ax43 = Axis(fig[4, 3], aspect=1) # xgridvisible=false, ygridvisible=false)
72 |         ax44 = Axis(fig[4, 4], aspect=1) # xgridvisible=false, ygridvisible=false)
73 |         ax45 = Axis(fig[4, 5], aspect=1) # xgridvisible=false, ygridvisible=false)
74 |         axBubbles = Axis(fig[1:end, 1:end]; xgridvisible=false, ygridvisible=false)
75 |         axs = [ax11, ax21, ax31, ax41,
76 |             ax22, #ax23,
77 |             ax32, ax33,
78 |             ax42, ax43, ax44, ax45,
79 |             axBubbles,
80 |         ]
81 |         # First Column 1,1 to 4,1
82 |         meshscatter!(ax11, positions, color=vec(vals),
83 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), # here, if you use less than 10, you will see smaller squares.
84 |             colormap=:linear_grey_10_95_c0_n256, colorrange=(-2, 2),
85 |             transparency=false,
86 |             shading=false)
87 |         meshscatter!(ax11, posTops, color=vec(valsTop), marker=FRect3D(Vec3f0(0), Vec3f0(7)),
88 |             transparency=false, colormap=(:plasma, 0.65),
89 |             shading=false, colorrange=(0, 1))
90 |         meshscatter!(ax11, posSides, color=vec(valsSides), marker=FRect3D(Vec3f0(0), Vec3f0(7)),
91 |             transparency=false, colormap=(:viridis, 0.65),
92 |             shading=false)
93 |         meshscatter!(ax21, vec(arrTop[:, end]), color=colorTop[end:-1:1], shading=false,
94 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:plasma, colorrange=(0, 1))
95 |         meshscatter!(ax21, vec([(0, i) for i = 0:9]), color=colorSides, shading=false,
96 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:viridis, colorrange=(0, 1))
97 |         meshscatter!(ax21, vec([(j, i) for i = 0:9, j = 1:10]), shading=false,
98 |             color=vec(vals[end:-1:1, end, :]'),
99 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:linear_grey_10_95_c0_n256, colorrange=(-2, 2))
100 |         meshscatter!(ax31, vec(arrTop[1:2, end]), color=colorTop[end:-1:1][1:2], shading=false,
101 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:plasma, colorrange=(0, 1))
102 |         meshscatter!(ax31, vec([(0, i) for i = 0:9]), color=colorSides, shading=false,
103 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:viridis, colorrange=(0, 1))
104 |         meshscatter!(ax31, vec([(j, i) for i = 0:9, j = 1:2]), shading=false,
105 |             color=vec(vals[end:-1:1, end, :]')[1:20],
106 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:linear_grey_10_95_c0_n256, colorrange=(-2, 2))
107 |         meshscatter!(ax41, vec(arrTop[1:1, end]), color=colorTop[end:-1:1][1:1], shading=false,
108 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:plasma, colorrange=(0, 1))
109 |         meshscatter!(ax41, vec([(0, i) for i = 0:9]), color=colorSides, shading=false,
110 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:viridis, colorrange=(0, 1))
111 |         meshscatter!(ax41, vec([(j, i) for i = 0:9, j = 1:1]), shading=false,
112 |             color=vec(vals[end:-1:1, end, :]')[1:10],
113 |             marker=FRect3D(Vec3f0(0), Vec3f0(7)), colormap=:linear_grey_10_95_c0_n256, colorrange=(-2, 2))
114 |         # Limits
115 |         xlims!(ax11, -1, 12)
116 |         ylims!(ax11, -1, 11)
117 |         zlims!(ax11, -1, 12)
118 |         xlims!(ax21, -0.5, 11)
119 |         ylims!(ax21, -1, 11)
120 |         xlims!(ax31, -0.5, 11)
121 |         ylims!(ax31, -1, 11)
122 |         xlims!(ax41, -0.5, 11)
123 |         ylims!(ax41, -1, 11)
124 |         # Second Column 2,2 to 4,2
125 |         x, y, z = mypeaks()
126 |         surface!(ax22, x, y, z; colormap=:plasma)
127 |         # contourf!(ax23, x, y, z; colormap = :bone_1)
128 |         x = rand(10) # x, y, z are not read again below, but these three draws are kept:
129 |         y = rand(10) # each one advances the global RNG, and every random plot that
130 |         z = rand(10) # follows would change if they were removed.
131 |         # Third Row 3,2 to 3,3
132 |         scatter!(ax32, rand(10), rand(10); color="Yellow", markersize=16)
133 |         lines!(ax33, 0 .. 10, x -> exp(-x); color=JuliaColors.red, linewidth=4)
134 |         limits!(ax32, -0.1, 1.1, -0.1, 1.1)
135 |         limits!(ax33, -1, 11, -0.1, 1.1)
136 |         # Fourth Row 4,2 to 4,5
137 |         hist!(ax42, randn(1000), bins=32; color="Yellow", strokewidth=1.5,
138 |             strokecolor=:grey80)
139 |         density!(ax43, randn(1000); color=JuliaColors.red,
140 |             strokewidth=2, strokecolor=JuliaColors.red)
141 |         violin!(ax44, fill(1, 1000), randn(1000); color=(JuliaColors.purple, 0.5),
142 |             strokewidth=2, strokecolor=JuliaColors.purple, show_median=true)
143 |         boxplot!(ax45, fill(1, 1000), randn(1000); color=JuliaColors.green, strokecolor=:grey80,
144 |             whiskercolor=JuliaColors.green, whiskerwidth=1, strokewidth=1)
145 |         scatter!(axBubbles, rand(Distributions.Normal(1, 1), 1500), rand(Distributions.Normal(1, 1), 1500);
146 |             color=1:1500, markersize=20 * rand(1500),
147 |             colormap=tuple.(to_colormap(:thermal), rand(40) .+ 0.15), # NOTE(review): assumes to_colormap(:thermal) broadcasts against 40 alpha values — confirm for the Makie version in use
148 |             marker=:rect)
149 |         limits!(axBubbles, -2.2, 2, -3.3, 2)
150 |         xlims!(ax44, 0.1, 1.9)
151 |         xlims!(ax45, 0.1, 1.9)
152 |         ylims!(ax42, 0, 150)
153 |         ylims!(ax43, 0, 0.55)
154 |         ylims!(ax44, -5.6, 5.5)
155 |         ylims!(ax45, -5.6, 5.5)
156 |         # Pipes for First Column
157 |         pipisize = 52
158 |         Label(fig[1, 1, Bottom()], "|>", textsize=pipisize,
159 |             rotation=-π / 2, padding=(0, 0, 0, 0), font=NOTO_SANS_BOLD)
160 |         Label(fig[2, 1, Bottom()], " |>", textsize=pipisize,
161 |             rotation=-π / 2, padding=(0, 0, 0, 0), font=NOTO_SANS_BOLD)
162 |         Label(fig[3, 1, Bottom()], " |>", textsize=pipisize,
163 |             rotation=-π / 2, padding=(0, 0, 0, 0), font=NOTO_SANS_BOLD)
164 |         # Pipes between columns
165 |         Label(fig[2, 1, Right()], "|>", textsize=pipisize,
166 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
167 |         Label(fig[3, 1, Right()], "|>", textsize=pipisize,
168 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
169 |         Label(fig[3, 2, Right()], "|>", textsize=pipisize,
170 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
171 |         Label(fig[4, 1, Right()], "|>", textsize=pipisize,
172 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
173 |         Label(fig[4, 2, Right()], "|>", textsize=pipisize,
174 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
175 |         Label(fig[4, 3, Right()], "|>", textsize=pipisize,
176 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
177 |         Label(fig[4, 4, Right()], "|>", textsize=pipisize,
178 |             rotation=0π, padding=(5, 5, 0, 0), font=NOTO_SANS_BOLD)
179 |
180 |         legJ = Label(fig[1, 3:5], "Julia", textsize=394,
181 |             tellheight=false, halign=:left, font=NOTO_SANS_BOLD)
182 |         translate!(legJ.elements[:text], 0, 0, 9)
183 |         legD = Label(fig[1, 3:5], "\n\n\n\nData Science", textsize=126,
184 |             tellheight=false, halign=:left, font=NOTO_SANS_BOLD)
185 |         translate!(legD.elements[:text], 0, 0, 9)
186 |         vspace = "\n\n"
187 |         hspace = " "
188 |         legJose = Label(fig[2, 3:5], "$(vspace)$(hspace)Jose Storopoli", textsize=80,
189 |             tellheight=false, halign=:left, font=NOTO_SANS_BOLD)
190 |         legRik = Label(fig[2, 3:5], "$(vspace)\n\n$(hspace)Rik Huijzer", textsize=80,
191 |             tellheight=false, halign=:left, font=NOTO_SANS_BOLD)
192 |         legLaz = Label(fig[2, 3:5], "$(vspace)\n\n\n\n$(hspace)Lazaro Alonso", textsize=80,
193 |             tellheight=false, halign=:left, font=NOTO_SANS_BOLD)
194 |         translate!(legJose.elements[:text], 0, 0, 9)
195 |         translate!(legRik.elements[:text], 0, 0, 9)
196 |         translate!(legLaz.elements[:text], 0, 0, 9)
197 |         # textsize = 60, tellheight = false)
198 |         # Final Axis and Figure touches
199 |         foreach(ax -> hidedecorations!(ax; grid=false), axs) # side effects only, so no result array is built
200 |         foreach(hidespines!, axs)
201 |         rowgap!(fig.layout, 0)
202 |         colgap!(fig.layout, 0)
203 |         return fig # value of the do-block, which `with_theme` (and therefore `front_cover`) returns
204 |     end
205 | end
206 |
--------------------------------------------------------------------------------
/contents/stats_vis.md:
--------------------------------------------------------------------------------
1 | ## Statistical Visualizations {#sec:stats_vis}
2 |
3 | There are several statistical visualization techniques.
4 | For now, we will focus on only three: **histograms**, **box plots**, and **density plots**, since they are commonly used to analyze univariate data.
5 |
6 | We will also use the `more_grades` dataset from @sec:stats_central.
7 |
8 | ### Histograms {#sec:stats_vis_histograms}
9 |
10 | As already briefly shown in @sec:stats_dist, **histograms approximate the distribution for given data**.
11 | We construct them by "binning", i.e., dividing the range of values into a series of intervals (the bins) and then counting how many values fall in each interval.
12 | Each bin is represented as a bar whose height describes the frequency of values belonging to that bin.
13 |
14 | We can draw histograms using `Makie.jl`:
15 |
16 | ```jl
17 | s = """
18 | CairoMakie.activate!() # hide
19 | label = "histogram" # hide
20 | caption = "Histogram" # hide
21 | df = more_grades()
22 | fig = Figure(; resolution=(600, 400))
23 | ax = Axis(fig[1, 1], xticks=1:10)
24 | hist!(ax, df.grade; color=(:dodgerblue, 0.5))
25 | Options(current_figure(); filename=label, caption, label) # hide
26 | """
27 | sco(s)
28 | ```
29 |
30 | Note that by default `hist!` uses 15 bins.
31 | We can change that with the `bins` keyword:
32 |
33 | ```jl
34 | s = """
35 | CairoMakie.activate!() # hide
36 | label = "histogram_bins" # hide
37 | caption = "Histogram with Custom Bins" # hide
38 | df = more_grades()
39 | fig = Figure(; resolution=(600, 400))
40 | ax = Axis(fig[1, 1], xticks=1:10)
41 | hist!(ax, df.grade; color=(:dodgerblue, 0.5), bins=10)
42 | Options(current_figure(); filename=label, caption, label) # hide
43 | """
44 | sco(s)
45 | ```
46 |
47 | We can see clearly that most of the grades are between 4 and 9.
48 |
49 | ### Box Plots {#sec:stats_vis_boxplots}
50 |
51 | Box plots are a method for graphically depicting numerical data through their quartiles (see @fig:boxplot).
52 | The "box" is typically represented by the quartiles 1 to 3 (see @sec:stats_dispersion_quantiles).
53 | The median, second quartile -- Q2, or quantile 0.5, is the line inside the box.
54 | The first and third quartiles, Q1 and Q3, or quantiles 0.25 and 0.75, respectively, are the box's lower and upper bounds.
55 | Finally, we have the "whiskers" which, traditionally (and by default in most data visualization tools), extend from the box by up to 1.5 times the interquartile range (IQR).
56 |
57 | The basic box plot can be drawn using `Makie.jl` (see Chapter -@sec:DataVisualizationMakie).
58 | It accepts `x` and `y` vectors, which represent the positions of the categories and the variables within the boxes, respectively.
59 | Since the elements in our vector `x` are of type `String`, we need to convert it to `categorical` using `CategoricalArrays.jl` (@sec:missing_data) and then pass the `Axis` keyword argument `xticks` (see @sec:datavisMakie_attributes) as a tuple of values and labels.
60 | For the `xticks`' labels we used the `levels` function from `CategoricalArrays.jl` that returns the categorical levels from our `name` variable in the same order as the integer codes.
61 | Finally, for the `x` vector inside Makie's `boxplot` function, we wrap the `name` variable with the `levelcode` function, also from `CategoricalArrays.jl`, which returns the underlying integer codes from our categorical variable `name`.
62 | We do this because Makie's `boxplot` only accepts a vector of `Int`s as inputs for the `x` argument.
63 | Here is the code:
64 |
65 | ```jl
66 | s = """
67 | CairoMakie.activate!() # hide
68 | label = "boxplot" # hide
69 | caption = "Box Plot" # hide
70 | df = more_grades()
71 | transform!(df, :name => categorical; renamecols=false)
72 | fig = Figure(; resolution=(600, 400))
73 | ax = Axis(fig[1, 1]; xticks = (1:4, levels(df.name)))
74 | boxplot!(ax, levelcode.(df.name), df.grade)
75 | Options(current_figure(); filename=label, caption, label) # hide
76 | """
77 | sco(s)
78 | ```
79 |
80 | The default IQR range for the whiskers in `Makie.jl` is 1.5.
81 | However, sometimes we see the whiskers either with a different IQR range or with a small vertical bar to better visualize the whiskers' tips.
82 | We can control both of those with the `range` (default `1.5`) and `whiskerwidth` (default `0.0`) arguments:
83 |
84 | ```jl
85 | s = """
86 | CairoMakie.activate!() # hide
87 | label = "boxplot_custom" # hide
88 | caption = "Box Plot with different IQR and Whiskers Vertical Bars" # hide
89 | df = more_grades()
90 | transform!(df, :name => categorical; renamecols=false)
91 | fig = Figure(; resolution=(600, 400))
92 | ax = Axis(fig[1, 1]; xticks = (1:4, levels(df.name)))
93 | boxplot!(ax, levelcode.(df.name), df.grade; range=2.0, whiskerwidth=0.5)
94 | Options(current_figure(); filename=label, caption, label) # hide
95 | """
96 | sco(s)
97 | ```
98 |
99 | Box plots can also flag anything outside the whiskers as outliers.
100 | By default, these observations are not shown in `Makie.jl` but you can control this with the `show_outliers` argument:
101 |
102 | ```jl
103 | s = """
104 | CairoMakie.activate!() # hide
105 | label = "boxplot_outliers" # hide
106 | caption = "Box Plot with Outliers" # hide
107 | df = more_grades()
108 | transform!(df, :name => categorical; renamecols=false)
109 | fig = Figure(; resolution=(600, 400))
110 | ax = Axis(fig[1, 1]; xticks = (1:4, levels(df.name)))
111 | boxplot!(ax, levelcode.(df.name), df.grade; range=0.5, show_outliers=true)
112 | Options(current_figure(); filename=label, caption, label) # hide
113 | """
114 | sco(s)
115 | ```
116 |
117 | As you can see, **box plots are a useful way to visualize data using measures of central tendency and dispersion that are robust to outliers**.
118 |
119 | ### Density Plots {#sec:stats_vis_densityplots}
120 |
121 | Box plots limit us to just summary statistics like the median, quartiles, and IQR.
122 | Often we want to see the underlying distribution of the data.
123 | Histograms are discrete approximations.
124 | If we would like to have continuous approximations we need something else: **density plots**.
125 | **Density plots are graphical density estimations of numerical data**.
126 | It shows us the approximate distribution of a given variable by depicting it as a density, where the higher the curve at a given point is, the more likely is the variable to take a certain value.
127 |
128 | A density plot can also be drawn using `Makie.jl`. However, it is more convoluted than the box plot.
129 | First, we want to pass for each `density!` function only the values with respect to one observation.
130 | Thus, we define a `values` function that will accept a `code` argument to filter the dataset's variable `name` wrapped with the `levelcode` function.
131 | Then, we plot a density `plotobj` for each one of the variable `name`'s `levels`.
132 | Finally, we make sure that the density `plotobj`s have their own `ytick` with the `offset` keyword paired with a custom `yticks` in the `Axis` constructor by specifying, same as before, a tuple of values and labels.
133 | The effect of the `offset` in the `for` loop is the increment from 1 to 4, by 1, of both the `offset` argument for `density!` and the `code` argument for `values`:
134 |
135 | ```jl
136 | s = """
137 | CairoMakie.activate!() # hide
138 | label = "densityplot" # hide
139 | caption = "Density Plot" # hide
140 | df = more_grades()
141 | transform!(df, :name => categorical; renamecols=false)
142 | categories = levels(df.name)
143 | values(code) = filter(row -> levelcode.(row.name) == code, df).grade
144 | fig = Figure(; resolution=(600, 400))
145 | ax = Axis(fig[1, 1]; yticks = (1:4, categories), limits=((-1, 11), nothing))
146 | for i in 1:length(categories)
147 | density!(ax, values(i); offset=i)
148 | end
149 | Options(current_figure(); filename=label, caption, label) # hide
150 | """
151 | sco(s)
152 | ```
153 |
154 | As explained in @sec:makie_colors, we can change Makie's colors by either specifying a `color` or `colormap`.
155 | This can also be applied to `density`:
156 |
157 | ```jl
158 | s = """
159 | CairoMakie.activate!() # hide
160 | label = "densityplot_colors" # hide
161 | caption = "Density Plots with Different Color Schemes" # hide
162 | df = more_grades()
163 | transform!(df, :name => categorical; renamecols=false)
164 | categories = levels(df.name)
165 | values(code) = filter(row -> levelcode.(row.name) == code, df).grade
166 | fig = Figure(; resolution=(600, 400))
167 | ax1 = Axis(fig[1, 1]; yticks = (1:4, categories), limits=((-1, 11), nothing))
168 | ax2 = Axis(fig[1, 2]; yticks = (1:4, categories), limits=((-1, 11), nothing))
169 | for i in 1:length(categories)
170 | density!(ax1, values(i); offset=i, color=(:dodgerblue, 0.5))
171 | end
172 | for i in 1:length(categories)
173 | density!(ax2, values(i); offset=i, color=:x, colormap=:viridis)
174 | end
175 | Options(current_figure(); filename=label, caption, label) # hide
176 | """
177 | sco(s)
178 | ```
179 |
180 | Here, in the first figure (left) we are using a specific `color` for all `density!`'s `plotobj`s.
181 | And in the second figure (right) we pass the `:x` argument to `color` to tell Makie to apply the `colormap` gradient along the x-axis (from left to right) while also specifying which `colormap` palette as `:viridis`.
182 | The color code gradient in the `y` direction is most common and is a visual aid to easily identify trends;
183 | in the `x` direction is useful when you want to know how things go in some time-dependent variable, but is not widely used.
184 |
185 | ### Anscombe Quartet {#sec:stats_vis_anscombe}
186 |
187 | We conclude this Statistics chapter with a demonstration of the **importance of data visualization in statistical analysis**.
188 | For this, we present the **Anscombe Quartet** [@anscombe1973graphs], which comprises four datasets that have *nearly identical* simple descriptive statistics, yet have very *different* distributions and appear very different when **plotted**.
189 | Each dataset has 11 observations with `x` and `y` variables.
190 | They were created in 1973 by the statistician Francis Anscombe to show the importance of plotting data before conducting statistical analysis.
191 | Here is the table with the four datasets:
192 |
193 | ```jl
194 | Options(anscombe_quartet(;type="wide"); caption="Anscombe Quartet", label="anscombe_quartet")
195 | ```
196 |
197 | Now, if you look at the descriptive statistics for both `x` and `y` variables in all 4 datasets they are pretty much the same along with their correlation (both up to 2 decimal places):
198 |
199 | ```jl
200 | s = """
201 | df = anscombe_quartet()
202 | round_up = x -> round(x; digits=2)
203 | combine(groupby(df, :dataset),
204 | [:x, :y] .=> round_up .∘ [mean std],
205 | [:x, :y] => round_up ∘ cor)
206 | """
207 | sco(s; process=without_caption_label)
208 | ```
209 |
210 | Now, if we take a look at a simple scatter plot of all 4 datasets, we clearly see that something else is going on:
211 |
212 | ```jl
213 | fig = plot_anscombe()
214 | caption = "Anscombe Quartet"
215 | label = "plot_anscombe"
216 | Options(fig; filename=label, caption, label)
217 | ```
218 |
219 | Here, the first dataset (upper left) is a frequent situation that we encounter in data science: `x` and `y` are correlated with added random noise.
220 | In the second dataset (upper right), we see a perfect correlation except for an outlier in the second to last observation.
221 | For the third dataset (lower left), the relationship is non-linear.
222 | Finally, for the fourth dataset (lower right) there isn't any relationship except for a single outlier observation.
223 |
224 | The Anscombe Quartet tells us that sometimes **descriptive statistics can fool us** and we should rely also on **visualizations** to analyze our data.
225 |
--------------------------------------------------------------------------------
/contents/why_julia.md:
--------------------------------------------------------------------------------
1 | # 为什么选择 Julia ? {#sec:why_julia}
2 |
3 | 数据科学领域中充满了各种各样的开源编程语言。
4 |
5 | 工业界大多使用 Python,而学术界偏爱 R。
6 | **那为什么要学习另外一种语言呢?**
7 | 我们分别从两种常见背景来回答此问题:
8 |
9 | 1. **从未编过程** -- 请查阅 @sec:non-programmers。
10 |
11 | 2. **以前编过程** -- 请查阅 @sec:programmers。
12 |
13 | ## 从未编过程 {#sec:non-programmers}
14 |
15 | 对于第一种背景的读者,我们期望都有着如下的基本故事。
16 |
17 | 数据科学肯定已经吸引到了你,使你有兴趣去了解它到底是什么,以及如何利用它构建你在学术界或工业界的职业生涯。
18 | 然后,在尝试寻找资源学习这门新学科时,你会闯进一个充满缩写词的世界:
19 | `pandas`、`dplyr`、`data.table`、`numpy`、`matplotlib`、`ggplot2`、`bokeh`,以及更多数不胜数的例子。
20 |
21 | 然后会突然听到一个名字:“Julia”。
22 | 它究竟是什么样的呢?
23 | 它与其他别人告诉你的数据科学工具有什么不同?
24 |
25 | 为什么你应该投入珍贵的时间去学习这门新语言呢?它几乎从来不会在任何工作要求,实验室职位,博士后职位,或学术职位描述中提到。
26 | 答案是,Julia 是用于编程和数据科学的 **全新方法**。
27 | 在 Python 或 R 所实现的一切,都可以使用 Julia 实现,并且代码还具有可读性好[^readable]、速度快、功能强大等优点。
28 | 因此,Julia 语言越来越受欢迎,而且具有很充分的理由。
29 |
30 | 所以,**如果你没有任何编程背景知识,我们强烈鼓励你学习 Julia**,让它成为你的第一门编程语言和数据科学框架。
31 |
32 | ## 有编程经验 {#sec:programmers}
33 |
34 | 对于有编程经验的读者,背景故事发生了些变化。
35 | 你也许知道如何编程,并且可能以此为生。
36 | 你熟悉多种编程语言,并且可以在它们之间来回切换。
37 | 你已经听说了一种叫做“数据科学”的新奇事物,并且想要跟随这一潮流。
38 | 你开始学习如何使用 `numpy`,如何在 `pandas` 中操作 `DataFrames` ,以及如何使用 `matplotlib` 绘图。
39 | 又或者,你可能已经通过 tidyverse 学习了所有的操作,包括 `tibbles`、`data.frames`、`%>%` (管道运算符)和 `geom_*` 等等 ……
40 |
41 | 然后通过某些人或某些地方,你关注到一门名为 “Julia” 的新语言。
42 | 何必呢?
43 | 你已经精通了 Python 或 R ,并且掌握了你所需要的一切。
44 | 好吧,让我们设想一些场景。
45 |
46 | **假设你正在使用 Python 或 R:**
47 |
48 | 1. 编写的代码未能达到需要的性能?
49 | 实际上, **若使用 Julia, Python 或 R 的分钟级运行时间可能会缩短为秒级**^[有时是毫秒级。]。
50 | 我们将在 @sec:julia_wild 展示 Julia 在学术界和工业界的成功应用案例。
51 |
52 | 2. 尝试做些不符合 `numpy`/`dplyr` 惯例的操作,但发现代码很慢,然后不得不学习黑魔法^[`numba`、甚至 `Rcpp` 或 `cython`?] 来加速代码?
53 | **在 Julia 中,你可以自定义各种各样的代码,而且不会有任何性能损失**。
54 |
55 | 3. 不得不调试代码以及有时需要阅读 Fortran 或 C/C++ 源码,但却又不明白实现的原理?
56 | **在 Julia 中,你仅需要阅读 Julia 代码,并且不需要学习其他语言来加速原来的代码**。
57 | 这就是 “两语言问题” (请查阅 @sec:two_language)。
58 | 这还能对应此种情况: “你想把一个有趣的想法贡献给开源项目。但是不得不放弃,因为所有库的编程语言既不是 Python,也不是 R,而是C/C++ 或 Fortran”^[浏览一些 GitHub 中的深度学习库,你会惊讶地发现 Python 只占代码库的25%-33%。]。
59 |
60 | 4. 并不能直接使用其他包中的数据结构,而是需要构建一组接口^[这通常是 Python 生态系统的问题,虽然 R 并没有受到严重的影响,但也并不乐观。]。
61 | **而 Julia 用户能够轻松地共享和重用来自不同包的代码。**
62 | 大多数 Julia 用户定义的类型和函数都是开箱即用的^[或者需要做出极少的努力。],一些用户甚至会惊讶地发现其他库可能以无法想象的方式使用他们的包。
63 | 我们会在 @sec:multiple_dispatch 介绍一些例子。
64 |
65 | 5. 想要更好的项目管理工具,其需包含精确的、可管理的、可复制的依赖和版本控制?
66 | **而 Julia 有着令人惊叹的项目管理方案和绝佳的包管理器**。
67 | 与安装和管理单个全局软件集的传统包管理器不同,Julia 的包管理器围绕“环境”设计:
68 | 这些独立的软件集既可局部生效于单个项目,也能在不同的项目间共享。
69 | 每个项目独立维护自己的软件版本集。
70 |
71 | 如果这些熟悉或看似合理的情景吸引到了你的兴趣,那么你可能会想了解更多关于新 Julia 语言的内容。
72 |
73 | 让我们继续吧!
74 |
75 | ## Julia 想实现什么? {#sec:julia_accomplish}
76 |
77 | > **_NOTE:_**
78 | > 本节将详细解释是什么使 Julia 成为一门出色的编程语言。
79 | > 如果这对你来说太过技术性,你可以跳过这节并前往 @sec:dataframes 学习如何使用 `DataFrames.jl` 处理表格数据。
80 |
81 | Julia 编程语言 [@bezanson2017julia] 是一门较新的语言,第一版发布于 2012 年,其目标是 **简单且快速**。
82 | 即,“ 运行起来像C^[有时甚至快于C。], 但阅读起来像 Python”[@perkelJuliaComeSyntax2019]。
83 | 它是为科学计算设计的,能够处理 **大规模的数据与计算** 。但仍可以相当 **容易地创建和操作原型代码**。
84 |
85 | Julia 的创始人在一篇[2012 年的博客](https://julialang.org/blog/2012/02/why-we-created-julia/) 中解释了为什么要创造 Julia。
86 | 他们说^[译者注:这段话的翻译参考了 InfoQ 的文章 “再见 Python,你好 Julia!”。]:
87 |
88 | > 我们很贪婪:我们想要更多。
89 | > 我们想要一门采用自由许可证的开源语言。
90 | > 我们想要 C 的性能和 Ruby 的动态特性。
91 | > 我们想要一门具有同像性(homoiconic)的语言,它既拥有 Lisp 那样真正的宏, 但又具有 Matlab 那样明显又熟悉的数学运算符。
92 | > 我们希望这门语言可以像 Python 一样用于常规编程,像 R 一样容易地用于统计领域,像 Perl 一样自然地处理字符串,像 Matlab 一样拥有强大的线性代数系统,像 Shell 一样能够擅长组合程序。
93 | > 这门语言要简单易学,但又能打动最认真的极客。
94 | > 我们希望它可交互,同时希望它是编译的。
95 |
96 |
97 | 大多数用户都被 Julia 的 **优越速度** 所吸引。
98 | 毕竟,Julia 可是著名独家俱乐部 petaflop 的成员。
99 | [**petaflop 俱乐部**](https://www.hpcwire.com/off-the-wire/julia-joins-petaflop-club/) 的组成成员都是一些峰值运算速度超过 **千万亿次每秒** 的编程语言。
100 | 现在只有 C,C++,Fortran,和 Julia 属于 [petaflop 俱乐部](https://www.nextplatform.com/2017/11/28/julia-language-delivers-petascale-hpc-performance/)。
101 |
102 | 但是,速度不是 Julia 的全部。
103 | Julia 的一些特性还包括**易用性**、 **Unicode 支持** 和 **代码共享的便捷性**。
104 | 本节将讨论这些所有的特性,不过目前先来关注 Julia 的代码共享特性。
105 |
106 | Julia 软件包的生态非常独特。
107 | 它不仅允许共享代码,也允许共享用户自定义的类型。
108 | 例如,Python 的 `pandas` 使用自带的 `Datetime` 类型来处理日期。
109 | 同时, R tidyverse 的 `lubridate` 包也使用自定义的 `datetime` 类型来处理日期。
110 | Julia 不需要上述任何一种类型, 因为其标准库已准备好了所有的日期工具。
111 | 这意味其他包不需要担心日期处理。
112 | 其他包仅需要为 Julia `DateTime` 类型扩展新功能,即定义新函数但不需要定义新类型。
113 | Julia `Dates` 模块可以实现许多令人惊叹的功能,但目前讨论它有些超前。
114 | 于是让我们来讨论一些 Julia 的其他特性。
115 |
116 | ### Julia VS 其他编程语言
117 |
118 | [@fig:language_comparison] 给出了非常个性化的分类,它将主流的开源科学计算编程语言分在一张 2x2 图中, 该图具有两个轴:
119 | **Slow-Fast(慢-快)** 和 **Easy-Hard(简单-困难)**。
120 | 我们省略了闭源语言,因为允许其他人运行你的代码以及检查源代码中的问题会具有许多好处。
121 |
122 | 我们把 C++ 和 FORTRAN 放在 困难-快 象限。
123 | 作为需要编译、类型检查和其他专业管理的静态语言,它们真的很难学习,原型代码也编写很缓慢。
124 | 好处是它们都是 **非常快的** 语言。
125 |
126 | R 和 Python 放在 简单-慢 象限。
127 | 它们是不需要编译的动态语言,在运行时执行。
128 | 因此,它们很容易学习,能够快速创建原型代码。
129 | 当然,这会导致共同的缺点:
130 | 它们都是 **非常慢的** 语言。
131 |
132 | Julia 是唯一一门在 简单-快 象限的语言。
133 | 我们不知道有哪门严肃的语言会想要既困难又缓慢,所以 困难-慢 象限为空。
134 |
135 | ![科学计算语言对比:FORTRAN、C++、Python、R 和 Julia 的 logo。](images/language_comparisons.png){#fig:language_comparison}
136 |
137 | **Julia 很快! 特别快!**
138 | 它起初就为速度而设计。
139 | 而这通过多重派发实现。
140 | 基本上,这个想法能够生成非常高效的 LLVM[^LLVM] 代码。
141 | LLVM 代码,也称为 LLVM 指令,它非常靠近底层,即非常接近计算机执行的实际操作。
142 | 所以,本质上, Julia 会将你可读性好的手写代码转换为 LLVM 机器码。虽然 LLVM 机器码对于人类来说很难阅读,但对于计算机来说很容易。
143 | 例如,如果你定义了一个接收单个参数的函数并向该函数传递整数,然后 Julia 会创建一个 **专门的** `MethodInstance`。
144 | 下次你再向该函数传递整数时,Julia 将会查找之前创建的 `MethodInstance`,并引用其执行操作。
145 | 一个**很棒的** 技巧是,可以在调用函数的嵌套函数中使用它。
146 | 例如,如果向函数 `f` 传递了某些数据类型,而 `f` 又调用了函数 `g`,同时传递给 `g` 的数据类型都是相同且已知的,那么生成的函数 `g` 就会硬编码到 `f` 中!
147 | 这意味着 Julia 不再需要查找 `MethodInstances`,此时代码就会运行地非常快。
148 | 此处需要权衡的是,在某些情况下,早期关于硬编码 `MethodInstances` 的假设可能是无效的。
149 | 然后需要重新创建硬编码的 `MethodInstances`。
150 | 因此,权衡也需包括花时间推断哪些能够硬编码,而哪些不能。
151 | 这也解释了为什么 Julia 代码在第一次执行前通常要花费较长的时间:
152 | Julia 在背后优化代码。
153 |
154 | 编译器接着做它最擅长的事情:优化机器码^[如果你想了解更多关于 Julia 如何设计的内容,你绝对需要看 @bezanson2017julia 。]。
155 | 你可以在 Julia 网站上找到 Julia 和其他语言的 [benchmarks](https://julialang.org/benchmarks/) 。
156 | @fig:benchmarks 取自于 [Julia 网站的 benchmarks 节^[请注意上述的 Julia 结果不包含编译时间。]](https://julialang.org/benchmarks/)。
157 | 如你所见, Julia 是**相当** 快的。
158 |
159 | ![Julia 与其他语言的对比:Benchmarks。](images/benchmarks.png){#fig:benchmarks}
160 |
161 | 我们非常信任 Julia。
162 | 否则,我们不会写这本书。
163 | 我们认为, Julia 是 **科学计算和科学数据分析的未来**。
164 | 它使得用户可以通过简单的语法开发快速且强大的代码。
165 | 研究人员通常使用一种简单但缓慢的语言开发原型代码。
166 | 一旦确定代码正常运行且实现其目标,然后就会开始将当前的代码转换为一门快速但困难的编程语言。
167 | 这就是“两语言问题”,接下来将讨论它。
168 |
169 | ### 两语言问题 {#sec:two_language}
170 |
171 | “两语言问题” 是科学计算中的典型问题。通常研究人员想要设计一种算法或方案来解决手头的问题或分析。
172 | 一般地,解决方案的原型代码都采用容易编程的语言(像 Python 或 R)。
173 | 如果原型能够正常工作,那么研究人员就会使用不易编写原型但快速的语言(C++ 或 FORTRAN)重新实现。
174 | 因此,开发解决方案的过程涉及了两种语言。
175 | 一种语言易于编写原型代码,但并不适合方案实现 (通常由于缓慢的速度)。
176 | 而另一种语言并不易于编写原型代码,但由于非常快,所以适合方案实现。
177 | Julia 能够避免此类情形,因为 **开发原型(易编程)和方案实现(速度快)将采用相同的语言**。
178 |
179 | 另外, Julia 允许使用 **Unicode 字符作为变量或参数**。
180 | 这意味着不再使用 `sigma` 或 `sigma_i`,而是像数学记号那样使用 $σ$ 或 $σᵢ$ 。
181 | 当查看算法代码或数学方程时,你会看到几乎相同的符号和术语。
182 | 我们将这种强大的特性称为 **“代码和数学关系的一对一”**。
183 |
184 | 我们认为,Alan Edelman,Julia 创始人之一,在一次[TEDx Talk](https://youtu.be/qGW0GT1rCvs) [@tedxtalksProgrammingLanguageHeal2020] 中对 “两语言问题” 和 “代码和数学关系的一对一” 作出了最好的描述。
185 |
186 |
187 |
188 | ### 多重派发 {#sec:multiple_dispatch}
189 |
190 | 多重派发(multiple dispatch)是一种强大的特性,它使得能够扩展现有的函数或为新类型自定义复杂行为。
191 | 假设想要定义两种 `struct` 来表示不同的动物:
192 |
193 | ```jl
194 | s = """
195 | abstract type Animal end
196 | struct Fox <: Animal
197 | weight::Float64
198 | end
199 | struct Chicken <: Animal
200 | weight::Float64
201 | end
202 | """
203 | sc(s)
204 | ```
205 |
206 | 这表明此处定义了动物类型 `Fox` 和 `Chicken`。
207 | 然后生成名为 Fiona 的 `Fox` 和名为 Big Bird 的 `Chicken`。
208 |
209 | ```jl
210 | s = """
211 | fiona = Fox(4.2)
212 | big_bird = Chicken(2.9)
213 | """
214 | sc(s)
215 | ```
216 |
217 | 为了知道他们的重量之和,编写如下的函数:
218 |
219 | ```jl
220 | sco("combined_weight(A1::Animal, A2::Animal) = A1.weight + A2.weight")
221 | ```
222 |
223 | 然后还想知道它们能否相处得好。
224 | 采用条件语句实现:
225 |
226 | ```jl
227 | s = """
228 | function naive_trouble(A::Animal, B::Animal)
229 | if A isa Fox && B isa Chicken
230 | return true
231 | elseif A isa Chicken && B isa Fox
232 | return true
233 | elseif A isa Chicken && B isa Chicken
234 | return false
235 | end
236 | end
237 | """
238 | sco(s)
239 | ```
240 |
241 | 现在,看看 Fiona 和 Big Bird 待在一起是否会产生麻烦:
242 |
243 | ```jl
244 | scob("naive_trouble(fiona, big_bird)")
245 | ```
246 |
247 | 好的,看起来不错。
248 | 编写 `naive_trouble` 函数已经足够简单了。然而,使用多重派发编写 `trouble` 函数还可以带来新的优势。按照如下方式创建函数:
249 |
250 | ```jl
251 | s = """
252 | trouble(F::Fox, C::Chicken) = true
253 | trouble(C::Chicken, F::Fox) = true
254 | trouble(C1::Chicken, C2::Chicken) = false
255 | """
256 | sco(s)
257 | ```
258 |
259 | 定义这些方法后,`trouble` 会得到与 `naive_trouble` 相同的结果。
260 | 例如:
261 |
262 | ```jl
263 | scob("trouble(fiona, big_bird)")
264 | ```
265 |
266 | 把 Big Bird 和另外一只小鸡 Dora 放在一起也是可以的。
267 |
268 | ```jl
269 | s = """
270 | dora = Chicken(2.2)
271 | trouble(dora, big_bird)
272 | """
273 | scob(s)
274 | ```
275 |
276 | 所以在本例中,多重派发的优势就是可以仅声明类型,然后由 Julia 去为类型找到正确的函数方法。
277 | 若是在嵌套函数中使用多重派发则更是如此,Julia 编译器实际上会自动优化函数调用。
278 | 例如,函数如下:
279 |
280 |
281 | ```
282 | function trouble(A::Fox, B::Chicken, C::Chicken)
283 | return trouble(A, B) || trouble(B, C) || trouble(C, A)
284 | end
285 | ```
286 |
287 | 根据上下文,Julia 会将其优化为:
288 |
289 | ```
290 | function trouble(A::Fox, B::Chicken, C::Chicken)
291 | return true || false || true
292 | end
293 | ```
294 |
295 | 因为编译器 **知道** `A` 是 `Fox`, `B` 是 `Chicken` ,所以方法替换为 `trouble(F::Fox, C::Chicken)`。
296 | `trouble(C1::Chicken, C2::Chicken)` 同理。
297 | 然后,编译器进一步优化:
298 |
299 | ```
300 | function trouble(A::Fox, B::Chicken, C::Chicken)
301 | return true
302 | end
303 | ```
304 |
305 | 此外,多重派发还使比较已存在的动物和新的动物 Zebra 成为可能。
306 | 可以在其他包中定义 Zebra :
307 |
308 | ```jl
309 | s = """
310 | struct Zebra <: Animal
311 | weight::Float64
312 | end
313 | """
314 | sc(s)
315 | ```
316 |
317 | 然后定义与现有动物的交互:
318 |
319 | ```jl
320 | s = """
321 | trouble(F::Fox, Z::Zebra) = false
322 | trouble(Z::Zebra, F::Fox) = false
323 | trouble(C::Chicken, Z::Zebra) = false
324 | trouble(Z::Zebra, C::Chicken) = false
325 | """
326 | sco(s)
327 | ```
328 |
329 | 现在可查看 Marty(Zebra 动物)是否能与 Big Bird 和谐相处:
330 |
331 | ```jl
332 | s = """
333 | marty = Zebra(412)
334 | trouble(big_bird, marty)
335 | """
336 | scob(s)
337 | ```
338 |
339 | 更好的是,**不需额外定义任何函数即可计算 Zebra 和其他动物的重量之和**:
340 |
341 | ```jl
342 | scob("combined_weight(big_bird, marty)")
343 | ```
344 |
345 | 因此,总而言之,即使在编写代码时只考虑了 `Fox` 和 `Chicken`,但它也能用于 **从未见过的** 类型!
346 | 在实践中,这意味着重用其他 Julia 项目的代码会非常容易。
347 |
348 | 如果你和我们一样对多重派发感到兴奋,那么可以了解下面这些深入的例子。
349 | 第一个例子是,@storopoli2021bayesianjulia 关于 [ one-hot 向量的快速而优雅的实现](https://storopoli.io/Bayesian-Julia/pages/1_why_Julia/#example_one-hot_vector) 。
350 | 第二个例子是,[Tanmay Bakshi YouTube 频道](https://youtu.be/moyPIhvw4Nk?t=2107) 对 [Christopher Rackauckas](https://www.chrisrackauckas.com/) 的采访 (查看时间 35:07 ) [@tanmaybakshiBakingKnowledgeMachine2021]。
351 | Chris 提到, 在他开发和维护 [`DifferentialEquations.jl`](https://diffeq.sciml.ai/dev/) 包时,一名用户报告问题说:他基于 GPU 构造的 ODE 求解器并不能正常工作。
352 | Chris 对这个请求感到非常惊讶,因为他从来没有期望能够将 GPU 计算与求解边界值问题结合起来。
353 | 他甚至更惊讶地发现,用户犯了一个小错误,但一切正常。
354 | 这些大多数优点都来自于多重派发和高可用的代码 / 类型共享。
355 |
356 | 总的来说,我们认为多重派发的最好解释来自于 Julia 创始人
357 | [Stefan Karpinski 在 JuliaCon 2019 的演讲](https://youtu.be/kc9HwsxE1OY)。
358 |
359 |
360 |
361 |
362 | ## Julia 应用案例 {#sec:julia_wild}
363 |
364 | @sec:julia_accomplish 解释了为什么我们认为 Julia 是门如此独一无二的编程语言。
365 | 我们在上节展示了一些 Julia 特性的简单例子。
366 | 如果想要深入了解 Julia 的使用, 下面介绍一些 **有趣的案例**:
367 |
368 | 1. NASA 使用 Julia 在超级计算机上分析了 ["迄今为止发现的最大一批地球尺寸的行星"](https://exoplanets.nasa.gov/news/1669/seven-rocky-trappist-1-planets-may-be-made-of-similar-stuff/) ,并且实现了惊人的 **1,000 倍加速**,在 15 分钟内分类了1.88 亿个天体。
369 | 2. [气候建模联盟(Climate Modeling Alliance,CliMa)](https://clima.caltech.edu/) **在 GPU 和 CPU 上模拟天气**。
370 | 该项目启动于 2018 年,与加州理工大学、 NASA 喷气推进实验室以及海军研究生院的研究人员合作,CliMa 项目组采用最近的计算科学进展来开发一个地球系统模型,该模型能够以前所未有的精度和速度预测干旱、热浪和降雨。
371 | 3. [美国联邦航空管理局 (FAA) 正在使用 Julia 开发一种 **空中防碰撞系统 (ACAS-X)** ](https://youtu.be/19zm1Fn0S9M)。
372 | 这也是一个“两语言问题” 的好例子(查看 @sec:julia_accomplish)。
373 | 之前的方案是使用 Matlab 开发算法 并使用 C++ 编写高性能实现。
374 | 现在,FAA 使用 Julia 语言完成所有的事。
375 | 4. [使用 Julia 在 GPU 上 **175 倍加速** 辉瑞的药理学模型](https://juliacomputing.com/case-studies/pfizer/)。
376 | 这是一份第11届美国定量药理学会议的[海报](https://chrisrackauckas.com/assets/Posters/ACoP11_Poster_Abstracts_2020.pdf),它还获得了 [ quality award](https://web.archive.org/web/20210121164011/https://www.go-acop.org/abstract-awards)。
377 | 5. [巴西卫星亚马逊 1 号的姿态和轨道控制子系统 (AOCS) **100% 使用 Julia 编写**](https://discourse.julialang.org/t/julia-and-the-satellite-amazonia-1/57541) ,它的作者是 Ronan Arraes Jardim Chagas (<https://ronanarraes.com/>)。
378 | 6. [巴西国家发展银行 (BNDES) 放弃了付费解决方案,转而选择开源 Julia 模型并获得 **10 倍加速**。](https://youtu.be/NY0HcGqHj3g)
379 |
380 | 如果觉得这些仍不够,[Julia 计算网站](https://juliacomputing.com/case-studies/) 上还有更多的例子。
381 |
382 | [^readable]: 没有调用 C++ 或 FORTRAN API。
383 | [^LLVM]: LLVM 是 **L**ow **L**evel **V**irtual **M**achine 的缩写,你可以在 LLVM 网站(<http://llvm.org>)找到更多信息。
384 |
385 |
--------------------------------------------------------------------------------
/src/stats.jl:
--------------------------------------------------------------------------------
1 | function statistics_graph()
2 |     # Build and return the diagram figure: two `ellipse` outlines linked by a curve and its mirror image.
3 |     u = LinRange(0, 2π, 72)
4 |     # arrow lines
5 |     x0 = [1, 0] # start point
6 |     x1 = [0.75, -5] # end point
7 |     t0 = [2, 1.0] # starting tangent vector
8 |     t1 = [-2, 1.0] # end tangent vector
9 |     curve = BezierCurve(x0, x1, t0, t1)
10 |     T = range(0, 1; length=100)
11 |     points = [curve(t) for t in T]
12 |     points = reduce(hcat, points)' # stack without splatting; column 1 is used as x, column 2 as y
13 |     # the first `lines` call creates the figure and axis; the later calls pass no axis argument
14 |     fig, ax, = lines(ellipse.(u);
15 |         figure=(; resolution=(600, 400)),
16 |         axis=(; aspect=1))
17 |     lines!(ellipse.(u; a=1.5, b=3, k=-5))
18 |     lines!(points[:, 1], points[:, 2])
19 |     lines!(-points[:, 1], points[:, 2]) # same curve with x negated
20 |     arrows!([points[end-5, 1]], [points[end-5, 2]],
21 |         [-0.1], [0], arrowsize=20, lengthscale=0.2)
22 |     arrows!([-points[5, 1]], [points[5, 2]],
23 |         [0.1], [0], arrowsize=20, lengthscale=0.2)
24 |
25 |     text!("Data\nGenerating\nProcess", position=(0, 0),
26 |         align=(:center, :center), textsize=24)
27 |     text!("Observed\nData", position=(0, -5),
28 |         align=(:center, :center), textsize=24)
29 |     text!("Inference", position=(-1.2, -2.5), textsize=24)
30 |     text!("Probability", position=(0.65, -2.5), textsize=24,
31 |         align=(:center, :center))
32 |     hidedecorations!(ax)
33 |     hidespines!(ax)
34 |     return fig
35 | end
36 |
37 | function more_grades()
38 |     # Four extra (name, grade) rows, appended below the table returned by `all_grades()`.
39 |     extra = DataFrame(; name=["Bob", "Sally", "Hank", "Alice"], grade=[6.5, 7.0, 6.0, 5.5])
40 |     return vcat(all_grades(), extra)
41 | end
42 |
43 | function normal_dist(mean, std; seed=123)
44 |     seed!(seed) # seed before sampling so repeated calls with the same `seed` start from the same state
45 |     dist = Distributions.Normal(mean, std)
46 |     return (dist, rand(dist, 1_000)) # the distribution object and 1_000 draws from it
47 | end
48 |
49 | function lognormal_dist(mean, std; seed=123)
50 |     seed!(seed) # seed before sampling
51 |     dist = LogNormal(log(mean), log(std)) # both arguments are log-transformed before being passed on
52 |     return (dist, rand(dist, 1_000)) # the distribution object and 1_000 draws from it
53 | end
54 |
55 |
56 | dens(ax, rand_d, color) = density!(ax, rand_d; color, strokewidth=1.5, strokecolor=(:black, 0.5)) # density of `rand_d` on `ax` with a fixed outline style
57 |
58 | function plot_central() # two stacked density panels, each marked at the mean, median and mode; returns the Figure
59 | CairoMakie.activate!() # hide
60 | fig = Figure(; resolution=(600, 400))
61 | ax1 = Axis(fig[1, 1]; limits=((3, 20), nothing))
62 | ax2 = Axis(fig[2, 1]; limits=((3, 20), nothing))
63 | d1, rand_d1 = normal_dist(10, 1) # distribution object + sample for the top panel
64 | d2, rand_d2 = lognormal_dist(10, 1.5) # distribution object + sample for the bottom panel
65 | dens(ax1, rand_d1, (:silver, 0.15))
66 | dens(ax2, rand_d2, (:grey, 0.25))
67 | # each marker's ymax is pdf(marker) / ylim_axN; ylim_axN is a hand-picked constant (presumably the y-axis top -- confirm)
68 | ylim_ax1 = 0.38
69 | vlines!(
70 | ax1,
71 | mean(d1);
72 | ymax=Distributions.pdf(d1, mean(d1)) / ylim_ax1,
73 | color=:dodgerblue,
74 | linewidth=3,
75 | linestyle=:solid,
76 | label="mean",
77 | )
78 | vlines!(
79 | ax1,
80 | median(d1);
81 | ymax=Distributions.pdf(d1, median(d1)) / ylim_ax1,
82 | color=:red,
83 | linewidth=3,
84 | linestyle=:dot,
85 | label="median",
86 | )
87 | vlines!(
88 | ax1,
89 | mode(d1);
90 | ymax=Distributions.pdf(d1, mode(d1)) / ylim_ax1,
91 | color="black",
92 | linewidth=3,
93 | linestyle=:dashdot,
94 | label="mode",
95 | )
96 | # same three markers for the bottom panel, scaled by ylim_ax2
97 | ylim_ax2 = 0.105
98 | vlines!(
99 | ax2,
100 | mean(d2);
101 | ymax=Distributions.pdf(d2, mean(d2)) / ylim_ax2,
102 | color=:dodgerblue,
103 | linewidth=3,
104 | linestyle=:solid,
105 | label="mean",
106 | )
107 | vlines!(
108 | ax2,
109 | median(d2);
110 | ymax=Distributions.pdf(d2, median(d2)) / ylim_ax2,
111 | color="red",
112 | linewidth=3,
113 | linestyle=:dot,
114 | label="median",
115 | )
116 | vlines!(
117 | ax2,
118 | mode(d2);
119 | ymax=Distributions.pdf(d2, mode(d2)) / ylim_ax2,
120 | color="black",
121 | linewidth=3,
122 | linestyle=:dashdot,
123 | label="mode",
124 | )
125 | #ylims!(ax2, 0, ylim_ax2)
126 | #ylims!(ax1, 0, ylim_ax1)
127 |
128 | hidexdecorations!(ax1; grid=false, ticks=false)
129 | #hideydecorations!(ax1; grid = false)
130 | #hideydecorations!(ax2; grid = false)
131 | #fig[1:2, 2] = Legend(fig, ax2)
132 | axislegend(ax1, position=:rt) # the legend is attached to ax1 only
133 | rowgap!(fig.layout, 8)
134 | return fig
135 | end
136 |
137 | function plot_dispersion_std() # two stacked density panels marked at the mean and at mean ± one std; returns the Figure
138 |     CairoMakie.activate!() # hide
139 |     fig = Figure(; resolution=(600, 400))
140 |     ax1 = Axis(fig[1, 1]; limits=((3, 20), nothing))
141 |     ax2 = Axis(fig[2, 1]; limits=((3, 20), nothing))
142 |     d1, rand_d1 = normal_dist(10, 1)
143 |     d2, rand_d2 = lognormal_dist(10, 1.5)
144 |     dens(ax1, rand_d1, (:silver, 0.15))
145 |     dens(ax2, rand_d2, (:grey, 0.25))
146 |     # each marker's ymax is pdf(marker) / ylim_axN; ylim_axN is a hand-picked constant
147 |     ylim_ax1 = 0.38
148 |     vlines!(
149 |         ax1,
150 |         mean(d1);
151 |         ymax=Distributions.pdf(d1, mean(d1)) / ylim_ax1,
152 |         color=:dodgerblue,
153 |         linewidth=3,
154 |         linestyle=:solid,
155 |         label=L"\mu",
156 |     )
157 |     msd = [mean(d1) - std(d1), mean(d1) + std(d1)]
158 |     vlines!(
159 |         ax1,
160 |         msd;
161 |         ymax=Distributions.pdf.(d1, msd) ./ ylim_ax1, # broadcast: the array method of pdf is deprecated
162 |         color=:red,
163 |         linewidth=3,
164 |         linestyle=:dot,
165 |         label=L"1 \cdot \sigma",
166 |     )
167 |     msd2 = [mean(d2) - std(d2), mean(d2) + std(d2)] # same band for the bottom panel, built once
168 |     ylim_ax2 = 0.105
169 |     vlines!(
170 |         ax2,
171 |         mean(d2);
172 |         ymax=Distributions.pdf(d2, mean(d2)) / ylim_ax2,
173 |         color=:dodgerblue,
174 |         linewidth=3,
175 |         linestyle=:solid,
176 |         label=L"\mu",
177 |     )
178 |     vlines!(
179 |         ax2,
180 |         msd2;
181 |         ymax=Distributions.pdf.(d2, msd2) ./ ylim_ax2,
182 |         color=:red,
183 |         linewidth=3,
184 |         linestyle=:dot,
185 |         label=L"1 \cdot \sigma",
186 |     )
187 |     #ylims!(ax2, 0, ylim_ax2)
188 |     #ylims!(ax1, 0, ylim_ax1)
189 |
190 |     hidexdecorations!(ax1; grid=false, ticks=false)
191 |     #hideydecorations!(ax1; grid = false)
192 |     #hideydecorations!(ax2; grid = false)
193 |     #fig[1:2, 2] = Legend(fig, ax2)
194 |     axislegend(ax1, position=:rt) # the legend is attached to ax1 only
195 |     rowgap!(fig.layout, 8)
196 |     return fig
197 | end
198 |
199 | function plot_dispersion_mad() # two stacked density panels marked at the median and at median ± mad(sample); returns the Figure
200 |     CairoMakie.activate!() # hide
201 |     fig = Figure(; resolution=(600, 400))
202 |     ax1 = Axis(fig[1, 1]; limits=((3, 20), nothing))
203 |     ax2 = Axis(fig[2, 1]; limits=((3, 20), nothing))
204 |     d1, rand_d1 = normal_dist(10, 1)
205 |     d2, rand_d2 = lognormal_dist(10, 1.5)
206 |     dens(ax1, rand_d1, (:silver, 0.15))
207 |     dens(ax2, rand_d2, (:grey, 0.25))
208 |     band1 = [median(d1) - mad(rand_d1), median(d1) + mad(rand_d1)] # median of the distribution, MAD of the sample
209 |     ylim_ax1 = 0.38
210 |     vlines!(
211 |         ax1,
212 |         median(d1);
213 |         ymax=Distributions.pdf(d1, median(d1)) / ylim_ax1,
214 |         color=:dodgerblue,
215 |         linewidth=3,
216 |         linestyle=:solid,
217 |         label="median",
218 |     )
219 |     vlines!(
220 |         ax1,
221 |         band1;
222 |         ymax=Distributions.pdf.(d1, band1) ./ ylim_ax1, # broadcast: the array method of pdf is deprecated
223 |         color=:red,
224 |         linewidth=3,
225 |         linestyle=:dot,
226 |         label=L"1 \cdot MAD",
227 |     )
228 |     band2 = [median(d2) - mad(rand_d2), median(d2) + mad(rand_d2)] # same band for the bottom panel
229 |     ylim_ax2 = 0.105
230 |     vlines!(
231 |         ax2,
232 |         median(d2);
233 |         ymax=Distributions.pdf(d2, median(d2)) / ylim_ax2,
234 |         color=:dodgerblue,
235 |         linewidth=3,
236 |         linestyle=:solid,
237 |         label="median",
238 |     )
239 |     vlines!(
240 |         ax2,
241 |         band2;
242 |         ymax=Distributions.pdf.(d2, band2) ./ ylim_ax2,
243 |         color=:red,
244 |         linewidth=3,
245 |         linestyle=:dot,
246 |         label=L"1 \cdot MAD",
247 |     )
248 |     #ylims!(ax2, 0, ylim_ax2)
249 |     #ylims!(ax1, 0, ylim_ax1)
250 |
251 |     hidexdecorations!(ax1; grid=false, ticks=false)
252 |     #hideydecorations!(ax1; grid = false)
253 |     #hideydecorations!(ax2; grid = false)
254 |     #fig[1:2, 2] = Legend(fig, ax2)
255 |     axislegend(ax1, position=:rt) # the legend is attached to ax1 only
256 |     rowgap!(fig.layout, 8)
257 |     return fig
258 | end
259 |
260 | function plot_dispersion_iqr() # two stacked density panels marked at the median, Q1 and Q3, with the Q1..Q3 span shaded; returns the Figure
261 | CairoMakie.activate!() # hide
262 | fig = Figure(; resolution=(600, 400))
263 | ax1 = Axis(fig[1, 1]; limits=((3, 20), nothing))
264 | ax2 = Axis(fig[2, 1]; limits=((3, 20), nothing))
265 | d1, rand_d1 = normal_dist(10, 1) # distribution object + sample for the top panel
266 | d2, rand_d2 = lognormal_dist(10, 1.5) # distribution object + sample for the bottom panel
267 | dens(ax1, rand_d1, (:silver, 0.15))
268 | dens(ax2, rand_d2, (:grey, 0.25))
269 | # each marker's ymax is pdf(marker) / ylim_axN; ylim_axN is a hand-picked constant (presumably the y-axis top -- confirm)
270 | ylim_ax1 = 0.38
271 | vlines!(
272 | ax1,
273 | median(d1);
274 | ymax=Distributions.pdf(d1, median(d1)) / ylim_ax1,
275 | color=:dodgerblue,
276 | linewidth=3,
277 | linestyle=:solid,
278 | label="median",
279 | )
280 | vlines!(
281 | ax1,
282 | quantile(d1, 0.25);
283 | ymax=Distributions.pdf(d1, quantile(d1, 0.25)) / ylim_ax1,
284 | color=:red,
285 | linewidth=3,
286 | linestyle=:dot,
287 | label="Q1",
288 | )
289 | vlines!(
290 | ax1,
291 | quantile(d1, 0.75);
292 | ymax=Distributions.pdf(d1, quantile(d1, 0.75)) / ylim_ax1,
293 | color="black",
294 | linewidth=3,
295 | linestyle=:dashdot,
296 | label="Q3",
297 | )
298 | vspan!(
299 | ax1,
300 | quantile(d1, 0.25),
301 | quantile(d1, 0.75);
302 | color=(:green, 0.3),
303 | #linewidth=3,
304 | #linestyle = :dashdot,
305 | label="IQR",
306 | )
307 | # same markers for the bottom panel; only the median line is given an explicit ymax here
308 | ylim_ax2 = 0.105
309 | vlines!(
310 | ax2,
311 | median(d2);
312 | ymax=Distributions.pdf(d2, median(d2)) / ylim_ax2,
313 | color=:dodgerblue,
314 | linewidth=3,
315 | linestyle=:solid,
316 | label="median",
317 | )
318 | vlines!(
319 | ax2,
320 | quantile(d2, 0.25);
321 | #ymax=Distributions.pdf(d2, quantile(d2, 0.25)) / ylim_ax1, # NOTE(review): disabled, so no ymax is passed; it divides by ylim_ax1 although this is ax2 -- likely should be ylim_ax2 if re-enabled
322 | color="red",
323 | linewidth=3,
324 | linestyle=:dot,
325 | label="Q1",
326 | )
327 | vlines!(
328 | ax2,
329 | quantile(d2, 0.75);
330 | #ymax=Distributions.pdf(d2, quantile(d2, 0.75)) / ylim_ax1, # NOTE(review): same as the Q1 line above -- disabled and scaled by ylim_ax1
331 | color="black",
332 | linewidth=3,
333 | linestyle=:dashdot,
334 | label="Q3",
335 | )
336 | vspan!(
337 | ax2,
338 | quantile(d2, 0.25),
339 | quantile(d2, 0.75);
340 | color=(:green, 0.3),
341 | #linewidth=3,
342 | #linestyle = :dashdot,
343 | label="IQR",
344 | )
345 | #ylims!(ax2, 0, ylim_ax2)
346 | #ylims!(ax1, 0, ylim_ax1)
347 |
348 | hidexdecorations!(ax1; grid=false, ticks=false)
349 | #hideydecorations!(ax1; grid = false)
350 | #hideydecorations!(ax2; grid = false)
351 | #fig[1:2, 2] = Legend(fig, ax2)
352 | axislegend(ax1, position=:rt) # the legend is attached to ax1 only
353 | rowgap!(fig.layout, 8)
354 | return fig
355 | end
356 |
357 | function plot_corr() # five scatter panels (correlations ±0.5, 0, ±0.8), each with a dashed reference line; returns the Figure
358 |     seed!(123)
359 |     CairoMakie.activate!() # hide
360 |     fig = Figure(; resolution=(600, 600))
361 |     corrs = [0.5, -0.5, 0.8, -0.8]
362 |     ds = [MvNormal([1 i; i 1]) for i in corrs] # one distribution per entry of `corrs`, built from a 2x2 matrix with `i` off the diagonal
363 |     d0 = MvNormal([1.0 0.0; 0.0 1.0]) # uncorrelated case, same constructor as `ds`; replaces the deprecated MvNormal(2, 1)
364 |     ax1 = Axis(
365 |         fig[1, 1:2]; title="Correlation = $(corrs[1])", titlesize=20, limits=((-2, 2), (-2, 2)),
366 |     )
367 |     ax2 = Axis(
368 |         fig[1, 3:4]; title="Correlation = $(corrs[2])", titlesize=20, limits=((-2, 2), (-2, 2)),
369 |     )
370 |     ax3 = Axis(
371 |         fig[2, 2:3]; title="Correlation = 0", titlesize=20, limits=((-2, 2), (-2, 2)),
372 |     )
373 |     ax4 = Axis(
374 |         fig[3, 1:2]; title="Correlation = $(corrs[3])", titlesize=20, limits=((-2, 2), (-2, 2)),
375 |     )
376 |     ax5 = Axis(
377 |         fig[3, 3:4]; title="Correlation = $(corrs[4])", titlesize=20, limits=((-2, 2), (-2, 2)),
378 |     )
379 |     scatter!(ax1, rand(ds[1], 50)'; marker=:circle, color=:dodgerblue) # 50 draws, transposed before plotting
380 |     scatter!(ax2, rand(ds[2], 50)'; marker=:circle, color=:dodgerblue)
381 |     scatter!(ax3, rand(d0, 50)'; marker=:circle, color=:dodgerblue)
382 |     scatter!(ax4, rand(ds[3], 50)'; marker=:circle, color=:dodgerblue)
383 |     scatter!(ax5, rand(ds[4], 50)'; marker=:circle, color=:dodgerblue)
384 |     abline!(ax1, 0, corrs[1]; linewidth=2, linestyle=:dash, color=:red) # arguments 0 and the panel's correlation
385 |     abline!(ax2, 0, corrs[2]; linewidth=2, linestyle=:dash, color=:red)
386 |     abline!(ax3, 0, 0; linewidth=2, linestyle=:dash, color=:red)
387 |     abline!(ax4, 0, corrs[3]; linewidth=2, linestyle=:dash, color=:red)
388 |     abline!(ax5, 0, corrs[4]; linewidth=2, linestyle=:dash, color=:red)
389 |     rowgap!(fig.layout, 8)
390 |     hidexdecorations!(ax1; grid=false, ticks=false)
391 |     hidexdecorations!(ax2; grid=false, ticks=false)
392 |     hideydecorations!(ax2; grid=false, ticks=false)
393 |     hideydecorations!(ax5; grid=false, ticks=false)
394 |     #hidedecorations!(ax3; grid=false, ticks=false)
395 |     colgap!(fig.layout, 8)
396 |     return fig
397 | end
398 |
"""
    plot_normal_lognormal()

Overlay two density estimates on one axis (x-limits 3 to 20): the second value
returned by `normal_dist(10, 1)`, labelled "normal", and the second value
returned by `lognormal_dist(10, 1.3)`, labelled "non-normal". A legend is placed
at the top right and the y decorations are hidden, except that the grid stays.

Returns the `Figure`.
"""
function plot_normal_lognormal()
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 400))
    ax = Axis(fig[1, 1]; limits=((3, 20), nothing))

    # Only the second element of each helper's result is plotted.
    _, normal_sample = normal_dist(10, 1)
    _, lognormal_sample = lognormal_dist(10, 1.3)

    # Outline styling shared by both densities.
    outline = (strokewidth=1.5, strokecolor=(:black, 0.5))
    density!(ax, normal_sample; color=(:dodgerblue, 0.15), outline..., label="normal")
    density!(ax, lognormal_sample; color=(:red, 0.15), outline..., label="non-normal")

    axislegend(ax, position=:rt)
    hideydecorations!(ax; grid=false, ticks=true)
    return fig
end
411 |
"""
    plot_discrete_continuous()

Side-by-side comparison of a discrete and a continuous distribution. The left
panel is a pdf-normalised 10-bin histogram of 1000 draws from `Binomial(10, 0.6)`.
The right panel is a density estimate of 1000 draws from `Normal(6, 2)`. All
axis decorations are hidden. The RNG is seeded with 123.

Returns the `Figure`.
"""
function plot_discrete_continuous()
    seed!(123)
    binom = Binomial(10, 0.6)
    gauss = Distributions.Normal(6, 2)
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 400))
    left = Axis(fig[1, 1]; limits=((0.5, 10.5), nothing), title="Discrete", titlesize=20)
    right = Axis(fig[1, 2]; limits=((-1, 13), nothing), title="Continuous", titlesize=20)

    # Outline styling shared by both plots.
    outline = (strokewidth=1.5, strokecolor=(:black, 0.5))

    # Sample the discrete distribution first, then the continuous one, so the
    # seeded RNG stream is consumed in a fixed order.
    hist!(
        left,
        rand(binom, 1_000);
        color=(:dodgerblue, 0.5),
        outline...,
        bins=10,
        normalization=:pdf,
    )
    density!(right, rand(gauss, 1_000); color=(:red, 0.5), outline...)

    for ax in (left, right)
        hidedecorations!(ax)
    end
    return fig
end
426 |
"""
    plot_pmf()

Bar plot of the probability mass function of a fair six-sided die
(`DiscreteUniform(1, 6)`), with one bar per face and the y-axis limited to
`(0, 0.2)`.

Returns the `Figure`.
"""
function plot_pmf()
    dice = DiscreteUniform(1, 6)
    faces = 1:6
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 400))
    ax = Axis(fig[1, 1]; xticks=faces, limits=(nothing, (0, 0.2)), ylabel="pmf")
    # Broadcast `pdf` over the faces: the vectorised method
    # `pdf(d, ::AbstractArray)` is deprecated in Distributions.jl.
    barplot!(
        ax,
        faces,
        Distributions.pdf.(dice, faces);
        color=(:grey, 0.25),
        strokewidth=1.5,
        strokecolor=(:black, 0.5),
    )
    return fig
end
435 |
"""
    plot_pdf()

Plot the standard normal probability density on `[-3, 3]` as a grey filled
band, with the region `[1, 2]` filled again in red on top of it.

Returns the `Figure`.
"""
function plot_pdf()
    d = Distributions.Normal()
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 400))
    ax = Axis(fig[1, 1]; xticks=-3:3, ylabel="pdf")
    # Named `xs`/`xs_highlight` rather than `range`/`subset` so the locals do not
    # shadow `Base.range` or other exported functions of the same name.
    xs = -3:0.01:3.0
    xs_highlight = 1:0.01:2.0
    # NOTE(review): confirm that `band!` honours the stroke attributes in the
    # Makie version in use; they are passed through unchanged here.
    fill_style = (strokewidth=1.5, strokecolor=(:black, 0.5))
    # Each band spans from a zero baseline up to the density. `pdf` is broadcast
    # because the vectorised `pdf(d, ::AbstractArray)` method is deprecated in
    # Distributions.jl.
    band!(
        ax,
        xs,
        fill(0, length(xs)),
        Distributions.pdf.(d, xs);
        color=(:grey, 0.25),
        fill_style...,
    )
    band!(
        ax,
        xs_highlight,
        fill(0, length(xs_highlight)),
        Distributions.pdf.(d, xs_highlight);
        color=(:red, 0.25),
        fill_style...,
    )
    return fig
end
447 |
"""
    plot_cdf(type::AbstractString)

Plot a cumulative distribution function.

- `type == "discrete"`: the CDF of `DiscreteUniform(1, 6)`, drawn as one
  horizontal segment from `i` to `i + 1` for each face `i`.
- `type == "continuous"`: the CDF of the standard normal on `[-3, 3]`.

Any other value of `type` draws nothing, and the empty figure is returned.

Returns the `Figure`.
"""
function plot_cdf(type::AbstractString)
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 400))
    if type == "discrete"
        d = Distributions.DiscreteUniform(1, 6)
        faces = 1:6
        ax = Axis(fig[1, 1]; xticks=faces, limits=((0.8, 6.8), (0, 1.1)), ylabel="cdf")
        for i in faces
            # Flat step: the same CDF value at both ends of [i, i + 1].
            lines!(
                ax,
                i:i+1,
                fill(Distributions.cdf(d, i), 2);
                linewidth=4,
                color=(:black, 0.5),
            )
        end
    elseif type == "continuous"
        d = Distributions.Normal()
        xs = -3:0.01:3.0
        ax = Axis(fig[1, 1]; xticks=-3:3, ylabel="cdf")
        # Broadcast `cdf`: the vectorised `cdf(d, ::AbstractArray)` method is
        # deprecated in Distributions.jl.
        lines!(ax, xs, Distributions.cdf.(d, xs); linewidth=4, color=(:black, 0.5))
    end
    return fig
end
466 |
"""
    calculate_pdf(a, b; d=Distributions.Normal())

Return `cdf(d, b) - cdf(d, a)` rounded to two decimal places, i.e. the
probability that distribution `d` assigns to the interval between `a` and `b`.
`d` defaults to the standard normal.
"""
function calculate_pdf(a, b; d=Distributions.Normal())
    # Qualified like every other `pdf`/`cdf` call in this file, so the call
    # cannot hit a different or ambiguous `cdf` binding.
    return round(Distributions.cdf(d, b) - Distributions.cdf(d, a); digits=2)
end
470 |
"""
    anscombe_quartet(; type="long")

Return Anscombe's quartet as a `DataFrame`.

- `type="long"`: 44 rows with columns `dataset` (1 to 4, 11 rows each), `x`
  and `y`.
- `type="wide"`: 11 rows with columns `x_1, y_1, x_2, y_2, x_3, y_3, x_4, y_4`.
- any other value: returns `nothing`.

Datasets 1 to 3 share the same `x` values; dataset 4 has its own.
"""
function anscombe_quartet(; type="long")
    # x values shared by datasets 1-3, and the separate x values of dataset 4.
    x_shared = [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]
    x_fourth = [8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 19.0, 8.0, 8.0, 8.0]
    # One column per dataset, one row per observation.
    y = [
        8.04 9.14 7.46 6.58
        6.95 8.14 6.77 5.76
        7.58 8.74 12.74 7.71
        8.81 8.77 7.11 8.84
        8.33 9.26 7.81 8.47
        9.96 8.1 8.84 7.04
        7.24 6.13 6.08 5.25
        4.26 3.1 5.39 12.5
        10.84 9.13 8.15 5.56
        4.82 7.26 6.42 7.91
        5.68 4.74 5.73 6.89
    ]

    if type == "long"
        dataset = repeat(1:4; inner=11)
        x = vcat(repeat(x_shared; outer=3), x_fourth)
        # Column-major flattening stacks dataset 1's y values first, then 2, 3, 4.
        return DataFrame(; dataset, x, y=vec(y))
    elseif type == "wide"
        return DataFrame(;
            x_1=x_shared, y_1=y[:, 1],
            x_2=x_shared, y_2=y[:, 2],
            x_3=x_shared, y_3=y[:, 3],
            x_4=x_fourth, y_4=y[:, 4],
        )
    end
    return nothing
end
497 |
"""
    plot_anscombe()

Draw the four datasets of Anscombe's quartet in a 2x2 grid of axes that share
the same limits and ticks. Each panel shows the dataset's points plus a dashed
red reference line with intercept 3 and slope 0.5.

The axes are stored in a 2x2 matrix and visited in linear (column-major) order,
so datasets 1 and 2 fill the left column and datasets 3 and 4 the right column.

Returns the `Figure`.
"""
function plot_anscombe()
    df = anscombe_quartet()
    CairoMakie.activate!() # hide
    fig = Figure(; resolution=(600, 600))
    grid = [
        Axis(fig[r, c]; limits=((3, 20), (2.5, 14)), xticks=4:2:20, yticks=2:14)
        for r in 1:2, c in 1:2
    ]
    for (k, ax) in enumerate(grid)
        # Columns 2:3 of the long-format table are `x` and `y`.
        points = Matrix(df[df.dataset .== k, 2:3])
        abline!(ax, 3, 0.5; linewidth=2, linestyle=:dash, color=:red)
        scatter!(ax, points; marker=:circle, color=:dodgerblue)
        hidedecorations!(ax; grid=false, ticks=false)
    end
    rowgap!(fig.layout, 8)
    colgap!(fig.layout, 8)
    return fig
end
516 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Attribution-NonCommercial-ShareAlike 4.0 International
2 |
3 | =======================================================================
4 |
5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
6 | does not provide legal services or legal advice. Distribution of
7 | Creative Commons public licenses does not create a lawyer-client or
8 | other relationship. Creative Commons makes its licenses and related
9 | information available on an "as-is" basis. Creative Commons gives no
10 | warranties regarding its licenses, any material licensed under their
11 | terms and conditions, or any related information. Creative Commons
12 | disclaims all liability for damages resulting from their use to the
13 | fullest extent possible.
14 |
15 | Using Creative Commons Public Licenses
16 |
17 | Creative Commons public licenses provide a standard set of terms and
18 | conditions that creators and other rights holders may use to share
19 | original works of authorship and other material subject to copyright
20 | and certain other rights specified in the public license below. The
21 | following considerations are for informational purposes only, are not
22 | exhaustive, and do not form part of our licenses.
23 |
24 | Considerations for licensors: Our public licenses are
25 | intended for use by those authorized to give the public
26 | permission to use material in ways otherwise restricted by
27 | copyright and certain other rights. Our licenses are
28 | irrevocable. Licensors should read and understand the terms
29 | and conditions of the license they choose before applying it.
30 | Licensors should also secure all rights necessary before
31 | applying our licenses so that the public can reuse the
32 | material as expected. Licensors should clearly mark any
33 | material not subject to the license. This includes other CC-
34 | licensed material, or material used under an exception or
35 | limitation to copyright. More considerations for licensors:
36 | wiki.creativecommons.org/Considerations_for_licensors
37 |
38 | Considerations for the public: By using one of our public
39 | licenses, a licensor grants the public permission to use the
40 | licensed material under specified terms and conditions. If
41 | the licensor's permission is not necessary for any reason--for
42 | example, because of any applicable exception or limitation to
43 | copyright--then that use is not regulated by the license. Our
44 | licenses grant only permissions under copyright and certain
45 | other rights that a licensor has authority to grant. Use of
46 | the licensed material may still be restricted for other
47 | reasons, including because others have copyright or other
48 | rights in the material. A licensor may make special requests,
49 | such as asking that all changes be marked or described.
50 | Although not required by our licenses, you are encouraged to
51 | respect those requests where reasonable. More considerations
52 | for the public:
53 | wiki.creativecommons.org/Considerations_for_licensees
54 |
55 | =======================================================================
56 |
57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58 | Public License
59 |
60 | By exercising the Licensed Rights (defined below), You accept and agree
61 | to be bound by the terms and conditions of this Creative Commons
62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License
63 | ("Public License"). To the extent this Public License may be
64 | interpreted as a contract, You are granted the Licensed Rights in
65 | consideration of Your acceptance of these terms and conditions, and the
66 | Licensor grants You such rights in consideration of benefits the
67 | Licensor receives from making the Licensed Material available under
68 | these terms and conditions.
69 |
70 |
71 | Section 1 -- Definitions.
72 |
73 | a. Adapted Material means material subject to Copyright and Similar
74 | Rights that is derived from or based upon the Licensed Material
75 | and in which the Licensed Material is translated, altered,
76 | arranged, transformed, or otherwise modified in a manner requiring
77 | permission under the Copyright and Similar Rights held by the
78 | Licensor. For purposes of this Public License, where the Licensed
79 | Material is a musical work, performance, or sound recording,
80 | Adapted Material is always produced where the Licensed Material is
81 | synched in timed relation with a moving image.
82 |
83 | b. Adapter's License means the license You apply to Your Copyright
84 | and Similar Rights in Your contributions to Adapted Material in
85 | accordance with the terms and conditions of this Public License.
86 |
87 | c. BY-NC-SA Compatible License means a license listed at
88 | creativecommons.org/compatiblelicenses, approved by Creative
89 | Commons as essentially the equivalent of this Public License.
90 |
91 | d. Copyright and Similar Rights means copyright and/or similar rights
92 | closely related to copyright including, without limitation,
93 | performance, broadcast, sound recording, and Sui Generis Database
94 | Rights, without regard to how the rights are labeled or
95 | categorized. For purposes of this Public License, the rights
96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar
97 | Rights.
98 |
99 | e. Effective Technological Measures means those measures that, in the
100 | absence of proper authority, may not be circumvented under laws
101 | fulfilling obligations under Article 11 of the WIPO Copyright
102 | Treaty adopted on December 20, 1996, and/or similar international
103 | agreements.
104 |
105 | f. Exceptions and Limitations means fair use, fair dealing, and/or
106 | any other exception or limitation to Copyright and Similar Rights
107 | that applies to Your use of the Licensed Material.
108 |
109 | g. License Elements means the license attributes listed in the name
110 | of a Creative Commons Public License. The License Elements of this
111 | Public License are Attribution, NonCommercial, and ShareAlike.
112 |
113 | h. Licensed Material means the artistic or literary work, database,
114 | or other material to which the Licensor applied this Public
115 | License.
116 |
117 | i. Licensed Rights means the rights granted to You subject to the
118 | terms and conditions of this Public License, which are limited to
119 | all Copyright and Similar Rights that apply to Your use of the
120 | Licensed Material and that the Licensor has authority to license.
121 |
122 | j. Licensor means the individual(s) or entity(ies) granting rights
123 | under this Public License.
124 |
125 | k. NonCommercial means not primarily intended for or directed towards
126 | commercial advantage or monetary compensation. For purposes of
127 | this Public License, the exchange of the Licensed Material for
128 | other material subject to Copyright and Similar Rights by digital
129 | file-sharing or similar means is NonCommercial provided there is
130 | no payment of monetary compensation in connection with the
131 | exchange.
132 |
133 | l. Share means to provide material to the public by any means or
134 | process that requires permission under the Licensed Rights, such
135 | as reproduction, public display, public performance, distribution,
136 | dissemination, communication, or importation, and to make material
137 | available to the public including in ways that members of the
138 | public may access the material from a place and at a time
139 | individually chosen by them.
140 |
141 | m. Sui Generis Database Rights means rights other than copyright
142 | resulting from Directive 96/9/EC of the European Parliament and of
143 | the Council of 11 March 1996 on the legal protection of databases,
144 | as amended and/or succeeded, as well as other essentially
145 | equivalent rights anywhere in the world.
146 |
147 | n. You means the individual or entity exercising the Licensed Rights
148 | under this Public License. Your has a corresponding meaning.
149 |
150 |
151 | Section 2 -- Scope.
152 |
153 | a. License grant.
154 |
155 | 1. Subject to the terms and conditions of this Public License,
156 | the Licensor hereby grants You a worldwide, royalty-free,
157 | non-sublicensable, non-exclusive, irrevocable license to
158 | exercise the Licensed Rights in the Licensed Material to:
159 |
160 | a. reproduce and Share the Licensed Material, in whole or
161 | in part, for NonCommercial purposes only; and
162 |
163 | b. produce, reproduce, and Share Adapted Material for
164 | NonCommercial purposes only.
165 |
166 | 2. Exceptions and Limitations. For the avoidance of doubt, where
167 | Exceptions and Limitations apply to Your use, this Public
168 | License does not apply, and You do not need to comply with
169 | its terms and conditions.
170 |
171 | 3. Term. The term of this Public License is specified in Section
172 | 6(a).
173 |
174 | 4. Media and formats; technical modifications allowed. The
175 | Licensor authorizes You to exercise the Licensed Rights in
176 | all media and formats whether now known or hereafter created,
177 | and to make technical modifications necessary to do so. The
178 | Licensor waives and/or agrees not to assert any right or
179 | authority to forbid You from making technical modifications
180 | necessary to exercise the Licensed Rights, including
181 | technical modifications necessary to circumvent Effective
182 | Technological Measures. For purposes of this Public License,
183 | simply making modifications authorized by this Section 2(a)
184 | (4) never produces Adapted Material.
185 |
186 | 5. Downstream recipients.
187 |
188 | a. Offer from the Licensor -- Licensed Material. Every
189 | recipient of the Licensed Material automatically
190 | receives an offer from the Licensor to exercise the
191 | Licensed Rights under the terms and conditions of this
192 | Public License.
193 |
194 | b. Additional offer from the Licensor -- Adapted Material.
195 | Every recipient of Adapted Material from You
196 | automatically receives an offer from the Licensor to
197 | exercise the Licensed Rights in the Adapted Material
198 | under the conditions of the Adapter's License You apply.
199 |
200 | c. No downstream restrictions. You may not offer or impose
201 | any additional or different terms or conditions on, or
202 | apply any Effective Technological Measures to, the
203 | Licensed Material if doing so restricts exercise of the
204 | Licensed Rights by any recipient of the Licensed
205 | Material.
206 |
207 | 6. No endorsement. Nothing in this Public License constitutes or
208 | may be construed as permission to assert or imply that You
209 | are, or that Your use of the Licensed Material is, connected
210 | with, or sponsored, endorsed, or granted official status by,
211 | the Licensor or others designated to receive attribution as
212 | provided in Section 3(a)(1)(A)(i).
213 |
214 | b. Other rights.
215 |
216 | 1. Moral rights, such as the right of integrity, are not
217 | licensed under this Public License, nor are publicity,
218 | privacy, and/or other similar personality rights; however, to
219 | the extent possible, the Licensor waives and/or agrees not to
220 | assert any such rights held by the Licensor to the limited
221 | extent necessary to allow You to exercise the Licensed
222 | Rights, but not otherwise.
223 |
224 | 2. Patent and trademark rights are not licensed under this
225 | Public License.
226 |
227 | 3. To the extent possible, the Licensor waives any right to
228 | collect royalties from You for the exercise of the Licensed
229 | Rights, whether directly or through a collecting society
230 | under any voluntary or waivable statutory or compulsory
231 | licensing scheme. In all other cases the Licensor expressly
232 | reserves any right to collect such royalties, including when
233 | the Licensed Material is used other than for NonCommercial
234 | purposes.
235 |
236 |
237 | Section 3 -- License Conditions.
238 |
239 | Your exercise of the Licensed Rights is expressly made subject to the
240 | following conditions.
241 |
242 | a. Attribution.
243 |
244 | 1. If You Share the Licensed Material (including in modified
245 | form), You must:
246 |
247 | a. retain the following if it is supplied by the Licensor
248 | with the Licensed Material:
249 |
250 | i. identification of the creator(s) of the Licensed
251 | Material and any others designated to receive
252 | attribution, in any reasonable manner requested by
253 | the Licensor (including by pseudonym if
254 | designated);
255 |
256 | ii. a copyright notice;
257 |
258 | iii. a notice that refers to this Public License;
259 |
260 | iv. a notice that refers to the disclaimer of
261 | warranties;
262 |
263 | v. a URI or hyperlink to the Licensed Material to the
264 | extent reasonably practicable;
265 |
266 | b. indicate if You modified the Licensed Material and
267 | retain an indication of any previous modifications; and
268 |
269 | c. indicate the Licensed Material is licensed under this
270 | Public License, and include the text of, or the URI or
271 | hyperlink to, this Public License.
272 |
273 | 2. You may satisfy the conditions in Section 3(a)(1) in any
274 | reasonable manner based on the medium, means, and context in
275 | which You Share the Licensed Material. For example, it may be
276 | reasonable to satisfy the conditions by providing a URI or
277 | hyperlink to a resource that includes the required
278 | information.
279 | 3. If requested by the Licensor, You must remove any of the
280 | information required by Section 3(a)(1)(A) to the extent
281 | reasonably practicable.
282 |
283 | b. ShareAlike.
284 |
285 | In addition to the conditions in Section 3(a), if You Share
286 | Adapted Material You produce, the following conditions also apply.
287 |
288 | 1. The Adapter's License You apply must be a Creative Commons
289 | license with the same License Elements, this version or
290 | later, or a BY-NC-SA Compatible License.
291 |
292 | 2. You must include the text of, or the URI or hyperlink to, the
293 | Adapter's License You apply. You may satisfy this condition
294 | in any reasonable manner based on the medium, means, and
295 | context in which You Share Adapted Material.
296 |
297 | 3. You may not offer or impose any additional or different terms
298 | or conditions on, or apply any Effective Technological
299 | Measures to, Adapted Material that restrict exercise of the
300 | rights granted under the Adapter's License You apply.
301 |
302 |
303 | Section 4 -- Sui Generis Database Rights.
304 |
305 | Where the Licensed Rights include Sui Generis Database Rights that
306 | apply to Your use of the Licensed Material:
307 |
308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309 | to extract, reuse, reproduce, and Share all or a substantial
310 | portion of the contents of the database for NonCommercial purposes
311 | only;
312 |
313 | b. if You include all or a substantial portion of the database
314 | contents in a database in which You have Sui Generis Database
315 | Rights, then the database in which You have Sui Generis Database
316 | Rights (but not its individual contents) is Adapted Material,
317 | including for purposes of Section 3(b); and
318 |
319 | c. You must comply with the conditions in Section 3(a) if You Share
320 | all or a substantial portion of the contents of the database.
321 |
322 | For the avoidance of doubt, this Section 4 supplements and does not
323 | replace Your obligations under this Public License where the Licensed
324 | Rights include other Copyright and Similar Rights.
325 |
326 |
327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328 |
329 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339 |
340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349 |
350 | c. The disclaimer of warranties and limitation of liability provided
351 | above shall be interpreted in a manner that, to the extent
352 | possible, most closely approximates an absolute disclaimer and
353 | waiver of all liability.
354 |
355 |
356 | Section 6 -- Term and Termination.
357 |
358 | a. This Public License applies for the term of the Copyright and
359 | Similar Rights licensed here. However, if You fail to comply with
360 | this Public License, then Your rights under this Public License
361 | terminate automatically.
362 |
363 | b. Where Your right to use the Licensed Material has terminated under
364 | Section 6(a), it reinstates:
365 |
366 | 1. automatically as of the date the violation is cured, provided
367 | it is cured within 30 days of Your discovery of the
368 | violation; or
369 |
370 | 2. upon express reinstatement by the Licensor.
371 |
372 | For the avoidance of doubt, this Section 6(b) does not affect any
373 | right the Licensor may have to seek remedies for Your violations
374 | of this Public License.
375 |
376 | c. For the avoidance of doubt, the Licensor may also offer the
377 | Licensed Material under separate terms or conditions or stop
378 | distributing the Licensed Material at any time; however, doing so
379 | will not terminate this Public License.
380 |
381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382 | License.
383 |
384 |
385 | Section 7 -- Other Terms and Conditions.
386 |
387 | a. The Licensor shall not be bound by any additional or different
388 | terms or conditions communicated by You unless expressly agreed.
389 |
390 | b. Any arrangements, understandings, or agreements regarding the
391 | Licensed Material not stated herein are separate from and
392 | independent of the terms and conditions of this Public License.
393 |
394 |
395 | Section 8 -- Interpretation.
396 |
397 | a. For the avoidance of doubt, this Public License does not, and
398 | shall not be interpreted to, reduce, limit, restrict, or impose
399 | conditions on any use of the Licensed Material that could lawfully
400 | be made without permission under this Public License.
401 |
402 | b. To the extent possible, if any provision of this Public License is
403 | deemed unenforceable, it shall be automatically reformed to the
404 | minimum extent necessary to make it enforceable. If the provision
405 | cannot be reformed, it shall be severed from this Public License
406 | without affecting the enforceability of the remaining terms and
407 | conditions.
408 |
409 | c. No term or condition of this Public License will be waived and no
410 | failure to comply consented to unless expressly agreed to by the
411 | Licensor.
412 |
413 | d. Nothing in this Public License constitutes or may be interpreted
414 | as a limitation upon, or waiver of, any privileges and immunities
415 | that apply to the Licensor or You, including from the legal
416 | processes of any jurisdiction or authority.
417 |
418 | =======================================================================
419 |
420 | Creative Commons is not a party to its public
421 | licenses. Notwithstanding, Creative Commons may elect to apply one of
422 | its public licenses to material it publishes and in those instances
423 | will be considered the “Licensor.” The text of the Creative Commons
424 | public licenses is dedicated to the public domain under the CC0 Public
425 | Domain Dedication. Except for the limited purpose of indicating that
426 | material is shared under a Creative Commons public license or as
427 | otherwise permitted by the Creative Commons policies published at
428 | creativecommons.org/policies, Creative Commons does not authorize the
429 | use of the trademark "Creative Commons" or any other trademark or logo
430 | of Creative Commons without its prior written consent including,
431 | without limitation, in connection with any unauthorized modifications
432 | to any of its public licenses or any other arrangements,
433 | understandings, or agreements concerning use of licensed material. For
434 | the avoidance of doubt, this paragraph does not form part of the
435 | public licenses.
436 |
437 | Creative Commons may be contacted at creativecommons.org.
438 |
--------------------------------------------------------------------------------