├── .gitattributes
├── .github
└── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE.txt
├── README-ja.md
├── README-zh-Hans.md
├── README-zh-TW.md
├── README.md
├── TRANSLATIONS.md
├── epub-metadata.yaml
├── generate-epub.sh
├── images
├── 0vBc0hN.png
├── 4edXG0T.png
├── 4j99mhe.png
├── 54GYsSx.png
├── 5KeocQs.jpg
├── C9ioGtn.png
├── IOyLj4i.jpg
├── JdAsdvG.jpg
├── MzExP06.png
├── ONjORqk.png
├── OfVllex.png
├── Q6z24La.png
├── TcUo2fw.png
├── U3qV33e.png
├── V5q57vU.png
├── Xkm5CXz.png
├── b4YtAEN.png
├── bWxPtQA.png
├── bgLMI2u.png
├── cdCv5g7.png
├── fNcl65g.png
├── h81n9iK.png
├── h9TAuGI.jpg
├── iF4Mkb5.png
├── jj3A5N8.png
├── jrUBAF7.png
├── krAHLGg.png
├── kxtjqgE.png
├── n16iOGk.png
├── n41Azff.png
├── rgSrvjG.png
├── wU8x5Id.png
├── wXGqG5f.png
├── yB5SYwm.png
├── yzDrJtA.jpg
└── zdCAkB3.png
├── resources
├── flash_cards
│ ├── OO Design.apkg
│ ├── System Design Exercises.apkg
│ └── System Design.apkg
├── study_guide.graffle
└── study_guide.png
└── solutions
├── object_oriented_design
├── call_center
│ ├── __init__.py
│ ├── call_center.ipynb
│ └── call_center.py
├── deck_of_cards
│ ├── __init__.py
│ ├── deck_of_cards.ipynb
│ └── deck_of_cards.py
├── hash_table
│ ├── __init__.py
│ ├── hash_map.ipynb
│ └── hash_map.py
├── lru_cache
│ ├── __init__.py
│ ├── lru_cache.ipynb
│ └── lru_cache.py
├── online_chat
│ ├── __init__.py
│ ├── online_chat.ipynb
│ └── online_chat.py
└── parking_lot
│ ├── __init__.py
│ ├── parking_lot.ipynb
│ └── parking_lot.py
└── system_design
├── mint
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── mint.graffle
├── mint.png
├── mint_basic.graffle
├── mint_basic.png
├── mint_mapreduce.py
└── mint_snippets.py
├── pastebin
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── pastebin.graffle
├── pastebin.png
├── pastebin.py
├── pastebin_basic.graffle
└── pastebin_basic.png
├── query_cache
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── query_cache.graffle
├── query_cache.png
├── query_cache_basic.graffle
├── query_cache_basic.png
└── query_cache_snippets.py
├── sales_rank
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── sales_rank.graffle
├── sales_rank.png
├── sales_rank_basic.graffle
├── sales_rank_basic.png
└── sales_rank_mapreduce.py
├── scaling_aws
├── README-zh-Hans.md
├── README.md
├── scaling_aws.graffle
├── scaling_aws.png
├── scaling_aws_1.png
├── scaling_aws_2.png
├── scaling_aws_3.png
├── scaling_aws_4.png
├── scaling_aws_5.png
├── scaling_aws_6.png
└── scaling_aws_7.png
├── social_graph
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── social_graph.graffle
├── social_graph.png
├── social_graph_basic.graffle
├── social_graph_basic.png
└── social_graph_snippets.py
├── template
└── template.graffle
├── twitter
├── README-zh-Hans.md
├── README.md
├── twitter.graffle
├── twitter.png
├── twitter_basic.graffle
└── twitter_basic.png
└── web_crawler
├── README-zh-Hans.md
├── README.md
├── __init__.py
├── web_crawler.graffle
├── web_crawler.png
├── web_crawler_basic.graffle
├── web_crawler_basic.png
├── web_crawler_mapreduce.py
└── web_crawler_snippets.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-language=Python
2 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Review the Contributing Guidelines
2 |
3 | Before submitting a pull request, verify it meets all requirements in the [Contributing Guidelines](https://github.com/donnemartin/system-design-primer/blob/master/CONTRIBUTING.md).
4 |
5 | ### Translations
6 |
7 | See the [Contributing Guidelines](https://github.com/donnemartin/system-design-primer/blob/master/CONTRIBUTING.md). Verify you've:
8 |
9 | * Tagged the [language maintainer](https://github.com/donnemartin/system-design-primer/blob/master/TRANSLATIONS.md)
10 | * Prefixed the title with a language code
11 | * Example: "ja: Fix ..."
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | *.epub
3 | __pycache__/
4 | *.py[cod]
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 |
44 | # Translations
45 | *.mo
46 | *.pot
47 |
48 | # Django stuff:
49 | *.log
50 |
51 | # Sphinx documentation
52 | docs/_build/
53 |
54 | # PyBuilder
55 | target/
56 |
57 | # IPython notebook
58 | .ipynb_checkpoints
59 |
60 | # Repo scratch directory
61 | scratch/
62 |
63 | # IPython Notebook templates
64 | template.ipynb
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Contributing
2 | ============
3 |
4 | Contributions are welcome!
5 |
6 | **Please carefully read this page to make the code review process go as smoothly as possible and to maximize the likelihood of your contribution being merged.**
7 |
8 | ## Bug Reports
9 |
10 | For bug reports or requests, [submit an issue](https://github.com/donnemartin/system-design-primer/issues).
11 |
12 | ## Pull Requests
13 |
14 | The preferred way to contribute is to fork the
15 | [main repository](https://github.com/donnemartin/system-design-primer) on GitHub.
16 |
17 | 1. Fork the [main repository](https://github.com/donnemartin/system-design-primer). Click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server.
18 |
19 | 2. Clone this copy to your local disk:
20 |
21 | $ git clone git@github.com:YourLogin/system-design-primer.git
22 | $ cd system-design-primer
23 |
24 | 3. Create a branch to hold your changes and start making changes. Don't work in the `master` branch!
25 |
26 | $ git checkout -b my-feature
27 |
28 | 4. Work on this copy on your computer using Git to do the version control. When you're done editing, run the following to record your changes in Git:
29 |
30 | $ git add modified_files
31 | $ git commit
32 |
33 | 5. Push your changes to GitHub with:
34 |
35 | $ git push -u origin my-feature
36 |
37 | 6. Finally, go to the web page of your fork of the `system-design-primer` repo and click 'Pull Request' to send your changes for review.
38 |
39 | ### GitHub Pull Requests Docs
40 |
41 | If you are not familiar with pull requests, review the [pull request docs](https://help.github.com/articles/using-pull-requests/).
42 |
43 | ## Translations
44 |
45 | We'd like for the guide to be available in many languages. Here is the process for maintaining translations:
46 |
47 | * The original version and content of the guide are maintained in English.
48 | * Translations follow the content of the original. Contributors must speak at least some English, so that translations do not diverge.
49 | * Each translation has a maintainer to update the translation as the original evolves and to review others' changes. This doesn't require a lot of time, but a review by the maintainer is important to maintain quality.
50 |
51 | See [Translations](TRANSLATIONS.md).
52 |
53 | ### Changes to translations
54 |
55 | * Changes to content should be made to the English version first, and then translated to each other language.
56 | * Changes that improve translations should be made directly on the file for that language. Pull requests should only modify one language at a time.
57 | * Submit a pull request with changes to the file in that language. Each language has a maintainer, who reviews changes in that language. Then the primary maintainer [@donnemartin](https://github.com/donnemartin) merges it in.
58 | * Prefix pull requests and issues with language codes if they are for that translation only, e.g. "es: Improve grammar", so maintainers can find them easily.
59 | * Tag the translation maintainer for a code review, see the list of [translation maintainers](TRANSLATIONS.md).
60 | * You will need to get a review from a native speaker (preferably the language maintainer) before your pull request is merged.
61 |
62 | ### Adding translations to new languages
63 |
64 | Translations to new languages are always welcome! Keep in mind a translation must be maintained.
65 |
66 | * Do you have time to be a maintainer for a new language? Please see the list of [translations](TRANSLATIONS.md) and tell us so we know we can count on you in the future.
67 | * Check the [translations](TRANSLATIONS.md), issues, and pull requests to see if a translation is in progress or stalled. If it's in progress, offer to help. If it's stalled, consider becoming the maintainer if you can commit to it.
68 | * If a translation has not yet been started, file an issue for your language so people know you are working on it and we'll coordinate. Confirm you are native level in the language and are willing to maintain the translation, so it's not orphaned.
69 | * To get started, fork the repo, then submit a pull request to the main repo with the single file README-xx.md added, where xx is the language code. Use standard [IETF language tags](https://www.w3.org/International/articles/language-tags/), i.e. the same as is used by Wikipedia, *not* the code for a single country. These are usually just the two-letter lowercase code, for example, `fr` for French and `uk` for Ukrainian (not `ua`, which is for the country). For languages that have variations, use the shortest tag, such as `zh-Hant`.
70 | * Feel free to invite friends to help your original translation by having them fork your repo, then merging their pull requests to your forked repo. Translations are difficult and usually have errors that others need to find.
71 | * Add links to your translation at the top of every README-XX.md file. For consistency, the link should be added in alphabetical order by ISO code, and the anchor text should be in the native language.
72 | * When you've fully translated the English README.md, comment on the pull request in the main repo that it's ready to be merged.
73 | * You'll need to have a complete and reviewed translation of the English README.md before your translation will be merged into the `master` branch.
74 | * Once accepted, your pull request will be squashed into a single commit into the `master` branch.
75 |
76 | ### Translation template credits
77 |
78 | Thanks to [The Art of Command Line](https://github.com/jlevy/the-art-of-command-line) for the translation template.
79 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | I am providing code and resources in this repository to you under an open source
2 | license. Because this is my personal repository, the license you receive to my
3 | code and resources is from me and not my employer (Facebook).
4 |
5 | Copyright 2017 Donne Martin
6 |
7 | Creative Commons Attribution 4.0 International License (CC BY 4.0)
8 |
9 | http://creativecommons.org/licenses/by/4.0/
10 |
--------------------------------------------------------------------------------
/TRANSLATIONS.md:
--------------------------------------------------------------------------------
1 | # Translations
2 |
3 | **Thank you to our awesome translation maintainers!**
4 |
5 | ## Contributing
6 |
7 | See the [Contributing Guidelines](CONTRIBUTING.md).
8 |
9 | ## Translation Statuses
10 |
11 | * 🎉 **Live**: Merged into `master` branch
12 | * ⏳ **In Progress**: Under active translation for eventual merge into `master` branch
13 | * ❗ **Stalled***: Needs an active maintainer ✋
14 |
15 | **Within the past 2 months, there has been 1) No active work in the translation fork, and 2) No discussions from previous maintainer(s) in the discussion thread.*
16 |
17 | Languages not listed here have not been started, [contribute](CONTRIBUTING.md)!
18 |
19 | Languages are grouped by status and are listed in alphabetical order.
20 |
21 | ## Live
22 |
23 | ### 🎉 Japanese
24 |
25 | * [README-ja.md](README-ja.md)
26 | * Maintainer(s): [@tsukukobaan](https://github.com/tsukukobaan) 👏
27 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/100
28 |
29 | ### 🎉 Simplified Chinese
30 |
31 | * [README-zh-Hans.md](README-zh-Hans.md)
32 | * Maintainer(s): [@sqrthree](https://github.com/sqrthree) 👏
33 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/38
34 |
35 | ### 🎉 Traditional Chinese
36 |
37 | * [README-zh-TW.md](README-zh-TW.md)
38 | * Maintainer(s): [@kevingo](https://github.com/kevingo) 👏
39 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/88
40 |
41 | ## In Progress
42 |
43 | ### ⏳ Korean
44 |
45 | * Maintainer(s): [@bonomoon](https://github.com/bonomoon), [@mingrammer](https://github.com/mingrammer) 👏
46 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/102
47 | * Translation Fork: https://github.com/bonomoon/system-design-primer, https://github.com/donnemartin/system-design-primer/pull/103
48 |
49 | ### ⏳ Russian
50 |
51 | * Maintainer(s): [@voitau](https://github.com/voitau), [@DmitryOlkhovoi](https://github.com/DmitryOlkhovoi) 👏
52 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/87
53 | * Translation Fork: https://github.com/voitau/system-design-primer/blob/master/README-ru.md
54 |
55 | ## Stalled
56 |
57 | **Notes**:
58 |
59 | * If you're able to commit to being an active maintainer for a language, let us know in the discussion thread for your language and update this file with a pull request.
60 | * If you're listed here as a "Previous Maintainer" but can commit to being an active maintainer, also let us know.
61 | * See the [Contributing Guidelines](CONTRIBUTING.md).
62 |
63 | ### ❗ Arabic
64 |
65 | * Maintainer(s): **Help Wanted** ✋
66 | * Previous Maintainer(s): [@aymns](https://github.com/aymns)
67 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/170
68 | * Translation Fork: https://github.com/aymns/system-design-primer/blob/develop/README-ar.md
69 |
70 | ### ❗ Bengali
71 |
72 | * Maintainer(s): **Help Wanted** ✋
73 | * Previous Maintainer(s): [@nutboltu](https://github.com/nutboltu)
74 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/220
75 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/240
76 |
77 | ### ❗ Brazilian Portuguese
78 |
79 | * Maintainer(s): **Help Wanted** ✋
80 | * Previous Maintainer(s): [@IuryAlves](https://github.com/IuryAlves)
81 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/40
82 | * Translation Fork: https://github.com/IuryAlves/system-design-primer, https://github.com/donnemartin/system-design-primer/pull/67
83 |
84 | ### ❗ French
85 |
86 | * Maintainer(s): **Help Wanted** ✋
87 | * Previous Maintainer(s): [@spuyet](https://github.com/spuyet)
88 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/250
89 | * Translation Fork: https://github.com/spuyet/system-design-primer/blob/add-french-translation/README-fr.md
90 |
91 | ### ❗ German
92 |
93 | * Maintainer(s): **Help Wanted** ✋
94 | * Previous Maintainer(s): [@Allaman](https://github.com/Allaman)
95 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/186
96 | * Translation Fork: None
97 |
98 | ### ❗ Greek
99 |
100 | * Maintainer(s): **Help Wanted** ✋
101 | * Previous Maintainer(s): [@Belonias](https://github.com/Belonias)
102 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/130
103 | * Translation Fork: None
104 |
105 | ### ❗ Hebrew
106 |
107 | * Maintainer(s): **Help Wanted** ✋
108 | * Previous Maintainer(s): [@EladLeev](https://github.com/EladLeev)
109 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/272
110 | * Translation Fork: https://github.com/EladLeev/system-design-primer/tree/he-translate
111 |
112 | ### ❗ Italian
113 |
114 | * Maintainer(s): **Help Wanted** ✋
115 | * Previous Maintainer(s): [@pgoodjohn](https://github.com/pgoodjohn)
116 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/104
117 | * Translation Fork: https://github.com/pgoodjohn/system-design-primer
118 |
119 | ### ❗ Persian
120 |
121 | * Maintainer(s): **Help Wanted** ✋
122 | * Previous Maintainer(s): [@hadisinaee](https://github.com/hadisinaee)
123 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/pull/112
124 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/112
125 |
126 | ### ❗ Spanish
127 |
128 | * Maintainer(s): **Help Wanted** ✋
129 | * Previous Maintainer(s): [@eamanu](https://github.com/eamanu)
130 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/136
131 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/189
132 |
133 | ### ❗ Thai
134 |
135 | * Maintainer(s): **Help Wanted** ✋
136 | * Previous Maintainer(s): [@iphayao](https://github.com/iphayao)
137 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/187
138 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/221
139 |
140 | ### ❗ Turkish
141 |
142 | * Maintainer(s): **Help Wanted** ✋
143 | * Previous Maintainer(s): [@hwclass](https://github.com/hwclass), [@canerbaran](https://github.com/canerbaran), [@emrahtoy](https://github.com/emrahtoy)
144 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/39
145 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/239
146 |
147 | ### ❗ Ukrainian
148 |
149 | * Maintainer(s): **Help Wanted** ✋
150 | * Previous Maintainer(s): [@Kietzmann](https://github.com/Kietzmann), [@Acarus](https://github.com/Acarus)
151 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/248
152 | * Translation Fork: https://github.com/Acarus/system-design-primer
153 |
154 | ### ❗ Vietnamese
155 |
156 | * Maintainer(s): **Help Wanted** ✋
157 | * Previous Maintainer(s): [@tranlyvu](https://github.com/tranlyvu), [@duynguyenhoang](https://github.com/duynguyenhoang)
158 | * Discussion Thread: https://github.com/donnemartin/system-design-primer/issues/127
159 | * Translation Fork: https://github.com/donnemartin/system-design-primer/pull/241, https://github.com/donnemartin/system-design-primer/pull/327
160 |
161 | ## Not Started
162 |
163 | Languages not listed here have not been started, [contribute](CONTRIBUTING.md)!
164 |
--------------------------------------------------------------------------------
/epub-metadata.yaml:
--------------------------------------------------------------------------------
1 | title: System Design Primer
2 | creator: Donne Martin
3 | date: 2018
--------------------------------------------------------------------------------
/generate-epub.sh:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env bash
2 |
# Convert the Markdown document arriving on stdin into an EPUB book.
#
# Arguments:
#   $1 - path of the EPUB file to write
#   $2 - language tag recorded in the book metadata (e.g. en, ja, zh-Hans)
# Returns:
#   pandoc's exit status on failure; 0 after printing the "Done!" message.
generate_from_stdin() {
    # Locals keep these names from leaking into (or clobbering) caller state.
    local outfile=$1
    local language=$2

    echo "Generating '$language' ..."

    # pandoc is given no input file, so it reads the stdin inherited from the
    # caller. The expansions are quoted so a path or tag containing whitespace
    # is passed as a single argument. Bail out with pandoc's status on failure
    # instead of announcing a book that was never written.
    pandoc --metadata-file=epub-metadata.yaml --metadata="lang:$language" \
        --from=markdown -o "$outfile" || return

    echo "Done! You can find the '$language' book at ./$outfile"
}
13 |
# Build README.epub (language "en") from the top-level README.md followed by
# the README.md of every solution directory under ./solutions/system_design.
#
# Returns:
#   the exit status of the conversion step; the temporary file is removed
#   whether or not the conversion succeeded.
generate_with_solutions () {
    local tmpfile dir status

    tmpfile=$(mktemp /tmp/sytem-design-primer-epub-generator.XXX) || return 1

    cat ./README.md >> "$tmpfile"

    for dir in ./solutions/system_design/*; do
        # The diagram template and the package marker are not solutions.
        case $dir in *template*) continue;; esac
        case $dir in *__init__.py*) continue;; esac

        # Previously a stray leading ':' turned this test into arguments of
        # the no-op builtin, so it never filtered anything. Test for real now.
        [[ -d "$dir" ]] || continue

        # Append the solution's README plus a blank line so consecutive
        # documents do not run together.
        cat "$dir/README.md" >> "$tmpfile" && echo "" >> "$tmpfile"
    done

    generate_from_stdin 'README.epub' 'en' < "$tmpfile"
    status=$?

    rm -f "$tmpfile"
    return "$status"
}
29 |
# Generate <name>.epub from <name>.md.
#
# Arguments:
#   $1 - base name of the Markdown file, without the .md extension
#   $2 - language tag passed through to generate_from_stdin
# Returns:
#   non-zero if <name>.md cannot be opened; otherwise the status of
#   generate_from_stdin.
generate () {
    local name=$1
    local language=$2

    # Redirect instead of `cat ... |`: if the input file is missing the
    # redirection fails and the conversion is not attempted, rather than
    # building a book from empty stdin. Quoting keeps names with whitespace
    # intact.
    generate_from_stdin "$name.epub" "$language" < "$name.md"
}
36 |
# Check that every command listed in the global `dependencies` array is
# available. On the first one that is not, print an error to stderr and
# exit the script with status 1.
check_dependencies () {
    local dependency

    for dependency in "${dependencies[@]}"; do
        # `command -v` prints how the name resolves; -x then confirms that
        # the result is an executable file. The name is quoted so it is not
        # word-split or glob-expanded.
        if ! [ -x "$(command -v "$dependency")" ]; then
            echo "Error: $dependency is not installed." >&2
            exit 1
        fi
    done
}
47 |
# Commands that check_dependencies verifies before any book is built.
dependencies=("pandoc")

check_dependencies
generate_with_solutions

# Each translated README becomes its own book, keyed by its language tag.
for tag in ja zh-Hans zh-TW; do
    generate "README-$tag" "$tag"
done
55 |
--------------------------------------------------------------------------------
/images/0vBc0hN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/0vBc0hN.png
--------------------------------------------------------------------------------
/images/4edXG0T.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/4edXG0T.png
--------------------------------------------------------------------------------
/images/4j99mhe.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/4j99mhe.png
--------------------------------------------------------------------------------
/images/54GYsSx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/54GYsSx.png
--------------------------------------------------------------------------------
/images/5KeocQs.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/5KeocQs.jpg
--------------------------------------------------------------------------------
/images/C9ioGtn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/C9ioGtn.png
--------------------------------------------------------------------------------
/images/IOyLj4i.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/IOyLj4i.jpg
--------------------------------------------------------------------------------
/images/JdAsdvG.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/JdAsdvG.jpg
--------------------------------------------------------------------------------
/images/MzExP06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/MzExP06.png
--------------------------------------------------------------------------------
/images/ONjORqk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/ONjORqk.png
--------------------------------------------------------------------------------
/images/OfVllex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/OfVllex.png
--------------------------------------------------------------------------------
/images/Q6z24La.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/Q6z24La.png
--------------------------------------------------------------------------------
/images/TcUo2fw.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/TcUo2fw.png
--------------------------------------------------------------------------------
/images/U3qV33e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/U3qV33e.png
--------------------------------------------------------------------------------
/images/V5q57vU.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/V5q57vU.png
--------------------------------------------------------------------------------
/images/Xkm5CXz.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/Xkm5CXz.png
--------------------------------------------------------------------------------
/images/b4YtAEN.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/b4YtAEN.png
--------------------------------------------------------------------------------
/images/bWxPtQA.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/bWxPtQA.png
--------------------------------------------------------------------------------
/images/bgLMI2u.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/bgLMI2u.png
--------------------------------------------------------------------------------
/images/cdCv5g7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/cdCv5g7.png
--------------------------------------------------------------------------------
/images/fNcl65g.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/fNcl65g.png
--------------------------------------------------------------------------------
/images/h81n9iK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/h81n9iK.png
--------------------------------------------------------------------------------
/images/h9TAuGI.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/h9TAuGI.jpg
--------------------------------------------------------------------------------
/images/iF4Mkb5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/iF4Mkb5.png
--------------------------------------------------------------------------------
/images/jj3A5N8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/jj3A5N8.png
--------------------------------------------------------------------------------
/images/jrUBAF7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/jrUBAF7.png
--------------------------------------------------------------------------------
/images/krAHLGg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/krAHLGg.png
--------------------------------------------------------------------------------
/images/kxtjqgE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/kxtjqgE.png
--------------------------------------------------------------------------------
/images/n16iOGk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/n16iOGk.png
--------------------------------------------------------------------------------
/images/n41Azff.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/n41Azff.png
--------------------------------------------------------------------------------
/images/rgSrvjG.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/rgSrvjG.png
--------------------------------------------------------------------------------
/images/wU8x5Id.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/wU8x5Id.png
--------------------------------------------------------------------------------
/images/wXGqG5f.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/wXGqG5f.png
--------------------------------------------------------------------------------
/images/yB5SYwm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/yB5SYwm.png
--------------------------------------------------------------------------------
/images/yzDrJtA.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/yzDrJtA.jpg
--------------------------------------------------------------------------------
/images/zdCAkB3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/images/zdCAkB3.png
--------------------------------------------------------------------------------
/resources/flash_cards/OO Design.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/resources/flash_cards/OO Design.apkg
--------------------------------------------------------------------------------
/resources/flash_cards/System Design Exercises.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/resources/flash_cards/System Design Exercises.apkg
--------------------------------------------------------------------------------
/resources/flash_cards/System Design.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/resources/flash_cards/System Design.apkg
--------------------------------------------------------------------------------
/resources/study_guide.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/resources/study_guide.graffle
--------------------------------------------------------------------------------
/resources/study_guide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/resources/study_guide.png
--------------------------------------------------------------------------------
/solutions/object_oriented_design/call_center/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/call_center/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/call_center/call_center.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design a call center"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* What levels of employees are in the call center?\n",
24 | " * Operator, supervisor, director\n",
25 | "* Can we assume operators always get the initial calls?\n",
26 | " * Yes\n",
27 | "* If there are no available operators or the operator can't handle the call, does the call go to the supervisors?\n",
28 | " * Yes\n",
29 | "* If there are no available supervisors or the supervisor can't handle the call, does the call go to the directors?\n",
30 | " * Yes\n",
31 | "* Can we assume the directors can handle all calls?\n",
32 | " * Yes\n",
33 | "* What happens if nobody can answer the call?\n",
34 | " * It gets queued\n",
35 | "* Do we need to handle 'VIP' calls where we put someone to the front of the line?\n",
36 | " * No\n",
37 | "* Can we assume inputs are valid or do we have to validate them?\n",
38 | " * Assume they're valid"
39 | ]
40 | },
41 | {
42 | "cell_type": "markdown",
43 | "metadata": {},
44 | "source": [
45 | "## Solution"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 1,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "Overwriting call_center.py\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "%%writefile call_center.py\n",
65 | "from abc import ABCMeta, abstractmethod\n",
66 | "from collections import deque\n",
67 | "from enum import Enum\n",
68 | "\n",
69 | "\n",
70 | "class Rank(Enum):\n",
71 | "\n",
72 | " OPERATOR = 0\n",
73 | " SUPERVISOR = 1\n",
74 | " DIRECTOR = 2\n",
75 | "\n",
76 | "\n",
77 | "class Employee(metaclass=ABCMeta):\n",
78 | "\n",
79 | " def __init__(self, employee_id, name, rank, call_center):\n",
80 | " self.employee_id = employee_id\n",
81 | " self.name = name\n",
82 | " self.rank = rank\n",
83 | " self.call = None\n",
84 | " self.call_center = call_center\n",
85 | "\n",
86 | " def take_call(self, call):\n",
87 | " \"\"\"Assume the employee will always successfully take the call.\"\"\"\n",
88 | " self.call = call\n",
89 | " self.call.employee = self\n",
90 | " self.call.state = CallState.IN_PROGRESS\n",
91 | "\n",
92 | " def complete_call(self):\n",
93 | " self.call.state = CallState.COMPLETE\n",
94 | " self.call_center.notify_call_completed(self.call)\n",
95 | "\n",
96 | " @abstractmethod\n",
97 | " def escalate_call(self):\n",
98 | " pass\n",
99 | "\n",
100 | " def _escalate_call(self):\n",
101 | " self.call.state = CallState.READY\n",
102 | " call = self.call\n",
103 | " self.call = None\n",
104 | " self.call_center.notify_call_escalated(call)\n",
105 | "\n",
106 | "\n",
107 | "class Operator(Employee):\n",
108 | "\n",
109 | " def __init__(self, employee_id, name):\n",
110 | " super(Operator, self).__init__(employee_id, name, Rank.OPERATOR)\n",
111 | "\n",
112 | " def escalate_call(self):\n",
113 | " self.call.level = Rank.SUPERVISOR\n",
114 | " self._escalate_call()\n",
115 | "\n",
116 | "\n",
117 | "class Supervisor(Employee):\n",
118 | "\n",
119 | " def __init__(self, employee_id, name):\n",
120 | " super(Operator, self).__init__(employee_id, name, Rank.SUPERVISOR)\n",
121 | "\n",
122 | " def escalate_call(self):\n",
123 | " self.call.level = Rank.DIRECTOR\n",
124 | " self._escalate_call()\n",
125 | "\n",
126 | "\n",
127 | "class Director(Employee):\n",
128 | "\n",
129 | " def __init__(self, employee_id, name):\n",
130 | " super(Operator, self).__init__(employee_id, name, Rank.DIRECTOR)\n",
131 | "\n",
132 | " def escalate_call(self):\n",
133 | " raise NotImplemented('Directors must be able to handle any call')\n",
134 | "\n",
135 | "\n",
136 | "class CallState(Enum):\n",
137 | "\n",
138 | " READY = 0\n",
139 | " IN_PROGRESS = 1\n",
140 | " COMPLETE = 2\n",
141 | "\n",
142 | "\n",
143 | "class Call(object):\n",
144 | "\n",
145 | " def __init__(self, rank):\n",
146 | " self.state = CallState.READY\n",
147 | " self.rank = rank\n",
148 | " self.employee = None\n",
149 | "\n",
150 | "\n",
151 | "class CallCenter(object):\n",
152 | "\n",
153 | " def __init__(self, operators, supervisors, directors):\n",
154 | " self.operators = operators\n",
155 | " self.supervisors = supervisors\n",
156 | " self.directors = directors\n",
157 | " self.queued_calls = deque()\n",
158 | "\n",
159 | " def dispatch_call(self, call):\n",
160 | " if call.rank not in (Rank.OPERATOR, Rank.SUPERVISOR, Rank.DIRECTOR):\n",
161 | " raise ValueError('Invalid call rank: {}'.format(call.rank))\n",
162 | " employee = None\n",
163 | " if call.rank == Rank.OPERATOR:\n",
164 | " employee = self._dispatch_call(call, self.operators)\n",
165 | " if call.rank == Rank.SUPERVISOR or employee is None:\n",
166 | " employee = self._dispatch_call(call, self.supervisors)\n",
167 | " if call.rank == Rank.DIRECTOR or employee is None:\n",
168 | " employee = self._dispatch_call(call, self.directors)\n",
169 | " if employee is None:\n",
170 | " self.queued_calls.append(call)\n",
171 | "\n",
172 | " def _dispatch_call(self, call, employees):\n",
173 | " for employee in employees:\n",
174 | " if employee.call is None:\n",
175 | " employee.take_call(call)\n",
176 | " return employee\n",
177 | " return None\n",
178 | "\n",
179 | " def notify_call_escalated(self, call): # ...\n",
180 | " def notify_call_completed(self, call): # ...\n",
181 | " def dispatch_queued_call_to_newly_freed_employee(self, call, employee): # ..."
182 | ]
183 | }
184 | ],
185 | "metadata": {
186 | "kernelspec": {
187 | "display_name": "Python 3",
188 | "language": "python",
189 | "name": "python3"
190 | },
191 | "language_info": {
192 | "codemirror_mode": {
193 | "name": "ipython",
194 | "version": 3
195 | },
196 | "file_extension": ".py",
197 | "mimetype": "text/x-python",
198 | "name": "python",
199 | "nbconvert_exporter": "python",
200 | "pygments_lexer": "ipython3",
201 | "version": "3.4.3"
202 | }
203 | },
204 | "nbformat": 4,
205 | "nbformat_minor": 0
206 | }
207 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/call_center/call_center.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 | from collections import deque
3 | from enum import Enum
4 |
5 |
class Rank(Enum):
    """Employee levels in the call center, lowest to highest."""

    # The numeric values grow with seniority: operator, then supervisor,
    # then director.
    OPERATOR = 0
    SUPERVISOR = 1
    DIRECTOR = 2
11 |
12 |
class Employee(metaclass=ABCMeta):
    """Abstract call-center employee that can hold at most one call.

    Subclasses must implement ``escalate_call``; the shared hand-off logic
    lives in ``_escalate_call``.
    """

    def __init__(self, employee_id, name, rank, call_center):
        """Store the identity, rank and owning call center; start idle."""
        self.employee_id, self.name = employee_id, name
        self.rank = rank
        # ``None`` means the employee is currently free.
        self.call = None
        self.call_center = call_center

    def take_call(self, call):
        """Attach ``call`` to this employee and mark it in progress.

        Assumes the employee always succeeds in taking the call.
        """
        self.call = call
        call.employee = self
        call.state = CallState.IN_PROGRESS

    def complete_call(self):
        """Mark the current call complete and tell the call center."""
        finished = self.call
        finished.state = CallState.COMPLETE
        self.call_center.notify_call_completed(finished)

    @abstractmethod
    def escalate_call(self):
        """Pass the current call up the hierarchy (rank-specific)."""

    def _escalate_call(self):
        """Release the current call and report the escalation.

        The call goes back to READY, this employee becomes free, and the
        call center is notified with the released call.
        """
        released = self.call
        released.state = CallState.READY
        self.call = None
        self.call_center.notify_call_escalated(released)
41 |
42 |
class Operator(Employee):
    """Employee with ``Rank.OPERATOR``; escalates calls to supervisors."""

    def __init__(self, employee_id, name, call_center=None):
        """Create an operator.

        Args:
            employee_id: Identifier stored on the employee.
            name: Display name stored on the employee.
            call_center: Object notified when calls are completed or
                escalated. Optional so the two-argument form keeps
                working, but it must be set before ``complete_call`` or
                ``escalate_call`` is used.
        """
        # Employee.__init__ requires the call center as its fourth argument.
        super().__init__(employee_id, name, Rank.OPERATOR, call_center)

    def escalate_call(self):
        """Raise the call's rank to SUPERVISOR and release it."""
        # ``Call`` exposes ``rank`` and the dispatcher routes on it.
        self.call.rank = Rank.SUPERVISOR
        self._escalate_call()
51 |
52 |
class Supervisor(Employee):
    """Employee with ``Rank.SUPERVISOR``; escalates calls to directors."""

    def __init__(self, employee_id, name, call_center=None):
        """Create a supervisor.

        Args:
            employee_id: Identifier stored on the employee.
            name: Display name stored on the employee.
            call_center: Object notified when calls are completed or
                escalated. Optional so the two-argument form keeps
                working, but it must be set before ``complete_call`` or
                ``escalate_call`` is used.
        """
        # Zero-argument super() resolves against this class; naming
        # Operator here raised TypeError because a Supervisor is not an
        # Operator instance.
        super().__init__(employee_id, name, Rank.SUPERVISOR, call_center)

    def escalate_call(self):
        """Raise the call's rank to DIRECTOR and release it."""
        # ``Call`` exposes ``rank`` and the dispatcher routes on it.
        self.call.rank = Rank.DIRECTOR
        self._escalate_call()
61 |
62 |
class Director(Employee):
    """Employee with ``Rank.DIRECTOR``; the top of the escalation chain."""

    def __init__(self, employee_id, name, call_center=None):
        """Create a director.

        Args:
            employee_id: Identifier stored on the employee.
            name: Display name stored on the employee.
            call_center: Object notified when calls are completed.
                Optional so the two-argument form keeps working, but it
                must be set before ``complete_call`` is used.
        """
        # Zero-argument super() resolves against this class; naming
        # Operator here raised TypeError because a Director is not an
        # Operator instance.
        super().__init__(employee_id, name, Rank.DIRECTOR, call_center)

    def escalate_call(self):
        """Always fail: there is no level above a director.

        Raises:
            NotImplementedError: Unconditionally.
        """
        raise NotImplementedError('Directors must be able to handle any call')
70 |
71 |
class CallState(Enum):
    """Lifecycle states a call can be in."""

    READY = 0        # not currently held by an employee
    IN_PROGRESS = 1  # taken by an employee
    COMPLETE = 2     # finished by an employee
77 |
78 |
class Call:
    """A single incoming call tracked by the call center."""

    def __init__(self, rank):
        """Create a READY call that needs at least ``rank`` to handle it."""
        self.rank = rank
        # No employee owns the call until one takes it.
        self.employee = None
        self.state = CallState.READY
85 |
86 |
class CallCenter(object):
    """Routes calls to free employees, falling back to a FIFO queue."""

    def __init__(self, operators, supervisors, directors):
        """Store the staff lists and start with an empty call queue.

        Args:
            operators: Iterable of operator employees.
            supervisors: Iterable of supervisor employees.
            directors: Iterable of director employees.
        """
        self.operators = operators
        self.supervisors = supervisors
        self.directors = directors
        self.queued_calls = deque()

    def dispatch_call(self, call):
        """Give ``call`` to the lowest-ranked free employee allowed to take it.

        The search starts at the call's own rank and only moves upwards,
        so a DIRECTOR-rank call is never handed to a supervisor. If
        nobody is free the call is appended to ``queued_calls``.

        Raises:
            ValueError: If ``call.rank`` is not a known ``Rank``.
        """
        # Ordered lowest to highest so slicing yields "this rank and above".
        levels = (
            (Rank.OPERATOR, self.operators),
            (Rank.SUPERVISOR, self.supervisors),
            (Rank.DIRECTOR, self.directors),
        )
        ranks = [rank for rank, _ in levels]
        if call.rank not in ranks:
            raise ValueError('Invalid call rank: {}'.format(call.rank))
        for _, employees in levels[ranks.index(call.rank):]:
            if self._dispatch_call(call, employees) is not None:
                return
        self.queued_calls.append(call)

    def _dispatch_call(self, call, employees):
        """Hand ``call`` to the first free employee in ``employees``.

        Returns:
            The employee who took the call, or ``None`` if all are busy.
        """
        for employee in employees:
            if employee.call is None:
                employee.take_call(call)
                return employee
        return None

    def notify_call_escalated(self, call):
        """Hook called after an employee escalates ``call`` (placeholder)."""
        pass

    def notify_call_completed(self, call):
        """Hook called after an employee completes ``call`` (placeholder)."""
        pass

    def dispatch_queued_call_to_newly_freed_employee(self, call, employee):
        """Placeholder; not implemented."""
        pass
123 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/deck_of_cards/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/deck_of_cards/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/deck_of_cards/deck_of_cards.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design a deck of cards"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* Is this a generic deck of cards for games like poker and black jack?\n",
24 | " * Yes, design a generic deck then extend it to black jack\n",
25 | "* Can we assume the deck has 52 cards (2-10, Jack, Queen, King, Ace) and 4 suits?\n",
26 | " * Yes\n",
27 | "* Can we assume inputs are valid or do we have to validate them?\n",
28 | " * Assume they're valid"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Solution"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "Overwriting deck_of_cards.py\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "%%writefile deck_of_cards.py\n",
55 | "from abc import ABCMeta, abstractmethod\n",
56 | "from enum import Enum\n",
57 | "import sys\n",
58 | "\n",
59 | "\n",
60 | "class Suit(Enum):\n",
61 | "\n",
62 | " HEART = 0\n",
63 | " DIAMOND = 1\n",
64 | " CLUBS = 2\n",
65 | " SPADE = 3\n",
66 | "\n",
67 | "\n",
68 | "class Card(metaclass=ABCMeta):\n",
69 | "\n",
70 | " def __init__(self, value, suit):\n",
71 | " self.value = value\n",
72 | " self.suit = suit\n",
73 | " self.is_available = True\n",
74 | "\n",
75 | " @property\n",
76 | " @abstractmethod\n",
77 | " def value(self):\n",
78 | " pass\n",
79 | "\n",
80 | " @value.setter\n",
81 | " @abstractmethod\n",
82 | " def value(self, other):\n",
83 | " pass\n",
84 | "\n",
85 | "\n",
86 | "class BlackJackCard(Card):\n",
87 | "\n",
88 | " def __init__(self, value, suit):\n",
89 | " super(BlackJackCard, self).__init__(value, suit)\n",
90 | "\n",
91 | " def is_ace(self):\n",
92 | " return self._value == 1\n",
93 | "\n",
94 | " def is_face_card(self):\n",
95 | " \"\"\"Jack = 11, Queen = 12, King = 13\"\"\"\n",
96 | " return 10 < self._value <= 13\n",
97 | "\n",
98 | " @property\n",
99 | " def value(self):\n",
100 | " if self.is_ace() == 1:\n",
101 | " return 1\n",
102 | " elif self.is_face_card():\n",
103 | " return 10\n",
104 | " else:\n",
105 | " return self._value\n",
106 | "\n",
107 | " @value.setter\n",
108 | " def value(self, new_value):\n",
109 | " if 1 <= new_value <= 13:\n",
110 | " self._value = new_value\n",
111 | " else:\n",
112 | " raise ValueError('Invalid card value: {}'.format(new_value))\n",
113 | "\n",
114 | "\n",
115 | "class Hand(object):\n",
116 | "\n",
117 | " def __init__(self, cards):\n",
118 | " self.cards = cards\n",
119 | "\n",
120 | " def add_card(self, card):\n",
121 | " self.cards.append(card)\n",
122 | "\n",
123 | " def score(self):\n",
124 | " total_value = 0\n",
125 | " for card in self.cards:\n",
126 | " total_value += card.value\n",
127 | " return total_value\n",
128 | "\n",
129 | "\n",
130 | "class BlackJackHand(Hand):\n",
131 | "\n",
132 | " BLACKJACK = 21\n",
133 | "\n",
134 | " def __init__(self, cards):\n",
135 | " super(BlackJackHand, self).__init__(cards)\n",
136 | "\n",
137 | " def score(self):\n",
138 | " min_over = sys.MAXSIZE\n",
139 | " max_under = -sys.MAXSIZE\n",
140 | " for score in self.possible_scores():\n",
141 | " if self.BLACKJACK < score < min_over:\n",
142 | " min_over = score\n",
143 | " elif max_under < score <= self.BLACKJACK:\n",
144 | " max_under = score\n",
145 | " return max_under if max_under != -sys.MAXSIZE else min_over\n",
146 | "\n",
147 | " def possible_scores(self):\n",
148 | " \"\"\"Return a list of possible scores, taking Aces into account.\"\"\"\n",
149 | " # ...\n",
150 | "\n",
151 | "\n",
152 | "class Deck(object):\n",
153 | "\n",
154 | " def __init__(self, cards):\n",
155 | " self.cards = cards\n",
156 | " self.deal_index = 0\n",
157 | "\n",
158 | " def remaining_cards(self):\n",
159 | " return len(self.cards) - deal_index\n",
160 | "\n",
161 | " def deal_card():\n",
162 | " try:\n",
163 | " card = self.cards[self.deal_index]\n",
164 | " card.is_available = False\n",
165 | " self.deal_index += 1\n",
166 | " except IndexError:\n",
167 | " return None\n",
168 | " return card\n",
169 | "\n",
170 | " def shuffle(self): # ..."
171 | ]
172 | }
173 | ],
174 | "metadata": {
175 | "kernelspec": {
176 | "display_name": "Python 3",
177 | "language": "python",
178 | "name": "python3"
179 | },
180 | "language_info": {
181 | "codemirror_mode": {
182 | "name": "ipython",
183 | "version": 3
184 | },
185 | "file_extension": ".py",
186 | "mimetype": "text/x-python",
187 | "name": "python",
188 | "nbconvert_exporter": "python",
189 | "pygments_lexer": "ipython3",
190 | "version": "3.4.3"
191 | }
192 | },
193 | "nbformat": 4,
194 | "nbformat_minor": 0
195 | }
196 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/deck_of_cards/deck_of_cards.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 | from enum import Enum
3 | import sys
4 |
5 |
class Suit(Enum):
    """The four suits of a standard deck."""

    # Member names are kept exactly as declared (note the plural CLUBS).
    HEART = 0
    DIAMOND = 1
    CLUBS = 2
    SPADE = 3
12 |
13 |
class Card(metaclass=ABCMeta):
    """Abstract playing card; subclasses decide how ``value`` behaves."""

    def __init__(self, value, suit):
        """Create an available card.

        ``value`` is assigned first and goes through the subclass's
        ``value`` setter.
        """
        self.value = value
        self.is_available = True
        self.suit = suit

    @property
    @abstractmethod
    def value(self):
        """Return the card's value (defined by subclasses)."""

    @value.setter
    @abstractmethod
    def value(self, other):
        """Store ``other`` as the card's value (defined by subclasses)."""
30 |
31 |
class BlackJackCard(Card):
    """Card whose ``value`` follows blackjack scoring.

    The raw rank (1-13) is kept in ``_value``; ``value`` reports the
    points it is worth.
    """

    def __init__(self, value, suit):
        """Create a card; ``value`` must be a rank in 1..13."""
        super().__init__(value, suit)

    def is_ace(self):
        """Return True when the stored rank is 1."""
        return self._value == 1

    def is_face_card(self):
        """Return True for Jack = 11, Queen = 12, King = 13."""
        return 10 < self._value <= 13

    @property
    def value(self):
        """Return 10 for face cards, otherwise the stored rank.

        An ace therefore reports 1, its stored rank.
        """
        return 10 if self.is_face_card() else self._value

    @value.setter
    def value(self, new_value):
        """Store ``new_value`` as the rank.

        Raises:
            ValueError: If ``new_value`` is outside 1..13.
        """
        if not 1 <= new_value <= 13:
            raise ValueError('Invalid card value: {}'.format(new_value))
        self._value = new_value
59 |
60 |
class Hand:
    """A collection of cards held by one player."""

    def __init__(self, cards):
        """Wrap ``cards``; the list is stored as-is, not copied."""
        self.cards = cards

    def add_card(self, card):
        """Append ``card`` to the hand."""
        self.cards.append(card)

    def score(self):
        """Return the sum of every card's ``value``."""
        return sum(held.value for held in self.cards)
74 |
75 |
class BlackJackHand(Hand):
    """Hand scored by blackjack rules, where an ace counts as 1 or 11."""

    BLACKJACK = 21

    def __init__(self, cards):
        """Wrap ``cards`` exactly as ``Hand`` does."""
        super().__init__(cards)

    def score(self):
        """Return the best score for the hand.

        The best score is the highest possible score that does not
        exceed ``BLACKJACK``; if every possibility busts, the lowest
        bust is returned instead.
        """
        scores = self.possible_scores()
        within = [score for score in scores if score <= self.BLACKJACK]
        if within:
            return max(within)
        # Falls back to sys.maxsize when there are no scores at all,
        # mirroring the sentinel the original loop was written around.
        return min(scores, default=sys.maxsize)

    def possible_scores(self):
        """Return a list of possible scores, taking Aces into account.

        Cards report an ace as 1 through ``value``; each ace may instead
        count as 11, which adds 10 to the total.
        """
        base = sum(card.value for card in self.cards)
        aces = sum(1 for card in self.cards if card.is_ace())
        return [base + 10 * promoted for promoted in range(aces + 1)]
96 |
97 |
class Deck:
    """Ordered stack of cards dealt from the front via a moving index."""

    def __init__(self, cards):
        """Wrap ``cards``; nothing has been dealt yet."""
        self.deal_index = 0
        self.cards = cards

    def remaining_cards(self):
        """Return how many cards have not been dealt yet."""
        total = len(self.cards)
        return total - self.deal_index

    def deal_card(self):
        """Deal the next card, or return ``None`` when none are left.

        A dealt card is flagged as unavailable and the deal index moves
        forward by one.
        """
        try:
            dealt = self.cards[self.deal_index]
        except IndexError:
            return None
        dealt.is_available = False
        self.deal_index += 1
        return dealt

    def shuffle(self):
        """Placeholder; shuffling is not implemented."""
        return None
118 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/hash_table/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/hash_table/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/hash_table/hash_map.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design a hash map"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* For simplicity, are the keys integers only?\n",
24 | " * Yes\n",
25 | "* For collision resolution, can we use chaining?\n",
26 | " * Yes\n",
27 | "* Do we have to worry about load factors?\n",
28 | " * No\n",
29 | "* Can we assume inputs are valid or do we have to validate them?\n",
30 | " * Assume they're valid\n",
31 | "* Can we assume this fits in memory?\n",
32 | " * Yes"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Solution"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 1,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Overwriting hash_map.py\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "%%writefile hash_map.py\n",
59 | "class Item(object):\n",
60 | "\n",
61 | " def __init__(self, key, value):\n",
62 | " self.key = key\n",
63 | " self.value = value\n",
64 | "\n",
65 | "\n",
66 | "class HashTable(object):\n",
67 | "\n",
68 | " def __init__(self, size):\n",
69 | " self.size = size\n",
70 | " self.table = [[] for _ in range(self.size)]\n",
71 | "\n",
72 | " def _hash_function(self, key):\n",
73 | " return key % self.size\n",
74 | "\n",
75 | " def set(self, key, value):\n",
76 | " hash_index = self._hash_function(key)\n",
77 | " for item in self.table[hash_index]:\n",
78 | " if item.key == key:\n",
79 | " item.value = value\n",
80 | " return\n",
81 | " self.table[hash_index].append(Item(key, value))\n",
82 | "\n",
83 | " def get(self, key):\n",
84 | " hash_index = self._hash_function(key)\n",
85 | " for item in self.table[hash_index]:\n",
86 | " if item.key == key:\n",
87 | " return item.value\n",
88 | " raise KeyError('Key not found')\n",
89 | "\n",
90 | " def remove(self, key):\n",
91 | " hash_index = self._hash_function(key)\n",
92 | " for index, item in enumerate(self.table[hash_index]):\n",
93 | " if item.key == key:\n",
94 | " del self.table[hash_index][index]\n",
95 | " return\n",
96 | " raise KeyError('Key not found')"
97 | ]
98 | }
99 | ],
100 | "metadata": {
101 | "kernelspec": {
102 | "display_name": "Python 3",
103 | "language": "python",
104 | "name": "python3"
105 | },
106 | "language_info": {
107 | "codemirror_mode": {
108 | "name": "ipython",
109 | "version": 3
110 | },
111 | "file_extension": ".py",
112 | "mimetype": "text/x-python",
113 | "name": "python",
114 | "nbconvert_exporter": "python",
115 | "pygments_lexer": "ipython3",
116 | "version": "3.4.3"
117 | }
118 | },
119 | "nbformat": 4,
120 | "nbformat_minor": 0
121 | }
122 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/hash_table/hash_map.py:
--------------------------------------------------------------------------------
class Item:
    """Key/value pair stored inside a hash-table bucket."""

    def __init__(self, key, value):
        """Remember ``key`` and its associated ``value``."""
        self.key, self.value = key, value
6 |
7 |
class HashTable:
    """Fixed-size hash table that resolves collisions by chaining."""

    def __init__(self, size):
        """Create ``size`` empty buckets."""
        self.size = size
        self.table = [list() for _ in range(self.size)]

    def _hash_function(self, key):
        """Map ``key`` to a bucket index with a modulo."""
        return key % self.size

    def set(self, key, value):
        """Insert ``key`` or overwrite the value it already maps to."""
        bucket = self.table[self._hash_function(key)]
        existing = next((entry for entry in bucket if entry.key == key), None)
        if existing is None:
            bucket.append(Item(key, value))
        else:
            existing.value = value

    def get(self, key):
        """Return the value stored for ``key``.

        Raises:
            KeyError: If ``key`` is not present.
        """
        bucket = self.table[self._hash_function(key)]
        match = next((entry for entry in bucket if entry.key == key), None)
        if match is None:
            raise KeyError('Key not found')
        return match.value

    def remove(self, key):
        """Delete the entry stored for ``key``.

        Raises:
            KeyError: If ``key`` is not present.
        """
        bucket = self.table[self._hash_function(key)]
        position = next(
            (slot for slot, entry in enumerate(bucket) if entry.key == key),
            None,
        )
        if position is None:
            raise KeyError('Key not found')
        del bucket[position]
39 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/lru_cache/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/lru_cache/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/lru_cache/lru_cache.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design an LRU cache"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* What are we caching?\n",
24 | " * We are caching the results of web queries\n",
25 | "* Can we assume inputs are valid or do we have to validate them?\n",
26 | " * Assume they're valid\n",
27 | "* Can we assume this fits in memory?\n",
28 | " * Yes"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "## Solution"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 1,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "Overwriting lru_cache.py\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "%%writefile lru_cache.py\n",
55 | "class Node(object):\n",
56 | "\n",
57 | " def __init__(self, results):\n",
58 | " self.results = results\n",
59 | " self.prev = None\n",
60 | " self.next = None\n",
61 | "\n",
62 | "\n",
63 | "class LinkedList(object):\n",
64 | "\n",
65 | " def __init__(self):\n",
66 | " self.head = None\n",
67 | " self.tail = None\n",
68 | "\n",
69 | " def move_to_front(self, node): # ...\n",
70 | " def append_to_front(self, node): # ...\n",
71 | " def remove_from_tail(self): # ...\n",
72 | "\n",
73 | "\n",
74 | "class Cache(object):\n",
75 | "\n",
76 | " def __init__(self, MAX_SIZE):\n",
77 | " self.MAX_SIZE = MAX_SIZE\n",
78 | " self.size = 0\n",
79 | " self.lookup = {} # key: query, value: node\n",
80 | " self.linked_list = LinkedList()\n",
81 | "\n",
82 | " def get(self, query)\n",
83 | " \"\"\"Get the stored query result from the cache.\n",
84 | " \n",
85 | " Accessing a node updates its position to the front of the LRU list.\n",
86 | " \"\"\"\n",
87 | " node = self.lookup.get(query)\n",
88 | " if node is None:\n",
89 | " return None\n",
90 | " self.linked_list.move_to_front(node)\n",
91 | " return node.results\n",
92 | "\n",
93 | " def set(self, results, query):\n",
94 | " \"\"\"Set the result for the given query key in the cache.\n",
95 | " \n",
96 | " When updating an entry, updates its position to the front of the LRU list.\n",
97 | " If the entry is new and the cache is at capacity, removes the oldest entry\n",
98 | " before the new entry is added.\n",
99 | " \"\"\"\n",
100 | " node = self.lookup.get(query)\n",
101 | " if node is not None:\n",
102 | " # Key exists in cache, update the value\n",
103 | " node.results = results\n",
104 | " self.linked_list.move_to_front(node)\n",
105 | " else:\n",
106 | " # Key does not exist in cache\n",
107 | " if self.size == self.MAX_SIZE:\n",
108 | " # Remove the oldest entry from the linked list and lookup\n",
109 | " self.lookup.pop(self.linked_list.tail.query, None)\n",
110 | " self.linked_list.remove_from_tail()\n",
111 | " else:\n",
112 | " self.size += 1\n",
113 | " # Add the new key and value\n",
114 | " new_node = Node(results)\n",
115 | " self.linked_list.append_to_front(new_node)\n",
116 | " self.lookup[query] = new_node"
117 | ]
118 | }
119 | ],
120 | "metadata": {
121 | "kernelspec": {
122 | "display_name": "Python 3",
123 | "language": "python",
124 | "name": "python3"
125 | },
126 | "language_info": {
127 | "codemirror_mode": {
128 | "name": "ipython",
129 | "version": 3
130 | },
131 | "file_extension": ".py",
132 | "mimetype": "text/x-python",
133 | "name": "python",
134 | "nbconvert_exporter": "python",
135 | "pygments_lexer": "ipython3",
136 | "version": "3.4.3"
137 | }
138 | },
139 | "nbformat": 4,
140 | "nbformat_minor": 0
141 | }
142 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/lru_cache/lru_cache.py:
--------------------------------------------------------------------------------
class Node(object):
    """Doubly linked list node holding one cached query result."""

    def __init__(self, results, query=None):
        """Create an unlinked node.

        Args:
            results: The cached value.
            query: The cache key this node belongs to; kept on the node
                so an evicted node can be removed from a lookup table.
                Optional so existing ``Node(results)`` calls keep working.
        """
        self.results = results
        self.query = query
        # Neighbours start empty; previously ``next`` was bound to the
        # builtin ``next`` function and ``prev`` did not exist.
        self.prev = None
        self.next = None
6 |
7 |
class LinkedList(object):
    """Doubly linked list ordered from most recent (head) to oldest (tail).

    Nodes only need writable ``prev`` and ``next`` attributes; both are
    (re)assigned whenever a node is linked or unlinked.
    """

    def __init__(self):
        """Create an empty list."""
        self.head = None
        self.tail = None

    def move_to_front(self, node):
        """Make ``node``, which must already be in the list, the head."""
        if node is self.head:
            return
        self._unlink(node)
        self.append_to_front(node)

    def append_to_front(self, node):
        """Insert ``node``, which must not be in the list, as the head."""
        node.prev = None
        node.next = self.head
        if self.head is None:
            # First node: it is both head and tail.
            self.tail = node
        else:
            self.head.prev = node
        self.head = node

    def remove_from_tail(self):
        """Detach and return the tail node, or ``None`` if the list is empty."""
        oldest = self.tail
        if oldest is None:
            return None
        self._unlink(oldest)
        return oldest

    def _unlink(self, node):
        """Detach ``node`` from its neighbours, fixing head/tail as needed."""
        before, after = node.prev, node.next
        if before is None:
            self.head = after
        else:
            before.next = after
        if after is None:
            self.tail = before
        else:
            after.prev = before
        node.prev = None
        node.next = None
22 |
23 |
class Cache(object):
    """LRU cache mapping queries to results.

    ``lookup`` gives direct access to nodes by query, while
    ``linked_list`` keeps them in recency order so the oldest entry can
    be evicted.
    """

    def __init__(self, MAX_SIZE):
        """Create an empty cache that holds at most ``MAX_SIZE`` entries."""
        self.MAX_SIZE = MAX_SIZE
        self.size = 0
        self.lookup = {}  # key: query, value: node
        self.linked_list = LinkedList()

    def get(self, query):
        """Get the stored query result from the cache.

        Accessing a node updates its position to the front of the LRU list.

        Returns:
            The cached results, or ``None`` when ``query`` is not cached.
        """
        node = self.lookup.get(query)
        if node is None:
            return None
        self.linked_list.move_to_front(node)
        return node.results

    def set(self, results, query):
        """Set the result for the given query key in the cache.

        When updating an entry, updates its position to the front of the LRU
        list. If the entry is new and the cache is at capacity, removes the
        oldest entry before the new entry is added.
        """
        node = self.lookup.get(query)
        if node is not None:
            # Key exists in cache, update the value
            node.results = results
            self.linked_list.move_to_front(node)
            return
        # Key does not exist in cache
        if self.size == self.MAX_SIZE:
            # Remove the oldest entry from the linked list and lookup.
            # The tail may be missing (e.g. MAX_SIZE of 0), so guard it.
            oldest = self.linked_list.tail
            if oldest is not None:
                self.lookup.pop(oldest.query, None)
            self.linked_list.remove_from_tail()
        else:
            self.size += 1
        # Add the new key and value
        new_node = Node(results)
        # Eviction looks the node's key up through ``query``, so record
        # it on the node itself.
        new_node.query = query
        self.linked_list.append_to_front(new_node)
        self.lookup[query] = new_node
67 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/online_chat/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/online_chat/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/online_chat/online_chat.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design an online chat"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* Assume we'll focus on the following workflows:\n",
24 | " * Text conversations only\n",
25 | " * Users\n",
26 | " * Add a user\n",
27 | " * Remove a user\n",
28 | " * Update a user\n",
29 | " * Add to a user's friends list\n",
30 | " * Add friend request\n",
31 | " * Approve friend request\n",
32 | " * Reject friend request\n",
33 | " * Remove from a user's friends list\n",
34 | " * Create a group chat\n",
35 | " * Invite friends to a group chat\n",
36 | " * Post a message to a group chat\n",
37 | " * Private 1-1 chat\n",
38 | " * Invite a friend to a private chat\n",
39 | " * Post a message to a private chat\n",
40 | "* No need to worry about scaling initially"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Solution"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 1,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "Overwriting online_chat.py\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "%%writefile online_chat.py\n",
67 | "from abc import ABCMeta\n",
68 | "\n",
69 | "\n",
70 | "class UserService(object):\n",
71 | "\n",
72 | " def __init__(self):\n",
73 | " self.users_by_id = {} # key: user id, value: User\n",
74 | "\n",
75 | " def add_user(self, user_id, name, pass_hash): # ...\n",
76 | " def remove_user(self, user_id): # ...\n",
77 | " def add_friend_request(self, from_user_id, to_user_id): # ...\n",
78 | " def approve_friend_request(self, from_user_id, to_user_id): # ...\n",
79 | " def reject_friend_request(self, from_user_id, to_user_id): # ...\n",
80 | "\n",
81 | "\n",
82 | "class User(object):\n",
83 | "\n",
84 | " def __init__(self, user_id, name, pass_hash):\n",
85 | " self.user_id = user_id\n",
86 | " self.name = name\n",
87 | " self.pass_hash = pass_hash\n",
88 | " self.friends_by_id = {} # key: friend id, value: User\n",
89 | " self.friend_ids_to_private_chats = {} # key: friend id, value: private chats\n",
90 | " self.group_chats_by_id = {} # key: chat id, value: GroupChat\n",
91 | " self.received_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest\n",
92 | " self.sent_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest\n",
93 | "\n",
94 | " def message_user(self, friend_id, message): # ...\n",
95 | " def message_group(self, group_id, message): # ...\n",
96 | " def send_friend_request(self, friend_id): # ...\n",
97 | " def receive_friend_request(self, friend_id): # ...\n",
98 | " def approve_friend_request(self, friend_id): # ...\n",
99 | " def reject_friend_request(self, friend_id): # ...\n",
100 | "\n",
101 | "\n",
102 | "class Chat(metaclass=ABCMeta):\n",
103 | "\n",
104 | " def __init__(self, chat_id):\n",
105 | " self.chat_id = chat_id\n",
106 | " self.users = []\n",
107 | " self.messages = []\n",
108 | "\n",
109 | "\n",
110 | "class PrivateChat(Chat):\n",
111 | "\n",
112 | " def __init__(self, first_user, second_user):\n",
113 | " super(PrivateChat, self).__init__()\n",
114 | " self.users.append(first_user)\n",
115 | " self.users.append(second_user)\n",
116 | "\n",
117 | "\n",
118 | "class GroupChat(Chat):\n",
119 | "\n",
120 | " def add_user(self, user): # ...\n",
121 | " def remove_user(self, user): # ... \n",
122 | "\n",
123 | "\n",
124 | "class Message(object):\n",
125 | "\n",
126 | " def __init__(self, message_id, message, timestamp):\n",
127 | " self.message_id = message_id\n",
128 | " self.message = message\n",
129 | " self.timestamp = timestamp\n",
130 | "\n",
131 | "\n",
132 | "class AddRequest(object):\n",
133 | "\n",
134 | " def __init__(self, from_user_id, to_user_id, request_status, timestamp):\n",
135 | " self.from_user_id = from_user_id\n",
136 | " self.to_user_id = to_user_id\n",
137 | " self.request_status = request_status\n",
138 | " self.timestamp = timestamp\n",
139 | "\n",
140 | "\n",
141 | "class RequestStatus(Enum):\n",
142 | "\n",
143 | " UNREAD = 0\n",
144 | " READ = 1\n",
145 | " ACCEPTED = 2\n",
146 | " REJECTED = 3"
147 | ]
148 | }
149 | ],
150 | "metadata": {
151 | "kernelspec": {
152 | "display_name": "Python 3",
153 | "language": "python",
154 | "name": "python3"
155 | },
156 | "language_info": {
157 | "codemirror_mode": {
158 | "name": "ipython",
159 | "version": 3
160 | },
161 | "file_extension": ".py",
162 | "mimetype": "text/x-python",
163 | "name": "python",
164 | "nbconvert_exporter": "python",
165 | "pygments_lexer": "ipython3",
166 | "version": "3.4.3"
167 | }
168 | },
169 | "nbformat": 4,
170 | "nbformat_minor": 0
171 | }
172 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/online_chat/online_chat.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta
2 | from enum import Enum
3 |
4 |
class UserService(object):
    """Holds users keyed by user id and declares the user-level operations.

    Every operation below is a placeholder: it accepts its arguments and
    returns None without touching any state.
    """

    def __init__(self):
        # key: user id, value: User
        self.users_by_id = dict()

    def add_user(self, user_id, name, pass_hash):
        """Placeholder for registering a user; currently does nothing."""
        return None

    def remove_user(self, user_id):
        """Placeholder for removing a user; currently does nothing."""
        return None

    def add_friend_request(self, from_user_id, to_user_id):
        """Placeholder for creating a friend request; currently does nothing."""
        return None

    def approve_friend_request(self, from_user_id, to_user_id):
        """Placeholder for approving a friend request; currently does nothing."""
        return None

    def reject_friend_request(self, from_user_id, to_user_id):
        """Placeholder for rejecting a friend request; currently does nothing."""
        return None
24 |
25 |
class User(object):
    """A chat user together with their friends, chats and friend requests.

    The messaging and friend-request methods are placeholders that return
    None without modifying the user.
    """

    def __init__(self, user_id, name, pass_hash):
        self.user_id, self.name, self.pass_hash = user_id, name, pass_hash
        # key: friend id, value: User
        self.friends_by_id = dict()
        # key: friend id, value: private chats
        self.friend_ids_to_private_chats = dict()
        # key: chat id, value: GroupChat
        self.group_chats_by_id = dict()
        # key: friend id, value: AddRequest
        self.received_friend_requests_by_friend_id = dict()
        # key: friend id, value: AddRequest
        self.sent_friend_requests_by_friend_id = dict()

    def message_user(self, friend_id, message):
        """Placeholder for messaging a friend; currently does nothing."""
        return None

    def message_group(self, group_id, message):
        """Placeholder for messaging a group; currently does nothing."""
        return None

    def send_friend_request(self, friend_id):
        """Placeholder for sending a friend request; currently does nothing."""
        return None

    def receive_friend_request(self, friend_id):
        """Placeholder for receiving a friend request; currently does nothing."""
        return None

    def approve_friend_request(self, friend_id):
        """Placeholder for approving a friend request; currently does nothing."""
        return None

    def reject_friend_request(self, friend_id):
        """Placeholder for rejecting a friend request; currently does nothing."""
        return None
55 |
56 |
class Chat(metaclass=ABCMeta):
    """Base class for a conversation.

    Args:
        chat_id: Identifier of the chat.
    """

    def __init__(self, chat_id):
        self.chat_id = chat_id
        self.users = []
        self.messages = []


class PrivateChat(Chat):
    """A chat between exactly two users.

    Args:
        first_user: First participant.
        second_user: Second participant.
        chat_id: Identifier forwarded to ``Chat``.  Defaults to None so that
            the existing two-argument call keeps working; previously the
            base initializer was called without its required ``chat_id``
            and construction raised TypeError.
    """

    def __init__(self, first_user, second_user, chat_id=None):
        super(PrivateChat, self).__init__(chat_id)
        self.users.append(first_user)
        self.users.append(second_user)
71 |
72 |
class GroupChat(Chat):
    """A chat whose membership can change.

    Both membership methods are placeholders that return None.
    """

    def add_user(self, user):
        """Placeholder for adding ``user`` to the chat; currently does nothing."""
        return None

    def remove_user(self, user):
        """Placeholder for removing ``user`` from the chat; currently does nothing."""
        return None
80 |
81 |
class Message(object):
    """Plain record for one chat message: id, text and timestamp."""

    def __init__(self, message_id, message, timestamp):
        self.message_id, self.message, self.timestamp = (
            message_id, message, timestamp)
88 |
89 |
class AddRequest(object):
    """Plain record for a friend request between two user ids."""

    def __init__(self, from_user_id, to_user_id, request_status, timestamp):
        self.from_user_id, self.to_user_id = from_user_id, to_user_id
        self.request_status, self.timestamp = request_status, timestamp
97 |
98 |
class RequestStatus(Enum):
    """Possible states of a friend request, numbered from 0 in order."""

    UNREAD, READ, ACCEPTED, REJECTED = range(4)
105 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/parking_lot/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/object_oriented_design/parking_lot/__init__.py
--------------------------------------------------------------------------------
/solutions/object_oriented_design/parking_lot/parking_lot.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Design a parking lot"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Constraints and assumptions\n",
22 | "\n",
23 | "* What types of vehicles should we support?\n",
24 | " * Motorcycle, Car, Bus\n",
25 | "* Does each vehicle type take up a different amount of parking spots?\n",
26 | " * Yes\n",
27 | " * Motorcycle spot -> Motorcycle\n",
28 | " * Compact spot -> Motorcycle, Car\n",
29 | " * Large spot -> Motorcycle, Car\n",
30 | " * Bus can park if we have 5 consecutive \"large\" spots\n",
31 | "* Does the parking lot have multiple levels?\n",
32 | " * Yes"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## Solution"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 1,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Overwriting parking_lot.py\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "%%writefile parking_lot.py\n",
59 | "from abc import ABCMeta, abstractmethod\n",
60 | "\n",
61 | "\n",
62 | "class VehicleSize(Enum):\n",
63 | "\n",
64 | " MOTORCYCLE = 0\n",
65 | " COMPACT = 1\n",
66 | " LARGE = 2\n",
67 | "\n",
68 | "\n",
69 | "class Vehicle(metaclass=ABCMeta):\n",
70 | "\n",
71 | " def __init__(self, vehicle_size, license_plate, spot_size):\n",
72 | " self.vehicle_size = vehicle_size\n",
73 | " self.license_plate = license_plate\n",
74 | " self.spot_size = spot_size\n",
75 | " self.spots_taken = []\n",
76 | "\n",
77 | " def clear_spots(self):\n",
78 | " for spot in self.spots_taken:\n",
79 | " spot.remove_vehicle(self)\n",
80 | " self.spots_taken = []\n",
81 | "\n",
82 | " def take_spot(self, spot):\n",
83 | " self.spots_taken.append(spot)\n",
84 | "\n",
85 | " @abstractmethod\n",
86 | " def can_fit_in_spot(self, spot):\n",
87 | " pass\n",
88 | "\n",
89 | "\n",
90 | "class Motorcycle(Vehicle):\n",
91 | "\n",
92 | " def __init__(self, license_plate):\n",
93 | " super(Motorcycle, self).__init__(VehicleSize.MOTORCYCLE, license_plate, spot_size=1)\n",
94 | "\n",
95 | " def can_fit_in_spot(self, spot):\n",
96 | " return True\n",
97 | "\n",
98 | "\n",
99 | "class Car(Vehicle):\n",
100 | "\n",
101 | " def __init__(self, license_plate):\n",
102 | " super(Car, self).__init__(VehicleSize.COMPACT, license_plate, spot_size=1)\n",
103 | "\n",
104 | " def can_fit_in_spot(self, spot):\n",
105 | " return True if (spot.size == LARGE or spot.size == COMPACT) else False\n",
106 | "\n",
107 | "\n",
108 | "class Bus(Vehicle):\n",
109 | "\n",
110 | " def __init__(self, license_plate):\n",
111 | " super(Bus, self).__init__(VehicleSize.LARGE, license_plate, spot_size=5)\n",
112 | "\n",
113 | " def can_fit_in_spot(self, spot):\n",
114 | " return True if spot.size == LARGE else False\n",
115 | "\n",
116 | "\n",
117 | "class ParkingLot(object):\n",
118 | "\n",
119 | " def __init__(self, num_levels):\n",
120 | " self.num_levels = num_levels\n",
121 | " self.levels = []\n",
122 | "\n",
123 | " def park_vehicle(self, vehicle):\n",
124 | " for level in levels:\n",
125 | " if level.park_vehicle(vehicle):\n",
126 | " return True\n",
127 | " return False\n",
128 | "\n",
129 | "\n",
130 | "class Level(object):\n",
131 | "\n",
132 | " SPOTS_PER_ROW = 10\n",
133 | "\n",
134 | " def __init__(self, floor, total_spots):\n",
135 | " self.floor = floor\n",
136 | " self.num_spots = total_spots\n",
137 | " self.available_spots = 0\n",
138 | " self.parking_spots = []\n",
139 | "\n",
140 | " def spot_freed(self):\n",
141 | " self.available_spots += 1\n",
142 | "\n",
143 | " def park_vehicle(self, vehicle):\n",
144 | " spot = self._find_available_spot(vehicle)\n",
145 | " if spot is None:\n",
146 | " return None\n",
147 | " else:\n",
148 | " spot.park_vehicle(vehicle)\n",
149 | " return spot\n",
150 | "\n",
151 | " def _find_available_spot(self, vehicle):\n",
152 | " \"\"\"Find an available spot where vehicle can fit, or return None\"\"\"\n",
153 | " # ...\n",
154 | "\n",
155 | " def _park_starting_at_spot(self, spot, vehicle):\n",
156 | " \"\"\"Occupy starting at spot.spot_number to vehicle.spot_size.\"\"\"\n",
157 | " # ...\n",
158 | "\n",
159 | "\n",
160 | "class ParkingSpot(object):\n",
161 | "\n",
162 | " def __init__(self, level, row, spot_number, spot_size, vehicle_size):\n",
163 | " self.level = level\n",
164 | " self.row = row\n",
165 | " self.spot_number = spot_number\n",
166 | " self.spot_size = spot_size\n",
167 | " self.vehicle_size = vehicle_size\n",
168 | " self.vehicle = None\n",
169 | "\n",
170 | " def is_available(self):\n",
171 | " return True if self.vehicle is None else False\n",
172 | "\n",
173 | " def can_fit_vehicle(self, vehicle):\n",
174 | " if self.vehicle is not None:\n",
175 | " return False\n",
176 | " return vehicle.can_fit_in_spot(self)\n",
177 | "\n",
178 | " def park_vehicle(self, vehicle): # ...\n",
179 | " def remove_vehicle(self): # ..."
180 | ]
181 | }
182 | ],
183 | "metadata": {
184 | "kernelspec": {
185 | "display_name": "Python 3",
186 | "language": "python",
187 | "name": "python3"
188 | },
189 | "language_info": {
190 | "codemirror_mode": {
191 | "name": "ipython",
192 | "version": 3
193 | },
194 | "file_extension": ".py",
195 | "mimetype": "text/x-python",
196 | "name": "python",
197 | "nbconvert_exporter": "python",
198 | "pygments_lexer": "ipython3",
199 | "version": "3.4.3"
200 | }
201 | },
202 | "nbformat": 4,
203 | "nbformat_minor": 0
204 | }
205 |
--------------------------------------------------------------------------------
/solutions/object_oriented_design/parking_lot/parking_lot.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 | from enum import Enum
3 |
4 |
class VehicleSize(Enum):
    """Size categories used for vehicles, numbered from 0 in order."""

    MOTORCYCLE, COMPACT, LARGE = range(3)
10 |
11 |
class Vehicle(metaclass=ABCMeta):
    """Abstract base class for anything that can be parked.

    Args:
        vehicle_size: Size category of the vehicle.
        license_plate: License plate of the vehicle.
        spot_size: Stored as ``self.spot_size`` (presumably the number of
            spots the vehicle occupies -- confirm against Level).
    """

    def __init__(self, vehicle_size, license_plate, spot_size):
        self.vehicle_size = vehicle_size
        self.license_plate = license_plate
        # Previously a bare ``self.spot_size`` expression, which raised
        # AttributeError and never stored the argument.
        self.spot_size = spot_size
        self.spots_taken = []

    def clear_spots(self):
        """Ask every taken spot to remove this vehicle, then forget them."""
        for spot in self.spots_taken:
            spot.remove_vehicle(self)
        self.spots_taken = []

    def take_spot(self, spot):
        """Record ``spot`` as occupied by this vehicle."""
        self.spots_taken.append(spot)

    @abstractmethod
    def can_fit_in_spot(self, spot):
        """Return whether this vehicle fits in ``spot``."""
        pass
31 |
32 |
class Motorcycle(Vehicle):
    """Vehicle of size MOTORCYCLE with a spot size of 1."""

    def __init__(self, license_plate):
        super().__init__(VehicleSize.MOTORCYCLE, license_plate, spot_size=1)

    def can_fit_in_spot(self, spot):
        """Always report a fit, whatever ``spot`` is."""
        return True
40 |
41 |
class Car(Vehicle):
    """Vehicle of size COMPACT with a spot size of 1."""

    def __init__(self, license_plate):
        super().__init__(VehicleSize.COMPACT, license_plate, spot_size=1)

    def can_fit_in_spot(self, spot):
        """Return True when ``spot.size`` is LARGE or COMPACT."""
        # NOTE(review): ParkingSpot in this file defines spot_size and
        # vehicle_size but no ``size`` attribute -- confirm which is meant.
        fits_large = spot.size == VehicleSize.LARGE
        return fits_large or spot.size == VehicleSize.COMPACT
49 |
50 |
class Bus(Vehicle):
    """Vehicle of size LARGE with a spot size of 5."""

    def __init__(self, license_plate):
        super().__init__(VehicleSize.LARGE, license_plate, spot_size=5)

    def can_fit_in_spot(self, spot):
        """Return True only when ``spot.size`` is LARGE."""
        # NOTE(review): ParkingSpot in this file defines spot_size and
        # vehicle_size but no ``size`` attribute -- confirm which is meant.
        is_large = spot.size == VehicleSize.LARGE
        return is_large
58 |
59 |
class ParkingLot(object):
    """A parking lot made of levels that are tried in order."""

    def __init__(self, num_levels):
        self.num_levels = num_levels
        # List of Levels; starts empty and is not filled from num_levels here.
        self.levels = list()

    def park_vehicle(self, vehicle):
        """Return True as soon as one level parks ``vehicle``, else False."""
        return any(level.park_vehicle(vehicle) for level in self.levels)
71 |
72 |
class Level(object):
    """One floor of the parking lot and its spots."""

    SPOTS_PER_ROW = 10

    def __init__(self, floor, total_spots):
        self.floor = floor
        self.num_spots = total_spots
        self.available_spots = 0
        # List of ParkingSpots
        self.spots = list()

    def spot_freed(self):
        """Count one more available spot."""
        self.available_spots = self.available_spots + 1

    def park_vehicle(self, vehicle):
        """Park ``vehicle`` in the spot found for it.

        Returns the spot, or None when no spot was found.
        """
        found = self._find_available_spot(vehicle)
        if found is not None:
            found.park_vehicle(vehicle)
        return found

    def _find_available_spot(self, vehicle):
        """Find an available spot where vehicle can fit, or return None.

        Placeholder: currently always returns None.
        """
        return None

    def _park_starting_at_spot(self, spot, vehicle):
        """Occupy starting at spot.spot_number to vehicle.spot_size.

        Placeholder: currently does nothing.
        """
        return None
101 |
102 |
class ParkingSpot(object):
    """A single parking spot on a level.

    Args:
        level: The level the spot belongs to.
        row: Row of the spot.
        spot_number: Number of the spot.
        spot_size: Size of the spot.
        vehicle_size: Vehicle size associated with the spot.
    """

    def __init__(self, level, row, spot_number, spot_size, vehicle_size):
        self.level = level
        self.row = row
        self.spot_number = spot_number
        self.spot_size = spot_size
        self.vehicle_size = vehicle_size
        self.vehicle = None

    def is_available(self):
        """Return True when no vehicle is stored on the spot."""
        return self.vehicle is None

    def can_fit_vehicle(self, vehicle):
        """Return False when occupied, else ``vehicle.can_fit_in_spot(self)``."""
        if self.vehicle is not None:
            return False
        return vehicle.can_fit_in_spot(self)

    def park_vehicle(self, vehicle):
        """Placeholder for parking ``vehicle`` here; currently does nothing."""
        pass

    def remove_vehicle(self, vehicle=None):
        """Placeholder for freeing the spot; currently does nothing.

        ``vehicle`` is optional and unused.  It is accepted because
        ``Vehicle.clear_spots`` calls ``spot.remove_vehicle(self)``, which
        raised TypeError against the previous zero-argument signature.
        """
        pass
126 |
--------------------------------------------------------------------------------
/solutions/system_design/mint/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/mint/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/mint/mint.graffle
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/mint/mint.png
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/mint/mint_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/mint/mint_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from mrjob.job import MRJob
4 |
5 |
class SpendingByCategory(MRJob):
    """MapReduce job that sums spending per (year-month, category) key.

    Only log lines whose period equals ``current_year_month()`` are counted.
    """

    def __init__(self, categorizer):
        self.categorizer = categorizer
        ...

    def current_year_month(self):
        """Return the current year and month."""
        ...

    def extract_year_month(self, timestamp):
        """Return the year and month portions of the timestamp."""
        ...

    def handle_budget_notifications(self, key, total):
        """Call notification API if nearing or exceeded budget."""
        ...

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        Each line must hold exactly three tab-separated fields: timestamp,
        category and amount.

        Emit key value pairs of the form:

        (2016-01, shopping), 25
        (2016-01, shopping), 100
        (2016-01, gas), 50
        """
        timestamp, category, amount = line.split('\t')
        period = self.extract_year_month(timestamp)
        if period == self.current_year_month():
            # split() yields strings; convert so the reducer can sum them.
            yield (period, category), float(amount)

    def reducer(self, key, values):
        """Sum values for each key.

        (2016-01, shopping), 125
        (2016-01, gas), 50
        """
        # Sum once and reuse the result: ``values`` may be a one-shot
        # iterator, so summing it a second time would yield 0.
        total = sum(values)
        self.handle_budget_notifications(key, total)
        yield key, total

    def steps(self):
        """Run the map and reduce steps."""
        # NOTE(review): newer mrjob releases build steps with
        # mrjob.step.MRStep instead of self.mr -- confirm the installed
        # version still provides mr().
        return [
            self.mr(mapper=self.mapper,
                    reducer=self.reducer)
        ]
54 |
55 |
# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    SpendingByCategory.run()
58 |
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from enum import Enum
4 |
5 |
class DefaultCategories(Enum):
    """Built-in spending categories, numbered from 0 in order."""

    HOUSING, FOOD, GAS, SHOPPING = range(4)
    # ...
13 |
14 |
# Default mapping from seller name to category.
seller_category_map = {
    'Exxon': DefaultCategories.GAS,
    'Target': DefaultCategories.SHOPPING,
}
18 |
19 |
class Categorizer(object):
    """Map a transaction's seller to a spending category.

    Args:
        seller_category_map: Mapping of seller -> category for known
            sellers.  It is updated in place when an override is applied.
        seller_category_overrides_map: Mapping of seller -> object exposing
            ``peek_min()`` (presumably a min-heap of manual overrides --
            confirm against the caller).
    """

    def __init__(self, seller_category_map, seller_category_overrides_map):
        self.seller_category_map = seller_category_map
        self.seller_category_overrides_map = seller_category_overrides_map

    def categorize(self, transaction):
        """Return the category for ``transaction.seller``, or None if unknown.

        A seller found only in the overrides map is assigned the category
        returned by that entry's ``peek_min()``, and the result is stored in
        this instance's ``seller_category_map`` for later lookups.
        """
        seller = transaction.seller
        if seller in self.seller_category_map:
            return self.seller_category_map[seller]
        if seller in self.seller_category_overrides_map:
            # Use the instance's own maps: the previous code wrote to the
            # module-level seller_category_map and read a nonexistent
            # ``manual_overrides`` attribute.
            self.seller_category_map[seller] = \
                self.seller_category_overrides_map[seller].peek_min()
            return self.seller_category_map[seller]
        return None
34 |
35 |
class Transaction(object):
    """Plain record for one transaction: timestamp, seller and amount."""

    def __init__(self, timestamp, seller, amount):
        self.timestamp, self.seller, self.amount = timestamp, seller, amount
42 |
43 |
class Budget(object):
    """Per-category budget amounts backed by the mapping passed in.

    The mapping is stored by reference, not copied, so overrides are
    visible through the original object as well.
    """

    def __init__(self, template_categories_to_budget_map):
        self.categories_to_budget_map = template_categories_to_budget_map

    def override_category_budget(self, category, amount):
        """Set the budget for ``category`` to ``amount``."""
        budgets = self.categories_to_budget_map
        budgets[category] = amount
51 |
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 设计 Pastebin.com (或者 Bit.ly)
2 |
3 | **注意: 为了避免重复,当前文档会直接链接到[系统设计主题](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)的相关区域,请参考链接内容以获得综合的讨论点、权衡和替代方案。**
4 |
5 | **设计 Bit.ly** - 是一个类似的问题,区别是 pastebin 需要存储的是 paste 的内容,而不是原始的未短化的 url。
6 |
7 | ## 第一步:概述用例和约束
8 |
9 | > 收集这个问题的需求和范畴。
10 | > 问相关问题来明确用例和约束。
11 | > 讨论一些假设。
12 |
13 | 因为没有面试官来明确这些问题,所以我们自己将定义一些用例和约束。
14 |
15 | ### 用例
16 |
17 | #### 我们将问题的范畴限定在如下用例
18 |
19 | * **用户** 输入一段文本,然后得到一个随机生成的链接
20 | * 过期设置
21 | * 默认的设置是不会过期的
22 | * 可以选择设置一个过期的时间
23 | * **用户** 输入一个 paste 的 url 后,可以看到它存储的内容
24 | * **用户** 是匿名的
25 | * **Service** 跟踪页面分析
26 | * 一个月的访问统计
27 | * **Service** 删除过期的 pastes
28 | * **Service** 需要高可用
29 |
30 | #### 超出范畴的用例
31 |
32 | * **用户** 可以注册一个账户
33 | * **用户** 通过验证邮箱
34 | * **用户** 可以用注册的账户登录
35 | * **用户** 可以编辑文档
36 | * **用户** 可以设置可见性
37 | * **用户** 可以设置短链接
38 |
39 | ### 约束和假设
40 |
41 | #### 状态假设
42 |
43 | * 访问流量不是均匀分布的
44 | * 打开一个短链接应该是很快的
45 | * pastes 只能是文本
46 | * 页面访问分析数据可以不用实时
47 | * 一千万的用户量
48 | * 每个月一千万的 paste 写入量
49 | * 每个月一亿的 paste 读取量
50 | * 读写比例在 10:1
51 |
52 | #### 计算使用
53 |
54 | **向面试官确认你是否需要粗略估算一下使用情况。**
55 |
56 | * 每个 paste 的大小
57 | * 每一个 paste 1 KB
58 | * `shortlink` - 7 bytes
59 | * `expiration_length_in_minutes` - 4 bytes
60 | * `created_at` - 5 bytes
61 | * `paste_path` - 255 bytes
62 | * 总共 = ~1.27 KB
63 | * 每个月新的 paste 内容在 12.7GB
64 | * (1.27 * 10000000)KB / 月的 paste
65 | * 三年内将近 450GB 的新 paste 内容
66 | * 三年内 3.6 亿短链接
67 | * 假设大部分都是新的 paste,而不是需要更新已存在的 paste
68 | * 平均 4paste/s 的写入速度
69 | * 平均 40paste/s 的读取速度
70 |
71 | 简单的转换指南:
72 |
73 | * 每个月约有 250 万秒
74 | * 1 req/s = 250 万 req/月
75 | * 40 req/s = 1 亿 req/月
76 | * 400 req/s = 10 亿 req/月
77 |
78 | ## 第二步:创建一个高层次设计
79 |
80 | > 概述一个包括所有重要的组件的高层次设计
81 |
82 | 
83 |
84 | ## 第三步:设计核心组件
85 |
86 | > 深入每一个核心组件的细节
87 |
88 | ### 用例:用户输入一段文本,然后得到一个随机生成的链接
89 |
90 | 我们可以用一个 [关系型数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#关系型数据库管理系统rdbms)作为一个大的哈希表,用来把生成的 url 映射到一个包含 paste 文件的文件服务器和路径上。
91 |
92 | 为了避免托管一个文件服务器,我们可以用一个托管的**对象存储**,比如 Amazon 的 S3 或者[NoSQL 文档类型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)。
93 |
94 | 作为一个大的哈希表的关系型数据库的替代方案,我们可以用[NoSQL 键值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)。我们需要讨论[选择 SQL 或 NoSQL 之间的权衡](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)。下面的讨论是使用关系型数据库方法。
95 |
96 | * **客户端** 发送一个创建 paste 的请求到作为一个[反向代理](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)启动的 **Web 服务器**。
97 | * **Web 服务器** 转发请求给 **写接口** 服务器
98 | * **写接口** 服务器执行如下操作:
99 | * 生成一个唯一的 url
100 | * 检查这个 url 在 **SQL 数据库** 里面是否是唯一的
101 | * 如果这个 url 不是唯一的,生成另外一个 url
102 | * 如果我们支持自定义 url,我们可以使用用户提供的 url(也需要检查是否重复)
103 | * 把生成的 url 存储到 **SQL 数据库** 的 `pastes` 表里面
104 | * 存储 paste 的内容数据到 **对象存储** 里面
105 | * 返回生成的 url
106 |
107 | **向面试官确认你需要写多少代码**
108 |
109 | `pastes` 表可以有如下结构:
110 |
111 | ```sql
112 | shortlink char(7) NOT NULL
113 | expiration_length_in_minutes int NOT NULL
114 | created_at datetime NOT NULL
115 | paste_path varchar(255) NOT NULL
116 | PRIMARY KEY(shortlink)
117 | ```
118 |
119 | 我们将在 `shortlink` 字段和 `created_at` 字段上创建一个[数据库索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#使用正确的索引),用来提高查询的速度(避免因为扫描全表导致的长时间查询)并将数据保存在内存中,从内存里面顺序读取 1MB 的数据需要大概 250 微秒,而从 SSD 上读取则需要花费 4 倍的时间,从硬盘上则需要花费 80 倍的时间。 1
120 |
121 | 为了生成唯一的 url,我们可以:
122 |
123 | * 使用 [**MD5**](https://en.wikipedia.org/wiki/MD5) 来哈希用户的 IP 地址 + 时间戳
124 | * MD5 是一个普遍用来生成一个 128-bit 长度的哈希值的一种哈希方法
125 | * MD5 是均匀分布的
126 | * 或者我们也可以用 MD5 哈希一个随机生成的数据
127 | * 用 [**Base 62**](https://www.kerstner.at/2012/07/shortening-strings-using-base-62-encoding/) 编码 MD5 哈希值
128 | * 对于 urls,使用 Base 62 编码 `[a-zA-Z0-9]` 是比较合适的
129 | * 对于每一个原始输入只会有一个 hash 结果,Base 62 是确定的(不涉及随机性)
130 | * Base 64 是另外一个流行的编码方案,但是对于 urls,会因为额外的 `+` 和 `-` 字符串而产生一些问题
131 | * 以下 [Base 62 伪代码](http://stackoverflow.com/questions/742013/how-to-code-a-url-shortener) 执行的时间复杂度是 O(k),k 是数字的数量 = 7:
132 |
133 | ```python
134 | def base_encode(num, base=62):
135 | digits = []
136 | while num > 0
137 | remainder = modulo(num, base)
138 | digits.push(remainder)
139 | num = divide(num, base)
140 | digits = digits.reverse
141 | ```
142 |
143 | * 取输出的前 7 个字符,结果会有 62^7 个可能的值,应该足以满足在 3 年内处理 3.6 亿个短链接的约束:
144 |
145 | ```python
146 | url = base_encode(md5(ip_address+timestamp))[:URL_LENGTH]
147 | ```
148 |
149 | 我们将会用一个公开的 [**REST 风格接口**](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest):
150 |
151 | ```shell
152 | $ curl -X POST --data '{"expiration_length_in_minutes":"60", "paste_contents":"Hello World!"}' https://pastebin.com/api/v1/paste
153 | ```
154 |
155 | Response:
156 |
157 | ```json
158 | {
159 | "shortlink": "foobar"
160 | }
161 | ```
162 |
163 | 用于内部通信,我们可以用 [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)。
164 |
165 | ### 用例:用户输入一个 paste 的 url 后可以看到它存储的内容
166 |
167 | * **客户端** 发送一个获取 paste 请求到 **Web Server**
168 | * **Web Server** 转发请求给 **读取接口** 服务器
169 | * **读取接口** 服务器执行如下操作:
170 | * 在 **SQL 数据库** 检查这个生成的 url
171 | * 如果这个 url 在 **SQL 数据库** 里面,则从 **对象存储** 获取这个 paste 的内容
172 | * 否则,返回一个错误页面给用户
173 |
174 | REST API:
175 |
176 | ```shell
177 | curl https://pastebin.com/api/v1/paste?shortlink=foobar
178 | ```
179 |
180 | Response:
181 |
182 | ```json
183 | {
184 | "paste_contents": "Hello World",
185 | "created_at": "YYYY-MM-DD HH:MM:SS",
186 | "expiration_length_in_minutes": "60"
187 | }
188 | ```
189 |
190 | ### 用例: 服务跟踪分析页面
191 |
192 | 因为实时分析不是必须的,所以我们可以简单地对 **Web Server** 的日志运行 **MapReduce**,用来生成点击次数。
193 |
194 | ```python
class HitCounts(MRJob):
    """MapReduce job that counts hits per (year-month, url) key."""

    def extract_url(self, line):
        """Extract the generated url from the log line."""
        ...

    def extract_year_month(self, line):
        """Return the year and month portions of the timestamp."""
        ...

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        Emit key value pairs of the form:

        (2016-01, url0), 1
        (2016-01, url0), 1
        (2016-01, url1), 1
        """
        hit_url = self.extract_url(line)
        hit_period = self.extract_year_month(line)
        key = (hit_period, hit_url)
        yield key, 1

    def reducer(self, key, values):
        """Sum values for each key.

        (2016-01, url0), 2
        (2016-01, url1), 1
        """
        hit_total = sum(values)
        yield key, hit_total
225 | ```
226 |
227 | ### 用例: 服务删除过期的 pastes
228 |
229 | 为了删除过期的 pastes,我们可以直接搜索 **SQL 数据库** 中所有的过期时间比当前时间更早的记录,
230 | 所有过期的记录将从这张表里面删除(或者将其标记为过期)。
231 |
232 | ## 第四步:扩展这个设计
233 |
234 | > 给定约束条件,识别和解决瓶颈。
235 |
236 | 
237 |
238 | **重要提示: 不要简单的从最初的设计直接跳到最终的设计**
239 |
240 | 说明您将迭代地执行这样的操作:1)**Benchmark/Load 测试**,2)**Profile** 出瓶颈,3)在评估替代方案和权衡时解决瓶颈,4)重复前面,可以参考[在 AWS 上设计一个可以支持百万用户的系统](../scaling_aws/README.md)这个用来解决如何迭代地扩展初始设计的例子。
241 |
242 | 重要的是讨论在初始设计中可能遇到的瓶颈,以及如何解决每个瓶颈。比如,在多个 **Web 服务器** 上添加 **负载平衡器** 可以解决哪些问题? **CDN** 解决哪些问题?**Master-Slave Replicas** 解决哪些问题? 替代方案是什么和怎么对每一个替代方案进行权衡比较?
243 |
244 | 我们将介绍一些组件来完成设计,并解决可伸缩性问题。为了减少杂乱,图中没有画出内部的负载均衡器。
245 |
246 | **为了避免重复的讨论**, 参考以下[系统设计主题](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)获取主要讨论要点、权衡和替代方案:
247 |
248 | * [DNS](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#域名系统)
249 | * [CDN](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#内容分发网络cdn)
250 | * [负载均衡器](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#负载均衡器)
251 | * [水平扩展](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#水平扩展)
252 | * [反向代理(web 服务器)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)
253 | * [应用层](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用层)
254 | * [缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存)
255 | * [关系型数据库管理系统 (RDBMS)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#关系型数据库管理系统rdbms)
256 | * [SQL write master-slave failover](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#故障切换)
257 | * [主从复制](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#主从复制)
258 | * [一致性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#一致性模式)
259 | * [可用性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#可用性模式)
260 |
261 | **分析存储数据库** 可以用比如 Amazon Redshift 或者 Google BigQuery 这样的数据仓库解决方案。
262 |
263 | 一个像 Amazon S3 这样的 **对象存储**,可以轻松处理每月 12.7 GB 的新内容约束。
264 |
265 | 要处理 *平均* 每秒 40 次读请求(峰值更高),其中热点内容的流量应该由 **内存缓存** 处理,而不是数据库。**内存缓存** 对于处理分布不均匀的流量和流量峰值也很有用。只要副本没有陷入复制写的泥潭,**SQL Read Replicas** 应该能够处理缓存未命中的请求。
266 |
267 | 对于单个 **SQL Write Master-Slave**,*平均* 每秒 4 次 paste 写入(峰值更高)应该是可以做到的。否则,我们需要使用额外的 SQL 扩展模式:
268 |
269 | * [联合](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#联合)
270 | * [分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片)
271 | * [非规范化](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#非规范化)
272 | * [SQL 调优](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-调优)
273 |
274 | 我们还应该考虑将一些数据移动到 **NoSQL 数据库**。
275 |
276 | ## 额外的话题
277 |
278 | > 是否更深入探讨额外主题,取决于问题的范围和面试剩余的时间。
279 |
280 | ### NoSQL
281 |
282 | * [键值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)
283 | * [文档存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)
284 | * [列型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#列型存储)
285 | * [图数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#图数据库)
286 | * [sql 还是 nosql](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)
287 |
288 | ### 缓存
289 |
290 | * 在哪缓存
291 | * [客户端缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#客户端缓存)
292 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#cdn-缓存)
293 | * [Web 服务器缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#web-服务器缓存)
294 | * [数据库缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库缓存)
295 | * [应用缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用缓存)
296 | * 缓存什么
297 | * [数据库查询级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库查询级别的缓存)
298 | * [对象级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#对象级别的缓存)
299 | * 何时更新缓存
300 | * [缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)
301 | * [直写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#直写模式)
302 | * [回写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#回写模式)
303 | * [刷新](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#刷新)
304 |
305 | ### 异步和微服务
306 |
307 | * [消息队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#消息队列)
308 | * [任务队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#任务队列)
309 | * [背压](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#背压)
310 | * [微服务](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#微服务)
311 |
312 | ### 通信
313 |
314 | * 讨论权衡:
315 | * 跟客户端之间的外部通信 - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest)
316 | * 内部通信 - [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
317 | * [服务发现](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#服务发现)
318 |
319 | ### 安全
320 |
321 | 参考[安全](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#安全)。
322 |
323 | ### 延迟数字
324 |
325 | 见[每个程序员都应该知道的延迟数](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)。
326 |
327 | ### 持续进行
328 |
329 | * 继续对系统进行基准测试和监控,以在瓶颈出现时解决它们
330 | * 扩展是一个迭代的过程
331 |
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/pastebin/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/pastebin/pastebin.graffle
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/pastebin/pastebin.png
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from mrjob.job import MRJob
4 |
5 |
class HitCounts(MRJob):
    """MapReduce job that counts hits per ``(year-month, url)`` pair.

    Each input line is a web-server log entry. The mapper emits one
    record per line and the reducer sums the records sharing a key.
    """

    def extract_url(self, line):
        """Extract the generated url from the log line."""
        # Placeholder: the parsing is left unimplemented, so this returns None.
        pass

    def extract_year_month(self, line):
        """Return the year and month portions of the timestamp."""
        # Placeholder: the parsing is left unimplemented, so this returns None.
        pass

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        The input key is ignored. Emit key value pairs of the form:

        (2016-01, url0), 1
        (2016-01, url0), 1
        (2016-01, url1), 1
        """
        url = self.extract_url(line)
        period = self.extract_year_month(line)
        yield (period, url), 1

    def reducer(self, key, values):
        """Sum values for each key.

        (2016-01, url0), 2
        (2016-01, url1), 1
        """
        yield key, sum(values)

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        # MRJob.mr() is deprecated (and gone from later mrjob releases);
        # MRStep is the supported way to describe a step.  Imported here so
        # the change is self-contained within the class.
        from mrjob.step import MRStep

        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]
43 |
44 |
# Invoke HitCounts.run() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    HitCounts.run()
47 |
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/pastebin/pastebin_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/pastebin/pastebin_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 设计一个键-值缓存来存储最近 web 服务查询的结果
2 |
3 | **注意:这个文档中的链接会直接指向[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)中的有关部分,以避免重复的内容。你可以参考链接的相关内容,来了解其总的要点、方案的权衡取舍以及可选的替代方案。**
4 |
5 | ## 第一步:简述用例与约束条件
6 |
7 | > 搜集需求与问题的范围。
8 | > 提出问题来明确用例与约束条件。
9 | > 讨论假设。
10 |
11 | 我们将在没有面试官明确说明问题的情况下,自己定义一些用例以及限制条件。
12 |
13 | ### 用例
14 |
15 | #### 我们将把问题限定在仅处理以下用例的范围中
16 |
17 | * **用户**发送一个搜索请求,命中缓存
18 | * **用户**发送一个搜索请求,未命中缓存
19 | * **服务**有着高可用性
20 |
21 | ### 限制条件与假设
22 |
23 | #### 提出假设
24 |
25 | * 网络流量不是均匀分布的
26 | * 经常被查询的内容应该一直存于缓存中
27 | * 需要确定如何规定缓存过期、缓存刷新规则
28 | * 缓存提供的服务查询速度要快
29 | * 机器间延迟较低
30 | * 缓存有内存限制
31 | * 需要决定缓存什么、移除什么
32 | * 需要缓存百万级的查询
33 | * 1000 万用户
34 | * 每个月 100 亿次查询
35 |
36 | #### 计算用量
37 |
38 | **如果你需要进行粗略的用量计算,请向你的面试官说明。**
39 |
40 | * 缓存存储的是键值对有序表,键为 `query`(查询),值为 `results`(结果)。
41 | * `query` - 50 字节
42 | * `title` - 20 字节
43 | * `snippet` - 200 字节
44 | * 总计:270 字节
45 | * 假如 100 亿次查询都是不同的,且全部需要存储,那么每个月需要 2.7 TB 的缓存空间
46 | * 单次查询 270 字节 * 每月查询 100 亿次
47 | * 假设内存大小有限制,需要决定如何制定缓存过期规则
48 | * 每秒 4,000 次请求
49 |
50 | 便利换算指南:
51 |
52 | * 每个月有 250 万秒
53 | * 每秒一个请求 = 每个月 250 万次请求
54 | * 每秒 40 个请求 = 每个月 1 亿次请求
55 | * 每秒 400 个请求 = 每个月 10 亿次请求
56 |
57 | ## 第二步:概要设计
58 |
59 | > 列出所有重要组件以规划概要设计。
60 |
61 | 
62 |
63 | ## 第三步:设计核心组件
64 |
65 | > 深入每个核心组件的细节。
66 |
67 | ### 用例:用户发送了一次请求,命中了缓存
68 |
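69 | 常用的查询可以由例如 Redis 或者 Memcached 之类的**内存缓存**提供支持,以减少数据读取延迟,并且避免**反向索引服务**以及**文档服务**的过载。从内存读取 1 MB 连续数据大约要花 250 微秒,而从 SSD 读取同样大小的数据要花费 4 倍的时间,从机械硬盘读取需要花费 80 倍以上的时间。[1](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)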
70 |
71 | 由于缓存容量有限,我们将使用 LRU(近期最少使用算法)来控制缓存的过期。
72 |
73 | * **客户端**向运行[反向代理](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)的 **Web 服务器**发送一个请求
74 | * 这个 **Web 服务器**将请求转发给**查询 API** 服务
75 | * **查询 API** 服务将会做这些事情:
76 | * 分析查询
77 | * 移除多余的内容
78 | * 将文本分割成词组
79 | * 修正拼写错误
80 | * 规范化字母的大小写
81 | * 将查询转换为布尔运算
82 | * 检测**内存缓存**是否有匹配查询的内容
83 | * 如果命中**内存缓存**,**内存缓存**将会做以下事情:
84 | * 将缓存入口的位置指向 LRU 链表的头部
85 | * 返回缓存内容
86 | * 否则,**查询 API** 将会做以下事情:
87 | * 使用**反向索引服务**来查找匹配查询的文档
88 | * **反向索引服务**对匹配到的结果进行排名,然后返回最符合的结果
89 | * 使用**文档服务**返回文章标题与片段
90 | * 更新**内存缓存**,存入内容,将**内存缓存**入口位置指向 LRU 链表的头部
91 |
92 | #### 缓存的实现
93 |
94 | 缓存可以使用双向链表实现:新元素将会在头结点加入,过期的元素将会在尾节点被删除。我们使用哈希表以便能够快速查找每个链表节点。
95 |
96 | **向你的面试官告知你准备写多少代码**。
97 |
98 | 实现**查询 API 服务**:
99 |
100 | ```python
class QueryApi(object):
    """Serve search queries, consulting the memory cache before the index.

    Args:
        memory_cache: object exposing ``get(query)`` and
            ``set(results, query)``.
        reverse_index_service: object exposing ``process_search(query)``.
    """

    def __init__(self, memory_cache, reverse_index_service):
        self.memory_cache = memory_cache
        self.reverse_index_service = reverse_index_service

    def parse_query(self, query):
        """Remove markup, break text into terms, deal with typos,
        normalize capitalization, convert to use boolean operations.
        """
        ...

    def process_query(self, query):
        """Return the results for ``query``, filling the cache on a miss."""
        query = self.parse_query(query)
        results = self.memory_cache.get(query)
        if results is None:
            results = self.reverse_index_service.process_search(query)
            # Cache.set is declared as set(results, query); passing the
            # arguments by keyword keeps the key and the value from being
            # swapped.
            self.memory_cache.set(results=results, query=query)
        return results
120 | ```
121 |
122 | 实现**节点**:
123 |
124 | ```python
class Node(object):
    """One cache entry, usable as a node of a doubly-linked list.

    Attributes are the ``query`` key, its ``results`` value, and the
    ``prev``/``next`` links, which start out as ``None`` (unlinked).
    """

    def __init__(self, query, results):
        self.query = query
        self.results = results
        # A doubly-linked list needs both neighbour links on every node.
        self.prev = None
        self.next = None
130 | ```
131 |
132 | 实现**链表**:
133 |
134 | ```python
class LinkedList(object):
    """Doubly-linked list ordered from most to least recently used.

    ``head`` is the most recently used node and ``tail`` the least recently
    used one; both are ``None`` while the list is empty.  Nodes are chained
    through ``prev``/``next`` attributes that this class assigns itself.
    """

    def __init__(self):
        self.head = None
        self.tail = None

    def _unlink(self, node):
        """Detach ``node`` from its neighbours, fixing up head and tail."""
        if node.prev is not None:
            node.prev.next = node.next
        else:
            self.head = node.next
        if node.next is not None:
            node.next.prev = node.prev
        else:
            self.tail = node.prev
        node.prev = None
        node.next = None

    def move_to_front(self, node):
        """Make ``node``, which is already in the list, the new head."""
        if node is self.head:
            return
        self._unlink(node)
        self.append_to_front(node)

    def append_to_front(self, node):
        """Insert ``node``, which is not yet in the list, as the new head."""
        node.prev = None
        node.next = self.head
        if self.head is not None:
            self.head.prev = node
        else:
            # The list was empty, so the node is also the tail.
            self.tail = node
        self.head = node

    def remove_from_tail(self):
        """Drop the least recently used node; do nothing on an empty list."""
        if self.tail is not None:
            self._unlink(self.tail)
149 | ```
150 |
151 | 实现**缓存**:
152 |
153 | ```python
class Cache(object):
    """Fixed-capacity LRU cache mapping a query to its results.

    A dict gives direct access to each entry's node, while a linked list
    keeps the nodes ordered by recency so the oldest one can be evicted.

    Args:
        MAX_SIZE: maximum number of entries kept at once.
    """

    def __init__(self, MAX_SIZE):
        self.MAX_SIZE = MAX_SIZE
        self.size = 0
        self.lookup = {}  # key: query, value: node
        self.linked_list = LinkedList()

    def get(self, query):
        """Get the stored query result from the cache.

        Accessing a node updates its position to the front of the LRU list.
        Returns ``None`` when the query is not cached.
        """
        # dict.get() so that a miss yields None instead of raising KeyError.
        node = self.lookup.get(query)
        if node is None:
            return None
        self.linked_list.move_to_front(node)
        return node.results

    def set(self, results, query):
        """Set the result for the given query key in the cache.

        Note the parameter order: ``results`` first, then ``query``.

        When updating an entry, updates its position to the front of the
        LRU list.  If the entry is new and the cache is at capacity, removes
        the oldest entry before the new entry is added.
        """
        node = self.lookup.get(query)
        if node is not None:
            # Key exists in cache, update the value
            node.results = results
            self.linked_list.move_to_front(node)
            return
        # Key does not exist in cache
        if self.size == self.MAX_SIZE:
            # Remove the oldest entry from the linked list and lookup
            self.lookup.pop(self.linked_list.tail.query, None)
            self.linked_list.remove_from_tail()
        else:
            self.size += 1
        # Add the new key and value
        new_node = Node(query, results)
        self.linked_list.append_to_front(new_node)
        self.lookup[query] = new_node
197 | ```
198 |
199 | #### 何时更新缓存
200 |
201 | 缓存将会在以下几种情况更新:
202 |
203 | * 页面内容发生变化
204 | * 页面被移除或者加入了新页面
205 | * 页面的权值发生变动
206 |
207 | 解决这些问题的最直接的方法,就是为缓存记录设置一个它在被更新前能留在缓存中的最长时间,这个时间简称为存活时间(TTL)。
208 |
209 | 参考 [「何时更新缓存」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#何时更新缓存)来了解其权衡取舍及替代方案。以上方法在[缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)一章中详细地进行了描述。
210 |
211 | ## 第四步:架构扩展
212 |
213 | > 根据限制条件,找到并解决瓶颈。
214 |
215 | 
216 |
217 | **重要提示:不要从最初设计直接跳到最终设计中!**
218 |
219 | 现在你要 1) **基准测试、负载测试**。2) **分析、描述**性能瓶颈。3) 在解决瓶颈问题的同时,评估替代方案、权衡利弊。4) 重复以上步骤。请阅读[「设计一个系统,并将其扩大到为数以百万计的 AWS 用户服务」](../scaling_aws/README.md) 来了解如何逐步扩大初始设计。
220 |
221 | 讨论初始设计可能遇到的瓶颈及相关解决方案是很重要的。例如加上一个配置多台 **Web 服务器**的**负载均衡器**是否能够解决问题?**CDN**呢?**主从复制**呢?它们各自的替代方案和需要**权衡**的利弊又有什么呢?
222 |
223 | 我们将会介绍一些组件来完成设计,并解决架构扩张问题。内置的负载均衡器将不做讨论以节省篇幅。
224 |
225 | **为了避免重复讨论**,请参考[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)相关部分来了解其要点、方案的权衡取舍以及可选的替代方案。
226 |
227 | * [DNS](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#域名系统)
228 | * [负载均衡器](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#负载均衡器)
229 | * [水平拓展](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#水平扩展)
230 | * [反向代理(web 服务器)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)
231 | * [API 服务(应用层)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用层)
232 | * [缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存)
233 | * [一致性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#一致性模式)
234 | * [可用性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#可用性模式)
235 |
236 | ### 将内存缓存扩大到多台机器
237 |
238 | 为了解决庞大的请求负载以及巨大的内存需求,我们将要对架构进行水平拓展。如何在我们的**内存缓存**集群中存储数据呢?我们有以下三个主要可选方案:
239 |
240 | * **缓存集群中的每一台机器都有自己的缓存** - 简单,但是它会降低缓存命中率。
241 | * **缓存集群中的每一台机器都有缓存的拷贝** - 简单,但是它的内存使用效率太低了。
242 | * **对缓存进行[分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片),分别部署在缓存集群中的所有机器中** - 更加复杂,但是它是最佳的选择。我们可以使用哈希,通过 `machine = hash(query)` 来确定哪台机器可能存有某个查询的缓存结果。当然我们也可以使用[一致性哈希](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#正在完善中)。
243 |
244 | ## 其它要点
245 |
246 | > 是否深入这些额外的主题,取决于你的问题范围和剩下的时间。
247 |
248 | ### SQL 扩展模式
249 |
250 | * [读取复制](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#主从复制)
251 | * [联合](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#联合)
252 | * [分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片)
253 | * [非规范化](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#非规范化)
254 | * [SQL 调优](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-调优)
255 |
256 | #### NoSQL
257 |
258 | * [键-值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)
259 | * [文档类型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)
260 | * [列型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#列型存储)
261 | * [图数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#图数据库)
262 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)
263 |
264 | ### 缓存
265 |
266 | * 在哪缓存
267 | * [客户端缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#客户端缓存)
268 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#cdn-缓存)
269 | * [Web 服务器缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#web-服务器缓存)
270 | * [数据库缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库缓存)
271 | * [应用缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用缓存)
272 | * 什么需要缓存
273 | * [数据库查询级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库查询级别的缓存)
274 | * [对象级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#对象级别的缓存)
275 | * 何时更新缓存
276 | * [缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)
277 | * [直写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#直写模式)
278 | * [回写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#回写模式)
279 | * [刷新](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#刷新)
280 |
281 | ### 异步与微服务
282 |
283 | * [消息队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#消息队列)
284 | * [任务队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#任务队列)
285 | * [背压](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#背压)
286 | * [微服务](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#微服务)
287 |
288 | ### 通信
289 |
290 | * 可权衡选择的方案:
291 | * 与客户端的外部通信 - [使用 REST 作为 HTTP API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest)
292 | * 服务器内部通信 - [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
293 | * [服务发现](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#服务发现)
294 |
295 | ### 安全性
296 |
297 | 请参阅[「安全」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#安全)一章。
298 |
299 | ### 延迟数值
300 |
301 | 请参阅[「每个程序员都应该知道的延迟数」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)。
302 |
303 | ### 持续探讨
304 |
305 | * 持续进行基准测试并监控你的系统,以便在瓶颈出现时及时解决。
306 | * 架构拓展是一个迭代的过程。
307 |
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/README.md:
--------------------------------------------------------------------------------
1 | # Design a key-value cache to save the results of the most recent web server queries
2 |
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 |
5 | ## Step 1: Outline use cases and constraints
6 |
7 | > Gather requirements and scope the problem.
8 | > Ask questions to clarify use cases and constraints.
9 | > Discuss assumptions.
10 |
11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
12 |
13 | ### Use cases
14 |
15 | #### We'll scope the problem to handle only the following use cases
16 |
17 | * **User** sends a search request resulting in a cache hit
18 | * **User** sends a search request resulting in a cache miss
19 | * **Service** has high availability
20 |
21 | ### Constraints and assumptions
22 |
23 | #### State assumptions
24 |
25 | * Traffic is not evenly distributed
26 | * Popular queries should almost always be in the cache
27 | * Need to determine how to expire/refresh
28 | * Serving from cache requires fast lookups
29 | * Low latency between machines
30 | * Limited memory in cache
31 | * Need to determine what to keep/remove
32 | * Need to cache millions of queries
33 | * 10 million users
34 | * 10 billion queries per month
35 |
36 | #### Calculate usage
37 |
38 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
39 |
40 | * Cache stores ordered list of key: query, value: results
41 | * `query` - 50 bytes
42 | * `title` - 20 bytes
43 | * `snippet` - 200 bytes
44 | * Total: 270 bytes
45 | * 2.7 TB of cache data per month if all 10 billion queries are unique and all are stored
46 | * 270 bytes per search * 10 billion searches per month
47 | * Assumptions state limited memory, need to determine how to expire contents
48 | * 4,000 requests per second
49 |
50 | Handy conversion guide:
51 |
52 | * 2.5 million seconds per month
53 | * 1 request per second = 2.5 million requests per month
54 | * 40 requests per second = 100 million requests per month
55 | * 400 requests per second = 1 billion requests per month
56 |
57 | ## Step 2: Create a high level design
58 |
59 | > Outline a high level design with all important components.
60 |
61 | 
62 |
63 | ## Step 3: Design core components
64 |
65 | > Dive into details for each core component.
66 |
67 | ### Use case: User sends a request resulting in a cache hit
68 |
69 | Popular queries can be served from a **Memory Cache** such as Redis or Memcached to reduce read latency and to avoid overloading the **Reverse Index Service** and **Document Service**. Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.[1](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know)
70 |
71 | Since the cache has limited capacity, we'll use a least recently used (LRU) approach to expire older entries.
72 |
73 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
74 | * The **Web Server** forwards the request to the **Query API** server
75 | * The **Query API** server does the following:
76 | * Parses the query
77 | * Removes markup
78 | * Breaks up the text into terms
79 | * Fixes typos
80 | * Normalizes capitalization
81 | * Converts the query to use boolean operations
82 | * Checks the **Memory Cache** for the content matching the query
83 | * If there's a hit in the **Memory Cache**, the **Memory Cache** does the following:
84 | * Updates the cached entry's position to the front of the LRU list
85 | * Returns the cached contents
86 | * Else, the **Query API** does the following:
87 | * Uses the **Reverse Index Service** to find documents matching the query
88 | * The **Reverse Index Service** ranks the matching results and returns the top ones
89 | * Uses the **Document Service** to return titles and snippets
90 | * Updates the **Memory Cache** with the contents, placing the entry at the front of the LRU list
91 |
92 | #### Cache implementation
93 |
94 | The cache can use a doubly-linked list: new items will be added to the head while items to expire will be removed from the tail. We'll use a hash table for fast lookups to each linked list node.
95 |
96 | **Clarify with your interviewer how much code you are expected to write**.
97 |
98 | **Query API Server** implementation:
99 |
100 | ```python
class QueryApi(object):
    """Serve search queries, consulting the memory cache before the index.

    Args:
        memory_cache: object exposing ``get(query)`` and
            ``set(results, query)``.
        reverse_index_service: object exposing ``process_search(query)``.
    """

    def __init__(self, memory_cache, reverse_index_service):
        self.memory_cache = memory_cache
        self.reverse_index_service = reverse_index_service

    def parse_query(self, query):
        """Remove markup, break text into terms, deal with typos,
        normalize capitalization, convert to use boolean operations.
        """
        ...

    def process_query(self, query):
        """Return the results for ``query``, filling the cache on a miss."""
        query = self.parse_query(query)
        results = self.memory_cache.get(query)
        if results is None:
            results = self.reverse_index_service.process_search(query)
            # Cache.set is declared as set(results, query); passing the
            # arguments by keyword keeps the key and the value from being
            # swapped.
            self.memory_cache.set(results=results, query=query)
        return results
120 | ```
121 |
122 | **Node** implementation:
123 |
124 | ```python
class Node(object):
    """One cache entry, usable as a node of a doubly-linked list.

    Attributes are the ``query`` key, its ``results`` value, and the
    ``prev``/``next`` links, which start out as ``None`` (unlinked).
    """

    def __init__(self, query, results):
        self.query = query
        self.results = results
        # A doubly-linked list needs both neighbour links on every node.
        self.prev = None
        self.next = None
130 | ```
131 |
132 | **LinkedList** implementation:
133 |
134 | ```python
class LinkedList(object):
    """Doubly-linked list ordered from most to least recently used.

    ``head`` is the most recently used node and ``tail`` the least recently
    used one; both are ``None`` while the list is empty.  Nodes are chained
    through ``prev``/``next`` attributes that this class assigns itself.
    """

    def __init__(self):
        self.head = None
        self.tail = None

    def _unlink(self, node):
        """Detach ``node`` from its neighbours, fixing up head and tail."""
        if node.prev is not None:
            node.prev.next = node.next
        else:
            self.head = node.next
        if node.next is not None:
            node.next.prev = node.prev
        else:
            self.tail = node.prev
        node.prev = None
        node.next = None

    def move_to_front(self, node):
        """Make ``node``, which is already in the list, the new head."""
        if node is self.head:
            return
        self._unlink(node)
        self.append_to_front(node)

    def append_to_front(self, node):
        """Insert ``node``, which is not yet in the list, as the new head."""
        node.prev = None
        node.next = self.head
        if self.head is not None:
            self.head.prev = node
        else:
            # The list was empty, so the node is also the tail.
            self.tail = node
        self.head = node

    def remove_from_tail(self):
        """Drop the least recently used node; do nothing on an empty list."""
        if self.tail is not None:
            self._unlink(self.tail)
149 | ```
150 |
151 | **Cache** implementation:
152 |
153 | ```python
class Cache(object):
    """Fixed-capacity LRU cache mapping a query to its results.

    A dict gives direct access to each entry's node, while a linked list
    keeps the nodes ordered by recency so the oldest one can be evicted.

    Args:
        MAX_SIZE: maximum number of entries kept at once.
    """

    def __init__(self, MAX_SIZE):
        self.MAX_SIZE = MAX_SIZE
        self.size = 0
        self.lookup = {}  # key: query, value: node
        self.linked_list = LinkedList()

    def get(self, query):
        """Get the stored query result from the cache.

        Accessing a node updates its position to the front of the LRU list.
        Returns ``None`` when the query is not cached.
        """
        # dict.get() so that a miss yields None instead of raising KeyError.
        node = self.lookup.get(query)
        if node is None:
            return None
        self.linked_list.move_to_front(node)
        return node.results

    def set(self, results, query):
        """Set the result for the given query key in the cache.

        Note the parameter order: ``results`` first, then ``query``.

        When updating an entry, updates its position to the front of the
        LRU list.  If the entry is new and the cache is at capacity, removes
        the oldest entry before the new entry is added.
        """
        node = self.lookup.get(query)
        if node is not None:
            # Key exists in cache, update the value
            node.results = results
            self.linked_list.move_to_front(node)
            return
        # Key does not exist in cache
        if self.size == self.MAX_SIZE:
            # Remove the oldest entry from the linked list and lookup
            self.lookup.pop(self.linked_list.tail.query, None)
            self.linked_list.remove_from_tail()
        else:
            self.size += 1
        # Add the new key and value
        new_node = Node(query, results)
        self.linked_list.append_to_front(new_node)
        self.lookup[query] = new_node
197 | ```
198 |
199 | #### When to update the cache
200 |
201 | The cache should be updated when:
202 |
203 | * The page contents change
204 | * The page is removed or a new page is added
205 | * The page rank changes
206 |
207 | The most straightforward way to handle these cases is to simply set a max time that a cached entry can stay in the cache before it is updated, usually referred to as time to live (TTL).
208 |
209 | Refer to [When to update the cache](https://github.com/donnemartin/system-design-primer#when-to-update-the-cache) for tradeoffs and alternatives. The approach above describes [cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside).
210 |
211 | ## Step 4: Scale the design
212 |
213 | > Identify and address bottlenecks, given the constraints.
214 |
215 | 
216 |
217 | **Important: Do not simply jump right into the final design from the initial design!**
218 |
219 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat. See [Design a system that scales to millions of users on AWS](../scaling_aws/README.md) as a sample on how to iteratively scale the initial design.
220 |
221 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them. For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**? **CDN**? **Master-Slave Replicas**? What are the alternatives and **Trade-Offs** for each?
222 |
223 | We'll introduce some components to complete the design and to address scalability issues. Internal load balancers are not shown to reduce clutter.
224 |
225 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
226 |
227 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
228 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
229 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
230 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
231 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
232 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
233 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
234 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
235 |
236 | ### Expanding the Memory Cache to many machines
237 |
238 | To handle the heavy request load and the large amount of memory needed, we'll scale horizontally. We have three main options on how to store the data on our **Memory Cache** cluster:
239 |
240 | * **Each machine in the cache cluster has its own cache** - Simple, although it will likely result in a low cache hit rate.
241 | * **Each machine in the cache cluster has a copy of the cache** - Simple, although it is an inefficient use of memory.
242 | * **The cache is [sharded](https://github.com/donnemartin/system-design-primer#sharding) across all machines in the cache cluster** - More complex, although it is likely the best option. We could use hashing to determine which machine could have the cached results of a query using `machine = hash(query)`. We'll likely want to use [consistent hashing](https://github.com/donnemartin/system-design-primer#under-development).
243 |
244 | ## Additional talking points
245 |
246 | > Additional topics to dive into, depending on the problem scope and time remaining.
247 |
248 | ### SQL scaling patterns
249 |
250 | * [Read replicas](https://github.com/donnemartin/system-design-primer#master-slave-replication)
251 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
252 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
253 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
254 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
255 |
256 | #### NoSQL
257 |
258 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store)
259 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store)
260 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store)
261 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database)
262 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
263 |
264 | ### Caching
265 |
266 | * Where to cache
267 | * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching)
268 | * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching)
269 | * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching)
270 | * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching)
271 | * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching)
272 | * What to cache
273 | * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
274 | * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
275 | * When to update the cache
276 | * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside)
277 | * [Write-through](https://github.com/donnemartin/system-design-primer#write-through)
278 | * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
279 | * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead)
280 |
281 | ### Asynchronism and microservices
282 |
283 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues)
284 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues)
285 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure)
286 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices)
287 |
288 | ### Communications
289 |
290 | * Discuss tradeoffs:
291 | * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
292 | * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
293 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery)
294 |
295 | ### Security
296 |
297 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security).
298 |
299 | ### Latency numbers
300 |
301 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
302 |
303 | ### Ongoing
304 |
305 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
306 | * Scaling is an iterative process
307 |
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/query_cache/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/query_cache/query_cache.graffle
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/query_cache/query_cache.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/query_cache/query_cache_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/query_cache/query_cache_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
class QueryApi(object):
    """Answer search queries, consulting an in-memory cache before the index.

    A parsed query is first looked up in ``memory_cache``. On a miss the
    results are computed by ``reverse_index_cluster`` and written back to the
    cache before being returned.
    """

    def __init__(self, memory_cache, reverse_index_cluster):
        """Store the collaborators used by ``process_query``.

        Args:
            memory_cache: Object exposing ``get(query)`` and a ``set`` method
                that accepts ``query`` and ``results`` keyword arguments.
            reverse_index_cluster: Object exposing ``process_search(query)``.
        """
        self.memory_cache = memory_cache
        self.reverse_index_cluster = reverse_index_cluster

    def parse_query(self, query):
        """Remove markup, break text into terms, deal with typos,
        normalize capitalization, convert to use boolean operations.

        Placeholder: the body is not implemented in this snippet, so it
        currently returns None.
        """
        ...

    def process_query(self, query):
        """Return the results for ``query``, using the cache when possible.

        Args:
            query: Raw query text; it is normalized with ``parse_query``
                before being used as the cache key.

        Returns:
            The cached results if present, otherwise the results produced by
            ``reverse_index_cluster.process_search``.
        """
        query = self.parse_query(query)
        results = self.memory_cache.get(query)
        if results is None:
            results = self.reverse_index_cluster.process_search(query)
            # Pass by keyword: Cache.set is declared as set(results, query),
            # so the positional order (query, results) would store the
            # results as the key and the query as the value.
            self.memory_cache.set(query=query, results=results)
        return results
23 |
24 |
class Node(object):
    """One cache entry: a query together with the results stored for it."""

    def __init__(self, query, results):
        """Record the query key and its associated results."""
        self.query, self.results = query, results
30 |
31 |
class LinkedList(object):
    """Skeleton of the list the cache uses to order its nodes.

    Only the attributes are set up; the three operations are placeholders
    without an implementation and simply return None.
    """

    def __init__(self):
        """Start empty: neither a head nor a tail node."""
        self.head = self.tail = None

    def move_to_front(self, node):
        """Placeholder (not implemented): intended to move ``node`` to the front."""
        pass

    def append_to_front(self, node):
        """Placeholder (not implemented): intended to add ``node`` at the front."""
        pass

    def remove_from_tail(self):
        """Placeholder (not implemented): intended to drop the tail node."""
        pass
46 |
47 |
class Cache(object):
    """Bounded query-result cache that evicts the entry at the list tail.

    ``lookup`` maps each query to its node, and ``linked_list`` keeps the
    nodes ordered so that the entry to evict is always at the tail.
    """

    def __init__(self, MAX_SIZE):
        """Create an empty cache.

        Args:
            MAX_SIZE: Maximum number of entries held before an insertion of a
                new key evicts the tail entry.
        """
        self.MAX_SIZE = MAX_SIZE
        self.size = 0
        self.lookup = {}  # query -> node
        self.linked_list = LinkedList()

    def get(self, query):
        """Get the stored query result from the cache.

        Accessing a node updates its position to the front of the LRU list.

        Returns:
            The cached results, or None when ``query`` is not in the cache.
        """
        # dict.get returns None on a miss; plain indexing would raise
        # KeyError and the miss branch below could never run.
        node = self.lookup.get(query)
        if node is None:
            return None
        self.linked_list.move_to_front(node)
        return node.results

    def set(self, results, query):
        """Set the result for the given query key in the cache.

        When updating an entry, updates its position to the front of the LRU
        list. If the entry is new and the cache is at capacity, removes the
        oldest entry before the new entry is added.

        Args:
            results: Value to store. Note that it is the FIRST parameter.
            query: Cache key.
        """
        node = self.lookup.get(query)
        if node is not None:
            # Key exists in cache: update the value and refresh its position.
            node.results = results
            self.linked_list.move_to_front(node)
            return
        # Key does not exist in cache.
        if self.size == self.MAX_SIZE:
            # Remove the oldest entry from the lookup and the linked list.
            # The tail must be read before it is unlinked.
            self.lookup.pop(self.linked_list.tail.query, None)
            self.linked_list.remove_from_tail()
        else:
            self.size += 1
        # Add the new key and value.
        new_node = Node(query, results)
        self.linked_list.append_to_front(new_node)
        self.lookup[query] = new_node
91 |
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 为 Amazon 设计分类售卖排行
2 |
3 | **注意:这个文档中的链接会直接指向[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)中的有关部分,以避免重复的内容。你可以参考链接的相关内容,来了解其总的要点、方案的权衡取舍以及可选的替代方案。**
4 |
5 | ## 第一步:简述用例与约束条件
6 |
7 | > 搜集需求与问题的范围。
8 | > 提出问题来明确用例与约束条件。
9 | > 讨论假设。
10 |
11 | 我们将在没有面试官明确说明问题的情况下,自己定义一些用例以及限制条件。
12 |
13 | ### 用例
14 |
15 | #### 我们将把问题限定在仅处理以下用例的范围中
16 |
17 | * **服务**根据分类计算过去一周中最受欢迎的商品
18 | * **用户**通过分类浏览过去一周中最受欢迎的商品
19 | * **服务**有着高可用性
20 |
21 | #### 不在用例范围内的有
22 |
23 | * 一般的电商网站
24 | * 只为售卖排行榜设计组件
25 |
26 | ### 限制条件与假设
27 |
28 | #### 提出假设
29 |
30 | * 网络流量不是均匀分布的
31 | * 一个商品可能存在于多个分类中
32 | * 商品不能够更改分类
33 | * 不会存在如 `foo/bar/baz` 之类的子分类
34 | * 每小时更新一次结果
35 | * 越受欢迎的商品可能需要越频繁地更新
36 | * 1000 万个商品
37 | * 1000 个分类
38 | * 每个月 10 亿次交易
39 | * 每个月 1000 亿次读取请求
40 | * 100:1 的读写比例
41 |
42 | #### 计算用量
43 |
44 | **如果你需要进行粗略的用量计算,请向你的面试官说明。**
45 |
46 | * 每笔交易的用量:
47 | * `created_at` - 5 字节
48 | * `product_id` - 8 字节
49 | * `category_id` - 4 字节
50 | * `seller_id` - 8 字节
51 | * `buyer_id` - 8 字节
52 | * `quantity` - 4 字节
53 | * `total_price` - 5 字节
54 | * 总计:大约 40 字节
55 | * 每个月的交易内容会产生 40 GB 的记录
56 | * 每次交易 40 字节 * 每个月 10 亿次交易
57 | * 3年内产生了 1.44 TB 的新交易内容记录
58 | * 假定大多数的交易都是新交易而不是更改以前进行完的交易
59 | * 平均每秒 400 次交易次数
60 | * 平均每秒 40,000 次读取请求
61 |
62 | 便利换算指南:
63 |
64 | * 每个月有 250 万秒
65 | * 每秒一个请求 = 每个月 250 万次请求
66 | * 每秒 40 个请求 = 每个月 1 亿次请求
67 | * 每秒 400 个请求 = 每个月 10 亿次请求
68 |
69 | ## 第二步:概要设计
70 |
71 | > 列出所有重要组件以规划概要设计。
72 |
73 | 
74 |
75 | ## 第三步:设计核心组件
76 |
77 | > 深入每个核心组件的细节。
78 |
79 | ### 用例:服务需要根据分类计算上周最受欢迎的商品
80 |
81 | 我们可以在现成的**对象存储**系统(例如 Amazon S3 服务)中存储 **售卖 API** 服务产生的日志文本, 因此不需要我们自己搭建分布式文件系统了。
82 |
83 | **向你的面试官告知你准备写多少代码**。
84 |
85 | 假设下面是一个用 tab 分割的简易的日志记录:
86 |
87 | ```
88 | timestamp product_id category_id qty total_price seller_id buyer_id
89 | t1 product1 category1 2 20.00 1 1
90 | t2 product1 category2 2 20.00 2 2
91 | t2 product1 category2 1 10.00 2 3
92 | t3 product2 category1 3 7.00 3 4
93 | t4 product3 category2 7 2.00 4 5
94 | t5 product4 category1 1 5.00 5 6
95 | ...
96 | ```
97 |
98 | **售卖排行服务** 需要用到 **MapReduce**,并使用 **售卖 API** 服务进行日志记录,同时将结果写入 **SQL 数据库**中的总表 `sales_rank` 中。我们也可以讨论一下[究竟是用 SQL 还是用 NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)。
99 |
100 | 我们需要通过以下步骤使用 **MapReduce**:
101 |
102 | * **第 1 步** - 将数据转换为 `(category, product_id), sum(quantity)` 的形式
103 | * **第 2 步** - 执行分布式排序
104 |
105 | ```python
class SalesRanker(MRJob):
    """Two-step MapReduce job ranking products by units sold per category."""

    def within_past_week(self, timestamp):
        """Return True if timestamp is within the past week, False otherwise.

        Placeholder: not implemented in this snippet.
        """
        ...

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        Emit key value pairs of the form:

        (category1, product1), 2
        (category2, product1), 2
        (category2, product1), 1
        (category1, product2), 3
        (category2, product3), 7
        (category1, product4), 1
        """
        timestamp, product_id, category_id, quantity, total_price, seller_id, \
            buyer_id = line.split('\t')
        if self.within_past_week(timestamp):
            # Emit an int: the raw field is text and could not be summed.
            yield (category_id, product_id), int(quantity)

    def reducer(self, key, values):
        """Sum values for each key.

        (category1, product1), 2
        (category2, product1), 3
        (category1, product2), 3
        (category2, product3), 7
        (category1, product4), 1
        """
        yield key, sum(values)

    def mapper_sort(self, key, value):
        """Construct key to ensure proper sorting.

        Transform key and value to the form:

        (category1, 2), product1
        (category2, 3), product1
        (category1, 3), product2
        (category2, 7), product3
        (category1, 1), product4

        The shuffle/sort step of MapReduce will then do a
        distributed sort on the keys, resulting in:

        (category1, 1), product4
        (category1, 2), product1
        (category1, 3), product2
        (category2, 3), product1
        (category2, 7), product3
        """
        category_id, product_id = key
        quantity = value
        yield (category_id, quantity), product_id

    def reducer_identity(self, key, value):
        """Emit every value grouped under ``key`` unchanged.

        Despite the singular name, a reducer receives an iterator over all
        values for the key, so each one is emitted individually.
        """
        for item in value:
            yield key, item

    def steps(self):
        """Run the map and reduce steps."""
        # MRStep replaces the MRJob.mr() helper, which newer mrjob releases
        # no longer provide.
        from mrjob.step import MRStep
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.mapper_sort,
                   reducer=self.reducer_identity),
        ]
176 | ```
177 |
178 | 得到的结果将会是如下的排序列,我们将其插入 `sales_rank` 表中:
179 |
180 | ```
181 | (category1, 1), product4
182 | (category1, 2), product1
183 | (category1, 3), product2
184 | (category2, 3), product1
185 | (category2, 7), product3
186 | ```
187 |
188 | `sales_rank` 表的数据结构如下:
189 |
190 | ```
191 | id int NOT NULL AUTO_INCREMENT
192 | category_id int NOT NULL
193 | total_sold int NOT NULL
194 | product_id int NOT NULL
195 | PRIMARY KEY(id)
196 | FOREIGN KEY(category_id) REFERENCES Categories(id)
197 | FOREIGN KEY(product_id) REFERENCES Products(id)
198 | ```
199 |
200 | 我们会以 `id`、`category_id` 与 `product_id` 创建一个 [索引](https://github.com/donnemartin/system-design-primer#use-good-indices)以加快查询速度(查找只需对数级时间,而不必每次都扫描整个数据表)并让数据常驻内存。从内存读取 1 MB 连续数据大约要花 250 微秒,而从 SSD 读取同样大小的数据要花费 4 倍的时间,从机械硬盘读取需要花费 80 倍以上的时间。<sup><a href="https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数">1</a></sup>
201 |
202 | ### 用例:用户需要根据分类浏览上周中最受欢迎的商品
203 |
204 | * **客户端**向运行[反向代理](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)的 **Web 服务器**发送一个请求
205 | * 这个 **Web 服务器**将请求转发给**查询 API** 服务
206 | * **查询 API** 服务将从 **SQL 数据库**的 `sales_rank` 表中读取数据
207 |
208 | 我们可以调用一个公共的 [REST API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest):
209 |
210 | ```
211 | $ curl https://amazon.com/api/v1/popular?category_id=1234
212 | ```
213 |
214 | 返回:
215 |
216 | ```
217 | {
218 | "id": "100",
219 | "category_id": "1234",
220 | "total_sold": "100000",
221 | "product_id": "50",
222 | },
223 | {
224 | "id": "53",
225 | "category_id": "1234",
226 | "total_sold": "90000",
227 | "product_id": "200",
228 | },
229 | {
230 | "id": "75",
231 | "category_id": "1234",
232 | "total_sold": "80000",
233 | "product_id": "3",
234 | },
235 | ```
236 |
237 | 而对于服务器内部的通信,我们可以使用 [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)。
238 |
239 | ## 第四步:架构扩展
240 |
241 | > 根据限制条件,找到并解决瓶颈。
242 |
243 | 
244 |
245 | **重要提示:不要从最初设计直接跳到最终设计中!**
246 |
247 | 现在你要 1) **基准测试、负载测试**。2) **分析、描述**性能瓶颈。3) 在解决瓶颈问题的同时,评估替代方案、权衡利弊。4) 重复以上步骤。请阅读[「设计一个系统,并将其扩大到为数以百万计的 AWS 用户服务」](../scaling_aws/README.md) 来了解如何逐步扩大初始设计。
248 |
249 | 讨论初始设计可能遇到的瓶颈及相关解决方案是很重要的。例如加上一个配置多台 **Web 服务器**的**负载均衡器**是否能够解决问题?**CDN**呢?**主从复制**呢?它们各自的替代方案和需要**权衡**的利弊又有什么呢?
250 |
251 | 我们将会介绍一些组件来完成设计,并解决架构扩张问题。内置的负载均衡器将不做讨论以节省篇幅。
252 |
253 | **为了避免重复讨论**,请参考[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)相关部分来了解其要点、方案的权衡取舍以及可选的替代方案。
254 |
255 | * [DNS](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#域名系统)
256 | * [负载均衡器](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#负载均衡器)
257 | * [水平拓展](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#水平扩展)
258 | * [反向代理(web 服务器)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)
259 | * [API 服务(应用层)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用层)
260 | * [缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存)
261 | * [关系型数据库管理系统 (RDBMS)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#关系型数据库管理系统rdbms)
262 | * [SQL 故障主从切换](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#故障切换)
263 | * [主从复制](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#主从复制)
264 | * [一致性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#一致性模式)
265 | * [可用性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#可用性模式)
266 |
267 | **分析数据库** 可以用现成的数据仓储系统,例如使用 Amazon Redshift 或者 Google BigQuery 的解决方案。
268 |
269 | 我们可以只在数据库中保存一段有限时间内的数据,其余数据则存放到数据仓库或**对象存储**系统中。像 Amazon S3 这样的**对象存储**系统能够轻松应对每月新增 40 GB 内容的约束。
270 |
271 | 平均每秒 40,000 次的读取请求(峰值将会更高), 可以通过扩展 **内存缓存** 来处理热点内容的读取流量,这对于处理不均匀分布的流量和流量峰值也很有用。由于读取量非常大,**SQL Read 副本** 可能会遇到处理缓存未命中的问题,我们可能需要使用额外的 SQL 扩展模式。
272 |
273 | 平均每秒 400 次写操作(峰值将会更高)对于单个 **SQL 写主-从** 模式来说可能比较困难,因此还需要更多的扩展技术。
274 |
275 | SQL 缩放模式包括:
276 |
277 | * [联合](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#联合)
278 | * [分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片)
279 | * [非规范化](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#非规范化)
280 | * [SQL 调优](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-调优)
281 |
282 | 我们也可以考虑将一些数据移至 **NoSQL 数据库**。
283 |
284 | ## 其它要点
285 |
286 | > 是否深入这些额外的主题,取决于你的问题范围和剩下的时间。
287 |
288 | #### NoSQL
289 |
290 | * [键-值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)
291 | * [文档类型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)
292 | * [列型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#列型存储)
293 | * [图数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#图数据库)
294 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)
295 |
296 | ### 缓存
297 |
298 | * 在哪缓存
299 | * [客户端缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#客户端缓存)
300 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#cdn-缓存)
301 | * [Web 服务器缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#web-服务器缓存)
302 | * [数据库缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库缓存)
303 | * [应用缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用缓存)
304 | * 什么需要缓存
305 | * [数据库查询级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库查询级别的缓存)
306 | * [对象级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#对象级别的缓存)
307 | * 何时更新缓存
308 | * [缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)
309 | * [直写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#直写模式)
310 | * [回写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#回写模式)
311 | * [刷新](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#刷新)
312 |
313 | ### 异步与微服务
314 |
315 | * [消息队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#消息队列)
316 | * [任务队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#任务队列)
317 | * [背压](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#背压)
318 | * [微服务](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#微服务)
319 |
320 | ### 通信
321 |
322 | * 可权衡选择的方案:
323 | * 与客户端的外部通信 - [使用 REST 作为 HTTP API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest)
324 | * 服务器内部通信 - [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
325 | * [服务发现](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#服务发现)
326 |
327 | ### 安全性
328 |
329 | 请参阅[「安全」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#安全)一章。
330 |
331 | ### 延迟数值
332 |
333 | 请参阅[「每个程序员都应该知道的延迟数」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)。
334 |
335 | ### 持续探讨
336 |
337 | * 持续进行基准测试并监控你的系统,以解决不断出现的瓶颈问题。
338 | * 架构拓展是一个迭代的过程。
339 |
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/sales_rank/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/sales_rank/sales_rank.graffle
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/sales_rank/sales_rank.png
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/sales_rank/sales_rank_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/sales_rank/sales_rank_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from mrjob.job import MRJob
4 |
5 |
class SalesRanker(MRJob):
    """Two-step MapReduce job ranking products by units sold per category.

    Step 1 sums the quantity sold for each (category, product_id) pair among
    the log lines accepted by ``within_past_week``. Step 2 re-keys the totals
    as (category, quantity) so that the shuffle/sort phase orders the
    products within each category.
    """

    def within_past_week(self, timestamp):
        """Return True if timestamp is within past week, False otherwise.

        Placeholder: not implemented here, so it currently returns None and
        ``mapper`` emits nothing until it is filled in.
        """
        ...

    def mapper(self, _, line):
        """Parse each log line, extract and transform relevant lines.

        Args:
            _: Ignored input key.
            line: Tab-separated log line whose first four columns are
                timestamp, product_id, category and quantity.

        Emit key value pairs of the form:

        (foo, p1), 2
        (bar, p1), 2
        (bar, p1), 1
        (foo, p2), 3
        (bar, p3), 10
        (foo, p4), 1
        """
        # Only the first four columns are needed; trailing columns (price,
        # seller, buyer, ...) are ignored instead of breaking the unpacking.
        timestamp, product_id, category, quantity = line.split('\t')[:4]
        if self.within_past_week(timestamp):
            # Emit an int: the raw field is text and could not be summed by
            # the reducer.
            yield (category, product_id), int(quantity)

    def reducer(self, key, values):
        """Sum values for each key.

        (foo, p1), 2
        (bar, p1), 3
        (foo, p2), 3
        (bar, p3), 10
        (foo, p4), 1
        """
        yield key, sum(values)

    def mapper_sort(self, key, value):
        """Construct key to ensure proper sorting.

        Transform key and value to the form:

        (foo, 2), p1
        (bar, 3), p1
        (foo, 3), p2
        (bar, 10), p3
        (foo, 1), p4

        The shuffle/sort step of MapReduce will then do a
        distributed sort on the keys, resulting in:

        (bar, 3), p1
        (bar, 10), p3
        (foo, 1), p4
        (foo, 2), p1
        (foo, 3), p2

        NOTE(review): the order shown assumes quantities compare numerically;
        confirm how the configured protocol serializes and sorts keys.
        """
        category, product_id = key
        quantity = value
        yield (category, quantity), product_id

    def reducer_identity(self, key, value):
        """Emit every value grouped under ``key`` unchanged.

        Despite the singular parameter name, a reducer receives an iterator
        over all values for the key, so each one is emitted individually
        rather than yielding the iterator object itself.
        """
        for item in value:
            yield key, item

    def steps(self):
        """Run the map and reduce steps."""
        # MRStep replaces the MRJob.mr() helper, which newer mrjob releases
        # no longer provide. Imported here so the method is self-contained.
        from mrjob.step import MRStep
        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer),
            MRStep(mapper=self.mapper_sort,
                   reducer=self.reducer_identity),
        ]
74 |
75 |
# Invoke the job's run() entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    SalesRanker.run()
78 |
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 在 AWS 上设计支持百万级到千万级用户的系统
2 |
3 | **注释:为了避免重复,这篇文章的链接直接关联到 [系统设计主题](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) 的相关章节。请参考链接内容以了解一般性的讨论要点、折中方案和可选方案。**
4 |
5 | ## 第 1 步:用例和约束概要
6 |
7 | > 收集需求并调查问题。
8 | > 通过提问清晰用例和约束。
9 | > 讨论假设。
10 |
11 | 如果没有面试官提出明确的问题,我们将自己定义一些用例和约束条件。
12 |
13 | ### 用例
14 |
15 | 解决这个问题是一个循序渐进的过程:1) **基准/负载测试**,2) **分析**瓶颈,3) 在评估可选方案和折中方案的同时解决瓶颈,4) 重复以上步骤。这是把基础设计逐步演进为可扩展设计的好模式。
16 |
17 | 除非你有 AWS 的背景或者正在申请需要 AWS 知识的相关职位,否则不要求了解 AWS 的相关细节。并且,这个练习中讨论的许多原则可以更广泛地应用于AWS生态系统之外。
18 |
19 | #### 我们就处理以下用例讨论这一问题
20 |
21 | * **用户** 进行读或写请求
22 | * **服务** 进行处理,存储用户数据,然后返回结果
23 | * **服务** 需要从支持小规模用户开始到百万用户
24 | * 在我们演化架构来处理大量的用户和请求时,讨论一般的扩展模式
25 | * **服务** 高可用
26 |
27 | ### 约束和假设
28 |
29 | #### 状态假设
30 |
31 | * 流量不均匀分布
32 | * 需要关系数据
33 | * 从一个用户扩展到千万用户
34 | * 表示用户量的增长
35 | * 用户量+
36 | * 用户量++
37 | * 用户量+++
38 | * ...
39 | * 1000 万用户
40 | * 每月 10 亿次写入
41 | * 每月 1000 亿次读出
42 | * 100:1 读写比率
43 | * 每次写入 1 KB 内容
44 |
45 | #### 计算使用
46 |
47 | **向你的面试官厘清你是否应该做粗略的使用计算**
48 |
49 | * 1 TB 新内容 / 月
50 | * 1 KB 每次写入 * 10 亿 写入 / 月
51 | * 36 TB 新内容 / 3 年
52 | * 假设大多数写入都是新内容而不是更新已有内容
53 | * 平均每秒 400 次写入
54 | * 平均每秒 40,000 次读取
55 |
56 | 便捷的转换指南:
57 |
58 | * 250 万秒 / 月
59 | * 1 次请求 / 秒 = 250 万次请求 / 月
60 | * 40 次请求 / 秒 = 1 亿次请求 / 月
61 | * 400 次请求 / 秒 = 10 亿请求 / 月
62 |
63 | ## 第 2 步:创建高级设计方案
64 |
65 | > 用所有重要组件概述高水平设计
66 |
67 | 
68 |
69 | ## 第 3 步:设计核心组件
70 |
71 | > 深入每个核心组件的细节。
72 |
73 | ### 用例:用户进行读写请求
74 |
75 | #### 目标
76 |
77 | * 只有 1-2 个用户时,你只需要基础配置
78 | * 为简单起见,只需要一台服务器
79 | * 必要时进行纵向扩展
80 | * 监控以确定瓶颈
81 |
82 | #### 以单台服务器开始
83 |
84 | * **Web 服务器** 在 EC2 上
85 | * 存储用户数据
86 | * [**MySQL 数据库**](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)
87 |
88 | 运用 **纵向扩展**:
89 |
90 | * 选择一台更大容量的服务器
91 | * 密切关注指标,确定如何扩大规模
92 | * 使用基本监控来确定瓶颈:CPU、内存、IO、网络等
93 | * CloudWatch, top, nagios, statsd, graphite等
94 | * 纵向扩展的代价将变得更昂贵
95 | * 无冗余/容错
96 |
97 | **折中方案, 可选方案, 和其他细节:**
98 |
99 | * **纵向扩展** 的可选方案是 [**横向扩展**](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
100 |
101 | #### 自 SQL 开始,但认真考虑 NoSQL
102 |
103 | 约束条件假设需要关系型数据。我们可以开始时在单台服务器上使用 **MySQL 数据库**。
104 |
105 | **折中方案, 可选方案, 和其他细节:**
106 |
107 | * 查阅 [关系型数据库管理系统 (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms) 章节
108 | * 讨论使用 [SQL 或 NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 的原因
109 |
110 | #### 分配公共静态 IP
111 |
112 | * 弹性 IP 提供了一个公共端点,不会在重启时改变 IP。
113 | * 故障转移时只需要把域名指向新 IP。
114 |
115 | #### 使用 DNS 服务
116 |
117 | 添加 **DNS** 服务,比如 Route 53([Amazon Route 53](https://aws.amazon.com/cn/route53/) - 译者注),将域映射到实例的公共 IP 中。
118 |
119 | **折中方案, 可选方案, 和其他细节:**
120 |
121 | * 查阅 [域名系统](https://github.com/donnemartin/system-design-primer#domain-name-system) 章节
122 |
123 | #### 安全的 Web 服务器
124 |
125 | * 只开放必要的端口
126 | * 允许 Web 服务器响应来自以下端口的请求
127 | * HTTP 80
128 | * HTTPS 443
129 | * SSH IP 白名单 22
130 | * 禁止 Web 服务器主动发起对外连接
131 |
132 | **折中方案, 可选方案, 和其他细节:**
133 |
134 | * 查阅 [安全](https://github.com/donnemartin/system-design-primer#security) 章节
135 |
136 | ## 第 4 步:扩展设计
137 |
138 | > 在给定约束条件下,定义和确认瓶颈。
139 |
140 | ### 用户+
141 |
142 | 
143 |
144 | #### 假设
145 |
146 | 我们的用户数量开始上升,并且单台服务器的负载上升。**基准/负载测试** 和 **分析** 指出 **MySQL 数据库** 占用越来越多的内存和 CPU 资源,同时用户数据将填满硬盘空间。
147 |
148 | 目前,我们尚能在纵向扩展时解决这些问题。不幸的是,解决这些问题的代价变得相当昂贵,并且原来的系统并不能允许在 **MySQL 数据库** 和 **Web 服务器** 的基础上进行独立扩展。
149 |
150 | #### 目标
151 |
152 | * 减轻单台服务器负载并且允许独立扩展
153 | * 在 **对象存储** 中单独存储静态内容
154 | * 将 **MySQL 数据库** 迁移到单独的服务器上
155 | * 缺点
156 | * 这些变化会增加复杂性,并要求对 **Web服务器** 进行更改,以指向 **对象存储** 和 **MySQL 数据库**
157 | * 必须采取额外的安全措施来确保新组件的安全
158 | * AWS 的成本也会增加,但应该与自身管理类似系统的成本做比较
159 |
160 | #### 独立保存静态内容
161 |
162 | * 考虑使用像 S3 这样可管理的 **对象存储** 服务来存储静态内容
163 | * 高扩展性和可靠性
164 | * 服务器端加密
165 | * 迁移静态内容到 S3
166 | * 用户文件
167 | * JS
168 | * CSS
169 | * 图片
170 | * 视频
171 |
172 | #### 迁移 MySQL 数据库到独立机器上
173 |
174 | * 考虑使用类似 RDS 的服务来管理 **MySQL 数据库**
175 | * 简单的管理,扩展
176 | * 多个可用区域
177 | * 空闲时加密
178 |
179 | #### 系统安全
180 |
181 | * 在传输和空闲时对数据进行加密
182 | * 使用虚拟私有云
183 | * 为单个 **Web 服务器** 创建一个公共子网,这样就可以发送和接收来自 internet 的流量
184 | * 为其他内容创建一个私有子网,禁止外部访问
185 | * 在每个组件上只为白名单 IP 打开端口
186 | * 这些相同的模式应当在新的组件的实现中实践
187 |
188 | **折中方案, 可选方案, 和其他细节:**
189 |
190 | * 查阅 [安全](https://github.com/donnemartin/system-design-primer#security) 章节
191 |
192 | ### 用户++
193 |
194 | 
195 |
196 | #### 假设
197 |
198 | 我们的 **基准/负载测试** 和 **分析** 显示,在高峰时段,我们的单一 **Web服务器** 存在瓶颈,导致响应缓慢,在某些情况下还会宕机。随着服务的成熟,我们也希望朝着更高的可用性和冗余发展。
199 |
200 | #### 目标
201 |
202 | * 下面的目标试图用 **Web服务器** 解决扩展问题
203 | * 基于 **基准/负载测试** 和 **分析**,你可能只需要实现其中的一两个技术
204 | * 使用 [**横向扩展**](https://github.com/donnemartin/system-design-primer#horizontal-scaling) 来处理增加的负载和单点故障
205 | * 添加 [**负载均衡器**](https://github.com/donnemartin/system-design-primer#load-balancer) 例如 Amazon 的 ELB 或 HAProxy
206 | * ELB 是高可用的
207 | * 如果你正在配置自己的 **负载均衡器**, 在多个可用区域中设置多台服务器用于 [双活](https://github.com/donnemartin/system-design-primer#active-active) 或 [主被](https://github.com/donnemartin/system-design-primer#active-passive) 将提高可用性
208 | * 在 **负载均衡器** 上终止 SSL,以减少后端服务器上的计算负载,并简化证书管理
209 | * 在多个可用区域中使用多台 **Web服务器**
210 | * 在多个可用区域的 [**主-从 故障转移**](https://github.com/donnemartin/system-design-primer#master-slave-replication) 模式中使用多个 **MySQL** 实例来改进冗余
211 | * 分离 **Web 服务器** 和 [**应用服务器**](https://github.com/donnemartin/system-design-primer#application-layer)
212 | * 独立扩展和配置每一层
213 | * **Web 服务器** 可以作为 [**反向代理**](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
214 | * 例如, 你可以添加 **应用服务器** 处理 **读 API** 而另外一些处理 **写 API**
215 | * 将静态(和一些动态)内容转移到 [**内容分发网络 (CDN)**](https://github.com/donnemartin/system-design-primer#content-delivery-network) 例如 CloudFront 以减少负载和延迟
216 |
217 | **折中方案, 可选方案, 和其他细节:**
218 |
219 | * 查阅以上链接获得更多细节
220 |
221 | ### 用户+++
222 |
223 | 
224 |
225 | **注意:** **内部负载均衡** 不显示以减少混乱
226 |
227 | #### 假设
228 |
229 | 我们的 **基准/负载测试** 和 **分析** 显示我们读操作频繁(100:1 的读写比率),并且数据库在高读请求时表现很糟糕。
230 |
231 | #### 目标
232 |
233 | * 下面的目标试图解决 **MySQL数据库** 的伸缩性问题
234 | * 基于 **基准/负载测试** 和 **分析**,你可能只需要实现其中的一两个技术
235 | * 将下列数据移动到一个 [**内存缓存**](https://github.com/donnemartin/system-design-primer#cache),例如弹性缓存,以减少负载和延迟:
236 | * **MySQL** 中频繁访问的内容
237 | * 首先, 尝试配置 **MySQL 数据库** 缓存以查看是否足以在实现 **内存缓存** 之前缓解瓶颈
238 | * 来自 **Web 服务器** 的会话数据
239 | * **Web 服务器** 变成无状态的, 允许 **自动伸缩**
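240 | * 从内存中顺序读取 1 MB 数据需要大约 250 微秒,而从 SSD 中读取的时间要长 4 倍,从磁盘读取的时间要长 80 倍。<sup><a href="https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know">1</a></sup>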
241 | * 添加 [**MySQL 读取副本**](https://github.com/donnemartin/system-design-primer#master-slave-replication) 来减轻写主库的负载
242 | * 添加更多 **Web 服务器** 和 **应用服务器** 来提高响应
243 |
244 | **折中方案, 可选方案, 和其他细节:**
245 |
246 | * 查阅以上链接获得更多细节
247 |
248 | #### 添加 MySQL 读取副本
249 |
250 | * 除了添加和扩展 **内存缓存**,**MySQL 读副本服务器** 也能够帮助缓解在 **MySQL 写主服务器** 的负载。
251 | * 添加逻辑到 **Web 服务器** 来区分读和写操作
252 | * 在 **MySQL 读副本服务器** 之上添加 **负载均衡器**(为减少混乱,图中未画出)
253 | * 大多数服务都是读取负载大于写入负载
254 |
255 | **折中方案, 可选方案, 和其他细节:**
256 |
257 | * 查阅 [关系型数据库管理系统 (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms) 章节
258 |
259 | ### 用户++++
260 |
261 | 
262 |
263 | #### 假设
264 |
265 | **基准/负载测试** 和 **分析** 显示,在美国,正常工作时间存在流量峰值,当用户离开办公室时,流量骤降。我们认为,可以通过真实负载自动转换服务器数量来降低成本。我们是一家小商店,所以我们希望 DevOps 尽量自动化地进行 **自动伸缩** 和通用操作。
266 |
267 | #### 目标
268 |
269 | * 根据需要添加 **自动扩展**
270 | * 跟踪流量高峰
271 | * 通过关闭未使用的实例来降低成本
272 | * 自动化 DevOps
273 | * Chef, Puppet, Ansible 工具等
274 | * 继续监控指标以解决瓶颈
275 | * **主机水平** - 检查一个 EC2 实例
276 | * **总水平** - 检查负载均衡器统计数据
277 | * **日志分析** - CloudWatch, CloudTrail, Loggly, Splunk, Sumo
278 | * **外部站点的性能** - Pingdom or New Relic
279 | * **处理通知和事件** - PagerDuty
280 | * **错误报告** - Sentry
281 |
282 | #### 添加自动扩展
283 |
284 | * 考虑使用一个托管服务,比如AWS **自动扩展**
285 | * 为每个 **Web 服务器** 创建一个组,并为每个 **应用服务器** 类型创建一个组,将每个组放置在多个可用区域中
286 | * 设置最小和最大实例数
287 | * 通过 CloudWatch 来扩展或收缩
288 | * 可预测负载的简单时间度量
289 | * 一段时间内的指标:
290 | * CPU 负载
291 | * 延迟
292 | * 网络流量
293 | * 自定义指标
294 | * 缺点
295 | * 自动扩展会引入复杂性
296 | * 可能需要一段时间才能适当扩大规模,以满足增加的需求,或者在需求下降时缩减规模
297 |
298 | ### 用户+++++
299 |
300 | 
301 |
302 | **注释:** **自动伸缩** 组不显示以减少混乱
303 |
304 | #### 假设
305 |
306 | 当服务继续向着限制条件概述的方向发展,我们反复地运行 **基准/负载测试** 和 **分析** 来进一步发现和定位新的瓶颈。
307 |
308 | #### 目标
309 |
310 | 由于问题的约束,我们将继续提出扩展性的问题:
311 |
312 | * 如果我们的 **MySQL 数据库** 开始变得过于庞大, 我们可能只考虑把数据在数据库中存储一段有限的时间, 同时在例如 Redshift 这样的数据仓库中存储其余的数据
313 | * 像 Redshift 这样的数据仓库能够轻松处理每月 1TB 的新内容
314 | * 平均每秒 40,000 次的读取请求, 可以通过扩展 **内存缓存** 来处理热点内容的读取流量,这对于处理不均匀分布的流量和流量峰值也很有用
315 | * **SQL读取副本** 可能会遇到处理缓存未命中的问题, 我们可能需要使用额外的 SQL 扩展模式
316 | * 对于单个 **SQL 写主-从** 模式来说,平均每秒 400 次写操作(峰值可能明显更高)可能会很困难,同时还需要更多的扩展技术
317 |
318 | SQL 扩展模型包括:
319 |
320 | * [集合](https://github.com/donnemartin/system-design-primer#federation)
321 | * [分片](https://github.com/donnemartin/system-design-primer#sharding)
322 | * [反范式](https://github.com/donnemartin/system-design-primer#denormalization)
323 | * [SQL 调优](https://github.com/donnemartin/system-design-primer#sql-tuning)
324 |
325 | 为了进一步处理高读和写请求,我们还应该考虑将适当的数据移动到一个 [**NoSQL数据库**](https://github.com/donnemartin/system-design-primer#nosql) ,例如 DynamoDB。
326 |
327 | 我们可以进一步分离我们的 [**应用服务器**](https://github.com/donnemartin/system-design-primer#application-layer) 以允许独立扩展。不需要实时完成的批处理任务和计算可以通过 Queues 和 Workers 异步完成:
328 |
329 | * 以照片服务为例,照片上传和缩略图的创建可以分开进行
330 | * **客户端** 上传图片
331 | * **应用服务器** 推送一个任务到 **队列** 例如 SQS
332 | * EC2 上的 **Worker 服务** 或者 Lambda 从 **队列** 拉取 work,然后:
333 | * 创建缩略图
334 | * 更新 **数据库**
335 | * 在 **对象存储** 中存储缩略图
336 |
337 | **折中方案, 可选方案, 和其他细节:**
338 |
339 | * 查阅以上链接获得更多细节
340 |
341 | ## 额外的话题
342 |
343 | > 根据问题的范围和剩余时间,还需要深入讨论其他问题。
344 |
345 | ### SQL 扩展模式
346 |
347 | * [读取副本](https://github.com/donnemartin/system-design-primer#master-slave-replication)
348 | * [集合](https://github.com/donnemartin/system-design-primer#federation)
349 | * [分区](https://github.com/donnemartin/system-design-primer#sharding)
350 | * [反规范化](https://github.com/donnemartin/system-design-primer#denormalization)
351 | * [SQL 调优](https://github.com/donnemartin/system-design-primer#sql-tuning)
352 |
353 | #### NoSQL
354 |
355 | * [键值存储](https://github.com/donnemartin/system-design-primer#key-value-store)
356 | * [文档存储](https://github.com/donnemartin/system-design-primer#document-store)
357 | * [宽表存储](https://github.com/donnemartin/system-design-primer#wide-column-store)
358 | * [图数据库](https://github.com/donnemartin/system-design-primer#graph-database)
359 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
360 |
361 | ### 缓存
362 |
363 | * 缓存到哪里
364 | * [客户端缓存](https://github.com/donnemartin/system-design-primer#client-caching)
365 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer#cdn-caching)
366 | * [Web 服务缓存](https://github.com/donnemartin/system-design-primer#web-server-caching)
367 | * [数据库缓存](https://github.com/donnemartin/system-design-primer#database-caching)
368 | * [应用缓存](https://github.com/donnemartin/system-design-primer#application-caching)
369 | * 缓存什么
370 | * [数据库请求层缓存](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
371 | * [对象层缓存](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
372 | * 何时更新缓存
373 | * [预留缓存](https://github.com/donnemartin/system-design-primer#cache-aside)
374 | * [完全写入](https://github.com/donnemartin/system-design-primer#write-through)
375 | * [延迟写 (写回)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
376 | * [事先更新](https://github.com/donnemartin/system-design-primer#refresh-ahead)
377 |
378 | ### 异步性和微服务
379 |
380 | * [消息队列](https://github.com/donnemartin/system-design-primer#message-queues)
381 | * [任务队列](https://github.com/donnemartin/system-design-primer#task-queues)
382 | * [回退压力](https://github.com/donnemartin/system-design-primer#back-pressure)
383 | * [微服务](https://github.com/donnemartin/system-design-primer#microservices)
384 |
385 | ### 沟通
386 |
387 | * 关于折中方案的讨论:
388 | * 客户端的外部通讯 - [遵循 REST 的 HTTP APIs](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
389 | * 内部通讯 - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
390 | * [服务探索](https://github.com/donnemartin/system-design-primer#service-discovery)
391 |
392 | ### 安全性
393 |
394 | 参考 [安全章节](https://github.com/donnemartin/system-design-primer#security)
395 |
396 | ### 延迟数字指标
397 |
398 | 查阅 [每个程序员必懂的延迟数字](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know)
399 |
400 | ### 正在进行
401 |
402 | * 继续基准测试并监控你的系统以解决出现的瓶颈问题
403 | * 扩展是一个迭代的过程
404 |
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws.graffle
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_1.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_2.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_3.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_4.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_5.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_6.png
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/scaling_aws_7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/scaling_aws/scaling_aws_7.png
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 为社交网络设计数据结构
2 |
3 | **注释:为了避免重复,这篇文章的链接直接关联到 [系统设计主题](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) 的相关章节。请参考链接内容以了解讨论要点、折中方案和可选方案。**
4 |
5 | ## 第 1 步:用例和约束概要
6 |
7 | > 收集需求并调查问题。
8 | > 通过提问清晰用例和约束。
9 | > 讨论假设。
10 |
11 | 如果没有面试官提出明确的问题,我们将自己定义一些用例和约束条件。
12 |
13 | ### 用例
14 |
15 | #### 我们就处理以下用例审视这一问题
16 |
17 | * **用户** 寻找某人并显示与被寻人之间的最短路径
18 | * **服务** 高可用
19 |
20 | ### 约束和假设
21 |
22 | #### 状态假设
23 |
24 | * 流量分布不均
25 | * 某些搜索比别的更热门,同时某些搜索仅执行一次
26 | * 图数据不适用单一机器
27 | * 图的边没有权重
28 | * 1 亿用户
29 | * 每个用户平均有 50 个朋友
30 | * 每月 10 亿次朋友搜索
31 |
32 | 练习使用更传统的系统 - 不要使用图特有的解决方案,例如 [GraphQL](http://graphql.org/) 或图数据库如 [Neo4j](https://neo4j.com/)。
33 |
34 | #### 计算使用
35 |
36 | **向你的面试官厘清你是否应该做粗略的使用计算**
37 |
38 | * 50 亿朋友关系
39 | * 1 亿用户 * 平均每人 50 个朋友
40 | * 每秒 400 次搜索请求
41 |
42 | 便捷的转换指南:
43 |
44 | * 每月 250 万秒
45 | * 每秒 1 个请求 = 每月 250 万次请求
46 | * 每秒 40 个请求 = 每月 1 亿次请求
47 | * 每秒 400 个请求 = 每月 10 亿次请求
48 |
49 | ## 第 2 步:创建高级设计方案
50 |
51 | > 用所有重要组件概述高水平设计
52 |
53 | 
54 |
55 | ## 第 3 步:设计核心组件
56 |
57 | > 深入每个核心组件的细节。
58 |
59 | ### 用例: 用户搜索某人并查看到被搜人的最短路径
60 |
61 | **和你的面试官说清你期望的代码量**
62 |
63 | 如果没有数百万用户(点)和数十亿朋友关系(边)的限制,我们可以用一般的 BFS 方法解决无权重最短路径问题:
64 |
65 | ```python
class Graph(Graph):
    """Extend the base Graph with an unweighted shortest-path search."""

    def shortest_path(self, source, dest):
        """Return the node keys on a shortest path from source to dest.

        Returns None when either endpoint is None or when dest cannot be
        reached from source.  When source and dest are the same node the
        path is just that node's key.
        """
        if source is None or dest is None:
            return None
        if source is dest:
            return [source.key]
        parent_of = self._shortest_path(source, dest)
        if parent_of is None:
            return None
        # Follow the parent links back from dest; the source maps to None.
        backwards = [dest.key]
        step = parent_of[dest.key]
        while step is not None:
            backwards.append(step)
            step = parent_of[step]
        backwards.reverse()
        return backwards

    def _shortest_path(self, source, dest):
        """Run BFS from source and return the key -> parent-key map.

        The map is returned as soon as dest is dequeued; None is returned
        if the queue drains first.  Nodes are marked State.visited when
        they are first discovered.
        """
        pending = deque()
        pending.append(source)
        parent_of = {source.key: None}
        source.visit_state = State.visited
        while pending:
            current = pending.popleft()
            if current is dest:
                return parent_of
            for neighbor in current.adj_nodes.values():
                if neighbor.visit_state == State.unvisited:
                    pending.append(neighbor)
                    parent_of[neighbor.key] = current.key
                    neighbor.visit_state = State.visited
        return None
100 | ```
101 |
102 | 我们不能在同一台机器上满足所有用户,我们需要通过 **人员服务器** [拆分](https://github.com/donnemartin/system-design-primer#sharding) 用户并且通过 **查询服务** 访问。
103 |
104 | * **客户端** 向 **服务器** 发送请求,**服务器** 作为 [反向代理](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
105 | * **搜索 API** 服务器向 **用户图服务** 转发请求
106 | * **用户图服务** 有以下功能:
107 | * 使用 **查询服务** 找到当前用户信息存储的 **人员服务器**
108 | * 找到适当的 **人员服务器** 检索当前用户的 `friend_ids` 列表
109 | * 把当前用户作为 `source` 运行 BFS 搜索算法同时 当前用户的 `friend_ids` 作为每个 `adjacent_node` 的 ids
110 | * 给定 id 获取 `adjacent_node`:
111 | * **用户图服务** 将 **再次** 和 **查询服务** 通讯,最后判断出和给定 id 相匹配的存储 `adjacent_node` 的 **人员服务器**(有待优化)
112 |
113 | **和你的面试官说清你应该写的代码量**
114 |
115 | **注释**:简易版错误处理执行如下。询问你是否需要编写适当的错误处理方法。
116 |
117 | **查询服务** 实现:
118 |
119 | ```python
class LookupService(object):
    """Map a person id to the person server that stores that person."""

    def __init__(self):
        # key: person_id, value: person_server
        table = self._init_lookup()
        self.lookup = table

    def _init_lookup(self):
        """Build the lookup table (implementation elided in this walkthrough)."""
        ...

    def lookup_person_server(self, person_id):
        """Return the person server registered for person_id."""
        server = self.lookup[person_id]
        return server
130 | ```
131 |
132 | **人员服务器** 实现:
133 |
134 | ```python
class PersonServer(object):
    """Hold one shard of people, keyed by person id."""

    def __init__(self):
        # Kept under a private name: an instance attribute called `people`
        # would shadow the `people()` method below, making
        # `server.people(ids)` fail with "'dict' object is not callable".
        self._people = {}  # key: person_id, value: person

    def add_person(self, person):
        """Store person on this server under its `id` attribute."""
        self._people[person.id] = person

    def people(self, ids):
        """Return the stored people for ids, in the order requested.

        Ids that are not stored on this server are silently skipped.
        """
        return [self._people[person_id]
                for person_id in ids
                if person_id in self._people]
149 | ```
150 |
151 | **用户** 实现:
152 |
153 | ```python
class Person(object):
    """A user record: an id, a name and the ids of the user's friends."""

    def __init__(self, id, name, friend_ids):
        self.id, self.name, self.friend_ids = id, name, friend_ids
160 | ```
161 |
162 | **用户图服务** 实现:
163 |
164 | ```python
class UserGraphService(object):
    """Find shortest friend paths when people are spread over person servers."""

    def __init__(self, lookup_service):
        self.lookup_service = lookup_service

    def person(self, person_id):
        """Return the Person for person_id, or None if its server has no record.

        The person server answers with a list; BFS needs the single Person,
        so the first (only) match is unwrapped here.
        """
        person_server = self.lookup_service.lookup_person_server(person_id)
        matches = person_server.people([person_id])
        return matches[0] if matches else None

    def shortest_path(self, source_key, dest_key):
        """Return the person ids on a shortest path from source to dest.

        Returns None when either key is None or when no path exists.
        """
        if source_key is None or dest_key is None:
            return None
        # Compare ids by value; identity is unreliable for ints and strings.
        if source_key == dest_key:
            return [source_key]
        prev_node_keys = self._shortest_path(source_key, dest_key)
        if prev_node_keys is None:
            return None
        # Iterate through the path_ids backwards, starting at dest_key
        path_ids = [dest_key]
        prev_node_key = prev_node_keys[dest_key]
        while prev_node_key is not None:
            path_ids.append(prev_node_key)
            prev_node_key = prev_node_keys[prev_node_key]
        # Reverse the list since we iterated backwards
        return path_ids[::-1]

    def _shortest_path(self, source_key, dest_key, path=None):
        """Run BFS from source_key and return the id -> previous-id map.

        Returns None if the source is unknown or dest_key is unreachable.
        `path` is unused; it is kept (now optional) only so existing
        three-argument callers keep working.
        """
        # Use the id to get the Person
        source = self.person(source_key)
        if source is None:
            return None
        queue = deque([source])
        # prev_node_keys keeps track of each hop from
        # the source_key to the dest_key
        prev_node_keys = {source_key: None}
        # Visited state lives in a local set rather than on the node,
        # because Person objects are fetched from remote servers.
        visited_ids = {source.id}
        while queue:
            node = queue.popleft()
            if node.id == dest_key:
                return prev_node_keys
            for friend_id in node.friend_ids:
                if friend_id in visited_ids:
                    continue
                visited_ids.add(friend_id)
                friend_node = self.person(friend_id)
                if friend_node is None:
                    # Dangling friend id: no record to expand, skip it.
                    continue
                queue.append(friend_node)
                prev_node_keys[friend_id] = node.id
        return None
218 | ```
219 |
220 | 我们用的是公共的 [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
221 |
222 | ```
223 | $ curl https://social.com/api/v1/friend_search?person_id=1234
224 | ```
225 |
226 | 响应:
227 |
228 | ```
229 | {
230 | "person_id": "100",
231 | "name": "foo",
232 | "link": "https://social.com/foo",
233 | },
234 | {
235 | "person_id": "53",
236 | "name": "bar",
237 | "link": "https://social.com/bar",
238 | },
239 | {
240 | "person_id": "1234",
241 | "name": "baz",
242 | "link": "https://social.com/baz",
243 | },
244 | ```
245 |
246 | 内部通信使用 [远端过程调用](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)。
247 |
248 | ## 第 4 步:扩展设计
249 |
250 | > 在给定约束条件下,定义和确认瓶颈。
251 |
252 | 
253 |
254 | **重要:别简化从最初设计到最终设计的过程!**
255 |
256 | 你将要做的是:1) **基准/负载 测试**, 2) 瓶颈 **概述**, 3) 当评估可选和折中方案时定位瓶颈,4) 重复。以 [在 AWS 上设计支持百万级到千万级用户的系统](../scaling_aws/README.md) 为参考迭代地扩展最初设计。
257 |
258 | 讨论最初设计可能遇到的瓶颈和处理方法十分重要。例如,什么问题可以通过添加多台 **Web 服务器** 作为 **负载均衡** 解决?**CDN**?**主从副本**?每个问题都有哪些替代和 **折中** 方案?
259 |
260 | 我们即将介绍一些组件来完成设计和解决扩展性问题。内部负载均衡不显示以减少混乱。
261 |
262 | **避免重复讨论**,以下网址链接到 [系统设计主题](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) 相关的主流方案、折中方案和替代方案。
263 |
264 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
265 | * [负载均衡](https://github.com/donnemartin/system-design-primer#load-balancer)
266 | * [横向扩展](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
267 | * [Web 服务器(反向代理)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
268 | * [API 服务器(应用层)](https://github.com/donnemartin/system-design-primer#application-layer)
269 | * [缓存](https://github.com/donnemartin/system-design-primer#cache)
270 | * [一致性模式](https://github.com/donnemartin/system-design-primer#consistency-patterns)
271 | * [可用性模式](https://github.com/donnemartin/system-design-primer#availability-patterns)
272 |
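273 | 为了解决 **平均** 每秒 400 次读请求(峰值时更高)的限制,人员数据可以存在例如 Redis 或 Memcached 这样的 **内存缓存** 中,以减少响应时间和流向下游服务的流量。这对于连续执行多次查询的用户以及查询那些人脉广泛的人时尤其有用。从内存中读取 1MB 数据大约要 250 微秒,从 SSD 中读取同样大小的数据时间要长 4 倍,从硬盘要长 80 倍。<sup><a href="https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know">1</a></sup>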
274 |
275 | 以下是进一步优化方案:
276 |
277 | * 在 **内存** 中存储完整的或部分的BFS遍历加快后续查找
278 | * 在 **NoSQL 数据库** 中批量离线计算并存储完整的或部分的BFS遍历加快后续查找
279 | * 在同一台 **人员服务器** 上托管批处理同一批朋友查找减少机器跳转
280 | * 通过地理位置 [拆分](https://github.com/donnemartin/system-design-primer#sharding) **人员服务器** 来进一步优化,因为朋友通常住得都比较近
281 | * 同时进行两个 BFS 查找,一个从 source 开始,一个从 destination 开始,然后合并两个路径
282 | * 从有庞大朋友圈的人开始找起,这样更有可能减小当前用户和搜索目标之间的 [离散度数](https://en.wikipedia.org/wiki/Six_degrees_of_separation)
283 | * 在询问用户是否继续查询之前设置基于时间或跳跃数阈值,当在某些案例中搜索耗费时间过长时。
284 | * 使用类似 [Neo4j](https://neo4j.com/) 的 **图数据库** 或图特定查询语法,例如 [GraphQL](http://graphql.org/)(如果没有禁止使用 **图数据库** 的限制的话)
285 |
286 | ## 额外的话题
287 |
288 | > 根据问题的范围和剩余时间,还需要深入讨论其他问题。
289 |
290 | ### SQL 扩展模式
291 |
292 | * [读取副本](https://github.com/donnemartin/system-design-primer#master-slave-replication)
293 | * [集合](https://github.com/donnemartin/system-design-primer#federation)
294 | * [分区](https://github.com/donnemartin/system-design-primer#sharding)
295 | * [反规范化](https://github.com/donnemartin/system-design-primer#denormalization)
296 | * [SQL 调优](https://github.com/donnemartin/system-design-primer#sql-tuning)
297 |
298 | #### NoSQL
299 |
300 | * [键值存储](https://github.com/donnemartin/system-design-primer#key-value-store)
301 | * [文档存储](https://github.com/donnemartin/system-design-primer#document-store)
302 | * [宽表存储](https://github.com/donnemartin/system-design-primer#wide-column-store)
303 | * [图数据库](https://github.com/donnemartin/system-design-primer#graph-database)
304 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
305 |
306 | ### 缓存
307 |
308 | * 缓存到哪里
309 | * [客户端缓存](https://github.com/donnemartin/system-design-primer#client-caching)
310 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer#cdn-caching)
311 | * [Web 服务缓存](https://github.com/donnemartin/system-design-primer#web-server-caching)
312 | * [数据库缓存](https://github.com/donnemartin/system-design-primer#database-caching)
313 | * [应用缓存](https://github.com/donnemartin/system-design-primer#application-caching)
314 | * 缓存什么
315 | * [数据库请求层缓存](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
316 | * [对象层缓存](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
317 | * 何时更新缓存
318 | * [预留缓存](https://github.com/donnemartin/system-design-primer#cache-aside)
319 | * [完全写入](https://github.com/donnemartin/system-design-primer#write-through)
320 | * [延迟写 (写回)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
321 | * [事先更新](https://github.com/donnemartin/system-design-primer#refresh-ahead)
322 |
323 | ### 异步性和微服务
324 |
325 | * [消息队列](https://github.com/donnemartin/system-design-primer#message-queues)
326 | * [任务队列](https://github.com/donnemartin/system-design-primer#task-queues)
327 | * [回退压力](https://github.com/donnemartin/system-design-primer#back-pressure)
328 | * [微服务](https://github.com/donnemartin/system-design-primer#microservices)
329 |
330 | ### 沟通
331 |
332 | * 关于折中方案的讨论:
333 | * 客户端的外部通讯 - [遵循 REST 的 HTTP APIs](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
334 | * 内部通讯 - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
335 | * [服务探索](https://github.com/donnemartin/system-design-primer#service-discovery)
336 |
337 | ### 安全性
338 |
339 | 参考 [安全章节](https://github.com/donnemartin/system-design-primer#security)
340 |
341 | ### 延迟数字指标
342 |
343 | 查阅 [每个程序员必懂的延迟数字](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know)
344 |
345 | ### 正在进行
346 |
347 | * 继续基准测试并监控你的系统以解决出现的瓶颈问题
348 | * 扩展是一个迭代的过程
349 |
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/social_graph/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/social_graph/social_graph.graffle
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/social_graph/social_graph.png
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/social_graph/social_graph_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/social_graph/social_graph_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from collections import deque
3 | from enum import Enum
4 |
5 |
class State(Enum):
    """Traversal marker stored on each graph node as ``visit_state``."""

    unvisited = 0
    visited = 1


class Graph(object):
    """Breadth-first reachability search over nodes linked via ``adj_nodes``."""

    def bfs(self, source, dest):
        """Report whether ``dest`` is reached by a BFS starting at ``source``.

        Nodes must expose ``visit_state`` and ``adj_nodes`` (a mapping whose
        values are neighbouring nodes).  Every dequeued node is printed, and
        nodes are marked ``State.visited`` when first discovered; the marks
        are not cleared afterwards.

        Returns:
            True once ``dest`` is dequeued, otherwise False (including when
            ``source`` is None).
        """
        if source is None:
            return False
        frontier = deque([source])
        source.visit_state = State.visited
        while frontier:
            current = frontier.popleft()
            print(current)
            if current is dest:
                return True
            for neighbor in current.adj_nodes.values():
                if neighbor.visit_state == State.unvisited:
                    frontier.append(neighbor)
                    neighbor.visit_state = State.visited
        return False
29 |
30 |
class Person(object):
    """A user record; ``friend_ids`` starts out as an empty list."""

    def __init__(self, id, name):
        self.id, self.name = id, name
        self.friend_ids = list()
37 |
38 |
class LookupService(object):
    """Resolve a person id to a Person via the server that stores it."""

    def __init__(self):
        # key: person_id, value: person_server
        self.lookup = dict()

    def get_person(self, person_id):
        """Return the person stored under person_id on its person server.

        Raises KeyError if the id has no registered server, or if that
        server's ``people`` mapping has no entry for the id.
        """
        return self.lookup[person_id].people[person_id]
47 |
48 |
class PersonServer(object):
    """Hold one shard of people in the ``people`` mapping."""

    def __init__(self):
        self.people = {}  # key: person_id, value: person

    def get_people(self, ids):
        """Return the stored people for ids, in the order requested.

        Ids with no entry in ``self.people`` are skipped.
        """
        # `person_id` rather than `id`, so the builtin is not shadowed.
        return [self.people[person_id]
                for person_id in ids
                if person_id in self.people]
60 |
61 |
class UserGraphService(object):
    """Skeleton of a friend-graph search service over looked-up people."""

    def __init__(self, person_ids, lookup):
        self.person_ids = person_ids
        self.lookup = lookup
        self.visited_ids = set()

    def bfs(self, source, dest):
        """Placeholder for the breadth-first search; currently does nothing.

        Intended approach, per the original notes: track visited nodes in
        ``self.visited_ids`` and use ``self.lookup`` to translate a
        person_id into a Person.  Returns None until implemented.
        """
        return None
73 |
--------------------------------------------------------------------------------
/solutions/system_design/template/template.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/template/template.graffle
--------------------------------------------------------------------------------
/solutions/system_design/twitter/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 设计推特时间轴与搜索功能
2 |
3 | **注意:这个文档中的链接会直接指向[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)中的有关部分,以避免重复的内容。你可以参考链接的相关内容,来了解其总的要点、方案的权衡取舍以及可选的替代方案。**
4 |
5 | **设计 Facebook 的 feed** 与**设计 Facebook 搜索**与此为同一类型问题。
6 |
7 | ## 第一步:简述用例与约束条件
8 |
9 | > 搜集需求与问题的范围。
10 | > 提出问题来明确用例与约束条件。
11 | > 讨论假设。
12 |
13 | 我们将在没有面试官明确说明问题的情况下,自己定义一些用例以及限制条件。
14 |
15 | ### 用例
16 |
17 | #### 我们将把问题限定在仅处理以下用例的范围中
18 |
19 | * **用户**发布了一篇推特
20 | * **服务**将推特推送给关注者,给他们发送消息通知与邮件
21 | * **用户**浏览用户时间轴(用户最近的活动)
22 | * **用户**浏览主页时间轴(用户关注的人最近的活动)
23 | * **用户**搜索关键词
24 | * **服务**需要有高可用性
25 |
26 | #### 不在用例范围内的有
27 |
28 | * **服务**向 Firehose 与其它流数据接口推送推特
29 | * **服务**根据用户的“是否可见”选项排除推特
30 | * 隐藏未关注者的 @回复
31 | * 遵循“隐藏转发”设置
32 | * 数据分析
33 |
34 | ### 限制条件与假设
35 |
36 | #### 提出假设
37 |
38 | 普遍情况
39 |
40 | * 网络流量不是均匀分布的
41 | * 发布推特的速度需要足够快速
42 | * 除非有上百万的关注者,否则将推特推送给粉丝的速度要足够快
43 | * 1 亿个活跃用户
44 | * 每天新发布 5 亿条推特,每月新发布 150 亿条推特
45 | * 平均每条推特需要推送给 10 个人
46 | * 每天需要进行 50 亿次推送
47 | * 每月需要进行 1500 亿次推送
48 | * 每月需要处理 2500 亿次读取请求
49 | * 每月需要处理 100 亿次搜索
50 |
51 | 时间轴功能
52 |
53 | * 浏览时间轴需要足够快
54 | * 推特的读取负载要大于写入负载
55 | * 需要为推特的快速读取进行优化
56 | * 存入推特是高写入负载功能
57 |
58 | 搜索功能
59 |
60 | * 搜索速度需要足够快
61 | * 搜索是高负载读取功能
62 |
63 | #### 计算用量
64 |
65 | **如果你需要进行粗略的用量计算,请向你的面试官说明。**
66 |
67 | * 每条推特的大小:
68 | * `tweet_id` - 8 字节
69 | * `user_id` - 32 字节
70 | * `text` - 140 字节
71 | * `media` - 平均 10 KB
72 | * 总计: 大约 10 KB
73 | * 每月产生新推特的内容为 150 TB
74 | * 每条推特 10 KB * 每天 5 亿条推特 * 每月 30 天
75 | * 3 年产生新推特的内容为 5.4 PB
76 | * 每秒需要处理 10 万次读取请求
77 | * 每个月需要处理 2500 亿次请求 * (每秒 400 次请求 / 每月 10 亿次请求)
78 | * 每秒发布 6000 条推特
79 | * 每月发布 150 亿条推特 * (每秒 400 次请求 / 每月 10 亿次请求)
80 | * 每秒推送 6 万条推特
81 | * 每月推送 1500 亿条推特 * (每秒 400 次请求 / 每月 10 亿次请求)
82 | * 每秒 4000 次搜索请求
83 |
84 | 便利换算指南:
85 |
86 | * 每个月有 250 万秒
87 | * 每秒一个请求 = 每个月 250 万次请求
88 | * 每秒 40 个请求 = 每个月 1 亿次请求
89 | * 每秒 400 个请求 = 每个月 10 亿次请求
90 |
91 | ## 第二步:概要设计
92 |
93 | > 列出所有重要组件以规划概要设计。
94 |
95 | 
96 |
97 | ## 第三步:设计核心组件
98 |
99 | > 深入每个核心组件的细节。
100 |
101 | ### 用例:用户发表了一篇推特
102 |
103 | 我们可以将用户自己发表的推特存储在[关系数据库](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)中。我们也可以讨论一下[究竟是用 SQL 还是用 NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)。
104 |
105 | 构建用户主页时间轴(查看关注用户的活动)以及推送推特是件麻烦事。将推特传播给所有关注者(每秒约递送 6 万条推特)这一操作有可能会使传统的[关系数据库](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)超负载。因此,我们可以使用 **NoSQL 数据库**或**内存数据库**之类的更快的数据存储方式。从内存读取 1 MB 连续数据大约要花 250 微秒,而从 SSD 读取同样大小的数据要花费 4 倍的时间,从机械硬盘读取需要花费 80 倍以上的时间。<sup><a href="https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know">1</a></sup>
106 |
107 | 我们可以将照片、视频之类的媒体存储于**对象存储**中。
108 |
109 | * **客户端**向应用[反向代理](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)的**Web 服务器**发送一条推特
110 | * **Web 服务器**将请求转发给**写 API**服务器
111 | * **写 API**服务器将推特使用 **SQL 数据库**存储于用户时间轴中
112 | * **写 API**调用**消息输出服务**,进行以下操作:
113 | * 查询**用户 图 服务**找到存储于**内存缓存**中的此用户的粉丝
114 | * 将推特存储于**内存缓存**中的**此用户的粉丝的主页时间轴**中
115 | * O(n) 复杂度操作: 1000 名粉丝 = 1000 次查找与插入
116 | * 将推特存储在**搜索索引服务**中,以加快搜索
117 | * 将媒体存储于**对象存储**中
118 | * 使用**通知服务**向粉丝发送推送:
119 | * 使用**队列**异步推送通知
120 |
121 | **向你的面试官告知你准备写多少代码**。
122 |
123 | 如果我们用 Redis 作为**内存缓存**,那可以用 Redis 原生的 list 作为其数据结构。结构如下:
124 |
125 | ```
126 | tweet n+2 tweet n+1 tweet n
127 | | 8 bytes 8 bytes 1 byte | 8 bytes 8 bytes 1 byte | 8 bytes 8 bytes 1 byte |
128 | | tweet_id user_id meta | tweet_id user_id meta | tweet_id user_id meta |
129 | ```
130 |
131 | 新发布的推特将被存储在对应用户(关注且活跃的用户)的主页时间轴的**内存缓存**中。
132 |
133 | 我们可以调用一个公共的 [REST API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest):
134 |
135 | ```
136 | $ curl -X POST --data '{ "user_id": "123", "auth_token": "ABC123", \
137 | "status": "hello world!", "media_ids": "ABC987" }' \
138 | https://twitter.com/api/v1/tweet
139 | ```
140 |
141 | 返回:
142 |
143 | ```
144 | {
145 | "created_at": "Wed Sep 05 00:37:15 +0000 2012",
146 | "status": "hello world!",
147 | "tweet_id": "987",
148 | "user_id": "123",
149 | ...
150 | }
151 | ```
152 |
153 | 而对于服务器内部的通信,我们可以使用 [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)。
154 |
155 | ### 用例:用户浏览主页时间轴
156 |
157 | * **客户端**向 **Web 服务器**发起一次读取主页时间轴的请求
158 | * **Web 服务器**将请求转发给**读取 API**服务器
159 | * **读取 API**服务器调用**时间轴服务**进行以下操作:
160 | * 从**内存缓存**读取时间轴数据,其中包括推特 id 与用户 id - O(1)
161 | * 通过 [multiget](http://redis.io/commands/mget) 向**推特信息服务**进行查询,以获取相关 id 推特的额外信息 - O(n)
162 | * 通过 multiget 向**用户信息服务**进行查询,以获取相关 id 用户的额外信息 - O(n)
163 |
164 | REST API:
165 |
166 | ```
167 | $ curl https://twitter.com/api/v1/home_timeline?user_id=123
168 | ```
169 |
170 | 返回:
171 |
172 | ```
173 | {
174 | "user_id": "456",
175 | "tweet_id": "123",
176 | "status": "foo"
177 | },
178 | {
179 | "user_id": "789",
180 | "tweet_id": "456",
181 | "status": "bar"
182 | },
183 | {
184 | "user_id": "789",
185 | "tweet_id": "579",
186 | "status": "baz"
187 | },
188 | ```
189 |
190 | ### 用例:用户浏览用户时间轴
191 |
192 | * **客户端**向**Web 服务器**发起获得用户时间线的请求
193 | * **Web 服务器**将请求转发给**读取 API**服务器
194 | * **读取 API**从 **SQL 数据库**中取出用户的时间轴
195 |
196 | REST API 与前面的主页时间轴类似,区别只在于取出的推特是由用户自己发送而不是关注人发送。
197 |
198 | ### 用例:用户搜索关键词
199 |
200 | * **客户端**将搜索请求发给**Web 服务器**
201 | * **Web 服务器**将请求转发给**搜索 API**服务器
202 | * **搜索 API**调用**搜索服务**进行以下操作:
203 | * 对输入进行转换与分词,弄明白需要搜索什么东西
204 | * 移除标点等额外内容
205 | * 将文本打散为词组
206 | * 修正拼写错误
207 | * 规范字母大小写
208 | * 将查询转换为布尔操作
209 | * 查询**搜索集群**(例如[Lucene](https://lucene.apache.org/))检索结果:
210 | * 对集群内的所有服务器进行查询,将有结果的查询进行[发散聚合(Scatter gathers)](https://github.com/donnemartin/system-design-primer#under-development)
211 | * 合并取到的条目,进行评分与排序,最终返回结果
212 |
213 | REST API:
214 |
215 | ```
216 | $ curl https://twitter.com/api/v1/search?query=hello+world
217 | ```
218 |
219 | 返回结果与前面的主页时间轴类似,只不过返回的是符合查询条件的推特。
220 |
221 | ## 第四步:架构扩展
222 |
223 | > 根据限制条件,找到并解决瓶颈。
224 |
225 | 
226 |
227 | **重要提示:不要从最初设计直接跳到最终设计中!**
228 |
229 | 现在你要 1) **基准测试、负载测试**。2) **分析、描述**性能瓶颈。3) 在解决瓶颈问题的同时,评估替代方案、权衡利弊。4) 重复以上步骤。请阅读[「设计一个系统,并将其扩大到为数以百万计的 AWS 用户服务」](../scaling_aws/README.md) 来了解如何逐步扩大初始设计。
230 |
231 | 讨论初始设计可能遇到的瓶颈及相关解决方案是很重要的。例如加上一个配置多台 **Web 服务器**的**负载均衡器**是否能够解决问题?**CDN**呢?**主从复制**呢?它们各自的替代方案和需要**权衡**的利弊又有什么呢?
232 |
233 | 我们将会介绍一些组件来完成设计,并解决架构扩张问题。内置的负载均衡器将不做讨论以节省篇幅。
234 |
235 | **为了避免重复讨论**,请参考[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)相关部分来了解其要点、方案的权衡取舍以及可选的替代方案。
236 |
237 | * [DNS](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#域名系统)
238 | * [负载均衡器](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#负载均衡器)
239 | * [水平拓展](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#水平扩展)
240 | * [反向代理(web 服务器)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)
241 | * [API 服务(应用层)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用层)
242 | * [缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存)
243 | * [关系型数据库管理系统 (RDBMS)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#关系型数据库管理系统rdbms)
244 | * [SQL 故障主从切换](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#故障切换)
245 | * [主从复制](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#主从复制)
246 | * [一致性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#一致性模式)
247 | * [可用性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#可用性模式)
248 |
249 | **消息输出服务**有可能成为性能瓶颈。那些有着数百万关注者的用户可能发一条推特就需要好几分钟才能完成消息输出进程。这有可能使对该推特的 @回复 出现竞争条件,我们可以在提供服务时对推特进行重排序来降低影响。
250 |
251 | 我们还可以避免从高关注量的用户输出推特。相反,我们可以通过搜索来找到高关注量用户的推特,并将搜索结果与用户的主页时间轴合并,再根据时间对其进行排序。
252 |
253 | 此外,还可以通过以下内容进行优化:
254 |
255 | * 仅为每个主页时间轴在**内存缓存**中存储数百条推特
256 | * 仅在**内存缓存**中存储活动用户的主页时间轴
257 | * 如果某个用户在过去 30 天都没有产生活动,那我们可以使用 **SQL 数据库**重新构建他的时间轴
258 | * 使用**用户 图 服务**来查询并确定用户关注的人
259 | * 从 **SQL 数据库**中取出推特,并将它们存入**内存缓存**
260 | * 仅在**推特信息服务**中存储一个月的推特
261 | * 仅在**用户信息服务**中存储活动用户的信息
262 | * **搜索集群**需要将推特保留在内存中,以降低延迟
263 |
264 | 我们还可以考虑优化 **SQL 数据库** 来解决一些瓶颈问题。
265 |
266 | 虽然**内存缓存**能减小数据库的负载,但仅靠 **SQL 读副本**不太可能足以处理缓存未命中的情况。我们可能还需要使用一些额外的 SQL 性能拓展技术。
267 |
268 | 高容量的写入将淹没单个的 **SQL 写主从**模式,因此需要更多的拓展技术。
269 |
270 | * [联合](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#联合)
271 | * [分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片)
272 | * [非规范化](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#非规范化)
273 | * [SQL 调优](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-调优)
274 |
275 | 我们也可以考虑将一些数据移至 **NoSQL 数据库**。
276 |
277 | ## 其它要点
278 |
279 | > 是否深入这些额外的主题,取决于你的问题范围和剩下的时间。
280 |
281 | #### NoSQL
282 |
283 | * [键-值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)
284 | * [文档类型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)
285 | * [列型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#列型存储)
286 | * [图数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#图数据库)
287 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)
288 |
289 | ### 缓存
290 |
291 | * 在哪缓存
292 | * [客户端缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#客户端缓存)
293 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#cdn-缓存)
294 | * [Web 服务器缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#web-服务器缓存)
295 | * [数据库缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库缓存)
296 | * [应用缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用缓存)
297 | * 什么需要缓存
298 | * [数据库查询级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库查询级别的缓存)
299 | * [对象级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#对象级别的缓存)
300 | * 何时更新缓存
301 | * [缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)
302 | * [直写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#直写模式)
303 | * [回写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#回写模式)
304 | * [刷新](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#刷新)
305 |
306 | ### 异步与微服务
307 |
308 | * [消息队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#消息队列)
309 | * [任务队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#任务队列)
310 | * [背压](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#背压)
311 | * [微服务](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#微服务)
312 |
313 | ### 通信
314 |
315 | * 可权衡选择的方案:
316 | * 与客户端的外部通信 - [使用 REST 作为 HTTP API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest)
317 | * 服务器内部通信 - [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
318 | * [服务发现](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#服务发现)
319 |
320 | ### 安全性
321 |
322 | 请参阅[「安全」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#安全)一章。
323 |
324 | ### 延迟数值
325 |
326 | 请参阅[「每个程序员都应该知道的延迟数」](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)。
327 |
328 | ### 持续探讨
329 |
330 | * 持续进行基准测试并监控你的系统,以解决不断出现的瓶颈问题。
331 | * 架构拓展是一个迭代的过程。
332 |
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/twitter/twitter.graffle
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/twitter/twitter.png
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/twitter/twitter_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/twitter/twitter_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/README-zh-Hans.md:
--------------------------------------------------------------------------------
1 | # 设计一个网页爬虫
2 |
3 | **注意:这个文档中的链接会直接指向[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)中的有关部分,以避免重复的内容。你可以参考链接的相关内容,来了解其总的要点、方案的权衡取舍以及可选的替代方案。**
4 |
5 | ## 第一步:简述用例与约束条件
6 |
7 | > 把所有需要的东西聚集在一起,审视问题。不停地提问,以便我们可以明确使用场景和约束。讨论假设。
8 |
9 | 我们将在没有面试官明确说明问题的情况下,自己定义一些用例以及限制条件。
10 |
11 | ### 用例
12 |
13 | #### 我们把问题限定在仅处理以下用例的范围中
14 |
15 | * **服务** 抓取一系列链接:
16 | * 生成包含搜索词的网页倒排索引
17 | * 生成页面的标题和摘要信息
18 | * 页面标题和摘要都是静态的,它们不会根据搜索词改变
19 | * **用户** 输入搜索词后,可以看到相关的搜索结果列表,列表每一项都包含由网页爬虫生成的页面标题及摘要
20 | * 只给该用例绘制出概要组件和交互说明,无需讨论细节
21 | * **服务** 具有高可用性
22 |
23 | #### 无需考虑
24 |
25 | * 搜索分析
26 | * 个性化搜索结果
27 | * 页面排名
28 |
29 | ### 限制条件与假设
30 |
31 | #### 提出假设
32 |
33 | * 搜索流量分布不均
34 | * 有些搜索词非常热门,有些则非常冷门
35 | * 只支持匿名用户
36 | * 用户很快就能看到搜索结果
37 | * 网页爬虫不应该陷入死循环
38 | * 当爬虫路径包含环的时候,将会陷入死循环
39 | * 抓取 10 亿个链接
40 | * 要定期重新抓取页面以确保新鲜度
41 | * 平均每周重新抓取一次,网站越热门,那么重新抓取的频率越高
42 | * 每月抓取 40 亿个链接
43 | * 每个页面的平均存储大小:500 KB
44 | * 简单起见,重新抓取的页面算作新页面
45 | * 每月搜索量 1000 亿次
46 |
47 | 用更传统的系统来练习 —— 不要使用 [solr](http://lucene.apache.org/solr/) 、[nutch](http://nutch.apache.org/) 之类的现成系统。
48 |
49 | #### 计算用量
50 |
51 | **如果你需要进行粗略的用量计算,请向你的面试官说明。**
52 |
53 | * 每月存储 2 PB 页面
54 | * 每月抓取 40 亿个页面,每个页面 500 KB
55 | * 三年存储 72 PB 页面
56 | * 每秒 1600 次写请求
57 | * 每秒 40000 次搜索请求
58 |
59 | 简便换算指南:
60 |
61 | * 一个月有 250 万秒
62 | * 每秒 1 个请求,即每月 250 万个请求
63 | * 每秒 40 个请求,即每月 1 亿个请求
64 | * 每秒 400 个请求,即每月 10 亿个请求
65 |
66 | ## 第二步: 概要设计
67 |
68 | > 列出所有重要组件以规划概要设计。
69 |
70 | 
71 |
72 | ## 第三步:设计核心组件
73 |
74 | > 对每一个核心组件进行详细深入的分析。
75 |
76 | ### 用例:爬虫服务抓取一系列网页
77 |
78 | 假设我们有一个初始列表 `links_to_crawl`(待抓取链接),它最初基于网站整体的知名度来排序。当然如果这个假设不合理,我们可以使用 [Yahoo](https://www.yahoo.com/)、[DMOZ](http://www.dmoz.org/) 等知名门户网站作为种子链接来进行扩散 。
79 |
80 | 我们将用表 `crawled_links` (已抓取链接 )来记录已经处理过的链接以及相应的页面签名。
81 |
82 | 我们可以将 `links_to_crawl` 和 `crawled_links` 记录在键-值型 **NoSQL 数据库**中。对于 `links_to_crawl` 中已排序的链接,我们可以使用 [Redis](https://redis.io/) 的有序集合来维护网页链接的排名。我们应当在 [选择 SQL 还是 NoSQL 的问题上,讨论有关使用场景以及利弊](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)。
83 |
84 | * **爬虫服务**按照以下流程循环处理每一个页面链接:
85 | * 选取排名最靠前的待抓取链接
86 | * 在 **NoSQL 数据库**的 `crawled_links` 中,检查待抓取页面的签名是否与某个已抓取页面的签名相似
87 | * 若存在,则降低该页面链接的优先级
88 | * 这样做可以避免陷入死循环
89 | * 继续(进入下一次循环)
90 | * 若不存在,则抓取该链接
91 | * 在**倒排索引服务**任务队列中,新增一个生成[倒排索引](https://en.wikipedia.org/wiki/Search_engine_indexing)任务。
92 | * 在**文档服务**任务队列中,新增一个生成静态标题和摘要的任务。
93 | * 生成页面签名
94 | * 在 **NoSQL 数据库**的 `links_to_crawl` 中删除该链接
95 | * 在 **NoSQL 数据库**的 `crawled_links` 中插入该链接以及页面签名
96 |
97 | **向面试官了解你需要写多少代码**。
98 |
99 | `PagesDataStore` 是**爬虫服务**中的一个抽象类,它使用 **NoSQL 数据库**进行存储。
100 |
101 | ```python
102 | class PagesDataStore(object):
103 |
104 | def __init__(self, db):
105 | self.db = db
106 | ...
107 |
108 | def add_link_to_crawl(self, url):
109 | """将指定链接加入 `links_to_crawl`。"""
110 | ...
111 |
112 | def remove_link_to_crawl(self, url):
113 | """从 `links_to_crawl` 中删除指定链接。"""
114 | ...
115 |
116 | def reduce_priority_link_to_crawl(self, url):
117 | """在 `links_to_crawl` 中降低一个链接的优先级以避免死循环。"""
118 | ...
119 |
120 | def extract_max_priority_page(self):
121 | """返回 `links_to_crawl` 中优先级最高的链接。"""
122 | ...
123 |
124 | def insert_crawled_link(self, url, signature):
125 | """将指定链接加入 `crawled_links`。"""
126 | ...
127 |
128 | def crawled_similar(self, signature):
129 | """判断待抓取页面的签名是否与某个已抓取页面的签名相似。"""
130 | ...
131 | ```
132 |
133 | `Page` 是**爬虫服务**的一个抽象类,它封装了网页对象,由页面链接、页面内容、子链接和页面签名构成。
134 |
135 | ```python
136 | class Page(object):
137 |
138 | def __init__(self, url, contents, child_urls, signature):
139 | self.url = url
140 | self.contents = contents
141 | self.child_urls = child_urls
142 | self.signature = signature
143 | ```
144 |
145 | `Crawler` 是**爬虫服务**的主类,由`Page` 和 `PagesDataStore` 组成。
146 |
147 | ```python
148 | class Crawler(object):
149 |
150 | def __init__(self, data_store, reverse_index_queue, doc_index_queue):
151 | self.data_store = data_store
152 | self.reverse_index_queue = reverse_index_queue
153 | self.doc_index_queue = doc_index_queue
154 |
155 | def create_signature(self, page):
156 | """基于页面链接与内容生成签名。"""
157 | ...
158 |
159 | def crawl_page(self, page):
160 | for url in page.child_urls:
161 | self.data_store.add_link_to_crawl(url)
162 | page.signature = self.create_signature(page)
163 | self.data_store.remove_link_to_crawl(page.url)
164 | self.data_store.insert_crawled_link(page.url, page.signature)
165 |
166 | def crawl(self):
167 | while True:
168 | page = self.data_store.extract_max_priority_page()
169 | if page is None:
170 | break
171 | if self.data_store.crawled_similar(page.signature):
172 | self.data_store.reduce_priority_link_to_crawl(page.url)
173 | else:
174 | self.crawl_page(page)
175 | ```
176 |
177 | ### 处理重复内容
178 |
179 | 我们要谨防网页爬虫陷入死循环,这通常会发生在爬虫路径中存在环的情况。
180 |
181 | **向面试官了解你需要写多少代码**。
182 |
183 | 删除重复链接:
184 |
185 | * 假设数据量较小,我们可以用类似于 `sort | unique` 的方法。(译注: 先排序,后去重)
186 | * 假设有 10 亿条数据,我们应该使用 **MapReduce** 来输出只出现 1 次的记录。
187 |
188 | ```python
189 | class RemoveDuplicateUrls(MRJob):
190 |
191 | def mapper(self, _, line):
192 | yield line, 1
193 |
194 | def reducer(self, key, values):
195 | total = sum(values)
196 | if total == 1:
197 | yield key, total
198 | ```
199 |
200 | 比起处理重复内容,检测重复内容更为复杂。我们可以基于网页内容生成签名,然后对比两者签名的相似度。可能会用到的算法有 [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) 以及 [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)。
201 |
202 | ### 抓取结果更新策略
203 |
204 | 要定期重新抓取页面以确保新鲜度。抓取结果应该有个 `timestamp` 字段记录上一次页面抓取时间。每隔一段时间,比如说 1 周,所有页面都需要更新一次。对于热门网站或是内容频繁更新的网站,爬虫抓取间隔可以缩短。
205 |
206 | 尽管我们不会深入网页数据分析的细节,我们仍然要做一些数据挖掘工作来确定一个页面的平均更新时间,并且根据相关的统计数据来决定爬虫的重新抓取频率。
207 |
208 | 当然我们也应该根据站长提供的 `Robots.txt` 来控制爬虫的抓取频率。
209 |
210 | ### 用例:用户输入搜索词后,可以看到相关的搜索结果列表,列表每一项都包含由网页爬虫生成的页面标题及摘要
211 |
212 | * **客户端**向运行[反向代理](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)的 **Web 服务器**发送一个请求
213 | * **Web 服务器** 发送请求到 **查询 API** 服务器
214 | * **查询 API** 服务将会做这些事情:
215 | * 解析查询参数
216 | * 删除 HTML 标记
217 | * 将文本分割成词组 (译注: 分词处理)
218 | * 修正错别字
219 | * 规范化大小写
220 | * 将搜索词转换为布尔运算
221 | * 使用**倒排索引服务**来查找匹配查询的文档
222 | * **倒排索引服务**对匹配到的结果进行排名,然后返回最符合的结果
223 | * 使用**文档服务**返回文章标题与摘要
224 |
225 | 我们使用 [**REST API**](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest) 与客户端通信:
226 |
227 | ```
228 | $ curl https://search.com/api/v1/search?query=hello+world
229 | ```
230 |
231 | 响应内容:
232 |
233 | ```
234 | {
235 | "title": "foo's title",
236 | "snippet": "foo's snippet",
237 | "link": "https://foo.com",
238 | },
239 | {
240 | "title": "bar's title",
241 | "snippet": "bar's snippet",
242 | "link": "https://bar.com",
243 | },
244 | {
245 | "title": "baz's title",
246 | "snippet": "baz's snippet",
247 | "link": "https://baz.com",
248 | },
249 | ```
250 |
251 | 对于服务器内部通信,我们可以使用 [远程过程调用协议(RPC)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
252 |
253 |
254 | ## 第四步:架构扩展
255 |
256 | > 根据限制条件,找到并解决瓶颈。
257 |
258 | 
259 |
260 | **重要提示:不要直接从最初设计跳到最终设计!**
261 |
262 | 现在你要 1) **基准测试、负载测试**。2) **分析、描述**性能瓶颈。3) 在解决瓶颈问题的同时,评估替代方案、权衡利弊。4) 重复以上步骤。请阅读[设计一个系统,并将其扩大到为数以百万计的 AWS 用户服务](../scaling_aws/README.md) 来了解如何逐步扩大初始设计。
263 |
264 | 讨论初始设计可能遇到的瓶颈及相关解决方案是很重要的。例如加上一套配备多台 **Web 服务器**的**负载均衡器**是否能够解决问题?**CDN**呢?**主从复制**呢?它们各自的替代方案和需要**权衡**的利弊又有哪些呢?
265 |
266 | 我们将会介绍一些组件来完成设计,并解决架构规模扩张问题。内置的负载均衡器将不做讨论以节省篇幅。
267 |
268 | **为了避免重复讨论**,请参考[系统设计主题索引](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#系统设计主题的索引)相关部分来了解其要点、方案的权衡取舍以及替代方案。
269 |
270 | * [DNS](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#域名系统)
271 | * [负载均衡器](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#负载均衡器)
272 | * [水平扩展](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#水平扩展)
273 | * [Web 服务器(反向代理)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#反向代理web-服务器)
274 | * [API 服务器(应用层)](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用层)
275 | * [缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存)
276 | * [NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#nosql)
277 | * [一致性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#一致性模式)
278 | * [可用性模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#可用性模式)
279 |
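280 | 有些搜索词非常热门,有些则非常冷门。热门的搜索词可以通过诸如 Redis 或者 Memcached 之类的**内存缓存**来缩短响应时间,避免**倒排索引服务**以及**文档服务**过载。**内存缓存**同样适用于流量分布不均匀以及流量短时高峰问题。从内存中读取 1 MB 连续数据大约需要 250 微秒,而从 SSD 读取同样大小的数据要花费 4 倍的时间,从机械硬盘读取需要花费 80 倍以上的时间。<sup><a href=https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数>1</a></sup>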
281 |
282 |
283 | 以下是优化**爬虫服务**的其他建议:
284 |
285 | * 为了处理数据大小问题以及网络请求负载,**倒排索引服务**和**文档服务**可能需要大量应用数据分片和数据复制。
286 | * DNS 查询可能会成为瓶颈,**爬虫服务**最好专门维护一套定期更新的 DNS 查询服务。
287 | * 借助于[连接池](https://en.wikipedia.org/wiki/Connection_pool),即同时维持多个开放网络连接,可以提升**爬虫服务**的性能并减少内存使用量。
288 | * 改用 [UDP](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#用户数据报协议udp) 协议同样可以提升性能
289 | * 网络爬虫受带宽影响较大,请确保带宽足够维持高吞吐量。
290 |
291 | ## 其它要点
292 |
293 | > 是否深入这些额外的主题,取决于你的问题范围和剩下的时间。
294 |
295 | ### SQL 扩展模式
296 |
297 | * [读取复制](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#主从复制)
298 | * [联合](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#联合)
299 | * [分片](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#分片)
300 | * [非规范化](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#非规范化)
301 | * [SQL 调优](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-调优)
302 |
303 | #### NoSQL
304 |
305 | * [键-值存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#键-值存储)
306 | * [文档类型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#文档类型存储)
307 | * [列型存储](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#列型存储)
308 | * [图数据库](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#图数据库)
309 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#sql-还是-nosql)
310 |
311 |
312 | ### 缓存
313 |
314 | * 在哪缓存
315 | * [客户端缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#客户端缓存)
316 | * [CDN 缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#cdn-缓存)
317 | * [Web 服务器缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#web-服务器缓存)
318 | * [数据库缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库缓存)
319 | * [应用缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#应用缓存)
320 | * 什么需要缓存
321 | * [数据库查询级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#数据库查询级别的缓存)
322 | * [对象级别的缓存](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#对象级别的缓存)
323 | * 何时更新缓存
324 | * [缓存模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#缓存模式)
325 | * [直写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#直写模式)
326 | * [回写模式](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#回写模式)
327 | * [刷新](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#刷新)
328 |
329 | ### 异步与微服务
330 |
331 | * [消息队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#消息队列)
332 | * [任务队列](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#任务队列)
333 | * [背压](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#背压)
334 | * [微服务](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#微服务)
335 |
336 | ### 通信
337 |
338 | * 可权衡选择的方案:
339 | * 与客户端的外部通信 - [使用 REST 作为 HTTP API](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#表述性状态转移rest)
340 | * 内部通信 - [RPC](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#远程过程调用协议rpc)
341 | * [服务发现](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#服务发现)
342 |
343 |
344 | ### 安全性
345 |
346 | 请参阅[安全](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#安全)。
347 |
348 |
349 | ### 延迟数值
350 |
351 | 请参阅[每个程序员都应该知道的延迟数](https://github.com/donnemartin/system-design-primer/blob/master/README-zh-Hans.md#每个程序员都应该知道的延迟数)。
352 |
353 | ### 持续探讨
354 |
355 | * 持续进行基准测试并监控你的系统,以解决不断出现的瓶颈问题。
356 | * 架构扩展是一个迭代的过程。
357 |
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/web_crawler/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/web_crawler/web_crawler.graffle
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/web_crawler/web_crawler.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_basic.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/web_crawler/web_crawler_basic.graffle
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/donnemartin/system-design-primer/40d5d2edccd00b4a66fb0e24d887d8b1a0d7ea0e/solutions/system_design/web_crawler/web_crawler_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from mrjob.job import MRJob
4 |
5 |
class RemoveDuplicateUrls(MRJob):
    """MapReduce job that outputs only the input lines seen exactly once.

    Each input line is used verbatim as the grouping key (presumably one
    URL per line; confirm against the job's input).
    """

    def mapper(self, _, line):
        """Emit ``(line, 1)`` so the reducer can count occurrences.

        The incoming key is ignored.
        """
        yield line, 1

    def reducer(self, key, values):
        """Emit ``(key, 1)`` only when `key` occurred exactly once.

        Keys that occurred more than once produce no output at all.
        """
        total = sum(values)
        if total == 1:
            yield key, total

    def steps(self):
        """Run the map and reduce steps."""
        # `MRJob.mr()` was deprecated in mrjob 0.4.2 and removed in 0.6.0;
        # `MRStep` is its replacement. Imported here to keep the change
        # local to this class.
        from mrjob.step import MRStep

        return [
            MRStep(mapper=self.mapper,
                   reducer=self.reducer)
        ]
22 |
23 |
# Script entry point: run the job only when executed directly, not on import.
if __name__ == '__main__':
    RemoveDuplicateUrls.run()
26 |
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
class PagesDataStore(object):
    """Interface stub for the crawler's link storage, backed by `db`.

    Every method below is a placeholder: it performs no work and
    returns ``None``.
    """

    def __init__(self, db):
        """Keep a reference to the backing database handle."""
        self.db = db

    def add_link_to_crawl(self, url):
        """Add the given link to `links_to_crawl`."""

    def remove_link_to_crawl(self, url):
        """Remove the given link from `links_to_crawl`."""

    def reduce_priority_link_to_crawl(self, url):
        """Lower the priority of a link in `links_to_crawl` to avoid cycles."""

    def extract_max_priority_page(self):
        """Return the highest priority link in `links_to_crawl`."""

    def insert_crawled_link(self, url, signature):
        """Add the given link, with its signature, to `crawled_links`."""

    def crawled_similar(self, signature):
        """Tell whether a page matching `signature` was already crawled."""
33 |
34 |
class Page(object):
    """A web page: its url, contents, outgoing links and a signature."""

    def __init__(self, url, contents, child_urls):
        """Store the page fields, then compute the signature."""
        self.url, self.contents, self.child_urls = url, contents, child_urls
        # Computed last, after the fields above have been assigned.
        self.signature = self.create_signature()

    def create_signature(self):
        """Create signature based on url and contents.

        Placeholder: no signature is computed yet, so this returns ``None``.
        """
46 |
47 |
class Crawler(object):
    """Drive the crawl loop over the pages handed out by a data store.

    `data_store` must provide the `PagesDataStore` methods used below;
    both queues must provide a `generate(page)` method.
    """

    def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
        """Store the collaborators; `pages` is kept but not read by this class."""
        self.pages = pages
        self.data_store = data_store
        self.reverse_index_queue = reverse_index_queue
        self.doc_index_queue = doc_index_queue

    def crawl_page(self, page):
        """Process one page that has not been crawled before.

        Adds every child url to the links to crawl, calls `generate(page)`
        on both index queues, then moves the page's url from the links to
        crawl into the crawled links together with its signature.
        """
        for url in page.child_urls:
            self.data_store.add_link_to_crawl(url)
        self.reverse_index_queue.generate(page)
        self.doc_index_queue.generate(page)
        self.data_store.remove_link_to_crawl(page.url)
        self.data_store.insert_crawled_link(page.url, page.signature)

    def crawl(self):
        """Process pages in priority order until the data store returns None.

        A page whose signature matches an already crawled one only has its
        priority reduced; any other page goes through `crawl_page`.
        """
        while True:
            # Exactly one extraction per iteration. A second extraction at
            # the end of the loop body would be overwritten here, and that
            # page would be dropped without ever being processed.
            page = self.data_store.extract_max_priority_page()
            if page is None:
                break
            if self.data_store.crawled_similar(page.signature):
                self.data_store.reduce_priority_link_to_crawl(page.url)
            else:
                self.crawl_page(page)
74 |
--------------------------------------------------------------------------------