├── .gitattributes ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── resources └── flash_cards │ ├── OO Design.apkg │ ├── System Design Exercises.apkg │ └── System Design.apkg └── solutions ├── object_oriented_design ├── call_center │ ├── __init__.py │ ├── call_center.ipynb │ └── call_center.py ├── deck_of_cards │ ├── __init__.py │ ├── deck_of_cards.ipynb │ └── deck_of_cards.py ├── hash_table │ ├── __init__.py │ ├── hash_map.ipynb │ └── hash_map.py ├── lru_cache │ ├── __init__.py │ ├── lru_cache.ipynb │ └── lru_cache.py ├── online_chat │ ├── __init__.py │ ├── online_chat.ipynb │ └── online_chat.py └── parking_lot │ ├── __init__.py │ ├── parking_lot.ipynb │ └── parking_lot.py └── system_design ├── mint ├── README.md ├── __init__.py ├── mint.png ├── mint_basic.png ├── mint_mapreduce.py └── mint_snippets.py ├── pastebin ├── README.md ├── __init__.py ├── pastebin.png ├── pastebin.py └── pastebin_basic.png ├── query_cache ├── README.md ├── __init__.py ├── query_cache.png ├── query_cache_basic.png └── query_cache_snippets.py ├── sales_rank ├── README.md ├── __init__.py ├── sales_rank.png ├── sales_rank_basic.png └── sales_rank_mapreduce.py ├── scaling_aws ├── README.md ├── scaling_aws.png ├── scaling_aws_1.png ├── scaling_aws_2.png ├── scaling_aws_3.png ├── scaling_aws_4.png ├── scaling_aws_5.png ├── scaling_aws_6.png └── scaling_aws_7.png ├── social_graph ├── README.md ├── __init__.py ├── social_graph.png ├── social_graph_basic.png └── social_graph_snippets.py ├── twitter ├── README.md ├── twitter.png └── twitter_basic.png └── web_crawler ├── README.md ├── __init__.py ├── web_crawler.png ├── web_crawler_basic.png ├── web_crawler_mapreduce.py └── web_crawler_snippets.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # IPython notebook 57 | .ipynb_checkpoints 58 | 59 | # Repo scratch directory 60 | scratch/ 61 | 62 | # IPython Notebook templates 63 | template.ipynb -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | Contributions are welcome! 
5 | 6 | **Please carefully read this page to make the code review process go as smoothly as possible and to maximize the likelihood of your contribution being merged.** 7 | 8 | ## Bug Reports 9 | 10 | For bug reports or requests [submit an issue](https://github.com/donnemartin/system-design-primer/issues). 11 | 12 | ## Pull Requests 13 | 14 | The preferred way to contribute is to fork the 15 | [main repository](https://github.com/donnemartin/system-design-primer) on GitHub. 16 | 17 | 1. Fork the [main repository](https://github.com/donnemartin/system-design-primer). Click on the 'Fork' button near the top of the page. This creates a copy of the code under your account on the GitHub server. 18 | 19 | 2. Clone this copy to your local disk: 20 | 21 | $ git clone git@github.com:YourLogin/system-design-primer.git 22 | $ cd system-design-primer 23 | 24 | 3. Create a branch to hold your changes and start making changes. Don't work in the `master` branch! 25 | 26 | $ git checkout -b my-feature 27 | 28 | 4. Work on this copy on your computer using Git to do the version control. When you're done editing, run the following to record your changes in Git: 29 | 30 | $ git add modified_files 31 | $ git commit 32 | 33 | 5. Push your changes to GitHub with: 34 | 35 | $ git push -u origin my-feature 36 | 37 | 6. Finally, go to the web page of your fork of the `system-design-primer` repo and click 'Pull Request' to send your changes for review. 38 | 39 | ### GitHub Pull Requests Docs 40 | 41 | If you are not familiar with pull requests, review the [pull request docs](https://help.github.com/articles/using-pull-requests/). 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution 4.0 International License (CC BY 4.0) 2 | 3 | http://creativecommons.org/licenses/by/4.0/ 4 | -------------------------------------------------------------------------------- /resources/flash_cards/OO Design.apkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/resources/flash_cards/OO Design.apkg -------------------------------------------------------------------------------- /resources/flash_cards/System Design Exercises.apkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/resources/flash_cards/System Design Exercises.apkg -------------------------------------------------------------------------------- /resources/flash_cards/System Design.apkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/resources/flash_cards/System Design.apkg -------------------------------------------------------------------------------- /solutions/object_oriented_design/call_center/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/call_center/__init__.py -------------------------------------------------------------------------------- 
/solutions/object_oriented_design/call_center/call_center.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design a call center" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* What levels of employees are in the call center?\n", 24 | " * Operator, supervisor, director\n", 25 | "* Can we assume operators always get the initial calls?\n", 26 | " * Yes\n", 27 | "* If there are no free operators or the operator can't handle the call, does the call go to the supervisors?\n", 28 | " * Yes\n", 29 | "* If there are no free supervisors or the supervisor can't handle the call, does the call go to the directors?\n", 30 | " * Yes\n", 31 | "* Can we assume the directors can handle all calls?\n", 32 | " * Yes\n", 33 | "* What happens if nobody can answer the call?\n", 34 | " * It gets queued\n", 35 | "* Do we need to handle 'VIP' calls where we put someone to the front of the line?\n", 36 | " * No\n", 37 | "* Can we assume inputs are valid or do we have to validate them?\n", 38 | " * Assume they're valid" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Solution" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 1, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Overwriting call_center.py\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "%%writefile call_center.py\n", 65 | "from abc import ABCMeta, abstractmethod\n", 66 | "from collections import deque\n", 67 | "from enum import Enum\n", 68 | "\n", 69 | "\n", 70 | "class Rank(Enum):\n", 71 | "\n", 72 | " OPERATOR = 0\n", 73 | " SUPERVISOR = 1\n", 74 | " DIRECTOR = 2\n", 75 | "\n", 76 | "\n", 77 | "class Employee(metaclass=ABCMeta):\n", 78 | "\n", 79 | " def __init__(self, employee_id, name, rank, call_center):\n", 80 | " self.employee_id = employee_id\n", 81 | " self.name = name\n", 82 | " self.rank = rank\n", 83 | " self.call = None\n", 84 | " self.call_center = call_center\n", 85 | "\n", 86 | " def take_call(self, call):\n", 87 | " \"\"\"Assume the employee will always successfully take the call.\"\"\"\n", 88 | " self.call = call\n", 89 | " self.call.employee = self\n", 90 | " self.call.state = CallState.IN_PROGRESS\n", 91 | "\n", 92 | " def complete_call(self):\n", 93 | " self.call.state = CallState.COMPLETE\n", 94 | " self.call_center.notify_call_completed(self.call)\n", 95 | "\n", 96 | " @abstractmethod\n", 97 | " def escalate_call(self):\n", 98 | " pass\n", 99 | "\n", 100 | " def _escalate_call(self):\n", 101 | " self.call.state = CallState.READY\n", 102 | " call = self.call\n", 103 | " self.call = None\n", 104 | " self.call_center.notify_call_escalated(call)\n", 105 | "\n", 106 | "\n", 107 | "class Operator(Employee):\n", 108 | "\n", 109 | " def __init__(self, employee_id, name, call_center):\n", 110 | " super(Operator, self).__init__(employee_id, name, Rank.OPERATOR, call_center)\n", 111 | "\n", 112 | " def escalate_call(self):\n", 113 | " self.call.rank = Rank.SUPERVISOR\n", 114 | " 
self._escalate_call()\n", 115 | "\n", 116 | "\n", 117 | "class Supervisor(Employee):\n", 118 | "\n", 119 | " def __init__(self, employee_id, name, call_center):\n", 120 | " super(Supervisor, self).__init__(employee_id, name, Rank.SUPERVISOR, call_center)\n", 121 | "\n", 122 | " def escalate_call(self):\n", 123 | " self.call.rank = Rank.DIRECTOR\n", 124 | " self._escalate_call()\n", 125 | "\n", 126 | "\n", 127 | "class Director(Employee):\n", 128 | "\n", 129 | " def __init__(self, employee_id, name, call_center):\n", 130 | " super(Director, self).__init__(employee_id, name, Rank.DIRECTOR, call_center)\n", 131 | "\n", 132 | " def escalate_call(self):\n", 133 | " raise NotImplementedError('Directors must be able to handle any call')\n", 134 | "\n", 135 | "\n", 136 | "class CallState(Enum):\n", 137 | "\n", 138 | " READY = 0\n", 139 | " IN_PROGRESS = 1\n", 140 | " COMPLETE = 2\n", 141 | "\n", 142 | "\n", 143 | "class Call(object):\n", 144 | "\n", 145 | " def __init__(self, rank):\n", 146 | " self.state = CallState.READY\n", 147 | " self.rank = rank\n", 148 | " self.employee = None\n", 149 | "\n", 150 | "\n", 151 | "class CallCenter(object):\n", 152 | "\n", 153 | " def __init__(self, operators, supervisors, directors):\n", 154 | " self.operators = operators\n", 155 | " self.supervisors = supervisors\n", 156 | " self.directors = directors\n", 157 | " self.queued_calls = deque()\n", 158 | "\n", 159 | " def dispatch_call(self, call):\n", 160 | " if call.rank not in (Rank.OPERATOR, Rank.SUPERVISOR, Rank.DIRECTOR):\n", 161 | " raise ValueError('Invalid call rank: {}'.format(call.rank))\n", 162 | " employee = None\n", 163 | " if call.rank == Rank.OPERATOR:\n", 164 | " employee = self._dispatch_call(call, self.operators)\n", 165 | " if call.rank == Rank.SUPERVISOR or employee is None:\n", 166 | " employee = self._dispatch_call(call, self.supervisors)\n", 167 | " if call.rank == Rank.DIRECTOR or employee is None:\n", 168 | " employee = self._dispatch_call(call, self.directors)\n", 169 | " if employee is None:\n", 170 | " self.queued_calls.append(call)\n", 171 | "\n", 172 | " def _dispatch_call(self, call, employees):\n", 173 | " for employee in employees:\n", 174 | " if employee.call is None:\n", 175 | " employee.take_call(call)\n", 176 | " return employee\n", 177 | " return None\n", 178 | "\n", 179 | " def notify_call_escalated(self, call): # ...\n", 180 | " def notify_call_completed(self, call): # ...\n", 181 | " def dispatch_queued_call_to_newly_freed_employee(self, call, employee): # ..."
182 | ] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.4.3" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 0 206 | } 207 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/call_center/call_center.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from collections import deque 3 | from enum import Enum 4 | 5 | 6 | class Rank(Enum): 7 | 8 | OPERATOR = 0 9 | SUPERVISOR = 1 10 | DIRECTOR = 2 11 | 12 | 13 | class Employee(metaclass=ABCMeta): 14 | 15 | def __init__(self, employee_id, name, rank, call_center): 16 | self.employee_id = employee_id 17 | self.name = name 18 | self.rank = rank 19 | self.call = None 20 | self.call_center = call_center 21 | 22 | def take_call(self, call): 23 | """Assume the employee will always successfully take the call.""" 24 | self.call = call 25 | self.call.employee = self 26 | self.call.state = CallState.IN_PROGRESS 27 | 28 | def complete_call(self): 29 | self.call.state = CallState.COMPLETE 30 | self.call_center.notify_call_completed(self.call) 31 | 32 | @abstractmethod 33 | def escalate_call(self): 34 | pass 35 | 36 | def _escalate_call(self): 37 | self.call.state = CallState.READY 38 | call = self.call 39 | self.call = None 40 | self.call_center.notify_call_escalated(call) 41 | 42 | 43 | class Operator(Employee): 44 | 45 | def __init__(self, employee_id, name, call_center): 46 | super(Operator, self).__init__(employee_id, name, Rank.OPERATOR, call_center) 47 | 48 | def escalate_call(self): 49 | self.call.rank = Rank.SUPERVISOR 50 | self._escalate_call() 51 | 52 | 53 | class Supervisor(Employee): 54 | 55 | def __init__(self, employee_id, name, call_center): 56 | super(Supervisor, self).__init__(employee_id, name, Rank.SUPERVISOR, call_center) 57 | 58 | def escalate_call(self): 59 | self.call.rank = Rank.DIRECTOR 60 | self._escalate_call() 61 | 62 | 63 | class Director(Employee): 64 | 65 | def __init__(self, employee_id, name, call_center): 66 | super(Director, self).__init__(employee_id, name, Rank.DIRECTOR, call_center) 67 | 68 | def escalate_call(self): 69 | raise NotImplementedError('Directors must be able to handle any call') 70 | 71 | 72 | class CallState(Enum): 73 | 74 | READY = 0 75 | IN_PROGRESS = 1 76 | COMPLETE = 2 77 | 78 | 79 | class Call(object): 80 | 81 | def __init__(self, rank): 82 | self.state = CallState.READY 83 | self.rank = rank 84 | self.employee = None 85 | 86 | 87 | class CallCenter(object): 88 | 89 | def __init__(self, operators, supervisors, directors): 90 | self.operators = operators 91 | self.supervisors = supervisors 92 | self.directors = directors 93 | self.queued_calls = deque() 94 | 95 | def dispatch_call(self, call): 96 | if call.rank not in (Rank.OPERATOR, Rank.SUPERVISOR, Rank.DIRECTOR): 97 | raise ValueError('Invalid call rank: {}'.format(call.rank)) 98 | employee = None 99 | if call.rank == Rank.OPERATOR: 100 | employee = self._dispatch_call(call, self.operators) 101 | if call.rank == Rank.SUPERVISOR or employee is None: 102 | employee = self._dispatch_call(call, self.supervisors) 103 | if call.rank == Rank.DIRECTOR or employee is None: 104 | employee = 
self._dispatch_call(call, self.directors) 105 | if employee is None: 106 | self.queued_calls.append(call) 107 | 108 | def _dispatch_call(self, call, employees): 109 | for employee in employees: 110 | if employee.call is None: 111 | employee.take_call(call) 112 | return employee 113 | return None 114 | 115 | def notify_call_escalated(self, call): # ... 116 | def notify_call_completed(self, call): # ... 117 | def dispatch_queued_call_to_newly_freed_employee(self, call, employee): # ... -------------------------------------------------------------------------------- /solutions/object_oriented_design/deck_of_cards/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/deck_of_cards/__init__.py -------------------------------------------------------------------------------- /solutions/object_oriented_design/deck_of_cards/deck_of_cards.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design a deck of cards" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* Is this a generic deck of cards for games like poker and black jack?\n", 24 | " * Yes, design a generic deck then extend it to black jack\n", 25 | "* Can we assume the deck has 52 cards (2-10, Jack, Queen, King, Ace) and 4 suits?\n", 26 | " * Yes\n", 27 | "* Can we assume inputs are valid or do we have to validate them?\n", 28 | " * Assume they're valid" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Solution" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Overwriting deck_of_cards.py\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "%%writefile deck_of_cards.py\n", 55 | "from abc import ABCMeta, abstractmethod\n", 56 | "from enum import Enum\n", 57 | "import sys\n", 58 | "\n", 59 | "\n", 60 | "class Suit(Enum):\n", 61 | "\n", 62 | " HEART = 0\n", 63 | " DIAMOND = 1\n", 64 | " CLUBS = 2\n", 65 | " SPADE = 3\n", 66 | "\n", 67 | "\n", 68 | "class Card(metaclass=ABCMeta):\n", 69 | "\n", 70 | " def __init__(self, value, suit):\n", 71 | " self.value = value\n", 72 | " self.suit = suit\n", 73 | " self.is_available = True\n", 74 | "\n", 75 | " @property\n", 76 | " @abstractmethod\n", 77 | " def value(self):\n", 78 | " pass\n", 79 | "\n", 80 | " @value.setter\n", 81 | " @abstractmethod\n", 82 | " def value(self, other):\n", 83 | " pass\n", 84 | "\n", 85 | "\n", 86 | "class BlackJackCard(Card):\n", 87 | "\n", 88 | " def __init__(self, value, suit):\n", 89 | " super(BlackJackCard, self).__init__(value, suit)\n", 90 | "\n", 91 | " def is_ace(self):\n", 92 | " return True if self._value == 1 else False\n", 93 | "\n", 94 | " def is_face_card(self):\n", 95 | " \"\"\"Jack = 11, Queen = 12, King = 13\"\"\"\n", 96 | " return True if 10 < self._value <= 
13 else False\n", 97 | "\n", 98 | " @property\n", 99 | " def value(self):\n", 100 | " if self.is_ace():\n", 101 | " return 1\n", 102 | " elif self.is_face_card():\n", 103 | " return 10\n", 104 | " else:\n", 105 | " return self._value\n", 106 | "\n", 107 | " @value.setter\n", 108 | " def value(self, new_value):\n", 109 | " if 1 <= new_value <= 13:\n", 110 | " self._value = new_value\n", 111 | " else:\n", 112 | " raise ValueError('Invalid card value: {}'.format(new_value))\n", 113 | "\n", 114 | "\n", 115 | "class Hand(object):\n", 116 | "\n", 117 | " def __init__(self, cards):\n", 118 | " self.cards = cards\n", 119 | "\n", 120 | " def add_card(self, card):\n", 121 | " self.cards.append(card)\n", 122 | "\n", 123 | " def score(self):\n", 124 | " total_value = 0\n", 125 | " for card in self.cards:\n", 126 | " total_value += card.value\n", 127 | " return total_value\n", 128 | "\n", 129 | "\n", 130 | "class BlackJackHand(Hand):\n", 131 | "\n", 132 | " BLACKJACK = 21\n", 133 | "\n", 134 | " def __init__(self, cards):\n", 135 | " super(BlackJackHand, self).__init__(cards)\n", 136 | "\n", 137 | " def score(self):\n", 138 | " min_over = sys.maxsize\n", 139 | " max_under = -sys.maxsize\n", 140 | " for score in self.possible_scores():\n", 141 | " if self.BLACKJACK < score < min_over:\n", 142 | " min_over = score\n", 143 | " elif max_under < score <= self.BLACKJACK:\n", 144 | " max_under = score\n", 145 | " return max_under if max_under != -sys.maxsize else min_over\n", 146 | "\n", 147 | " def possible_scores(self):\n", 148 | " \"\"\"Return a list of possible scores, taking Aces into account.\"\"\"\n", 149 | " # ...\n", 150 | "\n", 151 | "\n", 152 | "class Deck(object):\n", 153 | "\n", 154 | " def __init__(self, cards):\n", 155 | " self.cards = cards\n", 156 | " self.deal_index = 0\n", 157 | "\n", 158 | " def remaining_cards(self):\n", 159 | " return len(self.cards) - self.deal_index\n", 160 | "\n", 161 | " def deal_card(self):\n", 162 | " try:\n", 163 | " card = self.cards[self.deal_index]\n", 164 | " card.is_available = False\n", 165 | " self.deal_index += 1\n", 166 | " except IndexError:\n", 167 | " return None\n", 168 | " return card\n", 169 | "\n", 170 | " def shuffle(self): # ..."
171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 3", 177 | "language": "python", 178 | "name": "python3" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 3 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython3", 190 | "version": "3.4.3" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/deck_of_cards/deck_of_cards.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from enum import Enum 3 | import sys 4 | 5 | 6 | class Suit(Enum): 7 | 8 | HEART = 0 9 | DIAMOND = 1 10 | CLUBS = 2 11 | SPADE = 3 12 | 13 | 14 | class Card(metaclass=ABCMeta): 15 | 16 | def __init__(self, value, suit): 17 | self.value = value 18 | self.suit = suit 19 | self.is_available = True 20 | 21 | @property 22 | @abstractmethod 23 | def value(self): 24 | pass 25 | 26 | @value.setter 27 | @abstractmethod 28 | def value(self, other): 29 | pass 30 | 31 | 32 | class BlackJackCard(Card): 33 | 34 | def __init__(self, value, suit): 35 | super(BlackJackCard, self).__init__(value, suit) 36 | 37 | def is_ace(self): 38 | return True if self._value == 1 else False 39 | 40 | def is_face_card(self): 41 | """Jack = 11, Queen = 12, King = 13""" 42 | return True if 10 < self._value <= 13 else False 43 | 44 | @property 45 | def value(self): 46 | if self.is_ace(): 47 | return 1 48 | elif self.is_face_card(): 49 | return 10 50 | else: 51 | return self._value 52 | 53 | @value.setter 54 | def value(self, new_value): 55 | if 1 <= new_value <= 13: 56 | self._value = new_value 57 | else: 58 | raise ValueError('Invalid card value: {}'.format(new_value)) 59 | 60 | 61 | class Hand(object): 62 | 63 | def __init__(self, cards): 64 | self.cards = cards 65 | 66 | def add_card(self, card): 67 | self.cards.append(card) 68 | 69 | def score(self): 70 | total_value = 0 71 | for card in self.cards: 72 | total_value += card.value 73 | return total_value 74 | 75 | 76 | class BlackJackHand(Hand): 77 | 78 | BLACKJACK = 21 79 | 80 | def __init__(self, cards): 81 | super(BlackJackHand, self).__init__(cards) 82 | 83 | def score(self): 84 | min_over = sys.maxsize 85 | max_under = -sys.maxsize 86 | for score in self.possible_scores(): 87 | if self.BLACKJACK < score < min_over: 88 | min_over = score 89 | elif max_under < score <= self.BLACKJACK: 90 | max_under = score 91 | return max_under if max_under != -sys.maxsize else min_over 92 | 93 | def possible_scores(self): 94 | """Return a list of possible scores, taking Aces into account.""" 95 | # ... 96 | 97 | 98 | class Deck(object): 99 | 100 | def __init__(self, cards): 101 | self.cards = cards 102 | self.deal_index = 0 103 | 104 | def remaining_cards(self): 105 | return len(self.cards) - self.deal_index 106 | 107 | def deal_card(self): 108 | try: 109 | card = self.cards[self.deal_index] 110 | card.is_available = False 111 | self.deal_index += 1 112 | except IndexError: 113 | return None 114 | return card 115 | 116 | def shuffle(self): # ...
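# A minimal usage sketch of the classes above (an illustration under assumed
# conventions, not part of the original solution; it builds the standard
# 52-card deck inline, with values 1-13 mapping to Ace through King):
if __name__ == '__main__':
    deck = Deck([BlackJackCard(value, suit)
                 for suit in Suit for value in range(1, 14)])
    card = deck.deal_card()  # deals the next card and advances deal_index
    print(card.value, card.suit, deck.remaining_cards())  # e.g. 1 Suit.HEART 51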
-------------------------------------------------------------------------------- /solutions/object_oriented_design/hash_table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/hash_table/__init__.py -------------------------------------------------------------------------------- /solutions/object_oriented_design/hash_table/hash_map.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design a hash map" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* For simplicity, are the keys integers only?\n", 24 | " * Yes\n", 25 | "* For collision resolution, can we use chaining?\n", 26 | " * Yes\n", 27 | "* Do we have to worry about load factors?\n", 28 | " * No\n", 29 | "* Can we assume inputs are valid or do we have to validate them?\n", 30 | " * Assume they're valid\n", 31 | "* Can we assume this fits memory?\n", 32 | " * Yes" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Solution" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Overwriting hash_map.py\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%%writefile hash_map.py\n", 59 | "class Item(object):\n", 60 | "\n", 61 | " def __init__(self, key, value):\n", 62 | " self.key = key\n", 63 | " self.value = value\n", 64 | "\n", 65 | "\n", 66 | "class HashTable(object):\n", 67 | "\n", 68 | " def __init__(self, size):\n", 69 | " self.size = size\n", 70 | " self.table = [[] for _ in range(self.size)]\n", 71 | "\n", 72 | " def _hash_function(self, key):\n", 73 | " return key % self.size\n", 74 | "\n", 75 | " def set(self, key, value):\n", 76 | " hash_index = self._hash_function(key)\n", 77 | " for item in self.table[hash_index]:\n", 78 | " if item.key == key:\n", 79 | " item.value = value\n", 80 | " return\n", 81 | " self.table[hash_index].append(Item(key, value))\n", 82 | "\n", 83 | " def get(self, key):\n", 84 | " hash_index = self._hash_function(key)\n", 85 | " for item in self.table[hash_index]:\n", 86 | " if item.key == key:\n", 87 | " return item.value\n", 88 | " raise KeyError('Key not found')\n", 89 | "\n", 90 | " def remove(self, key):\n", 91 | " hash_index = self._hash_function(key)\n", 92 | " for index, item in enumerate(self.table[hash_index]):\n", 93 | " if item.key == key:\n", 94 | " del self.table[hash_index][index]\n", 95 | " return\n", 96 | " raise KeyError('Key not found')" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | 
"nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.4.3" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 0 121 | } 122 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/hash_table/hash_map.py: -------------------------------------------------------------------------------- 1 | class Item(object): 2 | 3 | def __init__(self, key, value): 4 | self.key = key 5 | self.value = value 6 | 7 | 8 | class HashTable(object): 9 | 10 | def __init__(self, size): 11 | self.size = size 12 | self.table = [[] for _ in range(self.size)] 13 | 14 | def _hash_function(self, key): 15 | return key % self.size 16 | 17 | def set(self, key, value): 18 | hash_index = self._hash_function(key) 19 | for item in self.table[hash_index]: 20 | if item.key == key: 21 | item.value = value 22 | return 23 | self.table[hash_index].append(Item(key, value)) 24 | 25 | def get(self, key): 26 | hash_index = self._hash_function(key) 27 | for item in self.table[hash_index]: 28 | if item.key == key: 29 | return item.value 30 | raise KeyError('Key not found') 31 | 32 | def remove(self, key): 33 | hash_index = self._hash_function(key) 34 | for index, item in enumerate(self.table[hash_index]): 35 | if item.key == key: 36 | del self.table[hash_index][index] 37 | return 38 | raise KeyError('Key not found') -------------------------------------------------------------------------------- /solutions/object_oriented_design/lru_cache/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/lru_cache/__init__.py -------------------------------------------------------------------------------- /solutions/object_oriented_design/lru_cache/lru_cache.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer-primer)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design an LRU cache" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* What are we caching?\n", 24 | " * We are caching the results of web queries\n", 25 | "* Can we assume inputs are valid or do we have to validate them?\n", 26 | " * Assume they're valid\n", 27 | "* Can we assume this fits memory?\n", 28 | " * Yes" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Solution" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Overwriting lru_cache.py\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "%%writefile lru_cache.py\n", 55 | "class Node(object):\n", 56 | "\n", 57 | " def __init__(self, query, results):\n", 58 | " self.query = query\n", 59 | " self.results = results\n", 60 | " self.next = None\n", 61 | "\n", 62 | "class LinkedList(object):\n", 63 | "\n", 64 | " def __init__(self):\n", 65 | " self.head = None\n", 66 | " self.tail = None\n", 67 | "\n", 68 | " def move_to_front(self, node): # ...\n", 69 | " def append_to_front(self, node): # ...\n", 70 | " def remove_from_tail(self): # ...\n", 71 | "\n", 72 | "\n", 73 | "class Cache(object):\n", 74 | "\n", 75 | " def __init__(self, MAX_SIZE):\n", 76 | " self.MAX_SIZE = MAX_SIZE\n", 77 | " self.size = 0\n", 78 | " self.lookup = {} # key: query, value: node\n", 79 | " self.linked_list = LinkedList()\n", 80 | "\n", 81 | " def get(self, query):\n", 82 | " \"\"\"Get the stored query result from the cache.\n", 83 | " \n", 84 | " Accessing a node updates its position to the front of the LRU list.\n", 85 | " \"\"\"\n", 86 | " node = self.lookup.get(query)\n", 87 | " if node is None:\n", 88 | " return None\n", 89 | " self.linked_list.move_to_front(node)\n", 90 | " return node.results\n", 91 | "\n", 92 | " def set(self, results, query):\n", 93 | " \"\"\"Set the result for the given query key in the cache.\n", 94 | " \n", 95 | " When updating an entry, updates its position to the front of the LRU list.\n", 96 | " If the entry is new and the cache is at capacity, removes the oldest entry\n", 97 | " before the new entry is added.\n", 98 | " \"\"\"\n", 99 | " node = self.lookup.get(query)\n", 100 | " if node is not None:\n", 101 | " # Key exists in cache, update the value\n", 102 | " node.results = results\n", 103 | " self.linked_list.move_to_front(node)\n", 104 | " else:\n", 105 | " # Key does not exist in cache\n", 106 | " if self.size == self.MAX_SIZE:\n", 107 | " # Remove the oldest entry from the linked list and lookup\n", 108 | " self.lookup.pop(self.linked_list.tail.query, None)\n", 109 | " self.linked_list.remove_from_tail()\n", 110 | " else:\n", 111 | " self.size += 1\n", 112 | " # Add the new key and value\n", 113 | " new_node = Node(query, results)\n", 114 | " self.linked_list.append_to_front(new_node)\n", 115 | " self.lookup[query] = new_node" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | 
"version": "3.4.3" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 0 140 | } 141 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/lru_cache/lru_cache.py: -------------------------------------------------------------------------------- 1 | class Node(object): 2 | 3 | def __init__(self, results): 4 | self.results = results 5 | self.next = next 6 | 7 | 8 | class LinkedList(object): 9 | 10 | def __init__(self): 11 | self.head = None 12 | self.tail = None 13 | 14 | def move_to_front(self, node): # ... 15 | def append_to_front(self, node): # ... 16 | def remove_from_tail(self): # ... 17 | 18 | 19 | class Cache(object): 20 | 21 | def __init__(self, MAX_SIZE): 22 | self.MAX_SIZE = MAX_SIZE 23 | self.size = 0 24 | self.lookup = {} # key: query, value: node 25 | self.linked_list = LinkedList() 26 | 27 | def get(self, query) 28 | """Get the stored query result from the cache. 29 | 30 | Accessing a node updates its position to the front of the LRU list. 31 | """ 32 | node = self.lookup[query] 33 | if node is None: 34 | return None 35 | self.linked_list.move_to_front(node) 36 | return node.results 37 | 38 | def set(self, results, query): 39 | """Set the result for the given query key in the cache. 40 | 41 | When updating an entry, updates its position to the front of the LRU list. 42 | If the entry is new and the cache is at capacity, removes the oldest entry 43 | before the new entry is added. 44 | """ 45 | node = self.lookup[query] 46 | if node is not None: 47 | # Key exists in cache, update the value 48 | node.results = results 49 | self.linked_list.move_to_front(node) 50 | else: 51 | # Key does not exist in cache 52 | if self.size == self.MAX_SIZE: 53 | # Remove the oldest entry from the linked list and lookup 54 | self.lookup.pop(self.linked_list.tail.query, None) 55 | self.linked_list.remove_from_tail() 56 | else: 57 | self.size += 1 58 | # Add the new key and value 59 | new_node = Node(results) 60 | self.linked_list.append_to_front(new_node) 61 | self.lookup[query] = new_node -------------------------------------------------------------------------------- /solutions/object_oriented_design/online_chat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/online_chat/__init__.py -------------------------------------------------------------------------------- /solutions/object_oriented_design/online_chat/online_chat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer-primer)." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design an online chat" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* Assume we'll focus on the following workflows:\n", 24 | " * Text conversations only\n", 25 | " * Users\n", 26 | " * Add a user\n", 27 | " * Remove a user\n", 28 | " * Update a user\n", 29 | " * Add to a user's friends list\n", 30 | " * Add friend request\n", 31 | " * Approve friend request\n", 32 | " * Reject friend request\n", 33 | " * Remove from a user's friends list\n", 34 | " * Create a group chat\n", 35 | " * Invite friends to a group chat\n", 36 | " * Post a message to a group chat\n", 37 | " * Private 1-1 chat\n", 38 | " * Invite a friend to a private chat\n", 39 | " * Post a message to a private chat\n", 40 | "* No need to worry about scaling initially" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Solution" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 1, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Overwriting online_chat.py\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "%%writefile online_chat.py\n", 67 | "from abc import ABCMeta\n", 68 | "from enum import Enum\n", 69 | "\n", 70 | "class UserService(object):\n", 71 | "\n", 72 | " def __init__(self):\n", 73 | " self.users_by_id = {} # key: user id, value: User\n", 74 | "\n", 75 | " def add_user(self, user_id, name, pass_hash): # ...\n", 76 | " def remove_user(self, user_id): # ...\n", 77 | " def add_friend_request(self, from_user_id, to_user_id): # ...\n", 78 | " def approve_friend_request(self, from_user_id, to_user_id): # ...\n", 79 | " def reject_friend_request(self, from_user_id, to_user_id): # ...\n", 80 | "\n", 81 | "\n", 82 | "class User(object):\n", 83 | "\n", 84 | " def __init__(self, user_id, name, pass_hash):\n", 85 | " self.user_id = user_id\n", 86 | " self.name = name\n", 87 | " self.pass_hash = pass_hash\n", 88 | " self.friends_by_id = {} # key: friend id, value: User\n", 89 | " self.friend_ids_to_private_chats = {} # key: friend id, value: private chats\n", 90 | " self.group_chats_by_id = {} # key: chat id, value: GroupChat\n", 91 | " self.received_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest\n", 92 | " self.sent_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest\n", 93 | "\n", 94 | " def message_user(self, friend_id, message): # ...\n", 95 | " def message_group(self, group_id, message): # ...\n", 96 | " def send_friend_request(self, friend_id): # ...\n", 97 | " def receive_friend_request(self, friend_id): # ...\n", 98 | " def approve_friend_request(self, friend_id): # ...\n", 99 | " def reject_friend_request(self, friend_id): # ...\n", 100 | "\n", 101 | "\n", 102 | "class Chat(metaclass=ABCMeta):\n", 103 | "\n", 104 | " def __init__(self, chat_id):\n", 105 | " self.chat_id = chat_id\n", 106 | " self.users = []\n", 107 | " self.messages = []\n", 108 | "\n", 109 | "\n", 110 | "class PrivateChat(Chat):\n", 111 | "\n", 112 | " def __init__(self, chat_id, first_user, second_user):\n", 113 | " super(PrivateChat, self).__init__(chat_id)\n", 114 | " self.users.append(first_user)\n", 115 | " self.users.append(second_user)\n", 116 | "\n", 117 | "\n", 118 | "class GroupChat(Chat):\n", 119 | "\n", 120 | " def add_user(self, user): # ...\n", 121 | " def remove_user(self, 
user): # ...\n", 122 | "\n", 123 | "\n", 124 | "class Message(object):\n", 125 | "\n", 126 | " def __init__(self, message_id, message, timestamp):\n", 127 | " self.message_id = message_id\n", 128 | " self.message = message\n", 129 | " self.timestamp = timestamp\n", 130 | "\n", 131 | "\n", 132 | "class AddRequest(object):\n", 133 | "\n", 134 | " def __init__(self, from_user_id, to_user_id, request_status, timestamp):\n", 135 | " self.from_user_id = from_user_id\n", 136 | " self.to_user_id = to_user_id\n", 137 | " self.request_status = request_status\n", 138 | " self.timestamp = timestamp\n", 139 | "\n", 140 | "\n", 141 | "class RequestStatus(Enum):\n", 142 | "\n", 143 | " UNREAD = 0\n", 144 | " READ = 1\n", 145 | " ACCEPTED = 2\n", 146 | " REJECTED = 3" 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.4.3" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 0 171 | } 172 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/online_chat/online_chat.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta 2 | from enum import Enum 3 | 4 | class UserService(object): 5 | 6 | def __init__(self): 7 | self.users_by_id = {} # key: user id, value: User 8 | 9 | def add_user(self, user_id, name, pass_hash): # ... 10 | def remove_user(self, user_id): # ... 11 | def add_friend_request(self, from_user_id, to_user_id): # ... 12 | def approve_friend_request(self, from_user_id, to_user_id): # ... 13 | def reject_friend_request(self, from_user_id, to_user_id): # ... 14 | 15 | 16 | class User(object): 17 | 18 | def __init__(self, user_id, name, pass_hash): 19 | self.user_id = user_id 20 | self.name = name 21 | self.pass_hash = pass_hash 22 | self.friends_by_id = {} # key: friend id, value: User 23 | self.friend_ids_to_private_chats = {} # key: friend id, value: private chats 24 | self.group_chats_by_id = {} # key: chat id, value: GroupChat 25 | self.received_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest 26 | self.sent_friend_requests_by_friend_id = {} # key: friend id, value: AddRequest 27 | 28 | def message_user(self, friend_id, message): # ... 29 | def message_group(self, group_id, message): # ... 30 | def send_friend_request(self, friend_id): # ... 31 | def receive_friend_request(self, friend_id): # ... 32 | def approve_friend_request(self, friend_id): # ... 33 | def reject_friend_request(self, friend_id): # ... 34 | 35 | 36 | class Chat(metaclass=ABCMeta): 37 | 38 | def __init__(self, chat_id): 39 | self.chat_id = chat_id 40 | self.users = [] 41 | self.messages = [] 42 | 43 | 44 | class PrivateChat(Chat): 45 | 46 | def __init__(self, chat_id, first_user, second_user): 47 | super(PrivateChat, self).__init__(chat_id) 48 | self.users.append(first_user) 49 | self.users.append(second_user) 50 | 51 | 52 | class GroupChat(Chat): 53 | 54 | def add_user(self, user): # ... 55 | def remove_user(self, user): # ...
56 | 57 | 58 | class Message(object): 59 | 60 | def __init__(self, message_id, message, timestamp): 61 | self.message_id = message_id 62 | self.message = message 63 | self.timestamp = timestamp 64 | 65 | 66 | class AddRequest(object): 67 | 68 | def __init__(self, from_user_id, to_user_id, request_status, timestamp): 69 | self.from_user_id = from_user_id 70 | self.to_user_id = to_user_id 71 | self.request_status = request_status 72 | self.timestamp = timestamp 73 | 74 | 75 | class RequestStatus(Enum): 76 | 77 | UNREAD = 0 78 | READ = 1 79 | ACCEPTED = 2 80 | REJECTED = 3 -------------------------------------------------------------------------------- /solutions/object_oriented_design/parking_lot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/object_oriented_design/parking_lot/__init__.py -------------------------------------------------------------------------------- /solutions/object_oriented_design/parking_lot/parking_lot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook was prepared by [Donne Martin](https://github.com/donnemartin). Source and license info is on [GitHub](https://github.com/donnemartin/system-design-primer)." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Design a parking lot" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Constraints and assumptions\n", 22 | "\n", 23 | "* What types of vehicles should we support?\n", 24 | " * Motorcycle, Car, Bus\n", 25 | "* Does each vehicle type take up a different amount of parking spots?\n", 26 | " * Yes\n", 27 | " * Motorcycle spot -> Motorcycle\n", 28 | " * Compact spot -> Motorcycle, Car\n", 29 | " * Large spot -> Motorcycle, Car\n", 30 | " * Bus can park if we have 5 consecutive \"large\" spots\n", 31 | "* Does the parking lot have multiple levels?\n", 32 | " * Yes" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Solution" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Overwriting parking_lot.py\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%%writefile parking_lot.py\n", 59 | "from abc import ABCMeta, abstractmethod\n", 60 | "from enum import Enum\n", 61 | "\n", 62 | "class VehicleSize(Enum):\n", 63 | "\n", 64 | " MOTORCYCLE = 0\n", 65 | " COMPACT = 1\n", 66 | " LARGE = 2\n", 67 | "\n", 68 | "\n", 69 | "class Vehicle(metaclass=ABCMeta):\n", 70 | "\n", 71 | " def __init__(self, vehicle_size, license_plate, spot_size):\n", 72 | " self.vehicle_size = vehicle_size\n", 73 | " self.license_plate = license_plate\n", 74 | " self.spot_size = spot_size\n", 75 | " self.spots_taken = []\n", 76 | "\n", 77 | " def clear_spots(self):\n", 78 | " for spot in self.spots_taken:\n", 79 | " spot.remove_vehicle(self)\n", 80 | " self.spots_taken = []\n", 81 | "\n", 82 | " def take_spot(self, spot):\n", 83 | " self.spots_taken.append(spot)\n", 84 | "\n", 85 | " @abstractmethod\n", 86 | " def can_fit_in_spot(self, spot):\n", 87 | " pass\n", 88 | "\n", 89 | "\n", 90 | "class Motorcycle(Vehicle):\n", 91 | "\n", 92 | " def __init__(self, 
license_plate):\n", 93 | " super(Motorcycle, self).__init__(VehicleSize.MOTORCYCLE, license_plate, spot_size=1)\n", 94 | "\n", 95 | " def can_fit_in_spot(self, spot):\n", 96 | " return True\n", 97 | "\n", 98 | "\n", 99 | "class Car(Vehicle):\n", 100 | "\n", 101 | " def __init__(self, license_plate):\n", 102 | " super(Car, self).__init__(VehicleSize.COMPACT, license_plate, spot_size=1)\n", 103 | "\n", 104 | " def can_fit_in_spot(self, spot):\n", 105 | " return True if (spot.spot_size == VehicleSize.LARGE or spot.spot_size == VehicleSize.COMPACT) else False\n", 106 | "\n", 107 | "\n", 108 | "class Bus(Vehicle):\n", 109 | "\n", 110 | " def __init__(self, license_plate):\n", 111 | " super(Bus, self).__init__(VehicleSize.LARGE, license_plate, spot_size=5)\n", 112 | "\n", 113 | " def can_fit_in_spot(self, spot):\n", 114 | " return True if spot.spot_size == VehicleSize.LARGE else False\n", 115 | "\n", 116 | "\n", 117 | "class ParkingLot(object):\n", 118 | "\n", 119 | " def __init__(self, num_levels):\n", 120 | " self.num_levels = num_levels\n", 121 | " self.levels = []\n", 122 | "\n", 123 | " def park_vehicle(self, vehicle):\n", 124 | " for level in self.levels:\n", 125 | " if level.park_vehicle(vehicle):\n", 126 | " return True\n", 127 | " return False\n", 128 | "\n", 129 | "\n", 130 | "class Level(object):\n", 131 | "\n", 132 | " SPOTS_PER_ROW = 10\n", 133 | "\n", 134 | " def __init__(self, floor, total_spots):\n", 135 | " self.floor = floor\n", 136 | " self.num_spots = total_spots\n", 137 | " self.available_spots = 0\n", 138 | " self.parking_spots = []\n", 139 | "\n", 140 | " def spot_freed(self):\n", 141 | " self.available_spots += 1\n", 142 | "\n", 143 | " def park_vehicle(self, vehicle):\n", 144 | " spot = self._find_available_spot(vehicle)\n", 145 | " if spot is None:\n", 146 | " return None\n", 147 | " else:\n", 148 | " spot.park_vehicle(vehicle)\n", 149 | " return spot\n", 150 | "\n", 151 | " def _find_available_spot(self, vehicle):\n", 152 | " \"\"\"Find an available spot where vehicle can fit, or return None\"\"\"\n", 153 | " # ...\n", 154 | "\n", 155 | " def _park_starting_at_spot(self, spot, vehicle):\n", 156 | " \"\"\"Occupy starting at spot.spot_number to vehicle.spot_size.\"\"\"\n", 157 | " # ...\n", 158 | "\n", 159 | "\n", 160 | "class ParkingSpot(object):\n", 161 | "\n", 162 | " def __init__(self, level, row, spot_number, spot_size, vehicle_size):\n", 163 | " self.level = level\n", 164 | " self.row = row\n", 165 | " self.spot_number = spot_number\n", 166 | " self.spot_size = spot_size\n", 167 | " self.vehicle_size = vehicle_size\n", 168 | " self.vehicle = None\n", 169 | "\n", 170 | " def is_available(self):\n", 171 | " return True if self.vehicle is None else False\n", 172 | "\n", 173 | " def can_fit_vehicle(self, vehicle):\n", 174 | " if self.vehicle is not None:\n", 175 | " return False\n", 176 | " return vehicle.can_fit_in_spot(self)\n", 177 | "\n", 178 | " def park_vehicle(self, vehicle): # ...\n", 179 | " def remove_vehicle(self): # ..."
180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "Python 3", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "codemirror_mode": { 191 | "name": "ipython", 192 | "version": 3 193 | }, 194 | "file_extension": ".py", 195 | "mimetype": "text/x-python", 196 | "name": "python", 197 | "nbconvert_exporter": "python", 198 | "pygments_lexer": "ipython3", 199 | "version": "3.4.3" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 0 204 | } 205 | -------------------------------------------------------------------------------- /solutions/object_oriented_design/parking_lot/parking_lot.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from enum import Enum 3 | 4 | class VehicleSize(Enum): 5 | 6 | MOTORCYCLE = 0 7 | COMPACT = 1 8 | LARGE = 2 9 | 10 | 11 | class Vehicle(metaclass=ABCMeta): 12 | 13 | def __init__(self, vehicle_size, license_plate, spot_size): 14 | self.vehicle_size = vehicle_size 15 | self.license_plate = license_plate 16 | self.spot_size = spot_size 17 | self.spots_taken = [] 18 | 19 | def clear_spots(self): 20 | for spot in self.spots_taken: 21 | spot.remove_vehicle(self) 22 | self.spots_taken = [] 23 | 24 | def take_spot(self, spot): 25 | self.spots_taken.append(spot) 26 | 27 | @abstractmethod 28 | def can_fit_in_spot(self, spot): 29 | pass 30 | 31 | 32 | class Motorcycle(Vehicle): 33 | 34 | def __init__(self, license_plate): 35 | super(Motorcycle, self).__init__(VehicleSize.MOTORCYCLE, license_plate, spot_size=1) 36 | 37 | def can_fit_in_spot(self, spot): 38 | return True 39 | 40 | 41 | class Car(Vehicle): 42 | 43 | def __init__(self, license_plate): 44 | super(Car, self).__init__(VehicleSize.COMPACT, license_plate, spot_size=1) 45 | 46 | def can_fit_in_spot(self, spot): 47 | return True if (spot.spot_size == VehicleSize.LARGE or spot.spot_size == VehicleSize.COMPACT) else False 48 | 49 | 50 | class Bus(Vehicle): 51 | 52 | def __init__(self, license_plate): 53 | super(Bus, self).__init__(VehicleSize.LARGE, license_plate, spot_size=5) 54 | 55 | def can_fit_in_spot(self, spot): 56 | return True if spot.spot_size == VehicleSize.LARGE else False 57 | 58 | 59 | class ParkingLot(object): 60 | 61 | def __init__(self, num_levels): 62 | self.num_levels = num_levels 63 | self.levels = [] # List of Levels 64 | 65 | def park_vehicle(self, vehicle): 66 | for level in self.levels: 67 | if level.park_vehicle(vehicle): 68 | return True 69 | return False 70 | 71 | 72 | class Level(object): 73 | 74 | SPOTS_PER_ROW = 10 75 | 76 | def __init__(self, floor, total_spots): 77 | self.floor = floor 78 | self.num_spots = total_spots 79 | self.available_spots = 0 80 | self.spots = [] # List of ParkingSpots 81 | 82 | def spot_freed(self): 83 | self.available_spots += 1 84 | 85 | def park_vehicle(self, vehicle): 86 | spot = self._find_available_spot(vehicle) 87 | if spot is None: 88 | return None 89 | else: 90 | spot.park_vehicle(vehicle) 91 | return spot 92 | 93 | def _find_available_spot(self, vehicle): 94 | """Find an available spot where vehicle can fit, or return None""" 95 | # ... 96 | 97 | def _park_starting_at_spot(self, spot, vehicle): 98 | """Occupy starting at spot.spot_number to vehicle.spot_size.""" 99 | # ...
100 | 101 | 102 | class ParkingSpot(object): 103 | 104 | def __init__(self, level, row, spot_number, spot_size, vehicle_size): 105 | self.level = level 106 | self.row = row 107 | self.spot_number = spot_number 108 | self.spot_size = spot_size 109 | self.vehicle_size = vehicle_size 110 | self.vehicle = None 111 | 112 | def is_available(self): 113 | return True if self.vehicle is None else False 114 | 115 | def can_fit_vehicle(self, vehicle): 116 | if self.vehicle is not None: 117 | return False 118 | return vehicle.can_fit_in_spot(self) 119 | 120 | def park_vehicle(self, vehicle): # ... 121 | def remove_vehicle(self): # ... -------------------------------------------------------------------------------- /solutions/system_design/mint/README.md: -------------------------------------------------------------------------------- 1 | # Design Mint.com 2 | 3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.* 4 | 5 | ## Step 1: Outline use cases and constraints 6 | 7 | > Gather requirements and scope the problem. 8 | > Ask questions to clarify use cases and constraints. 9 | > Discuss assumptions. 10 | 11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints. 12 | 13 | ### Use cases 14 | 15 | #### We'll scope the problem to handle only the following use cases 16 | 17 | * **User** connects to a financial account 18 | * **Service** extracts transactions from the account 19 | * Updates daily 20 | * Categorizes transactions 21 | * Allows manual category override by the user 22 | * No automatic re-categorization 23 | * Analyzes monthly spending, by category 24 | * **Service** recommends a budget 25 | * Allows users to manually set a budget 26 | * Sends notifications when approaching or exceeding budget 27 | * **Service** has high availability 28 | 29 | #### Out of scope 30 | 31 | * **Service** performs additional logging and analytics 32 | 33 | ### Constraints and assumptions 34 | 35 | #### State assumptions 36 | 37 | * Traffic is not evenly distributed 38 | * Automatic daily update of accounts applies only to users active in the past 30 days 39 | * Adding or removing financial accounts is relatively rare 40 | * Budget notifications don't need to be instant 41 | * 10 million users 42 | * 10 budget categories per user = 100 million budget items 43 | * Example categories: 44 | * Housing = $1,000 45 | * Food = $200 46 | * Gas = $100 47 | * Sellers are used to determine transaction category 48 | * 50,000 sellers 49 | * 30 million financial accounts 50 | * 5 billion transactions per month 51 | * 500 million read requests per month 52 | * 10:1 write to read ratio 53 | * Write-heavy, users make transactions daily, but few visit the site daily 54 | 55 | #### Calculate usage 56 | 57 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.** 58 | 59 | * Size per transaction: 60 | * `user_id` - 8 bytes 61 | * `created_at` - 5 bytes 62 | * `seller` - 32 bytes 63 | * `amount` - 5 bytes 64 | * Total: ~50 bytes 65 | * 250 GB of new transaction content per month 66 | * 50 bytes per transaction * 5 billion transactions per month 67 | * 9 TB of new transaction content in 3 years 68 | * Assume most are new transactions instead of updates to existing ones 69 | * 2,000 transactions per second on average 70 | * 200 
read requests per second on average
71 | 
72 | Handy conversion guide:
73 | 
74 | * 2.5 million seconds per month
75 | * 1 request per second = 2.5 million requests per month
76 | * 40 requests per second = 100 million requests per month
77 | * 400 requests per second = 1 billion requests per month
78 | 
79 | ## Step 2: Create a high level design
80 | 
81 | > Outline a high level design with all important components.
82 | 
83 | ![Imgur](http://i.imgur.com/E8klrBh.png)
84 | 
85 | ## Step 3: Design core components
86 | 
87 | > Dive into details for each core component.
88 | 
89 | ### Use case: User connects to a financial account
90 | 
91 | We could store info on the 10 million users in a [relational database](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms).  We should discuss the [use cases and tradeoffs between choosing SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql).
92 | 
93 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
94 | * The **Web Server** forwards the request to the **Accounts API** server
95 | * The **Accounts API** server updates the **SQL Database** `accounts` table with the newly entered account info
96 | 
97 | **Clarify with your interviewer how much code you are expected to write**.
98 | 
99 | The `accounts` table could have the following structure:
100 | 
101 | ```
102 | id int NOT NULL AUTO_INCREMENT
103 | created_at datetime NOT NULL
104 | last_update datetime NOT NULL
105 | account_url varchar(255) NOT NULL
106 | account_login varchar(32) NOT NULL
107 | account_password_hash char(64) NOT NULL
108 | user_id int NOT NULL
109 | PRIMARY KEY(id)
110 | FOREIGN KEY(user_id) REFERENCES users(id)
111 | ```
112 | 
113 | We'll create an [index](https://github.com/donnemartin/system-design-primer#use-good-indices) on `id`, `user_id`, and `created_at` to speed up lookups (log-time instead of scanning the entire table) and to keep the data in memory.  Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.
114 | 
115 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
116 | 
117 | ```
118 | $ curl -X POST --data '{ "user_id": "foo", "account_url": "bar", \
119 |     "account_login": "baz", "account_password": "qux" }' \
120 |     https://mint.com/api/v1/account
121 | ```
122 | 
123 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
124 | 
125 | Next, the service extracts transactions from the account.
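Before moving on, here is a minimal sketch of the account-creation write path described above.  `AccountsApi`, the `sql_db` handle, and the parameterized SQL call are illustrative assumptions, not part of the original design; in practice, financial credentials would also need encryption and a secrets-management story beyond a simple hash:

```
import hashlib


class AccountsApi(object):

    def __init__(self, sql_db):
        # sql_db is assumed to expose an execute(query, params) method
        self.sql_db = sql_db

    def create_account(self, user_id, account_url, account_login,
                       account_password):
        """Hash the password, then insert a row into the accounts table."""
        password_hash = hashlib.sha256(
            account_password.encode('utf-8')).hexdigest()  # fits char(64)
        self.sql_db.execute(
            'INSERT INTO accounts (created_at, last_update, account_url, '
            'account_login, account_password_hash, user_id) '
            'VALUES (NOW(), NOW(), %s, %s, %s, %s)',
            (account_url, account_login, password_hash, user_id))
```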
126 | 
127 | ### Use case: Service extracts transactions from the account
128 | 
129 | We'll want to extract information from an account in these cases:
130 | 
131 | * The user first links the account
132 | * The user manually refreshes the account
133 | * Automatically each day for users who have been active in the past 30 days
134 | 
135 | Data flow:
136 | 
137 | * The **Client** sends a request to the **Web Server**
138 | * The **Web Server** forwards the request to the **Accounts API** server
139 | * The **Accounts API** server places a job on a **Queue** such as Amazon SQS or [RabbitMQ](https://www.rabbitmq.com/)
140 |     * Extracting transactions could take a while, so we'd probably want to do this [asynchronously with a queue](https://github.com/donnemartin/system-design-primer#asynchronism), although this introduces additional complexity
141 | * The **Transaction Extraction Service** does the following:
142 |     * Pulls from the **Queue** and extracts transactions for the given account from the financial institution, storing the results as raw log files in the **Object Store**
143 |     * Uses the **Category Service** to categorize each transaction
144 |     * Uses the **Budget Service** to calculate aggregate monthly spending by category
145 |         * The **Budget Service** uses the **Notification Service** to let users know if they are nearing or have exceeded their budget
146 |     * Updates the **SQL Database** `transactions` table with categorized transactions
147 |     * Updates the **SQL Database** `monthly_spending` table with aggregate monthly spending by category
148 |     * Notifies the user the transactions have completed through the **Notification Service**:
149 |         * Uses a **Queue** (not pictured) to asynchronously send out notifications
150 | 
151 | The `transactions` table could have the following structure:
152 | 
153 | ```
154 | id int NOT NULL AUTO_INCREMENT
155 | created_at datetime NOT NULL
156 | seller varchar(32) NOT NULL
157 | amount decimal NOT NULL
158 | user_id int NOT NULL
159 | PRIMARY KEY(id)
160 | FOREIGN KEY(user_id) REFERENCES users(id)
161 | ```
162 | 
163 | We'll create an [index](https://github.com/donnemartin/system-design-primer#use-good-indices) on `id`, `user_id`, and `created_at`.
164 | 
165 | The `monthly_spending` table could have the following structure:
166 | 
167 | ```
168 | id int NOT NULL AUTO_INCREMENT
169 | month_year date NOT NULL
170 | category varchar(32)
171 | amount decimal NOT NULL
172 | user_id int NOT NULL
173 | PRIMARY KEY(id)
174 | FOREIGN KEY(user_id) REFERENCES users(id)
175 | ```
176 | 
177 | We'll create an [index](https://github.com/donnemartin/system-design-primer#use-good-indices) on `id` and `user_id`.
178 | 
179 | #### Category service
180 | 
181 | For the **Category Service**, we can seed a seller-to-category dictionary with the most popular sellers.  If we estimate 50,000 sellers and estimate each entry to take less than 255 bytes, the dictionary would only take about 12 MB of memory.
182 | 
183 | **Clarify with your interviewer how much code you are expected to write**.
184 | 
185 | ```
186 | class DefaultCategories(Enum):
187 | 
188 |     HOUSING = 0
189 |     FOOD = 1
190 |     GAS = 2
191 |     SHOPPING = 3
192 |     ...
193 | 
194 | seller_category_map = {}
195 | seller_category_map['Exxon'] = DefaultCategories.GAS
196 | seller_category_map['Target'] = DefaultCategories.SHOPPING
197 | ...
198 | ```
199 | 
200 | For sellers not initially seeded in the map, we could use a crowdsourcing effort by evaluating the manual category overrides our users provide.
We could use a heap to quickly look up the top manual override per seller in O(1) time.
201 | 
202 | ```
203 | class Categorizer(object):
204 | 
205 |     def __init__(self, seller_category_map, seller_category_crowd_overrides_map):
206 |         self.seller_category_map = seller_category_map
207 |         self.seller_category_crowd_overrides_map = \
208 |             seller_category_crowd_overrides_map
209 | 
210 |     def categorize(self, transaction):
211 |         if transaction.seller in self.seller_category_map:
212 |             return self.seller_category_map[transaction.seller]
213 |         elif transaction.seller in self.seller_category_crowd_overrides_map:
214 |             self.seller_category_map[transaction.seller] = \
215 |                 self.seller_category_crowd_overrides_map[transaction.seller].peek_min()
216 |             return self.seller_category_map[transaction.seller]
217 |         return None
218 | ```
219 | 
220 | Transaction implementation:
221 | 
222 | ```
223 | class Transaction(object):
224 | 
225 |     def __init__(self, created_at, seller, amount):
226 |         self.created_at = created_at
227 |         self.seller = seller
228 |         self.amount = amount
229 | ```
230 | 
231 | ### Use case: Service recommends a budget
232 | 
233 | To start, we could use a generic budget template that allocates category amounts based on income tiers.  Using this approach, we would not have to store the 100 million budget items identified in the constraints, only those that the user overrides.  If a user overrides a budget category, we could store the override in a `budget_overrides` table.
234 | 
235 | ```
236 | class Budget(object):
237 | 
238 |     def __init__(self, income):
239 |         self.income = income
240 |         self.categories_to_budget_map = self.create_budget_template()
241 | 
242 |     def create_budget_template(self):
243 |         return {
244 |             DefaultCategories.HOUSING: self.income * .4,
245 |             DefaultCategories.FOOD: self.income * .2,
246 |             DefaultCategories.GAS: self.income * .1,
247 |             DefaultCategories.SHOPPING: self.income * .2,
248 |             ...
249 |         }
250 | 
251 |     def override_category_budget(self, category, amount):
252 |         self.categories_to_budget_map[category] = amount
253 | ```
254 | 
255 | For the **Budget Service**, we can potentially run SQL queries on the `transactions` table to generate the `monthly_spending` aggregate table.  The `monthly_spending` table would likely have far fewer rows than the total 5 billion transactions, since users typically have many transactions per month.
256 | 
257 | As an alternative, we can run **MapReduce** jobs on the raw transaction files to:
258 | 
259 | * Categorize each transaction
260 | * Generate aggregate monthly spending by category
261 | 
262 | Running analyses on the transaction files could significantly reduce the load on the database.
263 | 
264 | We could call the **Budget Service** to re-run the analysis if the user updates a category.
265 | 
266 | **Clarify with your interviewer how much code you are expected to write**.
267 | 
268 | Sample log file format, tab delimited:
269 | 
270 | ```
271 | user_id   timestamp   seller  amount
272 | ```
273 | 
274 | **MapReduce** implementation:
275 | 
276 | ```
277 | class SpendingByCategory(MRJob):
278 | 
279 |     def __init__(self, categorizer):
280 |         self.categorizer = categorizer
281 |         self.current_year_month = self.calc_current_year_month()
282 |         ...
283 | 
284 |     def calc_current_year_month(self):
285 |         """Return the current year and month."""
286 |         ...
287 | 
288 |     def extract_year_month(self, timestamp):
289 |         """Return the year and month portions of the timestamp."""
290 |         ...
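    # Illustrative sketch (an assumption, not part of the original
    # solution): with tab-delimited timestamps of the form
    # 'YYYY-MM-DD HH:MM:SS' and `from datetime import datetime`, the two
    # elided helpers above might look like:
    #
    #     def calc_current_year_month(self):
    #         return datetime.utcnow().strftime('%Y-%m')
    #
    #     def extract_year_month(self, timestamp):
    #         return timestamp[:7]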
291 | 
292 |     def handle_budget_notifications(self, key, total):
293 |         """Call notification API if nearing or exceeded budget."""
294 |         ...
295 | 
296 |     def mapper(self, _, line):
297 |         """Parse each log line, extract and transform relevant lines.
298 | 
299 |         Argument line will be of the form:
300 | 
301 |         user_id   timestamp   seller  amount
302 | 
303 |         Using the categorizer to convert seller to category,
304 |         emit key value pairs of the form:
305 | 
306 |         (user_id, 2016-01, shopping), 25
307 |         (user_id, 2016-01, shopping), 100
308 |         (user_id, 2016-01, gas), 50
309 |         """
310 |         user_id, timestamp, seller, amount = line.split('\t')
311 |         category = self.categorizer.categorize(seller)
312 |         period = self.extract_year_month(timestamp)
313 |         if period == self.current_year_month:
314 |             yield (user_id, period, category), float(amount)
315 | 
316 |     def reducer(self, key, values):
317 |         """Sum values for each key.
318 | 
319 |         (user_id, 2016-01, shopping), 125
320 |         (user_id, 2016-01, gas), 50
321 |         """
322 |         total = sum(values)
323 |         yield key, total
324 | ```
325 | 
326 | ## Step 4: Scale the design
327 | 
328 | > Identify and address bottlenecks, given the constraints.
329 | 
330 | ![Imgur](http://i.imgur.com/V5q57vU.png)
331 | 
332 | **Important: Do not simply jump right into the final design from the initial design!**
333 | 
334 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat.  See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
335 | 
336 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them.  For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**?  **CDN**?  **Master-Slave Replicas**?  What are the alternatives and **Trade-Offs** for each?
337 | 
338 | We'll introduce some components to complete the design and to address scalability issues.  Internal load balancers are not shown to reduce clutter.
339 | 
340 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
341 | 
342 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
343 | * [CDN](https://github.com/donnemartin/system-design-primer#content-delivery-network)
344 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
345 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
346 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
347 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
348 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
349 | * [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)
350 | * [SQL write master-slave failover](https://github.com/donnemartin/system-design-primer#fail-over)
351 | * [Master-slave replication](https://github.com/donnemartin/system-design-primer#master-slave-replication)
352 | * [Asynchronism](https://github.com/donnemartin/system-design-primer#asynchronism)
353 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
354 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
355 | 
356 | We'll add an additional use case: **User** accesses summaries and transactions.
357 | 
358 | User sessions, aggregate stats by category, and recent transactions could be placed in a **Memory Cache** such as Redis or Memcached.
359 | 
360 | * The **Client** sends a read request to the **Web Server**
361 | * The **Web Server** forwards the request to the **Read API** server
362 |     * Static content can be served from the **Object Store** such as S3, which is cached on the **CDN**
363 | * The **Read API** server does the following:
364 |     * Checks the **Memory Cache** for the content
365 |         * If the url is in the **Memory Cache**, returns the cached contents
366 |         * Else
367 |             * If the url is in the **SQL Database**, fetches the contents
368 |             * Updates the **Memory Cache** with the contents
369 | 
370 | Refer to [When to update the cache](https://github.com/donnemartin/system-design-primer#when-to-update-the-cache) for tradeoffs and alternatives.  The approach above describes [cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside).
371 | 
372 | Instead of keeping the `monthly_spending` aggregate table in the **SQL Database**, we could create a separate **Analytics Database** using a data warehousing solution such as Amazon Redshift or Google BigQuery.
373 | 
374 | We might only want to store a month of `transactions` data in the database, while storing the rest in a data warehouse or in an **Object Store**.  An **Object Store** such as Amazon S3 can comfortably handle the constraint of 250 GB of new content per month.
375 | 
376 | To address the 200 *average* read requests per second (higher at peak), traffic for popular content should be handled by the **Memory Cache** instead of the database.  The **Memory Cache** is also useful for handling the unevenly distributed traffic and traffic spikes.
The **SQL Read Replicas** should be able to handle the cache misses, as long as the replicas are not bogged down with replicating writes.
377 | 
378 | 2,000 *average* transaction writes per second (higher at peak) might be tough for a single **SQL Write Master-Slave**.  We might need to employ additional SQL scaling patterns:
379 | 
380 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
381 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
382 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
383 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
384 | 
385 | We should also consider moving some data to a **NoSQL Database**.
386 | 
387 | ## Additional talking points
388 | 
389 | > Additional topics to dive into, depending on the problem scope and time remaining.
390 | 
391 | ### NoSQL
392 | 
393 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store)
394 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store)
395 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store)
396 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database)
397 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
398 | 
399 | ### Caching
400 | 
401 | * Where to cache
402 |     * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching)
403 |     * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching)
404 |     * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching)
405 |     * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching)
406 |     * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching)
407 | * What to cache
408 |     * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
409 |     * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
410 | * When to update the cache
411 |     * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside)
412 |     * [Write-through](https://github.com/donnemartin/system-design-primer#write-through)
413 |     * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
414 |     * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead)
415 | 
416 | ### Asynchronism and microservices
417 | 
418 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues)
419 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues)
420 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure)
421 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices)
422 | 
423 | ### Communications
424 | 
425 | * Discuss tradeoffs:
426 |     * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
427 |     * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
428 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery)
429 | 
430 | ### Security
431 | 
432 | Refer to the [security
section](https://github.com/donnemartin/system-design-primer#security).
433 | 
434 | ### Latency numbers
435 | 
436 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
437 | 
438 | ### Ongoing
439 | 
440 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
441 | * Scaling is an iterative process
442 | 
--------------------------------------------------------------------------------
/solutions/system_design/mint/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/mint/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/mint/mint.png
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/mint/mint_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from mrjob.job import MRJob
4 | 
5 | 
6 | class SpendingByCategory(MRJob):
7 | 
8 |     def __init__(self, categorizer):
9 |         self.categorizer = categorizer
10 |         ...
11 | 
12 |     def current_year_month(self):
13 |         """Return the current year and month."""
14 |         ...
15 | 
16 |     def extract_year_month(self, timestamp):
17 |         """Return the year and month portions of the timestamp."""
18 |         ...
19 | 
20 |     def handle_budget_notifications(self, key, total):
21 |         """Call notification API if nearing or exceeded budget."""
22 |         ...
23 | 
24 |     def mapper(self, _, line):
25 |         """Parse each log line, extract and transform relevant lines.
26 | 
27 |         Emit key value pairs of the form:
28 | 
29 |         (2016-01, shopping), 25
30 |         (2016-01, shopping), 100
31 |         (2016-01, gas), 50
32 |         """
33 |         timestamp, seller, amount = line.split('\t')
34 |         period = self.extract_year_month(timestamp)
35 |         if period == self.current_year_month():
36 |             yield (period, self.categorizer.categorize(seller)), float(amount)
37 | 
38 |     def reducer(self, key, values):
39 |         """Sum values for each key.
40 | 
41 |         (2016-01, shopping), 125
42 |         (2016-01, gas), 50
43 |         """
44 |         total = sum(values)
45 |         self.handle_budget_notifications(key, total)
46 |         yield key, total
47 | 
48 |     def steps(self):
49 |         """Run the map and reduce steps."""
50 |         return [
51 |             self.mr(mapper=self.mapper,
52 |                     reducer=self.reducer)
53 |         ]
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     SpendingByCategory.run()
58 | 
--------------------------------------------------------------------------------
/solutions/system_design/mint/mint_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from enum import Enum
3 | class DefaultCategories(Enum):
4 | 
5 |     HOUSING = 0
6 |     FOOD = 1
7 |     GAS = 2
8 |     SHOPPING = 3
9 |     ...
10 | 
11 | seller_category_map = {}
12 | seller_category_map['Exxon'] = DefaultCategories.GAS
13 | seller_category_map['Target'] = DefaultCategories.SHOPPING
14 | 
15 | 
16 | class Categorizer(object):
17 | 
18 |     def __init__(self, seller_category_map, seller_category_overrides_map):
19 |         self.seller_category_map = seller_category_map
20 |         self.seller_category_overrides_map = seller_category_overrides_map
21 | 
22 |     def categorize(self, transaction):
23 |         if transaction.seller in self.seller_category_map:
24 |             return self.seller_category_map[transaction.seller]
25 |         if transaction.seller in self.seller_category_overrides_map:
26 |             self.seller_category_map[transaction.seller] = \
27 |                 self.seller_category_overrides_map[transaction.seller].peek_min()
28 |             return self.seller_category_map[transaction.seller]
29 |         return None
30 | 
31 | 
32 | class Transaction(object):
33 | 
34 |     def __init__(self, timestamp, seller, amount):
35 |         self.timestamp = timestamp
36 |         self.seller = seller
37 |         self.amount = amount
38 | 
39 | 
40 | class Budget(object):
41 | 
42 |     def __init__(self, template_categories_to_budget_map):
43 |         self.categories_to_budget_map = template_categories_to_budget_map
44 | 
45 |     def override_category_budget(self, category, amount):
46 |         self.categories_to_budget_map[category] = amount
47 | 
48 | 
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/README.md:
--------------------------------------------------------------------------------
1 | # Design Pastebin.com (or Bit.ly)
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication.  Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | **Design Bit.ly** is a similar question, except Pastebin requires storing the paste contents instead of the original unshortened url.
6 | 
7 | ## Step 1: Outline use cases and constraints
8 | 
9 | > Gather requirements and scope the problem.
10 | > Ask questions to clarify use cases and constraints.
11 | > Discuss assumptions.
12 | 
13 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
14 | 
15 | ### Use cases
16 | 
17 | #### We'll scope the problem to handle only the following use cases
18 | 
19 | * **User** enters a block of text and gets a randomly generated link
20 |     * Expiration
21 |         * Default setting does not expire
22 |         * Can optionally set a timed expiration
23 | * **User** enters a paste's url and views the contents
24 | * **User** is anonymous
25 | * **Service** tracks analytics of pages
26 |     * Monthly visit stats
27 | * **Service** deletes expired pastes
28 | * **Service** has high availability
29 | 
30 | #### Out of scope
31 | 
32 | * **User** registers for an account
33 |     * **User** verifies email
34 | * **User** logs into a registered account
35 |     * **User** edits the document
36 | * **User** can set visibility
37 | * **User** can set the shortlink
38 | 
39 | ### Constraints and assumptions
40 | 
41 | #### State assumptions
42 | 
43 | * Traffic is not evenly distributed
44 | * Following a short link should be fast
45 | * Pastes are text only
46 | * Page view analytics do not need to be realtime
47 | * 10 million users
48 | * 10 million paste writes per month
49 | * 100 million paste reads per month
50 | * 10:1 read to write ratio
51 | 
52 | #### Calculate usage
53 | 
54 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
55 | 
56 | * Size per paste
57 |     * 1 KB content per paste
58 |     * `shortlink` - 7 bytes
59 |     * `expiration_length_in_minutes` - 4 bytes
60 |     * `created_at` - 5 bytes
61 |     * `paste_path` - 255 bytes
62 |     * total = ~1.27 KB
63 | * 12.7 GB of new paste content per month
64 |     * 1.27 KB per paste * 10 million pastes per month
65 |     * ~450 GB of new paste content in 3 years
66 |     * 360 million shortlinks in 3 years
67 | * Assume most are new pastes instead of updates to existing ones
68 | * 4 paste writes per second on average
69 | * 40 read requests per second on average
70 | 
71 | Handy conversion guide:
72 | 
73 | * 2.5 million seconds per month
74 | * 1 request per second = 2.5 million requests per month
75 | * 40 requests per second = 100 million requests per month
76 | * 400 requests per second = 1 billion requests per month
77 | 
78 | ## Step 2: Create a high level design
79 | 
80 | > Outline a high level design with all important components.
81 | 
82 | ![Imgur](http://i.imgur.com/BKsBnmG.png)
83 | 
84 | ## Step 3: Design core components
85 | 
86 | > Dive into details for each core component.
87 | 
88 | ### Use case: User enters a block of text and gets a randomly generated link
89 | 
90 | We could use a [relational database](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms) as a large hash table, mapping the generated url to a file server and path containing the paste file.
91 | 
92 | Instead of managing a file server, we could use a managed **Object Store** such as Amazon S3 or a [NoSQL document store](https://github.com/donnemartin/system-design-primer#document-store).
93 | 
94 | As an alternative to a relational database acting as a large hash table, we could use a [NoSQL key-value store](https://github.com/donnemartin/system-design-primer#key-value-store).  We should discuss the [tradeoffs between choosing SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql).  The following discussion uses the relational database approach.
95 | 
96 | * The **Client** sends a create paste request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
97 | * The **Web Server** forwards the request to the **Write API** server
98 | * The **Write API** server does the following:
99 |     * Generates a unique url
100 |         * Checks if the url is unique by looking at the **SQL Database** for a duplicate
101 |         * If the url is not unique, it generates another url
102 |         * If we supported a custom url, we could use the user-supplied url (also checking for a duplicate)
103 |     * Saves to the **SQL Database** `pastes` table
104 |     * Saves the paste data to the **Object Store**
105 |     * Returns the url
106 | 
107 | **Clarify with your interviewer how much code you are expected to write**.
108 | 
109 | The `pastes` table could have the following structure:
110 | 
111 | ```
112 | shortlink char(7) NOT NULL
113 | expiration_length_in_minutes int NOT NULL
114 | created_at datetime NOT NULL
115 | paste_path varchar(255) NOT NULL
116 | PRIMARY KEY(shortlink)
117 | ```
118 | 
119 | We'll create an [index](https://github.com/donnemartin/system-design-primer#use-good-indices) on `shortlink` and `created_at` to speed up lookups (log-time instead of scanning the entire table) and to keep the data in memory.  Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.
120 | 
121 | To generate the unique url, we could:
122 | 
123 | * Take the [**MD5**](https://en.wikipedia.org/wiki/MD5) hash of the user's ip_address + timestamp
124 |     * MD5 is a widely used hashing function that produces a 128-bit hash value
125 |     * MD5 is uniformly distributed
126 |     * Alternatively, we could also take the MD5 hash of randomly-generated data
127 | * [**Base 62**](https://www.kerstner.at/2012/07/shortening-strings-using-base-62-encoding/) encode the MD5 hash
128 |     * Base 62 encodes to `[a-zA-Z0-9]` which works well for urls, eliminating the need for escaping special characters
129 |     * There is only one hash result for the original input and Base 62 is deterministic (no randomness involved)
130 |     * Base 64 is another popular encoding but poses issues for urls because of the additional `+` and `/` characters
131 |     * The following [Base 62 function](http://stackoverflow.com/questions/742013/how-to-code-a-url-shortener) runs in O(k) time, where k is the number of digits (here, 7):
132 | 
133 | ```
134 | def base_encode(num, base=62):
135 |     digits = []
136 |     while num > 0:
137 |         num, remainder = divmod(num, base)
138 |         digits.append(remainder)
139 |     digits.reverse()
140 |     return digits
141 | ```
142 | 
143 | * Take the first 7 characters of the output, which results in 62^7 possible values and should be sufficient to handle our constraint of 360 million shortlinks in 3 years:
144 | 
145 | ```
146 | url = base_encode(md5(ip_address+timestamp))[:URL_LENGTH]
147 | ```
148 | 
149 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
150 | 
151 | ```
152 | $ curl -X POST --data '{ "expiration_length_in_minutes": "60", \
153 |     "paste_contents": "Hello World!" }' https://pastebin.com/api/v1/paste
154 | ```
155 | 
156 | Response:
157 | 
158 | ```
159 | {
160 |     "shortlink": "foobar"
161 | }
162 | ```
163 | 
164 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
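To make the url-generation step concrete, here is a runnable sketch that combines the MD5 and Base 62 pieces above.  `ALPHABET`, `URL_LENGTH`, and `generate_shortlink` are illustrative assumptions, not names from the original design:

```
import hashlib

ALPHABET = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
URL_LENGTH = 7


def base_encode(num, base=62):
    """Encode num as a Base 62 string, most significant digit first."""
    encoded = []
    while num > 0:
        num, remainder = divmod(num, base)
        encoded.append(ALPHABET[remainder])
    return ''.join(reversed(encoded))


def generate_shortlink(ip_address, timestamp):
    """MD5 the input, treat the digest as a number, then Base 62 encode it."""
    digest = hashlib.md5((ip_address + timestamp).encode('utf-8')).hexdigest()
    return base_encode(int(digest, 16))[:URL_LENGTH]
```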
165 | 
166 | ### Use case: User enters a paste's url and views the contents
167 | 
168 | * The **Client** sends a get paste request to the **Web Server**
169 | * The **Web Server** forwards the request to the **Read API** server
170 | * The **Read API** server does the following:
171 |     * Checks the **SQL Database** for the generated url
172 |         * If the url is in the **SQL Database**, fetch the paste contents from the **Object Store**
173 |         * Else, return an error message for the user
174 | 
175 | REST API:
176 | 
177 | ```
178 | $ curl https://pastebin.com/api/v1/paste?shortlink=foobar
179 | ```
180 | 
181 | Response:
182 | 
183 | ```
184 | {
185 |     "paste_contents": "Hello World",
186 |     "created_at": "YYYY-MM-DD HH:MM:SS",
187 |     "expiration_length_in_minutes": "60"
188 | }
189 | ```
190 | 
191 | ### Use case: Service tracks analytics of pages
192 | 
193 | Since realtime analytics are not a requirement, we could simply **MapReduce** the **Web Server** logs to generate hit counts.
194 | 
195 | **Clarify with your interviewer how much code you are expected to write**.
196 | 
197 | ```
198 | class HitCounts(MRJob):
199 | 
200 |     def extract_url(self, line):
201 |         """Extract the generated url from the log line."""
202 |         ...
203 | 
204 |     def extract_year_month(self, line):
205 |         """Return the year and month portions of the timestamp."""
206 |         ...
207 | 
208 |     def mapper(self, _, line):
209 |         """Parse each log line, extract and transform relevant lines.
210 | 
211 |         Emit key value pairs of the form:
212 | 
213 |         (2016-01, url0), 1
214 |         (2016-01, url0), 1
215 |         (2016-01, url1), 1
216 |         """
217 |         url = self.extract_url(line)
218 |         period = self.extract_year_month(line)
219 |         yield (period, url), 1
220 | 
221 |     def reducer(self, key, values):
222 |         """Sum values for each key.
223 | 
224 |         (2016-01, url0), 2
225 |         (2016-01, url1), 1
226 |         """
227 |         yield key, sum(values)
228 | ```
229 | 
230 | ### Use case: Service deletes expired pastes
231 | 
232 | To delete expired pastes, we could just scan the **SQL Database** for all entries whose expiration timestamp is older than the current timestamp.  All expired entries would then be deleted (or marked as expired) from the table.
233 | 
234 | ## Step 4: Scale the design
235 | 
236 | > Identify and address bottlenecks, given the constraints.
237 | 
238 | ![Imgur](http://i.imgur.com/4edXG0T.png)
239 | 
240 | **Important: Do not simply jump right into the final design from the initial design!**
241 | 
242 | State you would do this iteratively: 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat.  See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
243 | 
244 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them.  For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**?  **CDN**?  **Master-Slave Replicas**?  What are the alternatives and **Trade-Offs** for each?
245 | 
246 | We'll introduce some components to complete the design and to address scalability issues.  Internal load balancers are not shown to reduce clutter.
247 | 
248 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
249 | 
250 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
251 | * [CDN](https://github.com/donnemartin/system-design-primer#content-delivery-network)
252 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
253 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
254 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
255 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
256 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
257 | * [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)
258 | * [SQL write master-slave failover](https://github.com/donnemartin/system-design-primer#fail-over)
259 | * [Master-slave replication](https://github.com/donnemartin/system-design-primer#master-slave-replication)
260 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
261 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
262 | 
263 | The **Analytics Database** could use a data warehousing solution such as Amazon Redshift or Google BigQuery.
264 | 
265 | An **Object Store** such as Amazon S3 can comfortably handle the constraint of 12.7 GB of new content per month.
266 | 
267 | To address the 40 *average* read requests per second (higher at peak), traffic for popular content should be handled by the **Memory Cache** instead of the database.  The **Memory Cache** is also useful for handling the unevenly distributed traffic and traffic spikes.  The **SQL Read Replicas** should be able to handle the cache misses, as long as the replicas are not bogged down with replicating writes.
268 | 
269 | 4 *average* paste writes per second (higher at peak) should be doable for a single **SQL Write Master-Slave**.  Otherwise, we'll need to employ additional SQL scaling patterns:
270 | 
271 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
272 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
273 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
274 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
275 | 
276 | We should also consider moving some data to a **NoSQL Database**.
277 | 
278 | ## Additional talking points
279 | 
280 | > Additional topics to dive into, depending on the problem scope and time remaining.
281 | 
282 | ### NoSQL
283 | 
284 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store)
285 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store)
286 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store)
287 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database)
288 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
289 | 
290 | ### Caching
291 | 
292 | * Where to cache
293 |     * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching)
294 |     * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching)
295 |     * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching)
296 |     * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching)
297 |     * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching)
298 | * What to cache
299 |     * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
300 |     * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
301 | * When to update the cache
302 |     * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside)
303 |     * [Write-through](https://github.com/donnemartin/system-design-primer#write-through)
304 |     * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
305 |     * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead)
306 | 
307 | ### Asynchronism and microservices
308 | 
309 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues)
310 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues)
311 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure)
312 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices)
313 | 
314 | ### Communications
315 | 
316 | * Discuss tradeoffs:
317 |     * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
318 |     * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
319 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery)
320 | 
321 | ### Security
322 | 
323 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security).
324 | 
325 | ### Latency numbers
326 | 
327 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
328 | 
329 | ### Ongoing
330 | 
331 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
332 | * Scaling is an iterative process
333 | 
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/pastebin/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/pastebin/pastebin.png
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from mrjob.job import MRJob
4 | 
5 | 
6 | class HitCounts(MRJob):
7 | 
8 |     def extract_url(self, line):
9 |         """Extract the generated url from the log line."""
10 |         pass
11 | 
12 |     def extract_year_month(self, line):
13 |         """Return the year and month portions of the timestamp."""
14 |         pass
15 | 
16 |     def mapper(self, _, line):
17 |         """Parse each log line, extract and transform relevant lines.
18 | 
19 |         Emit key value pairs of the form:
20 | 
21 |         (2016-01, url0), 1
22 |         (2016-01, url0), 1
23 |         (2016-01, url1), 1
24 |         """
25 |         url = self.extract_url(line)
26 |         period = self.extract_year_month(line)
27 |         yield (period, url), 1
28 | 
29 |     def reducer(self, key, values):
30 |         """Sum values for each key.
31 | 
32 |         (2016-01, url0), 2
33 |         (2016-01, url1), 1
34 |         """
35 |         yield key, sum(values)
36 | 
37 |     def steps(self):
38 |         """Run the map and reduce steps."""
39 |         return [
40 |             self.mr(mapper=self.mapper,
41 |                     reducer=self.reducer)
42 |         ]
43 | 
44 | 
45 | if __name__ == '__main__':
46 |     HitCounts.run()
47 | 
--------------------------------------------------------------------------------
/solutions/system_design/pastebin/pastebin_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/pastebin/pastebin_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/README.md:
--------------------------------------------------------------------------------
1 | # Design a key-value cache to save the results of the most recent web server queries
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication.  Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | ## Step 1: Outline use cases and constraints
6 | 
7 | > Gather requirements and scope the problem.
8 | > Ask questions to clarify use cases and constraints.
9 | > Discuss assumptions.
10 | 
11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
12 | 
13 | ### Use cases
14 | 
15 | #### We'll scope the problem to handle only the following use cases
16 | 
17 | * **User** sends a search request resulting in a cache hit
18 | * **User** sends a search request resulting in a cache miss
19 | * **Service** has high availability
20 | 
21 | ### Constraints and assumptions
22 | 
23 | #### State assumptions
24 | 
25 | * Traffic is not evenly distributed
26 |     * Popular queries should almost always be in the cache
27 |     * Need to determine how to expire/refresh
28 | * Serving from cache requires fast lookups
29 |     * Low latency between machines
30 | * Limited memory in cache
31 |     * Need to determine what to keep/remove
32 |     * Need to cache millions of queries
33 | * 10 million users
34 | * 10 billion queries per month
35 | 
36 | #### Calculate usage
37 | 
38 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
39 | 
40 | * Cache stores ordered list of key: query, value: results
41 |     * `query` - 50 bytes
42 |     * `title` - 20 bytes
43 |     * `snippet` - 200 bytes
44 |     * Total: 270 bytes
45 | * 2.7 TB of cache data per month if all 10 billion queries are unique and all are stored
46 |     * 270 bytes per search * 10 billion searches per month
47 |     * Assumptions state limited memory, need to determine how to expire contents
48 | * 4,000 requests per second
49 | 
50 | Handy conversion guide:
51 | 
52 | * 2.5 million seconds per month
53 | * 1 request per second = 2.5 million requests per month
54 | * 40 requests per second = 100 million requests per month
55 | * 400 requests per second = 1 billion requests per month
56 | 
57 | ## Step 2: Create a high level design
58 | 
59 | > Outline a high level design with all important components.
60 | 
61 | ![Imgur](http://i.imgur.com/KqZ3dSx.png)
62 | 
63 | ## Step 3: Design core components
64 | 
65 | > Dive into details for each core component.
66 | 
67 | ### Use case: User sends a request resulting in a cache hit
68 | 
69 | Popular queries can be served from a **Memory Cache** such as Redis or Memcached to reduce read latency and to avoid overloading the **Reverse Index Service** and **Document Service**.  Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.
70 | 
71 | Since the cache has limited capacity, we'll use a least recently used (LRU) approach to expire older entries.
72 | 
73 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
74 | * The **Web Server** forwards the request to the **Query API** server
75 | * The **Query API** server does the following:
76 |     * Parses the query
77 |         * Removes markup
78 |         * Breaks up the text into terms
79 |         * Fixes typos
80 |         * Normalizes capitalization
81 |         * Converts the query to use boolean operations
82 |     * Checks the **Memory Cache** for the content matching the query
83 |         * If there's a hit in the **Memory Cache**, the **Memory Cache** does the following:
84 |             * Updates the cached entry's position to the front of the LRU list
85 |             * Returns the cached contents
86 |         * Else, the **Query API** does the following:
87 |             * Uses the **Reverse Index Service** to find documents matching the query
88 |                 * The **Reverse Index Service** ranks the matching results and returns the top ones
89 |             * Uses the **Document Service** to return titles and snippets
90 |             * Updates the **Memory Cache** with the contents, placing the entry at the front of the LRU list
91 | 
92 | #### Cache implementation
93 | 
94 | The cache can use a doubly-linked list: new items will be added to the head while items to expire will be removed from the tail.  We'll use a hash table for fast lookups of each linked list node.
95 | 
96 | **Clarify with your interviewer how much code you are expected to write**.
97 | 
98 | **Query API Server** implementation:
99 | 
100 | ```
101 | class QueryApi(object):
102 | 
103 |     def __init__(self, memory_cache, reverse_index_service):
104 |         self.memory_cache = memory_cache
105 |         self.reverse_index_service = reverse_index_service
106 | 
107 |     def parse_query(self, query):
108 |         """Remove markup, break text into terms, deal with typos,
109 |         normalize capitalization, convert to use boolean operations.
110 |         """
111 |         ...
112 | 
113 |     def process_query(self, query):
114 |         query = self.parse_query(query)
115 |         results = self.memory_cache.get(query)
116 |         if results is None:
117 |             results = self.reverse_index_service.process_search(query)
118 |             self.memory_cache.set(query, results)
119 |         return results
120 | ```
121 | 
122 | **Node** implementation:
123 | 
124 | ```
125 | class Node(object):
126 | 
127 |     def __init__(self, query, results):
128 |         self.query = query
129 |         self.results = results
130 | ```
131 | 
132 | **LinkedList** implementation:
133 | 
134 | ```
135 | class LinkedList(object):
136 | 
137 |     def __init__(self):
138 |         self.head = None
139 |         self.tail = None
140 | 
141 |     def move_to_front(self, node):
142 |         ...
143 | 
144 |     def append_to_front(self, node):
145 |         ...
146 | 
147 |     def remove_from_tail(self):
148 |         ...
149 | ```
150 | 
151 | **Cache** implementation:
152 | 
153 | ```
154 | class Cache(object):
155 | 
156 |     def __init__(self, MAX_SIZE):
157 |         self.MAX_SIZE = MAX_SIZE
158 |         self.size = 0
159 |         self.lookup = {}  # key: query, value: node
160 |         self.linked_list = LinkedList()
161 | 
162 |     def get(self, query):
163 |         """Get the stored query result from the cache.
164 | 
165 |         Accessing a node updates its position to the front of the LRU list.
166 |         """
167 |         node = self.lookup.get(query)
168 |         if node is None:
169 |             return None
170 |         self.linked_list.move_to_front(node)
171 |         return node.results
172 | 
173 |     def set(self, query, results):
174 |         """Set the result for the given query key in the cache.
175 | 
176 |         When updating an entry, updates its position to the front of the LRU list.
177 |         If the entry is new and the cache is at capacity, removes the oldest entry
178 |         before the new entry is added.
179 |         """
180 |         node = self.lookup.get(query)
181 |         if node is not None:
182 |             # Key exists in cache, update the value
183 |             node.results = results
184 |             self.linked_list.move_to_front(node)
185 |         else:
186 |             # Key does not exist in cache
187 |             if self.size == self.MAX_SIZE:
188 |                 # Remove the oldest entry from the linked list and lookup
189 |                 self.lookup.pop(self.linked_list.tail.query, None)
190 |                 self.linked_list.remove_from_tail()
191 |             else:
192 |                 self.size += 1
193 |             # Add the new key and value
194 |             new_node = Node(query, results)
195 |             self.linked_list.append_to_front(new_node)
196 |             self.lookup[query] = new_node
197 | ```
198 | 
199 | #### When to update the cache
200 | 
201 | The cache should be updated when:
202 | 
203 | * The page contents change
204 | * The page is removed or a new page is added
205 | * The page rank changes
206 | 
207 | The most straightforward way to handle these cases is to simply set a max time that a cached entry can stay in the cache before it is updated, usually referred to as time to live (TTL).
208 | 
209 | Refer to [When to update the cache](https://github.com/donnemartin/system-design-primer#when-to-update-the-cache) for tradeoffs and alternatives.  The approach above describes [cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside).
210 | 
211 | ## Step 4: Scale the design
212 | 
213 | > Identify and address bottlenecks, given the constraints.
214 | 
215 | ![Imgur](http://i.imgur.com/4j99mhe.png)
216 | 
217 | **Important: Do not simply jump right into the final design from the initial design!**
218 | 
219 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat.  See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
220 | 
221 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them.  For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**?  **CDN**?  **Master-Slave Replicas**?  What are the alternatives and **Trade-Offs** for each?
222 | 
223 | We'll introduce some components to complete the design and to address scalability issues.  Internal load balancers are not shown to reduce clutter.
224 | 
225 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
226 | 
227 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
228 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
229 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
230 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
231 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
232 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
233 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
234 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
235 | 
236 | ### Expanding the Memory Cache to many machines
237 | 
238 | To handle the heavy request load and the large amount of memory needed, we'll scale horizontally.  We have three main options for how to store the data on our **Memory Cache** cluster:
239 | 
240 | * **Each machine in the cache cluster has its own cache** - Simple, although it will likely result in a low cache hit rate.
241 | * **Each machine in the cache cluster has a copy of the cache** - Simple, although it is an inefficient use of memory.
242 | * **The cache is [sharded](https://github.com/donnemartin/system-design-primer#sharding) across all machines in the cache cluster** - More complex, although it is likely the best option.  We could use hashing to determine which machine could have the cached results of a query using `machine = hash(query)`.  We'll likely want to use [consistent hashing](https://github.com/donnemartin/system-design-primer#under-development), as shown in the sketch below.
243 | 
244 | ## Additional talking points
245 | 
246 | > Additional topics to dive into, depending on the problem scope and time remaining.
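Below is a minimal sketch of the consistent hashing idea from the sharding option above.  The ring with virtual nodes is the standard technique, but `ConsistentHashRing` and its details are illustrative assumptions, not part of the original design:

```
import bisect
import hashlib


class ConsistentHashRing(object):

    def __init__(self, machines, replicas=100):
        # Multiple virtual nodes per machine smooth out the key distribution
        self.replicas = replicas
        self.ring = []          # Sorted hashes of all virtual nodes
        self.machine_for = {}   # Virtual node hash -> machine
        for machine in machines:
            self.add_machine(machine)

    def _hash(self, key):
        return int(hashlib.md5(key.encode('utf-8')).hexdigest(), 16)

    def add_machine(self, machine):
        for i in range(self.replicas):
            virtual_node = self._hash('%s:%d' % (machine, i))
            bisect.insort(self.ring, virtual_node)
            self.machine_for[virtual_node] = machine

    def machine_for_query(self, query):
        # Walk clockwise to the first virtual node at or past the query's hash
        index = bisect.bisect(self.ring, self._hash(query)) % len(self.ring)
        return self.machine_for[self.ring[index]]
```

Unlike a plain `machine = hash(query) % num_machines`, adding or removing a cache machine remaps only about `1/num_machines` of the keys instead of nearly all of them.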
247 | 
248 | ### SQL scaling patterns
249 | 
250 | * [Read replicas](https://github.com/donnemartin/system-design-primer#master-slave)
251 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
252 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
253 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
254 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
255 | 
256 | ### NoSQL
257 | 
258 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store)
259 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store)
260 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store)
261 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database)
262 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
263 | 
264 | ### Caching
265 | 
266 | * Where to cache
267 |     * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching)
268 |     * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching)
269 |     * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching)
270 |     * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching)
271 |     * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching)
272 | * What to cache
273 |     * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
274 |     * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
275 | * When to update the cache
276 |     * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside)
277 |     * [Write-through](https://github.com/donnemartin/system-design-primer#write-through)
278 |     * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
279 |     * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead)
280 | 
281 | ### Asynchronism and microservices
282 | 
283 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues)
284 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues)
285 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure)
286 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices)
287 | 
288 | ### Communications
289 | 
290 | * Discuss tradeoffs:
291 |     * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
292 |     * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
293 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery)
294 | 
295 | ### Security
296 | 
297 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security).
298 | 
299 | ### Latency numbers
300 | 
301 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
302 | 
303 | ### Ongoing
304 | 
305 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
306 | * Scaling is an iterative process
307 | 
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/query_cache/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/query_cache/query_cache.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/query_cache/query_cache_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/query_cache/query_cache_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | class QueryApi(object):
4 | 
5 |     def __init__(self, memory_cache, reverse_index_cluster):
6 |         self.memory_cache = memory_cache
7 |         self.reverse_index_cluster = reverse_index_cluster
8 | 
9 |     def parse_query(self, query):
10 |         """Remove markup, break text into terms, deal with typos,
11 |         normalize capitalization, convert to use boolean operations.
12 |         """
13 |         ...
14 | 
15 |     def process_query(self, query):
16 |         query = self.parse_query(query)
17 |         results = self.memory_cache.get(query)
18 |         if results is None:
19 |             results = self.reverse_index_cluster.process_search(query)
20 |             self.memory_cache.set(query, results)
21 |         return results
22 | 
23 | 
24 | class Node(object):
25 | 
26 |     def __init__(self, query, results):
27 |         self.query = query
28 |         self.results = results
29 | 
30 | 
31 | class LinkedList(object):
32 | 
33 |     def __init__(self):
34 |         self.head = None
35 |         self.tail = None
36 | 
37 |     def move_to_front(self, node):
38 |         ...
39 | 
40 |     def append_to_front(self, node):
41 |         ...
42 | 
43 |     def remove_from_tail(self):
44 |         ...
45 | 
46 | 
47 | class Cache(object):
48 | 
49 |     def __init__(self, MAX_SIZE):
50 |         self.MAX_SIZE = MAX_SIZE
51 |         self.size = 0
52 |         self.lookup = {}
53 |         self.linked_list = LinkedList()
54 | 
55 |     def get(self, query):
56 |         """Get the stored query result from the cache.
57 | 
58 |         Accessing a node updates its position to the front of the LRU list.
59 |         """
60 |         node = self.lookup.get(query)
61 |         if node is None:
62 |             return None
63 |         self.linked_list.move_to_front(node)
64 |         return node.results
65 | 
66 |     def set(self, query, results):
67 |         """Set the result for the given query key in the cache.
68 | 
69 |         When updating an entry, updates its position to the front of the LRU list.
70 |         If the entry is new and the cache is at capacity, removes the oldest entry
71 |         before the new entry is added.
72 |         """
73 |         node = self.lookup.get(query)
74 |         if node is not None:
75 |             # Key exists in cache, update the value
76 |             node.results = results
77 |             self.linked_list.move_to_front(node)
78 |         else:
79 |             # Key does not exist in cache
80 |             if self.size == self.MAX_SIZE:
81 |                 # Remove the oldest entry from the linked list and lookup
82 |                 self.lookup.pop(self.linked_list.tail.query, None)
83 |                 self.linked_list.remove_from_tail()
84 |             else:
85 |                 self.size += 1
86 |             # Add the new key and value
87 |             new_node = Node(query, results)
88 |             self.linked_list.append_to_front(new_node)
89 |             self.lookup[query] = new_node
90 | 
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/README.md:
--------------------------------------------------------------------------------
1 | # Design Amazon's sales rank by category feature
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | ## Step 1: Outline use cases and constraints
6 | 
7 | > Gather requirements and scope the problem.
8 | > Ask questions to clarify use cases and constraints.
9 | > Discuss assumptions.
10 | 
11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
12 | 
13 | ### Use cases
14 | 
15 | #### We'll scope the problem to handle only the following use case
16 | 
17 | * **Service** calculates the past week's most popular products by category
18 | * **User** views the past week's most popular products by category
19 | * **Service** has high availability
20 | 
21 | #### Out of scope
22 | 
23 | * The general e-commerce site
24 | * Design components only for calculating sales rank
25 | 
26 | ### Constraints and assumptions
27 | 
28 | #### State assumptions
29 | 
30 | * Traffic is not evenly distributed
31 | * Items can be in multiple categories
32 | * Items cannot change categories
33 | * There are no subcategories, e.g. `foo/bar/baz`
34 | * Results must be updated hourly
35 |     * More popular products might need to be updated more frequently
36 | * 10 million products
37 | * 1000 categories
38 | * 1 billion transactions per month
39 | * 100 billion read requests per month
40 | * 100:1 read to write ratio
41 | 
42 | #### Calculate usage
43 | 
44 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
45 | 
46 | * Size per transaction:
47 |     * `created_at` - 5 bytes
48 |     * `product_id` - 8 bytes
49 |     * `category_id` - 4 bytes
50 |     * `seller_id` - 8 bytes
51 |     * `buyer_id` - 8 bytes
52 |     * `quantity` - 4 bytes
53 |     * `total_price` - 5 bytes
54 |     * Total: ~40 bytes
55 | * 40 GB of new transaction content per month
56 |     * 40 bytes per transaction * 1 billion transactions per month
57 |     * 1.44 TB of new transaction content in 3 years
58 |     * Assume most are new transactions instead of updates to existing ones
59 | * 400 transactions per second on average
60 | * 40,000 read requests per second on average
61 | 
62 | Handy conversion guide:
63 | 
64 | * 2.5 million seconds per month
65 | * 1 request per second = 2.5 million requests per month
66 | * 40 requests per second = 100 million requests per month
67 | * 400 requests per second = 1 billion requests per month
68 | 
69 | ## Step 2: Create a high level design
70 | 
71 | > Outline a high level design with all important components.
72 | 
73 | ![Imgur](http://i.imgur.com/vwMa1Qu.png)
74 | 
75 | ## Step 3: Design core components
76 | 
77 | > Dive into details for each core component.
78 | 
79 | ### Use case: Service calculates the past week's most popular products by category
80 | 
81 | We could store the raw **Sales API** server log files on a managed **Object Store** such as Amazon S3, rather than managing our own distributed file system.
82 | 
83 | **Clarify with your interviewer how much code you are expected to write**.
84 | 
85 | We'll assume this is a sample log entry, tab delimited:
86 | 
87 | ```
88 | timestamp   product_id   category_id   qty   total_price   seller_id   buyer_id
89 | t1          product1     category1     2     20.00         1           1
90 | t2          product1     category2     2     20.00         2           2
91 | t2          product1     category2     1     10.00         2           3
92 | t3          product2     category1     3     7.00          3           4
93 | t4          product3     category2     7     2.00          4           5
94 | t5          product4     category1     1     5.00          5           6
95 | ...
96 | ```
97 | 
98 | The **Sales Rank Service** could use **MapReduce**, using the **Sales API** server log files as input and writing the results to an aggregate table `sales_rank` in a **SQL Database**. We should discuss the [use cases and tradeoffs between choosing SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql).
99 | 
100 | We'll use a multi-step **MapReduce**:
101 | 
102 | * **Step 1** - Transform the data to `(category, product_id), sum(quantity)`
103 | * **Step 2** - Perform a distributed sort
104 | 
105 | ```
106 | class SalesRanker(MRJob):
107 | 
108 |     def within_past_week(self, timestamp):
109 |         """Return True if timestamp is within past week, False otherwise."""
110 |         ...
111 | 
112 |     def mapper(self, _, line):
113 |         """Parse each log line, extract and transform relevant lines.
114 | 
115 |         Emit key value pairs of the form:
116 | 
117 |         (category1, product1), 2
118 |         (category2, product1), 2
119 |         (category2, product1), 1
120 |         (category1, product2), 3
121 |         (category2, product3), 7
122 |         (category1, product4), 1
123 |         """
124 |         timestamp, product_id, category_id, quantity, total_price, seller_id, \
125 |             buyer_id = line.split('\t')
126 |         if self.within_past_week(timestamp):
127 |             yield (category_id, product_id), int(quantity)
128 | 
129 |     def reducer(self, key, values):
130 |         """Sum values for each key.
131 | 
132 |         (category1, product1), 2
133 |         (category2, product1), 3
134 |         (category1, product2), 3
135 |         (category2, product3), 7
136 |         (category1, product4), 1
137 |         """
138 |         yield key, sum(values)
139 | 
140 |     def mapper_sort(self, key, value):
141 |         """Construct key to ensure proper sorting.
142 | 
143 |         Transform key and value to the form:
144 | 
145 |         (category1, 2), product1
146 |         (category2, 3), product1
147 |         (category1, 3), product2
148 |         (category2, 7), product3
149 |         (category1, 1), product4
150 | 
151 |         The shuffle/sort step of MapReduce will then do a
152 |         distributed sort on the keys, resulting in:
153 | 
154 |         (category1, 1), product4
155 |         (category1, 2), product1
156 |         (category1, 3), product2
157 |         (category2, 3), product1
158 |         (category2, 7), product3
159 |         """
160 |         category_id, product_id = key
161 |         quantity = value
162 |         yield (category_id, quantity), product_id
163 | 
164 |     def reducer_identity(self, key, value):
165 |         yield key, value
166 | 
167 |     def steps(self):
168 |         """Run the map and reduce steps."""
169 |         return [
170 |             self.mr(mapper=self.mapper,
171 |                     reducer=self.reducer),
172 |             self.mr(mapper=self.mapper_sort,
173 |                     reducer=self.reducer_identity),
174 |         ]
175 | ```
176 | 
177 | The result would be the following sorted list, which we could insert into the `sales_rank` table:
178 | 
179 | ```
180 | (category1, 1), product4
181 | (category1, 2), product1
182 | (category1, 3), product2
183 | (category2, 3), product1
184 | (category2, 7), product3
185 | ```
186 | 
187 | The `sales_rank` table could have the following structure:
188 | 
189 | ```
190 | id int NOT NULL AUTO_INCREMENT
191 | category_id int NOT NULL
192 | total_sold int NOT NULL
193 | product_id int NOT NULL
194 | PRIMARY KEY(id)
195 | FOREIGN KEY(category_id) REFERENCES Categories(id)
196 | FOREIGN KEY(product_id) REFERENCES Products(id)
197 | ```
198 | 
199 | We'll create an [index](https://github.com/donnemartin/system-design-primer#use-good-indices) on `id`, `category_id`, and `product_id` to speed up lookups (log-time instead of scanning the entire table) and to keep the data in memory. Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.
200 | 
201 | ### Use case: User views the past week's most popular products by category
202 | 
203 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
204 | * The **Web Server** forwards the request to the **Read API** server
205 | * The **Read API** server reads from the **SQL Database** `sales_rank` table
206 | 
207 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
208 | 
209 | ```
210 | $ curl https://amazon.com/api/v1/popular?category_id=1234
211 | ```
212 | 
213 | Response:
214 | 
215 | ```
216 | {
217 |     "id": "100",
218 |     "category_id": "1234",
219 |     "total_sold": "100000",
220 |     "product_id": "50",
221 | },
222 | {
223 |     "id": "53",
224 |     "category_id": "1234",
225 |     "total_sold": "90000",
226 |     "product_id": "200",
227 | },
228 | {
229 |     "id": "75",
230 |     "category_id": "1234",
231 |     "total_sold": "80000",
232 |     "product_id": "3",
233 | },
234 | ```
235 | 
236 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
237 | 
238 | ## Step 4: Scale the design
239 | 
240 | > Identify and address bottlenecks, given the constraints.
241 | 
242 | ![Imgur](http://i.imgur.com/MzExP06.png)
243 | 
244 | **Important: Do not simply jump right into the final design from the initial design!**
245 | 
246 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat. See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
247 | 
248 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them. For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**? **CDN**? **Master-Slave Replicas**? What are the alternatives and **Trade-Offs** for each?
249 | 
250 | We'll introduce some components to complete the design and to address scalability issues. Internal load balancers are not shown to reduce clutter.
251 | 
252 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
253 | 
254 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
255 | * [CDN](https://github.com/donnemartin/system-design-primer#content-delivery-network)
256 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
257 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
258 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
259 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
260 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
261 | * [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)
262 | * [SQL write master-slave failover](https://github.com/donnemartin/system-design-primer#fail-over)
263 | * [Master-slave replication](https://github.com/donnemartin/system-design-primer#master-slave-replication)
264 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
265 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
266 | 
267 | The **Analytics Database** could use a data warehousing solution such as Amazon Redshift or Google BigQuery.
268 | 
269 | We might only want to store a limited time period of data in the database, while storing the rest in a data warehouse or in an **Object Store**. An **Object Store** such as Amazon S3 can comfortably handle the constraint of 40 GB of new content per month.
270 | 
271 | To address the 40,000 *average* read requests per second (higher at peak), traffic for popular content (and its sales rank) should be handled by the **Memory Cache** instead of the database. The **Memory Cache** is also useful for handling the unevenly distributed traffic and traffic spikes. With the large volume of reads, the **SQL Read Replicas** might not be able to handle the cache misses. We'll probably need to employ additional SQL scaling patterns.
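To make the **Memory Cache** usage concrete, here is a minimal cache-aside sketch for serving a category's sales rank, assuming Redis accessed through the `redis-py` client; the host name, key format, TTL, and `db_query` callback are illustrative:

```
# -*- coding: utf-8 -*-

import json

import redis

cache = redis.Redis(host='memory-cache', port=6379)

TTL_SECONDS = 60 * 60  # results must be updated hourly per the constraints


def popular_products(category_id, db_query):
    """Cache-aside read of the past week's most popular products.

    Serves hot categories from the Memory Cache and falls back to the
    sales_rank table (via the db_query callback) on a miss.
    """
    key = 'sales_rank:%s' % category_id
    cached = cache.get(key)
    if cached is not None:
        return json.loads(cached)
    results = db_query(category_id)  # read from the sales_rank table
    cache.setex(key, TTL_SECONDS, json.dumps(results))
    return results
```

Popular categories stay hot in the cache while the long tail falls through to the **SQL Read Replicas**, and entries expire after an hour to match the hourly update constraint.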
272 | 273 | 400 *average* writes per second (higher at peak) might be tough for a single **SQL Write Master-Slave**, also pointing to a need for additional scaling techniques. 274 | 275 | SQL scaling patterns include: 276 | 277 | * [Federation](https://github.com/donnemartin/system-design-primer#federation) 278 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding) 279 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization) 280 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning) 281 | 282 | We should also consider moving some data to a **NoSQL Database**. 283 | 284 | ## Additional talking points 285 | 286 | > Additional topics to dive into, depending on the problem scope and time remaining. 287 | 288 | #### NoSQL 289 | 290 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store) 291 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store) 292 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store) 293 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database) 294 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 295 | 296 | ### Caching 297 | 298 | * Where to cache 299 | * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching) 300 | * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching) 301 | * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching) 302 | * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching) 303 | * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching) 304 | * What to cache 305 | * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level) 306 | * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level) 307 | * When to update the cache 308 | * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside) 309 | * [Write-through](https://github.com/donnemartin/system-design-primer#write-through) 310 | * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back) 311 | * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead) 312 | 313 | ### Asynchronism and microservices 314 | 315 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues) 316 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues) 317 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure) 318 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices) 319 | 320 | ### Communications 321 | 322 | * Discuss tradeoffs: 323 | * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest) 324 | * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc) 325 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery) 326 | 327 | ### Security 328 | 329 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security). 
330 | 
331 | ### Latency numbers
332 | 
333 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
334 | 
335 | ### Ongoing
336 | 
337 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
338 | * Scaling is an iterative process
339 | 
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/sales_rank/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/sales_rank/sales_rank.png
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/sales_rank/sales_rank_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/sales_rank/sales_rank_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from mrjob.job import MRJob
4 | 
5 | 
6 | class SalesRanker(MRJob):
7 | 
8 |     def within_past_week(self, timestamp):
9 |         """Return True if timestamp is within past week, False otherwise."""
10 |         ...
11 | 
12 |     def mapper(self, _, line):
13 |         """Parse each log line, extract and transform relevant lines.
14 | 
15 |         Emit key value pairs of the form:
16 | 
17 |         (foo, p1), 2
18 |         (bar, p1), 2
19 |         (bar, p1), 1
20 |         (foo, p2), 3
21 |         (bar, p3), 10
22 |         (foo, p4), 1
23 |         """
24 |         timestamp, product_id, category, quantity = line.split('\t')
25 |         if self.within_past_week(timestamp):
26 |             yield (category, product_id), int(quantity)
27 | 
28 |     def reducer(self, key, values):
29 |         """Sum values for each key.
30 | 
31 |         (foo, p1), 2
32 |         (bar, p1), 3
33 |         (foo, p2), 3
34 |         (bar, p3), 10
35 |         (foo, p4), 1
36 |         """
37 |         yield key, sum(values)
38 | 
39 |     def mapper_sort(self, key, value):
40 |         """Construct key to ensure proper sorting.
41 | 
42 |         Transform key and value to the form:
43 | 
44 |         (foo, 2), p1
45 |         (bar, 3), p1
46 |         (foo, 3), p2
47 |         (bar, 10), p3
48 |         (foo, 1), p4
49 | 
50 |         The shuffle/sort step of MapReduce will then do a
51 |         distributed sort on the keys, resulting in:
52 | 
53 |         (bar, 3), p1
54 |         (bar, 10), p3
55 |         (foo, 1), p4
56 |         (foo, 2), p1
57 |         (foo, 3), p2
58 |         """
59 |         category, product_id = key
60 |         quantity = value
61 |         yield (category, quantity), product_id
62 | 
63 |     def reducer_identity(self, key, value):
64 |         yield key, value
65 | 
66 |     def steps(self):
67 |         """Run the map and reduce steps."""
68 |         return [
69 |             self.mr(mapper=self.mapper,
70 |                     reducer=self.reducer),
71 |             self.mr(mapper=self.mapper_sort,
72 |                     reducer=self.reducer_identity),
73 |         ]
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     SalesRanker.run()
--------------------------------------------------------------------------------
/solutions/system_design/scaling_aws/README.md:
--------------------------------------------------------------------------------
1 | # Design a system that scales to millions of users on AWS
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | ## Step 1: Outline use cases and constraints
6 | 
7 | > Gather requirements and scope the problem.
8 | > Ask questions to clarify use cases and constraints.
9 | > Discuss assumptions.
10 | 
11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
12 | 
13 | ### Use cases
14 | 
15 | Solving this problem takes an iterative approach of: 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat, which is a good pattern for evolving basic designs to scalable designs.
16 | 
17 | Unless you have a background in AWS or are applying for a position that requires AWS knowledge, AWS-specific details are not a requirement. However, **many of the principles discussed in this exercise apply more generally outside of the AWS ecosystem.**
18 | 
19 | #### We'll scope the problem to handle only the following use cases
20 | 
21 | * **User** makes a read or write request
22 |     * **Service** does processing, stores user data, then returns the results
23 | * **Service** needs to evolve from serving a small number of users to millions of users
24 |     * Discuss general scaling patterns as we evolve an architecture to handle a large number of users and requests
25 | * **Service** has high availability
26 | 
27 | ### Constraints and assumptions
28 | 
29 | #### State assumptions
30 | 
31 | * Traffic is not evenly distributed
32 | * Need for relational data
33 | * Scale from 1 user to tens of millions of users
34 | * Denote increase of users as:
35 |     * Users+
36 |     * Users++
37 |     * Users+++
38 |     * ...
39 | * 10 million users 40 | * 1 billion writes per month 41 | * 100 billion reads per month 42 | * 100:1 read to write ratio 43 | * 1 KB content per write 44 | 45 | #### Calculate usage 46 | 47 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.** 48 | 49 | * 1 TB of new content per month 50 | * 1 KB per write * 1 billion writes per month 51 | * 36 TB of new content in 3 years 52 | * Assume most writes are from new content instead of updates to existing ones 53 | * 400 writes per second on average 54 | * 40,000 reads per second on average 55 | 56 | Handy conversion guide: 57 | 58 | * 2.5 million seconds per month 59 | * 1 request per second = 2.5 million requests per month 60 | * 40 requests per second = 100 million requests per month 61 | * 400 requests per second = 1 billion requests per month 62 | 63 | ## Step 2: Create a high level design 64 | 65 | > Outline a high level design with all important components. 66 | 67 | ![Imgur](http://i.imgur.com/B8LDKD7.png) 68 | 69 | ## Step 3: Design core components 70 | 71 | > Dive into details for each core component. 72 | 73 | ### Use case: User makes a read or write request 74 | 75 | #### Goals 76 | 77 | * With only 1-2 users, you only need a basic setup 78 | * Single box for simplicity 79 | * Vertical scaling when needed 80 | * Monitor to determine bottlenecks 81 | 82 | #### Start with a single box 83 | 84 | * **Web server** on EC2 85 | * Storage for user data 86 | * [**MySQL Database**](https://github.com/donnemartin/system-design-primer#sql) 87 | 88 | Use **Vertical Scaling**: 89 | 90 | * Simply choose a bigger box 91 | * Keep an eye on metrics to determine how to scale up 92 | * Use basic monitoring to determine bottlenecks: CPU, memory, IO, network, etc 93 | * CloudWatch, top, nagios, statsd, graphite, etc 94 | * Scaling vertically can get very expensive 95 | * No redundancy/failover 96 | 97 | *Trade-offs, alternatives, and additional details:* 98 | 99 | * The alternative to **Vertical Scaling** is [**Horizontal scaling**](https://github.com/donnemartin/system-design-primer#horizontal-scaling) 100 | 101 | #### Start with SQL, consider NoSQL 102 | 103 | The constraints assume there is a need for relational data. We can start off using a **MySQL Database** on the single box. 104 | 105 | *Trade-offs, alternatives, and additional details:* 106 | 107 | * See the [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms) section 108 | * Discuss reasons to use [SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 109 | 110 | #### Assign a public static IP 111 | 112 | * Elastic IPs provide a public endpoint whose IP doesn't change on reboot 113 | * Helps with failover, just point the domain to a new IP 114 | 115 | #### Use a DNS 116 | 117 | Add a **DNS** such as Route 53 to map the domain to the instance's public IP. 
118 | 119 | *Trade-offs, alternatives, and additional details:* 120 | 121 | * See the [Domain name system](https://github.com/donnemartin/system-design-primer#domain-name-system) section 122 | 123 | #### Secure the web server 124 | 125 | * Open up only necessary ports 126 | * Allow the web server to respond to incoming requests from: 127 | * 80 for HTTP 128 | * 443 for HTTPS 129 | * 22 for SSH to only whitelisted IPs 130 | * Prevent the web server from initiating outbound connections 131 | 132 | *Trade-offs, alternatives, and additional details:* 133 | 134 | * See the [Security](https://github.com/donnemartin/system-design-primer#security) section 135 | 136 | ## Step 4: Scale the design 137 | 138 | > Identify and address bottlenecks, given the constraints. 139 | 140 | ### Users+ 141 | 142 | ![Imgur](http://i.imgur.com/rrfjMXB.png) 143 | 144 | #### Assumptions 145 | 146 | Our user count is starting to pick up and the load is increasing on our single box. Our **Benchmarks/Load Tests** and **Profiling** are pointing to the **MySQL Database** taking up more and more memory and CPU resources, while the user content is filling up disk space. 147 | 148 | We've been able to address these issues with **Vertical Scaling** so far. Unfortunately, this has become quite expensive and it doesn't allow for independent scaling of the **MySQL Database** and **Web Server**. 149 | 150 | #### Goals 151 | 152 | * Lighten load on the single box and allow for independent scaling 153 | * Store static content separately in an **Object Store** 154 | * Move the **MySQL Database** to a separate box 155 | * Disadvantages 156 | * These changes would increase complexity and would require changes to the **Web Server** to point to the **Object Store** and the **MySQL Database** 157 | * Additional security measures must be taken to secure the new components 158 | * AWS costs could also increase, but should be weighed with the costs of managing similar systems on your own 159 | 160 | #### Store static content separately 161 | 162 | * Consider using a managed **Object Store** like S3 to store static content 163 | * Highly scalable and reliable 164 | * Server side encryption 165 | * Move static content to S3 166 | * User files 167 | * JS 168 | * CSS 169 | * Images 170 | * Videos 171 | 172 | #### Move the MySQL database to a separate box 173 | 174 | * Consider using a service like RDS to manage the **MySQL Database** 175 | * Simple to administer, scale 176 | * Multiple availability zones 177 | * Encryption at rest 178 | 179 | #### Secure the system 180 | 181 | * Encrypt data in transit and at rest 182 | * Use a Virtual Private Cloud 183 | * Create a public subnet for the single **Web Server** so it can send and receive traffic from the internet 184 | * Create a private subnet for everything else, preventing outside access 185 | * Only open ports from whitelisted IPs for each component 186 | * These same patterns should be implemented for new components in the remainder of the exercise 187 | 188 | *Trade-offs, alternatives, and additional details:* 189 | 190 | * See the [Security](https://github.com/donnemartin/system-design-primer#security) section 191 | 192 | ### Users++ 193 | 194 | ![Imgur](http://i.imgur.com/raoFTXM.png) 195 | 196 | #### Assumptions 197 | 198 | Our **Benchmarks/Load Tests** and **Profiling** show that our single **Web Server** bottlenecks during peak hours, resulting in slow responses and in some cases, downtime. As the service matures, we'd also like to move towards higher availability and redundancy. 
199 | 
200 | #### Goals
201 | 
202 | * The following goals attempt to address the scaling issues with the **Web Server**
203 |     * Based on the **Benchmarks/Load Tests** and **Profiling**, you might only need to implement one or two of these techniques
204 | * Use [**Horizontal Scaling**](https://github.com/donnemartin/system-design-primer#horizontal-scaling) to handle increasing loads and to address single points of failure
205 |     * Add a [**Load Balancer**](https://github.com/donnemartin/system-design-primer#load-balancer) such as Amazon's ELB or HAProxy
206 |         * ELB is highly available
207 |         * If you are configuring your own **Load Balancer**, setting up multiple servers in [active-active](https://github.com/donnemartin/system-design-primer#active-active) or [active-passive](https://github.com/donnemartin/system-design-primer#active-passive) in multiple availability zones will improve availability
208 |         * Terminate SSL on the **Load Balancer** to reduce computational load on backend servers and to simplify certificate administration
209 |     * Use multiple **Web Servers** spread out over multiple availability zones
210 |     * Use multiple **MySQL** instances in [**Master-Slave Failover**](https://github.com/donnemartin/system-design-primer#master-slave-replication) mode across multiple availability zones to improve redundancy
211 | * Separate out the **Web Servers** from the [**Application Servers**](https://github.com/donnemartin/system-design-primer#application-layer)
212 |     * Scale and configure both layers independently
213 |     * **Web Servers** can run as a [**Reverse Proxy**](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
214 |     * For example, you can add **Application Servers** handling **Read APIs** while others handle **Write APIs**
215 | * Move static (and some dynamic) content to a [**Content Delivery Network (CDN)**](https://github.com/donnemartin/system-design-primer#content-delivery-network) such as CloudFront to reduce load and latency
216 | 
217 | *Trade-offs, alternatives, and additional details:*
218 | 
219 | * See the linked content above for details
220 | 
221 | ### Users+++
222 | 
223 | ![Imgur](http://i.imgur.com/OZCxJr0.png)
224 | 
225 | **Note:** **Internal Load Balancers** not shown to reduce clutter
226 | 
227 | #### Assumptions
228 | 
229 | Our **Benchmarks/Load Tests** and **Profiling** show that we are read-heavy (100:1 with writes) and our database is suffering from poor performance due to the high volume of read requests.
230 | 
231 | #### Goals
232 | 
233 | * The following goals attempt to address the scaling issues with the **MySQL Database**
234 |     * Based on the **Benchmarks/Load Tests** and **Profiling**, you might only need to implement one or two of these techniques
235 | * Move the following data to a [**Memory Cache**](https://github.com/donnemartin/system-design-primer#cache) such as ElastiCache to reduce load and latency:
236 |     * Frequently accessed content from **MySQL**
237 |         * First, try to configure the **MySQL Database** cache to see if that is sufficient to relieve the bottleneck before implementing a **Memory Cache**
238 |     * Session data from the **Web Servers**
239 |         * The **Web Servers** become stateless, allowing for **Autoscaling**
240 |     * Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.
241 | * Add [**MySQL Read Replicas**](https://github.com/donnemartin/system-design-primer#master-slave-replication) to reduce load on the write master
242 | * Add more **Web Servers** and **Application Servers** to improve responsiveness
243 | 
244 | *Trade-offs, alternatives, and additional details:*
245 | 
246 | * See the linked content above for details
247 | 
248 | #### Add MySQL read replicas
249 | 
250 | * In addition to adding and scaling a **Memory Cache**, **MySQL Read Replicas** can also help relieve load on the **MySQL Write Master**
251 | * Add logic to the **Web Server** to separate out writes and reads
252 | * Add **Load Balancers** in front of **MySQL Read Replicas** (not pictured to reduce clutter)
253 | * Most services are read-heavy rather than write-heavy
254 | 
255 | *Trade-offs, alternatives, and additional details:*
256 | 
257 | * See the [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms) section
258 | 
259 | ### Users++++
260 | 
261 | ![Imgur](http://i.imgur.com/3X8nmdL.png)
262 | 
263 | #### Assumptions
264 | 
265 | Our **Benchmarks/Load Tests** and **Profiling** show that our traffic spikes during regular business hours in the U.S. and drops significantly when users leave the office. We think we can cut costs by automatically spinning up and down servers based on actual load. We're a small shop, so we'd like to automate as much of the DevOps as possible for **Autoscaling** and for general operations.
266 | 
267 | #### Goals
268 | 
269 | * Add **Autoscaling** to provision capacity as needed
270 |     * Keep up with traffic spikes
271 |     * Reduce costs by powering down unused instances
272 | * Automate DevOps
273 |     * Chef, Puppet, Ansible, etc
274 | * Continue monitoring metrics to address bottlenecks
275 |     * **Host level** - Review a single EC2 instance
276 |     * **Aggregate level** - Review load balancer stats
277 |     * **Log analysis** - CloudWatch, CloudTrail, Loggly, Splunk, Sumo
278 |     * **External site performance** - Pingdom or New Relic
279 |     * **Handle notifications and incidents** - PagerDuty
280 |     * **Error Reporting** - Sentry
281 | 
282 | #### Add autoscaling
283 | 
284 | * Consider a managed service such as AWS **Autoscaling**
285 |     * Create one group for each **Web Server** and one for each **Application Server** type, place each group in multiple availability zones
286 |     * Set a min and max number of instances
287 |     * Trigger to scale up and down through CloudWatch
288 |         * Simple time of day metric for predictable loads or
289 |         * Metrics over a time period:
290 |             * CPU load
291 |             * Latency
292 |             * Network traffic
293 |             * Custom metric
294 | * Disadvantages
295 |     * Autoscaling can introduce complexity
296 |     * It could take some time before a system appropriately scales up to meet increased demand, or scales down when demand drops
297 | 
298 | ### Users+++++
299 | 
300 | ![Imgur](http://i.imgur.com/jj3A5N8.png)
301 | 
302 | **Note:** **Autoscaling** groups not shown to reduce clutter
303 | 
304 | #### Assumptions
305 | 
306 | As the service continues to grow towards the figures outlined in the constraints, we iteratively run **Benchmarks/Load Tests** and **Profiling** to uncover and address new bottlenecks.
307 | 
308 | #### Goals
309 | 
310 | We'll continue to address scaling issues due to the problem's constraints:
311 | 
312 | * If our **MySQL Database** starts to grow too large, we might consider only storing a limited time period of data in the database, while storing the rest in a data warehouse such as Redshift
313 |     * A data warehouse such as Redshift can comfortably handle the constraint of 1 TB of new content per month
314 | * With 40,000 average read requests per second, read traffic for popular content can be addressed by scaling the **Memory Cache**, which is also useful for handling the unevenly distributed traffic and traffic spikes
315 |     * The **SQL Read Replicas** might have trouble handling the cache misses; we'll probably need to employ additional SQL scaling patterns
316 | * 400 average writes per second (with presumably significantly higher peaks) might be tough for a single **SQL Write Master-Slave**, also pointing to a need for additional scaling techniques
317 | 
318 | SQL scaling patterns include:
319 | 
320 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
321 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
322 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
323 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
324 | 
325 | To further address the high read and write requests, we should also consider moving appropriate data to a [**NoSQL Database**](https://github.com/donnemartin/system-design-primer#nosql) such as DynamoDB.
326 | 
327 | We can further separate out our [**Application Servers**](https://github.com/donnemartin/system-design-primer#application-layer) to allow for independent scaling.
Batch processes or computations that do not need to be done in real-time can be done [**Asynchronously**](https://github.com/donnemartin/system-design-primer#asynchronism) with **Queues** and **Workers**: 328 | 329 | * For example, in a photo service, the photo upload and the thumbnail creation can be separated: 330 | * **Client** uploads photo 331 | * **Application Server** puts a job in a **Queue** such as SQS 332 | * The **Worker Service** on EC2 or Lambda pulls work off the **Queue** then: 333 | * Creates a thumbnail 334 | * Updates a **Database** 335 | * Stores the thumbnail in the **Object Store** 336 | 337 | *Trade-offs, alternatives, and additional details:* 338 | 339 | * See the linked content above for details 340 | 341 | ## Additional talking points 342 | 343 | > Additional topics to dive into, depending on the problem scope and time remaining. 344 | 345 | ### SQL scaling patterns 346 | 347 | * [Read replicas](https://github.com/donnemartin/system-design-primer#master-slave) 348 | * [Federation](https://github.com/donnemartin/system-design-primer#federation) 349 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding) 350 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization) 351 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning) 352 | 353 | #### NoSQL 354 | 355 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store) 356 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store) 357 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store) 358 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database) 359 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 360 | 361 | ### Caching 362 | 363 | * Where to cache 364 | * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching) 365 | * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching) 366 | * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching) 367 | * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching) 368 | * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching) 369 | * What to cache 370 | * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level) 371 | * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level) 372 | * When to update the cache 373 | * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside) 374 | * [Write-through](https://github.com/donnemartin/system-design-primer#write-through) 375 | * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back) 376 | * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead) 377 | 378 | ### Asynchronism and microservices 379 | 380 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues) 381 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues) 382 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure) 383 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices) 384 | 385 | ### Communications 386 | 387 | * Discuss tradeoffs: 388 | 
* External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest) 389 | * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc) 390 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery) 391 | 392 | ### Security 393 | 394 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security). 395 | 396 | ### Latency numbers 397 | 398 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know). 399 | 400 | ### Ongoing 401 | 402 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up 403 | * Scaling is an iterative process 404 | -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_1.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_2.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_3.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_4.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_5.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_6.png -------------------------------------------------------------------------------- /solutions/system_design/scaling_aws/scaling_aws_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/scaling_aws/scaling_aws_7.png -------------------------------------------------------------------------------- /solutions/system_design/social_graph/README.md: -------------------------------------------------------------------------------- 1 | # Design the data structures for a social network 2 | 3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.* 4 | 5 | ## Step 1: Outline use cases and constraints 6 | 7 | > Gather requirements and scope the problem. 8 | > Ask questions to clarify use cases and constraints. 9 | > Discuss assumptions. 10 | 11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints. 12 | 13 | ### Use cases 14 | 15 | #### We'll scope the problem to handle only the following use cases 16 | 17 | * **User** searches for someone and sees the shortest path to the searched person 18 | * **Service** has high availability 19 | 20 | ### Constraints and assumptions 21 | 22 | #### State assumptions 23 | 24 | * Traffic is not evenly distributed 25 | * Some searches are more popular than others, while others are only executed once 26 | * Graph data won't fit on a single machine 27 | * Graph edges are unweighted 28 | * 100 million users 29 | * 50 friends per user average 30 | * 1 billion friend searches per month 31 | 32 | Exercise the use of more traditional systems - don't use graph-specific solutions such as [GraphQL](http://graphql.org/) or a graph database like [Neo4j](https://neo4j.com/) 33 | 34 | #### Calculate usage 35 | 36 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.** 37 | 38 | * 5 billion friend relationships 39 | * 100 million users * 50 friends per user average 40 | * 400 search requests per second 41 | 42 | Handy conversion guide: 43 | 44 | * 2.5 million seconds per month 45 | * 1 request per second = 2.5 million requests per month 46 | * 40 requests per second = 100 million requests per month 47 | * 400 requests per second = 1 billion requests per month 48 | 49 | ## Step 2: Create a high level design 50 | 51 | > Outline a high level design with all important components. 52 | 53 | ![Imgur](http://i.imgur.com/wxXyq2J.png) 54 | 55 | ## Step 3: Design core components 56 | 57 | > Dive into details for each core component. 58 | 59 | ### Use case: User searches for someone and sees the shortest path to the searched person 60 | 61 | **Clarify with your interviewer how much code you are expected to write**. 
62 | 
63 | Without the constraint of millions of users (vertices) and billions of friend relationships (edges), we could solve this unweighted shortest path task with a general BFS approach:
64 | 
65 | ```
66 | class Graph(Graph):
67 | 
68 |     def shortest_path(self, source, dest):
69 |         if source is None or dest is None:
70 |             return None
71 |         if source is dest:
72 |             return [source.key]
73 |         prev_node_keys = self._shortest_path(source, dest)
74 |         if prev_node_keys is None:
75 |             return None
76 |         else:
77 |             path_ids = [dest.key]
78 |             prev_node_key = prev_node_keys[dest.key]
79 |             while prev_node_key is not None:
80 |                 path_ids.append(prev_node_key)
81 |                 prev_node_key = prev_node_keys[prev_node_key]
82 |             return path_ids[::-1]
83 | 
84 |     def _shortest_path(self, source, dest):
85 |         queue = deque()
86 |         queue.append(source)
87 |         prev_node_keys = {source.key: None}
88 |         source.visit_state = State.visited
89 |         while queue:
90 |             node = queue.popleft()
91 |             if node is dest:
92 |                 return prev_node_keys
93 |             prev_node = node
94 |             for adj_node in node.adj_nodes.values():
95 |                 if adj_node.visit_state == State.unvisited:
96 |                     queue.append(adj_node)
97 |                     prev_node_keys[adj_node.key] = prev_node.key
98 |                     adj_node.visit_state = State.visited
99 |         return None
100 | ```
101 | 
102 | We won't be able to fit all users on the same machine, so we'll need to [shard](https://github.com/donnemartin/system-design-primer#sharding) users across **Person Servers** and access them with a **Lookup Service**.
103 | 
104 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
105 | * The **Web Server** forwards the request to the **Search API** server
106 | * The **Search API** server forwards the request to the **User Graph Service**
107 | * The **User Graph Service** does the following:
108 |     * Uses the **Lookup Service** to find the **Person Server** where the current user's info is stored
109 |     * Finds the appropriate **Person Server** to retrieve the current user's list of `friend_ids`
110 |     * Runs a BFS search using the current user as the `source` and the current user's `friend_ids` as the ids for each `adjacent_node`
111 |     * To get the `adjacent_node` from a given id:
112 |         * The **User Graph Service** will *again* need to communicate with the **Lookup Service** to determine which **Person Server** stores the `adjacent_node` matching the given id (potential for optimization)
113 | 
114 | **Clarify with your interviewer how much code you should be writing**.
115 | 
116 | **Note**: Error handling is excluded below for simplicity. Ask if you should code proper error handling.
117 | 
118 | **Lookup Service** implementation:
119 | 
120 | ```
121 | class LookupService(object):
122 | 
123 |     def __init__(self):
124 |         self.lookup = self._init_lookup()  # key: person_id, value: person_server
125 | 
126 |     def _init_lookup(self):
127 |         ...
128 | 
129 |     def lookup_person_server(self, person_id):
130 |         return self.lookup[person_id]
131 | ```
132 | 
133 | **Person Server** implementation:
134 | 
135 | ```
136 | class PersonServer(object):
137 | 
138 |     def __init__(self):
139 |         self.people = {}  # key: person_id, value: person
140 | 
141 |     def add_person(self, person):
142 |         ...
143 | 
144 |     def get_people(self, ids):
145 |         results = []
146 |         for person_id in ids:
147 |             if person_id in self.people:
148 |                 results.append(self.people[person_id])
149 |         return results
150 | ```
151 | 
152 | **Person** implementation:
153 | 
154 | ```
155 | class Person(object):
156 | 
157 |     def __init__(self, id, name, friend_ids):
158 |         self.id = id
159 |         self.name = name
160 |         self.friend_ids = friend_ids
161 | ```
162 | 
163 | **User Graph Service** implementation:
164 | 
165 | ```
166 | class UserGraphService(object):
167 | 
168 |     def __init__(self, lookup_service):
169 |         self.lookup_service = lookup_service
170 | 
171 |     def person(self, person_id):
172 |         person_server = self.lookup_service.lookup_person_server(person_id)
173 |         return person_server.get_people([person_id])[0]  # assumes the id exists
174 | 
175 |     def shortest_path(self, source_key, dest_key):
176 |         if source_key is None or dest_key is None:
177 |             return None
178 |         if source_key == dest_key:
179 |             return [source_key]
180 |         prev_node_keys = self._shortest_path(source_key, dest_key)
181 |         if prev_node_keys is None:
182 |             return None
183 |         else:
184 |             # Iterate through the path_ids backwards, starting at dest_key
185 |             path_ids = [dest_key]
186 |             prev_node_key = prev_node_keys[dest_key]
187 |             while prev_node_key is not None:
188 |                 path_ids.append(prev_node_key)
189 |                 prev_node_key = prev_node_keys[prev_node_key]
190 |             # Reverse the list since we iterated backwards
191 |             return path_ids[::-1]
192 | 
193 |     def _shortest_path(self, source_key, dest_key):
194 |         # Use the id to get the Person
195 |         source = self.person(source_key)
196 |         # Update our bfs queue
197 |         queue = deque()
198 |         queue.append(source)
199 |         # prev_node_keys keeps track of each hop from
200 |         # the source_key to the dest_key
201 |         prev_node_keys = {source_key: None}
202 |         # We'll use visited_ids to keep track of which nodes we've
203 |         # visited, which can be different from a typical bfs where
204 |         # this can be stored in the node itself
205 |         visited_ids = set()
206 |         visited_ids.add(source.id)
207 |         while queue:
208 |             node = queue.popleft()
209 |             if node.id == dest_key:
210 |                 return prev_node_keys
211 |             prev_node = node
212 |             for friend_id in node.friend_ids:
213 |                 if friend_id not in visited_ids:
214 |                     friend_node = self.person(friend_id)
215 |                     queue.append(friend_node)
216 |                     prev_node_keys[friend_id] = prev_node.id
217 |                     visited_ids.add(friend_id)
218 |         return None
219 | ```
220 | 
221 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
222 | 
223 | ```
224 | $ curl https://social.com/api/v1/friend_search?person_id=1234
225 | ```
226 | 
227 | Response:
228 | 
229 | ```
230 | {
231 |     "person_id": "100",
232 |     "name": "foo",
233 |     "link": "https://social.com/foo",
234 | },
235 | {
236 |     "person_id": "53",
237 |     "name": "bar",
238 |     "link": "https://social.com/bar",
239 | },
240 | {
241 |     "person_id": "1234",
242 |     "name": "baz",
243 |     "link": "https://social.com/baz",
244 | },
245 | ```
246 | 
247 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
248 | 
249 | ## Step 4: Scale the design
250 | 
251 | > Identify and address bottlenecks, given the constraints.
249 | ## Step 4: Scale the design
250 | 
251 | > Identify and address bottlenecks, given the constraints.
252 | 
253 | ![Imgur](http://i.imgur.com/cdCv5g7.png)
254 | 
255 | **Important: Do not simply jump right into the final design from the initial design!**
256 | 
257 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat. See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
258 | 
259 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them. For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**? **CDN**? **Master-Slave Replicas**? What are the alternatives and **Trade-Offs** for each?
260 | 
261 | We'll introduce some components to complete the design and to address scalability issues. Internal load balancers are not shown to reduce clutter.
262 | 
263 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
264 | 
265 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
266 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
267 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
268 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
269 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
270 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
271 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
272 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
273 | 
274 | To address the constraint of 400 *average* read requests per second (higher at peak), person data can be served from a **Memory Cache** such as Redis or Memcached to reduce response times and to reduce traffic to downstream services (see the read-through cache sketch below). This could be especially useful for people who do multiple searches in succession and for people who are well-connected. Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.<sup>1</sup>
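
As a sketch of that read-through cache, assuming a `redis-py` client (the key format and the ten minute TTL are illustrative choices, not requirements):

```
import json

import redis


class CachedUserGraphService(UserGraphService):
    """Read-through cache in front of the Person Servers (sketch)."""

    def __init__(self, lookup_service, cache=None):
        super(CachedUserGraphService, self).__init__(lookup_service)
        self.cache = cache or redis.Redis()

    def person(self, person_id):
        key = 'person:{0}'.format(person_id)
        cached = self.cache.get(key)
        if cached is not None:
            data = json.loads(cached)
            return Person(data['id'], data['name'], data['friend_ids'])
        person = super(CachedUserGraphService, self).person(person_id)
        self.cache.set(key, json.dumps({'id': person.id, 'name': person.name,
                                        'friend_ids': person.friend_ids}),
                       ex=600)  # cache entries expire after ten minutes
        return person
```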
275 | 
276 | Below are further optimizations:
277 | 
278 | * Store complete or partial BFS traversals to speed up subsequent lookups in the **Memory Cache**
279 | * Batch compute offline then store complete or partial BFS traversals to speed up subsequent lookups in a **NoSQL Database**
280 | * Reduce machine jumps by batching together friend lookups hosted on the same **Person Server**
281 | * [Shard](https://github.com/donnemartin/system-design-primer#sharding) **Person Servers** by location to further improve this, as friends generally live closer to each other
282 | * Do two BFS searches at the same time, one starting from the source, and one from the destination, then merge the two paths (bidirectional search, sketched after this list)
283 | * Start the BFS search from people with large numbers of friends, as they are more likely to reduce the number of [degrees of separation](https://en.wikipedia.org/wiki/Six_degrees_of_separation) between the current user and the search target
284 | * Set a limit based on time or number of hops before asking the user if they want to continue searching, as searching could take a considerable amount of time in some cases
285 | * Use a **Graph Database** such as [Neo4j](https://neo4j.com/) or an API query language suited to graph-shaped data such as [GraphQL](http://graphql.org/) (if there were no constraint preventing the use of **Graph Databases**)
286 | 
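A sketch of the bidirectional search idea, reusing the `person()` accessor from the **User Graph Service** above (the function is illustrative; it alternates expanding one frontier level from each side until the frontiers meet):

```
from collections import deque

def bidirectional_shortest_path(user_graph_service, source_key, dest_key):
    """Run BFS from both ends, meeting in the middle (sketch)."""
    if source_key == dest_key:
        return [source_key]
    prev_from_source = {source_key: None}  # id -> previous id, source side
    prev_from_dest = {dest_key: None}      # id -> previous id, dest side
    queue_source = deque([source_key])
    queue_dest = deque([dest_key])

    def expand(queue, prev, other_prev):
        """Expand one frontier level, returning a meeting id or None."""
        for _ in range(len(queue)):
            node_id = queue.popleft()
            for friend_id in user_graph_service.person(node_id).friend_ids:
                if friend_id in other_prev:
                    prev[friend_id] = node_id
                    return friend_id
                if friend_id not in prev:
                    prev[friend_id] = node_id
                    queue.append(friend_id)
        return None

    while queue_source and queue_dest:
        meet = expand(queue_source, prev_from_source, prev_from_dest)
        if meet is None:
            meet = expand(queue_dest, prev_from_dest, prev_from_source)
        if meet is not None:
            # Stitch the two half-paths: source -> meet, then meet -> dest
            path = []
            node_id = meet
            while node_id is not None:
                path.append(node_id)
                node_id = prev_from_source[node_id]
            path.reverse()
            node_id = prev_from_dest[meet]
            while node_id is not None:
                path.append(node_id)
                node_id = prev_from_dest[node_id]
            return path
    return None
```
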
287 | ## Additional talking points
288 | 
289 | > Additional topics to dive into, depending on the problem scope and time remaining.
290 | 
291 | ### SQL scaling patterns
292 | 
293 | * [Read replicas](https://github.com/donnemartin/system-design-primer#master-slave)
294 | * [Federation](https://github.com/donnemartin/system-design-primer#federation)
295 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding)
296 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization)
297 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning)
298 | 
299 | #### NoSQL
300 | 
301 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store)
302 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store)
303 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store)
304 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database)
305 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql)
306 | 
307 | ### Caching
308 | 
309 | * Where to cache
310 |     * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching)
311 |     * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching)
312 |     * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching)
313 |     * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching)
314 |     * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching)
315 | * What to cache
316 |     * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level)
317 |     * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level)
318 | * When to update the cache
319 |     * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside)
320 |     * [Write-through](https://github.com/donnemartin/system-design-primer#write-through)
321 |     * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back)
322 |     * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead)
323 | 
324 | ### Asynchronism and microservices
325 | 
326 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues)
327 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues)
328 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure)
329 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices)
330 | 
331 | ### Communications
332 | 
333 | * Discuss tradeoffs:
334 |     * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest)
335 |     * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc)
336 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery)
337 | 
338 | ### Security
339 | 
340 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security).
341 | 
342 | ### Latency numbers
343 | 
344 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know).
345 | 
346 | ### Ongoing
347 | 
348 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
349 | * Scaling is an iterative process
350 | 
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/social_graph/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/social_graph/social_graph.png
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/social_graph/social_graph_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/social_graph/social_graph_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from collections import deque
4 | from enum import Enum
5 | 
6 | 
7 | class State(Enum):
8 |     """Node visit states for BFS (added so the snippet runs standalone)."""
9 | 
10 |     unvisited = 0
11 |     visited = 1
12 | 
13 | 
14 | class Graph(object):
15 | 
16 |     def bfs(self, source, dest):
17 |         if source is None:
18 |             return False
19 |         queue = deque()
20 |         queue.append(source)
21 |         source.visit_state = State.visited
22 |         while queue:
23 |             node = queue.popleft()
24 |             print(node)
25 |             if dest is node:
26 |                 return True
27 |             for adjacent_node in node.adj_nodes.values():
28 |                 if adjacent_node.visit_state == State.unvisited:
29 |                     queue.append(adjacent_node)
30 |                     adjacent_node.visit_state = State.visited
31 |         return False
32 | 
33 | 
34 | class Person(object):
35 | 
36 |     def __init__(self, id, name):
37 |         self.id = id
38 |         self.name = name
39 |         self.friend_ids = []
40 | 
41 | 
42 | class LookupService(object):
43 | 
44 |     def __init__(self):
45 |         self.lookup = {}  # key: person_id, value: person_server
46 | 
47 |     def get_person(self, person_id):
48 |         person_server = self.lookup[person_id]
49 |         return person_server.people[person_id]
50 | 
51 | 
52 | class PersonServer(object):
53 | 
54 |     def __init__(self):
55 |         self.people = {}  # key: person_id, value: person
56 | 
57 |     def get_people(self, ids):
58 |         results = []
59 |         for id in ids:
60 |             if id in self.people:
61 |                 results.append(self.people[id])
62 |         return results
63 | 
64 | 
65 | class UserGraphService(object):
66 | 
67 |     def __init__(self, person_ids, lookup):
68 |         self.lookup = lookup
69 |         self.person_ids = person_ids
70 |         self.visited_ids = set()
71 | 
72 |     def bfs(self, source, dest):
73 |         # Use self.visited_ids to track visited nodes
74 |         # Use self.lookup to translate a person_id to a Person
75 |         # One way to fill in the skeleton, mirroring Graph.bfs above:
76 |         if source is None:
77 |             return False
78 |         queue = deque()
79 |         queue.append(source)
80 |         self.visited_ids.add(source.id)
81 |         while queue:
82 |             node = queue.popleft()
83 |             if node.id == dest.id:
84 |                 return True
85 |             for friend_id in node.friend_ids:
86 |                 if friend_id not in self.visited_ids:
87 |                     self.visited_ids.add(friend_id)
88 |                     queue.append(self.lookup.get_person(friend_id))
89 |         return False
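90 | 
91 | 
92 | # Example usage (added for illustration; the wiring below is an assumption,
93 | # not part of the original snippet): a single PersonServer backs every
94 | # person_id, and the stubbed lookups are filled in with plain dicts.
95 | if __name__ == '__main__':
96 |     alice = Person(1, 'alice')
97 |     bob = Person(2, 'bob')
98 |     alice.friend_ids = [2]
99 |     bob.friend_ids = [1]
100 |     server = PersonServer()
101 |     server.people = {1: alice, 2: bob}
102 |     lookup = LookupService()
103 |     lookup.lookup = {1: server, 2: server}
104 |     service = UserGraphService(person_ids=[1, 2], lookup=lookup)
105 |     print(service.bfs(alice, bob))  # True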
--------------------------------------------------------------------------------
/solutions/system_design/twitter/README.md:
--------------------------------------------------------------------------------
1 | # Design the Twitter timeline and search
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | **Design the Facebook feed** and **Design Facebook search** are similar questions.
6 | 
7 | ## Step 1: Outline use cases and constraints
8 | 
9 | > Gather requirements and scope the problem.
10 | > Ask questions to clarify use cases and constraints.
11 | > Discuss assumptions.
12 | 
13 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
14 | 
15 | ### Use cases
16 | 
17 | #### We'll scope the problem to handle only the following use cases
18 | 
19 | * **User** posts a tweet
20 |     * **Service** pushes tweets to followers, sending push notifications and emails
21 | * **User** views the user timeline (activity from the user)
22 | * **User** views the home timeline (activity from people the user is following)
23 | * **User** searches keywords
24 | * **Service** has high availability
25 | 
26 | #### Out of scope
27 | 
28 | * **Service** pushes tweets to the Twitter Firehose and other streams
29 | * **Service** strips out tweets based on user's visibility settings
30 |     * Hide @reply if the user is not also following the person being replied to
31 |     * Respect 'hide retweets' setting
32 | * Analytics
33 | 
34 | ### Constraints and assumptions
35 | 
36 | #### State assumptions
37 | 
38 | General
39 | 
40 | * Traffic is not evenly distributed
41 | * Posting a tweet should be fast
42 |     * Fanning out a tweet to all of your followers should be fast, unless you have millions of followers
43 | * 100 million active users
44 | * 500 million tweets per day or 15 billion tweets per month
45 |     * Each tweet averages a fanout of 10 deliveries
46 |     * 5 billion total tweets delivered on fanout per day
47 |     * 150 billion tweets delivered on fanout per month
48 | * 250 billion read requests per month
49 | * 10 billion searches per month
50 | 
51 | Timeline
52 | 
53 | * Viewing the timeline should be fast
54 | * Twitter is more read heavy than write heavy
55 |     * Optimize for fast reads of tweets
56 | * Ingesting tweets is write heavy
57 | 
58 | Search
59 | 
60 | * Searching should be fast
61 | * Search is read-heavy
62 | 
63 | #### Calculate usage
64 | 
65 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
66 | 
67 | * Size per tweet:
68 |     * `tweet_id` - 8 bytes
69 |     * `user_id` - 32 bytes
70 |     * `text` - 140 bytes
71 |     * `media` - 10 KB average
72 |     * Total: ~10 KB
73 | * 150 TB of new tweet content per month
74 |     * 10 KB per tweet * 500 million tweets per day * 30 days per month
75 |     * 5.4 PB of new tweet content in 3 years
76 | * 100 thousand read requests per second
77 |     * 250 billion read requests per month * (400 requests per second / 1 billion requests per month)
78 | * 6,000 tweets per second
79 |     * 15 billion tweets per month * (400 requests per second / 1 billion requests per month)
80 | * 60 thousand tweets delivered on fanout per second
81 |     * 150 billion tweets delivered on fanout per month * (400 requests per second / 1 billion requests per month)
82 | * 4,000 search requests per second
83 | 
84 | Handy conversion guide:
85 | 
86 | * 2.5 million seconds per month
87 | * 1 request per second = 2.5 million requests per month
88 | * 40 requests per second = 100 million requests per month
89 | * 400 requests per second = 1 billion requests per month
90 | 
91 | ## Step 2: Create a high level design
92 | 
93 | > Outline a high level design with all important components.
94 | 
95 | ![Imgur](http://i.imgur.com/48tEA2j.png)
96 | 
97 | ## Step 3: Design core components
98 | 
99 | > Dive into details for each core component.
100 | 
101 | ### Use case: User posts a tweet
102 | 
103 | We could store the user's own tweets to populate the user timeline (activity from the user) in a [relational database](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms). We should discuss the [use cases and tradeoffs between choosing SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql).
104 | 
105 | Delivering tweets and building the home timeline (activity from people the user is following) is trickier. Fanning out tweets to all followers (60 thousand tweets delivered on fanout per second) will overload a traditional [relational database](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms). We'll probably want to choose a data store with fast writes such as a **NoSQL database** or **Memory Cache**. Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.<sup>1</sup>
106 | 
107 | We could store media such as photos or videos on an **Object Store**.
108 | 
109 | * The **Client** posts a tweet to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
110 | * The **Web Server** forwards the request to the **Write API** server
111 | * The **Write API** stores the tweet in the user's timeline on a **SQL database**
112 | * The **Write API** contacts the **Fan Out Service**, which does the following:
113 |     * Queries the **User Graph Service** to find the user's followers stored in the **Memory Cache**
114 |     * Stores the tweet in the *home timeline of the user's followers* in a **Memory Cache**
115 |         * O(n) operation: 1,000 followers = 1,000 lookups and inserts
116 |     * Stores the tweet in the **Search Index Service** to enable fast searching
117 |     * Stores media in the **Object Store**
118 |     * Uses the **Notification Service** to send out push notifications to followers:
119 |         * Uses a **Queue** (not pictured) to asynchronously send out notifications
120 | 
121 | **Clarify with your interviewer how much code you are expected to write**.
122 | 
123 | If our **Memory Cache** is Redis, we could use a native Redis list with the following structure:
124 | 
125 | ```
126 |            tweet n+2                   tweet n+1                    tweet n
127 | | 8 bytes   8 bytes  1 byte | 8 bytes   8 bytes  1 byte | 8 bytes   8 bytes  1 byte |
128 | | tweet_id  user_id  meta   | tweet_id  user_id  meta   | tweet_id  user_id  meta   |
129 | ```
130 | 
131 | The new tweet would be placed in the **Memory Cache**, which populates the user's home timeline (activity from people the user is following). A sketch of this fanout write path follows below.
132 | 
133 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
134 | 
135 | ```
136 | $ curl -X POST --data '{ "user_id": "123", "auth_token": "ABC123", \
137 |     "status": "hello world!", "media_ids": "ABC987" }' \
138 |     https://twitter.com/api/v1/tweet
139 | ```
140 | 
141 | Response:
142 | 
143 | ```
144 | {
145 |     "created_at": "Wed Sep 05 00:37:15 +0000 2012",
146 |     "status": "hello world!",
147 |     "tweet_id": "987",
148 |     "user_id": "123",
149 |     ...
150 | }
151 | ```
152 | 
153 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
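
To make the fanout write concrete, here is an illustrative sketch using `redis-py`. The key name, the packed entry mirroring the 8 byte/8 byte/1 byte layout above, and the 800-entry cap are assumptions for the example, not Twitter's actual values:

```
import struct

import redis

cache = redis.Redis()

def fan_out_tweet(tweet_id, user_id, follower_ids):
    """Prepend the new tweet to each follower's home timeline list."""
    entry = struct.pack('>QQB', tweet_id, user_id, 0)  # tweet_id, user_id, meta
    for follower_id in follower_ids:
        key = 'home_timeline:{0}'.format(follower_id)
        cache.lpush(key, entry)   # newest tweets sit at the head of the list
        cache.ltrim(key, 0, 799)  # keep only the several hundred newest entries
```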
154 | 
155 | ### Use case: User views the home timeline
156 | 
157 | * The **Client** sends a home timeline request to the **Web Server**
158 | * The **Web Server** forwards the request to the **Read API** server
159 | * The **Read API** server contacts the **Timeline Service**, which does the following:
160 |     * Gets the timeline data stored in the **Memory Cache**, containing tweet ids and user ids - O(1)
161 |     * Queries the **Tweet Info Service** with a [multiget](http://redis.io/commands/mget) to obtain additional info about the tweet ids - O(n)
162 |     * Queries the **User Info Service** with a multiget to obtain additional info about the user ids - O(n)
163 | 
164 | REST API:
165 | 
166 | ```
167 | $ curl https://twitter.com/api/v1/home_timeline?user_id=123
168 | ```
169 | 
170 | Response:
171 | 
172 | ```
173 | {
174 |     "user_id": "456",
175 |     "tweet_id": "123",
176 |     "status": "foo"
177 | },
178 | {
179 |     "user_id": "789",
180 |     "tweet_id": "456",
181 |     "status": "bar"
182 | },
183 | {
184 |     "user_id": "789",
185 |     "tweet_id": "579",
186 |     "status": "baz"
187 | },
188 | ```
189 | 
190 | ### Use case: User views the user timeline
191 | 
192 | * The **Client** sends a user timeline request to the **Web Server**
193 | * The **Web Server** forwards the request to the **Read API** server
194 | * The **Read API** retrieves the user timeline from the **SQL Database**
195 | 
196 | The REST API would be similar to the home timeline, except all tweets would come from the user as opposed to the people the user is following.
197 | 
198 | ### Use case: User searches keywords
199 | 
200 | * The **Client** sends a search request to the **Web Server**
201 | * The **Web Server** forwards the request to the **Search API** server
202 | * The **Search API** contacts the **Search Service**, which does the following:
203 |     * Parses/tokenizes the input query, determining what needs to be searched (see the toy parsing sketch after this section)
204 |         * Removes markup
205 |         * Breaks up the text into terms
206 |         * Fixes typos
207 |         * Normalizes capitalization
208 |         * Converts the query to use boolean operations
209 |     * Queries the **Search Cluster** (e.g., [Lucene](https://lucene.apache.org/)) for the results:
210 |         * [Scatter gathers](https://github.com/donnemartin/system-design-primer#under-development) each server in the cluster to determine if there are any results for the query
211 |         * Merges, ranks, sorts, and returns the results
212 | 
213 | REST API:
214 | 
215 | ```
216 | $ curl https://twitter.com/api/v1/search?query=hello+world
217 | ```
218 | 
219 | The response would be similar to that of the home timeline, except for tweets matching the given query.
220 | 
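A toy version of those query parsing steps (a real deployment would rely on the search engine's own analyzers; the regexes here are simplifications):

```
import re

def parse_query(query):
    """Strip markup, tokenize, normalize case, and build a boolean query."""
    text = re.sub(r'<[^>]+>', ' ', query)            # remove markup
    terms = re.findall(r'[a-z0-9]+', text.lower())   # split terms, lowercase
    return ' AND '.join(terms)                       # naive boolean conversion

print(parse_query('Hello <b>World</b>!'))  # hello AND world
```
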
221 | ## Step 4: Scale the design
222 | 
223 | > Identify and address bottlenecks, given the constraints.
224 | 
225 | ![Imgur](http://i.imgur.com/jrUBAF7.png)
226 | 
227 | **Important: Do not simply jump right into the final design from the initial design!**
228 | 
229 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat. See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
230 | 
231 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them. For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**? **CDN**? **Master-Slave Replicas**? What are the alternatives and **Trade-Offs** for each?
232 | 
233 | We'll introduce some components to complete the design and to address scalability issues. Internal load balancers are not shown to reduce clutter.
234 | 
235 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
236 | 
237 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
238 | * [CDN](https://github.com/donnemartin/system-design-primer#content-delivery-network)
239 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
240 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
241 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
242 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
243 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
244 | * [Relational database management system (RDBMS)](https://github.com/donnemartin/system-design-primer#relational-database-management-system-rdbms)
245 |     * [SQL write master-slave failover](https://github.com/donnemartin/system-design-primer#fail-over)
246 |     * [Master-slave replication](https://github.com/donnemartin/system-design-primer#master-slave-replication)
247 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
248 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
249 | 
250 | The **Fanout Service** is a potential bottleneck. Twitter users with millions of followers could take several minutes to have their tweets go through the fanout process. This could lead to race conditions with @replies to the tweet, which we could mitigate by re-ordering the tweets at serve time.
251 | 
252 | We could also avoid fanning out tweets from highly-followed users. Instead, we could search to find tweets for highly-followed users, merge the search results with the user's home timeline results, then re-order the tweets at serve time.
253 | 
254 | Additional optimizations include:
255 | 
256 | * Keep only several hundred tweets for each home timeline in the **Memory Cache**
257 | * Keep only active users' home timeline info in the **Memory Cache**
258 |     * If a user was not active in the past 30 days, we could rebuild the timeline from the **SQL Database**
259 |         * Query the **User Graph Service** to determine who the user is following
260 |         * Get the tweets from the **SQL Database** and add them to the **Memory Cache**
261 | * Store only a month of tweets in the **Tweet Info Service**
262 | * Store only active users in the **User Info Service**
263 | * The **Search Cluster** would likely need to keep the tweets in memory to keep latency low
264 | 
265 | We'll also want to address the bottleneck with the **SQL Database**.
266 | 
267 | Although the **Memory Cache** should reduce the load on the database, it is unlikely the **SQL Read Replicas** alone would be enough to handle the cache misses. We'll probably need to employ additional SQL scaling patterns.
268 | 
269 | The high volume of writes would overwhelm a single **SQL Write Master-Slave**, also pointing to a need for additional scaling techniques:
270 | 271 | * [Federation](https://github.com/donnemartin/system-design-primer#federation) 272 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding) 273 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization) 274 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning) 275 | 276 | We should also consider moving some data to a **NoSQL Database**. 277 | 278 | ## Additional talking points 279 | 280 | > Additional topics to dive into, depending on the problem scope and time remaining. 281 | 282 | #### NoSQL 283 | 284 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store) 285 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store) 286 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store) 287 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database) 288 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 289 | 290 | ### Caching 291 | 292 | * Where to cache 293 | * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching) 294 | * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching) 295 | * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching) 296 | * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching) 297 | * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching) 298 | * What to cache 299 | * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level) 300 | * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level) 301 | * When to update the cache 302 | * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside) 303 | * [Write-through](https://github.com/donnemartin/system-design-primer#write-through) 304 | * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back) 305 | * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead) 306 | 307 | ### Asynchronism and microservices 308 | 309 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues) 310 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues) 311 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure) 312 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices) 313 | 314 | ### Communications 315 | 316 | * Discuss tradeoffs: 317 | * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest) 318 | * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc) 319 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery) 320 | 321 | ### Security 322 | 323 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security). 324 | 325 | ### Latency numbers 326 | 327 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know). 
328 | 
329 | ### Ongoing
330 | 
331 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
332 | * Scaling is an iterative process
333 | 
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/twitter/twitter.png
--------------------------------------------------------------------------------
/solutions/system_design/twitter/twitter_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/twitter/twitter_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/README.md:
--------------------------------------------------------------------------------
1 | # Design a web crawler
2 | 
3 | *Note: This document links directly to relevant areas found in the [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) to avoid duplication. Refer to the linked content for general talking points, tradeoffs, and alternatives.*
4 | 
5 | ## Step 1: Outline use cases and constraints
6 | 
7 | > Gather requirements and scope the problem.
8 | > Ask questions to clarify use cases and constraints.
9 | > Discuss assumptions.
10 | 
11 | Without an interviewer to address clarifying questions, we'll define some use cases and constraints.
12 | 
13 | ### Use cases
14 | 
15 | #### We'll scope the problem to handle only the following use cases
16 | 
17 | * **Service** crawls a list of urls:
18 |     * Generates a reverse index of words to pages containing the search terms
19 |     * Generates titles and snippets for pages
20 |         * Titles and snippets are static; they do not change based on the search query
21 | * **User** inputs a search term and sees a list of relevant pages with titles and snippets the crawler generated
22 |     * Only sketch high level components and interactions for this use case, no need to go into depth
23 | * **Service** has high availability
24 | 
25 | #### Out of scope
26 | 
27 | * Search analytics
28 | * Personalized search results
29 | * Page rank
30 | 
31 | ### Constraints and assumptions
32 | 
33 | #### State assumptions
34 | 
35 | * Traffic is not evenly distributed
36 |     * Some searches are very popular, while others are only executed once
37 | * Support only anonymous users
38 | * Generating search results should be fast
39 | * The web crawler should not get stuck in an infinite loop
40 |     * We get stuck in an infinite loop if the graph contains a cycle
41 | * 1 billion links to crawl
42 |     * Pages need to be crawled regularly to ensure freshness
43 |     * Average refresh rate of about once per week, more frequent for popular sites
44 |         * 4 billion links crawled each month
45 |     * Average stored size per web page: 500 KB
46 |         * For simplicity, count changes the same as new pages
47 | * 100 billion searches per month
48 | 
49 | Exercise the use of more traditional systems - don't use existing systems such as [solr](http://lucene.apache.org/solr/) or [nutch](http://nutch.apache.org/).
50 | 
51 | #### Calculate usage
52 | 
53 | **Clarify with your interviewer if you should run back-of-the-envelope usage calculations.**
54 | 
55 | * 2 PB of stored page content per month
56 |     * 500 KB per page * 4 billion links crawled per month
57 |     * 72 PB of stored page content in 3 years
58 | * 1,600 write requests per second
59 | * 40,000 search requests per second
60 | 
61 | Handy conversion guide:
62 | 
63 | * 2.5 million seconds per month
64 | * 1 request per second = 2.5 million requests per month
65 | * 40 requests per second = 100 million requests per month
66 | * 400 requests per second = 1 billion requests per month
67 | 
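A quick arithmetic check of those estimates (illustrative; uses the conversion guide above with decimal units):

```
KB = 1000
SECONDS_PER_MONTH = 2.5e6

stored_per_month = 500 * KB * 4e9     # 500 KB per page * 4 billion pages
print(stored_per_month / 1e15)        # ~2 PB of stored page content per month
print(stored_per_month * 36 / 1e15)   # ~72 PB in 3 years
print(4e9 / SECONDS_PER_MONTH)        # ~1,600 write requests per second
print(100e9 / SECONDS_PER_MONTH)      # ~40,000 search requests per second
```
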
68 | ## Step 2: Create a high level design
69 | 
70 | > Outline a high level design with all important components.
71 | 
72 | ![Imgur](http://i.imgur.com/xjdAAUv.png)
73 | 
74 | ## Step 3: Design core components
75 | 
76 | > Dive into details for each core component.
77 | 
78 | ### Use case: Service crawls a list of urls
79 | 
80 | We'll assume we have an initial list of `links_to_crawl`, initially ranked based on overall site popularity. If this is not a reasonable assumption, we can seed the crawler with popular sites that link to outside content such as [Yahoo](https://www.yahoo.com/), [DMOZ](http://www.dmoz.org/), etc.
81 | 
82 | We'll use a table `crawled_links` to store processed links and their page signatures.
83 | 
84 | We could store `links_to_crawl` and `crawled_links` in a key-value **NoSQL Database**. For the ranked links in `links_to_crawl`, we could use [Redis](https://redis.io/) with sorted sets to maintain a ranking of page links. We should discuss the [use cases and tradeoffs between choosing SQL or NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql).
85 | 
86 | * The **Crawler Service** processes each page link by doing the following in a loop:
87 |     * Takes the top ranked page link to crawl
88 |     * Checks `crawled_links` in the **NoSQL Database** for an entry with a similar page signature
89 |     * If we have a similar page, reduces the priority of the page link
90 |         * This prevents us from getting into a cycle
91 |         * Continue
92 |     * Else, crawls the link
93 |         * Adds a job to the **Reverse Index Service** queue to generate a [reverse index](https://en.wikipedia.org/wiki/Search_engine_indexing)
94 |         * Adds a job to the **Document Service** queue to generate a static title and snippet
95 |         * Generates the page signature
96 |         * Removes the link from `links_to_crawl` in the **NoSQL Database**
97 |         * Inserts the page link and signature to `crawled_links` in the **NoSQL Database**
98 | 
99 | **Clarify with your interviewer how much code you are expected to write**.
100 | 
101 | `PagesDataStore` is an abstraction within the **Crawler Service** that uses the **NoSQL Database**:
102 | 
103 | ```
104 | class PagesDataStore(object):
105 | 
106 |     def __init__(self, db):
107 |         self.db = db
108 |         ...
109 | 
110 |     def add_link_to_crawl(self, url):
111 |         """Add the given link to `links_to_crawl`."""
112 |         ...
113 | 
114 |     def remove_link_to_crawl(self, url):
115 |         """Remove the given link from `links_to_crawl`."""
116 |         ...
117 | 
118 |     def reduce_priority_link_to_crawl(self, url):
119 |         """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
120 |         ...
121 | 
122 |     def extract_max_priority_page(self):
123 |         """Return the highest priority link in `links_to_crawl`."""
124 |         ...
125 | 
126 |     def insert_crawled_link(self, url, signature):
127 |         """Add the given link to `crawled_links`."""
128 |         ...
129 | 
130 |     def crawled_similar(self, signature):
131 |         """Determine if we've already crawled a page matching the given signature."""
132 |         ...
133 | ```
134 | 
135 | `Page` is an abstraction within the **Crawler Service** that encapsulates a page, its contents, child urls, and signature:
136 | 
137 | ```
138 | class Page(object):
139 | 
140 |     def __init__(self, url, contents, child_urls, signature):
141 |         self.url = url
142 |         self.contents = contents
143 |         self.child_urls = child_urls
144 |         self.signature = signature
145 | ```
146 | 
147 | `Crawler` is the main class within **Crawler Service**, composed of `Page` and `PagesDataStore`.
148 | 
149 | ```
150 | class Crawler(object):
151 | 
152 |     def __init__(self, data_store, reverse_index_queue, doc_index_queue):
153 |         self.data_store = data_store
154 |         self.reverse_index_queue = reverse_index_queue
155 |         self.doc_index_queue = doc_index_queue
156 | 
157 |     def create_signature(self, page):
158 |         """Create a signature based on the url and contents."""
159 |         ...
160 | 
161 |     def crawl_page(self, page):
162 |         for url in page.child_urls:
163 |             self.data_store.add_link_to_crawl(url)
164 |         page.signature = self.create_signature(page)
165 |         self.data_store.remove_link_to_crawl(page.url)
166 |         self.data_store.insert_crawled_link(page.url, page.signature)
167 | 
168 |     def crawl(self):
169 |         while True:
170 |             page = self.data_store.extract_max_priority_page()
171 |             if page is None:
172 |                 break
173 |             if self.data_store.crawled_similar(page.signature):
174 |                 self.data_store.reduce_priority_link_to_crawl(page.url)
175 |             else:
176 |                 self.crawl_page(page)
177 | ```
178 | 
179 | ### Handling duplicates
180 | 
181 | We need to be careful the web crawler doesn't get stuck in an infinite loop, which happens when the graph contains a cycle.
182 | 
183 | **Clarify with your interviewer how much code you are expected to write**.
184 | 
185 | We'll want to remove duplicate urls:
186 | 
187 | * For smaller lists we could use something like `sort | uniq`
188 | * With 1 billion links to crawl, we could use **MapReduce** to output only entries that have a frequency of 1
189 | 
190 | ```
191 | class RemoveDuplicateUrls(MRJob):
192 | 
193 |     def mapper(self, _, line):
194 |         yield line, 1
195 | 
196 |     def reducer(self, key, values):
197 |         total = sum(values)
198 |         if total == 1:
199 |             yield key, total
200 | ```
201 | 
202 | Detecting duplicate content is more complex. We could generate a signature based on the contents of the page and compare signatures for similarity. Some potential algorithms are the [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) and [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity).
203 | 
204 | ### Determining when to update the crawl results
205 | 
206 | Pages need to be crawled regularly to ensure freshness. Crawl results could have a `timestamp` field that indicates the last time a page was crawled. After a default time period, say one week, all pages should be refreshed. Frequently updated or more popular sites could be refreshed in shorter intervals.
207 | 
208 | Although we won't dive into details on analytics, we could do some data mining to determine the mean time before a particular page is updated, and use that statistic to determine how often to re-crawl the page.
209 | 
210 | We might also choose to support a `robots.txt` file that gives webmasters control of crawl frequency (see the sketch below).
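
A sketch of honoring `robots.txt` with the Python 3 standard library (the url and user agent below are placeholders for the example):

```
from urllib.robotparser import RobotFileParser

robots = RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()  # fetch and parse the file
if robots.can_fetch('OurCrawler', 'https://example.com/some/page'):
    pass  # safe to crawl this page
delay = robots.crawl_delay('OurCrawler')  # None if no Crawl-delay directive
```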
211 | 
212 | ### Use case: User inputs a search term and sees a list of relevant pages with titles and snippets
213 | 
214 | * The **Client** sends a request to the **Web Server**, running as a [reverse proxy](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
215 | * The **Web Server** forwards the request to the **Query API** server
216 | * The **Query API** server does the following:
217 |     * Parses the query
218 |         * Removes markup
219 |         * Breaks up the text into terms
220 |         * Fixes typos
221 |         * Normalizes capitalization
222 |         * Converts the query to use boolean operations
223 |     * Uses the **Reverse Index Service** to find documents matching the query
224 |         * The **Reverse Index Service** ranks the matching results and returns the top ones
225 |     * Uses the **Document Service** to return titles and snippets
226 | 
227 | We'll use a public [**REST API**](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest):
228 | 
229 | ```
230 | $ curl https://search.com/api/v1/search?query=hello+world
231 | ```
232 | 
233 | Response:
234 | 
235 | ```
236 | {
237 |     "title": "foo's title",
238 |     "snippet": "foo's snippet",
239 |     "link": "https://foo.com",
240 | },
241 | {
242 |     "title": "bar's title",
243 |     "snippet": "bar's snippet",
244 |     "link": "https://bar.com",
245 | },
246 | {
247 |     "title": "baz's title",
248 |     "snippet": "baz's snippet",
249 |     "link": "https://baz.com",
250 | },
251 | ```
252 | 
253 | For internal communications, we could use [Remote Procedure Calls](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc).
254 | 
255 | ## Step 4: Scale the design
256 | 
257 | > Identify and address bottlenecks, given the constraints.
258 | 
259 | ![Imgur](http://i.imgur.com/bWxPtQA.png)
260 | 
261 | **Important: Do not simply jump right into the final design from the initial design!**
262 | 
263 | State you would 1) **Benchmark/Load Test**, 2) **Profile** for bottlenecks, 3) address bottlenecks while evaluating alternatives and trade-offs, and 4) repeat. See [Design a system that scales to millions of users on AWS](https://github.com/donnemartin/system-design-primer/blob/master/solutions/system_design/scaling_aws/README.md) as a sample of how to iteratively scale the initial design.
264 | 
265 | It's important to discuss what bottlenecks you might encounter with the initial design and how you might address each of them. For example, what issues are addressed by adding a **Load Balancer** with multiple **Web Servers**? **CDN**? **Master-Slave Replicas**? What are the alternatives and **Trade-Offs** for each?
266 | 
267 | We'll introduce some components to complete the design and to address scalability issues. Internal load balancers are not shown to reduce clutter.
268 | 
269 | *To avoid repeating discussions*, refer to the following [system design topics](https://github.com/donnemartin/system-design-primer#index-of-system-design-topics) for main talking points, tradeoffs, and alternatives:
270 | 
271 | * [DNS](https://github.com/donnemartin/system-design-primer#domain-name-system)
272 | * [Load balancer](https://github.com/donnemartin/system-design-primer#load-balancer)
273 | * [Horizontal scaling](https://github.com/donnemartin/system-design-primer#horizontal-scaling)
274 | * [Web server (reverse proxy)](https://github.com/donnemartin/system-design-primer#reverse-proxy-web-server)
275 | * [API server (application layer)](https://github.com/donnemartin/system-design-primer#application-layer)
276 | * [Cache](https://github.com/donnemartin/system-design-primer#cache)
277 | * [NoSQL](https://github.com/donnemartin/system-design-primer#nosql)
278 | * [Consistency patterns](https://github.com/donnemartin/system-design-primer#consistency-patterns)
279 | * [Availability patterns](https://github.com/donnemartin/system-design-primer#availability-patterns)
280 | 
281 | Some searches are very popular, while others are only executed once. Popular queries can be served from a **Memory Cache** such as Redis or Memcached to reduce response times and to avoid overloading the **Reverse Index Service** and **Document Service**. The **Memory Cache** is also useful for handling the unevenly distributed traffic and traffic spikes. Reading 1 MB sequentially from memory takes about 250 microseconds, while reading from SSD takes 4x and from disk takes 80x longer.<sup>1</sup>
282 | 
283 | Below are a few other optimizations to the **Crawling Service**:
284 | 
285 | * To handle the data size and request load, the **Reverse Index Service** and **Document Service** will likely need to make heavy use of sharding and replication
286 | * DNS lookup can be a bottleneck; the **Crawler Service** can keep its own DNS cache that is refreshed periodically
287 | * The **Crawler Service** can improve performance and reduce memory usage by keeping many open connections at a time, referred to as [connection pooling](https://en.wikipedia.org/wiki/Connection_pool)
288 |     * Switching to [UDP](https://github.com/donnemartin/system-design-primer#user-datagram-protocol-udp) could also boost performance
289 | * Web crawling is bandwidth intensive, so we should ensure there is enough bandwidth to sustain high throughput
290 | 
291 | ## Additional talking points
292 | 
293 | > Additional topics to dive into, depending on the problem scope and time remaining.
294 | 295 | ### SQL scaling patterns 296 | 297 | * [Read replicas](https://github.com/donnemartin/system-design-primer#master-slave) 298 | * [Federation](https://github.com/donnemartin/system-design-primer#federation) 299 | * [Sharding](https://github.com/donnemartin/system-design-primer#sharding) 300 | * [Denormalization](https://github.com/donnemartin/system-design-primer#denormalization) 301 | * [SQL Tuning](https://github.com/donnemartin/system-design-primer#sql-tuning) 302 | 303 | #### NoSQL 304 | 305 | * [Key-value store](https://github.com/donnemartin/system-design-primer#key-value-store) 306 | * [Document store](https://github.com/donnemartin/system-design-primer#document-store) 307 | * [Wide column store](https://github.com/donnemartin/system-design-primer#wide-column-store) 308 | * [Graph database](https://github.com/donnemartin/system-design-primer#graph-database) 309 | * [SQL vs NoSQL](https://github.com/donnemartin/system-design-primer#sql-or-nosql) 310 | 311 | ### Caching 312 | 313 | * Where to cache 314 | * [Client caching](https://github.com/donnemartin/system-design-primer#client-caching) 315 | * [CDN caching](https://github.com/donnemartin/system-design-primer#cdn-caching) 316 | * [Web server caching](https://github.com/donnemartin/system-design-primer#web-server-caching) 317 | * [Database caching](https://github.com/donnemartin/system-design-primer#database-caching) 318 | * [Application caching](https://github.com/donnemartin/system-design-primer#application-caching) 319 | * What to cache 320 | * [Caching at the database query level](https://github.com/donnemartin/system-design-primer#caching-at-the-database-query-level) 321 | * [Caching at the object level](https://github.com/donnemartin/system-design-primer#caching-at-the-object-level) 322 | * When to update the cache 323 | * [Cache-aside](https://github.com/donnemartin/system-design-primer#cache-aside) 324 | * [Write-through](https://github.com/donnemartin/system-design-primer#write-through) 325 | * [Write-behind (write-back)](https://github.com/donnemartin/system-design-primer#write-behind-write-back) 326 | * [Refresh ahead](https://github.com/donnemartin/system-design-primer#refresh-ahead) 327 | 328 | ### Asynchronism and microservices 329 | 330 | * [Message queues](https://github.com/donnemartin/system-design-primer#message-queues) 331 | * [Task queues](https://github.com/donnemartin/system-design-primer#task-queues) 332 | * [Back pressure](https://github.com/donnemartin/system-design-primer#back-pressure) 333 | * [Microservices](https://github.com/donnemartin/system-design-primer#microservices) 334 | 335 | ### Communications 336 | 337 | * Discuss tradeoffs: 338 | * External communication with clients - [HTTP APIs following REST](https://github.com/donnemartin/system-design-primer#representational-state-transfer-rest) 339 | * Internal communications - [RPC](https://github.com/donnemartin/system-design-primer#remote-procedure-call-rpc) 340 | * [Service discovery](https://github.com/donnemartin/system-design-primer#service-discovery) 341 | 342 | ### Security 343 | 344 | Refer to the [security section](https://github.com/donnemartin/system-design-primer#security). 345 | 346 | ### Latency numbers 347 | 348 | See [Latency numbers every programmer should know](https://github.com/donnemartin/system-design-primer#latency-numbers-every-programmer-should-know). 
349 | 
350 | ### Ongoing
351 | 
352 | * Continue benchmarking and monitoring your system to address bottlenecks as they come up
353 | * Scaling is an iterative process
354 | 
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/web_crawler/__init__.py
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/web_crawler/web_crawler.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_basic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mutablealligator/system-design-primer/0889f04f0d8162b6513437d3171a8ebb9ef5e504/solutions/system_design/web_crawler/web_crawler_basic.png
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_mapreduce.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from mrjob.job import MRJob
4 | 
5 | 
6 | class RemoveDuplicateUrls(MRJob):
7 | 
8 |     def mapper(self, _, line):
9 |         yield line, 1
10 | 
11 |     def reducer(self, key, values):
12 |         total = sum(values)
13 |         if total == 1:
14 |             yield key, total
15 | 
16 |     def steps(self):
17 |         """Run the map and reduce steps."""
18 |         return [
19 |             self.mr(mapper=self.mapper,
20 |                     reducer=self.reducer)
21 |         ]
22 | 
23 | 
24 | if __name__ == '__main__':
25 |     RemoveDuplicateUrls.run()
26 | 
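27 | # Usage sketch (added for illustration): with mrjob installed and a text
28 | # file of urls, one per line, this job can be run locally:
29 | #     $ python web_crawler_mapreduce.py links_to_crawl.txt > unique_urls.txt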
--------------------------------------------------------------------------------
/solutions/system_design/web_crawler/web_crawler_snippets.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | class PagesDataStore(object):
4 | 
5 |     def __init__(self, db):
6 |         self.db = db
7 |         ...
8 | 
9 |     def add_link_to_crawl(self, url):
10 |         """Add the given link to `links_to_crawl`."""
11 |         ...
12 | 
13 |     def remove_link_to_crawl(self, url):
14 |         """Remove the given link from `links_to_crawl`."""
15 |         ...
16 | 
17 |     def reduce_priority_link_to_crawl(self, url):
18 |         """Reduce the priority of a link in `links_to_crawl` to avoid cycles."""
19 |         ...
20 | 
21 |     def extract_max_priority_page(self):
22 |         """Return the highest priority link in `links_to_crawl`."""
23 |         ...
24 | 
25 |     def insert_crawled_link(self, url, signature):
26 |         """Add the given link to `crawled_links`."""
27 |         ...
28 | 
29 |     def crawled_similar(self, signature):
30 |         """Determine if we've already crawled a page matching the given signature."""
31 |         ...
32 | 
33 | 
34 | class Page(object):
35 | 
36 |     def __init__(self, url, contents, child_urls):
37 |         self.url = url
38 |         self.contents = contents
39 |         self.child_urls = child_urls
40 |         self.signature = self.create_signature()
41 | 
42 |     def create_signature(self):
43 |         # Create a signature based on the url and contents
44 |         ...
45 | 
46 | 
47 | class Crawler(object):
48 | 
49 |     def __init__(self, pages, data_store, reverse_index_queue, doc_index_queue):
50 |         self.pages = pages
51 |         self.data_store = data_store
52 |         self.reverse_index_queue = reverse_index_queue
53 |         self.doc_index_queue = doc_index_queue
54 | 
55 |     def crawl_page(self, page):
56 |         for url in page.child_urls:
57 |             self.data_store.add_link_to_crawl(url)
58 |         self.reverse_index_queue.generate(page)
59 |         self.doc_index_queue.generate(page)
60 |         self.data_store.remove_link_to_crawl(page.url)
61 |         self.data_store.insert_crawled_link(page.url, page.signature)
62 | 
63 |     def crawl(self):
64 |         while True:
65 |             page = self.data_store.extract_max_priority_page()
66 |             if page is None:
67 |                 break
68 |             if self.data_store.crawled_similar(page.signature):
69 |                 self.data_store.reduce_priority_link_to_crawl(page.url)
70 |             else:
71 |                 self.crawl_page(page)
--------------------------------------------------------------------------------