├── .env
├── .github
│   └── FUNDING.yml
├── .gitignore
├── LICENSE
├── README.md
├── _config.yml
├── bin
│   ├── kiwix-serve
│   └── mwdumper-1.26.jar
├── data
│   ├── cache
│   │   └── README.md
│   ├── certs
│   │   └── README.md
│   ├── dumps
│   │   └── README.md
│   ├── logs
│   │   └── README.md
│   └── zim
│       └── README.md
├── docker-compose.kiwix.yml
├── docker-compose.mediawiki.yml
├── docker-compose.nginx.yml
├── docker-compose.xowa.yml
├── etc
│   └── nginx
│       └── nginx.conf.template
├── postimport.sql
└── preimport.sql
/.env:
--------------------------------------------------------------------------------
1 | PROJECT_DIR="/opt/wiki" # folder for all project state
2 | CONFIG_DIR="$PROJECT_DIR/etc/nginx"
3 | CACHE_DIR="$PROJECT_DIR/data/cache"
4 | CERTS_DIR="$PROJECT_DIR/data/certs"
5 | LOGS_DIR="$PROJECT_DIR/data/logs"
6 |
7 | LANG="en" # Wikipedia language to mirror
8 | LISTEN_PORT_HTTP="80" # public-facing HTTP port to bind
9 | LISTEN_PORT_HTTPS="443" # public-facing HTTPS port to bind
10 | LISTEN_HOST="wiki.example.com" # root domain to listen on
11 | LISTEN_WIKI="$LANG.$LISTEN_HOST" # wiki domain to listen on
12 | LISTEN_MEDIA="upload.$LISTEN_HOST" # uploads domain to listen on
13 |
14 | UPSTREAM_HOST="wikipedia.org" # main upstream domain
15 | UPSTREAM_WIKI="$LANG.$UPSTREAM_HOST" # upstream domain for wiki
16 | UPSTREAM_MEDIA="upload.wikimedia.org" # upstream domain for uploads
17 |
18 | # Only needed if using an nginx reverse proxy:
19 | SSL_CRT="$CERTS_DIR/$LISTEN_HOST.crt"
20 | SSL_KEY="$CERTS_DIR/$LISTEN_HOST.key"
21 | SSL_DH="$CERTS_DIR/$LISTEN_HOST.dh"
22 |
23 | CACHE_SIZE="100G" # or "500GB", "1GB", "200MB", etc.
24 | CACHE_REQUESTS="GET HEAD POST" # or "GET HEAD", "any", etc.
25 | CACHE_RESPONSES="200 206 302" # or "200 302 404", "any", etc.
26 | CACHE_DURATION="max" # or "1d", "30m", "12h", etc.
27 |
28 | ACCESS_LOG="'$LOGS_DIR/nginx.out' trace" # or "off", etc.
29 | ERROR_LOG="'$LOGS_DIR/nginx.err' warn" # or "off", etc.
30 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: pirate
2 | patreon: theSquashSH
3 | custom: https://paypal.me/NicholasSweeting
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/cache/*
2 | data/certs/*
3 | data/logs/*
4 | data/zim/*
5 | data/dumps/*
6 | *.xml
7 | *.xml.bz2
8 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Nick Sweeting
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # How to self-host a mirror of Wikipedia.org
2 | ### with Nginx, Kiwix, or MediaWiki/XOWA + Docker
3 |
4 | Originally published 2019-09-08 on docs.sweeting.me. The pretty HTML version is [here](https://docs.sweeting.me/s/self-host-a-wikipedia-mirror) and the source for this guide is on [GitHub](https://github.com/pirate/wikipedia-mirror).
5 | A summary of how to set up a full Wikipedia.org mirror using three different approaches.
6 |
7 | **DEMO: https://other-wiki.zervice.io**
8 |
9 |
10 |
11 | # Intro
12 |
13 | > **Did you know that Wikipedia.org just runs a mostly-traditional LAMP stack on [~350 servers](https://meta.wikimedia.org/wiki/Wikimedia_servers)**? (as of 2019)
14 |
15 | **Unfortunately, Wikipedia attracts lots of hate from people and nation-states who object to certain articles or want to hide information from the public eye.**
16 |
17 | Wikipedia's infrastructure (2 racks in the USA, 1 in Holland, and 1 in Singapore, plus CDNs) [can't always stand up to large DDoS attacks](https://wikimediafoundation.org/news/2019/09/07/malicious-attack-on-wikipedia-what-we-know-and-what-were-doing/), but thankfully they provide regular database dumps and static HTML archives to the public, and have permissive licensing that allows for rehosting with modification (even for profit!).
18 |
19 | Growing up in China [behind the GFW, I often experienced Wikipedia unavailability](https://www.cnet.com/news/the-great-firewall-of-china-blocks-off-wikipedia/), and in light of the [recent DDoS](https://wikimediafoundation.org/news/2019/09/07/malicious-attack-on-wikipedia-what-we-know-and-what-were-doing/) I decided to write this guide to help demystify the process of running a mirror. I'm also a big advocate for free access to information, and I'm the maintainer of a major internet archiving project called [ArchiveBox](https://archivebox.io) (a self-hosted internet archiver powered by headless Chromium).
20 |
21 | **The aim of this guide is to encourage people to use these publicly available dumps to host Wikipedia mirrors, so that malicious actors don't succeed in limiting public access to one of the *world's best sources of information*.**
22 |
23 | ---
24 |
25 | ## Quickstart
26 |
27 | A *full* English Wikipedia.org clone in 3 steps.
28 |
29 | **DEMO: https://other-wiki.zervice.io**
30 |
31 | ```bash
32 | # 1. Download the Kiwix-Serve static binary from https://www.kiwix.org/en/downloads/kiwix-serve/
33 | wget 'https://download.kiwix.org/release/kiwix-tools/kiwix-tools_linux-x86_64-3.0.1.tar.gz'
34 | tar -xzf kiwix-tools_linux-x86_64-3.0.1.tar.gz && cd kiwix-tools_linux-x86_64-3.0.1
35 |
36 | # 2. Download a compressed Wikipedia dump from https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/ (79GB, images included!)
37 | wget --continue "https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim"
38 |
39 | # 3. Start the kiwix server, then visit http://127.0.0.1:8888
40 | ./kiwix-serve --verbose --port 8888 "$PWD/wikipedia_en_all_maxi.zim"
41 | ```
42 | ---
43 |
44 |
45 | ## Getting Started
46 |
47 | Wikipedia.org itself is powered by a PHP backend called [MediaWiki](https://en.wikipedia.org/wiki/MediaWiki), using MariaDB for data storage, Varnish and Memcached for request and query caching, and ElasticSearch for full-text search. Production Wikipedia.org also runs a number of extra plugins and modules on top of MediaWiki.
48 |
49 | **🖥 There are several ways to host your own mirror of Wikipedia (with varying complexity):**
50 |
51 | 1. [**Run a caching proxy in front of Wikipedia.org**](#) (disk used on-demand for cache, low CPU use)
52 | 2. [**Serve the static HTML ZIM archive with Kiwix**](#) (10~80GB for compressed archive, low CPU use)
53 | 3. [**Run a full MediaWiki server**](#) (hardest to set up, ~600GB for XML & database, high CPU use)
54 |
55 | **💅Don't expect it to look perfect on the first try**
56 |
57 | Setting up a Wikipedia mirror involves a complex dance between software, data, and devops, so beginners are encouraged to start with the static HTML archive or proxy before attempting to run a full MediaWiki server. Users should expect their mirrors to be able to serve articles with images and search, but should not expect them to look exactly like Wikipedia.org on the first try, or the second...
58 |
59 | **✅ Choosing an approach**
60 |
61 | Each method in this guide has its pros and cons. A caching proxy is the most lightweight option, but if the upstream servers go down and a request comes in that hasn't been seen and cached before, it will 404, so it's not a fully redundant mirror. The static ZIM mirror is lightweight to download and host (and requests are easy to cache) and has full-text search, but it has no interactivity, talk page history, or Wikipedia-style category pages (though they are coming soon). MediaWiki/XOWA are the most complex, but they can provide a full working Wikipedia mirror complete with history revisions, users, talk pages, search, and more.
62 |
63 | Running a full MediaWiki server is by far the hardest method to set up. Expect it to take multiple days/weeks depending on available system resources, and expect it to look fairly broken since the production Wikipedia.org team run many tweaks and plugins that take extra work to set up locally.
64 |
65 | For more info, see the [Wikipedia.org index of all dump types available, with descriptions](https://dumps.wikimedia.org/).
66 |
67 |
68 | ## Responsible Rehosting Warning
69 |
70 | ⚠️ Be aware that running a publicly-accessible mirror of Wikipedia.org with any kind of framing / content modifications / ads is *strongly discouraged*. Framing mirrors / proxy mirrors are still a good option for private use, but you need to take additional steps to mirror responsibly if you're setting up a proxy for public use (e.g. robots:noindex, takedown contact info, blocking unlicensed images, etc.).
71 |
72 | > Some mirrors load a page from the Wikimedia servers directly every time someone requests a page from them. They alter the text in some way, such as framing it with ads, then send it on to the reader. **This is called remote loading, and it is an unacceptable use of Wikimedia server resources.** Even remote loading websites with little legitimate traffic can generate significant load on our servers, due to search engine web crawlers.
73 | *https://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks#Remote_loading*
74 |
75 |
76 | Luckily, regardless of how you choose to rehost Wikipedia ***text***, you are not breaking any terms and conditions or violating copyright law as long as you don't remove their copyright statements (however, note the article images and videos on Wikimedia.org may not be licensed for re-use).
77 |
78 | > Every contribution to the English Wikipedia has been licensed for re-use, including commercial, for-profit websites. Republication is not necessarily a breach of copyright, so long as the appropriate licenses are complied with.
79 | *https://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks#Things_you_need_to_know*
80 |
81 |
82 | ---
83 |
84 | # [Table of Contents](https://docs.sweeting.me/s/self-host-a-wikipedia-mirror#TOC)
85 |
86 | [TOC]
87 |
88 | See the [HTML version](https://docs.sweeting.me/s/self-host-a-wikipedia-mirror#TOC) of this guide for the best browsing experience. See [pirate/wikipedia-mirror](https://github.com/pirate/wikipedia-mirror) on Github for example config source, docker-compose files, binaries, folder structure, and more.
89 |
90 | ---
91 |
92 | # Tutorial
93 |
94 | ---
95 |
96 | ## Prerequisites
97 |
98 | 1. **Provision a server to act as your Wikipedia mirror**
99 |
100 | You can use a cheap VPS provider like DigitalOcean, Vultr, Hetzner, etc. For the static ZIM archive and MediaWiki server methods you will need significant disk space, so a home server with a cheap external HD may be a better option.
101 |
102 | *The setup examples below are based on Ubuntu 19.04* running on a home server, however they should work across many other OS's with minimal tweaking (e.g. FreeBSD, macOS, Arch, etc.).
103 |
104 | 2. **Purchase a new domain or create a subdomain to host your mirror**
105 |
106 | You can use Google Domains, NameCheap, GoDaddy, etc.; any registrar will work.
107 |
108 | *In the setup examples below, replace `wiki.example.com` with the domain you chose.*
109 |
110 | 3. **Point the DNS records for the domain to your mirror server**
111 |
112 | Configure these records via your DNS provider (e.g. NameCheap, DigitalOcean, CloudFlare, etc.):
113 |
114 | - `wiki.example.com` `A` -> `your server's public ip` (the root domain)
115 | - `en.wiki.example.com` `CNAME` -> `wiki.example.com` (the wiki domain)
116 | - `upload.wiki.example.com` `CNAME` -> `wiki.example.com` (the uploads/media domain)
117 |
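After the records propagate, you can sanity-check them from the mirror server with `dig` (a quick verification step using the example domains above; `dig` comes from the `dnsutils`/`bind-tools` package):

```bash
# Each of these should ultimately resolve to your mirror server's public IP
dig +short wiki.example.com A
dig +short en.wiki.example.com A
dig +short upload.wiki.example.com A
```
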
118 | 4. **Create a directory to store the project, and a dotenv file for your config options**
119 |
120 | Not all of these values are needed for all the methods, but it's easier to just define all of them in one place and remove things later that turn out to be unneeded.
121 |
122 | ```bash
123 | mkdir -p /opt/wiki # change PROJECT_DIR below to match
124 | nano /opt/wiki/.env
125 | ```
126 | Create the `.env` config file in [`dotenv`](https://docs.docker.com/compose/env-file/)/`bash` syntax with the contents below.
127 | *Make sure to replace the example values like `wiki.example.com` with your own.*
128 | ```bash
129 | PROJECT_DIR="/opt/wiki" # folder for all project state
130 | CONFIG_DIR="$PROJECT_DIR/etc/nginx"
131 | CACHE_DIR="$PROJECT_DIR/data/cache"
132 | CERTS_DIR="$PROJECT_DIR/data/certs"
133 | LOGS_DIR="$PROJECT_DIR/data/logs"
134 |
135 | LANG="en" # Wikipedia language to mirror
136 | LISTEN_PORT_HTTP="80" # public-facing HTTP port to bind
137 | LISTEN_PORT_HTTPS="443" # public-facing HTTPS port to bind
138 | LISTEN_HOST="wiki.example.com" # root domain to listen on
139 | LISTEN_WIKI="$LANG.$LISTEN_HOST" # wiki domain to listen on
140 | LISTEN_MEDIA="upload.$LISTEN_HOST" # uploads domain to listen on
141 |
142 | UPSTREAM_HOST="wikipedia.org" # main upstream domain
143 | UPSTREAM_WIKI="$LANG.$UPSTREAM_HOST" # upstream domain for wiki
144 | UPSTREAM_MEDIA="upload.wikimedia.org" # upstream domain for uploads
145 |
146 | # Only needed if using an nginx reverse proxy:
147 | SSL_CRT="$CERTS_DIR/$LISTEN_HOST.crt"
148 | SSL_KEY="$CERTS_DIR/$LISTEN_HOST.key"
149 | SSL_DH="$CERTS_DIR/$LISTEN_HOST.dh"
150 |
151 | CACHE_SIZE="100G" # or "500GB", "1GB", "200MB", etc.
152 | CACHE_REQUESTS="GET HEAD POST" # or "GET HEAD", "any", etc.
153 | CACHE_RESPONSES="200 206 302" # or "200 302 404", "any", etc.
154 | CACHE_DURATION="max" # or "1d", "30m", "12h", etc.
155 |
156 | ACCESS_LOG="'$LOGS_DIR/nginx.out' trace" # or "off", etc.
157 | ERROR_LOG="'$LOGS_DIR/nginx.err' warn" # or "off", etc.
158 | ```
159 |
160 | *The setup steps below depend on this file existing and the config values being correct,
161 | so make sure you create it and replace all example values with your own before proceeding!*
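
A quick way to spot-check the file is to source it and echo a few of the derived values (just a sanity check, nothing is written anywhere):

```bash
source /opt/wiki/.env
echo "mirroring $UPSTREAM_WIKI -> https://$LISTEN_WIKI (cache: $CACHE_SIZE in $CACHE_DIR)"
```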
162 |
163 | ---
164 |
165 | ## Choosing a Wikipedia archive dump
166 |
167 | - https://download.kiwix.org/zim/wikipedia/ (for BitTorrent add `.torrent` to the end of any `.zim` url)
168 | - https://en.wikipedia.org/wiki/MediaWiki
169 | - https://www.mediawiki.org/wiki/MediaWiki
170 | - https://www.mediawiki.org/wiki/Download
171 | - https://www.wikidata.org/wiki/Wikidata:Database_download
172 | - https://dumps.wikimedia.org/backup-index.html
173 |
174 | ### ZIM Static HTML Dump
175 |
176 | Wikipedia HTML dumps are provided in a highly-compressed web-archiving format called [ZIM](https://openzim.org). They can be served using a ZIM server like Kiwix (the most common one), or [ZimReader](https://openzim.org/wiki/Zimreader), [GoZIM](https://github.com/akhenakh/gozim), & [others](https://openzim.org/wiki/Readers).
177 |
178 | - [Kiwix.org full ZIM archive list](https://wiki.kiwix.org/wiki/Content_in_all_languages) or [Kiwix.org Wikipedia-specific ZIM archive list](https://library.kiwix.org/#lang=eng&q=wikipedia)
179 | - [Wikimedia.org ZIM archive list](https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/)
180 | - [List of ZIM BitTorrent links](https://gist.github.com/maxogden/70674db0b5b181b8eeb1d3f9b638ab2a)
181 |
182 | ZIM archive dumps are usually published yearly, but the release schedule is not guaranteed. As of August 2019 the latest available dump containing all English articles with images is from October 2018:
183 |
184 | [`wikipedia_en_all_mini_2019-09.zim`](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_mini_2019-09.zim) ([torrent](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_mini_2019-09.zim.torrent)) (10GB, mini English articles, no pictures or video)
185 |
186 | [`wikipedia_en_all_nopic_2018-09.zim`](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_nopic_2018-09.zim) ([torrent](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_nopic_2018-09.zim.torrent)) (35GB, all English articles, no pictures or video)
187 |
188 | **[`wikipedia_en_all_maxi_2018-10.zim`](https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim)** ([torrent](https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim.torrent)) (79GB, all English articles w/ pictures, no video)
189 |
190 | [`wikipedia_en_simple_all_maxi_2020-01.zim`](https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/wikipedia_en_simple_all_maxi_2020-01.zim) (1.6GB, SimpleWiki English only, good for testing)
191 |
192 | **Download your chosen Wikipedia ZIM archive** (e.g. `wikipedia_en_all_maxi_2018-10.zim`)
193 |
194 | ```bash
195 | mkdir -p /opt/wiki/data/dumps && cd /opt/wiki/data/dumps
196 |
197 | # Download via BitTorrent:
198 | transmission-cli --download-dir . 'magnet:?xt=urn:btih:O2F3E2JKCEEBCULFP2E2MRUGEVFEIHZW'
199 |
200 | # Or download via HTTPS from one of the mirrors:
201 | wget -c 'https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim'
202 | wget -c 'https://ftpmirror.your.org/pub/kiwix/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim'
203 | wget -c 'https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim'
204 |
205 | # Optionally after download, verify the length (fast) or MD5 checksum (slow):
206 | stat --printf="%s" wikipedia_en_all_maxi_2018-10.zim | grep 83853668638
207 | openssl dgst -md5 -binary wikipedia_en_all_maxi_2018-10.zim | openssl enc -base64 | grep '01eMQki29P9vD5F2h6zWwQ'
208 | ```
209 |
210 | ### XML Database Dump
211 |
212 | - [WikiData.org Dump Types (JSON, RDF, XML)](https://www.wikidata.org/wiki/Wikidata:Database_download)
213 | - [List of Dumps (XML dumps)](https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia)
214 | - [List of Mirrors (XML dumps)](https://dumps.wikimedia.org/mirrors.html)
215 |
216 | Database dumps are usually published monthly. As of August 2019, the latest dump containing all English articles is from July 2019:
217 |
218 | **[`enwiki-20190720-pages-articles.xml.bz2`](https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia)** (15GB, all English articles, no pictures/videos)
219 |
220 | [`simplewiki-20170820-pages-meta-current.xml.bz2`](https://itorrents.org/torrent/B23A2BDC351E58E041D79F335A3CF872DEBAE919.torrent) (180MB, SimpleWiki only, good for testing)
221 |
222 | **Download your chosen Wikipedia XML dump** (e.g. `enwiki-20190720-pages-articles.xml.bz2`)
223 |
224 | ```bash
225 | mkdir -p /opt/wiki/data/dumps && cd /opt/wiki/data/dumps
226 |
227 | # Download via BitTorrent:
228 | transmission-cli --download-dir . 'magnet:?xt=urn:btih:9f08161276bc95ec594ce89ed52fe18fc41168a3&dn=enwiki-20190720-pages-articles.xml.bz2'
229 |
230 | # Download via HTTP:
231 | # lol no. no one wants to serve you a 15GB file via HTTP
232 | ```
233 |
234 | ---
235 |
236 | ## Method #1: Run a caching proxy in front of Wikipedia.org
237 |
238 | > **Complexity:** Low
239 | > Minimal setup and operations requirements, no download of large dumps needed.
240 | > **Disk space requirements:** On-Demand
241 | > Disk is only used as pages are requested (can be 1gb up to 2TB+ depending on usage).
242 | > **CPU requirements:** Very Low
243 | > Lowest out of the three options, can be run on a tiny VPS or home-server.
244 | > **Content freshness:** Very Fresh
245 | > Configurable to cache content indefinitely or pull fresh data for every request.
246 |
247 | ### a. Running with Nginx
248 |
249 | Set the following options in your `/opt/wiki/.env` config file:
250 | `UPSTREAM_HOST=wikipedia.org`
251 | `UPSTREAM_WIKI=en.wikipedia.org`
252 | `UPSTREAM_MEDIA=upload.wikimedia.org`
253 |
254 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx.
255 |
256 | Then restart nginx to apply your config with `systemctl restart nginx`.
257 |
258 | Your mirror should now be running and proxying requests to Wikipedia.org!
259 |
260 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com).
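
To confirm that caching is actually happening, check the `X-Cache-Status` header that the nginx template adds to every response (the first request for a page should report `MISS`, an immediate repeat should report `HIT`):

```bash
# Using the example domain from above; substitute your own wiki domain
curl -sI 'https://en.wiki.example.com/wiki/Earth' | grep -i 'x-cache-status'
curl -sI 'https://en.wiki.example.com/wiki/Earth' | grep -i 'x-cache-status'
```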
261 |
262 | ### b. Running with Caddy
263 |
264 | Alternatively, check out a similar setup that uses Caddy instead of Nginx as the reverse proxy: https://github.com/CristianCantoro/wikiproxy
265 |
266 | ---
267 |
268 | ## Method #2: Serve the static HTML ZIM archive with Kiwix
269 |
270 | > **Complexity:** Moderate
271 | > Static binary makes it easy to run, but it requires downloading a large dump file.
272 | > **Disk space requirements:** >80GB
273 | > The ZIM archive is a highly-compressed collection of static HTML articles only.
274 | > **CPU requirements:** Very Low
275 | > Low, especially with a CDN in front (more than a proxy, but less than a full server).
276 | > **Content freshness:** Often Stale
277 | > ZIM archives are published yearly (ish) by Wikipedia.org.
278 |
279 | First download a ZIM archive dump like `wikipedia_en_all_maxi_2018-10.zim` into `/opt/wiki/data/dumps` as described above.
280 |
281 |
282 | ### a. Running with Docker
283 |
284 | Run `kiwix-serve` with docker like so:
285 |
286 | ```bash
287 | docker run \
288 | -v '/opt/wiki/data/dumps:/data' \
289 | -p 8888:80 \
290 | kiwix/kiwix-serve \
291 | 'wikipedia_en_all_maxi_2018-10.zim'
292 | ```
293 |
294 | Or create `/opt/wiki/docker-compose.yml` and run `docker-compose up`:
295 | ```yml
296 | version: '3'
297 | services:
298 | kiwix:
299 | image: kiwix/kiwix-serve
300 | command: 'wikipedia_en_all_maxi_2018-10.zim'
301 | ports:
302 | - '8888:80'
303 | volumes:
304 | - "./data/dumps:/data"
305 | ```
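
Once the container is up you can verify it's serving the archive on the mapped port (a simple smoke test, assuming the `8888:80` mapping above):

```bash
docker-compose up -d
curl -sI 'http://127.0.0.1:8888/' | head -n 1   # should print an HTTP 200/302 status line
```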
306 |
307 | ### b. Running with the static binary
308 |
309 | 1. **Download the latest `kiwix-serve` binary for your OS & CPU architecture**
310 |
311 | Find the latest release for your architecture here and copy its URL to download it below:
312 | https://download.kiwix.org/release/kiwix-tools/
313 |
314 | ```bash
315 | cd /opt/wiki
316 | wget 'https://download.kiwix.org/release/kiwix-tools/kiwix-tools_linux-x86_64-3.0.1.tar.gz'
317 | tar -xzf 'kiwix-tools_linux-x86_64-3.0.1.tar.gz'
318 | mv 'kiwix-tools_linux-x86_64-3.0.1' 'bin'
319 | ```
320 |
321 | 2. **Run `kiwix-serve`, passing it a port to listen on and your ZIM archive file**
322 |
323 | ```bash
324 | /opt/wiki/bin/kiwix-serve --port 8888 /opt/wiki/data/dumps/wikipedia_en_all_maxi_2018-10.zim
325 | ```
326 |
327 | Your server should now be running!
328 |
329 | Visit http://en.yourdomainhere.com:8888 to see it in action!
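
If you want `kiwix-serve` to survive reboots, a minimal systemd unit works well (this unit is a sketch and not part of the repo; adjust the paths to match your setup):

```bash
cat > /etc/systemd/system/kiwix.service <<'EOF'
[Unit]
Description=Kiwix ZIM server for the Wikipedia mirror
After=network.target

[Service]
ExecStart=/opt/wiki/bin/kiwix-serve --port 8888 /opt/wiki/data/dumps/wikipedia_en_all_maxi_2018-10.zim
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload
systemctl enable --now kiwix
```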
330 |
331 | ### Optional Nginx Reverse Proxy
332 |
333 | Set the following options in your `/opt/wiki/.env` config file:
334 | ```bash
335 | UPSTREAM_HOST=localhost:8888
336 | UPSTREAM_WIKI=localhost:8888
337 | UPSTREAM_MEDIA=upload.wikimedia.org
338 | ```
339 |
340 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx. To run nginx inside docker-compose next to Kiwix, see the [Run Nginx via docker-compose](#) section below.
341 |
342 | Your mirror should now be running and proxying requests to `kiwix-serve`!
343 |
344 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com).
345 |
346 |
347 | ---
348 |
349 | ## Method #3: Run a full MediaWiki server
350 |
351 | > **Complexity:** Very High
352 | > Complex multi-component setup with an intricate setup process and high resource use.
353 | > **Disk space requirements:** >550GB (>2TB needed for import phase)
354 | > The uncompressed database is very large (multiple TB with revision history and stubs).
355 | > **CPU requirements:** Moderate (very high during import phase)
356 | > Depends on usage, but it's the most demanding out of the 3 options.
357 | > **Content freshness:** Very fresh
358 | > Updated database dumps are published monthly (ish) by Wikipedia.org.
359 |
360 | First download a database dump like [`enwiki-20190720-pages-articles.xml.bz2`](magnet:?xl=16321006399&dn=enwiki-20190720-pages-articles.xml.bz2&xt=urn:tree:tiger:zpqgda3rbnycgtcujwpqi72aiv7tyasw7rp7sdi&xt=urn:ed2k:3b291214eb785df5b21cdb62623dd319&xt=urn:aich:zuy4dfbo2ppdhsdtmlev72fggdnka6ch&xt=urn:btih:9f08161276bc95ec594ce89ed52fe18fc41168a3&xt=urn:sha1:54cbdd5e5d1ca22b7dbd16463f81fdbcd6207bab&xt=urn:md5:9be9c811e0cc5c8418c869bb33eb516c&tr=udp%3a%2f%2ftracker.openbittorrent.com%3a80&as=http%3a%2f%2fdumps.wikimedia.freemirror.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=http%3a%2f%2fdumps.wikimedia.your.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=http%3a%2f%2fftp.acc.umu.se%2fmirror%2fwikimedia.org%2fdumps%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.freemirror.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.your.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fftp.acc.umu.se%2fmirror%2fwikimedia.org%2fdumps%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2) into `/opt/wiki/data/dumps` as described above.
361 |
362 | If you need to decompress it, `pbzip2` is much faster than `bzip2`:
363 | ```bash
364 | pbzip2 -v -d -k -m10000 enwiki-20190720-pages-articles.xml.bz2
365 | # -m10000 tells it to use 10GB of RAM, adjust accordingly
366 | ```
367 |
368 | ### a. Running with XOWA in Docker
369 |
370 | https://github.com/QuantumObject/docker-xowa
371 |
372 | ```bash
373 | docker run \
374 | -v /opt/wiki/data/xowa:/opt/xowa/ \
375 |     -p 8888:80 \
376 | sblop/xowa_offline_wikipedia
377 | ```
378 | ```yaml
379 | version: '3'
380 | services:
381 | xowa:
382 | image: sblop/xowa_offline_wikipedia
383 | ports:
384 | - 8888:80
385 | volumes:
386 | - './data/xowa:/opt/xowa'
387 | ```
388 |
389 | ### b. Running with MediaWiki in Docker
390 |
391 | - https://hub.docker.com/_/mediawiki
392 | - https://github.com/wikimedia/mediawiki-docker
393 | - https://github.com/AirHelp/mediawiki-docker
394 | - https://en.wikipedia.org/wiki/MediaWiki
395 | - https://www.mediawiki.org/wiki/MediaWiki
396 | - https://www.mediawiki.org/wiki/Download
397 | - https://www.wikidata.org/wiki/Wikidata:Database_download
398 | - https://dumps.wikimedia.org/backup-index.html
399 |
400 |
401 | **Configure your `docker-compose.yml` file**
402 |
403 | Default MediaWiki config file: https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/DefaultSettings.php
404 |
405 | Create the following `/opt/wiki/docker-compose.yml` file then run `docker-compose up`:
406 | ```yml
407 | version: '3'
408 | services:
409 | database:
410 | image: mariadb
411 | command: --max-allowed-packet=256M
412 | environment:
413 | MYSQL_DATABASE: wikipedia
414 | MYSQL_USER: wikipedia
415 | MYSQL_PASSWORD: wikipedia
416 | MYSQL_ROOT_PASSWORD: wikipedia
417 |
418 | mediawiki:
419 | image: mediawiki
420 | ports:
421 | - 8080:80
422 | depends_on:
423 | - database
424 | volumes:
425 | - './data/html:/var/www/html'
426 | # After initial setup, download LocalSettings.php into ./data/html
427 | # and uncomment the following line, then docker-compose restart
428 | # - ./LocalSettings.php:/var/www/html/LocalSettings.php
429 | ```
430 |
431 |
432 | **Then import the XML dump into the MediaWiki database:**
433 | - https://www.mediawiki.org/wiki/Manual:Importing_XML_dumps
434 | - https://hub.docker.com/r/ueland/mwdumper/
435 | - https://www.mail-archive.com/wikitech-l@lists.wikimedia.org/msg02108.html
436 |
437 | **Do not attempt to import it directly with `importDump.php`; it will take months:**
438 | ```bash
439 | php /var/www/html/maintenance/importDump.php enwiki-20170320-pages-articles-multistream.xml
440 | ```
441 |
442 | **Instead, convert the XML dump into compressed chunks of SQL then import individually:**
443 |
444 | *Warning: For large imports (e.g. English) this process can still take 5+ days depending on the system.*
445 |
446 | ```bash
447 | apt install -y openjdk-8-jre zstd pbzip2
448 |
449 | # Download patched mwdumper version and pre/post import SQL scripts
450 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/bin/mwdumper-1.26.jar"
451 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/preimport.sql"
452 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/postimport.sql"
453 |
454 | DUMP_NAME="enwiki-20190720-pages-articles"
455 |
456 | # Decompress the XML dump using all available cores and 10GB of memory
457 | pbzip2 -v -d -k -m10000 "$DUMP_NAME.xml.bz2"
458 |
459 | # Convert the XML file into a SQL file using mwdumper
460 | java -server \
461 |     -jar ./mwdumper-1.26.jar \
462 |     --format=sql:1.5 \
463 |     "$DUMP_NAME.xml" \
464 |     > wikipedia.sql
465 |
466 | # Split the generated SQL file into compressed chunks
467 | split --additional-suffix=".sql" --lines=1000 wikipedia.sql chunk_
468 | for partial in chunk_*.sql; do
469 |     zstd -z "$partial"
470 | done
471 |
472 | # Fix a schema issue that may otherwise cause import bugs
473 | docker-compose exec database \
474 |     mysql --user=wikipedia --password=wikipedia --database=wikipedia \
475 |     --execute="ALTER TABLE page ADD page_counter bigint unsigned NOT NULL default 0;"
476 |
477 | # Import the compressed chunks into the database, wrapping each one in the pre/post import statements
478 | for partial in chunk_*.sql.zst; do
479 |     cat preimport.sql <(zstd -dc "$partial") postimport.sql \
480 |         | docker-compose exec -T database \
481 |             mysql --force --user=wikipedia --password=wikipedia --database=wikipedia
482 | done
483 | ```
484 |
485 | Credit for these steps goes to https://github.com/wayneworkman/wikipedia-importing-tools.
486 |
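After the last chunk is imported, MediaWiki's cached site statistics and recent-changes tables will be out of date. Running a couple of the stock maintenance scripts usually fixes that (paths assume the official `mediawiki` image layout used above; both scripts ship with MediaWiki core):

```bash
docker-compose exec mediawiki php /var/www/html/maintenance/initSiteStats.php --update
docker-compose exec mediawiki php /var/www/html/maintenance/rebuildrecentchanges.php
```
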
487 |
488 | ### Optional Nginx Reverse Proxy
489 |
490 | Set the following options in your `/opt/wiki/.env` config file:
491 | ```bash
492 | UPSTREAM_HOST=localhost:8888   # use localhost:8080 if you published MediaWiki on port 8080 as in the compose example above
493 | UPSTREAM_WIKI=localhost:8888   # same note as above
494 | UPSTREAM_MEDIA=upload.wikimedia.org
495 | ```
496 |
497 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx. To run nginx inside docker-compose next to MediaWiki, see the [Run Nginx via docker-compose](#) section below.
498 |
499 | Your mirror should now be running and proxying requests to your wiki server!
500 |
501 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com).
502 |
503 | ---
504 |
505 | ## Nginx Reverse Proxy
506 |
507 | You can optionally set up an Nginx reverse proxy in front of `kiwix-serve`, `Wikipedia.org`, or a `MediaWiki` server to add caching and HTTPS support.
508 |
509 | Make sure the options in `/opt/wiki/.env` are configured correctly for the type of setup you're trying to achieve.
510 |
511 | - To run nginx in front of `kiwix-serve` on localhost, set:
512 | `UPSTREAM_HOST=localhost:8888`
513 | `UPSTREAM_WIKI=localhost:8888`
514 | `UPSTREAM_MEDIA=upload.wikimedia.org`
515 | - To run nginx in front of Wikipedia.org, set:
516 | `UPSTREAM_HOST=wikipedia.org`
517 | `UPSTREAM_WIKI=en.wikipedia.org`
518 | `UPSTREAM_MEDIA=upload.wikimedia.org`
519 | - To run nginx in front of a MediaWiki server on localhost, set:
520 | `UPSTREAM_HOST=localhost:8888`
521 | `UPSTREAM_WIKI=localhost:8888`
522 | `UPSTREAM_MEDIA=upload.wikimedia.org`
523 | - To run nginx in front of a docker container via docker-compose:
524 | *See [Run Nginx via docker-compose](#) section below.*
525 |
526 | ### Install LetsEncrypt and Nginx
527 |
528 | ```bash
529 | # Install the dependencies: nginx and certbot
530 | add-apt-repository -y -n universe
531 | add-apt-repository -y -n ppa:certbot/certbot
532 | add-apt-repository -y -n ppa:nginx/stable
533 | apt update -qq
534 | apt install -y nginx-extras certbot python3-certbot-nginx
535 | systemctl enable nginx
536 | systemctl start nginx
537 | ```
538 |
539 | ### Obtain an SSL certificate via LetsEncrypt
540 | ```bash
541 | # Load your config values from step 4 into the environment, and create dirs
542 | source /opt/wiki/.env
543 | mkdir -p "$CONFIG_DIR" "$CACHE_DIR" "$CERTS_DIR" "$LOGS_DIR"
544 |
545 | # Get an SSL certificate and generate the Diffie-Hellman parameters file
546 | certbot certonly \
547 | --nginx \
548 | --agree-tos \
549 | --non-interactive \
550 | -m "ssl@$LISTEN_HOST" \
551 | --domain "$LISTEN_HOST,$LISTEN_WIKI,$LISTEN_MEDIA"
552 | openssl dhparam -out "$CERTS_DIR/$LISTEN_HOST.dh" 2048
553 |
554 | # Link the certs into your project directory
555 | ln -s "/etc/letsencrypt/live/$LISTEN_HOST/fullchain.pem" "$CERTS_DIR/$LISTEN_HOST.crt"
556 | ln -s "/etc/letsencrypt/live/$LISTEN_HOST/privkey.pem" "$CERTS_DIR/$LISTEN_HOST.key"
557 | ```
558 |
559 | LetsEncrypt certs must be renewed every 90 days or they'll expire and you'll get "Invalid Certificate" errors. To have certs automatically renewed periodically, add a systemd timer or cron job to run `certbot renew`. Here's an example tutorial on how to do that:
560 | https://gregchapple.com/2018/02/16/auto-renew-lets-encrypt-certs-with-systemd-timers/
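
Depending on how certbot was installed it may already ship its own renewal timer; if not, a simple `cron.d` entry is enough (a sketch, assuming nginx should be reloaded after each successful renewal):

```bash
cat > /etc/cron.d/certbot-renew <<'EOF'
0 3 * * * root certbot renew --quiet --deploy-hook "systemctl reload nginx"
EOF
```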
561 |
562 | ### Populate the nginx.conf template with your config
563 |
564 | ```bash
565 | # Load your config options into the environment (allexport so envsubst can see them)
566 | set -o allexport; source /opt/wiki/.env; set +o allexport
567 |
568 |
569 | # Download the nginx config template
570 | curl --silent --location \
571 | "https://github.com/pirate/wikipedia-mirror/raw/master/etc/nginx/nginx.conf.template" \
572 | > "$CONFIG_DIR/nginx.conf.template"
573 |
574 | # Fill your config options into nginx.conf.template to create nginx.conf
575 | envsubst \
576 | "$(printf '${%s} ' $(bash -c "compgen -A variable"))"\
577 | < "$CONFIG_DIR/nginx.conf.template" \
578 | > "$CONFIG_DIR/nginx.conf"
579 | ```
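
Before linking the generated file into place, you can ask nginx to validate it (run as root so it can open the log and cache paths):

```bash
nginx -t -c "$CONFIG_DIR/nginx.conf"
```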
580 |
581 |
582 | ### Run Nginx via systemd
583 | ```bash
584 | # Link your nginx.conf into the system's default nginx config location
585 | ln -s -f "$CONFIG_DIR/nginx.conf" "/etc/nginx/nginx.conf"
586 |
587 | # Restart nginx to load the new config
588 | systemctl restart nginx
589 | ```
590 |
591 | Now you can visit https://en.yourdomainhere.com to see it in action with HTTPS!
592 |
593 | For troubleshooting, you can find the nginx logs here:
594 | `/opt/wiki/data/logs/nginx.err`
595 | `/opt/wiki/data/logs/nginx.out`
596 |
597 | ### Run Nginx via docker-compose
598 |
599 | Set the config values in your `/opt/wiki/.env` file to correspond to the docker container hostname that you want to proxy, and tweak the directory paths to be the paths inside the container, e.g. for the `mediawiki` service:
600 | ```bash
601 | UPSTREAM_HOST=mediawiki:80   # the container's internal port, not the published host port
602 | UPSTREAM_WIKI=mediawiki:80
603 | UPSTREAM_MEDIA=upload.wikimedia.org
604 |
605 | CERTS_DIR=/certs
606 | CACHE_DIR=/cache
607 | LOGS_DIR=/logs
608 | ```
609 |
610 | Then regenerate your `nginx.conf` file with `envsubst` as described in [Nginx Reverse Proxy](#Nginx-Reverse-Proxy) above.
611 |
612 | Then add the `nginx` service to your existing `/opt/wiki/docker-compose.yml` file:
613 | ```yml
614 | version: '3'
615 | services:
616 |
617 | ...
618 |
619 | nginx:
620 | image: nginx:latest
621 | volumes:
622 | - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf
623 | - ./data/certs:/certs
624 | - ./data/cache:/cache
625 | - ./data/logs:/logs
626 | ports:
627 | - 80:80
628 | - 443:443
629 | ```
630 |
631 | ---
632 |
633 | # Further Reading
634 |
635 | - https://github.com/openzim/mwoffliner (archiving only, no serving)
636 | - https://www.yunqa.de/delphi/products/wikitaxi/index (Windows only)
637 | - https://www.nongnu.org/wp-mirror/ (last updated in 2014, [Dockerfile](https://github.com/futpib/docker-wp-mirror/blob/master/Dockerfile))
638 | - https://github.com/dustin/go-wikiparse
639 | - https://www.learn4master.com/tools/python-and-java-libraries-to-parse-wikipedia-dump-dataset
640 | - https://dkpro.github.io/dkpro-jwpl/
641 | - https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
642 | - https://meta.wikimedia.org/wiki/Data_dumps/Import_examples#Import_into_an_empty_wiki_of_a_subset_of_en_wikipedia_on_Linux_with_MySQL
643 | - https://github.com/shimondoodkin/wikipedia-dump-import-script/blob/master/example-result.sh
644 | - https://github.com/wayneworkman/wikipedia-importing-tools
645 | - https://github.com/chrisbo246/mediawiki-loader
646 | - https://dzone.com/articles/how-clone-wikipedia-and-index
647 | - https://www.xarg.org/2016/06/importing-entire-wikipedia-into-mysql/
648 | - https://dengruo.com/blog/running-mediawiki-your-own-copy-restore-whole-mediwiki-backup
649 | - https://brionv.com/log/2007/10/02/wiki-data-dumps/
650 | - https://www.evanjones.ca/software/wikipedia2text.html
651 | - https://lists.gt.net/wiki/wikitech/160482
652 | - https://helpful.knobs-dials.com/index.php/Harvesting_wikipedia
653 | - https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community
654 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-merlot
--------------------------------------------------------------------------------
/bin/kiwix-serve:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/bin/kiwix-serve
--------------------------------------------------------------------------------
/bin/mwdumper-1.26.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/bin/mwdumper-1.26.jar
--------------------------------------------------------------------------------
/data/cache/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/cache/README.md
--------------------------------------------------------------------------------
/data/certs/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/certs/README.md
--------------------------------------------------------------------------------
/data/dumps/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/dumps/README.md
--------------------------------------------------------------------------------
/data/logs/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/logs/README.md
--------------------------------------------------------------------------------
/data/zim/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/zim/README.md
--------------------------------------------------------------------------------
/docker-compose.kiwix.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | kiwix:
4 | image: kiwix/kiwix-serve
5 | command: 'wikipedia_en_all_novid_2018-10.zim'
6 | ports:
7 | - '8888:80'
8 | volumes:
9 | - "./data/zim:/data"
10 |
--------------------------------------------------------------------------------
/docker-compose.mediawiki.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | database:
4 | image: mariadb
5 | command: --max-allowed-packet=256M
6 | environment:
7 | MYSQL_DATABASE: wikipedia
8 | MYSQL_USER: wikipedia
9 | MYSQL_PASSWORD: wikipedia
10 | MYSQL_ROOT_PASSWORD: wikipedia
11 |
12 | mediawiki:
13 | image: mediawiki
14 | ports:
15 | - 8080:80
16 | depends_on:
17 | - database
18 | volumes:
19 | - './data/html:/var/www/html'
20 | # After initial setup, download LocalSettings.php into ./data/html
21 | # and uncomment the following line, then docker-compose restart
22 | # - ./LocalSettings.php:/var/www/html/LocalSettings.php
23 |
--------------------------------------------------------------------------------
/docker-compose.nginx.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | nginx:
4 | image: nginx:latest
5 | volumes:
6 | - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf
7 | - ./data/certs:/certs
8 | - ./data/cache:/cache
9 | - ./data/logs:/logs
10 | ports:
11 | - 80:80
12 | - 443:443
13 |
--------------------------------------------------------------------------------
/docker-compose.xowa.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | xowa:
4 | image: sblop/xowa_offline_wikipedia
5 | ports:
6 | - 8888:80
7 | volumes:
8 | - './data/xowa:/opt/xowa'
9 |
--------------------------------------------------------------------------------
/etc/nginx/nginx.conf.template:
--------------------------------------------------------------------------------
1 | daemon on;
2 | user www-data;
3 | pid /var/run/nginx.pid;
4 | timer_resolution 100ms;
5 | worker_processes auto;
6 | events {
7 | worker_connections 1024;
8 | }
9 |
10 | http {
11 | # Logging Settings
12 | log_format trace '$remote_addr - $remote_user [$time_local] "$request" '
13 | '$status $body_bytes_sent "$http_referer" "$http_user_agent" '
14 | '"$http_x_forwarded_for" $request_id';
15 | access_log $ACCESS_LOG;
16 | error_log $ERROR_LOG;
17 |
18 | # MIME Type Settings
19 | types_hash_max_size 512;
20 | default_type application/octet-stream;
21 | include /etc/nginx/mime.types;
22 |
23 | # GZIP Settings
24 | gzip on;
25 | gzip_vary on;
26 | gzip_proxied any;
27 | gzip_comp_level 6;
28 | gzip_min_length 1000;
29 | gzip_buffers 4 32k;
30 | gzip_types text/plain
31 | text/css
32 | text/xml
33 |
34 | font/ttf
35 | font/woff
36 | font/woff2
37 |
38 | application/json
39 | application/xhtml+xml
40 | application/rss+xml
41 |                         application/atom+xml
42 | application/javascript
43 | application/x-javascript;
44 |
45 |
46 | # Connection Settings
47 | resolver 1.1.1.1 8.8.8.8; # DNS server to use when resolving hosts for upstreams or cert chains
48 | resolver_timeout 5s; # timeout DNS requests as failed after this many seconds waiting for a response
49 | tcp_nopush on; # enables NGINX to send HTTP response headers in one packet right after the chunk of data has been obtained by sendfile()
50 | tcp_nodelay on; # don't wait 200ms to collect response headers and data before sending directly from filesystem
51 | port_in_redirect off; # when proxying redirects, strip any custom upstream ports from the url sent to the client
52 | slice 1m; # allow breaking up files into slices so as not to block on loading an entire file to only request a small range
53 |     sendfile on;                        # send static files directly from the filesystem without buffering in memory
54 |     sendfile_max_chunk 2m;              # limit each filesystem chunk sent to 2MB to prevent one connection from eating all resources
55 | send_timeout 20s; # wait up to 20s before closing response connections where client has stopped accepting response data
56 | keepalive_timeout 60s; # allow up to 60s total before closing unresponsive/dead request connections
57 | client_header_timeout 15s; # don't wait more than 15s for client to send request headers
58 | client_body_timeout 15s; # don't wait more than 15s for client to send request body
59 | client_max_body_size 50m; # maximum file upload / request size (increase to allow larger file uploads)
60 | client_body_buffer_size 16k; # buffer size for reading client request body (should be 2 pages, aka 16k on 64bit systems)
61 | client_header_buffer_size 1k; # buffer size for reading client request header (for most requests, a buffer of 1K bytes is enough unless there are long cookies)
62 | large_client_header_buffers 4 8k; # maximum number and size of buffers used for reading large client request header (A request line cannot exceed the size of one buffer)
63 | http2_push_preload on; # enable http2 pushing of files before client requests them
64 | http2_max_concurrent_pushes 10; # limit concurrent server pushes to prevent overwhelming client
65 | http2_max_concurrent_streams 128; # maximum number of concurrent HTTP/2 streams in a connection
66 |
67 | # Security Settings
68 |     ssl_stapling on;                    # staple OCSP responses to the TLS handshake so clients can skip querying the CA
69 |     ssl_stapling_verify on;             # verify the stapled OCSP responses against the trusted certificate chain
70 | ssl_protocols TLSv1.2 TLSv1.3; # only allow modern SSL protocols
71 | ssl_session_cache shared:SSL:50m; # enable quick-resume of previous ssl sessions
72 | ssl_session_timeout 5d; # store ssl session cache entries for 5 days
73 | ssl_session_tickets off; # session tickets break forward secrecy
74 | ssl_ciphers ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS;
75 | ssl_ecdh_curve secp384r1; # use a strong curve function for encryption
76 | ssl_prefer_server_ciphers on; # prevent downgrade attacks to weaker cipher suites
77 | server_tokens off; # hide nginx version info in error pages and headers
78 |
79 | # Reverse Proxy Settings
80 | proxy_socket_keepalive on; # keep the upstream connection open instead of opening a new connection every time
81 | proxy_request_buffering off; # dont wait for the full request to arrive before passing body to upstream
82 | proxy_buffering off; # dont wait for full response to complete before passing body to client
83 | proxy_http_version 1.1; # Properly proxy websocket connections
84 |     proxy_read_timeout 120s;            # close dead upstream connections after 120s with no response data
85 |     proxy_cache_path "$CACHE_DIR" levels=1:2 keys_zone=wikiproxy:16M max_size=$CACHE_SIZE inactive=1440h use_temp_path=off;
86 | proxy_cache_key "$request_uri$is_args$args$slice_range";
87 | proxy_cache_methods $CACHE_REQUESTS;
88 | proxy_cache_valid $CACHE_RESPONSES $CACHE_DURATION;
89 | proxy_cache_valid 404 10m; # allow articles not found to be refreshed more frequently than cache duration in case they're created
90 | # proxy_cache_bypass $http_cache_control; # bypass cache if client requests with Cache-Control: max-age=0
91 | proxy_cache_use_stale error updating invalid_header timeout http_500 http_502 http_503 http_504;
92 | proxy_cache_revalidate on; # use If-Modified-Since to revalidate cached requests if they expire instead of re-downloading full response
93 | proxy_cache_lock on; # if 2 requests come in, try to only make 1 upstream request to handle them both
94 | proxy_cache_lock_age 5s; # timeout to wait for in-progress caching before sending request directly to upstream
95 | proxy_cache_lock_timeout 5s; # timeout to wait for in-progress caching before sending request directly to upstream
96 | proxy_ignore_headers X-Accel-Expires; # ignore upstream caching recommendations for nginx response caching
97 | proxy_ignore_headers Expires; # ignore upstream caching recommendations for nginx response caching
98 | proxy_ignore_headers Cache-Control; # ignore upstream caching recommendations for nginx response caching
99 | proxy_ignore_headers Set-Cookie; # cache responses even when cookies are set
100 | proxy_hide_header X-Accel-Expires; # hide upstream caching recommendation from client
101 | proxy_hide_header Expires; # hide upstream caching recommendation from client
102 | proxy_hide_header Cache-Control; # hide upstream caching recommendation from client
103 | # proxy_hide_header "Set-Cookie"; # prevent upstream cookies being set on clients at all
104 | proxy_cookie_domain .$UPSTREAM_HOST $host; # rewrite cookie domains to proxied equivalents
105 | proxy_cookie_domain $UPSTREAM_HOST $host; # rewrite cookie domains to proxied equivalents
106 | proxy_cookie_domain $UPSTREAM_WIKI $host; # rewrite cookie domains to proxied equivalents
107 | proxy_cookie_domain $UPSTREAM_MEDIA $host; # rewrite cookie domains to proxied equivalents
108 |
109 |
110 | # Server Definition
111 | server {
112 | listen $LISTEN_PORT_HTTP default_server;
113 | listen $LISTEN_PORT_HTTPS ssl http2 default_server;
114 |
115 | server_name $LISTEN_HOST $LISTEN_WIKI $LISTEN_MEDIA
116 | $UPSTREAM_HOST $UPSTREAM_WIKI $UPSTREAM_MEDIA;
117 |
118 | ssl_certificate_key $SSL_KEY;
119 | ssl_certificate $SSL_CRT;
120 | ssl_trusted_certificate $SSL_CRT;
121 | ssl_dhparam $SSL_DH;
122 |
123 | proxy_cache wikiproxy;
124 |
125 | error_page 497 https://$host$request_uri; # redirect http:443 to https:443
126 | if ($scheme = http) {
127 | return 301 https://$host$request_uri; # redirect http:80 to https:443
128 | }
129 |
130 | location / {
131 | try_files $uri $uri/ @upstream;
132 |
133 | # Replace any domains in response body text with proxied equivalents
134 | subs_filter_types text/html text/css text/xml text/javascript;
135 | subs_filter $UPSTREAM_WIKI $LISTEN_WIKI gi;
136 | subs_filter $UPSTREAM_MEDIA $LISTEN_MEDIA gi;
137 | subs_filter $UPSTREAM_HOST $LISTEN_HOST gi;
138 |
139 | # Add headers to the response sent to the client
140 | add_header Cache-Control "public";
141 | add_header X-Handled-By "$upstream_addr";
142 | add_header X-Cache-Status "$upstream_cache_status";
143 | add_header X-Request-Id "$request_id";
144 | add_header X-Content-Type-Options "nosniff";
145 | add_header X-XSS-Protection "1; mode=block";
146 | add_header Referrer-Policy "strict-origin-when-cross-origin";
147 | add_header Strict-Transport-Security "max-age=31536000; includeSubDomains";
148 | add_header X-Robots-Tag "noindex";
149 |
150 | # Cache responses for the configured amount of time
151 | expires $CACHE_DURATION;
152 | }
153 |
154 | location @upstream {
155 | if ($host ~ "^($LISTEN_HOST|$UPSTREAM_HOST)$") {
156 | set $xupstream $UPSTREAM_HOST;
157 | }
158 | if ($host ~ "^($LISTEN_WIKI|$UPSTREAM_WIKI)$") {
159 | set $xupstream $UPSTREAM_WIKI;
160 | }
161 | if ($host ~ "^($LISTEN_MEDIA|$UPSTREAM_MEDIA)$") {
162 | set $xupstream $UPSTREAM_MEDIA;
163 | }
164 |
165 | # Add headers to the request sent to the upstream server
166 | proxy_set_header Host "$xupstream";
167 | proxy_set_header Range "$slice_range";
168 | proxy_set_header Upgrade "$http_upgrade";
169 | proxy_set_header Connection "upgrade";
170 | proxy_set_header X-Request-Id "$request_id";
171 | proxy_set_header X-Real-Ip "$remote_addr";
172 | proxy_set_header X-Forwarded-Host "$host";
173 | proxy_set_header X-Forwarded-Server "$host";
174 | proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for";
175 | proxy_set_header X-Forwarded-Protocol "$scheme";
176 |
177 | proxy_pass https://$xupstream;
178 | }
179 | }
180 | }
181 |
--------------------------------------------------------------------------------
/postimport.sql:
--------------------------------------------------------------------------------
1 | COMMIT;
2 | SET autocommit=1;
3 | SET unique_checks=1;
4 | SET foreign_key_checks=1;
5 |
--------------------------------------------------------------------------------
/preimport.sql:
--------------------------------------------------------------------------------
1 | SET autocommit=0;
2 | SET unique_checks=0;
3 | SET foreign_key_checks=0;
4 | BEGIN;
5 |
--------------------------------------------------------------------------------