├── .env ├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── bin ├── kiwix-serve └── mwdumper-1.26.jar ├── data ├── cache │ └── README.md ├── certs │ └── README.md ├── dumps │ └── README.md ├── logs │ └── README.md └── zim │ └── README.md ├── docker-compose.kiwix.yml ├── docker-compose.mediawiki.yml ├── docker-compose.nginx.yml ├── docker-compose.xowa.yml ├── etc └── nginx │ └── nginx.conf.template ├── postimport.sql └── preimport.sql /.env: -------------------------------------------------------------------------------- 1 | PROJECT_DIR="/opt/wiki" # folder for all project state 2 | CONFIG_DIR="$PROJECT_DIR/etc/nginx" 3 | CACHE_DIR="$PROJECT_DIR/data/cache" 4 | CERTS_DIR="$PROJECT_DIR/data/certs" 5 | LOGS_DIR="$PROJECT_DIR/data/logs" 6 | 7 | LANG="en" # Wikipedia language to mirror 8 | LISTEN_PORT_HTTP="80" # public-facing HTTP port to bind 9 | LISTEN_PORT_HTTPS="443" # public-facing HTTPS port to bind 10 | LISTEN_HOST="wiki.example.com" # root domain to listen on 11 | LISTEN_WIKI="$LANG.$LISTEN_HOST" # wiki domain to listen on 12 | LISTEN_MEDIA="upload.$LISTEN_HOST" # uploads domain to listen on 13 | 14 | UPSTREAM_HOST="wikipedia.org" # main upstream domain 15 | UPSTREAM_WIKI="$LANG.$UPSTREAM_HOST" # upstream domain for wiki 16 | UPSTREAM_MEDIA="upload.wikimedia.org" # upstream domain for uploads 17 | 18 | # Only needed if using an nginx reverse proxy: 19 | SSL_CRT="$CERTS_DIR/$LISTEN_HOST.crt" 20 | SSL_KEY="$CERTS_DIR/$LISTEN_HOST.key" 21 | SSL_DH="$CERTS_DIR/$LISTEN_HOST.dh" 22 | 23 | CACHE_SIZE="100G" # or "500GB", "1GB", "200MB", etc. 24 | CACHE_REQUESTS="GET HEAD POST" # or "GET HEAD", "any", etc. 25 | CACHE_RESPONSES="200 206 302" # or "200 302 404", "any", etc. 26 | CACHE_DURATION="max" # or "1d", "30m", "12h", etc. 27 | 28 | ACCESS_LOG="'$LOGS_DIR/nginx.out' trace" # or "off", etc. 29 | ERROR_LOG="'$LOGS_DIR/nginx.err' warn" # or "off", etc. 30 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: pirate 2 | patreon: theSquashSH 3 | custom: https://paypal.me/NicholasSweeting 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/cache/* 2 | data/certs/* 3 | data/logs/* 4 | data/zim/* 5 | data/dumps/* 6 | *.xml 7 | *.xml.bz2 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Nick Sweeting 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

How to self-host a mirror of Wikipedia.org:
with Nginx, Kiwix, or MediaWiki/XOWA + Docker

4 | Originally published 2019-09-08 on docs.sweeting.me.
The pretty HTML version is here and the source for this guide is on Github.


5 | A summary of how to set up a full Wikipedia.org mirror using three different approaches.
6 | DEMO: https://other-wiki.zervice.io 7 |
8 | 9 |
10 | 
11 | # Intro
12 | 
13 | > **Did you know that Wikipedia.org just runs a mostly-traditional LAMP stack on [~350 servers](https://meta.wikimedia.org/wiki/Wikimedia_servers)**? (as of 2019)
14 | 
15 | **Unfortunately, Wikipedia attracts lots of hate from people and nation-states who object to certain articles or want to hide information from the public eye.**
16 | 
17 | Wikipedia's infrastructure (2 racks in the USA, 1 in Holland, and 1 in Singapore, + CDNs) [can't always stand up to large DDoS attacks](https://wikimediafoundation.org/news/2019/09/07/malicious-attack-on-wikipedia-what-we-know-and-what-were-doing/), but thankfully they provide regular database dumps and static HTML archives to the public, and have permissive licensing that allows for rehosting with modification (even for profit!).
18 | 
19 | Growing up in China [behind the Great Firewall I often experienced Wikipedia unavailability](https://www.cnet.com/news/the-great-firewall-of-china-blocks-off-wikipedia/), and in light of the [recent DDoS](https://wikimediafoundation.org/news/2019/09/07/malicious-attack-on-wikipedia-what-we-know-and-what-were-doing/) I decided to make a guide for people to help demystify the process of running a mirror. I'm also a big advocate for free access to information, and I'm the maintainer of a major internet archiving project called [ArchiveBox](https://archivebox.io) (a self-hosted internet archiver powered by headless Chromium).
20 | 
21 | **The aim of this guide is to encourage people to use these publicly available dumps to host Wikipedia mirrors, so that malicious actors don't succeed in limiting public access to one of the *world's best sources of information*.**
22 | 
23 | ---
24 | 
25 | ## Quickstart
26 | 
27 | A *full* English Wikipedia.org clone in 3 steps.
28 | 
29 | **DEMO: https://other-wiki.zervice.io**
30 | 
31 | ```bash
32 | # 1. Download the Kiwix-Serve static binary from https://www.kiwix.org/en/downloads/kiwix-serve/
33 | wget 'https://download.kiwix.org/release/kiwix-tools/kiwix-tools_linux-x86_64-3.0.1.tar.gz'
34 | tar -xzf kiwix-tools_linux-x86_64-3.0.1.tar.gz && cd kiwix-tools_linux-x86_64-3.0.1
35 | 
36 | # 2. Download a compressed Wikipedia dump from https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/ (79GB, images included!)
37 | wget --continue "https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim"
38 | 
39 | # 3. Start the kiwix server, then visit http://127.0.0.1:8888
40 | ./kiwix-serve --verbose --port 8888 "$PWD/wikipedia_en_all_maxi.zim"
41 | ```
42 | ---
43 | 
44 | 
45 | ## Getting Started
46 | 
47 | Wikipedia.org itself is powered by a PHP backend called [MediaWiki](https://en.wikipedia.org/wiki/MediaWiki), using MariaDB for data storage, Varnish and Memcached for request and query caching, and Elasticsearch for full-text search. Production Wikipedia.org also runs a number of extra plugins and modules on top of MediaWiki.
48 | 
49 | **🖥 There are several ways to host your own mirror of Wikipedia (with varying complexity):**
50 | 
51 | 1. [**Run a caching proxy in front of Wikipedia.org**](#) (disk used on-demand for cache, low CPU use)
52 | 2. [**Serve the static HTML ZIM archive with Kiwix**](#) (10~80GB for compressed archive, low CPU use)
53 | 3. [**Run a full MediaWiki server**](#) (hardest to set up, ~600GB for XML & database, high CPU use)
54 | 
55 | **💅 Don't expect it to look perfect on the first try**
56 | 
57 | Setting up a Wikipedia mirror involves a complex dance between software, data, and devops, so beginners are encouraged to start with the static HTML archive or the caching proxy before attempting to run a full MediaWiki server. Users should expect their mirrors to be able to serve articles with images and search, but should not expect it to look exactly like Wikipedia.org on the first try, or the second...
58 | 
59 | **✅ Choosing an approach**
60 | 
61 | Each method in this guide has its pros and cons. A caching proxy is the most lightweight option, but if the upstream servers go down, any request that hasn't already been seen and cached will 404, so it's not a fully redundant mirror. The static ZIM mirror is lightweight to download and host (and requests are easy to cache) and has full-text search, but it has no interactivity, talk page history, or Wikipedia-style category pages (though they are coming soon). MediaWiki/XOWA are the most complex, but they can provide a full working Wikipedia mirror complete with revision history, users, talk pages, search, and more.
62 | 
63 | Running a full MediaWiki server is by far the hardest method to set up. Expect it to take multiple days/weeks depending on available system resources, and expect it to look fairly broken since the production Wikipedia.org team runs many tweaks and plugins that take extra work to set up locally.
64 | 
65 | For more info, see the [Wikipedia.org index of all dump types available, with descriptions](https://dumps.wikimedia.org/).
66 | 
67 | 
68 | ## Responsible Rehosting Warning
69 | 
70 | ⚠️ Be aware that running a publicly-accessible mirror of Wikipedia.org with any kind of framing / content modifications / ads is *strongly discouraged*. Framing mirrors / proxy mirrors are still a good option for private use, but you need to take additional steps to mirror responsibly if you're setting up a proxy for public use (e.g. robots:noindex, takedown contact info, blocking unlicensed images, etc.).
71 | 
72 | > Some mirrors load a page from the Wikimedia servers directly every time someone requests a page from them. They alter the text in some way, such as framing it with ads, then send it on to the reader. **This is called remote loading, and it is an unacceptable use of Wikimedia server resources.** Even remote loading websites with little legitimate traffic can generate significant load on our servers, due to search engine web crawlers.
73 | *https://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks#Remote_loading*
74 | 
75 | 
76 | Luckily, regardless of how you choose to rehost Wikipedia ***text***, you are not breaking any terms and conditions or violating copyright law as long as you don't remove their copyright statements (however, note that the article images and videos on Wikimedia.org may not be licensed for re-use).
77 | 
78 | > Every contribution to the English Wikipedia has been licensed for re-use, including commercial, for-profit websites. Republication is not necessarily a breach of copyright, so long as the appropriate licenses are complied with.
79 | *https://en.wikipedia.org/wiki/Wikipedia:Mirrors_and_forks#Things_you_need_to_know* 80 | 81 | 82 | --- 83 | 84 | # [Table of Contents](https://docs.sweeting.me/s/self-host-a-wikipedia-mirror#TOC) 85 | 86 | [TOC] 87 | 88 | See the [HTML version](https://docs.sweeting.me/s/self-host-a-wikipedia-mirror#TOC) of this guide for the best browsing experience. See [pirate/wikipedia-mirror](https://github.com/pirate/wikipedia-mirror) on Github for example config source, docker-compose files, binaries, folder structure, and more. 89 | 90 | --- 91 | 92 | # Tutorial 93 | 94 | --- 95 | 96 | ## Prerequisites 97 | 98 | 1. **Provision a server to act as your Wikipedia mirror** 99 | 100 | You can use a cheap VPS provider like DigitalOcean, Vultr, Hetzner, etc. For the static ZIM archive and MediaWiki server methods you will need significant disk space, so a home server with a cheap external HD may be a better option. 101 | 102 | *The setup examples below are based on Ubuntu 19.04* running on a home server, however they should work across many other OS's with minimal tweaking (e.g. FreeBSD, macOS, Arch, etc.). 103 | 104 | 2. **Purchase a new domain or create a subdomain to host your mirror** 105 | 106 | You can use Google Domains, NameCheap, GoDaddy, etc. any registrar will work. 107 | 108 | *In the setup examples below, replace `wiki.example.com` with the domain you chose.* 109 | 110 | 3. **Point the DNS records for the domain to your mirror server** 111 | 112 | Configure these records via your DNS provider (e.g. NameCheap, DigitalOcean, CloudFlare, etc.): 113 | 114 | - `wiki.example.com` `A` -> `your server's public ip` (the root domain) 115 | - `en.wiki.example.com` `CNAME` -> `wiki.example.com` (the wiki domain) 116 | - `upload.wiki.example.com` `CNAME` -> `wiki.example.com` (the uploads/media domain) 117 | 118 | 4. **Create a directory to store the project, and a dotenv file for your config options** 119 | 120 | Not all of these values are needed for all the methods, but it's easier to just define all of them in one place and remove things later that turn out to be unneeded. 121 | 122 | ```bash 123 | mkdir -p /opt/wiki # change PROJECT_DIR below to match 124 | nano /opt/wiki/.env 125 | ``` 126 | Create the `.env` config file in [`dotenv`](https://docs.docker.com/compose/env-file/)/`bash` syntax with the contents below. 127 | *Make sure to replace the example values like `wiki.example.com` with your own.* 128 | ```bash 129 | PROJECT_DIR="/opt/wiki" # folder for all project state 130 | CONFIG_DIR="$PROJECT_DIR/etc/nginx" 131 | CACHE_DIR="$PROJECT_DIR/data/cache" 132 | CERTS_DIR="$PROJECT_DIR/data/certs" 133 | LOGS_DIR="$PROJECT_DIR/data/logs" 134 | 135 | LANG="en" # Wikipedia language to mirror 136 | LISTEN_PORT_HTTP="80" # public-facing HTTP port to bind 137 | LISTEN_PORT_HTTPS="443" # public-facing HTTPS port to bind 138 | LISTEN_HOST="wiki.example.com" # root domain to listen on 139 | LISTEN_WIKI="$LANG.$LISTEN_HOST" # wiki domain to listen on 140 | LISTEN_MEDIA="upload.$LISTEN_HOST" # uploads domain to listen on 141 | 142 | UPSTREAM_HOST="wikipedia.org" # main upstream domain 143 | UPSTREAM_WIKI="$LANG.$UPSTREAM_HOST" # upstream domain for wiki 144 | UPSTREAM_MEDIA="upload.wikimedia.org" # upstream domain for uploads 145 | 146 | # Only needed if using an nginx reverse proxy: 147 | SSL_CRT="$CERTS_DIR/$LISTEN_HOST.crt" 148 | SSL_KEY="$CERTS_DIR/$LISTEN_HOST.key" 149 | SSL_DH="$CERTS_DIR/$LISTEN_HOST.dh" 150 | 151 | CACHE_SIZE="100G" # or "500GB", "1GB", "200MB", etc. 
152 | CACHE_REQUESTS="GET HEAD POST" # or "GET HEAD", "any", etc. 153 | CACHE_RESPONSES="200 206 302" # or "200 302 404", "any", etc. 154 | CACHE_DURATION="max" # or "1d", "30m", "12h", etc. 155 | 156 | ACCESS_LOG="'$LOGS_DIR/nginx.out' trace" # or "off", etc. 157 | ERROR_LOG="'$LOGS_DIR/nginx.err' warn" # or "off", etc. 158 | ``` 159 | 160 | *The setup steps below depend on this file existing and the config values being correct, 161 | so make sure you create it and replace all example values with your own before proceeding!* 162 | 163 | --- 164 | 165 | ## Choosing a Wikipedia archive dump 166 | 167 | - https://download.kiwix.org/zim/wikipedia/ (for BitTorrent add `.torrent` to the end of any `.zim` url) 168 | - https://en.wikipedia.org/wiki/MediaWiki 169 | - https://www.mediawiki.org/wiki/MediaWiki 170 | - https://www.mediawiki.org/wiki/Download 171 | - https://www.wikidata.org/wiki/Wikidata:Database_download 172 | - https://dumps.wikimedia.org/backup-index.html 173 | 174 | ### ZIM Static HTML Dump 175 | 176 | Wikipedia HTML dumps are provided in a highly-compressed web-archiving format called [ZIM](https://openzim.org). They can be served using a ZIM server like Kiwix (the most common one), or [ZimReader](https://openzim.org/wiki/Zimreader), [GoZIM](https://github.com/akhenakh/gozim), & [others](https://openzim.org/wiki/Readers). 177 | 178 | - [Kiwix.org full ZIM archive list](https://wiki.kiwix.org/wiki/Content_in_all_languages) or [Kiwix.org Wikipedia-specific ZIM archive list](https://library.kiwix.org/#lang=eng&q=wikipedia) 179 | - [Wikimedia.org ZIM archive list](https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/) 180 | - [List of ZIM BitTorrent links](https://gist.github.com/maxogden/70674db0b5b181b8eeb1d3f9b638ab2a) 181 | 182 | ZIM archive dumps are usually published yearly, but the release schedule is not guaranteed. As of August 2019 the latest available dump containing all English articles is from October 2018: 183 | 184 | [`wikipedia_en_all_mini_2019-09.zim`](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_mini_2019-09.zim) ([torrent](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_mini_2019-09.zim.torrent)) (10GB, mini English articles, no pictures or video) 185 | 186 | [`wikipedia_en_all_nopic_2018-09.zim`](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_nopic_2018-09.zim) ([torrent](https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_nopic_2018-09.zim.torrent)) (35GB, all English articles, no pictures or video) 187 | 188 | **[`wikipedia_en_all_maxi_2018-10.zim`](https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim)** ([torrent](https://download.kiwix.org/zim/wikipedia_en_all_maxi.zim.torrent)) (79GB, all English articles w/ pictures, no video) 189 | 190 | [`wikipedia_en_simple_all_maxi_2020-01.zim`](https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/wikipedia_en_simple_all_maxi_2020-01.zim) (1.6GB, SimpleWiki English only, good for testing) 191 | 192 | **Download your chosen Wikipedia ZIM archive** (e.g. `wikipedia_en_all_maxi_2018-10.zim`) 193 | 194 | ```bash 195 | mkdir -p /opt/wiki/data/dumps && cd /opt/wiki/data/dumps 196 | 197 | # Download via BitTorrent: 198 | transmission-cli --download-dir . 
'magnet:?xt=urn:btih:O2F3E2JKCEEBCULFP2E2MRUGEVFEIHZW' 199 | 200 | # Or download via HTTPS from one of the mirrors: 201 | wget -c 'https://dumps.wikimedia.org/other/kiwix/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim' 202 | wget -c 'https://ftpmirror.your.org/pub/kiwix/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim' 203 | wget -c 'https://download.kiwix.org/zim/wikipedia/wikipedia_en_all_maxi_2018-10.zim' 204 | 205 | # Optionally after download, verify the length (fast) or MD5 checksum (slow): 206 | stat --printf="%s" wikipedia_en_all_maxi_2018-10.zim | grep 83853668638 207 | md5sum wikipedia_en_all_maxi_2018-10.zim | openssl dgst -md5 -binary | openssl enc -base64 | grep 01eMQki29P9vD5F2h6zWwQ 208 | ``` 209 | 210 | ### XML Database Dump 211 | 212 | - [WikiData.org Dump Types (JSON, RDF, XML)](https://www.wikidata.org/wiki/Wikidata:Database_download) 213 | - [List of Dumps (XML dumps)](https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia) 214 | - [List of Mirrors (XML dumps)](https://dumps.wikimedia.org/mirrors.html) 215 | 216 | Database dumps are usually published monthly. As of August 2019, the latest dump containing all English articles is from July 2019: 217 | 218 | **[`enwiki-20190720-pages-articles.xml.bz2`](https://meta.wikimedia.org/wiki/Data_dump_torrents#English_Wikipedia)** (15GB, all English articles, no pictures/videos) 219 | 220 | [`simplewiki-20170820-pages-meta-current.xml.bz2`](https://itorrents.org/torrent/B23A2BDC351E58E041D79F335A3CF872DEBAE919.torrent) (180MB, SimpleWiki only, good for testing) 221 | 222 | **Download your chosen Wikipedia XML dump** (e.g. `enwiki-20190720-pages-articles.xml.bz2`) 223 | 224 | ```bash 225 | mkdir -p /opt/wiki/data/dumps && cd /opt/wiki/data/dumps 226 | 227 | # Download via BitTorrent: 228 | transmission-cli --download-dir . 'magnet:?xl=16321006399&dn=enwiki-20190720-pages-articles.xml.bz2' 229 | 230 | # Download via HTTP: 231 | # lol no. no one wants to serve you a 15GB file via HTTP 232 | ``` 233 | 234 | --- 235 | 236 | ## Method #1: Run a caching proxy in front of Wikipedia.org 237 | 238 | > **Complexity:** Low 239 | > Minimal setup and operations requirements, no download of large dumps needed. 240 | > **Disk space requirements:** On-Demand 241 | > Disk is only used as pages are requested (can be 1gb up to 2TB+ depending on usage). 242 | > **CPU requirements:** Very Low 243 | > Lowest out of the three options, can be run on a tiny VPS or home-server. 244 | > **Content freshness:** Very Fresh 245 | > Configurable to cache content indefinitely or pull fresh data for every request. 246 | 247 | ### a. Running with Nginx 248 | 249 | Set the following options in your `/opt/wiki/.env` config file: 250 | `UPSTREAM_HOST=wikipedia.org` 251 | `UPSTREAM_WIKI=en.wikipedia.org` 252 | `UPSTREAM_MEDIA=upload.wikimedia.org` 253 | 254 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx. 255 | 256 | Then restart nginx to apply your config with `systemctl restart nginx`. 257 | 258 | Your mirror should now be running and proxying requests to Wikipedia.org! 259 | 260 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com). 261 | 262 | ### b. 
Running with Caddy 263 | 264 | Alternatively, check out a similar setup that uses Caddy instead of Nginx as the reverse proxy: https://github.com/CristianCantoro/wikiproxy 265 | 266 | --- 267 | 268 | ## Method #2: Serve the static HTML ZIM archive with Kiwix 269 | 270 | > **Complexity:** Moderate 271 | > Static binary makes it easy to run, but it requires downloading a large dump file. 272 | > **Disk space requirements:** >80GB 273 | > The ZIM archive is a highly-compressed collection of static HTML articles only. 274 | > **CPU requirements:** Very Low 275 | > Low, especially with a CDN in front (more than a proxy, but less than a full server). 276 | > **Content freshness:** Often Stale 277 | > ZIM archives are published yearly (ish) by Wikipedia.org. 278 | 279 | First download a ZIM archive dump like `wikipedia_en_all_maxi_2018-10.zim` into `/opt/wiki/data/dumps` as described above. 280 | 281 | 282 | ### a. Running with Docker 283 | 284 | Run `kiwix-serve` with docker like so: 285 | 286 | ```bash 287 | docker run \ 288 | -v '/opt/wiki/data/dumps:/data' \ 289 | -p 8888:80 \ 290 | kiwix/kiwix-serve \ 291 | 'wikipedia_en_all_maxi_2018-10.zim' 292 | ``` 293 | 294 | Or create `/opt/wiki/docker-compose.yml` and run `docker-compose up`: 295 | ```yml 296 | version: '3' 297 | services: 298 | kiwix: 299 | image: kiwix/kiwix-serve 300 | command: 'wikipedia_en_all_maxi_2018-10.zim' 301 | ports: 302 | - '8888:80' 303 | volumes: 304 | - "./data/dumps:/data" 305 | ``` 306 | 307 | ### b. Running with the static binary 308 | 309 | 1. **Download the latest `kiwix-serve` binary for your OS & CPU architecture** 310 | 311 | Find the latest release for your architecture here and copy its URL to download it below: 312 | https://download.kiwix.org/release/kiwix-tools/ 313 | 314 | ```bash 315 | cd /opt/wiki 316 | wget 'https://download.kiwix.org/release/kiwix-tools/kiwix-tools_linux-x86_64-3.0.1.tar.gz' 317 | tar -xzf 'kiwix-tools_linux-x86_64-3.0.1.tar.gz' 318 | mv 'kiwix-tools_linux-x86_64-3.0.1' 'bin' 319 | ``` 320 | 321 | 2. **Run `kiwix-serve`, passing it a port to listen on and your ZIM archive file** 322 | 323 | ```bash 324 | /opt/wiki/bin/kiwix-serve --port 8888 /opt/wiki/data/dumps/wikipedia_en_all_maxi_2018-10.zim 325 | ``` 326 | 327 | Your server should now be running! 328 | 329 | Visit http://en.yourdomainhere.com:8888 to see it in action! 330 | 331 | ### Optional Nginx Reverse Proxy 332 | 333 | Set the following options in your `/opt/wiki/.env` config file: 334 | ```bash 335 | UPSTREAM_HOST=localhost:8888 336 | UPSTREAM_WIKI=localhost:8888 337 | UPSTREAM_MEDIA=upload.wikimedia.org 338 | ``` 339 | 340 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx. To run nginx inside docker-compose next to Kiwix, see the [Run Nginx via docker-compose](#) section below. 341 | 342 | Your mirror should now be running and proxying requests to `kiwix-serve`! 343 | 344 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com). 345 | 346 | 347 | --- 348 | 349 | ## Method #3: Run a full MediaWiki server 350 | 351 | > **Complexity:** Very High 352 | > Complex multi-component setup with an intricate setup process and high resource use. 353 | > **Disk space requirements:** >550GB (>2TB needed for import phase) 354 | > The uncompressed database is very large (multiple TB with revision history and stubs). 355 | > **CPU requirements:** Moderate (very high during import phase) 356 | > Depends on usage, but it's the most demanding out of the 3 options. 
357 | > **Content freshness:** Very fresh 358 | > Udpated database dumps are published monthly (ish) by Wikipedia.org. 359 | 360 | First download a database dump like [`enwiki-20190720-pages-articles.xml.bz2`](magnet:?xl=16321006399&dn=enwiki-20190720-pages-articles.xml.bz2&xt=urn:tree:tiger:zpqgda3rbnycgtcujwpqi72aiv7tyasw7rp7sdi&xt=urn:ed2k:3b291214eb785df5b21cdb62623dd319&xt=urn:aich:zuy4dfbo2ppdhsdtmlev72fggdnka6ch&xt=urn:btih:9f08161276bc95ec594ce89ed52fe18fc41168a3&xt=urn:sha1:54cbdd5e5d1ca22b7dbd16463f81fdbcd6207bab&xt=urn:md5:9be9c811e0cc5c8418c869bb33eb516c&tr=udp%3a%2f%2ftracker.openbittorrent.com%3a80&as=http%3a%2f%2fdumps.wikimedia.freemirror.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=http%3a%2f%2fdumps.wikimedia.your.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=http%3a%2f%2fftp.acc.umu.se%2fmirror%2fwikimedia.org%2fdumps%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.freemirror.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.your.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fftp.acc.umu.se%2fmirror%2fwikimedia.org%2fdumps%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2&as=https%3a%2f%2fdumps.wikimedia.org%2fenwiki%2f20190720%2fenwiki-20190720-pages-articles.xml.bz2) into `/opt/wiki/data/dumps` as described above. 361 | 362 | If you need to decompress it, `pbzip2` is much faster than `bzip2`: 363 | ```bash 364 | pbzip2 -v -d -k -m10000 enwiki-20190720-pages-articles.xml.bz2 365 | # -m10000 tells it to use 10GB of RAM, adjust accordingly 366 | ``` 367 | 368 | ### a. Running with XOWA in Docker 369 | 370 | https://github.com/QuantumObject/docker-xowa 371 | 372 | ```bash 373 | docker run \ 374 | -v /opt/wiki/data/xowa:/opt/xowa/ \ 375 | -p 8888 \ 376 | sblop/xowa_offline_wikipedia 377 | ``` 378 | ```yaml 379 | version: '3' 380 | services: 381 | xowa: 382 | image: sblop/xowa_offline_wikipedia 383 | ports: 384 | - 8888:80 385 | volumes: 386 | - './data/xowa:/opt/xowa' 387 | ``` 388 | 389 | ### b. 
Running with MediaWiki in Docker 390 | 391 | - https://hub.docker.com/_/mediawiki 392 | - https://github.com/wikimedia/mediawiki-docker 393 | - https://github.com/AirHelp/mediawiki-docker 394 | - https://en.wikipedia.org/wiki/MediaWiki 395 | - https://www.mediawiki.org/wiki/MediaWiki 396 | - https://www.mediawiki.org/wiki/Download 397 | - https://www.wikidata.org/wiki/Wikidata:Database_download 398 | - https://dumps.wikimedia.org/backup-index.html 399 | 400 | 401 | **Configure your `docker-compose.yml` file** 402 | 403 | Default MediaWiki config file: https://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/DefaultSettings.php 404 | 405 | Create the following `/opt/wiki/docker-compose.yml` file then run `docker-compose up`: 406 | ```yml 407 | version: '3' 408 | services: 409 | database: 410 | image: mariadb 411 | command: --max-allowed-packet=256M 412 | environment: 413 | MYSQL_DATABASE: wikipedia 414 | MYSQL_USER: wikipedia 415 | MYSQL_PASSWORD: wikipedia 416 | MYSQL_ROOT_PASSWORD: wikipedia 417 | 418 | mediawiki: 419 | image: mediawiki 420 | ports: 421 | - 8080:80 422 | depends_on: 423 | - database 424 | volumes: 425 | - './data/html:/var/www/html' 426 | # After initial setup, download LocalSettings.php into ./data/html 427 | # and uncomment the following line, then docker-compose restart 428 | # - ./LocalSettings.php:/var/www/html/LocalSettings.php 429 | ``` 430 | 431 | 432 | **Then import the XML dump into the MediaWiki database:** 433 | - https://www.mediawiki.org/wiki/Manual:Importing_XML_dumps 434 | - https://hub.docker.com/r/ueland/mwdumper/ 435 | - https://www.mail-archive.com/wikitech-l@lists.wikimedia.org/msg02108.html 436 | 437 | **Do not attempt to import it directly with `importDump.php`, it will take months:** 438 | ```bash 439 | php /var/www/html/maintenance/importDump.php enwiki-20170320-pages-articles-multistream.xml 440 | ``` 441 | 442 | **Instead, convert the XML dump into compressed chunks of SQL then import individually:** 443 | 444 | *Warning: For large imports (e.g. 
English) this process can still take 5+ days depending on the system.*
445 | 
446 | ```bash
447 | apt install -y openjdk-8-jre zstd pbzip2
448 | 
449 | # Download the patched mwdumper version and the pre/post-import SQL scripts
450 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/bin/mwdumper-1.26.jar"
451 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/preimport.sql"
452 | wget "https://github.com/pirate/wikipedia-mirror/raw/master/postimport.sql"
453 | 
454 | DUMP_NAME="enwiki-20190720-pages-articles"
455 | 
456 | # Decompress the XML dump using all available cores and 10GB of memory
457 | pbzip2 -v -d -k -m10000 "$DUMP_NAME.xml.bz2"
458 | 
459 | # Convert the XML file into a SQL file using mwdumper
460 | java -server \
461 |     -jar ./mwdumper-1.26.jar \
462 |     --format=sql:1.5 \
463 |     "$DUMP_NAME.xml" \
464 |     > wikipedia.sql
465 | 
466 | # Split the generated SQL file into chunks, then compress the chunks and the pre/post-import scripts
467 | split --additional-suffix=".sql" --lines=1000 wikipedia.sql chunk_
468 | for partial in chunk_*.sql preimport.sql postimport.sql; do
469 |     zstd -z "$partial"
470 | done
471 | 
472 | # Fix a schema issue that may otherwise cause import bugs
473 | docker-compose exec database \
474 |     mysql --user=wikipedia --password=wikipedia --database=wikipedia \
475 |     --execute "ALTER TABLE page ADD page_counter bigint unsigned NOT NULL default 0;"
476 | 
477 | # Import the compressed chunks into the database, wrapped in the pre/post-import statements
478 | for partial in chunk_*.sql.zst; do
479 |     zstd -dc preimport.sql.zst "$partial" postimport.sql.zst \
480 |         | docker-compose exec -T database \
481 |             mysql --force --user=wikipedia --password=wikipedia --database=wikipedia
482 | done
483 | ```
484 | 
485 | Credit for these steps goes to https://github.com/wayneworkman/wikipedia-importing-tools.
486 | 
487 | 
488 | ### Optional Nginx Reverse Proxy
489 | 
490 | Set the following options in your `/opt/wiki/.env` config file (8080 is the host port that the MediaWiki container publishes in the docker-compose file above):
491 | ```bash
492 | UPSTREAM_HOST=localhost:8080
493 | UPSTREAM_WIKI=localhost:8080
494 | UPSTREAM_MEDIA=upload.wikimedia.org
495 | ```
496 | 
497 | Then run all the setup steps below under [Nginx Reverse Proxy](#) to set up Nginx. To run nginx inside docker-compose next to MediaWiki, see the [Run Nginx via docker-compose](#) section below.
498 | 
499 | Your mirror should now be running and proxying requests to your wiki server!
500 | 
501 | Visit https://en.yourdomainhere.com to see it in action (e.g. https://en.wiki.example.com).
502 | 
503 | ---
504 | 
505 | ## Nginx Reverse Proxy
506 | 
507 | You can optionally set up an Nginx reverse proxy in front of `kiwix-serve`, `Wikipedia.org`, or a `MediaWiki` server to add caching and HTTPS support.
508 | 
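Before wiring nginx up, it's worth confirming that whichever upstream you plan to proxy is reachable on its own. A quick sanity check (a sketch assuming the example ports used earlier in this guide: kiwix-serve on `localhost:8888`, the MediaWiki container published on `localhost:8080`):

```bash
# kiwix-serve from Method #2 (assumed to be listening on localhost:8888)
curl -sI 'http://localhost:8888/' | head -n 1

# MediaWiki from Method #3 (assumed to be published on localhost:8080)
curl -sI 'http://localhost:8080/' | head -n 1

# Wikipedia.org itself, if you're setting up a pure caching proxy (Method #1)
curl -sI 'https://en.wikipedia.org/wiki/Earth' | head -n 1
```
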
509 | Make sure the options in `/opt/wiki/.env` are configured correctly for the type of setup you're trying to achieve.
510 | 
511 | - To run nginx in front of `kiwix-serve` on localhost, set:
512 |   `UPSTREAM_HOST=localhost:8888`
513 |   `UPSTREAM_WIKI=localhost:8888`
514 |   `UPSTREAM_MEDIA=upload.wikimedia.org`
515 | - To run nginx in front of Wikipedia.org, set:
516 |   `UPSTREAM_HOST=wikipedia.org`
517 |   `UPSTREAM_WIKI=en.wikipedia.org`
518 |   `UPSTREAM_MEDIA=upload.wikimedia.org`
519 | - To run nginx in front of a MediaWiki server on localhost, set:
520 |   `UPSTREAM_HOST=localhost:8080`
521 |   `UPSTREAM_WIKI=localhost:8080`
522 |   `UPSTREAM_MEDIA=upload.wikimedia.org`
523 | - To run nginx in front of a docker container via docker-compose:
524 |   *See the [Run Nginx via docker-compose](#) section below.*
525 | 
526 | ### Install LetsEncrypt and Nginx
527 | 
528 | ```bash
529 | # Install the dependencies: nginx and certbot
530 | add-apt-repository -y -n universe
531 | add-apt-repository -y -n ppa:certbot/certbot
532 | add-apt-repository -y -n ppa:nginx/stable
533 | apt update -qq
534 | apt install -y nginx-extras certbot python3-certbot-nginx
535 | systemctl enable nginx
536 | systemctl start nginx
537 | ```
538 | 
539 | ### Obtain an SSL certificate via LetsEncrypt
540 | ```bash
541 | # Load your config values from step 4 into the environment, and create dirs
542 | source /opt/wiki/.env
543 | mkdir -p "$CONFIG_DIR" "$CACHE_DIR" "$CERTS_DIR" "$LOGS_DIR"
544 | 
545 | # Get an SSL certificate and generate the Diffie-Hellman parameters file
546 | certbot certonly \
547 |     --nginx \
548 |     --agree-tos \
549 |     --non-interactive \
550 |     -m "ssl@$LISTEN_HOST" \
551 |     --domain "$LISTEN_HOST,$LISTEN_WIKI,$LISTEN_MEDIA"
552 | openssl dhparam -out "$PROJECT_DIR/data/certs/$LISTEN_HOST.dh" 2048
553 | 
554 | # Link the certs into your project directory
555 | ln -s "/etc/letsencrypt/live/$LISTEN_HOST/fullchain.pem" "$PROJECT_DIR/data/certs/$LISTEN_HOST.crt"
556 | ln -s "/etc/letsencrypt/live/$LISTEN_HOST/privkey.pem" "$PROJECT_DIR/data/certs/$LISTEN_HOST.key"
557 | ```
558 | 
559 | LetsEncrypt certs must be renewed every 90 days or they'll expire and you'll get "Invalid Certificate" errors. To have certs renewed automatically, add a systemd timer or cron job that runs `certbot renew` periodically. Here's an example tutorial on how to do that:
560 | https://gregchapple.com/2018/02/16/auto-renew-lets-encrypt-certs-with-systemd-timers/
561 | 
562 | ### Populate the nginx.conf template with your config
563 | 
564 | ```bash
565 | # Load your config options into the environment
566 | source /opt/wiki/.env
567 | 
568 | 
569 | # Download the nginx config template (--location follows GitHub's redirect to the raw file)
570 | curl --silent --location \
571 |     "https://github.com/pirate/wikipedia-mirror/raw/master/etc/nginx/nginx.conf.template" \
572 |     > "$CONFIG_DIR/nginx.conf.template"
573 | 
574 | # Fill your config options into nginx.conf.template to create nginx.conf
575 | envsubst \
576 |     "$(printf '${%s} ' $(bash -c "compgen -A variable"))" \
577 |     < "$CONFIG_DIR/nginx.conf.template" \
578 |     > "$CONFIG_DIR/nginx.conf"
579 | ```
580 | 
581 | 
582 | ### Run Nginx via systemd
583 | ```bash
584 | # Link your nginx.conf into the system's default nginx config location
585 | ln -s -f "$CONFIG_DIR/nginx.conf" "/etc/nginx/nginx.conf"
586 | 
587 | # Restart nginx to load the new config
588 | systemctl restart nginx
589 | ```
590 | 
591 | Now you can visit https://en.yourdomainhere.com to see it in action with HTTPS!
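To confirm that responses are actually being cached, you can look at the debug headers that the bundled `nginx.conf.template` adds to every response (`X-Cache-Status`, `X-Handled-By`, `X-Request-Id`). A rough check, using the hypothetical `en.wiki.example.com` domain from the examples above:

```bash
# The first request should report "X-Cache-Status: MISS" (or EXPIRED);
# repeating it should report "HIT" once nginx has stored the response in its cache.
curl -sI 'https://en.wiki.example.com/' | grep -i -E '^HTTP|x-cache-status|x-handled-by'
```
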
592 | 593 | For troubleshooting, you can find the nginx logs here: 594 | `/opt/wiki/data/logs/nginx.err` 595 | `/opt/wiki/data/logs/nginx.out` 596 | 597 | ### Run Nginx via docker-compose 598 | 599 | Set the config values in your `/opt/wiki/.env` file to correspond to the docker container's hostname that you want to proxy, and tweak the directory paths to be the paths inside the container. e.g. for `mediawiki`: 600 | ```bash 601 | UPSTREAM_HOST=mediawiki:8888` 602 | UPSTREAM_WIKI=mediawiki:8888` 603 | UPSTREAM_MEDIA=upload.wikimedia.org 604 | 605 | CERTS_DIR=/certs 606 | CACHE_DIR=/cache 607 | LOGS_DIR=/logs 608 | ``` 609 | 610 | Then regenerate your `nginx.conf` file with `envsubst` as described in [Nginx Reverse Proxy](#Nginx-Reverse-Proxy) below. 611 | 612 | Then add the `nginx` service to your existing `/opt/wiki/docker-compose.yml` file: 613 | ```bash 614 | version: '3' 615 | services: 616 | 617 | ... 618 | 619 | nginx: 620 | image: nginx:latest 621 | volumes: 622 | - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf 623 | - ./data/certs:/certs 624 | - ./data/cache:/cache 625 | - ./data/logs:/logs 626 | ports: 627 | - 80:80 628 | - 443:443 629 | ``` 630 | 631 | --- 632 | 633 | # Further Reading 634 | 635 | - https://github.com/openzim/mwoffliner (archiving only, no serving) 636 | - https://www.yunqa.de/delphi/products/wikitaxi/index (Windows only) 637 | - https://www.nongnu.org/wp-mirror/ (last updated in 2014, [Dockerfile](https://github.com/futpib/docker-wp-mirror/blob/master/Dockerfile)) 638 | - https://github.com/dustin/go-wikiparse 639 | - https://www.learn4master.com/tools/python-and-java-libraries-to-parse-wikipedia-dump-dataset 640 | - https://dkpro.github.io/dkpro-jwpl/ 641 | - https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c 642 | - https://meta.wikimedia.org/wiki/Data_dumps/Import_examples#Import_into_an_empty_wiki_of_a_subset_of_en_wikipedia_on_Linux_with_MySQL 643 | - https://github.com/shimondoodkin/wikipedia-dump-import-script/blob/master/example-result.sh 644 | - https://github.com/wayneworkman/wikipedia-importing-tools 645 | - https://github.com/chrisbo246/mediawiki-loader 646 | - https://dzone.com/articles/how-clone-wikipedia-and-index 647 | - https://www.xarg.org/2016/06/importing-entire-wikipedia-into-mysql/ 648 | - https://dengruo.com/blog/running-mediawiki-your-own-copy-restore-whole-mediwiki-backup 649 | - https://brionv.com/log/2007/10/02/wiki-data-dumps/ 650 | - https://www.evanjones.ca/software/wikipedia2text.html 651 | - https://lists.gt.net/wiki/wikitech/160482 652 | - https://helpful.knobs-dials.com/index.php/Harvesting_wikipedia 653 | - https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community 654 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-merlot -------------------------------------------------------------------------------- /bin/kiwix-serve: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/bin/kiwix-serve -------------------------------------------------------------------------------- /bin/mwdumper-1.26.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/bin/mwdumper-1.26.jar -------------------------------------------------------------------------------- /data/cache/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/cache/README.md -------------------------------------------------------------------------------- /data/certs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/certs/README.md -------------------------------------------------------------------------------- /data/dumps/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/dumps/README.md -------------------------------------------------------------------------------- /data/logs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/logs/README.md -------------------------------------------------------------------------------- /data/zim/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirate/wikipedia-mirror/34e620346f2a60e1fa4be03e82cac605de9a1571/data/zim/README.md -------------------------------------------------------------------------------- /docker-compose.kiwix.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | kiwix: 4 | image: kiwix/kiwix-serve 5 | command: 'wikipedia_en_all_novid_2018-10.zim' 6 | ports: 7 | - '8888:80' 8 | volumes: 9 | - "./data/zim:/data" 10 | -------------------------------------------------------------------------------- /docker-compose.mediawiki.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | database: 4 | image: mariadb 5 | command: --max-allowed-packet=256M 6 | environment: 7 | MYSQL_DATABASE: wikipedia 8 | MYSQL_USER: wikipedia 9 | MYSQL_PASSWORD: wikipedia 10 | MYSQL_ROOT_PASSWORD: wikipedia 11 | 12 | mediawiki: 13 | image: mediawiki 14 | ports: 15 | - 8080:80 16 | depends_on: 17 | - database 18 | volumes: 19 | - './data/html:/var/www/html' 20 | # After initial setup, download LocalSettings.php into ./data/html 21 | # and uncomment the following line, then docker-compose restart 22 | # - ./LocalSettings.php:/var/www/html/LocalSettings.php 23 | -------------------------------------------------------------------------------- /docker-compose.nginx.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | nginx: 4 | image: nginx:latest 5 | volumes: 6 | - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf 7 | - ./data/certs:/certs 8 | - ./data/cache:/cache 9 | - ./data/logs:/logs 10 | ports: 11 | - 80:80 12 | - 443:443 13 | -------------------------------------------------------------------------------- /docker-compose.xowa.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | xowa: 4 | image: sblop/xowa_offline_wikipedia 5 | ports: 6 | - 8888:80 7 | 
volumes: 8 | - './data/xowa:/opt/xowa' 9 | -------------------------------------------------------------------------------- /etc/nginx/nginx.conf.template: -------------------------------------------------------------------------------- 1 | daemon on; 2 | user www-data; 3 | pid /var/run/nginx.pid; 4 | timer_resolution 100ms; 5 | worker_processes auto; 6 | events { 7 | worker_connections 1024; 8 | } 9 | 10 | http { 11 | # Logging Settings 12 | log_format trace '$remote_addr - $remote_user [$time_local] "$request" ' 13 | '$status $body_bytes_sent "$http_referer" "$http_user_agent" ' 14 | '"$http_x_forwarded_for" $request_id'; 15 | access_log $ACCESS_LOG; 16 | error_log $ERROR_LOG; 17 | 18 | # MIME Type Settings 19 | types_hash_max_size 512; 20 | default_type application/octet-stream; 21 | include /etc/nginx/mime.types; 22 | 23 | # GZIP Settings 24 | gzip on; 25 | gzip_vary on; 26 | gzip_proxied any; 27 | gzip_comp_level 6; 28 | gzip_min_length 1000; 29 | gzip_buffers 4 32k; 30 | gzip_types text/plain 31 | text/css 32 | text/xml 33 | 34 | font/ttf 35 | font/woff 36 | font/woff2 37 | 38 | application/json 39 | application/xhtml+xml 40 | application/rss+xml 41 | application/atom_xml 42 | application/javascript 43 | application/x-javascript; 44 | 45 | 46 | # Connection Settings 47 | resolver 1.1.1.1 8.8.8.8; # DNS server to use when resolving hosts for upstreams or cert chains 48 | resolver_timeout 5s; # timeout DNS requests as failed after this many seconds waiting for a response 49 | tcp_nopush on; # enables NGINX to send HTTP response headers in one packet right after the chunk of data has been obtained by sendfile() 50 | tcp_nodelay on; # don't wait 200ms to collect response headers and data before sending directly from filesystem 51 | port_in_redirect off; # when proxying redirects, strip any custom upstream ports from the url sent to the client 52 | slice 1m; # allow breaking up files into slices so as not to block on loading an entire file to only request a small range 53 | sendfile on; # send static files direclty from filesystem without buffering in memory 54 | sendfile_max_chunk 2m; # limit each filesystem chunk sent to 1mb to prevent one connection from eating all resources 55 | send_timeout 20s; # wait up to 20s before closing response connections where client has stopped accepting response data 56 | keepalive_timeout 60s; # allow up to 60s total before closing unresponsive/dead request connections 57 | client_header_timeout 15s; # don't wait more than 15s for client to send request headers 58 | client_body_timeout 15s; # don't wait more than 15s for client to send request body 59 | client_max_body_size 50m; # maximum file upload / request size (increase to allow larger file uploads) 60 | client_body_buffer_size 16k; # buffer size for reading client request body (should be 2 pages, aka 16k on 64bit systems) 61 | client_header_buffer_size 1k; # buffer size for reading client request header (for most requests, a buffer of 1K bytes is enough unless there are long cookies) 62 | large_client_header_buffers 4 8k; # maximum number and size of buffers used for reading large client request header (A request line cannot exceed the size of one buffer) 63 | http2_push_preload on; # enable http2 pushing of files before client requests them 64 | http2_max_concurrent_pushes 10; # limit concurrent server pushes to prevent overwhelming client 65 | http2_max_concurrent_streams 128; # maximum number of concurrent HTTP/2 streams in a connection 66 | 67 | # Security Settings 68 | ssl_stapling on; # 
instruct clients to only allow site to be served with local SSL cert 69 | ssl_stapling_verify on; # check to make sure no one else is serving with a different cert 70 | ssl_protocols TLSv1.2 TLSv1.3; # only allow modern SSL protocols 71 | ssl_session_cache shared:SSL:50m; # enable quick-resume of previous ssl sessions 72 | ssl_session_timeout 5d; # store ssl session cache entries for 5 days 73 | ssl_session_tickets off; # session tickets break forward secrecy 74 | ssl_ciphers ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-AES128-SHA256:ECDHE-RSA-AES128-SHA256:ECDHE-ECDSA-AES128-SHA:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA:ECDHE-ECDSA-AES256-SHA384:ECDHE-ECDSA-AES256-SHA:ECDHE-RSA-AES256-SHA:DHE-RSA-AES128-SHA256:DHE-RSA-AES128-SHA:DHE-RSA-AES256-SHA256:DHE-RSA-AES256-SHA:ECDHE-ECDSA-DES-CBC3-SHA:ECDHE-RSA-DES-CBC3-SHA:EDH-RSA-DES-CBC3-SHA:AES128-GCM-SHA256:AES256-GCM-SHA384:AES128-SHA256:AES256-SHA256:AES128-SHA:AES256-SHA:DES-CBC3-SHA:!DSS; 75 | ssl_ecdh_curve secp384r1; # use a strong curve function for encryption 76 | ssl_prefer_server_ciphers on; # prevent downgrade attacks to weaker cipher suites 77 | server_tokens off; # hide nginx version info in error pages and headers 78 | 79 | # Reverse Proxy Settings 80 | proxy_socket_keepalive on; # keep the upstream connection open instead of opening a new connection every time 81 | proxy_request_buffering off; # dont wait for the full request to arrive before passing body to upstream 82 | proxy_buffering off; # dont wait for full response to complete before passing body to client 83 | proxy_http_version 1.1; # Properly proxy websocket connections 84 | proxy_read_timeout 120s; # terminate websockets/dead nginx<-django connections afer 5min of inactivity 85 | proxy_cache_path "$CACHE_DIR" levels=1:2 keys_zone=main:16M max_size=$CACHE_SIZE inactive=1440h use_temp_path=off; 86 | proxy_cache_key "$request_uri$is_args$args$slice_range"; 87 | proxy_cache_methods $CACHE_REQUESTS; 88 | proxy_cache_valid $CACHE_RESPONSES $CACHE_DURATION; 89 | proxy_cache_valid 404 10m; # allow articles not found to be refreshed more frequently than cache duration in case they're created 90 | # proxy_cache_bypass $http_cache_control; # bypass cache if client requests with Cache-Control: max-age=0 91 | proxy_cache_use_stale error updating invalid_header timeout http_500 http_502 http_503 http_504; 92 | proxy_cache_revalidate on; # use If-Modified-Since to revalidate cached requests if they expire instead of re-downloading full response 93 | proxy_cache_lock on; # if 2 requests come in, try to only make 1 upstream request to handle them both 94 | proxy_cache_lock_age 5s; # timeout to wait for in-progress caching before sending request directly to upstream 95 | proxy_cache_lock_timeout 5s; # timeout to wait for in-progress caching before sending request directly to upstream 96 | proxy_ignore_headers X-Accel-Expires; # ignore upstream caching recommendations for nginx response caching 97 | proxy_ignore_headers Expires; # ignore upstream caching recommendations for nginx response caching 98 | proxy_ignore_headers Cache-Control; # ignore upstream caching recommendations for nginx response caching 99 | proxy_ignore_headers Set-Cookie; # cache responses even when cookies are set 100 | proxy_hide_header X-Accel-Expires; # hide upstream caching recommendation from client 101 | 
proxy_hide_header Expires; # hide upstream caching recommendation from client 102 | proxy_hide_header Cache-Control; # hide upstream caching recommendation from client 103 | # proxy_hide_header "Set-Cookie"; # prevent upstream cookies being set on clients at all 104 | proxy_cookie_domain .$UPSTREAM_HOST $host; # rewrite cookie domains to proxied equivalents 105 | proxy_cookie_domain $UPSTREAM_HOST $host; # rewrite cookie domains to proxied equivalents 106 | proxy_cookie_domain $UPSTREAM_WIKI $host; # rewrite cookie domains to proxied equivalents 107 | proxy_cookie_domain $UPSTREAM_MEDIA $host; # rewrite cookie domains to proxied equivalents 108 | 109 | 110 | # Server Definition 111 | server { 112 | listen $LISTEN_PORT_HTTP default_server; 113 | listen $LISTEN_PORT_HTTPS ssl http2 default_server; 114 | 115 | server_name $LISTEN_HOST $LISTEN_WIKI $LISTEN_MEDIA 116 | $UPSTREAM_HOST $UPSTREAM_WIKI $UPSTREAM_MEDIA; 117 | 118 | ssl_certificate_key $SSL_KEY; 119 | ssl_certificate $SSL_CRT; 120 | ssl_trusted_certificate $SSL_CRT; 121 | ssl_dhparam $SSL_DH; 122 | 123 | proxy_cache wikiproxy; 124 | 125 | error_page 497 https://$host$request_uri; # redirect http:443 to https:443 126 | if ($scheme = http) { 127 | return 301 https://$host$request_uri; # redirect http:80 to https:443 128 | } 129 | 130 | location / { 131 | try_files $uri $uri/ @upstream; 132 | 133 | # Replace any domains in response body text with proxied equivalents 134 | subs_filter_types text/html text/css text/xml text/javascript; 135 | subs_filter $UPSTREAM_WIKI $LISTEN_WIKI gi; 136 | subs_filter $UPSTREAM_MEDIA $LISTEN_MEDIA gi; 137 | subs_filter $UPSTREAM_HOST $LISTEN_HOST gi; 138 | 139 | # Add headers to the response sent to the client 140 | add_header Cache-Control "public"; 141 | add_header X-Handled-By "$upstream_addr"; 142 | add_header X-Cache-Status "$upstream_cache_status"; 143 | add_header X-Request-Id "$request_id"; 144 | add_header X-Content-Type-Options "nosniff"; 145 | add_header X-XSS-Protection "1; mode=block"; 146 | add_header Referrer-Policy "strict-origin-when-cross-origin"; 147 | add_header Strict-Transport-Security "max-age=31536000; includeSubDomains"; 148 | add_header X-Robots-Tag "noindex"; 149 | 150 | # Cache responses for the configured amount of time 151 | expires $CACHE_DURATION; 152 | } 153 | 154 | location @upstream { 155 | if ($host ~ "^($LISTEN_HOST|$UPSTREAM_HOST)$") { 156 | set $xupstream $UPSTREAM_HOST; 157 | } 158 | if ($host ~ "^($LISTEN_WIKI|$UPSTREAM_WIKI)$") { 159 | set $xupstream $UPSTREAM_WIKI; 160 | } 161 | if ($host ~ "^($LISTEN_MEDIA|$UPSTREAM_MEDIA)$") { 162 | set $xupstream $UPSTREAM_MEDIA; 163 | } 164 | 165 | # Add headers to the request sent to the upstream server 166 | proxy_set_header Host "$xupstream"; 167 | proxy_set_header Range "$slice_range"; 168 | proxy_set_header Upgrade "$http_upgrade"; 169 | proxy_set_header Connection "upgrade"; 170 | proxy_set_header X-Request-Id "$request_id"; 171 | proxy_set_header X-Real-Ip "$remote_addr"; 172 | proxy_set_header X-Forwarded-Host "$host"; 173 | proxy_set_header X-Forwarded-Server "$host"; 174 | proxy_set_header X-Forwarded-For "$proxy_add_x_forwarded_for"; 175 | proxy_set_header X-Forwarded-Protocol "$scheme"; 176 | 177 | proxy_pass https://$xupstream; 178 | } 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /postimport.sql: -------------------------------------------------------------------------------- 1 | COMMIT; 2 | SET autocommit=1; 3 | SET unique_checks=1; 4 | SET 
foreign_key_checks=1; 5 | -------------------------------------------------------------------------------- /preimport.sql: -------------------------------------------------------------------------------- 1 | SET autocommit=0; 2 | SET unique_checks=0; 3 | SET foreign_key_checks=0; 4 | BEGIN; 5 | --------------------------------------------------------------------------------