├── .devcontainer
│   └── devcontainer.json
├── .dockerignore
├── .editorconfig
├── .eslintignore
├── .eslintrc
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   └── open_an_issue.md
│   ├── config.yml
│   └── workflows
│       ├── generated-pr.yml
│       └── stale.yml
├── .gitignore
├── .prettierrc.js
├── BENCHMARKS.md
├── Dockerfile
├── README.md
├── assets
│   ├── wikipedia-on-ipfs-100px.png
│   ├── wikipedia-on-ipfs-small-flat-cropped-offset-min.png
│   ├── wikipedia-on-ipfs.png
│   ├── wikipedia-on-ipfs.psd
│   ├── wikipedia-on-ipfs.pxm
│   └── wikipedia-on-ipfs.pxm.zip
├── bin
│   ├── run
│   └── run.cmd
├── legacy-pipeline
│   ├── demo
│   │   ├── wikipedia-on-ipfs-offset.png
│   │   └── wikipedia-on-ipfs.png
│   ├── execute-changes.sh
│   ├── redirect-page
│   │   ├── index.html
│   │   └── index_root.html
│   └── scripts
│       ├── body.js
│       ├── search-shim.js
│       └── search.js
├── mirrorzim.sh
├── package.json
├── snapshot-hashes.yml
├── snapshots
│   └── .ignore
├── src
│   ├── article-transforms.ts
│   ├── domain.ts
│   ├── index.ts
│   ├── site-transforms.ts
│   ├── templates
│   │   ├── footer_fragment.handlebars
│   │   ├── index_redirect_fragment.handlebars
│   │   └── site.js.handlebars
│   ├── utils
│   │   ├── assert-never.ts
│   │   ├── check-unpacked-zim-dir.ts
│   │   ├── download-file.ts
│   │   └── walk-files.ts
│   └── zim-to-website.ts
├── test
│   ├── index.test.ts
│   ├── mocha.opts
│   └── tsconfig.json
├── tools
│   ├── find_main_page_name.sh
│   ├── find_original_main_page_url.sh
│   └── getzim.sh
├── tsconfig.json
└── yarn.lock
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at:
2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.101.1/containers/docker-existing-dockerfile
3 | {
4 | "name": "Distributed-Docker-Mirror-Dev",
5 |
6 | // Sets the run context to one level up instead of the .devcontainer folder.
7 | "context": "..",
8 |
9 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
10 | "dockerFile": "../Dockerfile",
11 |
12 | // Set *default* container specific settings.json values on container create.
13 | "settings": {
14 | "terminal.integrated.shell.linux": null
15 | },
16 |
17 | // Add the IDs of extensions you want installed when the container is created.
18 | "extensions": [],
19 |
20 | // Use 'forwardPorts' to make a list of ports inside the container available locally.
21 | "forwardPorts": [8080],
22 |
23 | 	// Run commands after the container is created - for example installing dependencies.
24 | "postCreateCommand": "yarn"
25 |
26 | // Uncomment when using a ptrace-based debugger like C++, Go, and Rust
27 | // "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ],
28 |
29 | // Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker.
30 | // "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ],
31 |
32 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root.
33 | // "remoteUser": "vscode"
34 | }
35 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | out/
2 | tmp/
3 | snapshots/
4 | zim-to-website/
5 | extract_zim/
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 2
6 | charset = utf-8
7 | trim_trailing_whitespace = true
8 | insert_final_newline = true
9 |
10 | [*.md]
11 | trim_trailing_whitespace = false
12 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | /lib
2 | /tmp
3 | /snapshots
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["oclif", "oclif-typescript", "plugin:prettier/recommended"],
3 | "plugins": ["simple-import-sort"],
4 | "rules": {
5 | "unicorn/no-abusive-eslint-disable": ["off"],
6 | "simple-import-sort/sort": "error",
7 | "@typescript-eslint/member-delimiter-style": [
8 | "error",
9 | {
10 | "multiline": {
11 | "delimiter": "none"
12 | },
13 | "singleline": {
14 | "delimiter": "semi"
15 | }
16 | }
17 | ]
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Getting Help on IPFS
4 | url: https://ipfs.io/help
5 | about: All information about how and where to get help on IPFS.
6 | - name: IPFS Official Forum
7 | url: https://discuss.ipfs.io
8 | about: Please post general questions, support requests, and discussions here.
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/open_an_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Open an issue
3 | about: Only for actionable issues relevant to this repository.
4 | title: ''
5 | labels: need/triage
6 | assignees: ''
7 |
8 | ---
9 |
20 |
--------------------------------------------------------------------------------
/.github/config.yml:
--------------------------------------------------------------------------------
1 | # Configuration for welcome - https://github.com/behaviorbot/welcome
2 |
3 | # Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome
4 | # Comment to be posted on first-time issues
5 | newIssueWelcomeComment: >
6 | Thank you for submitting your first issue to this repository! A maintainer
7 | will be here shortly to triage and review.
8 |
9 | In the meantime, please double-check that you have provided all the
10 | necessary information to make this process easy! Any information that can
11 | help save additional round trips is useful! We currently aim to give
12 | initial feedback within **two business days**. If this does not happen, feel
13 | free to leave a comment.
14 |
15 | Please keep an eye on how this issue will be labeled, as labels give an
16 | overview of priorities, assignments and additional actions requested by the
17 | maintainers:
18 |
19 | - "Priority" labels will show how urgent this is for the team.
20 | - "Status" labels will show if this is ready to be worked on, blocked, or in progress.
21 | - "Need" labels will indicate if additional input or analysis is required.
22 |
23 | Finally, remember to use https://discuss.ipfs.io if you just need general
24 | support.
25 |
26 | # Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome
27 | # Comment to be posted on PRs from first-time contributors in your repository
28 | newPRWelcomeComment: >
29 | Thank you for submitting this PR!
30 |
31 | A maintainer will be here shortly to review it.
32 |
33 | We are super grateful, but we are also overloaded! Help us by making sure
34 | that:
35 |
36 | * The context for this PR is clear, with relevant discussion, decisions
37 | and stakeholders linked/mentioned.
38 |
39 | * Your contribution itself is clear (code comments, self-review for the
40 | rest) and in its best form. Follow the [code contribution
41 | guidelines](https://github.com/ipfs/community/blob/master/CONTRIBUTING.md#code-contribution-guidelines)
42 | if they apply.
43 |
44 | Getting other community members to do a review would be great help too on
45 | complex PRs (you can ask in the chats/forums). If you are unsure about
46 | something, just leave us a comment.
47 |
48 | Next steps:
49 |
50 | * A maintainer will triage and assign priority to this PR, commenting on
51 | any missing things and potentially assigning a reviewer for high
52 | priority items.
53 |
54 | * The PR gets reviews, discussed and approvals as needed.
55 |
56 | * The PR is merged by maintainers when it has been approved and comments addressed.
57 |
58 | We currently aim to provide initial feedback/triaging within **two business
59 | days**. Please keep an eye on any labelling actions, as these will indicate
60 | priorities and status of your contribution.
61 |
62 | We are very grateful for your contribution!
63 |
64 |
65 | # Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge
66 | # Comment to be posted to on pull requests merged by a first time user
67 | # Currently disabled
68 | #firstPRMergeComment: ""
69 |
--------------------------------------------------------------------------------
/.github/workflows/generated-pr.yml:
--------------------------------------------------------------------------------
1 | name: Close Generated PRs
2 |
3 | on:
4 | schedule:
5 | - cron: '0 0 * * *'
6 | workflow_dispatch:
7 |
8 | permissions:
9 | issues: write
10 | pull-requests: write
11 |
12 | jobs:
13 | stale:
14 | uses: ipdxco/unified-github-workflows/.github/workflows/reusable-generated-pr.yml@v1
15 |
--------------------------------------------------------------------------------
/.github/workflows/stale.yml:
--------------------------------------------------------------------------------
1 | name: Close Stale Issues
2 |
3 | on:
4 | schedule:
5 | - cron: '0 0 * * *'
6 | workflow_dispatch:
7 |
8 | permissions:
9 | issues: write
10 | pull-requests: write
11 |
12 | jobs:
13 | stale:
14 | uses: ipdxco/unified-github-workflows/.github/workflows/reusable-stale-issue.yml@v1
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .cache
3 | *.zim
4 | out
5 | IPFS_PATH
6 |
7 | notes.md
8 |
9 | *-debug.log
10 | *-error.log
11 | /.nyc_output
12 | /dist
13 | /lib
14 | /package-lock.json
15 | /tmp
16 | /snapshots
17 | node_modules
18 | /zim-tools
19 | /kiwix-tools
20 |
21 | bin/zimdump
22 |
--------------------------------------------------------------------------------
/.prettierrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | semi: false,
3 | trailingComma: "none",
4 | singleQuote: true,
5 | printWidth: 80,
6 | tabWidth: 2
7 | };
--------------------------------------------------------------------------------
/BENCHMARKS.md:
--------------------------------------------------------------------------------
1 | Below are some quick notes on how long things take.
2 |
3 | - 2021-Q1 with 1TB NVMe SSD, i7-4770S
4 | - unpacking wikipedia_en_all_maxi_2021-02 + fixing exceptions on ssd ~3h50m
5 | (not a full build, this is without IPFS import)
6 | - full build from wikipedia_ru_all_maxi_2021-03.zim with badger ~4h11m
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:stable
2 |
3 | ENV DEBIAN_FRONTEND=noninteractive
4 |
5 | RUN apt update
6 | RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils
7 |
8 | # install:
9 | # - node and yarn
10 | # - go-ipfs
11 | RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \
12 | && bash nodesource_setup.sh \
13 | && apt -y install --no-install-recommends nodejs \
14 | && npm install -g yarn \
15 | && wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \
16 | && tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \
17 | && mv go-ipfs/ipfs /usr/local/bin/ipfs \
18 | && rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \
19 | && ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \
20 | && ipfs config --json 'Experimental.ShardingEnabled' true
21 |
22 | # TODO: move repo init after external volume is mounted
23 |
24 | ENV DEBIAN_FRONTEND=dialog
25 |
26 | RUN mkdir /root/distributed-wikipedia-mirror
27 | VOLUME ["/root/distributed-wikipedia-mirror"]
28 | WORKDIR /root/distributed-wikipedia-mirror
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Wikipedia Mirror Project
2 | 
3 | Putting Wikipedia Snapshots on IPFS and working towards making it fully read-write.
4 | 
11 | ## Existing Mirrors
12 |
13 | There are various ways one can access the mirrors: through a [DNSLink](https://docs.ipfs.tech/concepts/glossary/#dnslink), public [gateway](https://docs.ipfs.tech/concepts/glossary/#gateway) or directly with a [CID](https://docs.ipfs.tech/concepts/glossary/#cid).
14 |
15 | You can [read all about the available methods here](https://blog.ipfs.tech/2021-05-31-distributed-wikipedia-mirror-update/#improved-access-to-wikipedia-mirrors).
16 |
17 | ### DNSLinks
18 |
19 | - https://en.wikipedia-on-ipfs.org
20 | - https://tr.wikipedia-on-ipfs.org
21 | - https://my.wikipedia-on-ipfs.org
22 | - https://ar.wikipedia-on-ipfs.org
23 | - https://zh.wikipedia-on-ipfs.org
24 | - https://uk.wikipedia-on-ipfs.org
25 | - https://ru.wikipedia-on-ipfs.org
26 | - https://fa.wikipedia-on-ipfs.org
27 |
28 | ### CIDs
29 |
30 | The latest CIDs that the DNSLinks point at can be found in [snapshot-hashes.yml](snapshot-hashes.yml).
31 |
32 | ---
33 |
34 | Each mirror has a link to the original [Kiwix](https://kiwix.org) ZIM archive in the footer. It can be downloaded and opened offline with the [Kiwix Reader](https://www.kiwix.org/en/download/).
35 |
36 | ## Table of Contents
37 |
38 | - [Purpose](#purpose)
39 | - [How to add new Wikipedia snapshots to IPFS](#how-to-add-new-wikipedia-snapshots-to-ipfs)
40 |   - [Manual build](#manual-build)
41 |   - [Docker build](#docker-build)
42 | - [How to help](#how-to-help)
43 |   - [Cohost a lazy copy](#cohost-a-lazy-copy)
44 |   - [Cohost a full copy](#cohost-a-full-copy)
45 |
46 | ## Purpose
47 |
48 | “We believe that information—knowledge—makes the world better. That when we ask questions, get the facts, and are able to understand all perspectives on an issue, it allows us to build the foundation for a more just and tolerant society”
49 | -- Katherine Maher, Executive Director of the Wikimedia Foundation
50 |
51 | ## Wikipedia on IPFS -- Background
52 |
53 | ### What does it mean to put Wikipedia on IPFS?
54 |
55 | The idea of putting Wikipedia on IPFS has been around for a while. Every few months or so someone revives the threads. You can find such discussions in [this github issue about archiving wikipedia](https://github.com/ipfs/archives/issues/20), [this issue about possible integrations with Wikipedia](https://github.com/ipfs/notes/issues/46), and [this proposal for a new project](https://github.com/ipfs/notes/issues/47#issuecomment-140587530).
56 |
57 | We have two consecutive goals regarding Wikipedia on IPFS: Our first goal is to create periodic read-only snapshots of Wikipedia. A second goal will be to create a full-fledged read-write version of Wikipedia. This second goal would connect with the Wikimedia Foundation’s bigger, longer-running conversation about decentralizing Wikipedia, which you can read about at https://strategy.wikimedia.org/wiki/Proposal:Distributed_Wikipedia
58 |
59 | ### (Goal 1) Read-Only Wikipedia on IPFS
60 |
61 | The easy way to get Wikipedia content on IPFS is to periodically -- say every week -- take snapshots of all the content and add it to IPFS. That way the majority of Wikipedia users -- who only read wikipedia and don’t edit -- could use all the information on wikipedia with all the benefits of IPFS. Users couldn't edit it, but users could download and archive swaths of articles, or even the whole thing. People could serve it to each other peer-to-peer, reducing the bandwidth load on Wikipedia servers. People could even distribute it to each other in closed, censored, or resource-constrained networks -- with IPFS, peers do not need to be connected to the original source of the content, being connected to anyone who has the content is enough. Effectively, the content can jump from computer to computer in a peer-to-peer way, and avoid having to connect to the content source or even the internet backbone. We've been in discussions with many groups about the potential of this kind of thing, and how it could help billions of people around the world to access information better -- either free of censorship, or circumventing serious bandwidth or latency constraints.
62 |
63 | So far, we have achieved part of this goal: we have static snapshots of all of Wikipedia on IPFS. This is already a huge result that will help people access, keep, archive, cite, and distribute lots of content. In particular, we hope that this distribution helps people in Turkey, who find themselves in a tough situation. We are still working out a process to continue updating these snapshots, we hope to have someone at Wikimedia in the loop as they are the authoritative source of the content. **If you could help with this, please get in touch with us at `wikipedia-project ipfs.io`**
64 |
65 | ### (Goal 2) Fully Read-Write Wikipedia on IPFS
66 |
67 | The long term goal is to get the full-fledged read-write Wikipedia to work on top of IPFS. This is much more difficult because for a read-write application like Wikipedia to leverage the distributed nature of IPFS, we need to change how the applications write data. A read-write wikipedia on IPFS would allow it to be completely decentralized, and create an extremely difficult to censor operation. In addition to all the benefits of the static version above, the users of a read-write Wikipedia on IPFS could write content from anywhere and publish it, even without being directly connected to any wikipedia.org servers. There would be automatic version control and version history archiving. We could allow people to view, edit, and publish in completely encrypted contexts, which is important to people in highly repressive regions of the world.
68 |
69 | A full read-write version (2) would require a strong collaboration with Wikipedia.org itself, and finishing work on important dynamic content challenges -- we are working on all the technology (2) needs, but it's not ready for prime-time yet. We will update when it is.
70 |
71 | # How to add new Wikipedia snapshots to IPFS
72 |
73 | The process can be nearly fully automated; however, it consists of many stages,
74 | and understanding what happens during each stage is paramount if the ZIM format
75 | changes and our build toolchain requires debugging and updating.
76 | 
77 | - A [manual build](#manual-build) is useful for debugging, when a specific stage needs to be executed multiple times to fix a bug.
78 | - [mirrorzim.sh](#mirrorzimsh) automates some steps for QA purposes and ad-hoc experimentation.
79 |
82 |
83 | **Note: This is a work in progress.** We intend to make it easy for anyone to
84 | create their own wikipedia snapshots and add them to IPFS, making sure those
85 | builds are deterministic and auditable, but our first emphasis has been to get
86 | the initial snapshots onto the network. This means some of the steps aren't as
87 | easy as we want them to be. If you run into trouble, seek help through a github
88 | issue, by commenting in [chat](https://docs.ipfs.tech/community/#chat), or by posting a thread on
89 | [https://discuss.ipfs.tech](https://discuss.ipfs.tech/c/help/13).
90 |
91 | ## Manual build
92 |
93 | If you would like to create an updated Wikipedia snapshot on IPFS, you can follow these steps.
94 |
95 |
96 | ### Step 0: Clone this repository
97 |
98 | All commands are assumed to be run from inside a cloned copy of this repository.
99 | 
100 | Clone the distributed-wikipedia-mirror git repository:
101 |
102 | ```sh
103 | $ git clone https://github.com/ipfs/distributed-wikipedia-mirror.git
104 | ```
105 |
106 | then `cd` into that directory
107 |
108 | ```sh
109 | $ cd distributed-wikipedia-mirror
110 | ```
111 |
112 | ### Step 1: Install dependencies
113 |
114 | `Node` and `yarn` are required. On macOS you will also need `sha256sum`, available in `coreutils`.
115 |
116 | Install the node dependencies:
117 |
118 | ```sh
119 | $ yarn
120 | ```
121 |
122 | Then, download the latest [zim-tools](https://download.openzim.org/release/zim-tools/) and add `zimdump` to your `PATH`.
123 | This tool is necessary for unpacking ZIM archives.
124 |
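For example, on Linux x86_64 the install could look like the sketch below; the version number and archive name are illustrative, so check the release page for the current ones:

```sh
# illustrative version; pick the latest from https://download.openzim.org/release/zim-tools/
wget https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.1.0.tar.gz
tar xzf zim-tools_linux-x86_64-3.1.0.tar.gz
sudo install zim-tools_linux-x86_64-3.1.0/zimdump /usr/local/bin/
zimdump --help   # sanity check: zimdump should now be on your PATH
```
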
125 | ### Step 2: Configure your IPFS Node
126 |
127 | It is advised to use a separate IPFS node for this:
128 |
129 | ```console
130 | $ export IPFS_PATH=/path/to/IPFS_PATH_WIKIPEDIA_MIRROR
131 | $ ipfs init -p server,local-discovery,flatfs,randomports --empty-repo
132 | ```
133 |
134 | #### Tune DHT for speed
135 |
136 | Wikipedia snapshots contain a lot of blocks; to publish them as fast as possible,
137 | enable the [Accelerated DHT Client](https://github.com/ipfs/go-ipfs/blob/master/docs/experimental-features.md#accelerated-dht-client):
138 |
139 | ```console
140 | $ ipfs config --json Experimental.AcceleratedDHTClient true
141 | ```
142 |
143 | #### Tune datastore for speed
144 |
145 | Make sure the repo uses `flatfs` with `sync` set to `false`:
146 |
147 | ```console
148 | $ ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')"
149 | ```
150 |
151 | **NOTE:** While the badgerv1 datastore is faster in some configurations, we choose to avoid using it with bigger builds like English because of [memory issues due to the number of files](https://github.com/ipfs/distributed-wikipedia-mirror/issues/85). A potential workaround is to use [`filestore`](https://github.com/ipfs/go-ipfs/blob/master/docs/experimental-features.md#ipfs-filestore), which avoids duplicating data and reuses unpacked files as-is.
152 |
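To confirm the tweaks from this step took effect, you can read the values back (a quick sanity check; the output shown is what you should expect):

```console
$ ipfs config Experimental.AcceleratedDHTClient
true
$ ipfs config 'Datastore.Spec.mounts' | jq '.[0].child.sync'
false
```
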
153 | #### HAMT sharding
154 |
155 | Make sure you use go-ipfs 0.12 or later, which automatically shards big directories.
156 |
157 | ### Step 3: Download the latest snapshot from kiwix.org
158 |
159 | The source of ZIM files is https://download.kiwix.org/zim/wikipedia/.
160 | Make sure you download `_all_maxi_` snapshots, as those include images.
161 |
162 | To automate this, you can also use the `getzim.sh` script:
163 |
164 | First, download the latest wiki lists using `bash ./tools/getzim.sh cache_update`
165 |
166 | After that, create a download command using `bash ./tools/getzim.sh choose`; it should print an executable command, e.g.:
167 |
168 | ```sh
169 | Download command:
170 | $ ./tools/getzim.sh download wikipedia wikipedia tr all maxi latest
171 | ```
172 |
173 | Running the command will download the chosen ZIM file to the `./snapshots` directory.
174 |
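Before spending hours unpacking, it is worth confirming the download is intact. A minimal sketch, assuming Kiwix publishes a `.sha256` file next to the ZIM for your snapshot (check the directory listing first):

```sh
cd ./snapshots
# assumes a .sha256 checksum file exists alongside the ZIM on download.kiwix.org
wget "https://download.kiwix.org/zim/wikipedia/wikipedia_tr_all_maxi_2021-01.zim.sha256"
sha256sum -c wikipedia_tr_all_maxi_2021-01.zim.sha256
cd ..
```
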
175 |
176 |
177 | ### Step 4: Unpack the ZIM snapshot
178 |
179 | Unpack the ZIM snapshot using `zimdump`:
180 |
181 | ```sh
182 | $ zimdump dump ./snapshots/wikipedia_tr_all_maxi_2021-01.zim --dir ./tmp/wikipedia_tr_all_maxi_2021-01
183 | ```
184 |
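Unpacking a big wiki takes a while. A rough sanity check afterwards is to compare the size of the unpacked tree against the ZIM itself; the tree should be at least as large:

```sh
# compare the ZIM size with the unpacked directory size
du -sh ./snapshots/wikipedia_tr_all_maxi_2021-01.zim ./tmp/wikipedia_tr_all_maxi_2021-01
```
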
185 | > ### ℹ️ ZIM's main page
186 | >
187 | > Each ZIM file has a "main page" attribute which defines the landing page set for the ZIM archive.
188 | > It is often different from the "main page" of the upstream Wikipedia.
189 | > The ZIM's main page needs to be passed in the next step, so until there is an automated way to determine it, you need to open the ZIM in a Kiwix reader (or use `zimdump info`, see Step 5) and note the name of the landing page.
190 |
191 | ### Step 5: Convert the unpacked zim directory to a website with mirror info
192 |
193 | IMPORTANT: The snapshots must say who disseminated them. This effort to mirror Wikipedia snapshots is not affiliated with the Wikimedia foundation and is not connected to the volunteers whose contributions are contained in the snapshots. The snapshots must include information explaining that they were created and disseminated by independent parties, not by Wikipedia.
194 |
195 | The conversion to a working website and the appending of necessary information is done by the Node program under `./bin/run`.
196 |
197 | ```sh
198 | $ node ./bin/run --help
199 | ```
200 |
201 | The program requires the main page of both the ZIM file and the online version as inputs. For instance, the ZIM file for Turkish Wikipedia has a main page of `Kullanıcı:The_other_Kiwix_guy/Landing`, but `https://tr.wikipedia.org` uses `Anasayfa` as the main page. Both must be passed to the node script.
202 |
203 | To determine the original main page use `./tools/find_main_page_name.sh`:
204 |
205 | ```console
206 | $ ./tools/find_main_page_name.sh tr.wikiquote.org
207 | Anasayfa
208 | ```
209 |
210 | To determine the main page of the ZIM file, open it in a [Kiwix reader](https://www.kiwix.org/en/kiwix-reader) or use `zimdump info` (version 3.0.0 or later) and ignore the `A/` prefix:
211 |
212 | ```console
213 | $ zimdump info wikipedia_tr_all_maxi_2021-01.zim
214 | count-entries: 1088190
215 | uuid: 840fc82f-8f14-e11e-c185-6112dba6782e
216 | cluster count: 5288
217 | checksum: 50113b4f4ef5ddb62596d361e0707f79
218 | main page: A/Kullanıcı:The_other_Kiwix_guy/Landing
219 | favicon: -/favicon
220 |
221 | $ zimdump info wikipedia_tr_all_maxi_2021-01.zim | grep -oP 'main page: A/\K\S+'
222 | Kullanıcı:The_other_Kiwix_guy/Landing
223 | ```
224 |
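As a convenience, both values can be captured in shell variables for the conversion command below (a sketch assuming `zimdump` ≥ 3.0.0 and GNU grep):

```sh
# online main page, e.g. "Anasayfa"
MAINPAGE="$(./tools/find_main_page_name.sh tr.wikipedia.org)"
# ZIM main page, with the A/ prefix stripped
KIWIXMAINPAGE="$(zimdump info ./snapshots/wikipedia_tr_all_maxi_2021-01.zim \
  | grep -oP 'main page: A/\K\S+')"
```

These can then be passed as `--mainpage="$MAINPAGE"` and `--kiwixmainpage="$KIWIXMAINPAGE"`.
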
225 | The conversion is done on the unpacked zim directory:
226 |
227 | ```sh
228 | node ./bin/run ./tmp/wikipedia_tr_all_maxi_2021-02 \
229 | --hostingdnsdomain=tr.wikipedia-on-ipfs.org \
230 | --zimfile=./snapshots/wikipedia_tr_all_maxi_2021-02.zim \
231 | --kiwixmainpage=Kullanıcı:The_other_Kiwix_guy/Landing \
232 | --mainpage=Anasayfa
233 | ```
234 |
235 | ### Step 6: Import website directory to IPFS
236 |
237 | #### Increase the open file limit
238 | 
239 | In some cases you will hit an error like `could not create socket: Too many open files` when you add files to the IPFS store. It happens when IPFS needs to open more files than the operating system allows. You can temporarily raise the limit to avoid this error:
240 |
241 | ```sh
242 | ulimit -n 65536
243 | ```
244 |
245 | #### Add immutable copy
246 |
247 | Add all the data to your node using `ipfs add`. Use the following command, replacing `$unpacked_wiki` with the path to the website directory you prepared in Steps 4 and 5 (e.g. `./tmp/wikipedia_en_all_maxi_2018-10`).
248 |
249 | ```sh
250 | $ ipfs add -r --cid-version 1 --offline $unpacked_wiki
251 | ```
252 |
253 | Save the last hash of the output from the above process. It is the CID of the website.
254 |
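If you prefer to capture the root CID in a variable instead of scrolling back through the output, `ipfs add` has a `-Q`/`--quieter` flag that prints only the final hash:

```sh
# -Q prints only the final (root) hash
CID="$(ipfs add -r --cid-version 1 --offline -Q "$unpacked_wiki")"
echo "$CID"
```
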
255 | ### Step 7: Share the root CID
256 |
257 | Share the CID of your new snapshot so people can access it and replicate it onto their machines.
258 |
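One way to sanity-check that the snapshot is retrievable from the outside is to request it through a public gateway (a sketch; the first request can be slow, since the gateway has to fetch the blocks from your node):

```sh
# HEAD request via a public gateway; expects the $CID captured in Step 6
curl -I "https://dweb.link/ipfs/$CID/"
```
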
259 | ### Step 8: Update *.wikipedia-on-ipfs.org
260 |
261 | Make sure at least two full reliable copies exist before updating DNSLink.
262 |
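The DNSLink itself is just a TXT record on the `_dnslink` subdomain, so you can inspect what a mirror currently points at with `dig` (output illustrative):

```console
$ dig +short TXT _dnslink.tr.wikipedia-on-ipfs.org
"dnslink=/ipfs/bafy..."
```
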
263 | ## mirrorzim.sh
264 |
265 | It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`.
266 | It will download the latest snapshot of the specified language (if needed), unpack it, and add it to IPFS.
267 |
268 | To see how the script behaves try running it on one of the smallest wikis, such as `cu`:
269 |
270 | ```console
271 | $ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wikipedia-on-ipfs.org
272 | ```
273 |
274 | ## Docker build
275 |
276 | A `Dockerfile` with all the software requirements is provided.
277 | For now it is only a handy container for running the process on non-Linux
278 | systems or if you don't want to pollute your system with all the dependencies.
279 | In the future it will be an end-to-end black box that takes a ZIM file and
280 | produces a CID and repo.
281 |
282 | To build the docker image:
283 |
284 | ```sh
285 | docker build . -t distributed-wikipedia-mirror-build
286 | ```
287 |
288 | To use it as a development environment:
289 |
290 | ```sh
291 | docker run -it -v $(pwd):/root/distributed-wikipedia-mirror --net=host --entrypoint bash distributed-wikipedia-mirror-build
292 | ```
293 |
294 | # How to Help
295 |
296 | If you don't mind the command line and have a lot of disk space,
297 | bandwidth, or coding skills, continue reading.
298 |
299 | ## Share mirror CID with people who can't trust DNS
300 |
301 | Sharing a CID instead of a DNS name is useful when DNS is not reliable or
302 | trustworthy. The latest CID for a specific language mirror can be found via
303 | DNSLink:
304 |
305 | ```console
306 | $ ipfs resolve -r /ipns/tr.wikipedia-on-ipfs.org
307 | /ipfs/bafy..
308 | ```
309 |
310 | The CID can then be opened as `ipfs://bafy..` in a web browser with the [IPFS Companion](https://github.com/ipfs-shipyard/ipfs-companion) extension
311 | resolving IPFS addresses via a local [IPFS Desktop](https://docs.ipfs.tech/install/ipfs-desktop/) node.
312 |
313 | You can also try [Brave browser](https://brave.com), which ships with [native support for IPFS](https://brave.com/ipfs-support/).
314 |
315 | ## Cohost a lazy copy
316 |
317 | Using MFS makes it easier to protect snapshots from being garbage collected
318 | than low-level pinning, because you can assign meaningful names and it won't
319 | prefetch any blocks unless you explicitly ask.
320 | 
321 | Every mirrored Wikipedia article you visit will be added to your lazy copy
322 | and contribute to your partial mirror, so you won't need to host
323 | the entire thing.
324 |
325 | To cohost a lazy copy, execute:
326 |
327 | ```console
328 | $ export LNG="tr"
329 | $ ipfs files mkdir -p /wikipedia-mirror/$LNG
330 | $ ipfs files cp $(ipfs resolve -r /ipns/$LNG.wikipedia-on-ipfs.org) /wikipedia-mirror/$LNG/${LNG}_$(date +%F_%T)
331 | ```
332 |
333 | Then simply start browsing the `$LNG.wikipedia-on-ipfs.org` site via your node.
334 | Every visited page will be cached, cohosted, and protected from garbage collection.
335 |
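To see how much of the snapshot has accumulated locally so far, `ipfs files stat` accepts a `--with-local` flag (a sketch; the local percentage grows as you browse):

```console
$ ipfs files stat --with-local /wikipedia-mirror/$LNG
```
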
336 | ## Cohost a full copy
337 |
338 | Steps are the same as for a lazy copy, but you execute an additional preload
339 | after the lazy copy is in place:
340 |
341 | ```console
342 | $ # export LNG="tr"
343 | $ ipfs refs -r /ipns/$LNG.wikipedia-on-ipfs.org
344 | ```
345 |
346 | Before you execute this, check if you have enough disk space to fit `CumulativeSize`:
347 |
348 | ```console
349 | $ # export LNG="tr"
350 | $ ipfs object stat --human /ipns/$LNG.wikipedia-on-ipfs.org
351 | NumLinks: 5
352 | BlockSize: 281
353 | LinksSize: 251
354 | DataSize: 30
355 | CumulativeSize: 15 GB
356 | ```
357 |
358 | We are working on improving deduplication between snapshots, but for now YMMV.
359 |
360 | ## Code
361 |
362 | If you would like to contribute more to this effort, look at the [issues](https://github.com/ipfs/distributed-wikipedia-mirror/issues) in this github repo. Especially check for [issues marked with the "wishlist" label](https://github.com/ipfs/distributed-wikipedia-mirror/labels/wishlist) and issues marked ["help wanted"](https://github.com/ipfs/distributed-wikipedia-mirror/labels/help%20wanted).
363 |
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs-100px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs-100px.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.psd
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.pxm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.pxm
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.pxm.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.pxm.zip
--------------------------------------------------------------------------------
/bin/run:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | const fs = require('fs')
4 | const path = require('path')
5 | const project = path.join(__dirname, '../tsconfig.json')
6 | const dev = fs.existsSync(project)
7 |
8 | if (dev) {
9 | require('ts-node').register({project})
10 | }
11 |
12 | require(`../${dev ? 'src' : 'lib'}`).run()
13 | .catch(require('@oclif/errors/handle'))
14 |
--------------------------------------------------------------------------------
/bin/run.cmd:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | node "%~dp0\run" %*
4 |
--------------------------------------------------------------------------------
/legacy-pipeline/demo/wikipedia-on-ipfs-offset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/legacy-pipeline/demo/wikipedia-on-ipfs-offset.png
--------------------------------------------------------------------------------
/legacy-pipeline/demo/wikipedia-on-ipfs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/legacy-pipeline/demo/wikipedia-on-ipfs.png
--------------------------------------------------------------------------------
/legacy-pipeline/execute-changes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # vim: set ts=2 sw=2:
3 |
4 | set -euo pipefail
5 | IFS=$'\n\t'
6 |
7 | error() {
8 | echo "$@"
9 | exit 1
10 | }
11 |
12 | usage() {
13 | echo "USAGE:"
14 | echo " $0 [-h|--help] [--ipns=] [--date=]";
15 | echo " [--search=] [--main=] "
16 | echo ""
17 | echo " -h|--help - displays help"
18 | echo " --ipns - ipns hash of the archive"
19 | echo " --date - date of snapshot (defaults to this month)"
20 | echo " --search - hash of search IPLD structure"
21 | echo " --main - full name of article containing intro page (e.g. Main_Page.html)"
22 | exit 2
23 | }
24 |
25 | if [ "$(getopt --test >/dev/null 2>&1; echo $?)" -ne "4" ]; then
26 |   error "enhanced getopt required, 'getopt --test' should have exit code 4"
27 | fi
28 |
29 |
30 | LONG_OPT="help,search:,ipns:,date:,main:"
31 | SHORT_OPT="h"
32 | PARSED_OPTS=$(getopt -n "$0" -o "$SHORT_OPT" -l "$LONG_OPT" -- "$@") || usage
33 |
34 | eval set -- "$PARSED_OPTS"
35 |
36 | # defaults
37 | SNAP_DATE=$(date +"%Y-%m-%d")
38 | IPNS_HASH=""
39 | SEARCH=""
40 | MAIN=index.htm
41 |
42 | while true; do
43 | case "$1" in
44 | -h|--help)
45 | usage;;
46 | --date)
47 | SNAP_DATE="$2"
48 | shift 2;;
49 | --ipns)
50 | IPNS_HASH="$2"
51 | shift 2;;
52 | --search)
53 | SEARCH="$2"
54 | shift 2;;
55 | --main)
56 | MAIN="$2"
57 | shift 2;;
58 | --)
59 | shift;
60 | break;;
61 | esac
62 | done
63 |
64 | if [ -z "${1-}" ]; then
65 | echo "Missing ipfs files root"
66 | usage
67 | fi
68 | ROOT="$1"
69 |
70 | ipfs-replace() {
71 | ipfs files rm "$ROOT/$1" >/dev/null 2>&1 || true
72 | ipfs files --flush=false cp "$2" "$ROOT/$1"
73 | }
74 |
75 | if ipfs files stat "$ROOT/A" >/dev/null 2>&1; then
76 | ipfs files mv "$ROOT/A" "$ROOT/wiki"
77 | fi
78 |
79 | NEW_BODYJS=$(
80 | sed -e 's/{{SNAPSHOT_DATE}}/'"$SNAP_DATE"'/g' \
81 | -e 's/{{IPNS_HASH}}/'"$IPNS_HASH"'/g' scripts/body.js |\
82 | if [ -n "$SEARCH" ]; then
83 | cat - <(sed -e 's/{{SEARCH_CID}}/'"$SEARCH"'/' scripts/search-shim.js)
84 | else
85 | cat -
86 | fi | ipfs add --cid-version 1 -Q
87 | )
88 |
89 | ipfs-replace "-/j/body.js" "/ipfs/$NEW_BODYJS"
90 | ipfs-replace "I/s/Wikipedia-logo-v2-200px-transparent.png" \
91 | "/ipfs/$(ipfs add --cid-version 1 -q assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png)"
92 | ipfs-replace "I/s/wikipedia-on-ipfs.png" \
93 | "/ipfs/$(ipfs add --cid-version 1 -Q assets/wikipedia-on-ipfs-100px.png)"
94 |
95 | if [ -n "$SEARCH" ]; then
96 | ipfs-replace "-/j/search.js" "/ipfs/$(ipfs add --cid-version 1 -Q scripts/search.js)"
97 | fi
98 |
99 | # comment out some debug stuff in head.js
100 | HEAD_JS_LOCATION="$(ipfs files stat --hash "$ROOT")/-/j/head.js"
101 | HEAD_JS_HASH="$(ipfs cat "$HEAD_JS_LOCATION" | sed -e "s|^\tdocument.getElementsByTagName( 'head' )|//\0|" | ipfs add --cid-version 1 -Q)"
102 |
103 | ipfs-replace "-/j/head.js" "/ipfs/$HEAD_JS_HASH"
104 |
105 | ipfs-replace "/wiki/index.html" "$ROOT/wiki/$MAIN"
106 | ipfs-replace "/index.html" "/ipfs/$(ipfs add --cid-version 1 -Q redirect-page/index_root.html)"
107 |
108 | ipfs files flush "$ROOT"
109 | echo "We are done !!!"
110 | ipfs files stat "$ROOT"
111 |
112 |
113 |
--------------------------------------------------------------------------------
/legacy-pipeline/redirect-page/index.html:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
9 |
10 |
--------------------------------------------------------------------------------
/legacy-pipeline/redirect-page/index_root.html:
--------------------------------------------------------------------------------
1 |
2 |
5 |
6 |
9 |
10 |
--------------------------------------------------------------------------------
/legacy-pipeline/scripts/body.js:
--------------------------------------------------------------------------------
1 | // This script adds a footer on all pages
2 | const documentToPrint = `
3 |
55 |
75 |
106 |
108 | `
109 |
110 | document.write(documentToPrint);
111 | document.querySelectorAll('.footer-sharing-icon').forEach((link) => {
112 | link.href = link.href.replace('{ARTICLE_URL}', window.location.href)
113 | link.href = link.href.replace('{ARTICLE_TITLE}', document.title)
114 | });
115 |
116 | document.querySelectorAll('a.ipfs').forEach((link) => {
117 | var p = window.location.pathname
118 | link.textContent = decodeURIComponent(p);
119 | link.href = p;
120 | });
121 |
122 | document.querySelectorAll('div.ipns-hide').forEach((link) => {
123 | var p = window.location.pathname
124 | if (!p.startsWith('/ipfs/')) {
125 | link.style.display = 'none';
126 | }
127 | });
128 | // fix Kurdish wiki style
129 | document.querySelectorAll('th.wîkîlogo').forEach((a) => {
130 | a.style['background-image'] = 'none'
131 | })
132 |
133 | document.querySelectorAll('a.ipns').forEach((link) => {
134 | if (link.href.indexOf('{{IPNS' + '_HASH}}') != -1) {
135 | link.parentNode.style.display = 'none'
136 | return
137 | }
138 | var loc = '/' + window.location.pathname.split("/").slice(3).join("/");
139 | link.href = link.href + loc;
140 | link.textContent = link.textContent + decodeURIComponent(loc)
141 | });
142 |
143 | document.querySelectorAll('a.http-ipfs').forEach((link) => {
144 | link.textContent = decodeURIComponent(link.href);
145 | });
146 |
147 | // (window.RLQ=window.RLQ||[]).push(function(){mw.log.warn("Gadget \"ReferenceTooltips\" styles loaded twice. Migrate to type=general. See \u003Chttps://phabricator.wikimedia.org/T42284\u003E.");});
148 |
149 | // (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":62,"wgHostname":"mw1218"});});
150 |
--------------------------------------------------------------------------------
/legacy-pipeline/scripts/search-shim.js:
--------------------------------------------------------------------------------
1 | const searchInject = `
2 |