├── .devcontainer └── devcontainer.json ├── .dockerignore ├── .editorconfig ├── .eslintignore ├── .eslintrc ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── open_an_issue.md ├── config.yml └── workflows │ ├── generated-pr.yml │ └── stale.yml ├── .gitignore ├── .prettierrc.js ├── BENCHMARKS.md ├── Dockerfile ├── README.md ├── assets ├── wikipedia-on-ipfs-100px.png ├── wikipedia-on-ipfs-small-flat-cropped-offset-min.png ├── wikipedia-on-ipfs.png ├── wikipedia-on-ipfs.psd ├── wikipedia-on-ipfs.pxm └── wikipedia-on-ipfs.pxm.zip ├── bin ├── run └── run.cmd ├── legacy-pipeline ├── demo │ ├── wikipedia-on-ipfs-offset.png │ └── wikipedia-on-ipfs.png ├── execute-changes.sh ├── redirect-page │ ├── index.html │ └── index_root.html └── scripts │ ├── body.js │ ├── search-shim.js │ └── search.js ├── mirrorzim.sh ├── package.json ├── snapshot-hashes.yml ├── snapshots └── .ignore ├── src ├── article-transforms.ts ├── domain.ts ├── index.ts ├── site-transforms.ts ├── templates │ ├── footer_fragment.handlebars │ ├── index_redirect_fragment.handlebars │ └── site.js.handlebars ├── utils │ ├── assert-never.ts │ ├── check-unpacked-zim-dir.ts │ ├── download-file.ts │ └── walk-files.ts └── zim-to-website.ts ├── test ├── index.test.ts ├── mocha.opts └── tsconfig.json ├── tools ├── find_main_page_name.sh ├── find_original_main_page_url.sh └── getzim.sh ├── tsconfig.json └── yarn.lock /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/vscode-remote/devcontainer.json or this file's README at: 2 | // https://github.com/microsoft/vscode-dev-containers/tree/v0.101.1/containers/docker-existing-dockerfile 3 | { 4 | "name": "Distributed-Docker-Mirror-Dev", 5 | 6 | // Sets the run context to one level up instead of the .devcontainer folder. 7 | "context": "..", 8 | 9 | // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. 10 | "dockerFile": "../Dockerfile", 11 | 12 | // Set *default* container specific settings.json values on container create. 13 | "settings": { 14 | "terminal.integrated.shell.linux": null 15 | }, 16 | 17 | // Add the IDs of extensions you want installed when the container is created. 18 | "extensions": [], 19 | 20 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 21 | "forwardPorts": [8080], 22 | 23 | // Uncomment the next line to run commands after the container is created - for example installing git. 24 | "postCreateCommand": "yarn" 25 | 26 | // Uncomment when using a ptrace-based debugger like C++, Go, and Rust 27 | // "runArgs": [ "--cap-add=SYS_PTRACE", "--security-opt", "seccomp=unconfined" ], 28 | 29 | // Uncomment to use the Docker CLI from inside the container. See https://aka.ms/vscode-remote/samples/docker-in-docker. 30 | // "mounts": [ "source=/var/run/docker.sock,target=/var/run/docker.sock,type=bind" ], 31 | 32 | // Uncomment to connect as a non-root user. See https://aka.ms/vscode-remote/containers/non-root. 
33 | // "remoteUser": "vscode" 34 | } 35 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | out/ 2 | tmp/ 3 | snapshots/ 4 | zim-to-website/ 5 | extract_zim/ -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | 10 | [*.md] 11 | trim_trailing_whitespace = false 12 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | /lib 2 | /tmp 3 | /snapshots -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["oclif", "oclif-typescript", "plugin:prettier/recommended"], 3 | "plugins": ["simple-import-sort"], 4 | "rules": { 5 | "unicorn/no-abusive-eslint-disable": ["off"], 6 | "simple-import-sort/sort": "error", 7 | "@typescript-eslint/member-delimiter-style": [ 8 | "error", 9 | { 10 | "multiline": { 11 | "delimiter": "none" 12 | }, 13 | "singleline": { 14 | "delimiter": "semi" 15 | } 16 | } 17 | ] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Getting Help on IPFS 4 | url: https://ipfs.io/help 5 | about: All information about how and where to get help on IPFS. 6 | - name: IPFS Official Forum 7 | url: https://discuss.ipfs.io 8 | about: Please post general questions, support requests, and discussions here. 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/open_an_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Open an issue 3 | about: Only for actionable issues relevant to this repository. 4 | title: '' 5 | labels: need/triage 6 | assignees: '' 7 | 8 | --- 9 | 20 | -------------------------------------------------------------------------------- /.github/config.yml: -------------------------------------------------------------------------------- 1 | # Configuration for welcome - https://github.com/behaviorbot/welcome 2 | 3 | # Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome 4 | # Comment to be posted to on first time issues 5 | newIssueWelcomeComment: > 6 | Thank you for submitting your first issue to this repository! A maintainer 7 | will be here shortly to triage and review. 8 | 9 | In the meantime, please double-check that you have provided all the 10 | necessary information to make this process easy! Any information that can 11 | help save additional round trips is useful! We currently aim to give 12 | initial feedback within **two business days**. If this does not happen, feel 13 | free to leave a comment. 
14 | 15 | Please keep an eye on how this issue will be labeled, as labels give an 16 | overview of priorities, assignments and additional actions requested by the 17 | maintainers: 18 | 19 | - "Priority" labels will show how urgent this is for the team. 20 | - "Status" labels will show if this is ready to be worked on, blocked, or in progress. 21 | - "Need" labels will indicate if additional input or analysis is required. 22 | 23 | Finally, remember to use https://discuss.ipfs.io if you just need general 24 | support. 25 | 26 | # Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome 27 | # Comment to be posted to on PRs from first time contributors in your repository 28 | newPRWelcomeComment: > 29 | Thank you for submitting this PR! 30 | 31 | A maintainer will be here shortly to review it. 32 | 33 | We are super grateful, but we are also overloaded! Help us by making sure 34 | that: 35 | 36 | * The context for this PR is clear, with relevant discussion, decisions 37 | and stakeholders linked/mentioned. 38 | 39 | * Your contribution itself is clear (code comments, self-review for the 40 | rest) and in its best form. Follow the [code contribution 41 | guidelines](https://github.com/ipfs/community/blob/master/CONTRIBUTING.md#code-contribution-guidelines) 42 | if they apply. 43 | 44 | Getting other community members to do a review would be great help too on 45 | complex PRs (you can ask in the chats/forums). If you are unsure about 46 | something, just leave us a comment. 47 | 48 | Next steps: 49 | 50 | * A maintainer will triage and assign priority to this PR, commenting on 51 | any missing things and potentially assigning a reviewer for high 52 | priority items. 53 | 54 | * The PR gets reviews, discussed and approvals as needed. 55 | 56 | * The PR is merged by maintainers when it has been approved and comments addressed. 57 | 58 | We currently aim to provide initial feedback/triaging within **two business 59 | days**. Please keep an eye on any labelling actions, as these will indicate 60 | priorities and status of your contribution. 61 | 62 | We are very grateful for your contribution! 
63 | 64 | 65 | # Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge 66 | # Comment to be posted to on pull requests merged by a first time user 67 | # Currently disabled 68 | #firstPRMergeComment: "" 69 | -------------------------------------------------------------------------------- /.github/workflows/generated-pr.yml: -------------------------------------------------------------------------------- 1 | name: Close Generated PRs 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | pull-requests: write 11 | 12 | jobs: 13 | stale: 14 | uses: ipdxco/unified-github-workflows/.github/workflows/reusable-generated-pr.yml@v1 15 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: Close Stale Issues 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: 7 | 8 | permissions: 9 | issues: write 10 | pull-requests: write 11 | 12 | jobs: 13 | stale: 14 | uses: ipdxco/unified-github-workflows/.github/workflows/reusable-stale-issue.yml@v1 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .cache 3 | *.zim 4 | out 5 | IPFS_PATH 6 | 7 | notes.md 8 | 9 | *-debug.log 10 | *-error.log 11 | /.nyc_output 12 | /dist 13 | /lib 14 | /package-lock.json 15 | /tmp 16 | /snapshots 17 | node_modules 18 | /zim-tools 19 | /kiwix-tools 20 | 21 | bin/zimdump 22 | -------------------------------------------------------------------------------- /.prettierrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | semi: false, 3 | trailingComma: "none", 4 | singleQuote: true, 5 | printWidth: 80, 6 | tabWidth: 2 7 | }; -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | Below are some quick notes on how long things take. 
2 | 3 | - 2021-Q1 with 1TB NVMe SSD, i7-4770S 4 | - unpacking wikipedia_en_all_maxi_2021-02 + fixing exceptions on ssd ~3h50m 5 | (not a full build, this is without IPFS import) 6 | - full build from wikipedia_ru_all_maxi_2021-03.zim with badger ~4h11m 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stable 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | RUN apt update 6 | RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils 7 | 8 | # install: 9 | # - node and yarn 10 | # - go-ipfs 11 | RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \ 12 | && bash nodesource_setup.sh \ 13 | && apt -y install --no-install-recommends nodejs \ 14 | && npm install -g yarn \ 15 | && wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \ 16 | && tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \ 17 | && mv go-ipfs/ipfs /usr/local/bin/ipfs \ 18 | && rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \ 19 | && ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \ 20 | && ipfs config --json 'Experimental.ShardingEnabled' true 21 | 22 | # TODO: move repo init after external volume is mounted 23 | 24 | ENV DEBIAN_FRONTEND=dialog 25 | 26 | RUN mkdir /root/distributed-wikipedia-mirror 27 | VOLUME ["/root/distributed-wikipedia-mirror"] 28 | WORKDIR /root/distributed-wikipedia-mirror 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Distributed Wikipedia Mirror Project

Putting Wikipedia Snapshots on IPFS and working towards making it fully read-write.

10 | 11 | ## Existing Mirrors 12 | 13 | There are various ways one can access the mirrors: through a [DNSLink](https://docs.ipfs.tech/concepts/glossary/#dnslink), public [gateway](https://docs.ipfs.tech/concepts/glossary/#gateway) or directly with a [CID](https://docs.ipfs.tech/concepts/glossary/#cid). 14 | 15 | You can [read all about the available methods here](https://blog.ipfs.tech/2021-05-31-distributed-wikipedia-mirror-update/#improved-access-to-wikipedia-mirrors). 16 | 17 | ### DNSLinks 18 | 19 | - https://en.wikipedia-on-ipfs.org 20 | - https://tr.wikipedia-on-ipfs.org 21 | - https://my.wikipedia-on-ipfs.org 22 | - https://ar.wikipedia-on-ipfs.org 23 | - https://zh.wikipedia-on-ipfs.org 24 | - https://uk.wikipedia-on-ipfs.org 25 | - https://ru.wikipedia-on-ipfs.org 26 | - https://fa.wikipedia-on-ipfs.org 27 | 28 | ### CIDs 29 | 30 | The latest CIDs that the DNSLinks point at can be found in [snapshot-hashes.yml](snapshot-hashes.yml). 31 | 32 | --- 33 | 34 | Each mirror has a link to the original [Kiwix](https://kiwix.org) ZIM archive in the footer. It can be dowloaded and opened offline with the [Kiwix Reader](https://www.kiwix.org/en/download/). 35 | 36 | ## Table of Contents 37 | 38 | - [Purpose](#purpose) 39 | - [How to add new Wikipedia snapshots to IPFS](#how-to-add-new-wikipedia-snapshots-to-ipfs) 40 | - [Manual build](#manual-build) 41 | - [Docker](#docker-build) 42 | - [How to help](#how-to-help) 43 | - [Cohost a lazy copy](#cohost-a-lazy-copy) 44 | - [Cohost a full copy](#cohost-a-full-copy) 45 | 46 | ## Purpose 47 | 48 | “We believe that information—knowledge—makes the world better. That when we ask questions, get the facts, and are able to understand all perspectives on an issue, it allows us to build the foundation for a more just and tolerant society” 49 | -- Katherine Maher, Executive Director of the Wikimedia Foundation 50 | 51 | ## Wikipedia on IPFS -- Background 52 | 53 | ### What does it mean to put Wikipedia on IPFS? 54 | 55 | The idea of putting Wikipedia on IPFS has been around for a while. Every few months or so someone revives the threads. You can find such discussions in [this github issue about archiving wikipedia](https://github.com/ipfs/archives/issues/20), [this issue about possible integrations with Wikipedia](https://github.com/ipfs/notes/issues/46), and [this proposal for a new project](https://github.com/ipfs/notes/issues/47#issuecomment-140587530). 56 | 57 | We have two consecutive goals regarding Wikipedia on IPFS: Our first goal is to create periodic read-only snapshots of Wikipedia. A second goal will be to create a full-fledged read-write version of Wikipedia. This second goal would connect with the Wikimedia Foundation’s bigger, longer-running conversation about decentralizing Wikipedia, which you can read about at https://strategy.wikimedia.org/wiki/Proposal:Distributed_Wikipedia 58 | 59 | ### (Goal 1) Read-Only Wikipedia on IPFS 60 | 61 | The easy way to get Wikipedia content on IPFS is to periodically -- say every week -- take snapshots of all the content and add it to IPFS. That way the majority of Wikipedia users -- who only read wikipedia and don’t edit -- could use all the information on wikipedia with all the benefits of IPFS. Users couldn't edit it, but users could download and archive swaths of articles, or even the whole thing. People could serve it to each other peer-to-peer, reducing the bandwidth load on Wikipedia servers. 
People could even distribute it to each other in closed, censored, or resource-constrained networks -- with IPFS, peers do not need to be connected to the original source of the content; being connected to anyone who has the content is enough. Effectively, the content can jump from computer to computer in a peer-to-peer way, and avoid having to connect to the content source or even the internet backbone. We've been in discussions with many groups about the potential of this kind of thing, and how it could help billions of people around the world to access information better -- either free of censorship, or circumventing serious bandwidth or latency constraints.

So far, we have achieved part of this goal: we have static snapshots of all of Wikipedia on IPFS. This is already a huge result that will help people access, keep, archive, cite, and distribute lots of content. In particular, we hope that this distribution helps people in Turkey, who find themselves in a tough situation. We are still working out a process to continue updating these snapshots; we hope to have someone at Wikimedia in the loop, as they are the authoritative source of the content. **If you could help with this, please get in touch with us at `wikipedia-project ipfs.io`**

### (Goal 2) Fully Read-Write Wikipedia on IPFS

The long-term goal is to get the full-fledged read-write Wikipedia to work on top of IPFS. This is much more difficult because, for a read-write application like Wikipedia to leverage the distributed nature of IPFS, we need to change how the application writes data. A read-write Wikipedia on IPFS would allow it to be completely decentralized and create an operation that is extremely difficult to censor. In addition to all the benefits of the static version above, the users of a read-write Wikipedia on IPFS could write content from anywhere and publish it, even without being directly connected to any wikipedia.org servers. There would be automatic version control and version history archiving. We could allow people to view, edit, and publish in completely encrypted contexts, which is important to people in highly repressive regions of the world.

A full read-write version (2) would require a strong collaboration with Wikipedia.org itself, and finishing work on important dynamic content challenges -- we are working on all the technology (2) needs, but it's not ready for prime time yet. We will update when it is.

# How to add new Wikipedia snapshots to IPFS

The process can be nearly fully automated; however, it consists of many stages, and understanding what happens during each stage is paramount if the ZIM format changes and our build toolchain needs to be debugged and updated.

- A [manual build](#manual-build) is useful when debugging, when a specific stage needs to be executed multiple times to fix a bug.
- [mirrorzim.sh](#mirrorzimsh) automates some steps for QA purposes and ad-hoc experimentation.

**Note: This is a work in progress.** We intend to make it easy for anyone to create their own Wikipedia snapshots and add them to IPFS, making sure those builds are deterministic and auditable, but our first emphasis has been to get the initial snapshots onto the network. This means some of the steps aren't as easy as we want them to be. If you run into trouble, seek help through a GitHub issue, commenting in [chat](https://docs.ipfs.tech/community/#chat), or by posting a thread on [https://discuss.ipfs.tech](https://discuss.ipfs.tech/c/help/13).

## Manual build

If you would like to create an updated Wikipedia snapshot on IPFS, you can follow these steps.

### Step 0: Clone this repository

All commands are assumed to be run inside a cloned version of this repository.

Clone the distributed-wikipedia-mirror git repository:

```sh
$ git clone https://github.com/ipfs/distributed-wikipedia-mirror.git
```

then `cd` into that directory:

```sh
$ cd distributed-wikipedia-mirror
```

### Step 1: Install dependencies

`Node` and `yarn` are required. On Mac OS X you will need `sha256sum`, available in coreutils.

Install the node dependencies:

```sh
$ yarn
```

Then, download the latest [zim-tools](https://download.openzim.org/release/zim-tools/) and add `zimdump` to your `PATH`. This tool is necessary for unpacking ZIM files.

### Step 2: Configure your IPFS Node

It is advised to use a separate IPFS node for this:

```console
$ export IPFS_PATH=/path/to/IPFS_PATH_WIKIPEDIA_MIRROR
$ ipfs init -p server,local-discovery,flatfs,randomports --empty-repo
```

#### Tune DHT for speed

Wikipedia has a lot of blocks. To publish them as fast as possible, enable the [Accelerated DHT Client](https://github.com/ipfs/go-ipfs/blob/master/docs/experimental-features.md#accelerated-dht-client):

```console
$ ipfs config --json Experimental.AcceleratedDHTClient true
```

#### Tune datastore for speed

Make sure the repo uses `flatfs` with `sync` set to `false`:

```console
$ ipfs config --json 'Datastore.Spec.mounts' "$(ipfs config 'Datastore.Spec.mounts' | jq -c '.[0].child.sync=false')"
```

**NOTE:** While the badgerv1 datastore is faster in some configurations, we chose to avoid it for bigger builds like English because of [memory issues due to the number of files](https://github.com/ipfs/distributed-wikipedia-mirror/issues/85). A potential workaround is to use the [`filestore`](https://github.com/ipfs/go-ipfs/blob/master/docs/experimental-features.md#ipfs-filestore), which avoids duplicating data and reuses the unpacked files as-is.

#### HAMT sharding

Make sure you use go-ipfs 0.12 or later; it shards big directories automatically.

### Step 3: Download the latest snapshot from kiwix.org

The source of ZIM files is https://download.kiwix.org/zim/wikipedia/.
Make sure you download `_all_maxi_` snapshots, as those include images.

To automate this, you can also use the `getzim.sh` script:

First, download the latest wiki lists using `bash ./tools/getzim.sh cache_update`.

After that, create a download command using `bash ./tools/getzim.sh choose`; it should print an executable command, e.g.

```sh
Download command:
  $ ./tools/getzim.sh download wikipedia wikipedia tr all maxi latest
```

Running the command will download the chosen ZIM file to the `./snapshots` directory.
### Step 4: Unpack the ZIM snapshot

Unpack the ZIM snapshot using `zimdump`:

```sh
$ zimdump dump ./snapshots/wikipedia_tr_all_maxi_2021-01.zim --dir ./tmp/wikipedia_tr_all_maxi_2021-01
```

> ### ℹ️ ZIM's main page
>
> Each ZIM file has a "main page" attribute which defines the landing page set for the ZIM archive.
> It is often different from the "main page" of the upstream Wikipedia.
> The Kiwix main page needs to be passed in the next step, so until there is an automated way to determine the "main page" of a ZIM file, you need to open the ZIM in a Kiwix reader and eyeball the name of the landing page.

### Step 5: Convert the unpacked zim directory to a website with mirror info

IMPORTANT: The snapshots must say who disseminated them. This effort to mirror Wikipedia snapshots is not affiliated with the Wikimedia Foundation and is not connected to the volunteers whose contributions are contained in the snapshots. The snapshots must include information explaining that they were created and disseminated by independent parties, not by Wikipedia.

The conversion to a working website and the appending of the necessary information is done by the node program under `./bin/run`.

```sh
$ node ./bin/run --help
```

The program requires the main page of both the ZIM file and the online version as inputs. For instance, the ZIM file for Turkish Wikipedia has a main page of `Kullanıcı:The_other_Kiwix_guy/Landing`, but `https://tr.wikipedia.org` uses `Anasayfa` as the main page. Both must be passed to the node script.

To determine the original main page, use `./tools/find_main_page_name.sh`:

```console
$ ./tools/find_main_page_name.sh tr.wikiquote.org
Anasayfa
```

To determine the main page in the ZIM file, open it in a [Kiwix reader](https://www.kiwix.org/en/kiwix-reader) or use `zimdump info` (version 3.0.0 or later) and ignore the `A/` prefix:

```console
$ zimdump info wikipedia_tr_all_maxi_2021-01.zim
count-entries: 1088190
uuid: 840fc82f-8f14-e11e-c185-6112dba6782e
cluster count: 5288
checksum: 50113b4f4ef5ddb62596d361e0707f79
main page: A/Kullanıcı:The_other_Kiwix_guy/Landing
favicon: -/favicon

$ zimdump info wikipedia_tr_all_maxi_2021-01.zim | grep -oP 'main page: A/\K\S+'
Kullanıcı:The_other_Kiwix_guy/Landing
```

The conversion is done on the unpacked zim directory:

```sh
node ./bin/run ./tmp/wikipedia_tr_all_maxi_2021-02 \
  --hostingdnsdomain=tr.wikipedia-on-ipfs.org \
  --zimfile=./snapshots/wikipedia_tr_all_maxi_2021-02.zim \
  --kiwixmainpage=Kullanıcı:The_other_Kiwix_guy/Landing \
  --mainpage=Anasayfa
```

### Step 6: Import website directory to IPFS

#### Increase the open file limit

In some cases you will hit an error like `could not create socket: Too many open files` when you add files to the IPFS store. It happens when IPFS needs to open more files than the operating system allows. You can temporarily raise the limit to avoid this error with the following command.

```sh
ulimit -n 65536
```

#### Add immutable copy

Add all the data to your node using `ipfs add`.
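Before adding, it can be worth a quick sanity check that Step 5 actually injected the mirror footer into the articles. A rough sketch, assuming the Turkish 2021-02 output directory from the conversion command above; the `distribution-footer` id comes from `src/article-transforms.ts`, and redirect stubs won't contain it, so try a few files if the first one does not match:

```sh
# pick one converted article and count occurrences of the injected footer marker
sample_article="$(find ./tmp/wikipedia_tr_all_maxi_2021-02/wiki -maxdepth 1 -type f | head -n 1)"
grep -c 'distribution-footer' "$sample_article"
```

If that looks right, continue with the import below.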
Use the following command, replacing `$unpacked_wiki` with the path to the website that you created in Step 5 (e.g. `./tmp/wikipedia_en_all_maxi_2018-10`).

```sh
$ ipfs add -r --cid-version 1 --offline $unpacked_wiki
```

Save the last hash of the output from the above process. It is the CID of the website.

### Step 7: Share the root CID

Share the CID of your new snapshot so people can access it and replicate it onto their machines.

### Step 8: Update *.wikipedia-on-ipfs.org

Make sure at least two full reliable copies exist before updating DNSLink.

## mirrorzim.sh

It is possible to automate steps 3-6 via a wrapper script named `mirrorzim.sh`.
It will download the latest snapshot of the specified language (if needed), unpack it, and add it to IPFS.

To see how the script behaves, try running it on one of the smallest wikis, such as `cu`:

```console
$ ./mirrorzim.sh --languagecode=cu --wikitype=wikipedia --hostingdnsdomain=cu.wikipedia-on-ipfs.org
```

## Docker build

A `Dockerfile` with all the software requirements is provided.
For now it is only a handy container for running the process on non-Linux systems or if you don't want to pollute your system with all the dependencies.
In the future it will be an end-to-end black box that takes a ZIM file and produces a CID and repo.

To build the Docker image:

```sh
docker build . -t distributed-wikipedia-mirror-build
```

To use it as a development environment:

```sh
docker run -it -v $(pwd):/root/distributed-wikipedia-mirror --net=host --entrypoint bash distributed-wikipedia-mirror-build
```

# How to Help

If you don't mind the command line interface and have a lot of disk space, bandwidth, or coding skills, continue reading.

## Share mirror CID with people who can't trust DNS

Sharing a CID instead of a DNS name is useful when DNS is not reliable or trustworthy. The latest CID for a specific language mirror can be found via DNSLink:

```console
$ ipfs resolve -r /ipns/tr.wikipedia-on-ipfs.org
/ipfs/bafy..
```

The CID can then be opened via `ipfs://bafy..` in a web browser with the [IPFS Companion](https://github.com/ipfs-shipyard/ipfs-companion) extension resolving IPFS addresses via an [IPFS Desktop](https://docs.ipfs.tech/install/ipfs-desktop/) node.

You can also try [Brave browser](https://brave.com), which ships with [native support for IPFS](https://brave.com/ipfs-support/).

## Cohost a lazy copy

Using MFS makes it easier to protect snapshots from being garbage collected than low-level pinning, because you can assign meaningful names and it won't prefetch any blocks unless you explicitly ask.

Every mirrored Wikipedia article you visit will be added to your lazy copy and will contribute to your partial mirror, so you won't need to host the entire thing.

To cohost a lazy copy, execute:

```console
$ export LNG="tr"
$ ipfs files mkdir -p /wikipedia-mirror/$LNG
$ ipfs files cp $(ipfs resolve -r /ipns/$LNG.wikipedia-on-ipfs.org) /wikipedia-mirror/$LNG/$LNG_$(date +%F_%T)
```

Then simply start browsing the `$LNG.wikipedia-on-ipfs.org` site via your node.
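As you browse, you can check how much data your lazy copy has accumulated by inspecting the MFS directory. A small sketch, assuming the directory created above and a go-ipfs version where `ipfs files stat` supports the `--with-local` flag:

```console
$ # list the snapshot copies you added under MFS
$ ipfs files ls /wikipedia-mirror/$LNG
$ # report how much of the mirror DAG is already present locally
$ ipfs files stat --with-local /wikipedia-mirror/$LNG
```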
Every visited page will be cached, cohosted, and protected from garbage collection.

## Cohost a full copy

Steps are the same as for a lazy copy, but you execute an additional preload after the lazy copy is in place:

```console
$ # export LNG="tr"
$ ipfs refs -r /ipns/$LNG.wikipedia-on-ipfs.org
```

Before you execute this, check if you have enough disk space to fit `CumulativeSize`:

```console
$ # export LNG="tr"
$ ipfs object stat --human /ipns/$LNG.wikipedia-on-ipfs.org
NumLinks: 5
BlockSize: 281
LinksSize: 251
DataSize: 30
CumulativeSize: 15 GB
```

We are working on improving deduplication between snapshots, but for now YMMV.

## Code

If you would like to contribute more to this effort, look at the [issues](https://github.com/ipfs/distributed-wikipedia-mirror/issues) in this GitHub repo. Especially check for [issues marked with the "wishlist" label](https://github.com/ipfs/distributed-wikipedia-mirror/labels/wishlist) and issues marked ["help wanted"](https://github.com/ipfs/distributed-wikipedia-mirror/labels/help%20wanted).
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs-100px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs-100px.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.png
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.psd
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.pxm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.pxm
--------------------------------------------------------------------------------
/assets/wikipedia-on-ipfs.pxm.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/assets/wikipedia-on-ipfs.pxm.zip
--------------------------------------------------------------------------------
/bin/run:
--------------------------------------------------------------------------------
#!/usr/bin/env node

const fs = require('fs')
const path
= require('path') 5 | const project = path.join(__dirname, '../tsconfig.json') 6 | const dev = fs.existsSync(project) 7 | 8 | if (dev) { 9 | require('ts-node').register({project}) 10 | } 11 | 12 | require(`../${dev ? 'src' : 'lib'}`).run() 13 | .catch(require('@oclif/errors/handle')) 14 | -------------------------------------------------------------------------------- /bin/run.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | node "%~dp0\run" %* 4 | -------------------------------------------------------------------------------- /legacy-pipeline/demo/wikipedia-on-ipfs-offset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/legacy-pipeline/demo/wikipedia-on-ipfs-offset.png -------------------------------------------------------------------------------- /legacy-pipeline/demo/wikipedia-on-ipfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/legacy-pipeline/demo/wikipedia-on-ipfs.png -------------------------------------------------------------------------------- /legacy-pipeline/execute-changes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: set ts=2 sw=2: 3 | 4 | set -euo pipefail 5 | IFS=$'\n\t' 6 | 7 | error() { 8 | echo "$@" 9 | exit 1 10 | } 11 | 12 | usage() { 13 | echo "USAGE:" 14 | echo " $0 [-h|--help] [--ipns=] [--date=]"; 15 | echo " [--search=] [--main=

] " 16 | echo "" 17 | echo " -h|--help - displays help" 18 | echo " --ipns - ipns hash of the archive" 19 | echo " --date - date of snapshot (defaults to this month)" 20 | echo " --search - hash of search IPLD structure" 21 | echo " --main - full name of article containing intro page (e.g. Main_Page.html)" 22 | exit 2 23 | } 24 | 25 | if [ "$(getopt --test >/dev/null 2>&1; echo $?)" -ne "4" ]; then 26 | error "getopt enchanced required, 'getopt --test' should have exit code 4" 27 | fi 28 | 29 | 30 | LONG_OPT="help,search:,ipns:,date:,main:" 31 | SHORT_OPT="h" 32 | PARSED_OPTS=$(getopt -n "$0" -o "$SHORT_OPT" -l "$LONG_OPT" -- "$@") || usage 33 | 34 | eval set -- "$PARSED_OPTS" 35 | 36 | # defaults 37 | SNAP_DATE=$(date +"%Y-%m-%d") 38 | IPNS_HASH="" 39 | SEARCH="" 40 | MAIN=index.htm 41 | 42 | while true; do 43 | case "$1" in 44 | -h|--help) 45 | usage;; 46 | --date) 47 | SNAP_DATE="$2" 48 | shift 2;; 49 | --ipns) 50 | IPNS_HASH="$2" 51 | shift 2;; 52 | --search) 53 | SEARCH="$2" 54 | shift 2;; 55 | --main) 56 | MAIN="$2" 57 | shift 2;; 58 | --) 59 | shift; 60 | break;; 61 | esac 62 | done 63 | 64 | if [ -z "${1-}" ]; then 65 | echo "Missing ipfs files root" 66 | usage 67 | fi 68 | ROOT="$1" 69 | 70 | ipfs-replace() { 71 | ipfs files rm "$ROOT/$1" >/dev/null 2>&1 || true 72 | ipfs files --flush=false cp "$2" "$ROOT/$1" 73 | } 74 | 75 | if ipfs files stat "$ROOT/A" >/dev/null 2>&1; then 76 | ipfs files mv "$ROOT/A" "$ROOT/wiki" 77 | fi 78 | 79 | NEW_BODYJS=$( 80 | sed -e 's/{{SNAPSHOT_DATE}}/'"$SNAP_DATE"'/g' \ 81 | -e 's/{{IPNS_HASH}}/'"$IPNS_HASH"'/g' scripts/body.js |\ 82 | if [ -n "$SEARCH" ]; then 83 | cat - <(sed -e 's/{{SEARCH_CID}}/'"$SEARCH"'/' scripts/search-shim.js) 84 | else 85 | cat - 86 | fi | ipfs add --cid-version 1 -Q 87 | ) 88 | 89 | ipfs-replace "-/j/body.js" "/ipfs/$NEW_BODYJS" 90 | ipfs-replace "I/s/Wikipedia-logo-v2-200px-transparent.png" \ 91 | "/ipfs/$(ipfs add --cid-version 1 -q assets/wikipedia-on-ipfs-small-flat-cropped-offset-min.png)" 92 | ipfs-replace "I/s/wikipedia-on-ipfs.png" \ 93 | "/ipfs/$(ipfs add --cid-version 1 -Q assets/wikipedia-on-ipfs-100px.png)" 94 | 95 | if [ -n "$SEARCH" ]; then 96 | ipfs-replace "-/j/search.js" "/ipfs/$(ipfs add --cid-version 1 -Q scripts/search.js)" 97 | fi 98 | 99 | # comment out some debug stuff in head.js 100 | HEAD_JS_LOCATION="$(ipfs files stat --hash "$ROOT")/-/j/head.js" 101 | HEAD_JS_HASH="$(ipfs cat "$HEAD_JS_LOCATION" | sed -e "s|^\tdocument.getElementsByTagName( 'head' )|//\0|" | ipfs add --cid-version 1 -Q)" 102 | 103 | ipfs-replace "-/j/head.js" "/ipfs/$HEAD_JS_HASH" 104 | 105 | ipfs-replace "/wiki/index.html" "$ROOT/wiki/$MAIN" 106 | ipfs-replace "/index.html" "/ipfs/$(ipfs add --cid-version 1 -Q redirect-page/index_root.html)" 107 | 108 | ipfs files flush "$ROOT" 109 | echo "We are done !!!" 
110 | ipfs files stat "$ROOT" 111 | 112 | 113 | -------------------------------------------------------------------------------- /legacy-pipeline/redirect-page/index.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 9 | 10 | -------------------------------------------------------------------------------- /legacy-pipeline/redirect-page/index_root.html: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 9 | 10 | -------------------------------------------------------------------------------- /legacy-pipeline/scripts/body.js: -------------------------------------------------------------------------------- 1 | // This script adds a footer on all pages 2 | const documentToPrint = ` 3 | 55 | 75 | 106 | 108 | ` 109 | 110 | document.write(documentToPrint); 111 | document.querySelectorAll('.footer-sharing-icon').forEach((link) => { 112 | link.href = link.href.replace('{ARTICLE_URL}', window.location.href) 113 | link.href = link.href.replace('{ARTICLE_TITLE}', document.title) 114 | }); 115 | 116 | document.querySelectorAll('a.ipfs').forEach((link) => { 117 | var p = window.location.pathname 118 | link.textContent = decodeURIComponent(p); 119 | link.href = p; 120 | }); 121 | 122 | document.querySelectorAll('div.ipns-hide').forEach((link) => { 123 | var p = window.location.pathname 124 | if (!p.startsWith('/ipfs/')) { 125 | link.style.display = 'none'; 126 | } 127 | }); 128 | // fix Kurdish wiki style 129 | document.querySelectorAll('th.wîkîlogo').forEach((a) => { 130 | a.style['background-image'] = 'none' 131 | }) 132 | 133 | document.querySelectorAll('a.ipns').forEach((link) => { 134 | if (link.href.indexOf('{{IPNS' + '_HASH}}') != -1) { 135 | link.parentNode.style.display = 'none' 136 | return 137 | } 138 | var loc = '/' + window.location.pathname.split("/").slice(3).join("/"); 139 | link.href = link.href + loc; 140 | link.textContent = link.textContent + decodeURIComponent(loc) 141 | }); 142 | 143 | document.querySelectorAll('a.http-ipfs').forEach((link) => { 144 | link.textContent = decodeURIComponent(link.href); 145 | }); 146 | 147 | // (window.RLQ=window.RLQ||[]).push(function(){mw.log.warn("Gadget \"ReferenceTooltips\" styles loaded twice. Migrate to type=general. See \u003Chttps://phabricator.wikimedia.org/T42284\u003E.");}); 148 | 149 | // (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgBackendResponseTime":62,"wgHostname":"mw1218"});}); 150 | -------------------------------------------------------------------------------- /legacy-pipeline/scripts/search-shim.js: -------------------------------------------------------------------------------- 1 | const searchInject = ` 2 |
3 |
4 | 5 | 6 |
7 | 8 | 9 |
10 | [X] 11 |
12 |
13 | 14 | 20 | 21 | 64 |
65 | ` 66 | var script = document.createElement('script'); 67 | var wikiSearch = null; 68 | script.src = "../-/j/search.js"; 69 | script.onload = () => { 70 | wikiSearch = new WikiSearch('{{SEARCH_CID}}'); 71 | } 72 | 73 | document.head.appendChild(script); 74 | 75 | var anch = document.querySelector('#top'); 76 | 77 | var div = document.createElement('div'); 78 | div.innerHTML = searchInject; 79 | anch.parentNode.insertBefore(div.firstElementChild, anch.nextSibling); 80 | 81 | var closeBtn = document.querySelector('.search-close') 82 | 83 | closeBtn.onclick = function() { 84 | delResults(); 85 | closeBtn.style.display = "none"; 86 | }; 87 | 88 | 89 | var delResults = function() { 90 | var resultsElem = document.querySelector('.search-results'); 91 | while(resultsElem.firstChild) resultsElem.removeChild(resultsElem.firstChild); 92 | } 93 | 94 | 95 | document.querySelector('.search-form').onsubmit = function() { 96 | if (wikiSearch == null) { 97 | return false; 98 | } 99 | var val = document.querySelector('.search-text').value; 100 | wikiSearch.search(val).then(function(results) { 101 | var proto = document.getElementById('entry-proto').children[0]; 102 | var resultsElem = document.querySelector('.search-results'); 103 | delResults() 104 | 105 | results.slice(0, 19).forEach(function(art, idx) { 106 | var entry = proto.cloneNode(true); 107 | entry.getElementsByClassName('entry-id')[0].appendChild(document.createTextNode(idx)); 108 | var link = entry.getElementsByClassName('entry-link')[0]; 109 | link.appendChild(document.createTextNode(art.replace(/_/g, " ").slice(0, -5))); 110 | link.href = art; 111 | closeBtn.style.display = "inherit"; 112 | 113 | resultsElem.appendChild(entry); 114 | }) 115 | 116 | }); 117 | return false 118 | }; 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /mirrorzim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: set ts=2 sw=2: 3 | 4 | set -euo pipefail 5 | 6 | # Download a zim file, unpack it, convert to website then push to local ipfs instance 7 | 8 | usage() { 9 | echo "USAGE:" 10 | echo " $0 - download a zim file, unpack it, convert to website then add to MFS at local IPFS instance" 11 | echo "" 12 | echo "SYNOPSIS" 13 | echo " $0 --languagecode= --wikitype=" 14 | echo " [--hostingdnsdomain=]" 15 | echo " [--hostingipnshash=]" 16 | echo " [--mainpageversion=]" 17 | echo "" 18 | echo "OPTIONS" 19 | echo "" 20 | echo " -l, --languagecode string - the language of the wikimedia property e.g. tr - turkish, en - english" 21 | echo " -w, --wikitype string - the type of the wikimedia property e.g. wikipedia, wikiquote" 22 | echo " -d, --hostingdnsdomain string - the DNS domain name the mirror will be hosted at e.g. tr.wikipedia-on-ipfs.org" 23 | echo " -i, --hostingipnshash string - the IPNS hash the mirror will be hosted at e.g. 
QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W" 24 | echo " -v, --mainpageversion string - an override hack used on Turkish Wikipedia, it sets the main page version as there are issues with the Kiwix version id" 25 | 26 | exit 2 27 | } 28 | 29 | 30 | for i in "$@" 31 | do 32 | case $i in 33 | -l=*|--languagecode=*) 34 | LANGUAGE_CODE="${i#*=}" 35 | shift 36 | ;; 37 | -w=*|--wikitype=*) 38 | WIKI_TYPE="${i#*=}" 39 | shift 40 | ;; 41 | -d=*|--hostingdnsdomain=*) 42 | HOSTING_DNS_DOMAIN="${i#*=}" 43 | shift 44 | ;; 45 | -i=*|--hostingipnshash=*) 46 | HOSTING_IPNS_HASH="${i#*=}" 47 | shift 48 | ;; 49 | -v=*|--mainpageversion=*) 50 | MAIN_PAGE_VERSION="${i#*=}" 51 | shift 52 | ;; 53 | --default) 54 | DEFAULT=YES 55 | shift 56 | ;; 57 | *) 58 | # unknown option 59 | ;; 60 | esac 61 | done 62 | 63 | if [ -z ${LANGUAGE_CODE+x} ]; then 64 | echo "Missing wiki language code e.g. tr - turkish, en - english" 65 | usage 66 | fi 67 | 68 | if [ -z ${WIKI_TYPE+x} ]; then 69 | echo "Missing wiki type e.g. wikipedia, wikiquote" 70 | usage 71 | fi 72 | 73 | if [ -z ${HOSTING_DNS_DOMAIN+x} ]; then 74 | HOSTING_DNS_DOMAIN="" 75 | fi 76 | 77 | if [ -z ${HOSTING_IPNS_HASH+x} ]; then 78 | HOSTING_IPNS_HASH="" 79 | fi 80 | 81 | if [ -z ${MAIN_PAGE_VERSION+x} ]; then 82 | MAIN_PAGE_VERSION="" 83 | fi 84 | 85 | printf "\nEnsure zimdump is present...\n" 86 | PATH=$PATH:$(realpath ./bin) 87 | which zimdump &> /dev/null || (curl --progress-bar -L https://download.openzim.org/release/zim-tools/zim-tools_linux-x86_64-3.1.0.tar.gz | tar -xvz --strip-components=1 -C ./bin zim-tools_linux-x86_64-3.1.0/zimdump && chmod +x ./bin/zimdump) 88 | 89 | printf "\nDownload and verify the zim file...\n" 90 | ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE all maxi latest | grep 'URL:' | cut -d' ' -f3)" 91 | ZIM_FILE=$(echo $ZIM_FILE_SOURCE_URL | rev | cut -d'/' -f1 | rev) 92 | TMP_DIRECTORY="./tmp/$(echo $ZIM_FILE | cut -d'.' -f1)" 93 | 94 | # Note: successful zimdump ends with creation of $TMP_DIRECTORY/zimdump_version 95 | # We use it as a hint if tmpdir should be purged or not 96 | 97 | printf "\nRemove any partial tmp directory $TMP_DIRECTORY before run ..." 98 | # so.. 
turns out rsync is the fastest: https://www.slashroot.in/which-is-the-fastest-method-to-delete-files-in-linux 99 | test -e $TMP_DIRECTORY/zimdump_version || (mkdir -p ./tmp/blank && rsync -a --delete ./tmp/blank/ $TMP_DIRECTORY ; rm -rf $TMP_DIRECTORY ./tmp/blank) 100 | 101 | printf "\nUnpack the zim file into $TMP_DIRECTORY if not there already...\n" 102 | test -e $TMP_DIRECTORY/zimdump_version || (zimdump dump ./snapshots/$ZIM_FILE --dir $TMP_DIRECTORY && zimdump --version > $TMP_DIRECTORY/zimdump_version) 103 | 104 | # Find the main page of ZIM 105 | ZIM_FILE_MAIN_PAGE=$(zimdump info ./snapshots/$ZIM_FILE | grep -oP 'main page: A/\K\S+') 106 | 107 | # Resolve the main page as it is on wikipedia over http 108 | MAIN_PAGE=$(./tools/find_main_page_name.sh "$LANGUAGE_CODE.$WIKI_TYPE.org") 109 | 110 | printf "\nConvert the unpacked zim directory to a website\n" 111 | node ./bin/run $TMP_DIRECTORY \ 112 | --zimfile=./snapshots/$ZIM_FILE \ 113 | --kiwixmainpage=$ZIM_FILE_MAIN_PAGE \ 114 | --mainpage=$MAIN_PAGE \ 115 | ${HOSTING_DNS_DOMAIN:+--hostingdnsdomain=$HOSTING_DNS_DOMAIN} \ 116 | ${HOSTING_IPNS_HASH:+--hostingipnshash=$HOSTING_IPNS_HASH} \ 117 | ${MAIN_PAGE_VERSION:+--mainpageversion=$MAIN_PAGE_VERSION} 118 | 119 | printf "\n-------------------------\n" 120 | printf "\nIPFS_PATH=$IPFS_PATH\n" 121 | 122 | printf "\nAdding the processed tmp directory to IPFS\n(this part may take long time on a slow disk):\n" 123 | CID=$(ipfs add -r --cid-version 1 --pin=false --offline -Qp $TMP_DIRECTORY) 124 | MFS_DIR="/${ZIM_FILE}__$(date +%F_%T)" 125 | 126 | # pin by adding to MFS under a meaningful name 127 | ipfs files cp /ipfs/$CID "$MFS_DIR" 128 | 129 | printf "\n\n-------------------------\nD O N E !\n-------------------------\n" 130 | printf "MFS: $MFS_DIR\n" 131 | printf "CID: $CID" 132 | printf "\n-------------------------\n" 133 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "distributed-wikipedia-mirror", 3 | "description": "Mirror wikipedia on IPFS", 4 | "version": "0.2.0", 5 | "private": true, 6 | "bin": { 7 | "zim-to-website": "./bin/run" 8 | }, 9 | "bugs": "https://github.com/ipfs/distributed-wikipedia-mirror", 10 | "dependencies": { 11 | "@oclif/command": "^1", 12 | "@oclif/config": "^1", 13 | "@oclif/plugin-help": "^2", 14 | "cheerio": "^1.0.0-rc.3", 15 | "cli-ux": "^5.4.4", 16 | "date-fns": "^2.10.0", 17 | "handlebars": "^4.7.3", 18 | "node-fetch": "^2.6.0", 19 | "tslib": "^2" 20 | }, 21 | "devDependencies": { 22 | "@oclif/dev-cli": "^1", 23 | "@oclif/test": "^1", 24 | "@types/chai": "^4", 25 | "@types/cheerio": "^0.22.16", 26 | "@types/mocha": "^5", 27 | "@types/node": "^14", 28 | "@types/node-fetch": "^2.5.8", 29 | "chai": "^4", 30 | "eslint": "^5.13", 31 | "eslint-config-oclif": "^3.1", 32 | "eslint-config-oclif-typescript": "^0.1", 33 | "eslint-config-prettier": "^6.10.0", 34 | "eslint-plugin-prettier": "^3.1.2", 35 | "eslint-plugin-simple-import-sort": "^5.0.1", 36 | "mocha": "^5", 37 | "nyc": "^14", 38 | "prettier": "^1.19.1", 39 | "ts-node": "^9", 40 | "typescript": "^4.1" 41 | }, 42 | "engines": { 43 | "node": ">=14.0.0" 44 | }, 45 | "files": [ 46 | "/bin", 47 | "/lib" 48 | ], 49 | "homepage": "https://github.com/ipfs/distributed-wikipedia-mirror", 50 | "keywords": [ 51 | "ipfs", 52 | "wikipedia" 53 | ], 54 | "license": "MIT", 55 | "main": "lib/index.js", 56 | "oclif": { 57 | "bin": "zim-to-website" 58 | }, 59 | "repository": 
"https://github.com/ipfs/distributed-wikipedia-mirror.git", 60 | "scripts": { 61 | "posttest": "eslint . --ext .ts --config .eslintrc", 62 | "prepack": "rm -rf lib && tsc -b && oclif-dev readme", 63 | "test": "nyc --extension .ts mocha --forbid-only \"test/**/*.test.ts\"", 64 | "version": "oclif-dev readme && git add README.md", 65 | "lint": "eslint . --ext .ts --config .eslintrc && prettier --check ./src/**/*.ts tsc" 66 | }, 67 | "types": "lib/index.d.ts" 68 | } 69 | -------------------------------------------------------------------------------- /snapshot-hashes.yml: -------------------------------------------------------------------------------- 1 | %YAML 1.2 2 | --- 3 | # How to access mirror via DNSLink and CID? 4 | # https://blog.ipfs.io/2021-05-31-distributed-wikipedia-mirror-update/#improved-access-to-wikipedia-mirrors 5 | 6 | en: 7 | name: English 8 | original: en.wikipedia.org 9 | dnslink: en.wikipedia-on-ipfs.org 10 | snapshot: 11 | date: 2021-03-09 12 | source: wikipedia_en_all_maxi_2021-02.zim 13 | cid: bafybeiaysi4s6lnjev27ln5icwm6tueaw2vdykrtjkwiphwekaywqhcjze 14 | tr: 15 | name: Turkish 16 | original: tr.wikipedia.org 17 | dnslink: tr.wikipedia-on-ipfs.org 18 | snapshot: 19 | date: 2021-02-19 20 | source: wikipedia_tr_all_maxi_2021-02.zim 21 | cid: bafybeieuutdavvf55sh3jktq2dpi2hkle6dtmebe7uklod3ramihyf3xa4 22 | my: 23 | name: Myanmar 24 | original: my.wikipedia.org 25 | dnslink: my.wikipedia-on-ipfs.org 26 | snapshot: 27 | date: 2021-02-22 28 | source: wikipedia_my_all_maxi_2021-02.zim 29 | cid: bafybeib66xujztkiq7lqbupfz6arzhlncwagva35dx54nj7ipyoqpyozhy 30 | ar: 31 | name: Arabic 32 | original: ar.wikipedia.org 33 | dnslink: ar.wikipedia-on-ipfs.org 34 | snapshot: 35 | source: wikipedia_ar_all_maxi_2021-03.zim 36 | date: 2021-03-26 37 | cid: bafybeih4a6ylafdki6ailjrdvmr7o4fbbeceeeuty4v3qyyouiz5koqlpi 38 | zh: 39 | name: Chinese 40 | original: zh.wikipedia.org 41 | dnslink: zh.wikipedia-on-ipfs.org 42 | snapshot: 43 | date: 2021-03-16 44 | source: wikipedia_zh_all_maxi_2021-02.zim 45 | cid: bafybeiazgazbrj6qprr4y5hx277u4g2r5nzgo3jnxkhqx56doxdqrzms6y 46 | uk: 47 | name: Ukrainian 48 | original: uk.wikipedia.org 49 | dnslink: uk.wikipedia-on-ipfs.org 50 | snapshot: 51 | date: 2022-03-09 52 | source: wikipedia_uk_all_maxi_2022-03.zim 53 | cid: bafybeibiqlrnmws6psog7rl5ofeci3ontraitllw6wyyswnhxbwdkmw4ka 54 | ru: 55 | name: Russian 56 | original: ru.wikipedia.org 57 | dnslink: ru.wikipedia-on-ipfs.org 58 | snapshot: 59 | date: 2022-03-12 60 | source: wikipedia_ru_all_maxi_2022-03.zim 61 | cid: bafybeiezqkklnjkqywshh4lg65xblaz2scbbdgzip4vkbrc4gn37horokq 62 | fa: 63 | name: Persian 64 | original: fa.wikipedia.org 65 | dnslink: fa.wikipedia-on-ipfs.org 66 | snapshot: 67 | date: 2021-08-18 68 | source: wikipedia_fa_all_maxi_2021-06.zim 69 | cid: bafybeicpnshmz7lhp5vcowscty4v4br33cjv22nhhqestavb2mww6zbswm 70 | -------------------------------------------------------------------------------- /snapshots/.ignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ipfs/distributed-wikipedia-mirror/294475504833dad508279b5a7c8a10bc4a29da23/snapshots/.ignore -------------------------------------------------------------------------------- /src/article-transforms.ts: -------------------------------------------------------------------------------- 1 | import { format } from 'date-fns' 2 | import { readFileSync } from 'fs' 3 | import { basename, relative } from 'path' 4 | import Handlebars from 'handlebars' 5 | 6 | import { 
EnhancedOpts } from './domain' 7 | 8 | const footerFragment = readFileSync( 9 | './src/templates/footer_fragment.handlebars' 10 | ) 11 | 12 | const generateFooterFrom = (options: EnhancedOpts) => { 13 | // const title = $html('title').text() 14 | 15 | const context = { 16 | SNAPSHOT_DATE: format(options.snapshotDate, 'yyyy-MM'), 17 | // ARTICLE_TITLE: encodeURIComponent(title), 18 | ARTICLE_URL: `https://${options.hostingDNSDomain}/wiki/${encodeURIComponent( 19 | options.relativeFilepath 20 | )}`, 21 | ARTICLE_URL_DISPLAY: `https://${options.hostingDNSDomain}/wiki/${options.relativeFilepath}`, 22 | IPNS_HASH: options.hostingIPNSHash, 23 | CANONICAL_URL: options.canonicalUrl, 24 | CANONICAL_URL_DISPLAY: decodeURIComponent(options.canonicalUrl), 25 | IMAGES_DIR: options.relativeImagePath, 26 | ZIM_NAME: basename(options.zimFile) 27 | } 28 | 29 | const footerTemplate = Handlebars.compile(footerFragment.toString()) 30 | 31 | const footer = footerTemplate(context) 32 | 33 | return footer 34 | } 35 | 36 | export const appendFooter = ($html: any, options: EnhancedOpts) => { 37 | const footer = generateFooterFrom(options) 38 | $html('#distribution-footer').remove() 39 | $html('#mw-mf-page-center').append( 40 | `` 41 | ) 42 | } 43 | 44 | export const appendHtmlPostfix = (href: string) => { 45 | // noop: .html no longer needed since we switched to zimdump 46 | return href 47 | } 48 | 49 | export const prefixRelativeRoot = (href: string) => { 50 | if (!href.startsWith('/wiki/')) { 51 | return href 52 | } 53 | 54 | return href.replace('/wiki/', './') 55 | } 56 | 57 | export const moveRelativeLinksUpOneLevel = (href: string) => { 58 | return href.replace('../', '') 59 | } 60 | 61 | export const moveRelativeLinksDownOneLevel = (href: string) => { 62 | if (!(href.startsWith('../') || href.startsWith('http') || href.startsWith('//'))) { 63 | return `../${href}` 64 | } 65 | return href.replace('../', '../../') 66 | } 67 | 68 | export const makeScriptLinksRelativeToWiki = (href: string) => { 69 | if (!href.startsWith('-/')) { 70 | return href 71 | } 72 | 73 | return `../${href}` 74 | } 75 | 76 | export const replaceANamespaceWithWiki = (href: string) => { 77 | return href.replace('/A/', '/wiki/') 78 | } 79 | 80 | export const reworkLinks = ( 81 | $html: any, 82 | selector = 'a:not(.external)', 83 | fns: ((href: string) => string)[] = [ 84 | replaceANamespaceWithWiki, 85 | appendHtmlPostfix 86 | ] 87 | ) => { 88 | const links = $html(selector) 89 | 90 | for (const link of Object.values(links)) { 91 | const attribs = link.attribs 92 | 93 | if (!attribs || !attribs.href) { 94 | continue 95 | } 96 | 97 | let href = attribs.href 98 | 99 | for (const fn of fns) { 100 | href = fn(href) 101 | } 102 | 103 | attribs.href = href 104 | } 105 | } 106 | 107 | export const reworkScriptSrcs = ( 108 | $html: any, 109 | selector = 'a:not(.external)', 110 | fns: ((href: string) => string)[] = [ 111 | replaceANamespaceWithWiki, 112 | appendHtmlPostfix 113 | ] 114 | ) => { 115 | const scripts = $html(selector) 116 | 117 | for (const script of Object.values(scripts)) { 118 | const attribs = script.attribs 119 | 120 | if (!attribs || !attribs.src) { 121 | continue 122 | } 123 | 124 | let src = attribs.src 125 | 126 | for (const fn of fns) { 127 | src = fn(src) 128 | } 129 | 130 | attribs.src = src 131 | } 132 | } 133 | 134 | export const reworkRedirect = ( 135 | $html: any, 136 | selector = 'meta[http-equiv="refresh"]', 137 | fns: ((href: string) => string)[] = [ 138 | moveRelativeLinksDownOneLevel 139 | ] 140 | ) => { 141 | 
const redirects = $html(selector) 142 | 143 | for (const redirect of Object.values(redirects)) { 144 | const attribs = redirect.attribs 145 | 146 | if (!attribs || !attribs.content) { 147 | continue 148 | } 149 | 150 | let { content } = attribs 151 | 152 | let [ delay, url ] = content.split(';url=') 153 | 154 | for (const fn of fns) { 155 | url = fn(url) 156 | } 157 | 158 | attribs.content = `${delay};url=${url}` 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /src/domain.ts: -------------------------------------------------------------------------------- 1 | export interface Options { 2 | unpackedZimDir: string 3 | kiwixMainPage: string 4 | mainPage: string 5 | mainPageVersion?: number 6 | hostingDNSDomain?: string 7 | hostingIPNSHash?: string 8 | zimFile: string 9 | noOfWorkerThreads: number 10 | } 11 | 12 | export interface EnhancedOpts extends Options { 13 | snapshotDate: Date 14 | canonicalUrl: string 15 | relativeFilepath: string 16 | relativeImagePath: string 17 | } 18 | 19 | export interface Directories { 20 | unpackedZimDir: string 21 | articleFolder: string 22 | imagesFolder: string 23 | wikiFolder: string 24 | jsmodulesFolder: string 25 | } 26 | 27 | export type ArticleWorkerData = { 28 | options: Options 29 | directories: Directories 30 | } 31 | 32 | export enum MessageRequestTypes { 33 | EXIT = 'EXIT', 34 | PROCESS_ARTICLES = 'PROCESS_ARTICLES' 35 | } 36 | 37 | export enum MessageResponseTypes { 38 | PROCESSED_ARTICLES = 'PROCESSED_ARTICLES', 39 | READY = 'READY' 40 | } 41 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import { Command, flags } from '@oclif/command' 2 | 3 | import { Options } from './domain' 4 | import { zimToWebsite } from './zim-to-website' 5 | 6 | class ZimToWebsite extends Command { 7 | static description = 'Convert unpacked zim files to usable websites' 8 | 9 | static examples = [ 10 | '$ zim-to-website ./tmp \\\n --hostingdnsdomain=tr.wikipedia-on-ipfs.org \\\n --hostingipnshash=QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W \\\n --zimfile=/path/to/wikipedia_tr_all_maxi_2019-12.zim \\\n --kiwixmainpage=Kullanıcı:The_other_Kiwix_guy/Landing \\\n --mainpage=Anasayfa' 11 | ] 12 | 13 | static flags = { 14 | version: flags.version({ char: 'v' }), 15 | help: flags.help({ char: 'h' }), 16 | zimfile: flags.string({ 17 | required: true, 18 | description: 'the location of the original (before unpacking) source zim file' 19 | }), 20 | kiwixmainpage: flags.string({ 21 | required: true, 22 | description: 23 | 'the main page as used by Kiwix and specified in the zim file' 24 | }), 25 | mainpage: flags.string({ 26 | required: true, 27 | description: 28 | "the main page as it is on the approptiate wikimedia site, e.g. 
'Anasayfa' for tr.wikipedia.org" 29 | }), 30 | hostingdnsdomain: flags.string({ 31 | required: false, 32 | description: 'the DNS domain the website will be hosted at' 33 | }), 34 | hostingipnshash: flags.string({ 35 | required: false, 36 | description: 'the IPNS address the website will be hosted at on IPFS' 37 | }), 38 | mainpageversion: flags.integer({ 39 | required: false, 40 | description: 'overrides the version of the homepage used' 41 | }), 42 | numberofworkerthreads: flags.integer({ 43 | required: false, 44 | default: 6, 45 | description: 'overrides the number of worker threads' 46 | }) 47 | } 48 | 49 | static args = [{ name: 'unpackedzimdir', required: true }] 50 | 51 | async run() { 52 | const { args, flags } = this.parse(ZimToWebsite) 53 | 54 | const options: Options = { 55 | unpackedZimDir: args.unpackedzimdir, 56 | hostingDNSDomain: flags.hostingdnsdomain, 57 | hostingIPNSHash: flags.hostingipnshash, 58 | zimFile: flags.zimfile, 59 | kiwixMainPage: flags.kiwixmainpage, 60 | mainPage: flags.mainpage, 61 | mainPageVersion: flags.mainpageversion, 62 | noOfWorkerThreads: flags.numberofworkerthreads 63 | } 64 | 65 | await zimToWebsite(options) 66 | } 67 | } 68 | 69 | export = ZimToWebsite 70 | -------------------------------------------------------------------------------- /src/site-transforms.ts: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio' 2 | import { cli } from 'cli-ux' 3 | import { format } from 'date-fns' 4 | import { 5 | copyFileSync, 6 | existsSync, 7 | lstatSync, 8 | mkdirSync, 9 | rmdirSync, 10 | readdirSync, 11 | readFileSync, 12 | renameSync, 13 | closeSync, 14 | openSync, 15 | opendirSync, 16 | unlinkSync, 17 | writeFileSync 18 | } from 'fs' 19 | import Handlebars from 'handlebars' 20 | import fetch from 'node-fetch' 21 | import path from 'path' 22 | import { join, basename, relative } from 'path' 23 | 24 | import { 25 | appendHtmlPostfix, 26 | makeScriptLinksRelativeToWiki, 27 | moveRelativeLinksUpOneLevel, 28 | moveRelativeLinksDownOneLevel, 29 | prefixRelativeRoot, 30 | reworkLinks, 31 | reworkScriptSrcs, 32 | reworkRedirect 33 | } from './article-transforms' 34 | import { Directories, Options } from './domain' 35 | import { downloadFile } from './utils/download-file' 36 | 37 | const indexRedirectFragment = readFileSync( 38 | './src/templates/index_redirect_fragment.handlebars' 39 | ) 40 | 41 | const footerFragment = readFileSync( 42 | './src/templates/footer_fragment.handlebars' 43 | ) 44 | 45 | export const copyImageAssetsIntoWiki = async ( 46 | assetsDir: string, 47 | { imagesFolder }: Directories 48 | ) => { 49 | cli.action.start(' Copying image assets into unpacked zim directory ') 50 | const imagesFiles = readdirSync(assetsDir) 51 | 52 | for (const imageFile of imagesFiles) { 53 | const filepath = join(assetsDir, imageFile) 54 | const info = lstatSync(filepath) 55 | 56 | if (!info.isFile()) { 57 | return 58 | } 59 | 60 | const imagesFolderPath = join(imagesFolder, imageFile) 61 | copyFileSync(filepath, imagesFolderPath) 62 | } 63 | cli.action.stop() 64 | } 65 | 66 | export const moveArticleFolderToWiki = ({ 67 | articleFolder, 68 | wikiFolder 69 | }: Directories) => { 70 | if (existsSync(wikiFolder)) { 71 | return 72 | } 73 | 74 | cli.action.start(' Renaming A namespace to wiki ') 75 | renameSync(articleFolder, wikiFolder) 76 | cli.action.stop() 77 | } 78 | 79 | export const fixFavicon = ({ 80 | unpackedZimDir 81 | }: Directories) => { 82 | const favicon = join(unpackedZimDir, '-', 'favicon') 
83 | const faviconIco = join(unpackedZimDir, 'favicon.ico') 84 | if (existsSync(faviconIco) || !existsSync(favicon)) { 85 | return 86 | } 87 | 88 | cli.action.start(' Fixing favicon ') 89 | copyFileSync(favicon, faviconIco) 90 | cli.action.stop() 91 | } 92 | 93 | // https://github.com/ipfs/distributed-wikipedia-mirror/issues/80 94 | export const fixExceptions = async ({ 95 | unpackedZimDir, 96 | wikiFolder 97 | }: Directories) => { 98 | 99 | /* 100 | for every FOO directory in wiki/FOO 101 | find article _exceptions/A%2fFOO 102 | if exists, move it to wiki/FOO/index.html 103 | 104 | for every file matching _exceptions/A%2f* 105 | split name into segments 106 | for each but the last segment 107 | check if wiki/FOO exists, 108 | if exists and is a directory, do nothing 109 | if does not exist, create a dir 110 | if exists, but is a file, replace file with a dir, and move file to FOO/index.html 111 | finally, write last segment under wiki/FOO/bar 112 | */ 113 | 114 | // Articles with "/" in namei like "foo/bar" produce conflicts and those are saved under 115 | // url-escaped flat-files in exceptions directory 116 | // What we do here is to take every "foo" exception and rename it to foo/index.html, 117 | // so it loads fine under own name 118 | const exceptionsDir = join(unpackedZimDir, '_exceptions') 119 | if (!existsSync(exceptionsDir)) { 120 | return 121 | } 122 | const dir = opendirSync(exceptionsDir) 123 | for await (let file of dir) { 124 | let articleName 125 | try { 126 | articleName = decodeURIComponent(file.name) 127 | } catch (e) { 128 | console.error(`[fixExceptions] unable to decodeURIComponent(${file.name}), skipping `) 129 | continue 130 | } 131 | const segments = articleName.split('/') 132 | 133 | // only process exceptions from A/ namespace 134 | if (segments[0] !== 'A') continue 135 | segments.shift() // remove A/ 136 | 137 | // only process exceptions where segments have 1+ length 138 | // and can be represented as directories 139 | if (!segments.length || segments.some(s => !s.length)) continue 140 | 141 | // console.log('processing: ' + articleName) 142 | const suffixFile = segments.pop() || '' 143 | 144 | // creation of index.html breaks links created by zimdump: 145 | // needs manual adjustment of relative links to be prefixed with ../ 146 | const fixRelativeLinks = (filePath: string, depth: number) => { 147 | const fileBytes = readFileSync(filePath) 148 | const $fileHtml = cheerio.load(fileBytes.toString()) 149 | 150 | const linkFixups = Array.from({ length: depth }, (x, i) => moveRelativeLinksDownOneLevel) 151 | reworkLinks($fileHtml, 'a:not(a[href^="http"]):not(a[href^="//"])', linkFixups) 152 | reworkLinks($fileHtml, 'link[href^="../"]', linkFixups) 153 | reworkScriptSrcs($fileHtml, 'img', linkFixups) 154 | reworkScriptSrcs($fileHtml, 'script', linkFixups) 155 | reworkRedirect($fileHtml, 'meta[http-equiv="refresh"]', linkFixups) 156 | 157 | // console.log(` fixed relative paths in ${filePath}`) 158 | // renameSync(filePath, `${filePath}.original`) 159 | writeFileSync(filePath, $fileHtml.html()) 160 | } 161 | 162 | // if article is not A/foo but A/FOO/bar parent dirs need to be inspected 163 | if (segments.length) { 164 | // ensure dir at each level exists and has no conflict 165 | for (let i = 1; i < segments.length+1; i++) { 166 | const parentDir = join(wikiFolder, ...segments.slice(0,i)) 167 | // console.log(' checking parentDir: ' + parentDir) 168 | if (existsSync(parentDir)) { 169 | if (lstatSync(parentDir).isFile()) { 170 | // If a file exists under the name of 
a directory we need, 171 | // move file into a newly created dir 172 | const articleTmp = `${parentDir}.tmp` 173 | const articleDst = join(parentDir, 'index.html') 174 | // console.log(` parentDir is a file, renaming to ${articleDst}`) 175 | renameSync(parentDir, articleTmp) 176 | mkdirSync(parentDir, { recursive: true }) 177 | renameSync(articleTmp, articleDst) 178 | fixRelativeLinks(articleDst, i) 179 | } 180 | } else { 181 | // console.log(` created parentDir`) 182 | mkdirSync(parentDir, { recursive: true }) 183 | } 184 | } 185 | } 186 | 187 | const articleSrc = join(exceptionsDir, file.name) 188 | const articleDir = join(wikiFolder, ...segments) 189 | const articleDst = join(articleDir, suffixFile) 190 | 191 | // console.log(` renaming ${articleSrc}`) 192 | 193 | if (existsSync(articleDst) && lstatSync(articleDst).isDirectory()) { 194 | // console.log(` directory already, renaming to ${articleDst}/index.html`) 195 | const movedArticleDst = join(articleDst, 'index.html') 196 | renameSync(articleSrc, movedArticleDst) 197 | fixRelativeLinks(movedArticleDst, 1) 198 | } else { 199 | // console.log(` renamed to ${articleDst}`) 200 | renameSync(articleSrc, articleDst) 201 | } 202 | } 203 | // TODO: remove _exceptions? 204 | } 205 | 206 | export const includeSourceZim = ({ 207 | zimFile, 208 | unpackedZimDir 209 | }: Options) => { 210 | const zimCopy = join(unpackedZimDir, basename(zimFile)) 211 | if (existsSync(zimCopy)) { 212 | return 213 | } 214 | 215 | cli.action.start(' Copying source ZIM to the root of unpacked zim directory ') 216 | copyFileSync(zimFile, zimCopy) 217 | cli.action.stop() 218 | } 219 | 220 | export const insertIndexRedirect = (options: Options) => { 221 | cli.action.start(" Inserting root 'index.html' as redirect to main page") 222 | const template = Handlebars.compile(indexRedirectFragment.toString()) 223 | 224 | const indexPath = join(options.unpackedZimDir, 'index.html') 225 | const wikiIndexPath = join(options.unpackedZimDir, 'wiki', 'index.html') 226 | 227 | if (existsSync(indexPath)) { 228 | unlinkSync(indexPath) 229 | } 230 | 231 | writeFileSync( 232 | indexPath, 233 | template({ 234 | MAIN_PAGE: 'wiki/' 235 | }) 236 | ) 237 | 238 | if (existsSync(wikiIndexPath)) { 239 | unlinkSync(wikiIndexPath) 240 | } 241 | 242 | // note that this is temporary stub, we most likely 243 | // override this template with a better landing during later steps 244 | writeFileSync( 245 | wikiIndexPath, 246 | template({ 247 | MAIN_PAGE: `./${options.mainPage}` 248 | }) 249 | ) 250 | 251 | cli.action.stop() 252 | } 253 | 254 | export const resolveDirectories = (options: Options) => { 255 | const articleFolder = join(options.unpackedZimDir, 'A') 256 | const imagesFolder = join(options.unpackedZimDir, 'I') 257 | const wikiFolder = join(options.unpackedZimDir, 'wiki') 258 | const jsmodulesFolder = join(options.unpackedZimDir, '-') 259 | 260 | const directories: Directories = { 261 | unpackedZimDir: options.unpackedZimDir, 262 | articleFolder, 263 | imagesFolder, 264 | wikiFolder, 265 | jsmodulesFolder 266 | } 267 | 268 | return directories 269 | } 270 | 271 | // Gets path to an article after unpacking and fixExceptions fixups 272 | const unpackedArticlePath = (wikiFolder: string, article: string) => { 273 | let articlePath = join(wikiFolder, article) 274 | if (!existsSync(articlePath)) throw new Error(`unpacked '/wiki/${article}' is missing`) 275 | if (lstatSync(articlePath).isDirectory()) { 276 | const fixedSrc = join(articlePath, 'index.html') 277 | if (existsSync(fixedSrc)) { 278 | return 
fixedSrc 279 | } else { 280 | throw new Error(`unpacked '/wiki/${article}' is a dir without index.html`) 281 | } 282 | } 283 | return articlePath 284 | } 285 | 286 | 287 | // We copy "kiwix main page" to /wiki/index.html + adjust links. 288 | // This way original one can still be loaded if needed 289 | // Example for tr: 290 | // /wiki/index.html is https://tr.wikipedia.org/wiki/Kullanıcı:The_other_Kiwix_guy/Landing 291 | // /wiki/Anasayfa is https://tr.wikipedia.org/wiki/Anasayfa 292 | export const useKiwixLandingPage = async ( 293 | options: Options, 294 | { wikiFolder, imagesFolder }: Directories 295 | ) => { 296 | 297 | cli.action.start(` Generating landing page at /wiki/ from Kiwix one at /wiki/${options.kiwixMainPage}`) 298 | 299 | const landingPagePath = join(wikiFolder, 'index.html') 300 | const kiwixMainPageSrc = unpackedArticlePath(wikiFolder, options.kiwixMainPage) 301 | 302 | // Use main page from Kiwix as the landing: 303 | // In most cases it is already the best landing available 304 | copyFileSync(kiwixMainPageSrc, landingPagePath) 305 | 306 | // Tweak page title of custom landing created by The_other_Kiwix_guy :-) 307 | if (kiwixMainPageSrc.includes('The_other_Kiwix_guy')) { 308 | // Set title to one from canonical main page 309 | const $landingHtml = cheerio.load(readFileSync(landingPagePath).toString()) 310 | const canonicalUrlString = $landingHtml('link[rel="canonical"]').attr('href') 311 | if (!canonicalUrlString) { 312 | throw new Error(`Could not parse out canonical url for ${canonicalUrlString}`) 313 | } 314 | const canonicalUrl = new URL(canonicalUrlString) 315 | canonicalUrl.pathname = `wiki/${options.mainPage}` 316 | const response = await fetch(canonicalUrl) 317 | const pageBody = await response.text() 318 | const $remoteMainPageHtml = cheerio.load(pageBody) 319 | const pageTitle = $remoteMainPageHtml('title').text() 320 | $landingHtml('title').text(pageTitle) 321 | writeFileSync(landingPagePath, $landingHtml.html()) 322 | } 323 | 324 | // Fixup relative paths, if needed 325 | const depth = (options.kiwixMainPage.match(/\//g) || []).length 326 | if (depth) { 327 | const fixRelativeLinksUp = (filePath: string, depth: number) => { 328 | const fileBytes = readFileSync(filePath) 329 | const $fileHtml = cheerio.load(fileBytes.toString()) 330 | 331 | const linkFixups = Array.from({ length: depth }, (x, i) => moveRelativeLinksUpOneLevel) 332 | reworkLinks($fileHtml, 'a:not(a[href^="http"]):not(a[href^="//"])', linkFixups) 333 | reworkLinks($fileHtml, 'link[href^="../"]', linkFixups) 334 | reworkScriptSrcs($fileHtml, 'img', linkFixups) 335 | reworkScriptSrcs($fileHtml, 'script', linkFixups) 336 | 337 | // console.log(` fixed relative paths in ${filePath}`) 338 | // renameSync(filePath, `${filePath}.original`) 339 | writeFileSync(filePath, $fileHtml.html()) 340 | } 341 | fixRelativeLinksUp(landingPagePath, depth) 342 | } 343 | 344 | cli.action.stop() 345 | } 346 | 347 | // This is usually not used nor needed, but we keep this code around 348 | // in case we need to generate some language quickly and there is a bug in ZIM 349 | // that makes main page unusable. 350 | // With this, we are able to fetch corresponding revision from upstream wikipedia 351 | // and replace ZIM article with upstream one + fixup links and images. 
352 | // (This is no longer needed for most ZIMs after we switched to upstream zim-tools) 353 | /* 354 | export const fetchOriginalMainPage = async ( 355 | options: Options, 356 | { wikiFolder, imagesFolder }: Directories 357 | ) => { 358 | 359 | 360 | // We copy "kiwix main page" to /wiki/index.html 361 | // This way original one can still be loaded if needed 362 | // Example for tr: 363 | // /wiki/index.html is https://tr.wikipedia.org/wiki/Kullanıcı:The_other_Kiwix_guy/Landing 364 | // /wiki/Anasayfa is https://tr.wikipedia.org/wiki/Anasayfa 365 | const mainPagePath = join(wikiFolder, 'index.html') 366 | 367 | cli.action.start(` Generating main page into /wiki/`) 368 | 369 | let kiwixMainPageSrc = join(wikiFolder, `${options.kiwixMainPage}`) 370 | 371 | // Handle namespace conflicts resolved in fixExceptions step 372 | if (lstatSync(kiwixMainPageSrc).isDirectory()) { 373 | const fixedSrc = `${kiwixMainPageSrc}/index.html` 374 | if (existsSync(fixedSrc)) { 375 | kiwixMainPageSrc = fixedSrc 376 | } else { 377 | throw new Error(`kiwixMainPageSrc "${kiwixMainPageSrc}" is a dir without index.html`) 378 | } 379 | } 380 | 381 | const kiwixMainpage = readFileSync(kiwixMainPageSrc) 382 | 383 | const $kiwixMainPageHtml = cheerio.load(kiwixMainpage.toString()) 384 | 385 | const canonicalUrlString = $kiwixMainPageHtml('link[rel="canonical"]').attr( 386 | 'href' 387 | ) 388 | 389 | if (!canonicalUrlString) { 390 | throw new Error( 391 | `Could not parse out canonical url for ${canonicalUrlString}` 392 | ) 393 | } 394 | 395 | let canonicalPageVersionid: string 396 | 397 | if (options.mainPageVersion) { 398 | canonicalPageVersionid = options.mainPageVersion.toString() 399 | } else { 400 | const matches = $kiwixMainPageHtml.html().match(/(?<=oldid=)\d+/g) 401 | 402 | if (!matches) { 403 | throw new Error('Could not parse out the canoncial urls version id') 404 | } 405 | 406 | canonicalPageVersionid = matches[0] 407 | } 408 | 409 | const canonicalUrl = new URL(canonicalUrlString) 410 | canonicalUrl.pathname = `wiki/${options.mainPage.replace('.html', '')}` 411 | 412 | canonicalUrl.searchParams.set('oldid', canonicalPageVersionid) 413 | 414 | try { 415 | const response = await fetch(canonicalUrl) 416 | const pageBody = await response.text() 417 | const $remoteMainPageHtml = cheerio.load(pageBody) 418 | 419 | const $remoteContent = $remoteMainPageHtml('#content') 420 | const remotePageTitle = $remoteMainPageHtml('title').text().replace(':The other Kiwix guy/', '') 421 | 422 | $remoteContent.addClass('content') 423 | $remoteContent.find('#siteNotice').remove() 424 | $remoteContent.find('#firstHeading').remove() 425 | $remoteContent.find('#siteSub').remove() 426 | $remoteContent.find('#contentSub').remove() 427 | $remoteContent.find('#catlinks').remove() 428 | $remoteContent.find('#mw-fr-revisiontag-old').remove() 429 | $remoteContent.find('a.mw-jump-link').remove() 430 | $remoteContent.find('#mc0').remove() 431 | 432 | // Some styling on the top banner - I know, this has got ... 
hacky 433 | // Set the width to 100% 434 | $remoteContent 435 | .find('#mp-topbanner') 436 | .attr('style', 'width:100% !important; margin-bottom:2px;') 437 | 438 | // Slightly reduce the size of the text 439 | $remoteContent 440 | .find('#mp-topbanner tbody tbody tr td:last-of-type') 441 | .attr('style', 'width:16%; font-size:95%;') 442 | 443 | // Change the globe icon to the wikipedia-on-IPFS version 444 | $remoteContent 445 | .find('.globegris') 446 | .attr( 447 | 'style', 448 | 'background-image: url("../I/wikipedia-on-ipfs.png"); background-repeat:no-repeat; background-position:-20px -40px; background-size: 200px; width:100%; border:1px solid #a7d7f9; vertical-align:top;' 449 | ) 450 | 451 | // Copy image downloads 452 | const $externalImages = $remoteContent.find( 453 | 'img[src*="upload.wikimedia.org"]' 454 | ) 455 | 456 | for (const $externalImage of $externalImages.toArray()) { 457 | const src = $externalImage.attribs.src 458 | const filename = path.basename(src) 459 | 460 | // eslint-disable-next-line no-await-in-loop 461 | await downloadFile( 462 | new URL(`http:${src}`), 463 | join(imagesFolder, decodeURIComponent(filename)) 464 | ) 465 | $externalImage.attribs.src = `../I/${filename}` 466 | delete $externalImage.attribs.srcset 467 | } 468 | 469 | const $kiwixNote = $kiwixMainPageHtml('#mw-content-text > div:last-child') 470 | 471 | $remoteContent.find('#mw-content-text').append($kiwixNote) 472 | 473 | // Add title from remote main page 474 | $kiwixMainPageHtml('title').text(remotePageTitle) 475 | 476 | $kiwixMainPageHtml('#content').remove() 477 | $kiwixMainPageHtml('#mw-mf-page-center').prepend($remoteContent) 478 | 479 | // Updage the article issuing link at the bottom 480 | $kiwixMainPageHtml('a') 481 | .filter((_, elem) => { 482 | // console.log(elem.attribs?.href) 483 | return elem.attribs?.href?.includes('oldid') 484 | }) 485 | .first() 486 | .attr('href', canonicalUrl.href) 487 | 488 | // Update the canoncial url from the remote main page (without oldid) 489 | canonicalUrl.searchParams.delete('oldid') 490 | $kiwixMainPageHtml('link[rel="canonical"]').attr('href', canonicalUrl.href) 491 | 492 | reworkLinks( 493 | $kiwixMainPageHtml, 494 | 'a[href^="/wiki/"]:not(a[href$=".svg"]):not(a[href$=".png"]):not(a[href$=".jpg"])', 495 | [appendHtmlPostfix, prefixRelativeRoot] 496 | ) 497 | 498 | // update css links 499 | reworkLinks($kiwixMainPageHtml, 'link[href^="../../"]', [ 500 | moveRelativeLinksUpOneLevel 501 | ]) 502 | 503 | if (options.kiwixMainPage.includes('/')) { 504 | reworkScriptSrcs($kiwixMainPageHtml, 'script', [ 505 | moveRelativeLinksUpOneLevel 506 | ]) 507 | } else { 508 | reworkScriptSrcs($kiwixMainPageHtml, 'script', [ 509 | makeScriptLinksRelativeToWiki 510 | ]) 511 | } 512 | 513 | writeFileSync(mainPagePath, $kiwixMainPageHtml.html()) 514 | 515 | cli.action.stop() 516 | } catch (error) { 517 | cli.error(error) 518 | } 519 | } 520 | */ 521 | 522 | export const appendJavscript = ( 523 | options: Options, 524 | { unpackedZimDir, jsmodulesFolder }: Directories 525 | ) => { 526 | cli.action.start(' Appending custom javascript to site.js ') 527 | 528 | const delimiter = '/* Appended by Distributed Wikipedia Mirror – details at https://github.com/ipfs/distributed-wikipedia-mirror */' 529 | const targetSiteJsFile = join(jsmodulesFolder, 'mw', 'site.js') 530 | 531 | const dwmSitejsTemplate = readFileSync('./src/templates/site.js.handlebars') 532 | .toString() 533 | .replace('', '') 535 | 536 | const context = { 537 | SNAPSHOT_DATE: format(new Date(), 'yyyy-MM'), 538 | 
HOSTING_IPNS_HASH: options.hostingIPNSHash, 539 | HOSTING_DNS_DOMAIN: options.hostingDNSDomain, 540 | ZIM_NAME: basename(options.zimFile) 541 | } 542 | 543 | const dwmSitejs = Handlebars.compile(dwmSitejsTemplate.toString())({ 544 | FOOTER_TEMPLATE: footerFragment 545 | .toString() 546 | .replace(/\n/g, '\\\n') 547 | .replace(/"/g, '\\"'), 548 | DWM_OPTIONS: JSON.stringify(context, null, 2) 549 | }) 550 | 551 | let originalSiteJs = readFileSync(targetSiteJsFile).toString() 552 | 553 | // hack out erroring site.js code 554 | originalSiteJs = originalSiteJs.replace( 555 | 'if(wgCanonicalSpecialPageName=="Watchlist")importScript(\'MediaWiki:Common.js/WatchlistNotice.js\');', 556 | '' 557 | ) 558 | 559 | if (originalSiteJs.includes(delimiter)) { 560 | originalSiteJs = originalSiteJs.split(delimiter)[0] 561 | } 562 | 563 | // const updatedSiteJs = `${originalSiteJs}\n${delimiter}\n${dwmSitejs}` 564 | const updatedSiteJs = `\n${delimiter}\n${dwmSitejs}` 565 | 566 | writeFileSync(targetSiteJsFile, updatedSiteJs) 567 | 568 | // hack to stop console error 569 | const targetJsConfigVarFile = join(jsmodulesFolder, 'mw', 'jsConfigVars.js') 570 | writeFileSync(targetJsConfigVarFile, '{}') 571 | 572 | // hack replace the unexpected var in startup.js 573 | const startupJsFile = join(jsmodulesFolder, 'mw', 'startup.js') 574 | let startJs = readFileSync(startupJsFile).toString() 575 | startJs = startJs 576 | .replace('function domEval(code){var', 'function domEval(code){') 577 | .replace('"/w/load.php"', '"../w/load.php"') 578 | writeFileSync(startupJsFile, startJs) 579 | 580 | // Create a stub load.php 581 | const loadPhpPath = join(unpackedZimDir, 'w', 'load.php') 582 | if (!existsSync(loadPhpPath)) { 583 | mkdirSync(join(unpackedZimDir, 'w')) 584 | writeFileSync(loadPhpPath, '/* Stubbed by Distributed Wikipedia Mirror */') 585 | } 586 | 587 | // hack: overwrite erroring js files see https://github.com/openzim/mwoffliner/issues/894 588 | for (const file of ['ext.cite.ux-enhancements.js']) { 589 | const filepath = join(jsmodulesFolder, 'mw', file) 590 | const overwriteText = 591 | '/* Overwritten by Distributed Wikipedia Mirror to prevent js errors, see https://github.com/openzim/mwoffliner/issues/894 */' 592 | writeFileSync(filepath, overwriteText) 593 | } 594 | 595 | cli.action.stop() 596 | } 597 | -------------------------------------------------------------------------------- /src/templates/footer_fragment.handlebars: -------------------------------------------------------------------------------- 1 |
2 | 96 | 144 |
145 | -------------------------------------------------------------------------------- /src/templates/index_redirect_fragment.handlebars: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 9 | -------------------------------------------------------------------------------- /src/templates/site.js.handlebars: -------------------------------------------------------------------------------- 1 | 146 | -------------------------------------------------------------------------------- /src/utils/assert-never.ts: -------------------------------------------------------------------------------- 1 | export function assertNever(x: never): never { 2 | throw new Error('Unexpected object: ' + x) 3 | } 4 | -------------------------------------------------------------------------------- /src/utils/check-unpacked-zim-dir.ts: -------------------------------------------------------------------------------- 1 | import { existsSync, lstatSync } from 'fs' 2 | 3 | export const checkUnpackedZimDir = (unpackedZimDir: string) => { 4 | if (!existsSync(unpackedZimDir)) { 5 | throw new Error(`Unpacked Zim Directory does not exist: ${unpackedZimDir}`) 6 | } 7 | 8 | if (!lstatSync(unpackedZimDir).isDirectory()) { 9 | throw new Error(`Unpacked Zim Directory must be a directory`) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/utils/download-file.ts: -------------------------------------------------------------------------------- 1 | import { exec } from 'child_process' 2 | import { URL } from 'url' 3 | 4 | export const downloadFile = (url: URL, dest: string) => { 5 | return new Promise((resolve, reject) => { 6 | const wget = `wget --continue -O "${dest}" "${url}"` 7 | 8 | exec(wget, (err, stdout, stderr) => { 9 | if (err) { 10 | if (stderr) console.error(stderr) 11 | reject(err) 12 | } 13 | 14 | resolve(stdout) 15 | }) 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /src/utils/walk-files.ts: -------------------------------------------------------------------------------- 1 | import { readdir } from 'fs' 2 | import { resolve } from 'path' 3 | import { promisify } from 'util' 4 | 5 | const readdirAsync = promisify(readdir) 6 | 7 | export default async function* walkFiles( 8 | dir: string 9 | ): AsyncGenerator { 10 | const dirents = await readdirAsync(dir, { withFileTypes: true }) 11 | for (const dirent of dirents) { 12 | const res = resolve(dir, dirent.name) 13 | if (dirent.isDirectory()) { 14 | yield* walkFiles(res) 15 | } else { 16 | yield res 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/zim-to-website.ts: -------------------------------------------------------------------------------- 1 | import { cli } from 'cli-ux' 2 | 3 | import { Options } from './domain' 4 | import { 5 | appendJavscript as appendJavascript, 6 | includeSourceZim, 7 | copyImageAssetsIntoWiki, 8 | fixFavicon, 9 | fixExceptions, 10 | // generateMainPage, 11 | useKiwixLandingPage, 12 | insertIndexRedirect, 13 | moveArticleFolderToWiki, 14 | resolveDirectories 15 | } from './site-transforms' 16 | import { checkUnpackedZimDir } from './utils/check-unpacked-zim-dir' 17 | 18 | export const zimToWebsite = async (options: Options) => { 19 | const directories = resolveDirectories(options) 20 | 21 | checkUnpackedZimDir(options.unpackedZimDir) 22 | 23 | cli.log('-------------------------') 24 | cli.log('Zim to Website Conversion') 25 | 
cli.log('-------------------------') 26 | cli.log(` Unpacked Zim Directory: ${options.unpackedZimDir}`) 27 | cli.log(` Zim File: ${options.zimFile}`) 28 | cli.log(` Original Main Page: ${options.mainPage ? decodeURIComponent(options.mainPage) : null}`) 29 | cli.log(` ZIM's Main Page: ${options.kiwixMainPage}`) 30 | 31 | if (options.hostingDNSDomain) { 32 | cli.log(` Hosting DNS Domain: ${options.hostingDNSDomain}`) 33 | } 34 | 35 | if (options.hostingIPNSHash) { 36 | cli.log(` Hosting IPNS Hash: ${options.hostingIPNSHash}`) 37 | } 38 | 39 | if (options.mainPageVersion) { 40 | cli.log(` Main Page version: ${options.mainPageVersion}`) 41 | } 42 | 43 | cli.log('-------------------------') 44 | cli.log('') 45 | 46 | cli.log(`Starting zim to website conversion ...`) 47 | 48 | includeSourceZim(options) 49 | copyImageAssetsIntoWiki('./assets', directories) 50 | fixFavicon(directories) 51 | moveArticleFolderToWiki(directories) 52 | await fixExceptions(directories) 53 | insertIndexRedirect(options) 54 | appendJavascript(options, directories) 55 | await useKiwixLandingPage(options, directories) 56 | 57 | // usually main page from kiwix is ok, so we dont need below 58 | // await generateMainPage(options, directories) 59 | 60 | cli.log('done') 61 | } 62 | -------------------------------------------------------------------------------- /test/index.test.ts: -------------------------------------------------------------------------------- 1 | import { expect, test } from '@oclif/test' 2 | 3 | import cmd = require('../src') 4 | 5 | describe('zim-to-website', () => { 6 | test 7 | .stdout() 8 | .do(() => cmd.run([])) 9 | .it('runs hello', ctx => { 10 | expect(ctx.stdout).to.contain('hello world') 11 | }) 12 | 13 | test 14 | .stdout() 15 | .do(() => cmd.run(['--name', 'jeff'])) 16 | .it('runs hello --name jeff', ctx => { 17 | expect(ctx.stdout).to.contain('hello jeff') 18 | }) 19 | }) 20 | -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --require ts-node/register 2 | --watch-extensions ts 3 | --recursive 4 | --reporter spec 5 | --timeout 5000 6 | -------------------------------------------------------------------------------- /test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig", 3 | "compilerOptions": { 4 | "noEmit": true 5 | }, 6 | "references": [ 7 | {"path": ".."} 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /tools/find_main_page_name.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: set ts=2 sw=2: 3 | 4 | set -euo pipefail 5 | 6 | # Every Wikipedia version uses different name of the main page 7 | 8 | usage() { 9 | echo "USAGE:" 10 | echo " $0 "; 11 | echo "" 12 | exit 2 13 | } 14 | 15 | if [ -z "${1-}" ]; then 16 | echo "Missing wikipedia domain" 17 | usage 18 | fi 19 | 20 | MAIN_PAGE=$(curl -Ls -o /dev/null -w %{url_effective} https://${1} | cut -d"/" -f5) 21 | printf "%s\n" "${MAIN_PAGE}" 22 | -------------------------------------------------------------------------------- /tools/find_original_main_page_url.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # vim: set ts=2 sw=2: 3 | 4 | set -euo pipefail 5 | 6 | # Landing pages shipping with ZIM file are either truncated or Kiwix-specific. 
7 | # This script finds the URL of the original version of the landing page 8 | # matching the timestamp of the snapshot in the unpacked ZIM directory 9 | 10 | usage() { 11 |   echo "USAGE:" 12 |   echo "  $0 
"; 13 | echo "" 14 | exit 2 15 | } 16 | 17 | if [ -z "${1-}" ]; then 18 | echo "Missing main page name (eg. Main_Page.html) " 19 | usage 20 | fi 21 | 22 | if [ -z "${2-}" ]; then 23 | echo "Missing unpacked zim dir (eg. ./out) " 24 | usage 25 | fi 26 | 27 | MAIN_PAGE=$1 28 | ZIM_ROOT=$2 29 | 30 | SNAPSHOT_URL=$(grep -io 'https://[^"]*oldid=[^"]*' "$ZIM_ROOT/A/$MAIN_PAGE") 31 | 32 | echo $SNAPSHOT_URL 33 | -------------------------------------------------------------------------------- /tools/getzim.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # internal 4 | 5 | BASE=$(pwd -P) 6 | CACHE="$BASE/.cache" 7 | 8 | # conf 9 | 10 | BASEURL="https://download.kiwix.org/zim/" 11 | 12 | # ui funcs 13 | 14 | log() { 15 | echo "$(date +%s): $*" 16 | } 17 | 18 | textmenu() { # allows the user to choose an id or item from a list 19 | echo 20 | a=($1) # turn $1 into an array 21 | [ -z "${a[1]}" ] && echo "Skipping question \"$2\" because there is only one choice: ${a[0]}" && res="${a[0]}" && return 0 22 | i=0 23 | for c in $1; do # build a list 24 | echo "[$i] $c" 25 | i=$(expr $i + 1) 26 | done 27 | [ -z "$3" ] && read -p "[?] $2 > " _id #if no element was specified as cli arg ask the user 28 | [ ! -z "$3" ] && _id="$3" #otherwise use that 29 | id=$(echo "$_id" | sed "s|[^0-9]||g") #only numbers 30 | for e in $1; do [ "$e" == "$_id" ] && res="$e" && echo "< $res" && return 0; done #check if item is in list 31 | res=${a[$id]} 32 | [ -z "$res" ] && [ ! -z "$3" ] && echo "Invalid ID or item: $3" && exit 2 #if id/item was specified via cli exit if invalid 33 | [ -z "$id" ] && [ -z "$_id" ] && textmenu "$1" "$2" && return 0 #no input 34 | [ -z "$id" ] && echo "Please enter a number or an item name" && textmenu "$1" "$2" && return 0 35 | [ -z "$res" ] && echo "INVALID ID" && textmenu "$1" "$2" && return 0 36 | echo "< $res" #show the choice to the user 37 | } 38 | 39 | # scraper fncs 40 | 41 | fetch_with_cache() { 42 | OUTFILE=${1/"/"//"_"} 43 | OUTFILE="$CACHE/_page$OUTFILE" 44 | 45 | if [ -e "$OUTFILE" ]; then 46 | cat "$OUTFILE" 47 | else 48 | OUT=$(curl -sL "$BASEURL$1") 49 | mkdir -p "$CACHE" 50 | echo "$OUT" > "$OUTFILE" 51 | echo "$OUT" 52 | fi 53 | } 54 | 55 | get_urls() { 56 | grep href | grep -v "
" | sed -E 's|.*href="(.*)".*|\1|g' | sed "s|/||g"
 57 | }
 58 | 
 59 | # main funcs
 60 | 
 61 | cmd_cache_update() {
 62 |   echo "Updating cache..."
 63 | 
 64 |   rm -rf "$CACHE"
 65 |   mkdir -p "$CACHE"
 66 |   for url in $(fetch_with_cache | get_urls); do
 67 |     echo "Updating cache for $url..."
 68 |     fetch_with_cache "$url" > /dev/null
 69 |   done
 70 | }
 71 | 
 72 | urlp() {
 73 |   # usage: get var
 74 |   # usage: filter type lang edition tags... date/"latest"
 75 | 
 76 |   case "$1" in
 77 |     get)
 78 |       get_var="$2"
 79 |       ;;
 80 |     filter)
 81 |       shift
 82 |       filter_type="$1"
 83 |       shift
 84 |       filter_lang="$1"
 85 |       shift
 86 |       filter_edition="$1"
 87 |       shift
 88 | 
 89 |       filter_tags=()
 90 |       while [ ! -z "$2" ]; do
 91 |         filter_tags+=("$1")
 92 |         shift
 93 |       done
 94 |       filter_tags="${filter_tags[*]}"
 95 | 
 96 |       if [ -z "$filter_tags" ]; then
 97 |         filter_tags="notag"
 98 |       fi
 99 | 
100 |       if [ "$1" != "latest" ]; then
101 |         filter_date="$1"
102 |       fi
103 |       shift
104 |       ;;
105 |   esac
106 | 
107 |   while read url; do
108 |     type=""
109 |     lang=""
110 |     edition=""
111 | 
112 |     tags=()
113 |     date=""
114 | 
115 |     for group in $(echo "$url" | sed "s|.zim||g" | tr "_" "\n"); do
116 |       if [ -z "$type" ]; then
117 |         type="$group"
118 |       elif [ -z "$lang" ]; then
119 |         lang="$group"
120 |       elif [ -z "$edition" ]; then
121 |         edition="$group"
122 |       elif [[ "$group" == "20"* ]]; then
123 |         date="$group"
124 |       else
125 |         tags+=("$group")
126 |       fi
127 |     done
128 | 
129 |     tags="${tags[*]}"
130 | 
131 |     if [ -z "$tags" ]; then
132 |       tags="notag"
133 |     fi
134 | 
135 |     if [ ! -z "$get_var" ]; then
136 |       echo "${!get_var}"
137 |     else
138 |       if [ -z "$filter_type" ] || [[ "$filter_type" == "$type" ]]; then
139 |         if [ -z "$filter_lang" ] || [[ "$filter_lang" == "$lang" ]]; then
140 |           if [ -z "$filter_edition" ] || [[ "$filter_edition" == "$edition" ]]; then
141 |             if [ -z "$filter_tags" ] || [[ "$filter_tags" == "$tags" ]]; then
142 |               if [ -z "$filter_date" ] || [[ "$filter_date" == "$date" ]]; then
143 |                 echo "$url"
144 |               fi
145 |             fi
146 |           fi
147 |         fi
148 |       fi
149 |     fi
150 | 
151 |     # echo "type=$type, lang=$lang, edition=$edition, date=$date, tags=${tags[*]}"
152 |   done
153 | }
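# --- Editor's note: illustrative usage of urlp(), not part of the original script ---
# Per the "usage" comments above; the ZIM file name is just an example:
#
#   echo "wikipedia_tr_all_maxi_2019-12.zim" | urlp get lang
#   # -> tr   (type=wikipedia, lang=tr, edition=all, tag=maxi, date=2019-12)
#
#   fetch_with_cache wikipedia | get_urls | urlp filter wikipedia tr all maxi latest
#   # -> every wikipedia_tr_all_maxi_*.zim entry in the cached listing
#   #    (passing "latest" instead of a date disables the date filter)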
154 | 
155 | cmd_choose() {
156 |   # Select wiki
157 |   log "Getting wiki list..."
158 |   wikis=$(fetch_with_cache | get_urls)
159 |   textmenu "$wikis" "Select which wiki to mirror (choose 'other' for more)" "$1"
160 |   wiki="$res"
161 | 
162 |   log "Getting sub-wiki list..."
163 |   # there is a special case, "other", where multiple wikis are available
164 |   reallist=$(fetch_with_cache "$wiki" | get_urls | urlp get type | uniq | wc -l)
165 | 
166 |   if [ "$reallist" != "1" ]; then
167 |     wikireals=$(fetch_with_cache "$wiki" | get_urls | urlp get type | sort | uniq)
168 |     textmenu "$wikireals" "Select which wiki to mirror" "$1"
169 |     wikireal="$res"
170 |   else
171 |     wikireal="$wiki"
172 |   fi
173 | 
174 |   log "Getting language list..."
175 |   langs=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_" | urlp get lang | sort | uniq)
176 |   textmenu "$langs" "Select which language to mirror" "$2"
177 |   lang="$res"
178 | 
179 |   log "Getting edition list..."
180 |   editions=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}" | urlp get edition | sort | uniq)
181 |   textmenu "$editions" "Select which edition to mirror" "$3"
182 |   edition="$res"
183 | 
184 |   log "Getting tag list..."
185 |   tags=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}" | urlp get tags | sed "s| |_|g" | sort | uniq)
186 |   textmenu "$tags" "Select which tags to use" "$4"
187 |   tag="$res"
188 | 
189 |   if [ "$tag" != "notag" ]; then
190 |     tagu="_$tag"
191 |   fi
192 | 
193 |   log "Getting date list..."
194 |   dates=$(fetch_with_cache "$wiki" | get_urls | grep "^${wikireal}_${lang}_${edition}${tagu}" | urlp get date | sort | uniq)
195 |   dates="latest $dates"
196 |   textmenu "$dates" "Select which date to mirror" "$5"
197 |   date="$res"
198 | 
199 |   echo
200 |   echo "  Download command:"
201 |   echo "    \$ $0 download $wiki $wikireal $lang $edition $tag $date"
202 |   echo
203 | 
204 |   while true; do
205 |     read -p "Download [y/N]: " doit
206 |     case "$doit" in
207 |       y)
208 |         cmd_download "$wiki" "$wikireal" "$lang" "$edition" "$tag" "$date"
209 |         exit $?
210 |         ;;
211 |       n)
212 |         exit 0
213 |         ;;
214 |     esac
215 |   done
216 | }
217 | 
218 | cmd_download_url() {
219 |   wiki="$1"
220 |   wikireal="$2"
221 |   lang="$3"
222 |   edition="$4"
223 |   tag="$5"
224 |   date="$6"
225 | 
226 |   tag=$(echo "$tag" | sed "s|_| |g")
227 |   tag=($tag)
228 | 
229 |   log "Getting download URL..."
230 |   URL=$(fetch_with_cache "$1" | get_urls | urlp filter "$wikireal" "$lang" "$edition" "${tag[@]}" "$date" | sort -s -r | head -n 1)
231 | 
232 |   if [ -z "$URL" ]; then
233 |     echo "ERROR: Download URL not found. Possibly removed?" >&2
234 |     exit 2
235 |   fi
236 | 
237 |   URL="$BASEURL$wiki/$URL"
238 | 
239 |   log "URL: $URL"
240 | 
241 |   # below is a mixture of https://stackoverflow.com/a/19841872/3990041, my knowledge and guesswork :P
242 |   SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -d -i | od -t x1 -An | tr "\n" " " | sed "s| ||g")
243 | 
244 |   log "SHA256: $SHA256"
245 | }
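# --- Editor's note: explanatory comment, not part of the original script ---
# The SHA256 pipeline above assumes the mirror answers HEAD requests with a
# "Digest: SHA-256=<base64>" header: it strips that prefix, base64-decodes the
# value into raw bytes, hex-dumps them with od, and removes all whitespace,
# leaving the lowercase hex digest that check_cycle later passes to "sha256sum -c".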
246 | 
247 | cmd_url() {
248 |   cmd_download_url "$@" >&2
249 |   echo '{"url":"'"$URL"'","sha256":"'"$SHA256"'"}'
250 | }
251 | 
252 | cmd_download() {
253 |   cmd_download_url "$@"
254 | 
255 |   # real=$(curl -sLI $url | grep "^Location:"  | sed "s|Location: ||g" | grep "[a-zA-Z0-9\/:\._-]*" -o) #all the redirects
256 |   OUTNAME=$(basename "$URL")
257 | 
258 |   dl_cycle() {
259 |     log "Downloading $OUTNAME..."
260 |     wget --continue -q --show-progress --progress=bar:force -P ./snapshots "$URL"
261 |     return $?
262 |   }
263 | 
264 |   check_cycle() {
265 |     log "Verifiying $OUTNAME..."
266 |     sha256="$SHA256  ./snapshots/$OUTNAME"
267 |     echo "$sha256" | sha256sum -c -
268 |     return $?
269 |   }
270 | 
271 |   if [ -e "$OUTNAME" ]; then
272 |     if ! check_cycle; then
273 |       if ! dl_cycle; then
274 |         echo "Download failed! Check your network!"
275 |       fi
276 |       if ! check_cycle; then
277 |         echo "It seems like your file is corrupted"
278 |         echo "Please remove it:" # we won't do that because the user might not want this
279 |         echo " \$ rm $OUTNAME"
280 |       fi
281 |     fi
282 |   else
283 |     if ! dl_cycle; then
284 |       echo "Download failed! Check your network!"
285 |     fi
286 |     if ! check_cycle; then
287 |       echo "It seems like your file is corrupted"
288 |       echo "Please remove it:" # we won't do that because the user might not want this
289 |       echo " \$ rm $OUTNAME"
290 |     fi
291 |   fi
292 | }
293 | 
294 | if [ -n "$(LC_ALL=C type -t cmd_$1)" ] && [ "$(LC_ALL=C type -t cmd_$1)" = function ]; then
295 |   CMD="$1"
296 |   shift
297 |   "cmd_$CMD" "$@"
298 |   exit 0
299 | else
300 |   echo "Usage: $0 cache_update"
301 |   echo "       $0 choose"
302 |   echo "       $0 download/url      "
303 |   exit 2
304 | fi
305 | 
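Editor's note: an illustrative end-to-end run of the tooling above (all values are
examples taken from the CLI help text; adjust them for the wiki being mirrored):

  # 1. fetch a ZIM snapshot into ./snapshots (interactive picker, or fully specified)
  ./tools/getzim.sh choose
  ./tools/getzim.sh download wikipedia wikipedia tr all maxi latest

  # 2. unpack the ZIM (e.g. with zimdump) into ./tmp, then convert it into a website
  zim-to-website ./tmp \
    --hostingdnsdomain=tr.wikipedia-on-ipfs.org \
    --hostingipnshash=QmVH1VzGBydSfmNG7rmdDjAeBZ71UVeEahVbNpFQtwZK8W \
    --zimfile=/path/to/wikipedia_tr_all_maxi_2019-12.zim \
    --kiwixmainpage=Kullanıcı:The_other_Kiwix_guy/Landing \
    --mainpage=Anasayfa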


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "declaration": true,
 4 |     "importHelpers": true,
 5 |     "module": "commonjs",
 6 |     "outDir": "lib",
 7 |     "rootDir": "src",
 8 |     "strict": true,
 9 |     "target": "es2017",
10 |     "esModuleInterop": true,
11 |     "allowJs": true
12 |   },
13 |   "include": [
14 |     "src/**/*"
15 |   ],
16 |   "exclude": [
17 |     "lib"
18 |   ]
19 | }


--------------------------------------------------------------------------------
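Editor's note: a minimal TypeScript sketch (not part of the repository) showing how the
link-rewriting helpers in src/article-transforms.ts compose; the HTML snippet and import
path are made up for illustration:

  import cheerio from 'cheerio'
  import { reworkLinks, moveRelativeLinksUpOneLevel } from './article-transforms'

  // A nested article copied one directory up needs one '../' stripped from each
  // relative link; external links are excluded by the selector and left untouched.
  const $ = cheerio.load(
    '<a href="../I/logo.png">logo</a><a class="external" href="https://ipfs.io">ipfs</a>'
  )
  reworkLinks($, 'a:not(.external)', [moveRelativeLinksUpOneLevel])
  console.log($.html()) // first href is now "I/logo.png"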