Browse resources from the Internet Archive on RACHEL. The Internet Archive provides free public access to
13 | collections of digitized materials, including websites, software applications/games, music, movies/videos,
14 | moving images, and millions of public-domain books.
15 | When connected to the internet these resources can be browsed and a local copy kept for use offline.
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/mediawiki/mediawiki.conf:
--------------------------------------------------------------------------------
1 | Alias /mediawiki /var/www/html/mediawiki
2 | Alias /w /var/www/html/mediawiki
3 | Alias /mwlink /var/www/html/mediawiki
4 | Alias /wiki /var/www/html/mediawiki
5 |
6 |
7 | Options +FollowSymLinks
8 | AllowOverride All
9 | = 2.3>
10 | Require all granted
11 |
12 |
13 | order allow,deny
14 | allow from all
15 |
16 |
17 |
18 | # some directories must be protected
19 |
20 | Options -FollowSymLinks
21 | AllowOverride None
22 |
23 | php_admin_flag engine off
24 |
25 |
26 | php_admin_flag engine off
27 |
28 |
29 |
30 | Options -FollowSymLinks
31 | AllowOverride None
32 |
33 | php_admin_flag engine off
34 |
35 |
36 | php_admin_flag engine off
37 |
38 |
39 |
40 | Options -FollowSymLinks
41 | AllowOverride None
42 |
43 | php_admin_flag engine off
44 |
45 |
46 | php_admin_flag engine off
47 |
48 |
49 |
--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": {
3 | "browser": true,
4 | "jest/globals": true
5 | },
6 | "extends": "airbnb",
7 | "plugins": [
8 | "babel",
9 | "eslint-plugin-import",
10 | "eslint-plugin-react",
11 | "jest",
12 | "jsx-a11y"
13 | ],
14 | "rules": {
15 | "arrow-parens": 0,
16 | "babel/semi": 2,
17 | "comma-dangle": 0,
18 | "default-param-last": 0,
19 | "import/no-extraneous-dependencies": [
20 | "error", {
21 | "devDependencies": ["**/*.test.js", "**/*.stories.js"],
22 | "optionalDependencies": false,
23 | "peerDependencies": false
24 | }
25 | ],
26 | "jsx-a11y/label-has-for": 0,
27 | "jsx-a11y/label-has-associated-control": 2,
28 | "max-len": 0,
29 | "no-else-return": 0,
30 | "no-multi-assign": 0,
31 | "no-nested-ternary": 0,
32 | "no-param-reassign": 0,
33 | "no-plusplus": 0,
34 | "no-return-assign": 0,
35 | "no-underscore-dangle": 0,
36 | "prefer-destructuring": 0,
37 | "prefer-object-spread": 0,
38 | "prefer-template": 0,
39 | "quote-props": 0,
40 | "react/jsx-filename-extension": [
41 | 1, {
42 | "extensions": [".js", ".jsx"]
43 | }
44 | ],
45 | "react/prefer-stateless-function": 1,
46 | "semi": 2
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/dweb-mirror.config.yaml:
--------------------------------------------------------------------------------
1 | # Example User configuration for dweb mirror
2 | # Note in YAML, indentation is significant, and you can't have multiple spaces after the ":"
3 | apps:
4 | crawl:
5 | tasks:
6 | - identifier: # Iterate over each of these identifiers
7 | - "home"
8 | - "image"
9 | - "movies"
10 | level: "details" # Getting enough to show the details page - i.e. all the thumbnails
11 | search: # Override apps.crawl.opts.defaultDetailsSearch (which would get top 40 tiles)
12 | # To get the details for the 3 most popular i.e. the content required to display it
13 | - sort: "-downloads"
14 | rows: 3
15 | level: "details"
16 | # And tile for top 10
17 | - sort: "-downloads"
18 | rows: 10
19 | level: "tile"
20 |
21 | # And get top 40 tiles for Prelinger (as spec in crawl.opts.defaultDetailsSearch)
22 | - identifier: "prelinger"
23 | level: "details"
24 |
25 | # And retrieve playable movies for AboutBan1935 and Doctorin1946 that Brewster loves to demo
26 | - identifier:
27 | - AboutBan1935
28 | - Doctorin1946
29 | - commute
30 | - unitednov65unit
31 | - mbid-ac2b87af-2774-4575-a72a-db31c8865264
32 | - paf2007-09-29..c42-sto2.flac16
33 | - bdrc-W3PD1123
34 | - thetaleofpeterra14838gut
35 | - ArtOfCommunitySecondEdition
36 | - EnglishTeluguDictionary
37 | - JourneysInPersiaAndKurdistanVolII
38 | level: details
39 |
40 | - identifier:
41 | - bali-lontar-transcribed
42 | level: details
43 | search:
44 | - sort: '-downloads'
45 | rows: 20
46 | level: details
47 |
48 | - identifier:
49 | - mitratest
50 | level: details
51 | search:
52 | - sort: '-downloads'
53 | rows: 200
54 | level: details
55 |
--------------------------------------------------------------------------------
/mdns-register.js:
--------------------------------------------------------------------------------
1 | //#!/usr/bin/env node
2 | //process.env.DEBUG="dweb-mirror:mdns";
3 | /*
4 | * Register multicast DNS
5 | *
6 | * Adapted from https://github.com/mafintosh/register-multicast-dns that uses his `multicast-dns`
7 | *
8 | *
9 | */
10 | const addr = require('network-address')
11 | const multicastdns = require('multicast-dns')
12 | const debug = require('debug')('dweb-mirror:mdns');
13 |
14 | let mdns;
15 |
/**
 * Start (or reuse) a multicast-DNS responder answering A/AAAA queries for
 * `name`.local with this machine's current IPv4/IPv6 addresses.
 * Any ".local" suffix on the supplied name is stripped before matching.
 */
function registerMDNS(name) {
  const hostname = name.replace('.local', '');
  debug("MDNS registering %s.local", hostname);
  if (typeof mdns === "undefined") {
    // Lazily create the shared responder the first time we register.
    mdns = multicastdns();
  }

  mdns.on('error', () => {
    // ignore errors
  });

  mdns.on('query', (query) => {
    query.questions.forEach((q) => {
      const wanted = (q.name.replace('.local', '') === hostname)
        && ["A", "AAAA"].includes(q.type);
      if (!wanted) {
        // This can get very verbose as Apple especially use MDNS a lot for services, just uncomment for debugging
        // debug("MDNS ignoring query %s %s %O", q.type, q.name, query);
        return;
      }
      debug("MDNS responding to query %s %s", q.type, q.name);
      mdns.respond({
        answers: [{
          name: q.name,
          type: 'A',
          ttl: 300,
          data: addr.ipv4()
        }],
        additionals: [{
          name: q.name,
          type: 'AAAA',
          ttl: 300,
          data: addr.ipv6()
        }]
      });
    });
  });
}
/**
 * Shut down the multicast-DNS responder started by registerMDNS (if any)
 * and forget it so a later registerMDNS starts a fresh one.
 * Safe to call when never started — just logs and returns.
 */
function destroyMDNS() {
  if (mdns) {
    debug("MDNS Destroying");
    mdns.destroy(); // BUGFIX: was `mdns.destroy;` — referenced the method without calling it, so the socket was never closed
    mdns = undefined;
  } else {
    debug("MDNS Not started, no destroying");
  }
}
60 | //mdns.destroy;
61 |
62 | exports = module.exports = {registerMDNS, destroyMDNS};
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "author": {
3 | "name": "Mitra Ardron",
4 | "email": "mitra@archive.org",
5 | "url": "https://www.mitra.biz"
6 | },
7 | "bugs": {
8 | "url": "https://github.com/internetarchive/dweb-mirror/issues"
9 | },
10 | "name": "@internetarchive/dweb-mirror",
11 | "dependencies": {
12 | "@futurepress/epubjs-reader": "git+https://github.com/futurepress/epubjs-reader.git",
13 | "@internetarchive/bookreader": "^4.40.2",
14 | "@internetarchive/dweb-archive-dist": "^0.2.29",
15 | "@internetarchive/dweb-archivecontroller": "^0.2.16",
16 | "@internetarchive/dweb-transports": "^0.2.22",
17 | "@stratumn/canonicaljson": "^1.0.3",
18 | "async": "^3.2.4",
19 | "child_process": "^1.0.2",
20 | "debug": "^4.3.4",
21 | "express": "^4.18.2",
22 | "getopts": "^2.3.0",
23 | "glob": "^8.1.0",
24 | "js-yaml": "^4.1.0",
25 | "level": "^8.0.0",
26 | "morgan": "^1.10.0",
27 | "multicast-dns": "^7.2.5",
28 | "multihashes": "^4.0.3",
29 | "network-address": "^1.1.2",
30 | "prettier-bytes": "^1.0.4",
31 | "readable-stream-clone": "^0.0.7",
32 | "sharp": "^0.32.5"
33 | },
34 | "description": "Javascript library for Internet Archive Decentralized Web project: Transport",
35 | "devDependencies": {
36 | "eslint": "^8.32.0",
37 | "eslint-config-airbnb": "^19.0.4",
38 | "eslint-plugin-babel": "^5.3.1",
39 | "eslint-plugin-import": "^2.27.5",
40 | "eslint-plugin-jest": "^27.2.1",
41 | "eslint-plugin-jsx-a11y": "^6.7.1",
42 | "eslint-plugin-react": "^7.32.1"
43 | },
44 | "homepage": "https://github.com/internetarchive/dweb-mirror#readme",
45 | "keywords": [],
46 | "license": "AGPL-3.0",
47 | "repository": {
48 | "type": "git",
49 | "url": "https://github.com/internetarchive/dweb-mirror.git"
50 | },
51 | "scripts": {
52 | "postpublish": "echo 'post publish script to build'",
53 | "dependers": "grep -l dweb-mirror ../dweb*/package.json | grep -v dweb-mirror/package.json",
54 | "editdependers": "vi `grep -l dweb-mirror ../dweb*/package.json | grep -v dweb-mirror/package.json`"
55 | },
56 | "version": "0.2.97"
57 | }
58 |
--------------------------------------------------------------------------------
/INSTALLATION-faq.md:
--------------------------------------------------------------------------------
1 | # Installation and Development FAQ
2 |
3 | Please always feel free to ask questions on the
4 | [dweb-mirror repo](https://github.com/internetarchive/dweb-mirror).
5 |
6 | #### Index
7 | 1. How to connect to an RPI when you do not have ethernet
8 |
9 |
10 | #### 1. How to connect to an RPI when you do not have ethernet
11 |
12 | A common scenario, you've got your RPI, and your laptop,
13 | and a cellphone with data (or a WiFi at your accommodation),
14 | but you do not have a screen, keyboard, mouse,
15 | and in particular no local ethernet, at least until you get onsite.
16 | BUT ... hopefully your laptop has an Ethernet port, or you brought a USB/Ether adapter.
17 | and an ethernet cable. (I travel with both of these for just this kind of scenario)
18 |
19 | And even if you could configure it, your RPI cannot talk upstream via WiFi
20 | and offer a local access point.
21 |
22 | The following instructions assume a Mac, a PR with edits for a Windows or Linux box
23 | would be appreciated.
24 |
25 | * Setup your Phone to Tether or find the WiFi SSID & password
26 | * Connect your laptop to the WiFi
27 | * On a Mac: Preferences -> Sharing -> Internet -> from WiFi to USB 10/100 LAN
28 | * Plug your ethernet cable between laptop and RPI
29 | * Power up the RPI
30 | * You should see the RPI's WiFi access point turn up,
31 | BUT do not connect as that will lose your upstream connection.
32 | * Now we need the RPI's address, this is non trivial,
33 | and there are a number of ways to try and find it.
34 | 1. Look for the dhcp-leases on your laptop.
35 | * On Mac OSX, look in /private/var/db/dhcpd_leases
36 | * On most Linux it will be /var/lib/dhcpd/dhcpd.leases
37 | 2. OR try pinging it
38 | * With OLIP ping olip.local should work (or whatever the box is called)
39 | * With IIAB ping box or box.local or box.lan might work.
40 | 3. OR try sshing into it
41 | * The RPI's WiFi hotspot should have shown up on the laptop
42 | * Find the IP address of the WiFi hotspot router in the RPI
43 | * On a Mac: Open Network Preferences; WiFi; Advanced; TCP/IP; Router;
44 | * Lets assume this is: 192.168.123.1
45 | * ssh pi@192.168.123.1
46 | * ifconfig, look for eth0 (or maybe en0), and the inet address
47 | * reconnect laptop to hotel/cellphone wifi
48 | * Once you have the address, you can ssh into it
49 | * ping www.google.com or some other known machine to check DNS etc are working
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/install_go.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # This script is intended to make sure GO is installed for a variety of platforms, especially MacOS, RaspberryPi and Rachel3+
3 |
4 | # First figure out if we'll use brew or apt-get
5 | if (which apt-get)
6 | then
7 | apt-get -y update
8 | INSTALL="apt-get -y install"
9 | elif (which brew)
10 | then
11 | INSTALL="brew install"
12 | else
13 | echo "Neither apt-get nor brew found"
14 | exit 1
15 | fi
16 |
# Locate (or create) the shell profile file that later steps append Go
# environment setup (GOPATH/PATH) to.
# BUGFIX: the .bash_profile branch previously assigned the literal test
# expression (`PROFILE=[ -e "${HOME}/.bash_profile" ]`) instead of the file
# path, and the fallback branch created ~/.profile but never set PROFILE,
# leaving it empty for the `cat >>${PROFILE}` below.
if [ -e "${HOME}/.profile" ]
then PROFILE=${HOME}/.profile
elif [ -e "${HOME}/.bash_profile" ]
then PROFILE=${HOME}/.bash_profile
else
  touch "${HOME}/.profile"
  PROFILE=${HOME}/.profile
fi
22 |
23 |
24 | # First have to see if have a version of go already
25 | if (go version)
26 | then
27 | # TODO if anyone can figure out some math on the go version: fall back to apt-get if it's <=1.9, and then fail if apt-get doesn't fix the version
28 | echo "Go appears to be installed: `go version`"
29 | echo "If this version is not >= 1.9 then install of ipfs-update will fail, try \"apt-get -y install golang\" and see if it improves version number"
30 | if [ -n "${GOPATH}" ]
31 | then
32 | echo "Go is telling us it is at ${GOPATH} so we wont try and guess"
33 | else
34 | echo "GOPATH isnt set so guessing where to find it"
35 | if [ -d "/usr/lib/go" ]
36 | then
37 | export GOPATH="/usr/lib/go"
38 | elif [ -f "${HOME}/go" ]
39 | then
40 | export GOPATH="${HOME}/go"
41 | else
42 | echo "GOPATH isnt set and we cant find go"
43 | echo "Unless you have it somewhere strange then please edit install_go.sh so it finds it automatically, and please submit as a PR"
44 | fi
45 | cat >>${PROFILE} <>${PROFILE} <"
8 |
9 | RUN apt-get -y update && apt-get golang git
10 |
11 | ### Install IPFS #####################################################
12 | #IPFS / Docker discussion on GIT
13 | #https://github.com/yeasy/docker-ipfs/issues/1
14 | #https://github.com/protocol/collab-internet-archive/issues/49
15 | #Includes suggested DOcker line of ..
16 | #docker run -d --name ipfs-node -v /tmp/ipfs-docker-staging:/export -v /tmp/ipfs-docker-data:/data/ipfs -p 8080:8080 -p 4001:4001 -p 127.0.0.1:5001:5001)
17 | ENV API_PORT 5001
18 | ENV GATEWAY_PORT 8080
19 | ENV SWARM_PORT 4001
20 |
21 | EXPOSE ${SWARM_PORT}
22 | # This may introduce security risk to expose API_PORT public
23 | EXPOSE ${API_PORT}
24 | EXPOSE ${GATEWAY_PORT}
25 |
26 | # #https://github.com/yeasy/docker-ipfs/issues/2 asks what this does,
27 | # Only useful for this Dockerfile
28 | #ENV FABRIC_ROOT=$GOPATH/src/github.com/hyperledger/fabric
29 |
30 | #Added to apt-get above, will need if run as separate docker
31 | #RUN apt-get -y update && apt-get -y install golang git
32 |
33 | # Add go path after installing go so it can find ipfs-update when installed
34 | ENV GOPATH=/root/go
35 | ENV PATH=${GOPATH}/bin:${PATH}
36 | ENV IPFS_PATH=/pv/ipfs
37 | # Mapped to a permanent volume - this has to be the home directory for the user running IPFS (which is root) so will need a link
38 | RUN ln -s /pv/ipfs /.ipfs
39 |
40 | ## On dweb.archive.org production not doing this, instead run docker with -v so it copies in a volume
41 | # Create the fs-repo directory and switch to a non-privileged user.
42 | #ENV IPFS_PATH /data/ipfs
43 | #RUN mkdir -p $IPFS_PATH \
44 | # && adduser -D -h $IPFS_PATH -u 1000 -G users ipfs \
45 | # && chown ipfs:users $IPFS_PATH
46 | # Expose the fs-repo as a volume.
47 | # start_ipfs initializes an fs-repo if none is mounted.
48 | # Important this happens after any USER directive so permission are correct.
49 | #VOLUME $IPFS_PATH
50 |
51 |
52 |
53 | # Install ipfs using ipfs-update
54 | # config the api endpoint, may introduce security risk to expose API_PORT public
55 | # config the gateway endpoint
56 | # allow access to API from localhost
57 | # enable URLstore so it doesnt copy the files locally
58 | RUN go get -u -v github.com/ipfs/ipfs-update \
59 | && ipfs-update install latest \
60 | && cp /app/ipfs_container_daemon_modified.sh /usr/local/bin/start_ipfs
61 |
62 | #TODO IPFS connection is to specific peer id in library '/dns4/dweb.me/tcp/4245/wss/ipfs/QmPNgKEjC7wkpu3aHUzKKhZmbEfiGzL5TP1L8zZoHJyXZW'
63 | #TODO which is dweb.me
64 | #TODO once dweb.me is running in Docker, then use `ipfs config show` to get new id and add to dweb-transports/TransportIPFS config
65 |
66 | # Supervisorctl > start_ipfs_modified.sh > ipfs daemon --enable-gc --migrate=true
67 |
--------------------------------------------------------------------------------
/URL_MAPPING.md:
--------------------------------------------------------------------------------
1 | # An overview of mapping of URLs in dweb
2 |
3 | Last update 2019-12-17
4 |
5 | #### Abbreviations and explanations
6 |
7 | * In express (HTTP server in dweb-mirror) parameters are :xyz
8 |
9 | |from|to|via|notes|
10 | |----|--|---|-----|
11 | |/|redirect|/archive/archive.html|
12 | |/admin/setconfig/IDENTIFIER/LEVEL|config.writeUserTaskLevel|info
13 | |/admin/crawl/*/CRAWLID|CrawlManager|crawl status or info
14 | |/arc/archive.org|redirect|/|Legacy
15 | |/arc/archive.org/*|redirect|/*|Legacy
16 | |/archive/bookreader/BookReader/*|_sendFileFromBookreader|config.bookreader.directory/*
17 | |/archive/epubreader/*|_sendFileFromEpubreader|config.epubreader.directory/*
18 | |/archive/*|_sendFileUrlArchive|config.archiveui.directory/*
19 | |/advancedsearch?*|streamQuery, fetch_query, Routing|https://www-dweb-cors.dweb.archive.org/advancedsearch.php?*
20 | |/bookreader/BookReader/*|_sendFileFromBookreader|config.bookreader.directory/*
21 | |/BookReader/BookReaderJSIA.php?*|sendBookReaderJSIA fetch_bookreader|DATANODE/Bookreader/BookreaderJSIA.php?*
22 | |/BookReader/BookReaderJSON.php?*|sendBookReaderJSON fetch_bookreader|DATANODE/Bookreader/BookreaderJSIA.php?*|converts JSIA to JSON format
23 | |/books/IDENTIFIER/ia_manifest|sendBookReaderJSON fetch_bookreader|DATANODE/Bookreader/BookreaderJSIA.php?*|converts JSIA to JSON format (api.ArchiveLab.org)
24 | |/BookReader/BookReaderImages.php|sendBookReaderImages fetchPage|https://DATANODE/BookReader/BookReaderPreview.php or /BookReader/BookReaderImages.php
25 | |/components/*|_sendFileUrlSubdir|config.archiveui.directory/component/*
26 | |/details|redirect|/archive/archive.html|
27 | |/details/IDENTIFIER|redirect|/archive/archive.html?identifier=IDENTIFIER
28 | |/details/IDENTIFIER/page/PAGE|redirect|/archive/archive.html?identifier=IDENTIFIER&page=PAGE|bookreader
29 | |/download/IDENTIFIER/__ia_thumb.jpg|streamThumbnail, Routing|https://archive.org/services/img/IDENTIFIER
30 | |/download/IDENTIFIER/page/PAGE|sendBookReaderImages fetchPage|https://DATANODE/BookReader/BookReaderPreview.php or /BookReader/BookReaderImages.php
31 | |/download/IDENTIFIER|redirect|/archive/archive.html?identifier=IDENTIFIER
32 | |/download/IDENTIFIER/*|streamArchiveFile Routing|https://archive.org/cors/IDENTIFIER/*
33 | |/embed/IDENTIFIER?output=json|sendPlaylist Routing|https://www-dweb-cors.dev.archive.org/embed/IDENTIFIER?output=json
34 | |/epubreader/*|_sendFileFromEpubreader|config.epubreader.directory/*
35 | |/favicon.ico|sendFile|config.archiveui.directory/favicon.ico|
36 | |/images/*|sendFileUrlSubdir|config.archiveui.directory/images/*
37 | |/includes/*|sendFileUrlSubdir|config.archiveui.directory/includes/*
38 | |/info|sendInfo|{info}
39 | |/ipfs/CID|proxyUrl|ipfs:/ipfs/CID
40 | |/jw/*|sendFIleUrlSubdir|config.archiveui.directory/jw/*
41 | |/langages/*|sendFileUrlSubdir|config.archiveui.directory/langages/*
42 | |/mds/v1/get_related/all/*|sendRelated Routing|https://be-api.us.archive.org/mds|
43 | |/metadata/IDENTIFIER|fetch_metadata Routing|https://www-dweb-metadata.dev.archive.org/metadata/IDENTIFIER
44 | |/metadata/*|proxyUrl, Routing|https://dweb.archive.org/metadata/*|this is ID/FILENAME and probably broken TODO
45 | |/playlist/IDENTIFIER|sendPlaylist Routing|https://www-dweb-cors.dev.archive.org/embed/IDENTIFIER?output=json
46 | |/search?*|redirect|/archive/archive.html?*
47 | |/search.php|redirect|/archive/archive.html?*
48 | |/serve/IDENTIFIER/FILENAME|streamArchiveFile AF.cacheAndOrStream this.urls|https://archive.org/download/IDENTIFIER/FILENAME, http://www-dweb-torrent.dev.archive.org/IDENTIFIER/IDENTIFIER_archive.torrent etc
49 | |/services/img/IDENTIFIER|streamThumbnail Routing|https://archive.org/services/img/IDENTIFIER
50 | |/stream/IDENTIFIER/UNUSED|redirect|/archive/archive.html?identifier=IDENTIFIER|palmleaf wiki
51 | |/thumbnail/IDENTIFIER|streamThumbnail Routing|https://archive.org/services/img/IDENTIFIER|Legacy
52 |
--------------------------------------------------------------------------------
/start_ipfs:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # NOTE THIS IS UNUSED AND UNMAINTAINED, IT MIGHT NOT WORK, PRs WELCOME ON github.com/@internetarchive/dweb-mirror
3 | # Modified from https://github.com/ipfs/go-ipfs/blob/master/bin/container_daemon
4 | # Almost same code in "dweb" and "dweb-mirror" repos - main difference is enabling Filestore
5 | # all changes commented
6 | set -e # Break on error
7 |
8 | # Requires caller to set up
9 | # $IPFS_PATH pointing to directory space to use (aka the "repo"), if unspecified will use .ipfs, On DOCKER=/pv/ipfs
10 | # $IPFS_USER if set, and run as root then will switch to this user
11 | # $IPFS_API_PORT if not 5001
12 | # $IPFS_GATEWAY_PORT if not 8080
13 | # $IPFS_SWAM_PORT if not 4001
14 | # $IPFS_WS_PORT if not 4002
15 |
16 |
17 | #DOCKER: make sure log includes date restarting and prominent === to help find crashes
18 | echo "===== Starting IPFS daemon at `date` ================="
19 |
20 | # If running as root && specified a IPFS_USER then relaunch as that user.
21 | #DOCKER runs as root, but doesnt specify a user,
22 | #DWEB-MIRROR may specify 'ipfs' or might just run as the logged in user (not sudo-ed)
23 | if [ `id -u` -eq 0 ]; then
24 | if [ -n "${IPFS_USER}" ]; then
25 | echo "Changing user to ${IPFS_USER}"
26 | # ensure folder is writable
27 | su-exec "${IPFS_USER}" test -w "${IPFS_PATH:=${HOME}/.ipfs}" || chown -R -- "${IPFS_USER}" "${IPFS_PATH:=${HOME}/.ipfs}"
28 | # restart script with new privileges
29 | exec su-exec "${IPFS_USER}" "$0" "$@"
30 | fi
31 | fi
32 |
33 | # 2nd invocation with regular user
34 | if [ -n "${IPFS_PATH}" ] ; then
35 | ln -s "${IPFS_PATH}" ${HOME}/.ipfs # I think IPFS is using .ipfs anyway,
36 | else
37 | IPFS_PATH="${HOME}/.ipfs"
38 | fi
39 |
40 | ipfs version
41 |
42 | #DOCKER: Want peerid allocated once per machine; files to persist across invocations, and config same on all sites
43 | #TODO: It should be coming up on 4001 and 4002 like dweb.me, check !
44 | if [ -e "${IPFS_PATH}/config" ]; then
45 | echo "Found IPFS fs-repo at ${IPFS_PATH}, not reconfiguring"
46 | else
47 | ipfs init # ipfs init will create new repo if one doesnt exist which allocates a new peer-id.
48 | #DOCKER: Allow parameterization of ports by env variables
49 | ipfs config Addresses.API /ip4/0.0.0.0/tcp/${IPFS_API_PORT:=5001}
50 | ipfs config Addresses.Gateway /ip4/0.0.0.0/tcp/${IPFS_GATEWAY_PORT:=8080}
51 | ipfs config --json Addresses.Swarm "[\"/ip4/0.0.0.0/tcp/${IPFS_SWARM_PORT:=4001}\",\"/ip6/::/tcp/${IPFS_SWARM_PORT:=4001}\",\"/ip4/0.0.0.0/tcp/${IPFS_WS_PORT:=4002}/ws\",\"/ip6/::/tcp/${IPFS_WS_PORT:=4002}/ws\"]"
52 | #DOCKER: Allow access to http API from localhost
53 | ipfs config --json API.HTTPHeaders.Access-Control-Allow-Origin '["http://localhost"]'
54 | ipfs config --json API.HTTPHeaders.Access-Control-Allow-Credentials '["true"]'
55 | ipfs config --json API.HTTPHeaders.Access-Control-Allow-Methods '["PUT", "GET", "POST"]'
56 | #DOCKER: Enable urlstore
57 | ipfs config --json Experimental.UrlstoreEnabled true
58 | #DOCKER: Not enabling Filestore on production dweb.archive.org but will on dweb-mirror dockers
59 | ipfs config --json Experimental.FilestoreEnabled true
60 | fi
61 |
62 | # if the first argument is daemon
63 | if [ "$1" = "daemon" ]; then
64 | # filter the first argument until
65 | # https://github.com/ipfs/go-ipfs/pull/3573
66 | # has been resolved
67 | shift
68 | else
69 | # print deprecation warning
70 | # go-ipfs used to hardcode "ipfs daemon" in it's entrypoint
71 | # this workaround supports the new syntax so people start setting daemon explicitly
72 | # when overwriting CMD
73 | echo "DEPRECATED: arguments have been set but the first argument isn't 'daemon'" >&2
74 | echo "DEPRECATED: run 'docker run ipfs/go-ipfs daemon $@' instead" >&2
75 | echo "DEPRECATED: see the following PRs for more information:" >&2
76 | echo "DEPRECATED: * https://github.com/ipfs/go-ipfs/pull/3573" >&2
77 | echo "DEPRECATED: * https://github.com/ipfs/go-ipfs/pull/3685" >&2
78 | fi
79 |
80 | exec ipfs daemon "$@"
81 |
--------------------------------------------------------------------------------
/search.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Sorry - this is ugly, OLIP uses "opensearch" which is XML based,
3 | * and to avoid including overweight XML libraries,
4 | * I will output stuff via templates
5 | */
6 | const debug = require('debug')('dweb-mirror:search');
7 | const ArchiveItem = require('./ArchiveItemPatched');
8 |
9 | const ItemsPerPage = 75;
10 |
/**
 * Run a search query on an ArchiveItem: fetch its metadata, then its query
 * results, optionally annotating the result with crawl/downloaded info.
 *
 * o      - ArchiveItem carrying .query / .sort / .rows / .page
 * opts   - { copyDirectory, noCache, wantCrawlInfo, ... }
 * config - mirror config, only consulted when opts.wantCrawlInfo is true
 * cb(err, resp) - resp is the full search response object
 */
function doQuery(o, opts, config, cb) {
  // Not passing noCache as query usually after a fetch_metadata
  o.fetch_metadata(opts, (err, unusedAI) => {
    if (err) {
      debug('streamQuery could not fetch metadata for %s', o.identifier);
      cb(err);
      return;
    }
    const queryOpts = { copyDirectory: opts.copyDirectory, wantFullResp: true, noCache: opts.noCache };
    o.fetch_query(queryOpts, (err1, resp) => { // [ArchiveMember*]
      if (err1) {
        debug('streamQuery for q="%s" failed with %s', o.query, err1.message);
        cb(err1);
      } else if (!opts.wantCrawlInfo) {
        cb(null, resp);
      } else {
        // addCrawlInfo annotates o (the ArchiveItem); resp.response.docs is an
        // array of pointers into the same member objects, so the docs pick up
        // the crawl info as a side effect as well.
        o.addCrawlInfo({ config, copyDirectory: opts.copyDirectory }, (unusederr, unusedmembers) => {
          resp.response.downloaded = o.downloaded;
          resp.response.crawl = o.crawl;
          cb(null, resp);
        });
      }
    });
  });
}
36 | // https://github.com/dewitt/opensearch/blob/master/opensearch-1-1-draft-6.md#opensearch-response-elements
37 | // https://validator.w3.org/feed/docs/atom.html
38 |
/**
 * Render the OpenSearch/Atom feed header (title, updated timestamp,
 * totalResults, startIndex, itemsPerPage) for a search response.
 *
 * NOTE(review): the XML element tags appear to have been stripped from this
 * template literal (likely by an angle-bracket-eating extraction step) — the
 * interpolations below no longer carry their surrounding markup, and
 * `protoHost` is destructured but not visibly used. Restore from version
 * control rather than reconstructing by hand.
 *
 * resp  - search result object ({ response: { numFound, start, ... } })
 * query - parsed query-string object; query.q is the search string
 * opts  - request options; opts.protoHost is e.g. http://localhost:4244
 */
function atomFeedInfo(resp, {query, opts}) {
  // Skipping paging as OLIP not doing it
  const {protoHost} = opts;
  const queryString = query.q;
  const encQuery = encodeURIComponent(queryString);
  const now = new Date(Date.now()).toISOString();
  return `
Offline Internet Archive Search: ${queryString}

${now}

Offline Internet Archive

http://archive.org/search/${encQuery}
${resp.response.numFound}
${resp.response.start}
${ItemsPerPage}


`;
}
/**
 * Render one Atom feed entry for a search-result member (title, details link,
 * publication date).
 *
 * NOTE(review): as with atomFeedInfo, the XML tags seem to have been stripped
 * from this template literal and `protoHost` is destructured but not visibly
 * used — confirm against version control.
 *
 * m - ArchiveMember-like object with .title, .identifier, .publicdate
 */
function atomEntry(m, {protoHost}) {
  // TODO note archive search results do not usually include description field as can be large so omitted
  return `

${m.title}

https://archive.org/details/${m.identifier}
${m.publicdate}

`;
}
/**
 * Assemble the complete Atom feed: feed header from atomFeedInfo followed by
 * one atomEntry per result document.
 *
 * NOTE(review): the enclosing feed element tags appear to have been stripped
 * from this template literal — confirm against version control.
 *
 * resp - search response ({ response: { docs: [...] } })
 * req  - express request; req.opts is threaded through to the sub-templates
 */
function atomFrom(resp, req) {
  return `

${atomFeedInfo(resp, req)}
${resp.response.docs.map(m => atomEntry(m, req.opts)).join('')}

`;
}
/**
 * Return the OpenSearch XML description document (served when no query is
 * supplied) announcing this mirror as a search engine.
 *
 * NOTE(review): the OpenSearchDescription / Url XML tags appear to have been
 * stripped from this template literal, and `protoHost` (presumably used to
 * build the search template URL) is destructured but not visibly used —
 * confirm against version control.
 */
function XMLdescriptor({protoHost=undefined}={}) {
  return `

Offline Internet Archive
Offline Internet Archive search engine


`;
}
/**
 * Express handler for OpenSearch requests.
 * With no ?q= parameter it returns the XML descriptor document; otherwise it
 * runs a popularity-sorted query (first page, ItemsPerPage rows) and returns
 * the results as an Atom feed. Errors are passed to next().
 */
function searchExpress(req, res, next) {
  // req should have proto host set to e.g. http://localhost:4244 or https://www-dweb-mirror.dev.archive.org
  if (!req.query.q) {
    // No search string: serve the OpenSearch descriptor instead of results.
    res.status(200).send(XMLdescriptor(req.opts));
    return;
  }
  const sort = "-downloads"; // Opensearch doesnt specify any other metric
  const item = Object.assign(
    new ArchiveItem({ sort, query: req.query.q }),
    { rows: ItemsPerPage, page: 1 }
  );
  const queryOpts = Object.assign(req.opts, { wantCrawlInfo: false });
  // config is undefined - it is only used for crawl info, which has no
  // representation in Atom output, so for now it stays undefined.
  doQuery(item, queryOpts, undefined, (err, searchResult) => {
    if (err) {
      next(err); // doQuery sends to debug
    } else {
      // searchResult looks like an Archive search result with downloaded and crawl fields added
      res.status(200).send(atomFrom(searchResult, req));
    }
  });
}
109 |
110 |
111 | exports = module.exports = {searchExpress, doQuery};
112 |
--------------------------------------------------------------------------------
/ArchiveFilePatched.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable func-names */
2 | /*
3 | // Monkey patches dweb-archivecontroller,
4 | // Note cant merge into dweb-archivecontroller as wont work in browser; and cant create subclass as want everywhere e.g. archivefile.fetch_metadata is used to use the cache
5 | */
6 |
7 | // Standard files
8 | const debug = require('debug')('dweb-mirror:ArchiveFile');
9 | const path = require('path');
10 | // Other Archive repos
11 | const { ArchiveFile, routed } = require('@internetarchive/dweb-archivecontroller');
12 | // Local files
13 | const MirrorFS = require('./MirrorFS');
14 |
15 | /**
16 | * Common arguments across all API functions
17 | *
18 | * copyDirectory points at top level of a cache where want a copy
19 | * relFilePath path to file or item inside a cache IDENTIFIER/FILENAME
20 | * noCache ignore anything in the cache - forces re-fetching and may cause upstream server to cache it TODO-API check this is not obsoleted by separate read and write skipping
21 | * noStore do not store results in cache
22 | * skipFetchFile as an argument causes file fetching to be suppressed (used for testing only)
23 | * skipNet do not try and use the net for anything
24 | * wantStream Return results as a stream, just like received from the upstream.
25 | * wantSize Return the size as a byte-count.
26 | * copyDirectory Specify alternate directory to store results in rather than config.directories[0]
27 | * darkOk True if a dark item is a valid response (if false, and item is dark will throw an error)
28 | * start,end First and last bytes to return (0 and undefined by default)
29 | * cb(err, res) Unless otherwise documented callbacks return an error, (subclass of Error) or null, and optional return data.
30 | * Some functions also support an absent cb as returning a Promise, otherwise cb is required
31 | * feel free to add Promise support to any function lacking it, search for "Promise pattern v2" for examples of how to do this consistently.
32 | */
33 |
34 |
35 | /**
36 | * Cache an ArchiveFile, and - if wantStream=true - stream it to the consumer.
37 | * skipNet=true means just provide information on the file, do not retrieve from the net.
38 | * See above for other arguments.
39 | */
ArchiveFile.prototype.cacheAndOrStream = function ({
  skipFetchFile = false, skipNet = false, wantStream = false, noCache = false, wantSize = false, wantBuff = false,
  copyDirectory = undefined, start = 0, end = undefined
} = {}, cb) {
  const { identifier } = this; // Not available in events otherwise
  const filename = this.metadata.name;
  const debugname = [identifier, filename].join('/'); // "IDENTIFIER/FILENAME" used in logs
  // First attempt: cache-only (no urls supplied, so MirrorFS cannot go to the
  // net). Errors here mean a cache miss, which triggers the network retry below.
  MirrorFS.cacheAndOrStream({ // Try first time without Urls, keep local - note noCache will make this return error unless sha1 specified as no urls either.
    skipFetchFile, wantStream, wantBuff, start, end, debugname, noCache, copyDirectory, wantSize,
    sha1: this.metadata.sha1,
    relFilePath: path.join(identifier, filename),
    expectsize: this.metadata.size,
    ipfs: this.metadata.ipfs // Will usually be undefined as not currently retrieving
  }, (err, streamOrUndefinedOrSizeOrBuff) => {
    if (err && skipNet) {
      // Cache miss and caller forbade network access - report the miss as-is.
      cb(err);
    } else if (err) { // Unable to retrieve locally, lets get urls and try again
      // noinspection JSIgnoredPromiseFromCall
      this.urls((err1, urls) => {
        if (err1) {
          cb(err1);
        } else {
          // Second attempt: identical call but with routed upstream urls so
          // MirrorFS can fetch (and cache) from the net.
          MirrorFS.cacheAndOrStream({
            skipFetchFile, wantStream, wantSize, wantBuff, start, end, debugname, noCache, copyDirectory,
            urls: routed(urls),
            sha1: this.metadata.sha1,
            relFilePath: path.join(identifier, filename),
            expectsize: this.metadata.size,
            ipfs: this.metadata.ipfs // Will usually be undefined as not currently retrieving
          }, (err2, streamOrUndefinedOrSizeOrBuff1) => {
            if (err2) {
              debug('Unable to cacheOrStream %s', debugname);
              cb(err2);
            } else {
              // A partial (start/end) or streamed fetch does not prove the whole file is cached.
              if (!wantStream && !(start || end)) { this.downloaded = true; } // No error, and not streaming so must have downloaded
              cb(null, (wantStream || wantSize || wantBuff) ? streamOrUndefinedOrSizeOrBuff1 : this);
            }
          });
        }
      });
    } else { // The local check succeeded
      this.downloaded = true;
      cb(null, (wantStream || wantSize || wantBuff) ? streamOrUndefinedOrSizeOrBuff : this);
    }
  });
};
86 |
/**
 * Report whether the file is already present in the cache.
 *
 * @param {Object} [opts] - optional (previously a missing argument would throw on destructuring)
 * @param {string} [opts.copyDirectory] - alternate cache directory, passed through to cacheAndOrStream
 * @param {Function} cb - cb(null, isDownloadedBoolean); never called with an error - a failure just means "not downloaded"
 *
 * Side effects: a successful check sets this.downloaded (inside cacheAndOrStream), and when the
 * metadata lacks a size, records the size discovered on disk.
 */
ArchiveFile.prototype.isDownloaded = function ({ copyDirectory = undefined } = {}, cb) {
  if (this.downloaded === true) { // Already know its downloaded - note not rechecking, so its possible it was deleted.
    cb(null, this.downloaded);
  } else { // Maybe, lets check
    this.cacheAndOrStream({
      copyDirectory, skipNet: true, wantStream: false, wantSize: !this.metadata.size
    }, (err, res) => {
      // Guard res: without it a missing size would be recorded as the literal string "undefined".
      if (!err && !this.metadata.size && (res !== undefined) && (res !== null)) {
        this.metadata.size = `${res}`; // metadata.size is kept as a string - TODO confirm all consumers expect a string
      }
      // cacheAndOrStream has side effect of setting downloaded
      cb(null, !err);
    });
  }
};
106 |
// CommonJS export: the double assignment makes ArchiveFile both the module's default export and `exports`.
exports = module.exports = ArchiveFile;
108 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# This is the master Dockerfile
# it should work, but AFAIK no one is currently using dweb-mirror under Docker so if not please post a bug report or PR
# In most cases install.sh is a better way to get something running on a box.
# There is a deploy of this dweb-mirror repo running under nomad at https://www-dweb-mirror.dev.archive.org
# There is a variation of this in OLIP https://gitlab.com/bibliosansfrontieres/olip/dweb-mirror
# The changes in both those dockerfiles are incorporated below, but commented out.
#

# Docker quick reference - for more details check https://docs.docker.com/engine/reference/builder/
# Building
# > cd ...dweb-mirror
# > docker image build --no-cache -t mitraardron/dweb-mirror:latest . # Use no-cache or it might not rebuild from a changed npm
# > docker push mitraardron/dweb-mirror:latest # Send to repo (this is usually not done)
# For testing
# > docker run -i -p 4244:4244 --name internetarchive mitraardron/dweb-mirror:latest # Test it
# > docker run -i -p 4244:4244 --name internetarchive mitraardron/dweb-mirror:latest /bin/bash # OR run bash inside it
# For production (container name below must match the --name given to docker run)
# > docker run -d --name internetarchive -p 4244:4244 mitraardron/dweb-mirror:latest # Run production
# > docker container stop internetarchive # Stop running server
# > docker container rm internetarchive # Delete container
# > docker logs internetarchive # See the logs

## Specify node version, alternatives node:12 or node:12-alpine but
# alpine images are missing git, which is needed for dependencies of dweb-archive-dist
# and node:12 not available on i386 and is missing apk
# www-dweb-mirror uses node:12 OLIP uses node:12-alpine
# BUT sharp requires node:14 so updating here.
FROM node:14
# OLIP uses ...
#ARG ARCH
#FROM $ARCH/node:12-alpine

LABEL maintainers="Mitra Ardron , Tracey Jaquith "
WORKDIR /app

## Yarn used to need installing, but is now present in alpine docker and node:12 images
# Yarn needs npm for the build, but should be happy with the version in the docker base distro
#RUN npm i npm@latest -g
# Install yarn which does a better job of de-duplicating etc
#RUN npm i yarn -g

## Need git for npm to be able to install some dependencies deep in the tree (its a known node:12 issue)
# Have to run as root to do the apt steps
USER root
# Stole this line from https://github.com/tarampampam/node-docker/blob/master/Dockerfile
# Git is needed for install, could probably switch to the apk lines if it works on www-dweb-mirror
# NOTE: the "&&" after "set -x" was previously missing, which made the shell run
# "set -x apt-get update" (setting positional parameters) and skip the apt update entirely.
RUN set -x \
    && apt-get update \
    && apt-get -yq install git \
    && apt-get -yq clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && git --version && bash --version && ssh -V && npm -v && node -v && yarn -v
#if you want bash or ssh:
#RUN apt-get -yq install bash openssh-server

# OLIP uses following, but `apk` is alpine linux
# Also OLIP is adding python, make g++ and vips-dev which must be for debugging ?
#RUN set -ex; \
#    apk --no-cache --update add git
#    mkdir -p /root/archiveorg
#
# i386 needs some extra packages to build dweb-mirror apparently.
#RUN set -ex; \
#    [ `uname -p` = "i386" ] && apk --no-cache --update add python make g++ vips-dev;

## Connect to a persistent volume for (potentially large) data caching
# OLIP - /data/archiveorg as /data is persistent. (Added to configDefaults.yaml#directories)
# nomad: /root/archiveorg : data intentionally not persistent as used for testing
RUN mkdir -p /root/archiveorg

## Copy a user config for dweb-mirror, this should be in one of the locations listed in configDefaults.yaml
# Setup initial crawl - do this BEFORE the 'yarn add' of dweb-mirror
# This config file is a good place to override anything (like port numbers, or initial crawl) needed for specific applications.
# TODO-OLIP - need strategy for where to put this and where to read it
COPY ./dweb-mirror.config.yaml /root/dweb-mirror.config.yaml


## The main install, could use "COPY" but this is sure then to get a release rather than whatever is local
#Have to run install during the build otherwise will build for different environment and may fail with ELF error
RUN yarn add @internetarchive/dweb-mirror
RUN yarn add supervisor

## tell the world which port we use, doesnt actually make docker do anything
# On dweb-mirror this is 4244
# You can change this, but it MUST match the port in dweb-mirror.config.yaml
EXPOSE 4244

## Nasty hack to unhack this nasty line in archive.js :-) which generates unwanted logs if running on certain CI servers at IA
# nomad www-dweb-mirror only but has no negative impact on any other setup
#var log = location.host.substr(0, 4) !== 'www-' ? function () {} : console.log.bind(console);
RUN sed -i.BAK -e 's/www-/xwww-/' '/app/node_modules/@internetarchive/dweb-archive-dist/includes/archive.js'
RUN sed -i.BAK -e 's/www-/xwww-/' '/app/node_modules/@internetarchive/dweb-archive-dist/includes/archive.min.js'

WORKDIR /app/node_modules/@internetarchive/dweb-mirror

# when this container is invoked like "docker exec .." this is what that will run
CMD [ "/app/node_modules/.bin/supervisor", "-i", "..", "--", "internetarchive", "-sc" ]
--------------------------------------------------------------------------------
/install_ipfs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -x

# I put this script together given the unavailability of any "official" way to install IPFS from a script
# Alternatives include "ipfs-update" which is written in GO, and installs from source, but depends on GO >= 1.9 which means it fails on Raspberry Pis
# Or getting from the web page at 'https://dist.ipfs.io/#ipfs-update' but that isn't easy to automate since the links are version specific
# and currently there is no "latest" link.
# See https://github.com/ipfs/go-ipfs/issues/5998

# Choice of strategy - if none uncommented it will use the best strategy, usually "update"
#STRATEGY="binary" # Go direct to the Go binary
#STRATEGY="update" # Get the binary of ipfs-update and use that to fetch ipfs - this is generally best, and essential if IPFS already installed
#STRATEGY="source" # This will be needed if the binaries arent available for your platform, but GO is.


# Hard coded latest version since "latest" isn't supported (yet)
# Last line of https://dist.ipfs.io/go-ipfs/versions is the current version number; fetching this is what ipfs-update does
IPFS_LATEST_VERSION="`curl -Ls https://dist.ipfs.io/go-ipfs/versions | tail -1`" # Starts with v
IPFS_CURRENT_VERSION="`ipfs --version 2>/dev/null | cut -c 14-`" # On failure will be empty, starts with digit

# The startup script sets configuration the first time IPFS is run, which we also use in our Docker installs of dweb-gateway
# On dweb-mirror its in dweb-mirror, in dweb its in /app/ but either way should be in PWD
# Note IPFS_STARTSCRIPT uses (but has reasonable defaults) for: IPFS_USER; IPFS_API_PORT; IPFS_GATEWAY_PORT; IPFS_SWAM_PORT; IPFS_WS_PORT
# The space before "]" is required: without it the test is a syntax error and the default is never applied.
[ -z "${IPFS_STARTSCRIPT}" ] && IPFS_STARTSCRIPT="${PWD}/start_ipfs"
if [ -e "${IPFS_STARTSCRIPT}" ];
then
    echo "Will install ${IPFS_STARTSCRIPT} as /usr/local/bin/start_ipfs"
else
    echo "There is no ${IPFS_STARTSCRIPT}, include one if you want to configure ipfs at first run"
fi
31 |
32 |
# Generic function to get a binary release of a package from dist.ipfs.io and run its installer.
# Relies on GOOS and GOARCH being set by the caller to pick the right tarball.
function installLatestBinary { # arg is ipfs-update or go-ipfs
    PACKAGE="$1"
    LATEST_VERSION="`curl -Ls https://dist.ipfs.io/${PACKAGE}/versions | tail -1`" # Starts with v
    CURRENT_VERSION="`${PACKAGE} --version 2>/dev/null | cut -d ' ' -f 3`" # On failure will be empty
    if [ "${CURRENT_VERSION:=0}" = "${LATEST_VERSION}" ]
    then
        echo "Current version of ${PACKAGE} already installed"
        return 0 # Without this return the package was re-downloaded and re-installed anyway
    fi
    TARGZ="${PACKAGE}_${LATEST_VERSION}_${GOOS}-${GOARCH}.tar.gz"
    URL="https://dist.ipfs.io/${PACKAGE}/${LATEST_VERSION}/${TARGZ}"
    pushd /tmp \
    && curl -Lv -o "${TARGZ}" "${URL}" \
    && tar xvf "${TARGZ}" \
    && cd "${PACKAGE}" \
    && ./install.sh \
    && popd \
    && echo "You can safely delete /tmp/${TARGZ} and /tmp/ipfs"
}
52 |
# Pick a strategy based on whether IPFS is already installed and current.
# NOTE(fix): previously the strategy case statement below lived entirely inside the
# not-installed branch, so an installed-but-outdated IPFS printed "will update" and then
# exited without doing anything. The if now only selects STRATEGY; execution follows for all paths.
if [ -n "${IPFS_CURRENT_VERSION}" ]
then
    if [ "v${IPFS_CURRENT_VERSION}" = "${IPFS_LATEST_VERSION}" ]
    then
        echo "Current version of IPFS ${IPFS_LATEST_VERSION} is already installed"
        STRATEGY="skip"
    else
        echo "IPFS installed but not current version, will update"
        STRATEGY="update"
    fi
else
    echo "IPFS does not appear to be installed"
fi

# Convert the portable uname results into go specific environment (needed by binary/update strategies)
case `uname -m` in
    "armv7l") GOARCH="arm";; # e.g. Raspberry 3. Note armv8 and above would use what IPFS has as arm64, armv7 and down want "arm"
    "x86_64") GOARCH="amd64";; # e.g. a Mac OSX
    i?86) GOARCH="386";; # e.g. a Rachel3+
    *) echo "Unknown processor type `uname -m`- please check install_ipfs.sh but will try source"; STRATEGY="source";;
esac
case `uname -s` in
    "Darwin") GOOS="darwin";; # e.g. a Mac OSX
    "Linux") GOOS="linux";; # e.g. Raspberry 3 or Rachel3+
    *) echo "Unknown Operating system type - please check install_ipfs.sh but will try source"; STRATEGY="source";;
esac


[ -z "${STRATEGY}" ] && STRATEGY="update"
case "${STRATEGY}" in
    "binary")
        installLatestBinary go-ipfs
        ;;
    "update")
        installLatestBinary ipfs-update \
        && ipfs-update install latest
        ;;
    "source")
        if (go version 2>/dev/null) && [ -n "${GOPATH}" ]
        then
            echo "Go already installed"
        else
            ./install_go.sh
        fi \
        && go get -u -v github.com/ipfs/ipfs-update \
        && ipfs-update install latest
        ;;
    "skip")
        ;;
esac
# first time IPFS_STARTSCRIPT is run it should configure IPFS and init the repo
if [ -e "${IPFS_STARTSCRIPT}" ]; then
    # Quoted: the default path is under ${PWD}, which may contain spaces
    cp "${IPFS_STARTSCRIPT}" /usr/local/bin/start_ipfs
    echo "Start ipfs with: start_ipfs daemon"
else
    # If you need any config changes on existing packages and do not have start_ipfs they can go here
    ipfs config --json Experimental.FilestoreEnabled true
fi
exit
112 | ######################################################################################
113 | Install IPFS, there are several strategies in install_ipfs.sh that should at least cover your Mac,
114 | but it might need editing if you have an odd combinations.
115 | ```
116 | cd ~/node_modules/@internetarchive/dweb-mirror
117 | ./install_ipfs.sh
118 | ```
119 | Now start the daemon, first time it initializes and configures a repo
120 | ```
121 | start_ipfs daemon &
122 | ```
123 | If it complains that 8080 is in use, then you missed editing start_ipfs and can fix this with
124 | ```
125 | ipfs config Addresses.Gateway /ip4/0.0.0.0/tcp/8081
126 | start_ipfs daemon &
127 | ```
128 | Allow ipfs to start, once it says Daemon is ready, Ctrl-C out of it
--------------------------------------------------------------------------------
/configDefaults.yaml:
--------------------------------------------------------------------------------
1 | --- #Default configuration for dweb-mirror
2 | # Note in YAML, indentation is significant, and you can't have multiple spaces after the ":"
3 |
4 | # Cache directory - where you want to store files, this directory must already exist
5 | # List of places to look for the Cache directory - will check all of them, and they don't need to exist
6 | # Paths can include unix conventions like ~ * . .. etc
7 | # Default is to look in home directory and root of any device (for example a USB) for directories "archiveorg"
8 | directories:
9 | - ~/archiveorg # Home directory of user
10 | - /Volumes/*/archiveorg # Any connected volume on standard unix
11 | - /media/pi/*/archiveorg # At least on Internet-in-a-Box USB volumes are put here
12 | - /media/usb*/archiveorg # IIAB v7 seems to put them here via usbmount
13 | - /.data/archiveorg # This is where Rachel3+ puts its data
14 | - /library/archiveorg # This is where IIAB wants content
15 | - /data/archiveorg # Persistent and shared volume on OLIB
16 | rescanDirectories: 15 # How often to rescan for these directories (in seconds)
17 | # Where to find the ArchiveUI relative to the directory this file and the code resides in
18 | archiveui:
19 | directories: # Note the code uses the first of these found to exist - note this should match the list in install.sh (which will be deprecated)
20 | - ../dweb-archive/dist # Dev repo parallel to this
21 | - node_modules/@internetarchive/dweb-archive/dist # Dev repo as dependency
22 | - ../dweb-archive-dist # Dist repo cloned parallel to us (this is the most common config)
23 | - node_modules/@internetarchive/dweb-archive-dist # Dist repo as a dependency
24 | bookreader:
25 | directories: # Note the code uses the first of these found to exist
26 | - ../bookreader/BookReader # Try a repo cloned to a directory parallel or installed to this one, which is presumably for development
27 | - node_modules/@internetarchive/bookreader/BookReader # Or a repo installed as a dependency via 'yarn install'
28 | epubreader:
29 | directories: # Note the code uses the first of these found to exist
30 | - ../epubreader/reader # Try a repo cloned to a directory parallel or installed to this one, which is presumably for development
31 | - ../../@futurepress/epubjs-reader/reader # Try repo installed as dependency at same level
32 | - node_modules/@futurepress/epubjs-reader/reader # Or a repo installed as a dependency via 'yarn install'
33 | nodemodules:
34 | directories: # Note the code uses the first of these found to exist
35 | - node_modules
36 | # How to connect to the net, for dweb-mirror HTTP directly to the gateway is usually the best way
37 | # especially for low CPU/low bandwidth environments.
38 | connect:
39 | transports: # This is the list of transports that the server or crawler will use to fetch files
40 | - 'HTTP'
41 | #- 'IPFS'
42 | #- 'WEBTORRENT'
43 | #- 'GUN'
44 | #- 'WOLK'
45 | webtorrent: # Options to webtorrent
46 | tracker: 'wrtc' # May or may not work on different platforms and needs installing
47 | trackers:
48 | - 'wss://dweb.archive.org:6969' # TODO-DM242/round4
49 | - 'wss://tracker.btorrent.xyz'
50 | - 'wss://tracker.openwebtorrent.com'
51 | - 'wss://tracker.fastcast.nz'
52 | ipfs: # Options to IPFS
53 | urlUrlstore: 'http://localhost:5001/api/v0/urlstore/add'
54 | preferredStreamTransports: # A list, in order of preference of transports to use for streams, only works if they are listed in "connect" above.
55 | - 'WEBTORRENT'
56 | - 'HTTP'
57 | - 'IPFS'
58 | mdns: "archive" # By default respond to archive.local via MDNS
59 | # The apps group include configuration only used by one application
60 | apps:
61 | # internetarchive --server uses these
62 | http:
63 | port: 4244
64 | morgan: ':method :url :req[range] :status :res[content-length] :response-time ms' # Used for logging
65 | # internetarchive --crawl uses these
66 | crawl:
67 | # An array of tasks each consists of { identifier, level, and optional search & related
68 | # level is one of:
69 | # tile: sufficient to draw Tile in its parent;
70 | # metadata: including metadata info (not often used);
71 | # details: enough to render a page, including e.g. low res video
72 | # all: all files in the item - beware this can be large
73 | # search & related consists of sort: (only -downloads supported at present), rows: how many items to retrieve;
74 | # level, search & related inside another search or related applies to the items retrieved by that search/related and nests indefinitely.
75 | # Leave tasks empty in configDefaults, it will get overwritten by the user's tasks.
76 | tasks: [ ]
77 | # opts controls how the search performs
78 | opts:
concurrency: 10 # No more than this many tasks at a time (typically 10 open file downloads or searches)
limitTotalTasks: 3000 # No more than this many tasks total (typically one per item & file)
81 | maxFileSize: 200000000 # Maximum size of any file retrieved
82 | noCache: false # Set to true to ignore current cache contents, this is used to force the upstream server to look at each item and cache it
83 | skipFetchFile: false # Set to true to ignore actually fetching files, most often used for debugging
84 | # Default crawls if either search &| related are unspecified but crawling an item with level=detail||full
85 | defaultDetailsSearch:
86 | rows: 40 # Collect the first 40 items
87 | sort: '-downloads' # Based on the most popular
88 | level: tile # With enough information to show a tile
89 | defaultDetailsRelated:
90 | sort: '-downloads' # And choose related based on downloads
91 | rows: 6 # 6 related items
92 | level: tile # sufficiently to show a tile
startOnChange: true # True if should start crawling any item that changes its level of crawling
94 | crawlEpubs: true # Set to false, or comment out to skip crawling epubs
95 | crawlPalmLeaf: false # Set to true, when on box hosting palmleafwiki
96 | palmleafwiki: # TODO-MEDIAWIKI make installation auto change this
97 | pagelink: 'https://palmleaf.org/wiki' # Prefix for any pages to open in palmleaf, use 'http://MIRRORHOST/wiki/' if coinstalled
98 | # copyDirectory: /Volumes/Transcend/test2 # Overrides where to store data, normally this is a command-line parameter if used.
99 | #============== Do not edit anything from here on down ============================
100 |
--------------------------------------------------------------------------------
/INSTALLATION-rachel.md:
--------------------------------------------------------------------------------
1 | # Installation Instructions for dweb-mirror on Rachel 3+ (from WorldPossible)
2 |
If you are not installing the offline archive on a Rachel 3+ then one of these documents
4 | will be much easier to follow.
5 |
6 | * [INSTALLATION.md](./INSTALLATION.md)
7 | for general installation instructions.
8 | * [INSTALLATION-dev.md](./INSTALLATION-dev.md)
9 | for developers who want to work on this code or on dweb-archive (our offline Javascript UI).
10 | These are tested on Mac OSX, but should work with only minor changes on Linux (feedback welcome).
11 | * [INSTALLATION-iiab-rpi.md](./INSTALLATION-iiab-rpi.md)
to install Internet In A Box on a Raspberry Pi
* [INSTALLATION-olip-rpi.md](./INSTALLATION-olip-rpi.md)
to install OLIP on a Raspberry Pi
15 | * [INSTALLATION-rachel.md](./INSTALLATION-rachel.md)
16 | for Rachel on their own Rachel 3+ (incomplete)
17 |
18 | This is a work in progress - and hasn't been tried on a clean box, since many of the earlier attempts failed and I have no
19 | way to give it a full factory reset.
20 |
21 | ## See also
22 | * [https://github.com/internetarchive/dweb-mirror/issues/93] for meta task for anything Rachel related.
23 |
24 | ## Physical connection.
25 |
26 | There are docs that come with the Rachel box and are worth reading, however they are inadequate for this task set.
27 |
28 | If you have permission to do this then you'll know the passwords so we are not putting it in this public repo!
29 |
30 | * There are two ways to physically connect the Rachel3+ to the internet either directly via the Internet or via your laptop's Wifi.
31 |
32 | #### Either connect Direct to Ethernet
33 | * Connect the Rachel box to the Ethernet - it may care which socket its plugged into.
34 | * Press and hold power button till blue light comes on then wait for WiFi to start flashing (can take a few minutes)
35 | * On your laptop, connect via WiFi to "Rachel"
36 |
37 | #### OR via Laptop's WiFi.
38 | * Connect the Rachel box via Ethernet to your laptop - it may care which socket its plugged into.
39 | * first connect from another device to WiFi Rachel,
40 | * open `192.168.88.1` in your browser,
41 | * look in the top right corner for the LAN address (for me its often `192.168.2.3` so you could try that as a shortcut)
42 | * On your Laptop, `ssh 192.168.2.3` or whatever address you found above
43 | * and login with user: `cap` and the password supplied (not the same password as for the browser)
44 | * OR in browser open `http://192.168.2.3` and click on Admin,
45 | * then login with 'admin' and the password you should know
46 | * The docs below assume you are connecting to 192.168.88.1, substitute the address you found above instead
47 |
48 | ## Configure via browser window
49 | * [http://192.168.88.1/admin/modules.php] - for general administration, but we won't do this here. (next page is Hardware / Advanced)
50 | * [http://192.168.88.1:8080/]
51 | * Login, user is `admin` and not `Admin` as stated in the docs shipped with the box
52 | * you should have the password.
53 | * General Settings
54 | * Set to "Full Internet Access" and "wide area network"
55 | * Disable Captive Portal - at least for now.
56 | * Save and Apply
57 |
58 | ## Preliminaries to install
59 | ```
60 | # Please send the result of this to me, I forgot to do this, so I'm not sure how much disk I'm using below.
61 | sudo df -k
62 | # Update the list of packages apt knows about.
63 | sudo apt-get update
64 | ```
65 |
66 | ## Installing other tools (via SSH)
67 |
68 | * Or: `ssh 192.168.88.1` and
69 | * login with user: `cap` and the password supplied (not the same password as for the browser)
70 | * DONT DO THIS I BROKE MY BOX, requiring a full reinstall `sudo apt-get upgrade`
71 |
72 | #### compilation tools gcc etc
73 | * I did: `sudo apt-get install gcc g++ make` but I'm not sure which of these were actually required.
74 | * TODO on fresh machine try without these tools and edit this comment.
75 | * g++ is certainly required for nvm below
76 | * If something below complains of the other's absence then go ahead and install, the only downside is significant disk usage especially for gcc & g++
77 |
78 | #### yarn
79 | Now get yarn - as probably going to end up using both npm and yarn
80 | ```
81 | curl -sL https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
82 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
83 | sudo apt-get update && sudo apt-get install yarn
84 | ```
85 |
86 | #### Updating Node (buggy)
87 |
88 | Many tools now require Node v10 to work, but Node no longer supports 32 bit by default.
89 | Note there is an issue open on the Rachel community about this
90 | and Refael Ackermann from the node-js team helped make 32 bit compiles available,
but there is no one at Rachel who has time to look at it...
92 | [http://community.rachelfriends.org/t/installing-usable-version-of-node/1082/4]
93 |
94 |
95 | This next list only gets to v9 at present, its worth looking at `/etc/apt/sources.list.d/nodesource.list` if debugging this.
96 | ```
97 | sudo node --version # Typically shows an antique version of node around version 4.x
98 | curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
99 | sudo apt-cache policy nodejs # Should show v10 but for some Rachel problem it only goes to v9 which is ok but sub-optimal
100 | sudo apt-get install -y nodejs
101 | sudo npm i -g npm # Update npm
102 | ```
103 | So alternatively via NVM
104 | ```
105 | touch ~/.bash_profile # Nvm needs it to at least exist
106 | curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.34.0/install.sh | bash
107 | source ~/.bash_profile
108 | nvm install node # Should compile 10.13.0 or later from source using the g++ installed earlier !!SLOW!!
109 | node -v # Check its 10.x
110 | ```
111 | TODO-RACHEL figure out the issues around `path` etc for services accessing node and use whatever Rafael produces.
112 |
113 | ## Now continue from the general INSTALLATION.md starting at step 2.
114 |
115 | ## Step 7 - Auto running
116 | The final step requires integration by the Rachel team,
117 | We've had an issue open since March 2019, but there isn't anyone available,
118 | http://community.rachelfriends.org/t/integrating-installation-of-internet-archive-as-a-server/1284
119 | so its going to be a bit of a guesswork if you want to complete this.
120 |
121 | * TODO-RACHEL - hook into supervisorctl etc [http://community.rachelfriends.org/t/starting-at-boot-supervisorctl/1202]
122 | * THEN TODO-RACHEL - auto start mirrorHttp
123 | * TODO-RACHEL - run crawl under cron
124 | * TODO-RACHEL - maybe setup easy auto-update process
125 | * TODO-RACHEL - integrate into menus [http://community.rachelfriends.org/t/integrating-into-the-main-user-facing-menu/1203]
126 |
--------------------------------------------------------------------------------
/API.md:
--------------------------------------------------------------------------------
# API for dweb-mirror v0.2.0
2 |
3 | This document covers the API for v0.2.0 of dweb-mirror which is semi-stable now.
4 |
5 | #### Outline of APIs
6 |
7 | * URI support: support most of the Internet Archive's core APIs.
8 | * Config files: Control the behavior of each of the apps in this package
9 | * dweb-archivecontroller - base level classes which are extended by this package:
10 | ArchiveItem; ArchiveMember; ArchiveFile.
11 | * A set of classes that provide higher level support esp:
12 | CrawlManager; ConfigController HashStore; MirrorFS;
* An application that uses the APIs above, but which is itself forkable:
14 | internetarchive.
15 |
16 | # URI support
17 |
18 | dweb.mirror is intended to support an expanding subset of Archive APIs.
19 |
20 | Check ./MirrorHttp.js for the routing but in summary all these work as for
21 | archive.org or DATANODE.archive.org except as where noted,
22 | though given all the Internet Archive edge cases and exceptions
23 | there may be unknown places where certain options are not supported.
24 |
25 | Some archivelab.org APIs are also (partially) supported (/books and /iiif)
26 | largely because Palmleaf's extensions to Mediawiki use them.
27 |
28 | |url|notes|
29 | |---|-----|
30 | |/advancedsearch.php|Return json file as on IA Does not support fl=|
31 | |/BookReader/BookReaderJSIA.php|Return the IA specific json used by bookreader|
32 | |/BookReader/BookReaderJSON.php /books/IDENTIFIER/ia_manifest|Return the IA specific json used by bookreader|
33 | |/BookReader/BookReaderImages.php /BookReader/BookReaderPreview.php|Send page images|
34 | |/details/IDENTIFIER|Redirect to single page for IDENTIFIER|
35 | |/stream/IDENTIFIER||
36 | |/details/IDENTIFIER/page/*|Opens book to page|
37 | |/download/IDENTIFIER/page/PAGE|Returns one image|
38 | |/download/IDENTIFIER|Redirects to page that displays download directory|
39 | |/download/IDENTIFIER/FILE /serve/IDENTIFIER/FILE|Download file|
40 | |/embed/IDENTIFIER?output=json /playlist/IDENTIFIER|Return a playlist|
41 | |/iiif/:identifierindex/ LEFT,TOP,WIDTH,HEIGHT/full/0/default.jpg|Return a scaled image
42 | |/mds/v1/get_related/all/IDENTIFIER|Gets related items|
43 | |/metadata/IDENTIFIER|Standard metadata except: Does not support returning subset of metadata|
44 | |~~/metadata/IDENTIFIER/FILE~~|As used in earlier dweb.archive.org is no longer supported|
45 | |/search.php|Redirect to search results page|
46 | |/services/img/IDENTIFIER /download/IDENTIFIER/__ia_thumb.jpg|Return the thumbnail|
47 | |/images/FILE /includes/FILE /jw/FILE /bookreader/BookReader/FILE /favicon.ico |Return static files as on IA|
48 |
49 | In addition, there are URI's unique to dweb-mirror:
50 |
51 | |url|notes|
52 | |---|-----|
53 | |/admin/crawl|Set of functions to control crawling|
54 | |/archive/FILE /FILE|Returns file from this UI|
55 | |/components/FILE|Returns webcomponents used by UI|
56 | |/echo|Echo back headers (for debugging)|
57 | |/epubreader/FILE|Return file from epubreader application|
58 | |/info|Return info about the server including config and status of crawls|
59 | |/languages/FILE|Returns static language files|
60 | |/opensearch|Supports opensearch API currently only when online to gateway|
61 |
62 | # Config file
63 |
64 | There are two config files, one at dweb-mirror/configDefaults.yaml
65 | and ~/dweb-mirror.config.yaml for which there is an example you can copy.
66 |
67 | Both files follow the same format, and the settings in your home directory override that in dweb-mirror.
68 |
69 | Check ./configDefaults.yaml which has comments on each line.
70 |
71 | ```
72 | directories: [ path* ] # List of places to look for the Cache directory - expands ~/xx and ./xx and * etc
73 | archiveui: # Anything relating to display of the Archive UI
74 | directory: [ ... ] # Where to look for the files for the Archive UI - uses first - expands ~/xx and ./xx and * etc
75 | apps: # Each application can have its own configuration
76 | http: # Relating to serving
77 | crawl: # Relating to crawling
78 | upstream: "dweb.archive.org" # Where to find an upstream server, typically "dweb.archive.org"
79 | ```
80 |
81 | # Files on disk
82 |
83 | Files are stored in a 2 (or more) level directory structure, each Archive Item is a directory, and each Archive File is a file.
84 | Metadata is stored in specially named files.
85 |
86 | ### Cache outline for each Item.
87 |
88 | /archiveorg/IDENTIFIER/ each Archive item has its own directory that contains the following files.
89 |
90 | |file|from|
91 | |----|----|
92 | |IDENTIFIER.meta.json|ArchiveItem.metadata|
93 | |IDENTIFIER.reviews.json|ArchiveItem.reviews, On disk is format returned by API
94 | |IDENTIFIER.speech_vs_music_asr.json|ArchiveItem.speech_vs_music_asr.json, Format as returned by metadata API
95 | |IDENTIFIER.files.json|ArchiveItem.files|
96 | |IDENTIFIER.extra.json|ArchiveItem.{collection_titles, collection_sort_order, files_count, is_dark, dir, server}|
97 | |IDENTIFIER.member.json|ArchiveMember, As retrieved in a search
98 | |IDENTIFIER.members.json|List of members - this file is a normal ArchiveFile in fav-* collections|
99 | |IDENTIFIER.members_cached.json|ArchiveMember.*, All the search results for this item retrieved so far
100 | |IDENTIFIER.members_titleSorter_cached.json|ArchiveMember.*, Search results based on a `titleSorter` sort
101 | |IDENTIFIER.playlist.json|ArchiveItem.playlist, The playlist for the item
102 | |IDENTIFIER.bookreader.json|ArchiveItem.bookreader, The info that the bookreader API returns
103 | |__ia_thumb.jpg|Image file from /service/img/IDENTIFIER ~10kbytes|
104 |
105 | # Local classes
106 | dweb-mirror uses the three core classes from [dweb-archivecontroller](https://github.com/internetarchive/dweb-archivecontroller),
107 | but patches them so that they become cache aware.
108 | See ./ArchiveItemPatched.js ./ArchiveFilePatched.js and ./ArchiveMemberPatched.js
109 |
110 | # Applications
111 |
112 | `internetarchive`: A comprehensive command which, depending on the flags passed, can:
113 | -s Run a server
114 | -c Run a crawler
115 | -m Perform maintenance
116 |
117 | There is a lot more functionality, running `internetarchive -h`
118 | or look in ./internetarchive for all the options.
119 |
120 | # Installation files
121 |
122 | The following files are present in `dweb-mirror` but, as of v0.1.0 are still a work in progress and not yet defined and will probably change a lot.
123 |
124 | * Dockerfile - create docker file of dweb-mirror (untested, probably doesn't work yet)
125 | * Dockerfile_ipfs - create a docker file for an IPFS instance - to go with the above Dockerfile (untested, probably doesn't work yet)
126 | * install.sh - run during `npm install` or after `npm update` by `npm run update` - makes some links based on which other repos are installed.
127 | * install_rachel.sh - variant of install.sh being built for Rachel platform (only partially complete)
128 | * run_dockers.sh - ???
129 |
130 | # See Also
131 |
132 | * [LICENCE] - GNU Affero licence
133 | * [INSTALLATION.md] - generic installation instructions and links to specific instructions.
134 | * [README.md] - main documentation
135 | * [RELEASENOTES.md] - history of releases
136 | * [URL_MAPPING.md] - how universal urls flow through the different mappings in mirrorHttp and elsewhere.
137 |
138 |
139 |
--------------------------------------------------------------------------------
/INSTALLATION-olip-rpi.md:
--------------------------------------------------------------------------------
1 | # Installation instructions for dweb-mirror on OLIP on Raspberry Pi 3 or 4
2 |
3 | If you are not installing dweb-mirror with OLIP on a Raspberry Pi then one of these documents
4 | will be much easier to follow.
5 |
6 | * Mac OSX [INSTALLATION-osx.md](./INSTALLATION-osx.md)
7 | * Internet In A Box (IIAB) on Raspberry Pi [INSTALLATION-iiab-rpi.md](./INSTALLATION-iiab-rpi.md)
8 | * Offline Internet Platform (OLIP) on Raspberry Pi [INSTALLATION-olip-rpi.md](./INSTALLATION-olip-rpi.md)
9 | * Raspberry Pi without IIAB or OLIP [INSTALLATION-rpi.md](./INSTALLATION-rpi.md)
10 | * Rachel on the 3+ [INSTALLATION-rachel.md](./INSTALLATION-rachel.md)
11 | * Rachel on the RPI [INSTALLATION-rachel-rpi.md](./INSTALLATION-rachel-rpi.md)
12 | * Mac OSX Developer [INSTALLATION-osx-dev.md](INSTALLATION-dev.md)
13 | * Everything in one doc [INSTALLATION-work.md](./INSTALLATION-work.md)
14 | * TODO developer instructions on other platforms.
15 |
16 | If anything here doesn't work please email mitra@archive.org
17 | or it would be even more helpful to post a PR on https://github.com/internetarchive/dweb-mirror
18 |
19 | NOTE (2020-02-04) OLIP is itself very experimental, as is our use of it, expect this to fail
20 |
21 | ## See also
22 | * [README.md](./README.md) for more general information
23 | * [issue #263](https://github.com/internetarchive/dweb-mirror/issues/263) for meta task for anything related to OLIP.
24 |
25 | ## Step 1 Initial setup - getting Raspbian
26 |
27 | If your Raspberry Pi comes with Raspbian you are in luck, skip to Step 1B,
28 | otherwise if it comes with NOOBS (as most do now) you'll need to replace it with Raspbian.
29 |
30 | This is what I do. (Edits welcome, if your experiences differ)
31 |
32 | * Downloaded Raspbian [Raspbian](https://www.raspberrypi.org/downloads/raspbian/) to your laptop (~1GB)
33 | * Any of the distributions should work - I test on the Desktop version
34 | * On a Mac:
35 | * downloaded [Etcher](https://www.balena.io/etcher/) (100Mb)
36 | * Run Etcher (it's supposed to be able to use the zip, though for this test we used the .img from expanding the zip), selecting a fresh 16GB SD card as the destination
37 | * On Windows or Linux,
38 | * I'm not sure of the appropriate steps to use instead of Etcher.
39 | * Insert the SD card into the Raspberry Pi 3 or 4, and power up with keyboard, HDMI and mouse connected.
40 | * If at all possible insert Ethernet, otherwise it will work over WiFi with some extra steps.
41 | * Power it up
42 | * It prompted me for some getting started things,
43 | * Accepted "Next to get started".
44 | * Selected your country, language, keyboard - it shouldnt matter which.
45 | * Changed password since RPis get hacked on default password
46 | * Connected to WiFi (not necessary if you have Ethernet connected)
47 | * It automatically Updated OS - this can take a long time - take a break :-)
48 | * Note that this process failed for me with failures of size and sha, or with timeouts,
49 | but a restart, after the prompts for password etc,
50 | got me to a partially completed download so I did not have to start from scratch
51 | * You might want to ... Menu/Preferences/Config / Set display to highest resolution
52 | * You probably want `Menu/Raspberry Pi Configuration/Interfaces/SSH enable` so that you can SSH
53 | into the box rather than use attached keyboard and screen.
54 |
55 | ## Step 1B Workaround for Raspbian bug
56 | Raspbian has a bug that requires a patch until they push it to a new release.
57 | It looks from https://github.com/raspberrypi/linux/issues/3271 like you need to do
58 | ```
59 | sudo rpi-update
60 | ```
61 | This should only be applicable until the Raspbian available at
62 | https://www.raspberrypi.org/downloads/raspbian/
63 | is dated newer than September 2019
64 |
65 | ## Step 2 Install Offline Internet Platform (OLIP)
66 |
67 | Note its strongly recommended to connect your RPI to the Ethernet,
68 | rather than WiFi because OLIP (currently) has a number of bugs that appear if you do not connect to its own hotspot
69 |
70 | Internet Archive is accessible from the OLIP catalog after this step.
71 |
72 | These instructions come from:
73 | http://bibliosansfrontieres.gitlab.io/olip/olip-documentation/olip/installation/
74 |
75 | They might get updated there as OLIP evolves.
76 |
77 | Open a terminal window.
78 |
79 | Note - use `olip.local` so that it is shared with MDNS where networks expect to see things.
80 | `olip.lan` as in the docs is likely to fail.
81 |
82 | ```
83 | curl -sfL https://gitlab.com/bibliosansfrontieres/olip/olip-deploy/raw/master/go.sh |\
84 | sudo bash -s -- --name olip --url olip.local --descriptor http://drop.bsf-intranet.org/olip/conf-arm32v7
85 | ```
86 | to install it.
87 |
88 | When its finished - the WiFi does not (currently) appear, so reboot to bring the WiFi hotspot up.
89 | (see https://gitlab.com/bibliosansfrontieres/olip/olip-deploy/issues/7)
90 |
91 | Connect to the WiFi typically called "OLIP".
92 |
93 | Note: several bugs in OLIPs redirects mean that connecting via the Ethernet (to e.g. `olip.local`) are likely to fail.
94 | See https://gitlab.com/bibliosansfrontieres/olip/olip-deploy/issues/6
95 |
96 | - Go to http://olip.local/home
97 | - Login with username: admin password: admin
98 |
99 | It will frequently give you a sql error.
100 | https://gitlab.com/bibliosansfrontieres/olip/olip-dashboard/issues/31
101 | Reloading the page might work; if that doesn't, note that even a reboot of the box did not fix it for us.
102 |
103 | - `Catalog > Internet Archive > Download`, it should change to `Downloading` and then `Downloaded`
104 | note, its getting a Docker image, so its going to be a lot slower, than installation on other platforms, but not outrageous.
105 | - `Applications > Downloaded > dweb-mirror > Install`, this is surprisingly quick
106 | - `Home > Internet Archive`, should bring up the app
107 |
108 | ### 3. Edit configuration
109 |
110 | If you are doing anything non-standard, then you'll need to create and edit
111 | a local configuration file. Otherwise the application will create it the first time its needed.
112 |
113 | ```
114 | ssh pi@olip.local
115 | cd /data/dweb-mirror.app
116 | sudo cp dweb-mirror.config.yaml dweb-mirror.initial.yaml # Make a backup - there is a good chance you will need it
117 | ```
118 | and edit `dweb-mirror.config.yaml` for now see `configDefaults.yaml` for inline documentation.
119 |
120 | * `directories` wont currently work on OLIP as other volumes are not visible in the docker.
121 | see https://gitlab.com/bibliosansfrontieres/olip/olip-api/issues/20
122 | * `apps.crawl` includes a structure that lists what collections are to be installed,
123 | I suggest testing and then editing
124 |
125 | Note that directories specified in the config file can be written using shell or unix conventions such as "~/" or "../".
126 |
127 | ### 4. Test crawling
128 |
129 | #### Crawling
130 | Running crawling separately is not currently possible in OLIP.
131 |
132 | Running some `balena-engine` command to get into the app and then running `internetarchive -c` might work,
133 | but I haven't figured it out yet.
134 |
135 | ### 5 Debugging
136 |
137 | See http://bibliosansfrontieres.gitlab.io/olip/olip-documentation/olip/knowledge-base/debug-an-app/
138 | for basics on viewing logs etc.
139 |
140 |
141 | ## 8. Updating
142 |
143 | The software is frequently revised so its recommended to update, especially if you see any bugs or problems.
144 |
145 | TODO - write instructions
146 |
--------------------------------------------------------------------------------
/MirrorConfig.js:
--------------------------------------------------------------------------------
1 | const debug = require('debug')('dweb-mirror:MirrorConfig');
2 | const forever = require('async/forever');
3 | const { ObjectDeeperAssign } = require('@internetarchive/dweb-archivecontroller');
4 | const ConfigController = require('./ConfigController');
5 | const CrawlManager = require('./CrawlManager');
6 |
class MirrorConfig extends ConfigController {
  /*
  Subclass of ConfigController specific to mirroring.
  Note: no explicit constructor is needed - the implicit default constructor
  forwards all config objects to ConfigController, which combines them via setOpts().
  */

  static initializeUserConfig(cb) {
    // Return user configuration via cb(err, config), initializing the file from defaultUserConfig if it does not exist.
    this.initializeUserConfigFile(this.userConfigFile, this.defaultUserConfig, cb);
  }

  static new(filenames, setState, cb) {
    // Build a MirrorConfig from an ordered list of YAML config files (later override earlier).
    // filenames Optional list of filenames for configuration otherwise uses defaultConfigFiles
    // setState({directories}) optional - invoked whenever the resolved set of cache directories changes
    // cb(err, config)
    if (typeof filenames === 'function') { cb = filenames; filenames = undefined; }
    if (!(filenames && filenames.length)) { filenames = this.defaultConfigFiles; } // Doesnt include userConfigFile
    super.new(filenames, (err, config) => {
      if (!err) config.setupPeriodically(setState); // Periodically - rescan Directories;
      cb(err, config);
    });
  }

  resolveDirectories(setState) {
    // TODO note this could be slow - it uses glob.sync - see TODO in ConfigController.resolves
    // Handle ~/ ./ ../ and expand * or ?? etc
    // setState({directories}) optional - only called when the set of directories actually changed
    const newDirectories = ConfigController.resolves(
      this.configOpts.filter(c => c.directories).pop().directories // Find last one to define directories
    );
    if (!Array.isArray(this.directories)) this.directories = []; // Handle start when its undefined
    const adding = newDirectories.filter(d => !this.directories.includes(d));
    const removing = this.directories.filter(d => !newDirectories.includes(d));
    if (adding.length || removing.length) {
      if (adding.length) debug('Adding directories %s', adding.join('; '));
      if (removing.length) debug('Removing directories %s', removing.join('; '));
      this.directories = newDirectories;
      if (setState) setState({ directories: this.directories });
    }
  }

  setOpts(...opts) {
    // Extend base class to handle specific derivations of opts
    const oldDirectories = this.directories; // Save old directories
    super.setOpts(...opts); // Just combined and store ops
    // Note at this point this.directories will be set to all of them, which is not what we want.
    // Remove first so that resolveDirectories will report what its actually using
    this.directories = oldDirectories; // and restore as actually want only resolv
    this.resolveDirectories(); // Handle ~/ ./ ../ and expand * or ?? etc
    ['archiveui', 'bookreader', 'epubreader', 'nodemodules']
      .map(d => this[d])
      .forEach(o => o.directory = ConfigController.firstExisting(o.directories)); // Handle ~/ ./ ../ * ?? and find first match
  }

  setupPeriodically(setState) {
    // Re-resolve the directories option every rescanDirectories seconds to see if it has changed;
    // when it changes, resolveDirectories reports the change via setState.
    if (this.rescanDirectories) {
      forever((next) => setTimeout(() => {
        this.resolveDirectories(setState);
        next();
      }, this.rescanDirectories * 1000));
    }
  }

  setAndWriteUser(obj, cb) {
    // Set the configuration in the ConfigManager, and write to user file
    this.setAndWriteUserFile(MirrorConfig.userConfigFile, obj, cb);
  }

  writeUser(cb) {
    // Write user configuration to file
    this.writeUserFile(MirrorConfig.userConfigFile, cb);
  }

  deleteUserTask({ identifier, query }) {
    // Remove task for identifier (handles multi-identifier tasks correctly)
    // NOTE(review): query is currently unused - matching is by identifier only; confirm whether query tasks should also be deletable
    const task = this.findTask({ identifier });
    if (task) {
      if (Array.isArray(task.identifier) && (task.identifier.length > 1)) {
        task.identifier.splice(task.identifier.indexOf(identifier), 1); // Old task - remove identifier
      } else { // Single identifier or array length=1
        this.apps.crawl.tasks.splice(this.apps.crawl.tasks.indexOf(task), 1);
      }
    }
  }

  writeUserTaskLevel({ identifier, level, query }, cb) {
    // Update, or create a new task for an identifier (handles multi-identifier tasks correctly)
    // level === 'none' deletes the task instead; the result is written back to the user config file.
    if (level === 'none') {
      this.deleteUserTask({ identifier, query });
    } else {
      let task = this.findTask({ identifier, query });
      if (!task) {
        ObjectDeeperAssign(this, { apps: { crawl: {} } });
        if (!this.apps.crawl.tasks) {
          this.apps.crawl.tasks = [];
        }
        task = Object.assign({}, identifier ? { identifier } : null, query ? { query } : null);
        this.apps.crawl.tasks.push(task);
      } else if (Array.isArray(task.identifier) && (task.identifier.length > 1)) {
        task.identifier.splice(task.identifier.indexOf(identifier), 1); // Old task - remove identifier
        task = Object.assign({}, task, { identifier }); // New task for just this identifier
        this.apps.crawl.tasks.push(task);
      }
      // By this point this.apps.crawl.tasks[] should have a task {identifier}, possibly with old state i.e. findTask({identifier}) would now succeed
      task.level = level; // Only change level of that task
    }
    this.writeUser(cb); // And write back current state
  }

  findTask({ identifier, query }) {
    // Find and return the task matching identifier or query from config, or undefined if none.
    // Guards against apps.crawl.tasks not existing yet (e.g. a fresh config before any task is written) -
    // previously this threw a TypeError, and writeUserTaskLevel calls it before ensuring the path exists.
    const tasks = (this.apps && this.apps.crawl && this.apps.crawl.tasks) || [];
    return tasks.find(t => (identifier && t.identifier && t.identifier.includes(identifier)) || (query && t.query === query));
  }

  /**
   * Find any task and return crawlInfo (which is the task)
   * @param identifier
   * @param query
   * @param mediatype
   * @returns {identifier, query, search, related } // A task object as in the config.apps.crawl.tasks
   */
  crawlInfo({ identifier = undefined, query = undefined, mediatype = undefined }) {
    /*
    Check if member being crawled and return info suitable for adding into ArchiveMember and usable by the UI
    */
    let task = this.findTask({ identifier, query });
    if (!task) {
      task = {};
    } else {
      const isDetailsOrMore = CrawlManager._levels.indexOf(task.level) >= CrawlManager._levels.indexOf('details');
      const isSearch = query || (mediatype === 'collection'); // TODO-UXLOCAL need to catch searches (which do not use regular identifiers)
      task.search = task.search || (isDetailsOrMore && isSearch && this.apps.crawl.opts.defaultDetailsSearch);
    }
    return task;
  }
}
146 |
MirrorConfig.userConfigFile = '~/dweb-mirror.config.yaml'; // contents overwritten by writeUser or setAndWriteUser
// Default configuration used to initialize the user config file when it does not exist:
// crawl the home page at "details" level, tiling just its top 30 items by downloads.
MirrorConfig.defaultUserConfig = { apps: { crawl: { tasks: [{ identifier: ['home'], level: 'details', search: [{ sort: '-downloads', rows: 30, level: 'tile' }] }] } } };
// config files (later override earlier) note the userConfigFile is always appended
// If this is ever more than one file in defaultConfigFiles then the code in dweb-archive for statusFromConfig will need editing as it assumes userConfigFile is returned in position 1
MirrorConfig.defaultConfigFiles = ['./configDefaults.yaml'];

exports = module.exports = MirrorConfig;
156 |
--------------------------------------------------------------------------------
/ConfigController.js:
--------------------------------------------------------------------------------
1 | // Careful not to introduce too many dependencies in here, as called very early in applications
2 | const os = require('os');
3 | const fs = require('fs'); // See https://nodejs.org/api/fs.html
4 | const path = require('path');
5 | const glob = require('glob');
6 | const debug = require('debug')('dweb-mirror:ConfigController');
7 | const asyncMap = require('async/map');
8 | // const canonicaljson = require('@stratumn/canonicaljson');
9 | const yaml = require('js-yaml'); // https://www.npmjs.com/package/js-yaml
10 | // noinspection JSUnusedLocalSymbols
11 | const { ObjectDeeperAssign } = require('@internetarchive/dweb-archivecontroller');
12 |
class ConfigController {
  /*
  A set of tools to manage and work on configuration data structures and to map to storage or UI

  Note the API for this is in flux as build the first few use cases

  Note this makes extensive use of the fact that the last of the ...objs can be edited, set back with setOpts and leave this changed as expected.

  Nothing in ConfigManager is specific to Mirror ... the Mirror specific stuff is in the MirrorConfig subclass ...

  Do not use this class directly, build a subclass that provides at least writeUser(obj,cb) and initializeUserConfig(cb)

  */
  constructor(...objs) {
    /*
    Create a new config structure from one or more config objects.
    The fields in later arguments (at the root, or nested levels) over-write the previous ones.
    See config file for structure of config
    */
    this.configOpts = objs; // For info query
    this.setOpts(...objs);
  }

  static initializeUserConfigFile(userConfigFile, defaultUserConfig, cb) {
    /*
    Read the user config file, creating and initializing it if it cannot be read.
    userConfigFile Path (can be relative) to user config file, that may not exist
    defaultUserConfig Initial configuration (as object) to set the file to if it does not exist
    cb(err, config) config is the parsed file contents, or defaultUserConfig if the file had to be created
    */
    const f = this.resolve(userConfigFile);
    this.readYaml(f, { silentReadFailure: true }, (err, res) => {
      if (err) {
        // Read failed (typically the file is missing) - write the defaults out and return them.
        // Note the inner err (from writeYaml) deliberately shadows the read error.
        this.writeYaml(f, defaultUserConfig, (err) => {
          if (err) debug('Unable to initialize User config file %s', f);
          cb(err, defaultUserConfig);
        });
      } else {
        cb(null, res);
      }
    });
  }

  static initializeUserConfig(cb) {
    // Subclass must override - see MirrorConfig.initializeUserConfig
    cb(new Error('ConfigManager must be subclassed to provider initializeUserConfig'));
  }

  static new(filenames, cb) {
    /*
    Create a new config by reading YAML from filenames in order, (later overriding earlier)
    Requires the subclass (e.g. MirrorConfig) to implement initializeUserConfig, the results of which override that in the filenames

    filenames optional ordered array of paths to possible config files (they may be missing), ~/ ./ * etc are expanded (I'm not sure about ../)
    cb(err, config) Called with an instance of the subclass (e.g. MirrorConfig)
    */

    asyncMap(this.resolves(filenames),
      (filename, cb2) => {
        this.readYaml(filename, { silentReadFailure: true }, (err, res) => cb2(null, res)); // Ignore err, and res should be {} if error
      },
      (err, configobjs) => { // [ {...}* ]
        if (err) { cb(err, null); } else {
          this.initializeUserConfig((err, userConfig) => {
            if (err) { cb(err, null); } else {
              const config = new this(...configobjs, userConfig); // `new this` so the subclass constructor runs
              // noinspection JSUnresolvedVariable
              debug('config summary: directory:%o archiveui:%s bookreader:%s epubreader %s',
                config.directories, config.archiveui.directory, config.bookreader.directory, config.epubreader.directory);
              cb(null, config);
            }
          });
        }
      });
  }

  static resolve(v) { // Handle ~ or . or .. in a path
    // Return a resolved filename, expanding ./ ~/ and possibly ../
    // noinspection JSUnresolvedVariable
    return (v.startsWith('~/') ? path.resolve(os.homedir(), v.slice(2)) : path.resolve(process.cwd(), v));
  }

  static resolves(vv) { // TODO make async and pass a cb
    // Return an array of resolved filenames, this can also expand `*` etc
    // Note glob.sync does synchronous filesystem access - can be slow on large trees
    return [].concat(...vv.map(v => this.resolve(v)) // Handle ~ or . or ..
      .map(p => glob.sync(p))); // And expand * etc (to an array of arrays, flattened by the concat)
  }

  static firstExisting(arr) {
    // Find the first of arr that exists, args can be relative to the process directory .../dweb-mirror
    // returns undefined if none found
    // noinspection JSUnresolvedVariable
    return this.resolves(arr).find(p => fs.existsSync(p));
  }

  setOpts(...opts) {
    /*
    Set some fields of configuration from passed object,
    it expands paths such as ~/foo and ./foo where appropriate.
    Note this currently overwrites anything at the path, but may be modified to use ObjectDeeperassign in future.
    */
    ObjectDeeperAssign(this, ...opts);
    // This is subclassed in MirrorConfig to handle specific derivations
  }

  static readYamlSync(filename) {
    /*
    Read and return YAML from filename (synchronously).
    Never throws - logs and returns {} on failure to read or failure to parse.
    */
    try {
      return yaml.load(fs.readFileSync(this.resolve(filename), 'utf8'));
    } catch (err) {
      debug('Error reading user configuration: %s', err.message);
      return {}; // Caller is free to ignore err and treat {} as an empty set of config params
    }
  }

  static readYaml(filename, { silentReadFailure = false } = {}, cb) {
    /*
    Read YAML from filename and return via cb(err, res),
    res is {} on any error (read or parse).
    silentReadFailure: if true then do not log a failure to read (parse failures are always logged)
    */
    fs.readFile(filename, 'utf8', (err, yamlstr) => {
      if (err) {
        if (!silentReadFailure) {
          debug('Unable to read %s: %s', filename, err.message);
        }
        cb(err, {});
      } else {
        try {
          const o = yaml.load(yamlstr);
          // Guard the cb call so an exception thrown inside the callback is not misreported as a parse error
          try { cb(null, o); } catch (err) { debug('ERROR: Uncaught err in readYaml cb %o', err); }
        } catch (err) {
          debug('Unable to parse yaml: %s', err.message);
          cb(err, {});
        }
      }
    });
  }

  userConfig() {
    // Return the last configuration object - by convention the "user" one that gets written back
    return this.configOpts[this.configOpts.length - 1]; // Last configOpts is the "user" one that gets written
  }

  setAndwriteUser(obj, cb) {
    // Subclass must override (NOTE(review): MirrorConfig implements this as setAndWriteUser - capital W - so this stub may never be hit; confirm naming)
    cb(new Error('ConfigManager must be subclassed to provide setAndwriteUser'));
  }

  writeUser(obj, cb) {
    // Subclass must override - see MirrorConfig.writeUser
    cb(new Error('ConfigManager must be subclassed to provide writeUser'));
  }


  writeUserFile(filename, cb) {
    // Write user configuration (the last configOpts) to filename, cb(err)
    ConfigController.writeYaml(ConfigController.resolve(filename), this.userConfig(), cb);
  }

  setAndWriteUserFile(filename, obj, cb) {
    // Set local configuration in ConfigManager and write to user file, cb(err)
    // obj to replace userconfig
    // filename to store yaml ( ~ ./* ../* etc accepted)
    this.userconfig = obj;
    this.setOpts(obj); // Merge into combined options
    // By now sendInfo will send correct result back
    // And write to user's file
    ConfigController.writeYaml(ConfigController.resolve(filename), obj, cb);
  }

  static writeYaml(filename, obj, cb) {
    // Write yaml version of an object to a file, cb(err)
    try {
      const y = yaml.dump(obj);
      fs.writeFile(filename, y, { encoding: 'utf8' }, (err) => {
        if (err) { debug('Unable to write yaml to %s: %s', filename, err.message); }
        cb(err);
      });
    } catch (err) { // Typically a yaml dump error
      debug('ERROR unable to write yaml from %O', obj);
      cb(err);
    }
  }
}

exports = module.exports = ConfigController;
199 |
--------------------------------------------------------------------------------
/HashStore.js:
--------------------------------------------------------------------------------
1 | // This is level 5, there are major differences in level 8
2 | // see https://github.com/Level/level/blob/master/UPGRADING.md
3 | // The changes should be contained here, but might not be trivial and its unclear what if any value gained.
4 | const level = require('level');
5 | const debug = require('debug')('dweb-mirror:HashStore');
6 | const each = require('async/each');
7 | const waterfall = require('async/waterfall');
8 |
9 | /**
10 | * A generic Hash Store built on top of level,
11 | * table/key/value triples.
12 | * Note this could probably build on top of Redis or Gun as well -
13 | * Redis might end up too large for memory, and actually want local item store, not global one
14 | * There is one of these at the top directory of each cache directory.
15 | */
16 | /**
17 | * Common parameters across all methods
18 | * table name of table (by convention, in dweb-mirror we use the mapping as the table name e.g. `sha1.filepath`
19 | * key name of key (by convention its the left side of the table name e.g. the sha1.
20 | * For some functions (put) it may be an object mapping keys to val in which case val is ignored
21 | * val value to store
22 | */
23 | class HashStore {
24 | /**
25 | * @param config = { dir } Prefix for Directory where data stored (actual directories are DIR followed by TABLE so this should typically end with a punctuation or /}
26 | * @returns {HashStore}
27 | */
28 | constructor(config) {
29 | this.config = config;
30 | this.tables = {}; // Caches pointers to leveldb objects that manage tables (each being a stored in a directory)
31 | return this;
32 | }
33 |
34 | _tablepath(table) { // Return the file system path to where we have, or will create, a table
35 | return `${this.config.dir}${table}`;
36 | }
37 |
38 | _db(table, retries = undefined, cb) {
39 | if (typeof retries === 'function') { cb = retries, retries = 10; }
40 | if (!this.tables[table]) {
41 | const tablepath = this._tablepath(table);
42 | level(tablepath, {}, (err, res) => {
43 | if (err) {
44 | debug('Hashstore failed to open db at %s: %s', tablepath, err.message);
45 | if (retries) {
46 | setTimeout(() => this._db(table, --retries, cb), 100);
47 | } else {
48 | cb(err);
49 | }
50 | } else {
51 | this.tables[table] = res;
52 | cb(null, res);
53 | }
54 | });
55 | } else {
56 | cb(null, this.tables[table]); // Note file might not be open yet, if not any put/get/del will be queued by level till its ready
57 | }
58 | }
59 |
60 | destroy(table, cb) {
61 | level.destroy(this._tablepath(table), cb);
62 | }
63 |
64 | destroyAll(cb) {
65 | each(Object.keys(this.tables), (table, cb2) => this.destroy(table, cb2), cb);
66 | }
67 |
68 | put(table, key, val, cb) {
69 | /*
70 | Set a key to a val for a specific table.
71 | val = any valid persistent value acceptable to level (not sure what limits are)
72 | key = any valid key for level (not sure what limits are)
73 | cb(err)
74 | */
75 | if (cb) { try { f.call(this, cb); } catch (err) { cb(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
76 | function f(cb2) {
77 | debug('%s.%o <- %o', table, key, val);
78 | waterfall([
79 | cb3 => this._db(table, cb3),
80 | (db, cb3) => {
81 | if (typeof key === 'object') {
82 | db.batch(Object.keys(key).map(k => ({ type: 'put', key: k, value: key[k] })), cb3);
83 | } else {
84 | db.put(key, val, cb3);
85 | }
86 | }
87 | ], (err) => {
88 | if (err) {
89 | debug('put %s %s <- %s failed %s', table, key, val, err.message);
90 | }
91 | cb2(err);
92 | });
93 | }
94 | }
95 |
96 | /**
97 | * Retrieve value stored to key, returns `undefined` if not found (not an error)
98 | */
99 | async get(table, key, cb) {
100 | if (cb) { try { f.call(this, cb); } catch (err) { cb(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
101 | function f(cb2) {
102 | // This is similar to level.get except not finding the value is not an error, it returns undefined.
103 | // noinspection JSPotentiallyInvalidUsageOfClassThis
104 | if (!table || !key) {
105 | debug('Error: .get requires table (%o) and key (%o)', table, key);
106 | cb(new Error('.get requires table and key'));
107 | }
108 | return waterfall([
109 | cb3 => this._db(table, cb3),
110 | (db, cb3) => db.get(key, cb3)
111 | ], (err, val) => {
112 | if (err && (err.type === 'NotFoundError')) { // Undefined is not an error
113 | debug('get %s %s failed %s', table, key, err.message);
114 | cb2(null, undefined);
115 | } else {
116 | cb2(null, val);
117 | }
118 | });
119 | }
120 | }
121 |
122 | /* UNUSED - make async with callback if going to use it
123 | //Delete value stored at the key, future `get` will return undefined.
124 | async del(table, key) {
125 | if (typeof key === "object") { // Delete all keys in object
126 | await this._db(table).batch(Object.keys(key).map(k => {return {type: "del", key: k};}));
127 | } else {
128 | await this._db(table).del(key);
129 | }
130 | }
131 | */
132 | /* UNUSED - make async with callback if going to use it
133 | Creates a stream, that calls cb(key, value) for each key/value pair.
134 | async map(table, cb, {end=undefined}={}) {
135 | // cb(data) => data.key, data.value
136 | // Returns a stream so can add further .on
137 | // UNTESTED
138 | return this._db(table)
139 | .createReadStream()
140 | .on('data', cb );
141 | }
142 | */
143 | async keys(table, cb) {
144 | if (cb) { try { f.call(this, cb); } catch (err) { cb(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
145 | function f(cb) {
146 | const keys = [];
147 | // noinspection JSPotentiallyInvalidUsageOfClassThis
148 | const db = this._db(table, (err, db) => {
149 | if (err) {
150 | debug('keys %s failed', table);
151 | cb(err);
152 | } else {
153 | db
154 | .createKeyStream()
155 | // Note close comes after End
156 | .on('data', (key) => keys.push(key))
157 | .on('end', () => { debug('%s keys on end = %o', table, keys); cb(null, keys); }) // Gets to end of stream
158 | // .on('close', () => // Gets to end of stream, or closed from outside - not used as get "end" as well
159 | .on('error', (err) => { console.error('Error in stream from', table); cb(err); });
160 | }
161 | });
162 | }
163 | }
164 |
165 | // noinspection JSUnusedGlobalSymbols
166 | static async test() {
167 | try {
168 | this.init({ dir: 'testleveldb.' });
169 | await this.put('Testtable', 'testkey', 'testval');
170 | let res = await this.get('Testtable', 'testkey');
171 | console.assert(res === 'testval');
172 | await this.put('Testtable', { A: 'AAA', B: 'BBB' });
173 | res = await this.get('Testtable', 'A');
174 | console.assert(res === 'AAA');
175 | res = await this.get('Testtable', 'B');
176 | console.assert(res === 'BBB');
177 | res = await this.del('Testtable', 'A');
178 | res = await this.get('Testtable', 'A');
179 | console.assert(res === undefined);
180 | res = await this.keys('Testtable');
181 | console.assert(res.length === 2);
182 | // Test using callback
183 | res = await this.keys('Testtable', (err, res) => {
184 | console.assert(res.length === 2);
185 | });
186 | // Now test batches
187 | // Now test map
188 | } catch (err) {
189 | console.log('Error caught in HashStore.test', err);
190 | }
191 | }
192 | }
193 |
// Export the HashStore class (both CommonJS styles for compatibility)
exports = module.exports = HashStore;
195 |
--------------------------------------------------------------------------------
/ArchiveMemberPatched.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Monkey patches dweb-archivecontroller,
3 | * Note cant merge into dweb-archivecontroller as wont work in browser; and cant create subclass as want everywhere e.g. archivefile.fetch_metadata is used to use the cache
4 | */
5 | /* eslint-disable func-names, no-use-before-define, consistent-return */
6 | // func-names disabled because monkeypatching, consistent-return, no-use-before-define disabled because of promisify pattern
7 |
8 | // Generic NPM modules
9 | const path = require('path');
10 | const canonicaljson = require('@stratumn/canonicaljson');
11 | const debug = require('debug')('dweb-mirror:ArchiveMemberPatched');
12 | const each = require('async/each');
13 |
14 | // Other IA repos
15 | const { ArchiveMember, gateway, ObjectFilter } = require('@internetarchive/dweb-archivecontroller'); // Note also patches Object.filter
16 | // Other files in this repo
17 | const MirrorFS = require('./MirrorFS.js');
18 |
19 | /**
20 | * Common arguments across all API functions
21 | *
22 | * config A MirrorConfig object
23 | * copyDirectory points at top level of a cache where want a copy
24 | * relFilePath path to file or item inside a cache IDENTIFIER/FILENAME
25 | * noCache ignore anything in the cache - forces refetching and may cause upstream server to cache it TODO-API check this is not obsoleted by separate read and write skipping
26 | * noStore do not store results in cache
27 | * skipFetchFile as an argument causes file fetching to be suppressed (used for testing only)
28 | * skipNet do not try and use the net for anything
29 | * wantStream Return results as a stream, just like received from the upstream.
30 | * wantSize Return the size as a byte-count.
31 | * copyDirectory Specify alternate directory to store results in rather than config.directories[0]
32 | * darkOk True if a dark item is a valid response (if false, and item is dark will throw an error)
33 | * cb(err, res) Unless otherwise documented callbacks return an error, (subclass of Error) or null, and optional return data.
34 | * Some functions also support an absent cb as returning a Promise, otherwise cb is required
35 | * feel free to add Promise support to any function lacking it, search for "Promise pattern v2" for examples of how to do this consistently.
36 | */
37 |
38 |
39 | // SEE ALMOST-SAME-CODE-NAMEPART in ArchiveMember._namepart and ArchiveItem._namepart
40 | // noinspection JSUnresolvedVariable
ArchiveMember._namepart = function ({ identifier, query, sort = [] }) {
  // The name used for the directory and file prefixes: normally the item
  // identifier, or a filesystem-safe encoding of the query for saved searches.
  if (identifier) {
    return identifier;
  }
  if (query) {
    // Goal: a string that indicates what it is, is filesystem safe, and does not
    // map similar-but-different queries to the same string.
    // encodeURIComponent does a reasonable job except for '*', which is escaped explicitly.
    const raw = `_SEARCH_${query}_${sort.join('_')}`;
    return encodeURIComponent(raw).replace(/\*/g, '%2A');
  }
  // Neither identifier nor query - should be caught at a higher level to decide not to use cache
  return undefined;
};
54 |
55 |
ArchiveMember.read = function ({
  identifier = undefined, query = undefined, sort = undefined, copyDirectory = undefined
}, cb) {
  /*
  Read member info for an item
  identifier: Where to look - can be a real identifier or pseudo-one for a saved search
  cb(err, data structure from file)
  */
  const namepart = this._namepart({ identifier, query, sort });
  const part = 'member';
  const relFilePath = path.join(namepart, `${namepart}_${part}.json`);
  MirrorFS.readFile(relFilePath, { copyDirectory }, (err, jsonstring) => {
    if (err) {
      cb(err); // Not logging as not really an err for there to be no file, as will read
    } else {
      let o;
      try {
        o = canonicaljson.parse(jsonstring); // No reviver function, which would allow postprocessing
      } catch (err1) {
        // It is on the other hand an error for the JSON to be unreadable
        debug('Failed to parse json at %s: part %s %s', namepart, part, err1.message);
        cb(err1);
        return; // BUGFIX: previously fell through and invoked cb a second time with (null, undefined)
      }
      cb(null, o);
    }
  });
};
83 | /**
84 | * this.crawl = result of config.crawlInfo
85 | * @param config MirrorConfig object
86 | * @param cb
87 | */
ArchiveMember.prototype.addCrawlInfo = function ({ config }, cb) {
  // Attach the crawl status for this member, as computed by the config.
  this.crawl = config.crawlInfo({ identifier: this.identifier });
  cb(null);
};
ArchiveMember.addCrawlInfo = function (arr, { config = undefined, copyDirectory = undefined } = {}, cb) { // Should work on an [ArchiveMember*]
  // Add crawl info to each member in parallel; cb fires once all complete (or on first error).
  each(
    arr,
    (member, done) => { member.addCrawlInfo({ config, copyDirectory }, done); },
    cb
  );
};
95 | /**
96 | * Read member info for an item from the cache.
97 | */
ArchiveMember.prototype.read = function ({ copyDirectory }, cb) {
  // Delegate to the static reader using this member's own identity fields.
  const { identifier, query, sort } = this;
  ArchiveMember.read({ identifier, query, sort, copyDirectory }, cb);
};
103 |
104 |
/**
 * Save a member (fields from the `savedkeys` list) as a `IDENTIFIER_member.json` file.
 *
 * @param copyDirectory  optional alternate cache directory to write into
 * @param cb             optional cb(err, this); if omitted a Promise is returned
 *                       ("Promisify pattern v2")
 */
// noinspection JSUnresolvedVariable
ArchiveMember.prototype.save = function ({ copyDirectory = undefined } = {}, cb) {
  // Promisify pattern v2: run f with the supplied callback, otherwise wrap f in a Promise.
  if (cb) { try { f.call(this, cb); } catch (err) { cb(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
  function f(cb0) {
    // Refuse to save when neither an explicit directory nor any configured cache exists.
    if (!(copyDirectory || MirrorFS.directories.length)) {
      cb0(new Error('Nowhere to save to'));
    } else {
      const namepart = this.identifier; // Its also in this.item.metadata.identifier but only if done a fetch_metadata
      const savedkeys = gateway.url_default_fl; // Field list defining which keys get persisted
      // noinspection JSUnusedLocalSymbols
      const jsonToSave = canonicaljson.stringify(ObjectFilter(this, (k, unusedV) => savedkeys.includes(k)));
      const relFilePath = path.join(namepart, namepart + '_member.json');
      MirrorFS.writeFile({ relFilePath, copyDirectory }, jsonToSave, (err) => {
        if (err) {
          debug('Unable to write metadata to %s: %s', relFilePath, err.message); cb0(err);
        } else {
          cb0(null, this); // Return self so calls can be chained
        }
      });
    }
  }
};
130 |
131 |
ArchiveMember.prototype.saveThumbnail = function ({
  skipFetchFile = false, noCache = false, wantStream = false, copyDirectory = undefined
} = {}, cb0) { // TODO-API
  /*
  Save a thumbnail to the cache, note must be called after fetch_metadata
  wantStream      true if want stream instead of ArchiveItem returned
  skipFetchFile   true if should skip net retrieval - used for debugging
  noCache         true if should not check cache
  resolve or cb(err, res) this on completion or stream on opening
  */
  // Promisify pattern v2: run f with the supplied callback, otherwise wrap f in a Promise.
  if (cb0) { try { f.call(this, cb0); } catch (err) { cb0(err); } } else { return new Promise((resolve, reject) => { try { f.call(this, (err, res) => { if (err) { reject(err); } else { resolve(res); } }); } catch (err) { reject(err); } }); } // Promisify pattern v2
  function f(cb) {
    const namepart = this.identifier; // Its also in this.metadata.identifier but only if done a fetch_metadata

    if (!namepart) {
      // No identifier means nowhere to cache to - treat as a no-op success.
      cb(null, this);
    } else {
      // TODO-THUMBNAILS use new ArchiveItem.thumbnailFile that creates a AF for a pseudofile
      const relFilePath = path.join(this.identifier, '__ia_thumb.jpg'); // Assumes using __ia_thumb.jpg instead of identifier_itemimage.jpg
      const debugname = namepart + '/__ia_thumb.jpg';
      // Cache the thumbnail locally and/or stream it through, per the wantStream flag.
      MirrorFS.cacheAndOrStream({
        relFilePath,
        skipFetchFile,
        wantStream,
        debugname,
        noCache,
        copyDirectory,
        urls: ['https://archive.org/services/img/' + this.identifier],
      }, (err, streamOrUndefined) => {
        if (err) {
          debug('Unable to cacheOrStream %s', debugname);
          cb(err);
        } else {
          cb(null, wantStream ? streamOrUndefined : this);
        }
      });
    }
  }
};
171 |
// Export the (monkey-patched in place) ArchiveMember class
exports = module.exports = ArchiveMember;
173 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## CHANGELOG
2 |
3 | ## Known issues - and features for next release or two
4 |
5 | (See [dweb-mirror/issues](https://github.com/internetarchive/dweb-mirror/issues) for more detail)
6 |
7 | ## Release 0.2.x
8 | * 0.2.97
9 | * fix version of level and sharp as not building otherwise
10 | * 0.2.96
11 | * depend on readable-stream-clone from nodejs not my fork
12 | * 0.2.95
13 | * yaml.safeDump -> yaml.dump
14 | * 0.2.94
15 | * Move to latest dweb-archivecontroller 0.2.16 with fix for elasticsearch error messages see issue#364
16 | * 0.2.93
17 | * Remove or Upgrade a bunch of dependencies to current versions - see #issue374 for outstanding
18 | * 0.2.92
19 | * Move to dweb-archive 0.2.29 and dweb-archivecontroller 0.2.14 but not bookreader 5.x.x
20 | * 0.2.91:
21 | * Fix some installation problems;
22 | * includes NOMAD code to allow easier deployment of instance on IA
    * Make sure use latest sharp (compatibility issues)
24 | * Installer update to handle newer (arm64) Macs
25 | * Depends on fix in DwebTransports 0.2.22 for issue#389
26 | * id= is required by BookReaderImages.php see issue#372
27 | * 0.2.90: Fix git:// to https://
28 | * 0.2.89: Fix installation issue with yarn
29 | * 0.2.88: Fix install issues and dependencies
30 | * 0.2.86: itemid => identifier; pickup dweb-archive 0.2.27 and dweb-archivecontroller 0.2.11
31 | * 0.2.85: pickup dweb-archive 0.2.26 and dweb-archivecontroller: 0.2.10
32 | * 0.2.84: pickup dweb-archive 0.2.25
33 | * 0.2.83: Catch double header issue if redir.html is present
34 | * 0.2.82:
35 | * Support CrawlConfig dropdown
36 | * metadata/local etc;
37 | * use dweb-cors to access data nodes
38 | * 0.2.79: archivecontroller 0.2.8 to get query build fix
39 | * 0.2.77: bug when no metadata for members; OLIP docs
40 | * 0.2.76: CrawlManager bugs; opensearch;
41 | * 0.2.75: Crawls / pages / palmleaf; installation improvements (OLIP, Mediawiki)
* 0.2.74: Oops - catch typo introduced in linting
43 | * 0.2.73: Bump dependency on dweb-transports to 0.2.17 and dweb-archive-dist 0.2.18
44 | * 0.2.71:
45 | * Mediawiki installation script / instructions now work for IIAB platform
46 | * Docker improved to support OLIP
47 | * Fix bug in search
48 | * Handle page leafs for BookReaderPreview better
49 | * 0.2.70: Implement opensearch for OLIP, debug Dockerfile and mediawiki installs
* 0.2.69: Don't force protocol to be http in mirrorHttp (breaks www-dev-mirror)
51 | * 0.2.67:
52 | * Add config to point button at local palmleafwiki
53 | * Mediawiki: Debugging ArchiveOrgAth
54 | * Aligning Dockerfile with www-dweb-mirror Dockerfile for Kubernetes
55 | * 0.2.66: Support redir.html file if used
56 | * 0.2.65:
57 | * Mediawiki installation
    * Dockerfile tweaks
59 | * 0.2.64:
60 | * Add eslint dependencies
61 | * Routing changes including contenthash obsolescence
62 | * Installation including Dockerfile now working
63 | * BookReader debugging for mediawiki
64 | * Refactor mirrorHttp to combine functions , and remove /archive/ redirects https://github.com/internetarchive/dweb-mirror/issues/242
65 | * 0.2.63: More epub support
66 | * 0.2.62: Use routed instead of Naming; support epub, book urls and iiif subset; error handling on no disk;
67 | * 0.2.61: Remove dweb.archive.org dependencies and major naming refactor; stream error; installation if npm protected;
68 | * 0.2.61: fetch_query cache fallback; Save on Local
69 | * 0.2.60: Major naming refactor; support magnet links
70 | * 0.2.59: Support for enhanced media player and better offline support, refactor sendFileFromDir
71 | * 0.2.58: Support radio player and speech_vs_music_asr.json
72 | * 0.2.57: support metadata/local
73 | * 0.2.56: Dockerfile; Refactor Installers; Save on collections/local/search; Epub partial; I18n dynamic files; remove wrtc dependency; EDGE CASES: local, specialidentifiers, related; BUGS: contenthash; deleting bad hashes;
74 | * 0.2.55: Fix path to archiveui, Dockerfile rework
75 | * 0.2.54: Dependency moved from dweb-archive to dweb-archive-dist; Unified install.sh and update Installation docs to reflect.
76 | * 0.2.53: Slight improvement on re-runability of install_dev.sh
77 | * 0.2.52: Catch error handling around page resolutions; Crawl Indicators for search; workaround embed API bug; bug fixes on is_dark; crawl indicators;
78 | * 0.2.51: Move Requires back into DwebTransports
79 | * 0.2.50: Explicitly require transports only if in config; Bookreader resolution optimisation;
80 | * 0.2.50: Remove dweb-objects dependency,
* 0.2.49: dev installer - add webpack; MirrorFS change preferred directory; simplify rachel instructions
82 | * 0.2.48: config.directories fix; hashstores bug; Rachel install; error return from related
83 | * 0.2.47: install_armbian.sh
84 | * 0.2.46: install_dev.sh
85 | * 0.2.45: Handle /includes so do not have to edit less for font directories
* 0.2.44: level to v5 (for compatibility with node v12)
87 | * 0.2.43: yarn upgrade, and some Installation instructions
88 | * 0.2.42: Fix sort handling; Look for disk changes; MDNS on by default; Update dependencies; Installation docs upgrade
89 | * 0.2.40: Crawl error handling, zero length files, bookreader location, texts subtypes, crawl when offline
* 0.2.39: USB mounting; exportfiles fix; Rachel installation notes
91 | * 0.2.38: is_dark improvements; first install; docs; rachel
92 | * 0.2.37: yarn upgrade; sort in searches; write-to-usb; carousel; cleanup URLs; MDNS;
93 | * 0.2.36: Turn off IPFS install by default, dist.ipfs.io servers are too slow and unreliable. See https://github.com/ipfs/ipfs-update/issues/105
94 | * 0.2.35: download count improvements, and copyDirectory refactor
95 | * 0.2.34: Prettier byte; edge cases with _extra.json and downloaded crawling and downloaded indicators on mediatype=texts
96 | * 0.2.33: Split members>membersFav,membersSearch; /info now has Transport info; install_ipfs fixed; downloaded on searches; adding to live crawls; ef8d8f0 Mitra Ardron 28Jun2019 at 6:54 PM
97 | * 0.2.31: Support for expanded downloaded indicator (counts and sizes), crawling single files
98 | * 0.2.30: Support for /download/
99 | * 0.2.29: Improvements to MirrorFS/maintenance (to properly handle subdirectories) and HashStore/_db (to fix locks)
100 | * 0.2.27: Add Local and Reload buttons to DwebNav
101 | * 0.2.26: Crawl controls - URL support for UI in dweb-archive
102 | * 0.2.25: Support for crawl/download on related; support for reload;
103 | * 0.2.24: Correct default user config
104 | * 0.2.23: Update default UserConfig
105 | * 0.2.22: home page; not stopping server didnt start; better HashStore error handling; simplelist/collection/search_collection; start on /local
106 | * 0.2.21: USING.md and some bug fixes (with 0.2.20), do not enable IPFS by default
107 | * 0.2.20: Installation doc reorganization; better manage http server; support crawl and downloaded fields;
108 | * 0.2.19: Installer for bookreader
109 | * 0.2.18: Initialize user config file if not there
110 | * 0.2.16: Improved behavior standalone or when cant see gateway, installation documentation
111 | * 0.2.15: bookreader bug when not cached and bug in installer
112 | * 0.2.14: Add configuration for IIAB (partial)
113 | * 0.2.14: Use new playlist api
114 | * 0.2.13: Refactor to move seed support to dweb-transports reqs dweb-transport >= v0.1.40
115 | * 0.2.12: Merge mirrorHttp and crawl into internetarchive
116 | * 0.2.12: Refactor to remove subclasses of ArchiveMember reqs archivecontroller >= v0.1.51
117 | * 0.2.11: Better fully offline handling of relateditems and metadata for files
118 | * 0.2.10: Bookreader working offline; Improved installation documentation;
119 | * 0.2.9: Bug fixes including not retrying local IPFS if not installed, and fix to not require a package that was moved
120 | * 0.2.8: Bug fix
121 | * 0.2.7: Bookreader support
122 | * 0.2.6: Recognize default location for IIAB USB sticks /media/pi/*
123 | * 0.2.5: Oops - was depending on fixed version of dweb-transports
124 | * 0.2.4: Move transport config into YAML; IPFS fetching now supported
125 | * 0.2.4: Fix bug in crawling "all" for querys
126 | * 0.2.3: Simple button to change config crawl level for item/collection reqs dweb-archive v0.1.49
127 | * 0.2.2: Switch dependencies to our own repo's to npm releases (away from github)
128 | * 0.2.1: Tidy up IPFS, GO, install; and platform specific docs & installs for Rachel3+ (which now includes IPFS)
129 | * 0.2.1: Working on Raspberry Pi 3 with IPFS
130 | * 0.2.0: Integrate seeding to IPFS and IPFS installation
131 | * 0.2.0: Integrate installation process for Rachel3+ - still not perfect but works though doesnt support IPFS yet.
132 |
133 | ### Minor releases
134 |
135 | * 0.1.2: Support multiple config files in yaml, [#88](https://github.com/internetarchive/dweb-mirror/issues/88)(minor)
136 | * 0.1.2: Fix bug installing with yarn
137 | * 0.1.1: Added support for multiple cache directories
138 | * 0.1.1: Added support for "copyDirectory" to support cloning to a USB for example.
139 | * 0.1.1: Support for search collections that have "identifier:xxx*" as the query.
140 |
--------------------------------------------------------------------------------
/INSTALLATION-iiab-rpi.md:
--------------------------------------------------------------------------------
1 | # Installation instructions for dweb-mirror on IIAB on Raspberry Pi 3 or 4
2 |
If you are not installing dweb-archive+IIAB on a Raspberry Pi then one of these documents
4 | will be much easier to follow.
5 |
6 | * [INSTALLATION.md](./INSTALLATION.md)
7 | for general installation instructions.
8 | * [INSTALLATION-dev.md](./INSTALLATION-dev.md)
9 | for developers who want to work on this code or on dweb-archive (our offline Javascript UI).
10 | These are tested on Mac OSX, but should work with only minor changes on Linux (feedback welcome).
11 | * [INSTALLATION-iiab-rpi.md](./INSTALLATION-iiab-rpi.md)
  to install Internet In A Box on a Raspberry Pi
13 | * [INSTALLATION-iiab-olip.md](./INSTALLATION-olip-rpi.md)
  to install OLIP on a Raspberry Pi
15 | * [INSTALLATION-rachel.md](./INSTALLATION-rachel.md)
16 | for Rachel on their own Rachel 3+ (incomplete)
17 |
18 | If anything here doesn't work please email mitra@archive.org
19 | or it would be even more helpful to post a PR on https://github.com/internetarchive/dweb-mirror
20 |
21 | ## See also
22 | * [README.md](./README.md) for more general information
23 | * [issue #111](https://github.com/internetarchive/dweb-mirror/issues/111) for meta task for anything IIAB.
24 |
25 | ## Step 1 Initial setup - getting Raspbian
26 |
27 | If your Raspberry Pi comes with Raspbian you are in luck, skip to Step 1B,
28 | otherwise if it comes with NOOBS (as most do now) you'll need to replace it with Raspbian.
29 |
30 | Internet in a Box's site is short on the initial details, especially if your RPi comes with NOOBS as mine did.
31 | So this is what I did. (Edits welcome, if your experience differed)
32 |
33 | * Downloaded Raspbian [Raspbian](https://www.raspberrypi.org/downloads/raspbian/) to your laptop (~1GB)
34 | * Any of the distributions should work - I test on the Desktop version
35 | * On a Mac:
36 | * downloaded [Etcher](https://www.balena.io/etcher/) (100Mb)
    * Run Etcher (its supposed to be able to use the zip, though for this test we used the .img from expanding the zip), selecting a fresh 16GB SD card as the destination
38 | * On Windows or Linux,
39 | * I'm not sure the appropriate steps instead of Etcher.
40 | * Inserted into Raspbian 3 or 4, and powered up with Kbd and HDMI and Mouse inserted.
41 | * If at all possible insert Ethernet, otherwise it will work over WiFi with some extra steps.
42 | * Power it up
43 | * It prompted me for some getting started things,
44 | * Accepted "Next to get started" though I suspect IIAB's comprehensive install gets some of them as well.
45 | * Selected your country, language, keyboard - it shouldnt matter which.
46 | * Changed password since RPis get hacked on default password
47 | * Connected to WiFi (not necessary if you have Ethernet connected)
48 | * It automatically Updated OS - this can take a long time - take a break :-)
49 | * Note that this process failed for me with failures of size and sha, or with timeouts,
50 | but a restart, after the prompts for password etc,
51 | got me to a partially completed download so I did not have to start from scratch
52 | * You might want to ... Menu/Preferences/Config / Set display to highest resolution
53 | * You probably want `Menu/Raspberry Pi Configuration/Interfaces/SSH enable` so that you can SSH
54 | into the box rather than use attached keyboard and screen.
55 |
56 | ## Step 1B Workaround for Raspbian bug
57 | Raspbian has a bug that requires a patch until they push it to a new release.
58 | It looks from https://github.com/raspberrypi/linux/issues/3271 like you need to do
59 | ```
60 | sudo rpi-update
61 | ```
62 | This should only be applicable until the Raspbian available at
63 | https://www.raspberrypi.org/downloads/raspbian/
64 | is dated newer than September 2019
65 |
66 | ## Step 2 Install Internet In A Box
67 |
68 | Note its strongly recommended to connect your RPi to the Ethernet, rather than WiFi due to both to speed,
69 | and some bugs in the IIAB installer
70 |
71 | Internet Archive is in the IIAB distribution.
72 |
73 | Open a terminal window.
74 |
75 | Run `curl d.iiab.io/install.txt | sudo bash` to install it.
76 |
77 | To enable it either
78 | a) select the `BIG` distribution, in which case Internet Archive is included
79 |
80 | OR
81 |
82 | b) select `MIN` or `MEDIUM`
83 | When prompted to edit `/etc/iiab/local_vars.yml` respond `yes` and set the crucial two lines to:
84 | ```
85 | internetarchive_install: True
86 | internetarchive_enabled: True
87 | ```
88 | and then run `sudo iiab` to continue the installation.
89 |
90 | * Update of OS was quick as it probably duplicated the step in the auto-setup above
* expect the install to fail, and keep running `sudo iiab` to get it to complete.
92 | * It will prompt to reset password from default `iiab-admin/g0admin`
93 | * In theory it enables SSH, but sometimes after the OS upgrade to enable it I've had to:
94 | * login from an attached keyboard,
95 | * Preferences > Raspberry Config > Services > SSH > enable
96 |
97 | #### Check it worked
98 |
99 | In a browser open: `http://box.lan/admin` id=`iiab-admin` pw=`whatever you set password to during install`
100 |
101 | * Note that I've found that `box.lan` does not work as documented, and that on many setups `box.local` is required instead.
102 | See [IIAB Issue#1583](https://github.com/iiab/iiab/issues/1583)
103 |
104 | Now check dweb-mirror was installed by opening `http://box.local:4244`
105 |
106 | Also see [http://wiki.laptop.org/go/IIAB/FAQ] if it failed
107 |
108 | And if you want to run as a local WiFi hotspot (recommended) then from the ssh prompt..
109 | ```
110 | iiab-hotspot-on
111 | ```
112 |
113 | ### 3. Edit configuration
114 |
115 | If you are doing anything non-standard, then you'll need to create and edit
116 | a local configuration file. Otherwise the application will create it the first time its needed.
117 | ```
118 | cd ~/git/dweb-mirror
119 |
120 | cp ./dweb-mirror.config.yaml ${HOME} # Copy sample to your home directory and edit,
121 | ```
122 | and edit `$HOME/dweb-mirror.config.yaml` for now see `configDefaults.yaml` for inline documentation.
123 |
124 | * `directories` if you plan on using places other than any of those in dweb-mirror.config.yaml
125 | (/.data/archiveorg, and any USBs on Rachel3+, NOOBS or IIAB)
126 | * `archiveui/directories` you probably do not need to change this as it will usually guess right,
127 | but it points to the “dist” subdirectory of wherever dweb-archive is either cloned or installed by npm install.
128 | * `apps.crawl` includes a structure that lists what collections are to be installed,
129 | I suggest testing and then editing
130 |
131 | Note that directories specified in the config file can be written using shell or unix conventions such as "~/" or "../".
132 |
133 | ### 4. Test crawling and browsing
134 |
135 | #### Crawling
136 | Crawling will happen automatically, but you can also test it manually.
137 |
138 | From a command line:
139 | ```
140 | cd /opt/iiab/internetarchive//node_modules/@internetarchive/dweb-mirror && sudo ./internetarchive -sc
141 | ```
142 | * starts the HTTP server
143 | * It might take 10-15 seconds to start, be patient
144 | * It should start crawling, and get just a minimal set of icons for the home page.
145 | * the startup is a little slow but you'll see some debugging when its live.
146 | * If it reports `ERROR: Directory for the cache is not defined or doesnt exist`
147 | * then it means you didn't create a directory for it to use as a cache
148 | * the server wants you to do this, so that it doesn't fill a disk somewhere you don't want it to happen
149 | * If you see a message like `Requeued fetch of https://dweb.archive.org/info failed` then it means it cannot see
150 | the archive's servers (on `dweb.archive.org`) so it won't be able to crawl or cache initial material until you
151 | connect to the WiFi or Ethernet.
152 |
153 | Without any other arguments, `crawl` will read a set of files into into the first (already existing) directory
154 | configured in `~/dweb-mirror.config.yaml`
155 | or if there are none there, it will look in its installation directory for `configDefaults.yaml`.
156 |
157 | Look in that directory, and there should be sub-directories appearing for each item, with metadata and/or thumbnails.
158 |
159 | You can safely delete any of the crawled material and it will be re-fetched if needed.
160 |
161 | #### Browsing
162 | * In a browser try going to `http://localhost:4244`
* Or from another machine: `http://archive.local:4244` or `http://<IP-of-server>:4244`
164 | * open http://localhost:4244/details/prelinger?transport=HTTP&mirror=localhost:4244
165 | to see the test crawl.
166 |
167 | If you don’t get a Archive UI then look at the server log
168 | ```
169 | service internetarchive status
170 | ```
171 | Will get the status and most recent lines
172 | ```
173 | journalctl -u internetarchive -f
174 | ```
175 | will watch the log, `Ctrl-C` will end this.
176 |
177 | Look for any “FAILING” log lines which indicate a problem
178 |
179 | Expect to see errors in the Browser log for
180 | * http://localhost:5001/api/v0/version?stream-channels=true - which is checking for a local IPFS server
181 |
182 | Expect, on slower machines/networks, to see no images the first time,
183 | refresh after a little while and most should appear.
184 |
185 | ## 7. Auto-starting
186 | IIAB will start the internetarchive server each time it reboots.
187 |
188 | ## 8. Updating
189 |
190 | The software is frequently revised so its recommended to update, especially if you see any bugs or problems.
191 |
192 | ```
193 | cd /opt/iiab/iiab
194 | git pull
195 | ./runrole --reinstall internetarchive
196 | ```
197 |
--------------------------------------------------------------------------------
/INSTALLATION-dev.md:
--------------------------------------------------------------------------------
1 | # Installation instructions for dweb-mirror development environment
2 |
3 | See [README.md] for more info
4 |
5 | These documents are for people who want to work on code either for dweb-mirror or dweb-archive.
6 | For non developers see [./INSTALLATION.md](./INSTALLATION.md).
7 |
8 | Note these are currently tested on Mac OSX only, I would love someone to test on Linux and submit changes
9 | on the repo (or just send me a corrected version of this file)
10 |
11 |
12 | ## Automatic Installation
13 |
14 | The easiest way to install is to use [./install_dev.sh](./install_dev.sh), the installation script.
15 | If it fails on your platform, it should exit at the failing step, and you can edit it and run it again,
16 | and contribute your improvements.
17 |
18 | ```
19 | curl -o- -L https://unpkg.com/@internetarchive/dweb-mirror/install_dev.sh | bash
20 | ```
21 |
22 | ## Manual Installation
23 |
24 | This will depend on your platform but some hints.
25 |
26 | You will need:
27 | * Node version 10 or later
28 | * Yarn version 1.0.0 or later
29 | * Git
30 | * Npm (the one installed by Node should be fine)
31 |
32 | (The installer gets all these if missing)
33 |
34 | It seems to help package updating etc if you install `node-pre-gyp` and `cmake`
35 | ```
36 | sudo yarn add node-pre-gyp cmake
37 | ```
38 |
39 | You'll need to clone the repositories from Git,
40 | use `lerna` and `yarn` to install them and then crosslink them.
41 |
42 | This is non-trivial to get right which is why we built the installer !
43 |
44 | 1. Clone the repositories
45 | ```
46 | mkdir -p ~/git
47 | cd ~/git
48 | git clone https://github.com/internetarchive/dweb-transports
49 | git clone https://github.com/internetarchive/dweb-archivecontroller
50 | git clone https://github.com/futurepress/epubjs-reader
51 | git clone https://github.com/internetarchive/bookreader
52 | git clone https://github.com/internetarchive/dweb-archive
53 | git clone https://github.com/internetarchive/dweb-mirror
54 | git clone --branch mitra--release https://github.com/internetarchive/iaux
55 | ```
56 |
57 | 2. run yarn install
58 | ```
59 | yarn --cwd dweb-transports install
60 | yarn --cwd dweb-archivecontroller install
61 | yarn --cwd epubjs-reader install
62 | yarn --cwd bookreader install
63 | yarn --cwd dweb-archive install
64 | yarn --cwd dweb-mirror install
65 | yarn --cwd iaux install
66 | ```
67 |
68 | 3. iaux is a multi-repo and needs lerna run
69 | ```
70 | yarn --cwd iaux run lerna bootstrap
71 | yarn --cwd iaux run lerna link
72 | ```
73 |
74 | 4. add each package repository to yarn's links, to make development changes accessible.
75 | If you already have these packages linked, change the steps appropriately.
76 | ```
77 | yarn --cwd dweb-transports link
78 | yarn --cwd dweb-archivecontroller link
79 | yarn --cwd epubjs-reader link
80 | yarn --cwd bookreader link
81 | yarn --cwd iaux/packages/ia-components link
82 | ```
83 |
84 | 5. tell yarn to use the linked development versions
85 | ```
86 | yarn --cwd dweb-archive link @internetarchive/dweb-transports
87 | yarn --cwd dweb-archive link epubjs-reader
88 | yarn --cwd dweb-archive link @internetarchive/bookreader
89 | yarn --cwd dweb-archive link @internetarchive/dweb-archivecontroller
90 | yarn --cwd dweb-archive link @internetarchive/ia-components
91 | yarn --cwd dweb-mirror link @internetarchive/dweb-transports
92 | yarn --cwd dweb-mirror link epubjs-reader
93 | yarn --cwd dweb-mirror link @internetarchive/bookreader
94 | yarn --cwd dweb-mirror link @internetarchive/dweb-archivecontroller
95 | ```
96 |
97 | 6. webpack repos to development versions
98 | ```
99 | yarn --cwd dweb-transports run webpack --mode development
100 | yarn --cwd dweb-archive run webpack --mode development
101 | yarn --cwd epubjs-reader run grunt
102 | ```
103 |
104 | 7. install http-server
105 | ```
106 | yarn global add http-server
107 | ```
108 |
109 | ### 3. Edit configuration
110 |
111 | If you are doing anything non-standard, then you'll need to create and edit
112 | a local configuration file. Otherwise the application will create it the first time it's needed.
113 | ```
114 | cd ~/git/dweb-mirror
115 |
116 | cp ./dweb-mirror.config.yaml ${HOME} # Copy sample to your home directory and edit,
117 | ```
118 | and edit `$HOME/dweb-mirror.config.yaml`; for now see `configDefaults.yaml` for inline documentation.
119 |
120 | * `directories` if you plan on using places other than any of those in dweb-mirror.config.yaml
121 | (/.data/archiveorg, and any USBs on Rachel3+, NOOBS or IIAB)
122 | * `archiveui/directories` you probably do not need to change this as it will usually guess right,
123 | but it points to the “dist” subdirectory of wherever dweb-archive is either cloned or installed by npm install.
124 | * `apps.crawl` includes a structure that lists what collections are to be installed,
125 | I suggest testing and then editing
126 |
127 | Note that directories specified in the config file can be written using shell or unix conventions such as "~/" or "../".
128 |
129 | ### 4. Test browsing
130 |
131 | * From a command line:
132 | ```
133 | cd ~/git/dweb-mirror && ./internetarchive --server &
134 | ```
135 | * starts the HTTP server
136 |   * the startup is a little slow but you'll see some debugging when it's live.
137 | * Try going to `http://localhost:4244`
138 |   * Or from another machine: `http://archive.local:4244` or `http://<server-ip>:4244`
139 | * open http://localhost:4244/details/prelinger?transport=HTTP&mirror=localhost:4244
140 | to see the test crawl.
141 |   If you don’t get an Archive UI then look at the server log (in console)
142 |   to check for any “FAILING” log lines, which indicate a problem
143 |
144 | Expect to see errors in the Browser log for
145 | * http://localhost:5001/api/v0/version?stream-channels=true - which is checking for a local IPFS server
146 |
147 | Expect, on slower machines/networks, to see no images the first time,
148 | refresh after a little while and most should appear.
149 |
150 | ### 5. Test crawling
151 |
152 | ```
153 | cd ~/git/dweb-mirror
154 | ./internetarchive --crawl
155 | ```
156 | Without arguments, crawl will read a set of files into the first (already existing) directory
157 | configured in `~/dweb-mirror.config.yaml` or if there are none there, in `~/git/dweb-mirror/configDefaults.yaml`.
158 |
159 | Look in that directory, and there should be sub-directories appearing for each item, with metadata and/or thumbnails.
160 |
161 | You can safely delete any of the crawled material and it will be re-fetched if needed.
162 |
163 | ### 6. IPFS (optional)
164 | (Note IPFS is not currently being tested for dweb-mirror and this may not work)
165 |
166 | Install IPFS, there are several strategies in install_ipfs.sh that should at least cover your Mac,
167 | but it might need editing if you have an odd combination.
168 | ```
169 | cd ~/git/dweb-mirror
170 | ./install_ipfs.sh
171 | ```
172 |
173 | Now start the daemon, first time it initializes and configures a repo
174 | ```
175 | start_ipfs daemon &
176 | ```
177 | If it complains that 8080 is in use, then you missed editing start_ipfs and can fix this with
178 | ```
179 | ipfs config Addresses.Gateway /ip4/0.0.0.0/tcp/8081
180 | start_ipfs daemon &
181 | ```
182 | Allow ipfs to start, once it says Daemon is ready, Ctrl-C out of it
183 |
184 | In the future to update IPFS you can ...
185 | ```
186 | cd ~/git/dweb-mirror && ./install_ipfs.sh
187 | ```
188 | should update it.
189 |
190 | ### 7. Auto-starting
191 |
192 | #### On Mac OSX
193 | TODO - this doesn't appear to work on OSX, and needs investigation
194 |
195 | If you want the server to start automatically when the Mac boots,
196 | run the following commands in a terminal window.
197 |
198 | Edit `org.archive.mirror.plist` and
199 | change the line `${HOME}/node_modules/@internetarchive/dweb-mirror/internetarchive`
200 | to `${HOME}/git/dweb-mirror/internetarchive` or wherever you have installed dweb-mirror
201 | to be the path to "internetarchive"
202 | ```
203 | sudo cp ~/git/dweb-mirror/org.archive.mirror.plist /Library/LaunchAgents/org.archive.mirror.plist
204 | sudo launchctl load /Library/LaunchAgents/org.archive.mirror.plist
205 | ```
206 |
207 | Restart your machine and check that http://localhost:4244 still works.
208 |
209 | #### On Linux
210 | Some variation of the code in [./install.sh](./install.sh) will be needed,
211 | this hasn't been tested, as for development we always run the server manually in the debugger.
212 |
213 | ### 8. Making changes
214 | You can make changes in the UI in dweb-archive, iaux/packages/ia-components, bookreader
215 | or dweb-archivecontroller, then:
216 | ```
217 | cd dweb-archive ; webpack --mode development -w &
218 | ```
219 | This will watch for changes so that any edits you make are immediately reflected on either of the servers and testable with a browser page reload
220 |
221 | If you make change to dweb-transports:
222 | ```
223 | cd dweb-transports ; webpack --mode development -w &
224 | ```
225 | If you make changes to dweb-mirror, then ctrl-C out of the server and restart it.
226 | ```
227 | cd dweb-mirror ; ./internetarchive -sc &
228 | ```
229 |
230 |
231 | ### 9. Running without dweb-mirror e.g. to develop in dweb-transports
232 |
233 | To run without dweb-mirror,
234 | ```
235 | cd ~/git/dweb-archive/dist
236 | http-server
237 | ```
238 | This will run a local server that can be accessed at
239 | ```
240 | http://localhost:8080/archive.html
241 | ```
242 | The code will be run from your local server, but will access content at dweb.archive.org
243 |
244 | ## FUTURE: Updating dweb-mirror for a developer
245 |
246 | ```
247 | cd ~
248 | git/dweb-mirror/install_dev.sh
249 | ```
250 | Should update all the packages from the GIT repo's and re-install,
251 | and is fairly quick if nothing much has changed.
252 |
253 |
254 |
--------------------------------------------------------------------------------
/install_dev.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ###### INSTALLATION CODE, MOSTLY DUPLICATED in dweb-mirror/install.sh and dweb-mirror/install_dev.sh . TODO: Merge these scripts to take e.g. a --dev argument.
3 | cat < /tmp/step
32 | echo "Offline Internet Archive Installer: ${STEPNAME}"
33 | }
34 |
35 | function install_pkg() {
36 | # install a package properly for the current operating system
37 | step XXX "Installing $*"
38 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
39 | then
40 | sudo apt-get install -y "$@"
41 | else
42 | brew install "$@"
43 | fi
44 | }
45 |
46 | function check_cmd() {
47 | # silence a command, but preserve its exit status
48 | "$@" >/dev/null 2>&1
49 | }
50 |
51 | ###### PLATFORM AUTODETECTION CODE, DUPLICATED in dweb-mirror/install.sh, dweb-mirror/install_dev.sh, and dweb-mirror/mediawiki/mediawiki.conf
52 |
53 |
54 | # Convert the portable uname results into go specific environment note Mac has $HOSTTYPE=x86_64 but not sure that is on other platforms
55 | case `uname -m` in
56 | "armv7l") ARCHITECTURE="arm";; # e.g. Raspberry 3 or OrangePiZero. Note armv8 and above would use what IPFS has as arm64, armv7 and down want "arm"
57 | "x86_64") ARCHITECTURE="amd64";; # e.g. a Mac OSX
58 | "i?86") ARCHITECTURE="386";; # e.g. a Rachel3+
59 | *) echo "Unknown processor type `uname -m`, needs configuring"; ARCHITECTURE="unknown";;
60 | esac
61 | # See also /sys/firmware/devicetree/base/model
62 |
63 | # Now find OS type, note Mac also has a $OSTYPE
64 | case `uname -s` in
65 | "Darwin") OPERATINGSYSTEM="darwin";; # e.g. a Mac OSX
66 | "Linux") OPERATINGSYSTEM="linux";; # e.g. Raspberry 3 or Rachel3+ or OrangePiZero/Armbian
67 | *) echo "Unknown Operating system type `uname -s` - needs configuring"; OPERATINGSYSTEM="unknown";;
68 | esac
69 | # Hard to tell Armbian from Raspbian or a bigger Linux so some heuristics here
70 | [ ! -e /usr/sbin/armbian-config ] || OPERATINGSYSTEM="armbian"
71 | [ ! -e /etc/dpkg/origins/raspbian ] || OPERATINGSYSTEM="raspbian"
72 |
73 | #TODO detect Rachel, IIAB etc and set $PLATFORM
74 | PLATFORM="unknown"
75 | [ ! -e /etc/rachelinstaller-version ] || PLATFORM="rachel"
76 |
77 | # And setup some defaults
78 | INSTALLDIR=`pwd` # Default to where we are running this from
79 | YARNCONCURRENCY=1 # Good for a 386 or arm, below that use 1, for OSX go up
80 | CACHEDIR="${HOME}/archiveorg"
81 |
82 | # Override defaults based on above
83 | case "${PLATFORM}" in
84 | "rachel") CACHEDIR="/.data/archiveorg";;
85 | esac
86 | case "${ARCHITECTURE}" in
87 | "386") YARNCONCURRENCY=2;;
88 | "amd64") YARNCONCURRENCY=4;;
89 | esac
90 |
91 | echo "Architecture: ${ARCHITECTURE} OS: ${OPERATINGSYSTEM} PLATFORM: ${PLATFORM} CACHEDIR: ${CACHEDIR} INSTALLDIR: ${INSTALLDIR}"
92 |
93 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
94 | then
95 | if ! yarn --version 2>/dev/null
96 | then
97 | step XXX "Adding Yarn sources"
98 | curl -sSL https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
99 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
100 |
101 | fi
102 | set +e # update and upgrade often have non-zero return codes even though safe to continue
103 | step XXX "Apt update"
104 | sudo apt-get update
105 |
106 | step XXX "Upgrade all Apt packages"
107 | sudo dpkg --configure -a # Clear out any previous locks/interrupted opts - especially kolibri install
108 | sudo apt-get upgrade # Make sure running latest version
109 | sudo apt -y autoremove
110 | set -e # Exit on errors
111 | else # Its OSX
112 | #set +e # Uncomment if these unneccessarily have failure exit codes
113 | step XXX "Checking git and brew are installed"
114 | git --version || xcode-select --install # Get Git and other key command line tools (need this before "brew"
115 | brew --version || /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
116 | set -e
117 | fi
118 |
119 |
120 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
121 | then
122 | check_cmd yarn --version || install_pkg yarn
123 | check_cmd git --version || install_pkg git
124 | # Note yarn alternative can skip the apt-key & sources steps above and ...
125 | # curl -o- -L https://yarnpkg.com/install.sh | bash
126 | # source ~/.bashrc # Fix path
127 | step XXX "Trying to install libsecret which may fail" # Failed on rachel
128 | # Allow libsecret-1-dev to fail , we might not need it
129 | install_pkg libsecret-1-dev || echo "Libsecret failed to install, but that is ok"
130 | check_cmd netstat --version || install_pkg net-tools # Make debugging so much easier
131 | else
132 | # TODO: change a couple lines below to use only curl or only wget, rather than both, as each one performs the same task as the other
133 | check_cmd curl --version || install_pkg curl
134 | check_cmd wget --version || install_pkg wget
135 | # The brew installer for node is broken (fails to run the npx line in bookreader/package.json), use the line below as found on https://nodejs.org/en/download/package-manager/#macos
136 | #check_cmd node --version || install_pkg nodejs
137 | check_cmd node --version || ( curl "https://nodejs.org/dist/latest/node-${VERSION:-$(wget -qO- https://nodejs.org/dist/latest/ | sed -nE 's|.*>node-(.*)\.pkg.*|\1|p')}.pkg" > "$HOME/Downloads/node-latest.pkg" && sudo installer -store -pkg "$HOME/Downloads/node-latest.pkg" -target "/" )
138 | check_cmd yarn --version || curl -o- -L https://yarnpkg.com/install.sh | bash
139 | source ~/.bashrc # Fix up path
140 | fi
141 |
142 | echo "==== Creating parent directory ========================="
143 | mkdir -p ${PARENTDIRECTORY}
144 | cd ${PARENTDIRECTORY}
145 |
146 | echo "==== Getting repositories from Git ========================="
147 | for REPO in ${REPOS}
148 | do
149 | if [ ${REPO} == "epubjs-reader" ] # syntax repaired 2021
150 | then GITREPO="https://github.com/futurepress/${REPO}"
151 | else GITREPO="https://github.com/internetarchive/${REPO}"
152 | fi
153 | if [ -d ${REPO} ]
154 | then
155 | pushd ${REPO}
156 | git checkout -f
157 | git pull
158 | popd
159 | else
160 | git clone ${GITREPO}
161 | fi
162 | done
163 |
164 | echo "Selecting mitra--release branch of iaux"
165 | pushd iaux
166 | git checkout mitra--release
167 | popd
168 |
169 | for REPO in ${REPOS}
170 | do
171 | echo "==== Installing ${REPO} ========================="
172 | pushd ${REPO}
173 | yarn install
174 | popd
175 | done
176 | echo "==== IAUX needs special attention as its a multi-repo ========================="
177 | pushd iaux
178 | yarn run lerna bootstrap
179 | yarn run lerna link
180 | popd
181 |
182 | #Looks like this is done during dweb-archive yarn install which runs install.sh
183 | # echo "==== Linking bundles into dweb-archive =============="
184 | #pushd dweb-archive
185 | #yarn setuphttp
186 | #popd
187 |
188 | echo "==== Forcing symbolic links so that can work on multiple repos ===== "
189 | for i in ${REPOS};
190 | do
191 | for j in ${REPOS};
192 | do
193 | dest=${j}/node_modules/@internetarchive/${i};
194 | if [ -L ${dest} -o -d ${dest} ];
195 | then
196 | echo ${i} - ${j};
197 | rm -Rf ${dest};
198 | ln -s ../../../${i} ${dest};
199 | fi;
200 | done;
201 | done;
202 | for j in ${REPOS}
203 | do
204 | dest=${j}/node_modules/@internetarchive/ia-components
205 | if [ -L ${dest} -o -d ${dest} ];
206 | then
207 | echo $dest - ia-components
208 | rm -Rf ${dest};
209 | ln -s ../../../iaux/packages/ia-components ${dest}
210 | fi
211 | done
212 |
213 | echo "=== Webpacking each repo to development version ==== "
214 | for i in dweb-archive dweb-transports
215 | do
216 | pushd $i
217 | yarn run webpack --mode development
218 | popd
219 | done
220 | for i in epubjs-reader
221 | do
222 | pushd $i
223 | yarn run grunt
224 | popd
225 | done
226 | echo "==== installing http-server ====="
227 | yarn global add http-server
228 |
229 |
230 | cat <:4244 will work
97 | * On MacOSX (or if using a browser on the RaspberryPi/OrangePi): http://localhost:4244
98 | * On Rachel try http://rachel.local:4244 or http://rachel:4244
99 | or via the main interface at http://rachel.local and click Internet Archive
100 | * On IIAB The server can be accessed at [http://box:4244](http://box:4244) or
101 | [http://box.lan:4244](http://box.lan:4244) (try
102 | [http://box.local:4244](http://box.local:4244) via mDNS over a local network,
103 | if you don't have name resolution set up to reach your Internet-in-a-Box).
104 |
105 | Try walking through [./USING.md](./USING.md) to get a tour of the system,
106 | and you can click `Home` or the Internet Archive logo, if you just want to explore the Internet Archive's
107 | resources.
108 |
109 | ## Administration
110 |
111 | Administration is carried out mostly through the same User Interface as browsing.
112 |
113 | Select `local` from any of the pages to access a display of local content.
114 | Administration tools are under `Settings`.
115 |
116 | Click on the Archive logo, in the center-top, to get the Internet
117 | Archive main interface if connected to the net.
118 |
119 | While viewing an item or collection, the `Crawl` button in the top bar
120 | indicates whether the item is being crawled or not. Clicking it will cycle
121 | through three levels:
122 |
123 | * No crawling
124 | * Details - sufficient information will be crawled to display the page, for a
125 | collection this also means getting the thumbnails and metadata for the top
126 | items.
127 | * Full - crawls everything on the item, this can be a LOT of data, including
128 | full size videos etc, so use with care if bandwidth/disk is limited.
129 |
130 | ### Disk storage
131 |
132 | The server checks for caches of content in directories called `archiveorg` in
133 | all the likely places, in particular it looks for any inserted USB drives
134 | on most systems, and if none are found, it uses `~/archiveorg`.
135 |
136 | The list of places it checks, in an unmodified installation can be seen at
137 | `https://github.com/internetarchive/dweb-mirror/blob/master/configDefaults.yaml#L7`.
138 |
139 | You can override this in `dweb-mirror.config.yaml` in the home directory of the
140 | user that runs the server. (Note on IIAB this is currently in `/root/dweb-mirror.config.yaml`)
141 | (see 'Advanced' below)
142 |
143 | Archive's `Items` are stored in subdirectories of the first of these
144 | directories found, but are read from any of the locations.
145 |
146 | If your disk space is getting full, its perfectly safe to delete any
147 | subdirectories (except `archiveorg/.hashstore`), and the server will refetch anything else it needs
148 | next time you browse to the item while connected to the internet.
149 |
150 | It is also safe to move directories to an attached USB
151 | (underneath an `archiveorg` directory at the top level of the disk).
152 | It is also safe to move attached USBs from one device to another.
153 |
154 | Some of this functionality for handling disks is still under active development,
155 | but most of it works now.
156 |
157 | ### Maintenance
158 |
159 | If you are worried about corruption, or after for example hand-editing or
160 | moving cached items around.
161 |
162 | Run everything as root
163 | ```
164 | sudo su
165 | ```
166 | cd into location for your installation
167 | ```
168 | cd ~/node_modules/@internetarchive/dweb-mirror
169 | ./internetarchive -m
170 | ```
171 | This will usually take about 5-10 minutes depending on the amount of material
172 | cached, just to rebuild a table of checksums.
173 |
174 | ### Advanced
175 |
176 | Most functionality of the tool is controlled by two YAML files, the second of
177 | which you can edit if you have access to the shell.
178 |
179 | You can view the current configuration by going to `/info` on your server.
180 | The default, and user configurations are displayed as the `0` and `1` item in
181 | the `/info` call.
182 |
183 | In the Repo is a
184 | [default YAML file](https://github.com/internetarchive/dweb-mirror/blob/master/configDefaults.yaml)
185 | which is commented. You really should never need to edit this file, as anything in it can be
186 | overridden by lines in `~/dweb-mirror.config.yaml`. Make sure you
187 | understand how yaml works before editing this file, if you break it, you can
188 | copy a new default from
189 | [dweb-mirror.config.yaml on the repo](https://github.com/internetarchive/dweb-mirror/blob/master/dweb-mirror.config.yaml)
190 |
191 | Note that this file is also edited automatically when the Crawl button
192 | described above is clicked.
193 |
194 | As the project develops, this file will be more and more editable via a UI.
195 |
196 | ## Crawling
197 |
198 | The Crawler runs automatically at startup and when you add something to the crawl,
199 | but it can also be configurable through the YAML file described above
200 | or run at a command line for access to more functionality.
201 |
202 | In a shell
203 | ```
204 | sudo sh
205 | ```
206 | cd into the location for your installation, on most platforms it is
207 | ```
208 | cd ~/node_modules/@internetarchive/dweb-mirror
209 | ```
210 | Or on IIAB it would be
211 | ```
212 | cd /opt/iiab/internetarchive/node_modules/@internetarchive/dweb-mirror
213 | ```
214 | Perform a standard crawl
215 | ```
216 | ./internetarchive --crawl
217 | ```
218 | To fetch the "foobar" item from IA
219 | ```
220 | ./internetarchive --crawl foobar
221 | ```
222 | To crawl top 10 items in the prelinger collection sufficiently to display and put
223 | them on a disk plugged into the /media/pi/xyz
224 | ```
225 | ./internetarchive --copydirectory /media/pi/xyz/archiveorg --crawl --rows 10 --level details prelinger
226 | ```
227 | To get a full list of possible arguments and some more examples
228 | ```
229 | ./internetarchive --help
230 | ```
231 |
232 | ## More info
233 |
234 | I recommend following through the tour in [USING.md](./USING.md)
235 |
236 | Dweb-Mirror lives on GitHub at:
237 | * dweb-mirror (the server) [source](https://github.com/internetarchive/dweb-mirror),
238 | and [issues tracker](https://github.com/internetarchive/dweb-mirror/issues)
239 | * dweb-archive (the UI) [source](https://github.com/internetarchive/dweb-archive),
240 | and [issues tracker](https://github.com/internetarchive/dweb-archive/issues)
241 |
242 | This project is part of the Internet Archive's larger Dweb project, see also:
243 | * [dweb-universal](https://github.com/mitra42/dweb-universal) info about others working to bring access offline.
244 | * [dweb-transports](https://github.com/internetarchive/dweb-transports) for our transport library to IPFS, WEBTORRENT, WOLK, GUN etc
245 | * [dweb-archivecontroller](https://github.com/internetarchive/dweb-archivecontroller) for an object oriented wrapper around our APIs
246 |
--------------------------------------------------------------------------------
/USING.md:
--------------------------------------------------------------------------------
1 | # Managing content on the Internet Archive’s Universal Library.
2 |
3 | ## Summary
4 |
5 | *Note: This aspect of the system is currently (November 2019) in rapid evolution,
6 | updating your system is likely to improve it: see [./INSTALLATION.md]*
7 |
8 | This file is intended to complement the [README](./README.md) and [INSTALLATION](INSTALLATION.md) documents.
9 |
10 | There are several aspects to managing content on the Internet Archive’s Universal Library which are covered below,
11 | these include crawling content to your own system, or to an external drive suitable for moving to another system,
12 | and managing a collection of material on the archive that others can download automatically.
13 |
14 |
15 | * Accessing the box
16 | * Using the page
17 | * Details page - viewing a single item
18 | * Collection and Search pages - multiple items
19 | * Accessing Internet Archive resources
20 | * Managing Crawling
21 | * Downloading content for a different box
22 | * Managing collections on Internet Archive
23 |
24 | ## Accessing the box
25 |
26 | The address to access will depend on your configuration.
27 | * On Internet In a Box
28 | * if connected to WiFi “Internet in a Box", try http://box.lan:4244
29 | * if connected via a router then http://box.local:4244 may work
30 | * On Rachel then http://rachel.local:4244 often works
31 | * If MDNS is working on your network, then http://archive.local:4244 will work.
32 | * If running on your own machine (a laptop for example) then http://localhost:4244 should work
33 | * Otherwise ask your system admin for the address of the machine it's running on, usually it will be on `:4244`
34 |
35 | ## Using the page
36 |
37 | Whichever of these works it should bring you to your `local` start page.
38 | You can get back here at any time, via the `Local` button.
39 |
40 | If you have used the Internet Archive then the interface will be familiar,
41 | but there are a few differences to support offline use.
42 |
43 | At the top you'll see the Internet Archive's usual interface, a few of these buttons will (for now) only work
44 | while online, and don't appear when offline.
45 |
46 | Below that is a row of information specific to the offline application.
47 |
48 | First are health indicators.
49 |
50 | * If it shows "Mirror" in Red, it means we can't communicate with the mirror gateway,
51 | this will only happen if the gateway goes offline part way through a process.
52 | * Normally you'll see an indicator for GATEWAY, which is Green when the gateway can talk to the Archive,
53 | and Red when you are offline.
54 | * Next to that might be indicators for WebTorrent or IPFS if they have been enabled (usually they aren't).
55 | * Then comes an indicator for this page, whether it is being crawled, and if so approximately how much has been stored.
56 | * If the mirror is online to the Internet Archive (GATEWAY shows Green), then next comes a "Reload" button,
57 | you can click this to force it to check with the Archive for an up to date list.
58 | It is most useful on collections when someone else might have added something,
59 | but your gateway might be remembering an old version.
60 | * Then there is a Settings button which brings up a page that includes status of any crawls.
61 | * Finally there is a Home button which will bring you back to this page.
62 |
63 | Each tile on this page represents an item that your server will check for when it “crawls”.
64 | The first time you access the server this will depend on what was installed on the server, and it might be empty.
65 |
66 | Notice that most of the tiles should have a White, Green or Blue dot in the top right to indicate that you are crawling them.
67 | * A White dot means that enough of this item has been downloaded to be viewed offline.
68 | * The Green dot indicates that we are checking this item each time we crawl and getting enough to display offline.
69 | * A Blue dot indicates we are crawling all the content of the item, this could be a lot of data,
70 |   for example a full resolution version of the video. It's rare that you’ll use this.
71 |
72 | This button also shows how much has been downloaded, for an item its the total size of downloaded files/pages,
73 | for a collection its the total amount in all collection members.
74 |
75 | Tiles come in two types, most show items that can be displayed - books, videos, audio etc,
76 | clicking on these will display the item.
77 |
78 | Some of the tiles will show a collection which is a group of items that someone has collected together,
79 | most likely there will be at least one collection relevant to your project put on the page during installation.
80 |
81 | It shows you how many items are in the collection and how many have been downloaded
82 | e.g. 400Mb in 10 of 123 items, means 10 of the 123 items in the collection are downloaded sufficient to view offline,
83 | and a total of 400Mb is downloaded in this collection. (Which includes some files, like thumbnails, in other items).
84 |
85 | ## Details page - viewing a single item
86 |
87 | If you click on an item that is already downloaded (Blue, Green or White dot) then it will be displayed offline,
88 | the behavior depends on the kind of item.
89 | * Images are displayed and saved for offline use
90 | * Books display in a flip book format, pages you look at will be saved for offline use.
91 | * Video and Audio will play immediately and you can skip around in them as normal
92 |
93 | The crawl button at the top will indicate whether the object is being crawled and if not, whether it has been downloaded,
94 | in the same way tiles do, and also show you (approximately) the total downloaded for this item.
95 |
96 | Click on the Crawl button till it turns Green and it will download a full copy of the book, video or audio.
97 | It waits about 30 seconds to do this, allowing time to cycle back to the desired level of crawling.
98 | These items will also appear on your Local page.
99 | See the note above, usually you won’t want to leave it at blue (all) as this will usually try
100 | to download all the files, though there are some size limits.
101 |
102 | There is a Reload button which will force the server to try archive.org,
103 | this is useful if you think the item has changed, or for debugging.
104 |
105 | If you want to Save this item to a specific disk, for example to put it on a USB-drive then click the Save button.
106 | This button brings up a dialogue with a list of the available destinations.
107 | These should include any inserted drive with `archiveorg` as a directory at its top level.
108 | The content will be copied to that drive, which can then be removed and inserted into a different server.
109 |
110 | The server checks whether these disks are present every 15 seconds, so to use a new USB disk:
111 |
112 | * Insert the USB into the server.
113 | * Create a folder at its top level called `archiveorg`
114 | * Wait about 15 seconds
115 | * Reload the page you are on
116 | * Hitting `Save` should now allow this USB disk to be selected.
117 |
118 | ## Collection and Search pages - multiple items
119 |
120 | If you click on a Collection, then we’ll display a grid of tiles for all the items that have been placed in the collection.
121 | White, Green and Blue indicators mean the same as on the Local page.
122 | If you click on the crawl button till its Green then it will check this collection each time it crawls,
123 | download the tiles for the first page or so, and can be configured to get some of the items as well
124 |
125 | [issue#140](https://github.com/internetarchive/dweb-mirror/issues/140) allow UI to configure.
126 |
127 | ## Accessing Internet Archive resources
128 |
129 | The Internet Archive logo tile on the local page will take you to the Archive front page collection,
130 | content here is probably not already downloaded or crawled,
131 | but can be selected for crawling as for any other item.
132 |
133 | ## Managing crawling
134 |
135 | If you click on the "Settings" button, it should bring up a page of settings to control Crawling.
136 | This page is still under development (as of November 2019).
137 |
138 | On here you will see a list of crawls.
139 | You should get useful information about status, any errors etc.
140 | Hitting `<<` will restart the crawl and `||` or `>` pause and resume,
141 | but note that any file already being downloaded will continue to do so when you hit pause.
142 | Hitting `||` `<<` `<` will stop the current crawl, reset and retry, which is a good way to try again if,
143 | for example, you lost connection to the server part way through.
144 | It won't waste bandwidth re-downloading anything you already have.
145 |
146 | ### Advanced crawling
147 |
148 | If you have access to the command line on the server, then there is a lot more you can do with the crawler.
149 |
150 | The items selected for crawling (Green or Blue dots) are stored in a file `dweb-mirror.config.yaml`
151 | in the one directory of the server, e.g. on IIAB its in `/root/dweb-mirror.config.yaml`
152 | and on your laptop its probably in `~/dweb-mirror.config.yaml`.
153 | You can edit this file with care!
154 |
155 | From the command line, cd into the directory holding the service to run the crawler e.g. on IIAB
156 | ```
157 | cd ~/node_modules/dweb-mirror
158 | ./internetarchive --crawl
159 | ```
160 | There are lots of options possible, try `./internetarchive --help` to get guidance.
161 |
162 | This functionality will be gradually added to the UI in future releases.
163 | In the meantime if you have something specific you want to do feel free to post it as a new issue on
164 | [github](https://github.com/internetarchive/dweb-mirror/issues/new).
165 |
166 | ## Downloading content for a different box
167 |
168 | You can copy one or more items that are downloaded to a new storage device (e.g. a USB drive),
169 | take that device to another Universal Library server, and plug it in.
170 | All the content will appear as if it was downloaded there.
171 |
172 | To put content onto a device, you can either:
173 | * put the `copydirectory` field in the yaml file described above,
174 | * hit `Save` while on an item or search
175 | * or run a crawl at the command line
176 |
177 | cd into your device e.g. on an IIAB it would be
178 | ```
179 | cd /media/pi/foo
180 | ```
181 | Create a directory to use for the content, it must be called "archiveorg"
182 | ```
183 | mkdir archiveorg
184 | ```
185 | cd to the installation
186 | ```
187 | cd ~/internetarchive/node_modules/dweb-mirror
188 | ```
189 | copy the current crawl to the directory
190 | ```
191 | ./internetarchive --crawl --copydirectory /media/pi/foo/archiveorg
192 | ```
193 | When it's finished, you can unplug the USB drive and plug it into any other device
194 |
195 | Alternatively if you want to crawl a specific collection e.g. `frenchhistory` to the drive, you would use:
196 | ```
197 | ./internetarchive --crawl --copydirectory /media/pi/foo/archiveorg frenchhistory
198 | ```
199 | If you already have this content on your own device, then the crawl is quick,
200 | and just checks the content is up to date.
201 |
202 | ## Managing collections on Internet Archive
203 |
204 | You can create and manage your own collections on the [Internet Archive site](http://www.archive.org).
205 | Other people can then crawl those collections.
206 |
207 | First get in touch with Mitra Ardron at `mitra@archive.org`, as processes may have changed since this is written.
208 |
209 | You'll need to create an account for yourself at [archive.org](https://archive.org)
210 |
211 | We'll setup a collection for you of type `texts` - do not worry, you can put any kind of media in it.
212 |
213 | Once you have a collection, let's say `kenyanhistory`,
214 | you can upload materials to the Archive by hitting the Upload button and following the instructions.
215 |
216 | You can also add any existing material on the Internet Archive to this collection.
217 |
218 | * Find the material you are looking for
219 | * You should see a URL like `https://archive.org/details/foobar`
220 | * Copy the identifier which in this case would be `foobar`
221 | * Go to `https://archive.org/services/simple-lists-admin/?identifier=kenyanhistory&list_name=items`
222 | replacing `kenyanhistory` with the name of your collection.
223 | * Enter the name of the item `foobar` into the box and click `Add`.
224 | * It might take a few minutes to show up, you can add other items while you wait.
225 | * The details page for the collection should then show your new item `https://archive.org/details/kenyanhistory`
226 |
227 | On the device, you can go to `kenyanhistory` and should see `foobar`.
228 | Hit `Refresh` and `foobar` should show up.
229 | If `kenyanhistory` is marked for crawling it should update automatically
230 |
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | ###### INSTALLATION CODE, MOSTLY DUPLICATED in dweb-mirror/install.sh and dweb-mirror/install_dev.sh . TODO: Merge these scripts to take e.g. a --dev argument.
3 | cat < /tmp/step
26 | echo "Offline Internet Archive Installer: ${STEPNAME}"
27 | }
28 |
29 | function install_pkg() {
30 | # install a package properly for the current operating system
31 | step XXX "Installing $*"
32 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
33 | then
34 | sudo apt-get install -y "$@"
35 | else
36 | brew install "$@"
37 | fi
38 | }
39 |
40 | function check_cmd() {
41 | # silence a command, but preserve its exit status
42 | "$@" >/dev/null 2>&1
43 | }
44 |
45 | step 00 Determining what kind of box this is
46 |
47 | ###### PLATFORM AUTODETECTION CODE, DUPLICATED in dweb-mirror/install.sh, dweb-mirror/install_dev.sh and dweb-mirror/mediawiki/mediawiki.conf
48 |
49 | # Convert the portable uname results into go specific environment note Mac has $HOSTTYPE=x86_64 but not sure that is on other platforms
50 | case `uname -m` in
51 | "armv7l") ARCHITECTURE="arm";; # e.g. Raspberry 3 or OrangePiZero. Note armv8 and above would use what IPFS has as arm64, armv7 and down want "arm"
52 | "x86_64") ARCHITECTURE="amd64";; # e.g. a Mac OSX
53 | "arm64") ARCHITECTURE="arm64";; # e.g. a newer Mac OSX
54 | "i?86") ARCHITECTURE="386";; # e.g. a Rachel3+
55 | *) echo "Unknown processor type `uname -m`, needs configuring"; ARCHITECTURE="unknown";;
56 | esac
57 | # See also /sys/firmware/devicetree/base/model
58 |
59 | # Now find OS type, note Mac also has a $OSTYPE
60 | case `uname -s` in
61 | "Darwin") OPERATINGSYSTEM="darwin";; # e.g. a Mac OSX
62 | "Linux") OPERATINGSYSTEM="linux";; # e.g. Raspberry 3 or Rachel3+ or OrangePiZero/Armbian
63 | *) echo "Unknown Operating system type `uname -s` - needs configuring"; OPERATINGSYSTEM="unknown";;
64 | esac
65 | # Hard to tell Armbian from Raspbian or a bigger Linux so some heuristics here
66 | [ ! -e /usr/sbin/armbian-config ] || OPERATINGSYSTEM="armbian"
67 | [ ! -e /etc/dpkg/origins/raspbian ] || OPERATINGSYSTEM="raspbian"
68 |
69 | #Auto-Detect Rachel, IIAB etc and set $PLATFORM
70 | PLATFORM="unknown"
71 | [ ! -e /etc/rachelinstaller-version ] || PLATFORM="rachel"
72 | [ ! -d /opt/iiab ] || PLATFORM="iiab"
73 |
74 | #TODO Auto detect "Nuc"
75 | echo "ARCHITECTURE=${ARCHITECTURE} OPERATINGSYSTEM=${OPERATINGSYSTEM} PLATFORM=${PLATFORM}"
76 | ## END OF AUTODETECTION CODE, DUPLICATED IN in dweb-mirror/install.sh and dweb-mirror/mediawiki/mediawiki.conf
77 |
78 | # And setup some defaults
79 | INSTALLDIR=`pwd` # Default to where we are running this from
80 | YARNCONCURRENCY=1 # Good for a 386 or arm, below that use 1, for OSX go up
81 | CACHEDIR="${HOME}/archiveorg"
82 |
83 | # Override defaults based on above
84 | case "${PLATFORM}" in
85 | "rachel") CACHEDIR="/.data/archiveorg";;
86 | esac
87 | case "${ARCHITECTURE}" in
88 | "unknown") YARNCONCURRENCY=1;;
89 | "386") YARNCONCURRENCY=2;;
90 | "arm64") YARNCONCURRENCY=4;;
91 | "amd64") YARNCONCURRENCY=4;;
92 | esac
93 |
94 | echo "CACHEDIR: ${CACHEDIR} INSTALLDIR: ${INSTALLDIR}"
95 |
96 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
97 | then
98 | if check_cmd yarn --version 2>/dev/null && yarn --help | grep checksums >/dev/null
99 | then
100 | echo "Yarn - the right one - looks like its installed"
101 | else
102 | step XXX "Adding Yarn sources"
103 | curl -sL https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add -
104 | echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
105 | sudo apt-get install -Y yarn
106 | yarn --version 2>/dev/null && yarn --help | grep checksums >/dev/null # Check it worked (will error if not)
107 | fi
108 | set +e # update and upgrade often have non-zero return codes even though safe to continue
109 | step XXX "Apt update"
110 | sudo apt-get update
111 |
112 | step XXX "Upgrade all Apt packages"
113 | sudo dpkg --configure -a # Clear out any previous locks/interrupted opts - especially kolibri install
114 | sudo apt-get upgrade # Make sure running latest version
115 | sudo apt -y autoremove
116 | set -e # Exit on errors
117 | else # Its OSX
118 | #set +e # Uncomment if these unneccessarily have failure exit codes
119 | step XXX "Checking git and brew are installed"
120 | git --version || xcode-select --install # Get Git and other key command line tools (need this before "brew"
121 | brew --version || /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
122 | set -e
123 | fi
124 |
125 | if [ "${OPERATINGSYSTEM}" != "darwin" ]
126 | then
127 | check_cmd yarn --version || install_pkg yarn
128 | check_cmd git --version || install_pkg git
129 | # Note yarn alternative can skip the apt-key & sources steps above and ...
130 | # curl -o- -L https://yarnpkg.com/install.sh | bash
131 | # source ~/.bashrc # Fix path
132 | step XXX "Trying to install libsecret which may fail" # Failed on rachel
133 | # Allow libsecret-1-dev to fail , we might not need it
134 | install_pkg libsecret-1-dev || echo "Libsecret failed to install, but that is ok"
135 | check_cmd netstat --version || install_pkg net-tools # Make debugging so much easier
136 | else # Its darwin (Mac OSX)
137 | # TODO: change a couple lines below to use only curl or only wget, rather than both, as each one performs the same task as the other
138 | check_cmd curl --version || install_pkg curl
139 | check_cmd wget --version || install_pkg wget
140 | # The brew installer for node is broken (fails to run the npx line in bookreader/package.json), use the line below as found on https://nodejs.org/en/download/package-manager/#macos
141 | #node --version >>/dev/null || brew install nodejs
142 | check_cmd node --version || ( curl "https://nodejs.org/dist/latest/node-${VERSION:-$(wget -qO- https://nodejs.org/dist/latest-v12.x/ | sed -nE 's|.*>node-(.*)\.pkg.*|\1|p')}.pkg" > "$HOME/Downloads/node-latest.pkg" && sudo installer -store -pkg "$HOME/Downloads/node-latest.pkg" -target "/" )
143 | check_cmd yarn --version || curl -o- -L https://yarnpkg.com/install.sh | bash
144 | source ~/.bashrc # Fix up path
145 | fi
146 |
147 | # Previously used, but do not believe need now that not installing as many dependencies.
148 | # yarn global add node-pre-gyp
149 | # [ -d node_modules/cmake ] || [ -d /usr/local/share/.config/yarn/global/node_modules/cmake/ ] || sudo yarn global add cmake
150 |
151 | step XXX "Checking cache directory for content ${CACHEDIR} exists and is writable"
152 | if [ ! -d ${CACHEDIR} ]
153 | then
154 | step XXX "Creating cache directory for content ${CACHEDIR}"
155 | if ! mkdir -p ${CACHEDIR}
156 | then
157 | set +x # Make visible request for sudo
158 | sudo mkdir -p ${CACHEDIR}
159 | set -x
160 | fi
161 | [ -d ${CACHEDIR} ]
162 | fi
163 | if [ ! -w ${CACHEDIR} ]
164 | then
165 | step XXX "Making cache directory for content writable by ${USER}"
166 | if ! chown ${USER} ${CACHEDIR}
167 | then
168 | set -x # Make visible request for sudo
169 | sudo chown ${USER} ${CACHEDIR}
170 | set +x
171 | fi
172 | [ -w ${CACHEDIR} ] # Fail if cannot write ${CACHEDIR}
173 | fi
174 | step XXX "Yarn install or update dweb-mirror"
175 | cd ${INSTALLDIR} # By default ${HOME}
176 | yarn config set child-concurrency ${YARNCONCURRENCY}
177 | # Careful - this next test may look duplicative but a failure to install can leave directory but no package.json,
178 | # next install would do an install;upgrade removing the packages, and then failing
179 | if [ -d node_modules/@internetarchive/dweb-mirror -a -e package.json ]
180 | then
181 | # Previously installed, just check the install and upgrade
182 | yarn install
183 | yarn upgrade
184 | else
185 | # Not previously installed, install, but do not upgrade as it wastes time
186 | yarn add @internetarchive/dweb-mirror
187 | yarn install
188 | fi
189 |
190 | step XXX "Installer: Switching directories into dweb-mirror"
191 | cd ${INSTALLDIR}/node_modules/@internetarchive/dweb-mirror
192 |
193 | step XXX "Setup service to autostart at boot and start server"
194 | # Note its clear we need to edit the service but then its unclear that the armbian and rachel strategies are different, cross-try them.
195 | cat internetarchive.service \
196 | | sed -e "s:{{ internetarchive_dir }}:${INSTALLDIR}:" | sed -e "s:User=root:User=${USER}:" >/tmp/internetarchive.service
197 | if [ "${OPERATINGSYSTEM}" = "armbian" -o "${PLATFORM}" = "rachel" -o "${OPERATINGSYSTEM}" = "raspbian" -o "${OPERATINGSYSTEM}" = "linux" ]
198 | then
199 | diff /tmp/internetarchive.service /lib/systemd/system || sudo cp /tmp/internetarchive.service /lib/systemd/system
200 | sudo systemctl enable internetarchive.service # Links /etc/systemd/system/multi-user.targets.wants/internetarchive.service to /lib/systemd/system/internetarchiveservice
201 | sudo systemctl daemon-reload # Starts internetarchive
202 | #sudo service internetarchive start # Alternative starter
203 | #sudo systemctl start internetarchive.service # Alternative starter
204 | sudo service internetarchive start
205 | else
206 | echo "Installer needs a strategy to setup auto-start on this platform"
207 | fi
208 |
209 | if [ "${PLATFORM}" = "rachel" ]
210 | then
211 | step XXX "Rachel only: Copy module"
212 | sudo rm -rf /var/www/modules/en-internet_archive
213 | sudo mv rachel/en-internet_archive /var/www/modules/
214 | sudo chown -R www-data:www-data /var/www/modules/en-internet_archive
215 | fi
216 |
217 | # Do not try it on OSX, IIAB doesnt uses this installer,
218 | # Tested on raw RPI, RPI+Rachel; armbian/orangepi not needed on RPI+IIAB which uses own installer;
219 | if [ "${PLATFORM}" = "rachel" -o "${OPERATINGSYSTEM}" = "raspbian" -o "${OPERATINGSYSTEM}" = "armbian" ]
220 | then
221 | step XXX "Raspberries (including Rachel) only - usb mount - getting dependencies"
222 | sudo apt-get install -y debhelper exfat-fuse
223 | step XXX "Raspberries (including Rachel) only - getting and building usbmount package from fork that fixes some bugs"
224 | cd /var/tmp
225 | if [ -d usbmount ]
226 | then
227 | cd usbmount
228 | git pull
229 | else
230 | git clone https://github.com/rbrito/usbmount.git
231 | cd usbmount
232 | fi
233 | # Raspbian didnt require sudo, but armbian does
234 | sudo dpkg-buildpackage -us -uc -b
235 | cd ..
236 | sudo apt install -y ./usbmount_0.0.24_all.deb
237 | step XXX "Raspberries (including Rachel) only - editing usbmount.conf in place"
238 | sudo sed 's/FILESYSTEMS=.*/FILESYSTEMS="vfat ext2 ext3 ext4 ntfs-3g ntfs exfat hfsplus fuseblk"/' -i- /etc/usbmount/usbmount.conf
239 | echo "It should recognize USB drives after the next reboot"
240 | fi
241 |
242 | if [ "${OPERATINGSYSTEM}" = "armbian" ]
243 | then
244 | step XXX "Armbian closing notes"
245 | cat <
257 | wpa-psk
258 |
259 | Then start it with
260 |
261 | sudo ifup wlan0
262 |
263 | or b) to use your device as a WiFi hot spot
264 |
265 | * sudo armbian-config > network > hotspot >
266 | * At some point it asks to "select interface" I think this is the point to pick wlan0 though its unclear whether
267 | this is the WiFi interface to use, or for backhaul?
268 | * Note that once setup, it can take a minute or two for the WiFi access point to be visible.
269 | * Also note that it seems to pick unusual IP addresses, 172.24.1.1 was the gateway when I connected to it.
270 |
271 | * If anyone knows how to set this up from the command line a PR would be appreciated.
272 | * This doc might be helpful
273 | https://docs.armbian.com/User-Guide_Advanced-Features/#how-to-set-wireless-access-point
274 |
275 | EOT
276 | fi
277 |
278 | echo "Installation of offline Internet Archive (dweb-mirror) complete"
279 | if [ ! "${OPERATINGSYSTEM}" = "darwin" ]
280 | then
281 | service internetarchive status
282 | fi
283 |
--------------------------------------------------------------------------------
/internetarchive:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | process.env.DEBUG="dweb-transports dweb-transports:* dweb-archivecontroller:* dweb-mirror:* dweb-mirror:HashStore"; // Get highest level debugging of these two libraries, must be before require(dweb-transports)
3 | //process.env.DEBUG=process.env.DEBUG + " bittorrent-tracker:*"; // Add Webtorrent debugging
4 |
5 | // TODO-GUN check using GUN for metadata
6 |
7 | const debug = require('debug')("dweb-mirror:internetarchive");
8 | const getopts = require('getopts');
9 | const canonicaljson = require('@stratumn/canonicaljson');
10 | const waterfall = require('async/waterfall');
11 | const yaml = require('js-yaml'); //https://www.npmjs.com/package/js-yaml
12 | // Other IA repos
13 | // noinspection JSUndefinedPropertyAssignment
14 | global.DwebTransports = require('@internetarchive/dweb-transports');
15 | // noinspection JSUnusedLocalSymbols
16 | const {ObjectFilter} = require('@internetarchive/dweb-archivecontroller');
17 | //This Repo
18 | // noinspection JSUnusedLocalSymbols
19 | const ArchiveItem = require('./ArchiveItemPatched');
20 | // noinspection JSUnusedLocalSymbols
21 | const ArchiveFile = require('./ArchiveFilePatched');
22 | const CrawlManager = require('./CrawlManager');
23 | const MirrorFS = require('./MirrorFS');
24 | const MirrorConfig = require('./MirrorConfig');
25 | const mirrorHttp = require('./mirrorHttp');
26 | const {registerMDNS, destroyMDNS} = require('./mdns-register');
27 | const optsInt = ["depth", "maxFileSize", "concurrency", "limitTotalTasks"]; // Not part of getopts, just documenting what aren't string or boolean
28 | const optsArray = ["level", "transport", "rows", "directory"]; // Options that can be arrays
29 | const optsLowerCase = ["level"]; // Always want lowercase
30 | const optsUpperCase = ["transport"]; // Always want uppercase
31 | const optsJson = ["search", "related"]; // Can be provided as JSON rather than individual sub-opts
32 |
33 | //XXX make depth max of depth, level-1, rows
34 | //const opts = getopts("--dummy --level details commute".split(" "),{ // Just for testing different options
35 | const opts = getopts(process.argv.slice(2),{
36 | alias: { l: "level", r: "rows", h: "help", v: "verbose", d: "depth", c: "crawl", s: "server", m: "maintenance",
37 | "skipFetchFile":"skipfetchfile", "maxFileSize":"maxfilesize", "limitTotalTasks":"limittotaltasks",
38 | "copyDirectory":"copydirectory", "MDNS":"mdns", "NOMDNS": "nomdns"},
39 | boolean: ["h","v", "skipFetchFile", "noCache", "dummy", "NOMDNS"],
40 | //string: ["directory", "search", "related", "depth", "debugidentifier", "maxFileSize", "concurrency", "limitTotalTasks", "transport"],
41 | string: ["directory", "search", "related", "transport", "level", "debugidentifier"], // Not debugidentifier because undefined and "" are not the same.
42 | //default: {transport: "HTTP"}, // Getting defaults from yaml via MirrorConfig
43 | "unknown": option => { if (!optsInt.includes(option)) { console.log("Unknown option", option, ", 'internetarchive -h' for help"); process.exit()} }
44 | });
45 | const httpOrHttps = "http"; // This server is running on http, not https (at least currently)
46 |
47 | //TODO make --maintenance also delete locks on any hashstores via new hashstores.maintenance call
48 |
49 | const help = `
50 | usage: internetarchive [-hvscm] [--crawl] [--server] [--maintenance] [-l level] [-r rows] [ -d depth ] [--directory path] [--search json] [--related json]
51 | [--debugidentifier identifier] [--maxFileSize bytes] [--concurrency threads] [--limittotaltasks tasks] [--transport TRANSPORT]*
52 | [--skipfetchfile] [--noCache] [--dummy] [identifier]*
53 |
54 | -h : help print this text
55 | -v : verbose tell us which config being run (default is currently pretty verbose)
56 | -q : quiet (TODO implement this)
57 | -s : server run http server
58 | -c : crawl run crawler
59 | -m : maintenance check for things like 0 length files or partials etc and ensure in IPFS
60 | -l level : Crawl the identifiers to a certain level, valid values are:
61 | "tile" for just enough to print a collection page, including the thumbnail image
62 | "metadata" and the full metadata, which will be useful once local search is implemented
63 | "details" and enough to paint a page, including for example a lower bandwidth video
64 | "full" and all the files in the item - beware, this can get very big.
65 | -r rows : overrides any (simple) search string to crawl this number of items
66 | -d depth : crawl collections found in this collection to a depth,
67 | (0 is none, do not even crawl this collection, 1 is normal, 2 is collections in this collection
68 | --copydirectory path : Store a copy of the crawl in this directory (often used for a removable drive)
69 | --directory path : override the directory set in the configuration for the root of the cache
70 | --search json : override default search string, strict json syntax only
71 | --related json : override default setting for crawling related items, strict json syntax only
72 | --debugidentifier identifier : identifier to do extra debugging on, only really valuable when using an IDE
73 | --maxfilesize bytes : any file bigger than this will be ignored
74 | --concurrency threads : how many files or searches to be happening concurrently - use 1 for debugging, otherwise 10 is about right
75 | --limittotaltasks tasks : a maximum number of tasks to run, will be (approximately) the number of searches, plus the number of items crawled.
76 | --transport TRANSPORT : The names of transport to use, by default its HTTP, but can currently add IPFS, WEBTORRENT GUN, (TODO must currently be upper case - allow both)
77 | --skipfetchfile : Do not actually transfer the files (good for debugging)
78 | --noCache : Ignore current contents of cache and always refetch
79 | --mdns DOMAIN : Respond with MDNS to DOMAIN.local (by default MDNS responds on archive.local)
80 | --nomdns : Do not respond with MDNS on archive.local
81 | --dummy : Just print the result of the options in the JSON format used for configuration
82 |
83 | identifier : Zero or more identifiers to crawl (if none, then it will use the default query from the configuration)
84 |
85 | Examples:
86 |
87 | crawl.js prelinger # Gets the default crawl for the prelinger collection, (details on prelinger, then tiles for top 40 items in the collection and 6 related items)
88 | crawl.js --level details --rows 100 prelinger # Would pull the top 100 items in prelinger (just the tiles)
89 | crawl.js --level all commute # Fetches all the files in the commute item
90 |
91 | Specifying level, or rows more than once will apply that result to the searches, so for example:
92 |
93 | crawl.js --level details --rows 10 --level details prelinger # Gets the movies for the first 10 movies in prelinger
94 | crawl.js --level details --rows 100 --level tiles --rows 100 --level tiles movies # Gets the top 100 items in movies, and then crawls any of those items that are collections
95 | crawl.js --rows 100 --depth 2 movies # Is a shortcut to do the same thing
96 |
97 | Running crawl with no options will run the default crawls in the configuration file with no modifications, which is good for example if running under cron.
98 |
99 | A useful hint is to experiment with arguments, but add the \`--dummy\` argument to output a JSON description of the search task(s) to be carried out.
100 | `;
101 | if (opts.help) { console.log(help); process.exit(); }
102 |
103 | function processOpts() {
104 | /* Process the command line opts resulting in a munged opts - this is intended to be generic, not specific to dweb-mirror*/
105 |
106 | // Handle arrays, always return an array, even if empty
107 | optsArray.forEach(key => {
108 | if ((typeof opts[key] === "undefined") || (opts[key] === "")) {
109 | opts[key] = [];
110 | } else if (!Array.isArray(opts[key])) {
111 | opts[key] = [ opts[key] ];
112 | }
113 | });
114 |
115 | optsLowerCase.forEach(key => {
116 | opts[key] = Array.isArray(opts[key])
117 | ? opts[key].map(t=>t.toLowerCase())
118 | : opts[key].toLowerCase()});
119 |
120 | optsUpperCase.forEach(key => {
121 | opts[key] = Array.isArray(opts[key])
122 | ? opts[key].map(t=>t.toUpperCase())
123 | : opts[key].toUpperCase()});
124 |
125 | // Look for some complete json args and unpack them
126 | optsJson.forEach(key => { // search, related
127 | if (opts[key].length) {
128 | try {
129 | opts[key] = canonicaljson.parse(opts[key]);
130 | } catch (err) {
131 | console.log("Invalid json in argument", key, "=", opts[key], err.message);
132 | process.exit();
133 | }
134 | } else {
135 | opts[key] = undefined;
136 | }
137 | });
138 |
139 | }
140 |
141 | function optsToConfig() {
142 | /* Take opts, and manipulate defaults in config */
143 | processOpts(opts);
144 |
145 | // Default level is 1 level at details
146 | if (!opts.level.length) opts.level.push("details");
147 | // Default rows is in config...defaultDetailsSearch.rows if level >= detail
148 | if (!opts.rows.length) {
149 | opts.rows.push(
150 | (CrawlManager._levels.indexOf(opts.level[0]) >= CrawlManager._levels.indexOf("details")
151 | ? ((config.apps.crawl.opts.defaultDetailsSearch && config.apps.crawl.opts.defaultDetailsSearch.rows) || 0)
152 | : 0)
153 | );
154 | }
155 |
156 | // Map - if specified: config.connect.transports (plural) = opts.transport (singular but array)
157 | if (opts.transport.length) {
158 | config.setOpts({apps: {crawl: {connect: {transports: opts.transport}}}}) // Code cares about case
159 | }
160 | // Map - if specified: directories = opts.directory
161 | if (opts.directory.length) {
162 | config.setOpts({directories: opts.directory})
163 | }
164 | // Check for errors
165 | // Its an error not to specify directory if none are defined in defaults
166 | if (!config.directories.length) {
167 | debug("WARNING: Directory for the cache is not defined or doesnt exist - crawl will wait for disk to be available");
168 | //Do not make this an error, crawl will wait and server SHOULD work without disk
169 | // return new Error("ERROR: Directory for the cache is not defined or doesnt exist");
170 | }
171 | if (opts.search && (opts.rows || opts.depth)) {
172 | return new Error("ERROR: Cannot specify search with rows or depth arguments");
173 | }
174 | if (opts.debugidentifier.length) {
175 | // noinspection ES6ConvertVarToLetConst
176 | global.debugidentifier = opts.debugidentifier }
177 | if (opts.verbose || opts.dummy) {
178 | debug( "Config:"); debug(yaml.dump(ObjectFilter(config, (key, unusedValue) => key !== "configOpts")));
179 | }
180 | if (opts.mdns) { config.setOpts({mdns: opts.mdns})}
181 | if (opts.nomdns) { config.setOpts({mdns: undefined})}
182 | return null;
183 | }
184 | function _tasks() {
185 | // Somewhat complex ....
186 | // Allows specifying either ... an array of e.g. -level details -level tiles
187 | // or a -depth parameter
188 | // or an array of -rows
189 | let tasks;
190 | if (!(opts._.length || (typeof opts.crawl === "string"))) {
191 | // We have no positional arguments
192 | // If no positional args specified, then use from config.apps.crawl.tasks
193 | if (opts.depth || opts.search || opts.related ) {
194 | return new Error("If specifying options then should also specify identifiers to crawl");
195 | }
196 | // noinspection JSUnresolvedVariable
197 | tasks = config.apps.crawl.tasks; // Default or configured tasks
198 | } else {
199 | if (typeof opts.crawl === "string") { // e.g. "./internetarchive --crawl foo" which is technically wrong, but understandable.
200 | opts._.push(opts.crawl) }
201 | opts.crawl = true; // Specifying identifiers implies --crawl
202 | // We have positional arguments, use default details search
203 | function f(depthnow, depth) { // Recurses
204 | return depth
205 | ? Object.assign({}, opts.search || config.apps.crawl.opts.defaultDetailsSearch,
206 | { level: opts.level[Math.min(depthnow+1,opts.level.length-1)],
207 | rows: opts.rows[Math.min(depthnow,opts.rows.length-1)],
208 | search: f(depthnow+1, depth -1)
209 | })
210 | : undefined;
211 | }
212 | // Build an array of tasks where each specifies a multi-level search based on depth
213 | tasks = opts._.map( identifier => { return {
214 | identifier,
215 | level: opts.level[0],
216 | related: opts.related,
217 | search: f(0, Math.max(opts.depth || 0, opts.level.length, opts.rows.length)) // recurse structure
218 | } } );
219 | }
220 | return tasks;
221 | }
222 |
223 | function connect(cb) {
224 | const connectOpts = config.connect;
225 | //wrtc is not available on some platforms (esp 32 bit such as Rachel3+) so only include if requested (by webtorrent.tracker = 'wrtc' and available.
226 |
227 | // SEE-OTHER-ADDTRANSPORT in dweb-transports dweb-archive dweb-mirror
228 | // TODO-SPLIT these will need to move into local server or may be excluded by cross-origin low-bandwidth rule in chrome
229 | // These are no longer packaged in dweb-transports, include specifically only if going to use it
230 | // And make sure to do add via yarn during installation
231 | DwebTransports.loadIntoNode(connectOpts); // Note runs loadIntoNode from each used DwebTransports.TransportXxx or default DTS.Transport
232 | if (opts.verbose || opts.dummy) {
233 | debug( "Connect configuration: %o", connectOpts);
234 | }
235 | if (!opts.dummy) {
236 | DwebTransports.connect(connectOpts, unusedErr => {
237 | cb(unusedErr);
238 | });
239 | }
240 | }
241 |
242 | function crawl(cb) {
243 | // Group explicit crawl opts from config, and any other opts that CrawlManager accepts, overriding existing defaults
244 | const crawlopts = Object.assign({},
245 | config.apps.crawl.opts,
246 | ObjectFilter(opts, (k,v)=> CrawlManager.optsallowed.includes(k) && (typeof v !== "undefined")),
247 | {callbackDrainOnce: true, name: "main"}
248 | );
249 | const tasks = _tasks(); // draws from opts and config.apps.crawl.tasks
250 | if (opts.verbose || opts.dummy) {
251 | debug( "Crawl configuration: crawlopts=%o tasks=%O", crawlopts, tasks);
252 | }
253 | if (!opts.dummy) {
254 | CrawlManager.startCrawl(tasks, crawlopts, cb);
255 | }
256 | }
257 | let config;
258 | let server;
259 |
260 | function startServer(cb) {
261 | mirrorHttp(config, (err, serv) => {
262 | server = serv;
263 | if (!err && config.mdns) {
264 | registerMDNS(config.mdns);
265 | }
266 | cb(err);
267 | });
268 | }
269 | function stopServer(cb) {
270 | if (server) {
271 | debug("Closing server");
272 | server.close((err) => {
273 | if (err)
274 | debug("Failed to stop server: %s, but it might just because it was already started", err.message);
275 | destroyMDNS();
276 | cb(null); // Do not pass on failure, still want to stop transports
277 | });
278 | server = undefined; // Its obviously not running
279 | } else {
280 | cb(null);
281 | }
282 | }
283 |
284 | waterfall([
285 | cb => MirrorConfig.new(undefined,
286 | (obj) => { if (typeof obj.directories !== "undefined") MirrorFS.setState({directories: obj.directories}) },
287 | (err, res) => { config = res; cb(err); }), // Load config early, so can use in opts processing
288 | cb => cb(optsToConfig()), // Currently synchronous returning err||null
289 | cb => {
290 | if (opts.server && ! config.archiveui.directory) { debug("ERROR unlikely to work as none of %o present",config.archiveui.directoriest)}
291 | cb(null);
292 | },
293 | cb => { MirrorFS.init({
294 | directories: config.directories,
295 | httpServer: httpOrHttps+"://localhost:"+config.apps.http.port,
296 | preferredStreamTransports: config.connect.preferredStreamTransports});
297 | cb(null); },
298 | cb => connect(cb),
299 | cb => { // Start server before crawler as crawler takes time before returning
300 | if (opts.server || opts.maintenance || opts.crawl) { startServer(cb); } else { cb(null); }},
301 | cb => { // Maintenance must be after server start since needs for IPFS, should be before crawl
302 | if (opts.maintenance) { MirrorFS.maintenance({cacheDirectories: config.directories}, cb) } else {cb(null);}},
303 | cb => { if (opts.crawl) { crawl(cb); } else { cb(null); }},
304 | cb => { // Stop express server unless explicit -s option
305 | if (!opts.server) {stopServer(cb); } else { cb(null); } },
306 | cb => { // If we aren't leaving a server running, then stop the transports
307 | if (!opts.server) { DwebTransports.p_stop(cb); } else { cb(null); }}
308 | ],(err) => {
309 | if (err) {
310 | debug("Failed: %s", err.message)
311 | } else {
312 | if (server) {
313 | debug('Completed, but server still running');
314 | } else {
315 | debug('Completed');
316 | }
317 | }
318 | });
319 |
--------------------------------------------------------------------------------
/INSTALLATION.md:
--------------------------------------------------------------------------------
1 | # Offline Internet Archive - Installation
2 |
3 | See [README.md] for more info
4 |
5 | ## Introduction
6 |
7 | These are instructions to install the Internet Archive's offline server
8 | also called 'dweb-mirror', on any of the tested platforms, which are currently:
9 |
10 | * Raspberry Pi 3B+ or 4 running Raspbian or NOOBS.
11 | * Orange Pi Zero running Armbian
12 | * Raspberry Pi 3B+ or 4 running Rachel on top of Raspbian.
13 | * Mac OSX 10.14 or later.
14 |
15 | It should work on most similar platforms, and we would welcome reports of success or failure.
16 |
17 | There are separate instructions for:
18 | * [INSTALLATION-dev.md](./INSTALLATION-dev.md)
19 | for developers who want to work on this code or on dweb-archive (our offline Javascript UI).
20 | These are tested on Mac OSX, but should work with only minor changes on Linux (feedback welcome).
21 | * [INSTALLATION-iiab-rpi.md](./INSTALLATION-iiab-rpi.md)
22 | to install Internet In A Box on a Raspberry Pi
23 | * [INSTALLATION-olip-rpi.md](./INSTALLATION-olip-rpi.md)
24 | to install OLIP on a Raspberry Pi
25 | * [INSTALLATION-rachel.md](./INSTALLATION-rachel.md)
26 | for Rachel on their own Rachel 3+ (incomplete)
27 |
28 | See also [INSTALLATION-faq.md](./INSTALLATION-faq.md), a work in progress to
29 | handle some of the questions.
30 | If anything here doesn't work please email mitra@archive.org
31 | or it would be even more helpful to post a question or fix
32 | on https://github.com/internetarchive/dweb-mirror/issues
33 |
34 | There are some platform specific topics on GitHub including:
35 | * OrangePi: [dweb-mirror issue#224](https://github.com/internetarchive/dweb-mirror/issues/224)
36 | * RaspberryPi: [dweb-mirror issue#110](https://github.com/internetarchive/dweb-mirror/issues/110).
37 | * Rachel: [dweb-mirror issue#93](https://github.com/internetarchive/dweb-mirror/issues/93).
38 | * Docker & OLIP: [dweb-mirror issue#263](https://github.com/internetarchive/dweb-mirror/issues/263)
39 | * Yunohost: [dweb-mirror issue#259](https://github.com/internetarchive/dweb-mirror/issues/259)
40 | * Set top boxes: [dweb-mirror issue#223](https://github.com/internetarchive/dweb-mirror/issues/223)
41 | * or feel free to start a [new Issue](https://github.com/internetarchive/dweb-mirror/issues/new)
42 |
43 | If you are on a Mac or have already got the right operating system already,
44 | you can skip to step 4 (Install) otherwise ...
45 |
46 | ## Step 1: Download the Operating System to a desktop/laptop.
47 |
48 | You'll need to download the correct version of the operating system, and then get it onto a SD in one of
49 | several ways.
50 |
51 | Skip ahead to 1A for OrangePi/Armbian; 1B for NOOBS on RPI3; 1C for NOOBS on RPI4;
52 | 1D for Rachel on RPI; 1E for Raspbian (without Rachel or IIAB); 1F for Intel Nuc
53 |
54 | #### Step 1A: Orange Pi with Armbian
55 | Download from https://www.armbian.com/download/
56 |
57 | These instructions were tested on the Orange Pi Zero, with the Debian variety which is currently "Buster",
58 | but the process probably works with other variants of the Orange-Pi, and versions of Armbian.
59 |
60 | Skip ahead to Step 2: Blow to a SD.
61 |
62 | #### Step 1B: NOOBS on RPI3
63 | On the RPI3 we started with a standard preconfigured NOOBS MicroSD card that came with the Canakit box.
64 |
65 | Skip ahead to Step 3: Boot
66 |
67 | #### Step 1C: NOOBS on RPI4
68 | The RPI4 from Canakit was strangely missing NOOBS, and the Raspberry Pi site is also missing NOOBS images,
69 | so this requires a slightly different approach than that detailed in 1B above.
70 | * Download the zip of NOOBS from https://www.raspberrypi.org/downloads/noobs/ and unzip it
71 | * On a Mac format open "Disk Utilities" and Erase the SD card with format "FAT".
72 | * Copy the NOOBS files to the SD card.
73 |
74 | Skip ahead to Step 3: Boot
75 |
76 | #### Step 1D: Rachel on Raspberry Pi
77 | Download [Raspbian Buster + Rachel](http://rachelfriends.org/downloads/public_ftp/rachelpi_64EN/rachelpi_2019/rachel-pi_kolibi_buster_unofficial.7z).
78 | Note this image may be moving to a different location soon.
79 |
80 | Skip ahead to Step 2: Blow to SD.
81 |
82 | #### Step 1E: Raspbian image without Rachel or IIAB
83 | Download [Raspbian](https://www.raspberrypi.org/downloads/raspbian/) to your laptop (~1GB).
84 | Any of the distributions should work - I test on the Desktop version,
85 |
86 | Skip ahead to Step 2: Blow to SD.
87 |
88 | #### Step 1F: Intel Nuc
89 | By the time you get it, it probably has an operating system on it
90 |
91 | TODO - Ask Davide for instructions.
92 |
93 | Skip ahead to Step 3D.
94 |
95 | ## Step 2: Blow this to a SD
96 |
97 | * Select an SD card as the destination (the larger the card, the more content it can hold)
98 | * Program the SD card with this image
99 |
100 | On a Mac: Downloaded [Etcher](https://www.balena.io/etcher/) (100Mb), install and run it.
101 | It will prompt you to select: the image you downloaded above, (and will accept .img or .zip files),
102 | and the SD card, and it should Flash and verify.
103 |
104 | On Windows or Linux, I'm not sure the best program to use and would appreciate suggestions.
105 |
106 | Skip ahead to Step 3 Boot: (3A for NOOBs; 3B for Raspbian; 3C for OrangePi/Armbian)
107 |
108 | ## Step 3: Boot and configure the Operating System
109 | You can now boot your machine - select the right platform below:
110 | NOOBS; OrangePi; Rachel; or Raspberry Pi;
111 | (skip to Step 4 if you are on Mac OSX)
112 |
113 | #### Step 3A: NOOBS on Raspberry Pi 3 or 4.
114 |
115 | NOOBS provides an easy way to select a specific operating system,
116 |
117 | Plug the SD card into the RPI, along with a power supply, HDMI display, keyboard and mouse.
118 | If at all possible insert Ethernet, otherwise it will work over WiFi with some extra steps.
119 |
120 | After it boots, it should offer you a choice of Operating Systems; we test on Raspbian Desktop
121 | - the current version we've tested against is called "Buster", but this should work for other choices.
122 |
123 | It should boot up into that operating system, and you can continue in step 3B
124 |
125 | #### Step 3B: Raspbian (without Rachel or IIAB)
126 |
127 | Plug the SD card into the RPI, along with a power supply, HDMI display, keyboard and mouse.
128 | If at all possible insert Ethernet, otherwise it will work over WiFi with some extra steps.
129 |
130 | It should boot up (after as much as a minute or two) with a rainbow screen
131 | and prompt you for some getting started things.
132 | * Follow the menus to Select country, language, WiFi etc,
133 | * in particular make sure to change the password as RPIs with default passwords are frequently hacked.
134 | * Select Menu (Raspberry at top Left); `Preferences` then `Raspberry Pi Configuration`
135 | then `Interfaces` and make sure `SSH` is enabled (by default it is Disabled).
136 | * It should reboot at least once (as part of saving these changes)
137 |
138 | If you want it to run as a **hotspot** - try following the instructions at ...
139 | https://thepi.io/how-to-use-your-raspberry-pi-as-a-wireless-access-point/
140 |
141 | You can now open a Terminal window, or from your laptop `ssh raspberrypi`,
142 | login as `pi` with the password you set above.
143 |
144 | ##### Workaround for Raspbian bug:
145 | Raspbian has a bug that requires a patch until they push it to a new release.
146 | It looks from https://github.com/raspberrypi/linux/issues/3271 like you need to do
147 | ```
148 | sudo rpi-update
149 | ```
150 | This should only be applicable until the Raspbian available at
151 | https://www.raspberrypi.org/downloads/raspbian/
152 | is dated newer than September 2019
153 |
154 | After applying that patch ...
155 |
156 | Skip to Step 4
157 |
158 | #### Step 3C Orange Pi (Zero) with Armbian
159 | Booting an Orange Pi Zero or similar is tricky as there is no display/keyboard and you need the IP address to connect.
160 | Insert the SD card then Ethernet and power.
161 | I found https://lucsmall.com/2017/01/19/beginners-guide-to-the-orange-pi-zero/ to be a useful guide if you have problems.
162 | The best way appears to be to log into your router and look for "orangepi" or similar in the DHCP table.
163 | Let's assume it's 192.168.0.55
164 |
165 | `ssh root@192.168.0.55` and respond to password with the default `1234`
166 |
167 | Change your password immediately - it should prompt you and create a new user, we recommend calling this "pi"
168 |
169 | Set your timezone with:
170 | ```
171 | sudo dpkg-reconfigure tzdata
172 | ```
173 |
174 | Typically you'll either want to a) connect to your WiFi access point and be a server on it,
175 | OR b) have the Armbian act as a WiFi point itself.
176 |
177 | If so, you can do this now, or come back and do this later.
178 |
179 | a) To setup for your wifi to access your Wifi access point.
180 | ```
181 | sudo nano /etc/network/interfaces
182 | ```
183 | And add these lines to the end, using your SSID (aka wifi name) and password
184 | ```
185 | auto wlan0
186 | iface wlan0 inet dhcp
187 | wpa-ssid your-ssid-name
188 | wpa-psk your-wifi-password
189 | ```
190 | Then start it with
191 | ```
192 | sudo ifup wlan0
193 | ```
194 | or b) to setup as a WiFi hotspot
195 | `sudo armbian-config` choose `network` then `hotspot`
196 | (Sorry for the incomplete notes, edits appreciated ... )
197 | * At some point it asks to "select interface" I think this is the point to pick wlan0 though its unclear whether
198 | this is the WiFi interface to use, or for backhaul?
199 | * TODO document process to change SSID
200 | * Note that once setup, it can take a minute or two for the WiFi access point to be visible.
201 | * Also note that it seems to pick unusual IP addresses, 172.24.1.1 was the gateway when I connected to it.
202 |
203 | * If anyone knows how to set this up from the command line a PR would be appreciated.
204 | * This doc might be helpful
205 | https://docs.armbian.com/User-Guide_Advanced-Features/#how-to-set-wireless-access-point
206 |
207 | Skip to Step 4
208 |
209 | #### Step 3C: Rachel
210 |
211 | * Insert the SD card and boot the RPI (RPI3 or RPI4 should work)
212 | * If at all possible insert Ethernet, otherwise it will work over WiFi with some extra steps.
213 | * Power up
214 | * Connect your laptop to the RACHEL-Pi Wifi - it should give you an address like 10.10.10.xxx
215 | * ssh to 10.10.10.xxx
216 | * Login as `pi` with password `rachel`
217 |
218 | Skip to Step 4
219 |
220 | #### Step 3D: Intel Nuc
221 |
222 | The standard install is missing a few packages.
223 | ```
224 | sudo apt update && sudo apt-get install curl net-tools ssh
225 | ```
226 | Try `node --version`; if it reports 8.x.x then you may be running Ubuntu
227 | which still seems stuck on old versions, version 10 or 12 are fine.
228 | ```
229 | curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
230 | sudo apt-cache policy nodejs # Should show v12.x.x, v10.x.x is fine, anything less is going to be a problem.
231 | sudo apt-get install -y nodejs
232 | ```
233 | Skip to step 4
234 |
235 | ## Step 4 Run the installer to install dweb-mirror
236 |
237 | By this point, the operating systems should be similar enough for our installation
238 | script to work out any differences, so the easiest way to finish the install
239 | is to run the installation script.
240 |
241 | In a terminal window, or via `ssh raspberrypi`
242 |
243 | We normally install it as a standard node_module under your home directory,
244 |
245 | If you prefer to install it somewhere else,
246 | `cd` to that directory before and the rest of the instructions below should work,
247 | but will need `~` substituting with the directory you started in.
248 |
249 |
250 | #### On most platforms (OrangePi+Armbian; Raspberry Pi without Rachel; Mac OSX)
251 | ```
252 | curl -o- -L https://unpkg.com/@internetarchive/dweb-mirror/install.sh | bash
253 | ```
254 | If it fails, it's safe to repeat this.
255 |
256 | #### On Rachel on Raspberry Pi
257 | There is a current problem running the script automatically, but this works...
258 | ```
259 | curl -o/tmp/install.sh -L https://unpkg.com/@internetarchive/dweb-mirror/install.sh
260 | chmod +x /tmp/install.sh
261 | sudo /tmp/install.sh
262 | ```
263 | If it fails, it's safe to rerun `/tmp/install.sh`
264 |
265 | ## Step 5. (Optionally) Edit configuration
266 |
267 | If you are doing anything non-standard, then you'll need to create and edit
268 | a local configuration file. Otherwise the application will create it the first time its needed.
269 | ```
270 | cd ~/node_modules/@internetarchive/dweb-mirror
271 |
272 | cp ./dweb-mirror.config.yaml ${HOME} # Copy sample to your home directory and edit,
273 | ```
274 | and edit `$HOME/dweb-mirror.config.yaml` for now see `configDefaults.yaml` for inline documentation.
275 |
276 | * `directories` if you plan on using places other than any of those in dweb-mirror.config.yaml
277 | (/.data/archiveorg, and any USBs on Rachel3+, NOOBS or IIAB)
278 | * `apps.crawl` includes a structure that lists what collections are to be installed,
279 | I suggest testing and then editing
280 |
281 | Note that directories specified in the config file can be written using familiar shell or unix conventions such as "~/" or "../".
282 |
283 | ### Step 6. Test crawling and browsing
284 |
285 | ### Step 6A Crawling
286 | Crawling will happen automatically, but you can also test it manually.
287 |
288 | From a command line:
289 |
290 | ```
291 | cd ~/node_modules/@internetarchive/dweb-mirror && ./internetarchive -sc &
292 | ```
293 | * starts the HTTP server
294 | * It might take 10-15 seconds to start, be patient
295 | * It should start crawling, and get just a minimal set of icons for the home page.
296 | * the startup is a little slow but you'll see some debugging when its live.
297 | * If you see a message like `Requeued fetch of https://dweb.archive.org/info failed` then it means it cannot see
298 | the archive's servers (on `dweb.archive.org`) so it won't be able to crawl or cache initial material until you
299 | connect to the WiFi or Ethernet.
300 |
301 | Without any other arguments, the crawl will read a set of files into the first (already existing) directory
302 | configured in `~/dweb-mirror.config.yaml`
303 | or if there are none there, it will look in its installation directory for `configDefaults.yaml`.
304 | If you haven't changed anything this will be `~/archiveorg`
305 |
306 | Look in that directory, and there should be sub-directories appearing for each item,
307 | with metadata and/or thumbnails.
308 |
309 | You can safely delete any of the crawled material and it will be re-fetched if needed.
310 |
311 | See [README.md](./README.md) for advanced crawling - its quite a powerful tool.
312 |
313 | ### Step 6B Browsing
314 |
315 | Open the web page - the address depends on the platform.
316 |
317 | * http://archive.local:4244 or http://archive:4244 should work on any platform,
318 | but this depends on the configuration of your LAN.
319 | * If you know the IP address then `http://<IP address>:4244` will work
320 | * On MacOSX (or if using a browser on the RaspberryPi/OrangePi): http://localhost:4244
321 | * On Rachel try http://rachel.local:4244 or http://rachel:4244
322 | or via the main interface at http://rachel.local and click Internet Archive
323 |
324 | ### Step 6C Troubleshooting
325 |
326 | To troubleshoot you'll often need to check both the browser and server logs.
327 |
328 | Browser logs on Firefox are in `Tools` > `Web Developer` > `Web Console`
329 | On Chrome its `View` > `Developer` > `Javascript Console`
330 | In both these platforms problems are usually displayed in pink, and you can
331 | ignore any "INSUFFICIENT RESOURCES" errors.
332 |
333 | Server logs depend on the platform.
334 | * On Mac/OSX: (it's in the same window you started the server from).
335 | * On most platforms.
336 | ```
337 | service internetarchive status
338 | ```
339 | Will get the status and most recent lines
340 | ```
341 | journalctl -u internetarchive
342 | ```
343 | Will get the most recent lines and
344 | ```
345 | journalctl -u internetarchive -f
346 | ```
347 | Will track the log `ctrl-C` to exit.
348 |
349 | On most platforms the logs are in `/var/log/daemon.log` if you want to analyse more deeply.
350 |
351 | Look for any “FAILING” or "ERROR" log lines which may indicate a problem
352 |
353 | ## Step 7. Auto-starting
354 |
355 | #### Step 7A: On anything except Mac OSX
356 | The server will autostart at reboot, or if it crashes.
357 | Restart your machine.
358 |
359 | ```
360 | sudo shutdown -r now
361 | ```
362 | You'll need to log back into the box when it comes back up.
363 | Check the browser address (Step 6B) to see that it's working,
364 | and
365 | ```
366 | service internetarchive status
367 | ```
368 | Should show it started and is pinging `/info`
369 |
370 | Skip to step 8
371 |
372 | #### Step 7B: On Mac OSX
373 | TODO - Note that I've currently had problems with getting Mac OSX to start automatically
374 | see [dweb-mirror issue#196](https://github.com/internetarchive/dweb-mirror/issues/196)
375 |
376 | If you want the server to start automatically when the mac boots.
377 | Run the following commands in a terminal window
378 |
379 | If you put the installation somewhere else, you'll need to edit `org.archive.mirror.plist` and
380 | change the line `${HOME}/node_modules/@internetarchive/dweb-mirror/internetarchive` to wherever you have dweb-mirror
381 | to be the path to "internetarchive"
382 | ```
383 | sudo cp ~/node_modules/@internetarchive/dweb-mirror/org.archive.mirror.plist /Library/LaunchAgents/org.archive.mirror.plist
384 | sudo launchctl load /Library/LaunchAgents/org.archive.mirror.plist
385 | ```
386 | Restart your machine and check that http://localhost:4244 still works.
387 |
388 | Skip to step 8
389 |
390 | ## Step 8. Updating
391 |
392 | The software is frequently revised so it's recommended to update, especially if you see any bugs or problems.
393 |
394 | The quickest way is
395 | ```
396 | cd ~ # or wherever you installed dweb-mirror in Step 4 above.
397 | yarn install
398 | yarn upgrade # Upgrade all packages
399 | ```
400 |
401 | But you can also rerun the install procedure in Step 4,
402 | which will skip steps that have already completed
403 | and just update things that have changed.
404 |
405 | ## Finally
406 |
407 | I recommend reading [README.md](./README.md) for more functionality, and [USING.md](./USING.md) for a tour.
408 |
--------------------------------------------------------------------------------