├── requirements.txt ├── .gitignore ├── install-collections.sh ├── run-uwsgi.sh ├── static └── shared.css ├── templates ├── error.html ├── index.html └── search.html ├── uwsgi.ini ├── config.yaml └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | pywb>=0.9.5 2 | boto 3 | gevent 4 | uwsgi 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #don't add collections, they're automatically synced 2 | 3 | collections/ 4 | -------------------------------------------------------------------------------- /install-collections.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -d "collections" ]; then 4 | mkdir collections 5 | fi 6 | 7 | s3cmd sync -r --exclude=* --include="cluster.idx" --include="metadata.yaml" s3://aws-publicdatasets/common-crawl/cc-index/collections/ collections/ 8 | -------------------------------------------------------------------------------- /run-uwsgi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # requires uwsgi 4 | pip install uwsgi 5 | 6 | # running with gevent 7 | pip install gevent 8 | 9 | if [ $? 
-ne 0 ]; then 10 | "uwsgi install failed" 11 | exit 1 12 | fi 13 | 14 | mypath=$(cd `dirname $0` && pwd) 15 | 16 | params="$mypath/uwsgi.ini" 17 | 18 | uwsgi $params 19 | -------------------------------------------------------------------------------- /static/shared.css: -------------------------------------------------------------------------------- 1 | body { 2 | font-family: sans-serif; 3 | } 4 | 5 | li { 6 | margin-bottom: 12px; 7 | } 8 | 9 | form { 10 | display: inline; 11 | } 12 | 13 | input[type=text] { 14 | width: 600px; 15 | font-size: 20px; 16 | } 17 | 18 | p { 19 | margin-top: 28px; 20 | margin-bottom: 8px; 21 | } 22 | 23 | -------------------------------------------------------------------------------- /templates/error.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 |Error Details:
12 |13 |
14 | {{ err_details }}
15 |
16 |
17 | {% endif %}
18 |
19 |
20 |
--------------------------------------------------------------------------------
/uwsgi.ini:
--------------------------------------------------------------------------------
1 | [uwsgi]
2 | # Listener setup: the two env-guarded sections below decide which sockets are opened.
3 | # If UPORT is set, open a uwsgi-protocol socket on that port.
4 | if-env = UPORT
5 | socket = :$(UPORT)
6 | endif =
7 | # If PORT is NOT set, serve plain HTTP on the default port 8080. NOTE(review): this guard tests PORT while the section above tests UPORT -- confirm the two names are meant to differ.
8 | if-not-env = PORT
9 | http-socket = :8080
10 | endif =
11 | # Virtualenv path is taken from the VIRTUAL_ENV environment variable. NOTE(review): not guarded by if-env -- confirm behavior when VIRTUAL_ENV is unset.
12 | venv = $(VIRTUAL_ENV)
13 | # Run on the gevent loop engine with 100 async cores and apply gevent monkey patching.
14 | gevent = 100
15 | gevent-monkey-patch =
16 |
17 | master = true
18 | #processes = 2
19 | buffer-size = 65536
20 | die-on-term = true
21 | # Config path is relative (./config.yaml), so it depends on the working directory uwsgi is started from.
22 | env = PYWB_CONFIG_FILE=./config.yaml
23 | wsgi = pywb.apps.wayback
24 |
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | #Common-Crawl CDX Server Config
2 | #archive_paths: https://aws-publicdatasets.s3.amazonaws.com/
3 | archive_paths: s3://aws-publicdatasets/
4 |
5 | # suffix to add to collection for cdx api
6 | enable_cdx_api: -index
7 |
8 | enable_memento: true
9 | # 'match' and 'replace' must be nested under shard_index_loc; \1 in 'replace' is the group captured by the 'match' regex
10 | shard_index_loc:
11 |     match: '.*(collections/[^/]+/)'
12 |     #replace: 'http://aws-publicdatasets.s3.amazonaws.com/common-crawl/cc-index/\1'
13 |     replace: 's3://aws-publicdatasets/common-crawl/cc-index/\1'
14 |
15 | # this is also the default page size
16 | max_blocks: 5
17 |
18 | # disable framed replay mode
19 | framed_replay: false
20 |
21 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
11 |
12 | 13 | Please see CDX Server API Reference for more examples on how to use the query api. 14 |
15 |16 | The CommonCrawl Index Client provides command-line tools for using this api. 17 |
18 | 19 |20 | Currently available index collections: 21 |
22 |39 | Powered by pywb 40 |
41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /templates/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
10 |
11 | | {{ key }}: | {{ val }} | 15 | {% endfor %} 16 |
|---|
(See the CDX Server API Reference for more advanced query options.)
36 |