├── .gitattributes
├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── lint.yml
    │   └── test.yml
├── .gitignore
├── .nojekyll
├── Dockerfile
├── LICENSE
├── README.md
├── action.yml
├── archive.py
├── assets
    └── img
    │   └── zulip.svg
├── default_settings.py
├── default_streams.yaml
├── entrypoint.sh
├── github.py
├── hosting.md
├── instructions.md
├── lib
    ├── __init__.py
    ├── common.py
    ├── date_helper.py
    ├── files.py
    ├── html.py
    ├── populate.py
    ├── sitemap.py
    ├── url.py
    ├── website.py
    └── zulip_data.py
├── requirements.txt
├── style.css
└── tests
    └── testCommon.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | *   text=auto eol=lf
2 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: zulip
2 | patreon: zulip
3 | open_collective: zulip
4 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: Linting
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   black:
 7 |     runs-on: ubuntu-latest
 8 |     name: Black linting
 9 |     steps:
10 |       - uses: actions/checkout@v3
11 |       - uses: psf/black@stable
12 |         with:
13 |           options: "--check"
14 |           src: "."
15 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     name: Test
13 |     steps:
14 |       - uses: actions/checkout@v3
15 |       - name: Setup Python 3.10
16 |         uses: actions/setup-python@v4
17 |         with:
18 |           python-version: "3.10"
19 |       - name: Install dependencies
20 |         run: |
21 |           pip install -r requirements.txt
22 |           pip install pytest
23 |       - name: Running Test-Suite on Linux
24 |         run: |
25 |           pytest tests/testCommon.py
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # configuration, caches, output
  2 | settings.py
  3 | streams.yaml
  4 | _json
  5 | archive
  6 | zuliprc
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | build/
 19 | develop-eggs/
 20 | dist/
 21 | downloads/
 22 | eggs/
 23 | .eggs/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | *.egg-info/
 30 | .installed.cfg
 31 | *.egg
 32 | MANIFEST
 33 | 
 34 | # PyInstaller
 35 | #  Usually these files are written by a python script from a template
 36 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 37 | *.manifest
 38 | *.spec
 39 | 
 40 | # Installer logs
 41 | pip-log.txt
 42 | pip-delete-this-directory.txt
 43 | 
 44 | # Unit test / coverage reports
 45 | htmlcov/
 46 | .tox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # pyenv
 82 | .python-version
 83 | 
 84 | # celery beat schedule file
 85 | celerybeat-schedule
 86 | 
 87 | # SageMath parsed files
 88 | *.sage.py
 89 | 
 90 | # Environments
 91 | .env
 92 | .venv
 93 | env/
 94 | venv/
 95 | ENV/
 96 | env.bak/
 97 | venv.bak/
 98 | bin/
 99 | include/
100 | lib/python3.6/
101 | pip-selfcheck.json
102 | share/
103 | 
104 | # Spyder project settings
105 | .spyderproject
106 | .spyproject
107 | 
108 | # Rope project settings
109 | .ropeproject
110 | 
111 | # mkdocs documentation
112 | /site
113 | 
114 | # mypy
115 | .mypy_cache/
116 | 
117 | # editors
118 | *.swp
119 | 


--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zulip/zulip-archive/663518053b8f19b58c36ee61a5ac425cd2604ba4/.nojekyll


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-alpine
2 | # --no-cache fetches the package index on demand and leaves no apk cache in the layer.
3 | RUN mkdir -p /zulip-archive && apk add --no-cache git curl
4 | 
5 | COPY . /zulip-archive-action/
6 | 
7 | ENTRYPOINT ["sh", "/zulip-archive-action/entrypoint.sh"]
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Robert Y. Lewis
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Zulip HTML archive
  2 | 
  3 | [![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
  4 | 
  5 | Generates an HTML archive of a configured set of streams within a
  6 | [Zulip](https://zulip.com) organization. It is common to archive all [public](https://zulip.com/help/stream-permissions) or [web-public](https://zulip.com/help/public-access-option) streams.
  7 | 
  8 | Example: [Lean Prover
  9 | archive](https://leanprover-community.github.io/archive/).
 10 | 
 11 | `zulip-archive` works by downloading Zulip message history via the
 12 | API, storing it in JSON files, maintaining its local archive with
 13 | incremental updates, and turning those JSON files into the HTML
 14 | archive.
 15 | 
 16 | This archive tool is often used in addition to enabling the [public access option](https://zulip.com/help/public-access-option) for your organization, which lets administrators configure selected streams to be web-public. Web-public streams can be viewed by anyone on the Internet without creating an account in your organization. The public access option does not yet support search engine indexing, which makes this archive tool a good option if it's important for your organization's chat history to appear in search results. It is easy to configure `zulip-archive` to automatically archive all web-public streams in your organization.
 17 | 
 18 | ### Contents
 19 | * [Running zulip-archive as a GitHub action](#running-zulip-archive-as-a-github-action)
 20 | * [Running zulip-archive without GitHub actions](#running-zulip-archive-without-github-actions)
 21 | * [Why archive](#why-archive)
 22 | * [Contributing and future plans](#contributing-and-future-plans)
 23 | 
 24 | ## Running zulip-archive as a GitHub action
 25 | 
 26 | Running `zulip-archive` as a GitHub action is the easiest way to get up and running. The action will periodically sync a GitHub repository with the latest messages, and publish the archive website using GitHub pages. Follow the steps below to set up a `zulip-archive` GitHub action in a few minutes.
 27 | 
 28 | ### Step 1 - Create a repository for running the action
 29 | 
 30 | It's best to use a dedicated repository for running the action. You can create a new repository at https://github.com/new/.
 31 | 
 32 | ### Step 2 - Generate credentials
 33 | 
 34 | The GitHub action requires a Zulip API key in order to run. The key is used for fetching messages in public streams in your Zulip organization. It is strongly recommended that you [create a bot](https://zulip.com/help/add-a-bot-or-integration) and use its zuliprc, rather than using your personal zuliprc.
 35 | 
 36 | ### Step 3 - Store credentials as secrets in the repository
 37 | 
 38 | The credentials for your bot need to be stored in the repository as secrets, so that the action can access them during run time. You can create secrets in your repository at `https://github.com/<username>/<repo-name>/settings/secrets`, where `<username>` is your GitHub username, and `<repo-name>` is the name of the repository you are using.
 39 | 
 40 | You will need to create the following secret. Use the credentials generated in the above step as the value of each secret.
 41 | 
 42 | |Secret name   | Value                                                |
 43 | |--------------|------------------------------------------------------|
 44 | |zuliprc       | The file content of the zuliprc obtained from step 2 |
 45 | 
 46 | ### Step 4 - Enable GitHub Pages or set up base URL
 47 | 
 48 | Go to `https://github.com/<username>/<repo-name>/settings/pages`, select `main` (or a branch of your choosing), and `/` as the folder. Save the changes. The base URL of the generated site will be resolved to GitHub Pages, i.e., `https://<username>.github.io/<repo-name>` or the configured custom domain name.
 49 | 
 50 | Alternatively, you can configure the `site_url` option to set the base URL explicitly. This option is useful in situations where you are not using GitHub Pages.
 51 | 
 52 | ### Step 5 - Configure the streams you want to index
 53 | 
 54 | You will need to configure which streams will be indexed by `zulip-archive` by creating a `streams.yaml` file in the repository you are using for the GitHub action. As a starting point, you can make a copy of the default configuration file: `cp default_streams.yaml streams.yaml`
 55 | 
 56 | To index all the [web-public streams](https://zulip.com/help/public-access-option) in your organization, set the following as the content of your `streams.yaml` file.
 57 | 
 58 | ```yaml
 59 | included:
 60 |   - 'web-public:*'
 61 | ```
 62 | 
 63 | To index all the [public streams](https://zulip.com/help/stream-permissions), set the following as the content of your `streams.yaml` file. Note that public streams include all web-public streams.
 64 | 
 65 | ```yaml
 66 | included:
 67 |   - '*'
 68 | ```
 69 | 
 70 | You can exclude specific public streams by placing them under the `excluded` key.
 71 | 
 72 | ```yaml
 73 | included:
 74 |   - '*'
 75 | 
 76 | excluded:
 77 |   - general
 78 |   - development help
 79 | ```
 80 | 
 81 | Alternatively, you can specify only the streams that you want to index.
 82 | 
 83 | ```yaml
 84 | included:
 85 |   - python
 86 |   - data structures
 87 |   - javascript
 88 | ```
 89 | 
 90 | ### Step 6 - Enable the zulip-archive action
 91 | 
 92 | Enable the action by creating a file called `.github/workflows/main.yaml`:
 93 | 
 94 | #### Sample `main.yaml` file
 95 | 
 96 | ```yaml
 97 | on:
 98 |   schedule:
 99 |    - cron: '*/20 * * * *'
100 | 
101 | jobs:
102 |   publish_archive_job:
103 |     runs-on: ubuntu-latest
104 |     name: A job to publish zulip-archive in GitHub pages
105 |     steps:
106 |     - name: Checkout
107 |       uses: actions/checkout@v3
108 |     - name: Run archive
109 |       id: archive
110 |       uses: zulip/zulip-archive@master
111 |       with:
112 |         zuliprc: ${{ secrets.ZULIPRC }}
113 |         # Using the GitHub Token that is provided automatically by GitHub Actions
114 |         # (no setup needed).
115 |         github_token: ${{ secrets.GITHUB_TOKEN }}
116 |         delete_history: true
117 |         archive_branch: main
118 | ```
119 | 
120 | #### Configure run frequency
121 | 
122 | The above file tells GitHub to run the `zulip-archive` action every 20 minutes. You can [adjust](https://en.wikipedia.org/wiki/Cron) the `cron` key to modify the schedule as you feel appropriate.
123 | 
124 | If your Zulip organization's history is very large (not the case for most users), it is recommended that you initially increase the time between runs to an hour or longer (e.g., `'0 * * * *'`). This is because the initial archive run that fetches the messages for the first time will take a long time, and you don't want the second cron job to start before the first run is completed. After the initial run, you can shorten the cron job period as desired.
125 | 
126 | #### Configure `delete_history` option
127 | 
128 | If you are running frequent updates with a busy Zulip organization,
129 | the Git repository that you use to run the action will grow very
130 | quickly. In this situation, it is recommended that you set the `delete_history` option to
131 | `true`. This will overwrite the Git _history_ in the repository, but
132 | keep all the _content_. If you are using the repository for more than
133 | just the Zulip archive (not recommended), you may want to set the `delete_history` flag to `false`, but be
134 | warned that the repository size may explode.
135 | 
136 | ### Step 7 - Verify that everything works
137 | 
138 | Finally, verify that everything is working as expected. You can track the status of the action by visiting `https://github.com/<github-username>/<repo-name>/actions`. Once the initial run is completed, you should be able to visit the archive by opening the link provided at the end of the action run log. The link will generally be of the form `<github-username>.github.io/<repo-name>`, or `<your-personal-domain>/<repo-name>` if you have configured your own personal domain to point to GitHub pages.
139 | 
140 | If you configured the `site_url` option, the archive will be available at that URL instead.
141 | 
142 | ## Running zulip-archive without GitHub actions
143 | 
144 | For most users, running `zulip-archive` as a GitHub action should be good enough. If you want to run `zulip-archive` on your own server or do something else, see the [instructions](instructions.md) docs. The [hosting docs](hosting.md) also offer a few suggestions for good ways to host the output of this tool.
145 | 
146 | ## Why archive?
147 | 
148 | The best place to participate actively in a Zulip community is an app
149 | that tracks unread messages, formats messages properly, and is
150 | designed for efficient interaction.  However, there are several use
151 | cases where this HTML archive tool is a valuable complement to the
152 | Zulip apps:
153 | 
154 | * A public HTML archive can be indexed by search engines and doesn't
155 |   require authentication to access.  For open source projects and
156 |   other open communities, this provides a convenient search and
157 |   browsing experience for users who may not want to sign up for an account
158 |   just to find previous answers to common questions.
159 | 
160 | * It's common to set up Zulip instances for one-time events such as
161 |   conferences, retreats, or hackathons.  Once the event ends, you may
162 |   want to shut down the Zulip instance for operational convenience,
163 |   but still want an archive of the communications.
164 | 
165 | * You may also decide to shut down a Zulip instance, whether to move
166 |   to another communication tool, to deduplicate instances, or because
167 |   your organization is shutting down.  You can always [export your
168 |   Zulip data](https://zulip.com/help/export-your-organization),
169 |   but the other tool may not be able to import it.  In such a case,
170 |   you can use this archive tool to keep the old conversations
171 |   accessible. (Contrast this to scenarios where your provider locks
172 |   you in to a solution, even when folks are dissatisfied with the
173 |   tool, because they own the data.)
174 | 
175 | * You may also want to publish your conversations outside of Zulip for
176 |   branding reasons or to integrate with other data.  You can modify
177 |   the tools here as needed for that.  You own your own data.
178 | 
179 | 
180 | ## Contributing and future plans
181 | 
182 | Feedback, issues, and pull requests are encouraged!  Our goal is for
183 | this project to support the needs of any community looking for an HTML
184 | archive of their Zulip organization's history, through just
185 | configuration changes.  So please report even minor inconveniences,
186 | either via a GitHub issue or by posting in the
187 | [#integrations](https://chat.zulip.org/#narrow/stream/127-integrations/) stream
188 | in the [Zulip development community](https://zulip.com/development-community/).
189 | 
190 | This project is licensed under the MIT license.
191 | 
192 | Author: [Robert Y. Lewis](https://robertylewis.com/) ([@robertylewis](https://github.com/robertylewis))
193 | 


--------------------------------------------------------------------------------
/action.yml:
--------------------------------------------------------------------------------
 1 | # action.yml
 2 | name: 'Zulip Archive'
 3 | description: 'Publish Zulip archive in GitHub pages'
 4 | inputs:
 5 |   zulip_organization_url:
 6 |     description: 'URL of Zulip organization'
 7 |     required: true
 8 |   zulip_bot_email:
 9 |     description: 'Email of the Zulip bot'
10 |     required: true
11 |   zulip_bot_key:
12 |     description: 'API key of the Zulip bot'
13 |     required: true
14 |   github_personal_access_token:
15 |     description: 'GitHub personal access token (deprecated)'
16 |     required: false
17 |     deprecationMessage: 'Please use `github_token` instead'
18 |   github_token:
19 |     description: 'GitHub Token/GitHub Personal Access Token'
20 |     required: true
21 |   delete_history:
22 |     description: 'If enabled, will delete the archive history while keeping the most recent version'
23 |     required: false
24 |     default: false
25 |   archive_branch:
26 |     description: 'Branch where to commit archive files (should coincide with GH Pages branch)'
27 |     # legacy
28 |     required: false
29 |     default: 'master'
30 |   zuliprc:
31 |     description: 'zuliprc of the Zulip bot'
32 |     required: true
33 |   site_url:
34 |     description: 'Base URL for the site. If not configured, this action will try to resolve the base URL as GH pages.'
35 |     required: false
36 | runs:
37 |   using: 'docker'
38 |   image: 'Dockerfile'
39 |   args:
40 |     - ${{ inputs.zulip_organization_url }}
41 |     - ${{ inputs.zulip_bot_email }}
42 |     - ${{ inputs.zulip_bot_key }}
43 |     - ${{ inputs.github_token }}
44 |     - ${{ inputs.delete_history }}
45 |     - ${{ inputs.archive_branch }}
46 |     - ${{ inputs.github_personal_access_token }}
47 | 


--------------------------------------------------------------------------------
/archive.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | """
  4 | This is the main program for the Zulip archive system.  For help:
  5 | 
  6 |     python archive.py -h
  7 | 
  8 | Note that this actual file mostly does the following:
  9 | 
 10 |     parse command line arguments
 11 |     check some settings from settings.py
 12 |     complain if you haven't made certain directories
 13 | 
 14 | The actual work is done in two main libraries:
 15 | 
 16 |     lib/html.py
 17 |     lib/populate.py
 18 | """
 19 | 
 20 | 
 21 | # The workflow (timing for the leanprover Zulip chat, on my slow laptop):
 22 | # - populate_all() builds a json file in `settings.json_directory` for each topic,
 23 | #   containing message data and an index json file mapping streams to their topics.
 24 | #   This uses the Zulip API and takes ~10 minutes to crawl the whole chat.
 25 | # - populate_incremental() assumes there is already a json cache and collects only new messages.
 26 | # - build_website() builds the website
 27 | # - See hosting.md for suggestions on hosting.
 28 | #
 29 | 
 30 | import sys
 31 | 
 32 | if sys.version_info < (3, 6):
 33 |     version_error = " Python version must be 3.6 or higher\n\
 34 |             Your current version of python is {}.{}\n\
 35 |             Please try again with python3.".format(
 36 |         sys.version_info.major, sys.version_info.minor
 37 |     )
 38 |     raise Exception(version_error)
 39 | import argparse
 40 | import configparser
 41 | import os
 42 | import zulip
 43 | 
 44 | from lib.common import stream_validator, exit_immediately
 45 | 
 46 | # Most of the heavy lifting is done by the following modules:
 47 | 
 48 | from lib.populate import populate_all, populate_incremental
 49 | 
 50 | from lib.website import build_website
 51 | 
 52 | from lib.sitemap import build_sitemap
 53 | 
 54 | try:
 55 |     import settings
 56 | except ModuleNotFoundError:
 57 |     # TODO: Add better instructions.
 58 |     exit_immediately(
 59 |         """
 60 |     We can't find settings.py.
 61 | 
 62 |     Please copy default_settings.py to settings.py
 63 |     and then edit the settings.py file to fit your use case.
 64 | 
 65 |     For testing, you can often leave the default settings,
 66 |     but you will still want to review them first.
 67 |     """
 68 |     )
 69 | 
 70 | NO_JSON_DIR_ERROR_WRITE = """
 71 | We cannot find a place to write JSON files.
 72 | 
 73 | Please run the below command:
 74 | 
 75 | mkdir {}"""
 76 | 
 77 | NO_JSON_DIR_ERROR_READ = """
 78 | We cannot find a place to read JSON files.
 79 | 
 80 | Please run the below command:
 81 | 
 82 | mkdir {}
 83 | 
 84 | And then fetch the JSON:
 85 | 
 86 | python archive.py -t"""
 87 | 
 88 | NO_HTML_DIR_ERROR = """
 89 | We cannot find a place to write HTML files.
 90 | 
 91 | Please run the below command:
 92 | 
 93 | mkdir {}"""
 94 | 
 95 | 
 96 | def get_json_directory(for_writing):
 97 |     json_dir = settings.json_directory
 98 | 
 99 |     if not json_dir.exists():
100 |         # I use posix paths here, since even on Windows folks will
101 |         # probably be using some kinda Unix-y shell to run mkdir.
102 |         if for_writing:
103 |             error_msg = NO_JSON_DIR_ERROR_WRITE.format(json_dir.as_posix())
104 |         else:
105 |             error_msg = NO_JSON_DIR_ERROR_READ.format(json_dir.as_posix())
106 | 
107 |         exit_immediately(error_msg)
108 | 
109 |     if not json_dir.is_dir():
110 |         exit_immediately(str(json_dir) + " needs to be a directory")
111 | 
112 |     return settings.json_directory
113 | 
114 | 
115 | def get_html_directory():
116 |     html_dir = settings.html_directory
117 | 
118 |     if not html_dir.exists():
119 |         error_msg = NO_HTML_DIR_ERROR.format(html_dir.as_posix())
120 | 
121 |         exit_immediately(error_msg)
122 | 
123 |     if not html_dir.is_dir():
124 |         exit_immediately(str(html_dir) + " needs to be a directory")
125 | 
126 |     return settings.html_directory
127 | 
128 | 
129 | def get_client_info():
130 |     config_file = "./zuliprc"
131 |     client = zulip.Client(config_file=config_file)
132 | 
133 |     # It would be convenient if the Zulip client object
134 |     # had a `site` field, but instead I just re-read the file
135 |     # directly to get it.
136 |     config = configparser.RawConfigParser()
137 |     config.read(config_file)
138 |     zulip_url = config.get("api", "site")
139 | 
140 |     return client, zulip_url
141 | 
142 | 
143 | def run():
144 |     parser = argparse.ArgumentParser(
145 |         description="Build an html archive of the Zulip chat."
146 |     )
147 |     parser.add_argument(
148 |         "-b", action="store_true", default=False, help="Build .md files"
149 |     )
150 |     parser.add_argument(
151 |         "--no-sitemap",
152 |         action="store_true",
153 |         default=False,
154 |         help="Don't build sitemap files",
155 |     )
156 |     parser.add_argument(
157 |         "-t", action="store_true", default=False, help="Make a clean json archive"
158 |     )
159 |     parser.add_argument(
160 |         "-i",
161 |         action="store_true",
162 |         default=False,
163 |         help="Incrementally update the json archive",
164 |     )
165 | 
166 |     results = parser.parse_args()
167 | 
168 |     if results.t and results.i:
169 |         print("Cannot perform both a total and incremental update. Use -t or -i.")
170 |         exit(1)
171 | 
172 |     if not (results.t or results.i or results.b):
173 |         print("\nERROR!\n\nYou have not specified any work to do.\n")
174 |         parser.print_help()
175 |         exit(1)
176 | 
177 |     json_root = get_json_directory(for_writing=results.t)
178 | 
179 |     # The directory where this archive.py is located
180 |     repo_root = os.path.dirname(os.path.realpath(__file__))
181 | 
182 |     if results.b:
183 |         md_root = get_html_directory()
184 | 
185 |     if results.t or results.i:
186 |         is_valid_stream_name = stream_validator(settings)
187 | 
188 |     client, zulip_url = get_client_info()
189 | 
190 |     if results.t:
191 |         populate_all(
192 |             client,
193 |             json_root,
194 |             is_valid_stream_name,
195 |         )
196 | 
197 |     elif results.i:
198 |         populate_incremental(
199 |             client,
200 |             json_root,
201 |             is_valid_stream_name,
202 |         )
203 | 
204 |     if results.b:
205 |         build_website(
206 |             json_root,
207 |             md_root,
208 |             settings.site_url,
209 |             settings.html_root,
210 |             settings.title,
211 |             zulip_url,
212 |             settings.zulip_icon_url,
213 |             repo_root,
214 |             settings.page_head_html,
215 |             settings.page_footer_html,
216 |         )
217 |         if not results.no_sitemap:
218 |             build_sitemap(settings.site_url, md_root.as_posix(), md_root.as_posix())
219 | 
220 | 
221 | if __name__ == "__main__":
222 |     run()
223 | 


--------------------------------------------------------------------------------
/assets/img/zulip.svg:
--------------------------------------------------------------------------------
 1 | <svg xmlns="http://www.w3.org/2000/svg" viewBox="49.99 49.99 673.14 673.14">
 2 |   <linearGradient id="a" x1="0" y1="0" x2="0" y2="1">
 3 |     <stop offset="0" stop-color="#50adff" />
 4 |     <stop offset="1" stop-color="#7877fc" />
 5 |   </linearGradient>
 6 |   <path
 7 |     d="M688.52 150.67c0 33.91-15.23 64.04-38.44 82.31L424.79 434.17c-4.18 3.59-9.62-2.19-6.61-7.03l82.64-165.46c2.31-4.63-.69-10.33-5.44-10.33H174.86c-49.64 0-90.26-45.31-90.26-100.68 0-55.37 40.62-100.68 90.26-100.68h423.39c49.65 0 90.27 45.31 90.27 100.68zM174.86 723.13h423.39c49.64 0 90.26-45.31 90.26-100.68 0-55.37-40.62-100.68-90.26-100.68H277.73c-4.75 0-7.76-5.7-5.44-10.33l82.64-165.46c3.01-4.83-2.42-10.62-6.61-7.03L123.04 540.14c-23.21 18.27-38.44 48.4-38.44 82.31 0 55.37 40.62 100.68 90.26 100.68z"
 8 |     fill="url(#a)"
 9 |   />
10 | </svg>
11 | 


--------------------------------------------------------------------------------
/default_settings.py:
--------------------------------------------------------------------------------
  1 | # Welcome to default_settings.py!  You will want to modify these values
  2 | # for your own needs and then copy them to settings.py.  Copying them
  3 | # in the same directory is the most likely choice here:
  4 | #
  5 | #    cp default_settings.py settings.py
  6 | #    <edit> settings.py
  7 | #
  8 | # If you prefer to keep the settings elsewhere, just make sure they
  9 | # are in your Python path.
 10 | 
 11 | import os
 12 | import yaml
 13 | from pathlib import Path
 14 | 
 15 | """
 16 | You generally want to start in debug mode to test out the archive,
 17 | and then set PROD_ARCHIVE to turn on production settings here.  In
 18 | production you usually change two things--the site_url and your
 19 | html_directory.
 20 | """
 21 | 
 22 | if os.getenv("PROD_ARCHIVE"):
 23 |     DEBUG = False
 24 | else:
 25 |     DEBUG = True
 26 | 
 27 | """
 28 | Set the site url.  The default below is good for local testing, but you will
 29 | definitely need to set your own value for prod.
 30 | """
 31 | 
 32 | if DEBUG:
 33 |     site_url = "http://127.0.0.1:4000"
 34 | else:
 35 |     site_url = os.getenv("SITE_URL")
 36 |     if not site_url:
 37 |         raise Exception("You need to configure site_url for prod")
 38 | 
 39 | """
 40 | Set the zulip icon url.  Folks can press the icon to see a
 41 | message in the actual Zulip instance.
 42 | """
 43 | 
 44 | if DEBUG:
 45 |     zulip_icon_url = "http://127.0.0.1:4000/assets/img/zulip.svg"
 46 | else:
 47 |     # Set this according to how you serve your prod assets.
 48 |     zulip_icon_url = os.getenv("ZULIP_ICON_URL", None)
 49 | 
 50 | 
 51 | """
 52 | Set the HTML title of your Zulip archive here.
 53 | """
 54 | title = "Zulip Chat Archive"  # Modify me!
 55 | 
 56 | """
 57 | Set the path prefix of your URLs for your website.
 58 | 
 59 | For example, you might want your main page to have
 60 | the path of archive/index.html
 61 | """
 62 | html_root = os.getenv("HTML_ROOT", "archive")  # Modify me!
 63 | 
 64 | """
 65 | When we get content from your Zulip instance, we first create
 66 | JSON files that include all of the content data from the Zulip
 67 | instance.  Having the data in JSON makes it easy to incrementally
 68 | update your data as new messages come in.
 69 | 
 70 | You will want to put this in a permanent location outside of
 71 | your repo.  Here we assume a sibling directory named zulip_json, but
 72 | you may prefer another directory structure.
 73 | """
 74 | 
 75 | json_directory = Path(os.getenv("JSON_DIRECTORY", "../zulip_json"))
 76 | 
 77 | """
 78 | We write HTML to here.
 79 | """
 80 | if DEBUG:
 81 |     html_directory = Path("./archive")  # Modify me!
 82 | else:
 83 |     try:
 84 |         html_directory = Path(os.getenv("HTML_DIRECTORY", None))
 85 |     except TypeError:
 86 |         raise Exception(
 87 |             """
 88 |             You need to set html_directory for prod, and it
 89 |             should be a different location than DEBUG mode,
 90 |             since files will likely have different urls in
 91 |             anchor tags.
 92 |             """
 93 |         )
 94 | 
 95 | 
 96 | """
 97 | This is where you modify the <head> section of every page.
 98 | """
 99 | page_head_html = (
100 |     '<html>\n<head><meta charset="utf-8"><title>Zulip Chat Archive</title></head>\n'
101 | )
102 | 
103 | """
104 | This is where you modify the <footer> section of every page.
105 | """
106 | page_footer_html = "\n</html>"
107 | 
108 | 
109 | """
110 | You may only want to include certain streams.  In `streams.yaml`
111 | file, mention the streams you want to include under `included` section.
112 | 
113 | Example
114 | ---
115 | 
116 | included:
117 |   - general
118 |   - javascript
119 |   - data structures
120 | 
121 | 
122 | A few wildcard operators are supported.
123 | 
124 | Example
125 | ---
126 | 
127 | included:
128 |   - 'web-public:*'
129 | 
130 | Using 'web-public:*' includes all the **web-public streams** in the
131 | Zulip organization. Using 'public:*' includes all the **public
132 | streams** in Zulip archive (`*` will do the same thing, for
133 | backwards-compatibility).  You can make the settings more restrictive
134 | than that, but not the opposite direction.
135 | 
136 | If you want to exclude some public streams, mention them in the
137 | `excluded` category in `streams.yaml`.
138 | 
139 | Example:
140 | ---
141 | 
142 | excluded:
143 |   - checkins
144 |   - development help
145 | 
146 | """
147 | 
148 | try:
149 |     with open("streams.yaml") as f:
150 |         streams = yaml.load(f, Loader=yaml.BaseLoader)
151 |         if "included" not in streams or not streams["included"]:
152 |             raise Exception(
153 |                 "Please specify the streams to be included under `included` section in streams.yaml file"
154 |             )
155 |         included_streams = streams["included"]
156 | 
157 |         excluded_streams = []
158 |         if "excluded" in streams and streams["excluded"]:
159 |             excluded_streams = streams["excluded"]
160 | 
161 | except FileNotFoundError:
162 |     raise Exception("Missing streams.yaml file")
163 | 


--------------------------------------------------------------------------------
/default_streams.yaml:
--------------------------------------------------------------------------------
 1 | included:
 2 | - 'web-public:*'
 3 | # - general
 4 | # - javascript
 5 | # - data structures
 6 | # - 'public:*'
 7 | 
 8 | excluded:
 9 | # - checkins
10 | # - development help
11 | 


--------------------------------------------------------------------------------
/entrypoint.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | zulip_organization_url=$1
  5 | zulip_bot_email=$2
  6 | zulip_bot_api_key=$3
  7 | github_token=$INPUT_GITHUB_TOKEN
  8 | delete_history=$5
  9 | archive_branch=$6
 10 | github_personal_access_token=$7
 11 | zuliprc=$INPUT_ZULIPRC
 12 | site_url=$INPUT_SITE_URL
 13 | 
 14 | github_personal_access_token=${github_personal_access_token:-NOT_SET}
 15 | # The 7th argument is a retired input: stop with migration help if it was supplied.
 16 | if [ "$github_personal_access_token" != "NOT_SET" ]; then
 17 |     echo "'github_personal_access_token' input has been deprecated."
 18 |     echo "To migrate to the new setup, you have to replace it with"
 19 |     echo "github_token. For more info, see"
 20 |     echo 'https://github.com/zulip/zulip-archive#step-5---enable-zulip-archive-action'
 21 |     exit 1
 22 | fi
 23 | 
 24 | # This is a temporary workaround.
 25 | # See https://github.com/actions/checkout/issues/766
 26 | git config --global --add safe.directory "$GITHUB_WORKSPACE"
 27 | 
 28 | checked_out_repo_path="$(pwd)"
 29 | html_dir_path=$checked_out_repo_path
 30 | json_dir_path="${checked_out_repo_path}/zulip_json"
 31 | img_dir_path="${checked_out_repo_path}/assets/img"
 32 | streams_config_file_path="${checked_out_repo_path}/streams.yaml"
 33 | initial_sha="$(git rev-parse HEAD)"
 34 | 
 35 | if [ ! -f "$streams_config_file_path" ]; then  # quoted: the path may contain spaces
 36 |     echo "Missing streams.yaml file."
 37 |     exit 1
 38 | fi
 39 | 
 40 | cd "/zulip-archive-action"
 41 | 
 42 | curl "https://bootstrap.pypa.io/get-pip.py" -o "get-pip.py"
 43 | python3 get-pip.py
 44 | 
 45 | pip install virtualenv
 46 | virtualenv -p python3 .
 47 | source bin/activate
 48 | pip3 install -r requirements.txt
 49 | # crudini is not available as an Alpine pkg, so we install via pip.
 50 | pip3 install crudini
 51 | 
 52 | if [ -z "$site_url" ]; then
 53 |     echo "Setting up site URL from GitHub pages API"
 54 |     # Uses GitHub pages API
 55 |     # https://docs.github.com/en/rest/pages
 56 |     auth_header="Authorization: Bearer ${github_token}"
 57 |     accept_header="Accept: application/vnd.github+json"
 58 |     version_header="X-GitHub-Api-Version: 2022-11-28"
 59 |     page_api_url="https://api.github.com/repos/${GITHUB_REPOSITORY}/pages"
 60 | 
 61 |     print_site_url_code="import sys, json; print(json.load(sys.stdin)['html_url'])"
 62 |     # Get the GitHub pages URL
 63 |     github_pages_url_with_trailing_slash=$(curl -L -H "$accept_header" -H "$auth_header" -H "$version_header" "$page_api_url" | python3 -c "${print_site_url_code}")
 64 |     site_url=${github_pages_url_with_trailing_slash%/}
 65 | else
 66 |     site_url=${site_url%/}
 67 | fi
 68 | 
 69 | cp default_settings.py settings.py
 70 | cp "$streams_config_file_path" .  # quoted: the path may contain spaces
 71 | 
 72 | if [ -z "$zuliprc" ]; then
 73 | 	echo "Setting up Zulip details via 3 variables (zulip_organization_url, zulip_bot_key, zulip_bot_email)"
 74 | 	echo "is deprecated. The current simpler method is to just set the zuliprc content in the GH secrets."
 75 | 	crudini --set zuliprc api site "$zulip_organization_url"
 76 | 	crudini --set zuliprc api key "$zulip_bot_api_key"
 77 | 	crudini --set zuliprc api email "$zulip_bot_email"
 78 | else
 79 | 	echo "$zuliprc" > zuliprc
 80 | fi
 81 | 
 82 | export PROD_ARCHIVE=true
 83 | export SITE_URL=$site_url
 84 | export HTML_DIRECTORY=$html_dir_path
 85 | export JSON_DIRECTORY=$json_dir_path
 86 | export HTML_ROOT=""
 87 | export ZULIP_ICON_URL="${site_url}/assets/img/zulip.svg"
 88 | 
 89 | if [ ! -d "$json_dir_path" ]; then
 90 |     # No JSON cache yet: create it, copy the image assets, then do a full download.
 91 |     mkdir -p "$json_dir_path"
 92 |     mkdir -p "$img_dir_path"
 93 |     cp assets/img/* "$img_dir_path"
 94 | 
 95 |     python3 archive.py -t
 96 | else
 97 |     python3 archive.py -i  # cache exists: only fetch messages newer than the cache
 98 | fi
 99 | 
100 | 
101 | python3 archive.py -b
102 | 
103 | cd ${checked_out_repo_path}
104 | 
105 | git checkout $archive_branch
106 | 
107 | git fetch origin
108 | 
109 | current_sha="$(git rev-parse origin/${archive_branch})"
110 | 
111 | if [[ "$current_sha" != "$initial_sha" ]]
112 | then
113 |   echo "Archive update failed, commits have been added while processing"
114 |   exit 1
115 | fi
116 | 
117 | echo "delete history: $delete_history"
118 | 
119 | git config --global user.email "zulip-archive-bot@users.noreply.github.com"
120 | git config --global user.name "Archive Bot"
121 | 
122 | git add -A
123 | if [[ "$delete_history" == "true" ]]
124 | then
125 | 	git commit --amend --reset-author -m "Update archive."
126 | 	# Cleanup loose objects
127 | 	git gc
128 | else
129 | 	git commit -m "Update archive."
130 | fi
131 | 
132 | git remote add origin2 "https://${GITHUB_ACTOR}:${github_token}@github.com/${GITHUB_REPOSITORY}"
133 | 
134 | # Forced push: with delete_history the amended commit replaces the branch tip.
135 | git push origin2 "HEAD:${archive_branch}" -f
136 | 
137 | echo "pushed"
138 | echo "Zulip Archive published/updated in ${site_url}"
139 | 


--------------------------------------------------------------------------------
/github.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | """
 4 | WARNING!
 5 | 
 6 | This script includes example code from the Lean Prover community, who
 7 | used this repo to store the Zulip content as well as the code.  We
 8 | recommend to most folks to create a **separate** repo for your
 9 | content, even if you are using Github to serve the content, and expect
10 | to convert this tool to a supported option based on that model.
11 | """
12 | 
13 | from datetime import datetime
14 | import time, argparse, subprocess
15 | 
16 | parser = argparse.ArgumentParser(description="Push/pull repo.")
17 | 
18 | 
19 | # resets the current repository to match origin/master
20 | def github_pull():
21 |     print(subprocess.check_output(["git", "fetch", "origin", "master"]))
22 |     print(subprocess.check_output(["git", "reset", "--hard", "origin/master"]))
23 | 
24 | 
25 | # commits changes in archive/ and pushes the current repository to origin/master
26 | def github_push():
27 |     print(subprocess.check_output(["git", "add", "archive/*"]))
28 |     print(subprocess.check_output(["git", "add", "_includes/archive_update.html"]))
29 |     print(
30 |         subprocess.check_output(
31 |             [
32 |                 "git",
33 |                 "commit",
34 |                 "-m",
35 |                 "auto update: {}".format(
36 |                     datetime.utcfromtimestamp(time.time()).strftime(
37 |                         "%b %d %Y at %H:%M UTC"
38 |                     )
39 |                 ),
40 |             ]
41 |         )
42 |     )
43 |     print(subprocess.check_output(["git", "push"]))
44 | 
45 | 
46 | parser.add_argument(
47 |     "-f",
48 |     action="store_true",
49 |     default=False,
50 |     help="Pull from GitHub before updating. (Warning: could overwrite this script.)",
51 | )
52 | parser.add_argument(
53 |     "-p", action="store_true", default=False, help="Push results to GitHub."
54 | )
55 | 
56 | if results.f:
57 |     github_pull()
58 | if results.p:
59 |     github_push()
60 | 


--------------------------------------------------------------------------------
/hosting.md:
--------------------------------------------------------------------------------
 1 | The code in this repo helps you extract content from a Zulip
 2 | instance and build the basic HTML structure, but it leaves it
 3 | to you to actually host the data on some server.
 4 | 
 5 | In other words, Zulip is not opinionated about how you serve
 6 | the HTML (and in some ways the entire mission of this project
 7 | is to empower you to put your data where you want).
 8 | 
 9 | ### General procedures
10 | 
11 | No matter where you host your data, you will typically
12 | have a "PROD" install.  This will involve more detailed
13 | updates to `settings.py` than you probably made for local
14 | testing.  Here is a typical diff:
15 | 
16 | ~~~
17 | <     # site_url = 'example.com'
18 | <     raise Exception("You need to configure site_url for prod")
19 | ---
20 | >     site_url = 'https://showell.github.io/'
21 | 46,47c45
22 | <     # Set this according to how you serve your prod assets.
23 | <     zulip_icon_url = None
24 | ---
25 | >     zulip_icon_url = 'http://showell.github.io/assets/img/zulip.svg'
26 | 82,87c80
27 | <     raise Exception('''
28 | <         You need to set html_directory for prod, and it
29 | <         should be a different location than DEBUG mode,
30 | <         since files will likely have different urls in
31 | <         anchor tags.
32 | <         ''')
33 | ---
34 | >     html_directory = Path('../website/archive')
35 | ~~~
36 | 
37 | 
38 | To build your site with prod settings, do this:
39 | 
40 | ```
41 | PROD_ARCHIVE=1 python archive.py -b
42 | ```
43 | 
44 | You will also want to copy assets to your production
45 | directory.  These include:
46 | 
47 | * zulip.svg
48 | 
49 | ### Static Server
50 | 
51 | You can use any web server of your choice to host the HTML files.
52 | One simple example is to use Python 3's `http.server`:
53 | 
54 | ```
55 | python3 -m http.server 4000
56 | ```
57 | 
58 | With the default configuration you should be able to see
59 | your archive at http://127.0.0.1:4000/.
60 | 
61 | ### GitHub
62 | 
63 | You can use GitHub Pages to serve your HTML.  We recommend
64 | configuring your `html_directory` to point into your local copy of
65 | your `username.github.io` repo (e.g. `alice.github.io`) and
66 | then push from there.
67 | 
68 | Some customers will want to just use the same repo for both
69 | this tooling and the content from their Zulip instance.  We
70 | don't recommend this approach long term, since it can complicate
71 | staying up to date with patches from this repo.  If you are
72 | still interested in this approach, despite the warnings, you
73 | may find the `github.py` script to be useful.
74 | 


--------------------------------------------------------------------------------
/instructions.md:
--------------------------------------------------------------------------------
 1 | Creating your Zulip archive takes a few steps to set up.
 2 | 
 3 | ## Download dependencies
 4 | 
 5 | * Clone this repo.
 6 | * Download [python3](https://www.python.org/downloads/) if you
 7 |   don't already have it.  (We require version 3.6 or higher.)
 8 | * Install the dependencies, with `pip3 install -r requirements.txt`.
 9 | 
10 | ## Get a Zulip API key
11 | 
12 | You will need an API key to get data from Zulip.  Often you
13 | will do this by  [creating a bot](https://zulip.com/help/add-a-bot-or-integration),
14 | but you can also use your main user's API key.
15 | 
16 | * Download a [zuliprc](https://zulip.com/api/configuring-python-bindings)
17 |   file to `zuliprc` within this project.
18 | 
19 | ## Customize your settings
20 | 
21 | * Run this command:
22 | 
23 |     cp default_settings.py settings.py
24 | 
25 | * Then read `settings.py` and modify the settings to fit your needs.
26 |   (There are comments in the file that explain each setting.)
27 | * Optionally, modify the code to fit your needs. This repo
28 |   is based on the [leanprover-community Jekyll
29 |   setup](https://github.com/leanprover-community/leanprover-community.github.io).
30 | 
31 | ## Build JSON files from your Zulip instance
32 | 
33 | * Create a directory to store JSON in (see settings.py for more details).
34 | * Run `python3 archive.py -t` to download a fresh archive. (This may take
35 |   a long time.  You may wish to experiment with just a few streams at
36 |   first--see `settings.py` for details.)
37 | 
38 | Note: you will be able to update your archive later with
39 | `python3 archive.py -i` to get more messages.
40 | 
41 | ## Build the HTML files
42 | 
43 | Run this command to build your archive
44 | 
45 |     python3 archive.py -b
46 | 
47 | ## Test your changes locally
48 | 
49 | You can use this command to serve your files:
50 | 
51 |     python3 -m http.server 4000
52 | 
53 | Typically you will then view your files at http://127.0.0.1:4000/archive/.
54 | 
55 | ## Add assets
56 | 
57 | You may wish to copy the following assets into your site directory
58 | structure:
59 | 
60 | - `assets/img/zulip.svg`
61 | 
62 | ## Go to production
63 | 
64 | Once you are satisfied with your local testing, you will want to host
65 | your archive publicly.  See [hosting.md](hosting.md) for more details.
66 | 
67 | # Other notes
68 | 
69 | ## archive.py
70 | 
71 | The main tool to familiarize yourself with is `archive.py`.  It takes these
72 | options:
73 | 
74 |   * `-t` builds a fresh archive. This will download every message from the Zulip chat and might take a long time. Must be run at least once before using `-i`.
75 |   * `-i` updates the archive with messages posted since the last scrape.
76 |   * `-b` generates the markdown/html output.
77 | 
78 | ## github.py
79 | 
80 | This repository also contains a [hacky tool](github.py) for managing
81 | pushes to a repository hosted by GitHub Pages, which supports the
82 | following options.  Be sure to read the warnings in `github.py`.
83 | 
84 | * `-f` updates the git repository containing the script
85 | * `-p` pushes the generated files
86 | 
87 | Contributions are appreciated to make `github.py` no longer hacky.
88 | 
89 | 


--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zulip/zulip-archive/663518053b8f19b58c36ee61a5ac425cd2604ba4/lib/__init__.py


--------------------------------------------------------------------------------
/lib/common.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def exit_immediately(s):
 5 |     print("\nERROR\n", s)
 6 |     exit(1)
 7 | 
 8 | 
 9 | # Safely open dir/filename, creating dir if it doesn't exist
10 | def open_outfile(dir, filename, mode):
11 |     if not dir.exists():
12 |         os.makedirs(str(dir))
13 |     return (dir / filename).open(mode, encoding="utf-8")
14 | 
15 | 
16 | def stream_validator(settings):
17 |     if not hasattr(settings, "included_streams"):
18 |         exit_immediately("Please set included_streams.")
19 | 
20 |     if len(settings.included_streams) == 0:
21 |         exit_immediately('Please add "*" to included_streams.')
22 | 
23 |     if hasattr(settings, "excluded_streams"):
24 |         excluded_streams = set(settings.excluded_streams)
25 |     else:
26 |         excluded_streams = set()
27 | 
28 |     included_streams = set(settings.included_streams)
29 | 
30 |     def validator(stream):
31 |         stream_name = stream["name"]
32 |         is_web_public = stream["is_web_public"]
33 |         is_public = not stream["invite_only"]
34 | 
35 |         if stream_name in excluded_streams:
36 |             return False
37 | 
38 |         if "web-public:*" in included_streams and is_web_public:
39 |             return True
40 | 
41 |         # The bare * case is for backwards-compatibility.
42 |         if ("*" in included_streams or "public:*" in included_streams) and is_public:
43 |             return True
44 | 
45 |         return stream_name in included_streams
46 | 
47 |     return validator
48 | 


--------------------------------------------------------------------------------
/lib/date_helper.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | 
3 | 
4 | # I don't love this format, feel free to change (I just
5 | # extracted it from prior code).
6 | def format_date1(ts):
7 |     """Nov 05 2019 at 02:51"""
8 |     return datetime.utcfromtimestamp(ts).strftime("%b %d %Y at %H:%M")
9 | 


--------------------------------------------------------------------------------
/lib/files.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | There are two major phases of this system:
  4 | 
  5 |     1. Build JSON files from Zulip data.
  6 |     2. Build static website from JSON.
  7 | 
  8 | Both phases write to a directory tree with some nested directories.
  9 | Here is an example structure for the JSON piece:
 10 | 
 11 |     <json_root>
 12 |         stream_index.json
 13 |         213222general
 14 |             47413hello.json
 15 |             48863swimmingturtles.json
 16 |             51687topicdemonstration.json
 17 |             74282newstreams.json
 18 |         213224python
 19 |             47413hello.json
 20 |             95106streamevents.json
 21 | 
 22 | And then here is what your website output might
 23 | look like:
 24 | 
 25 |     <md_root>
 26 |         index.html
 27 |         style.css
 28 |         stream
 29 |             213222general
 30 |                 index.html
 31 |                 47413hello.html
 32 |                 48863swimmingturtles.html
 33 |                 51687topicdemonstration.html
 34 |                 74282newstreams.html
 35 |             213224python
 36 |                 index.html
 37 |                 47413hello.html
 38 |                 95106streamevents.html
 39 | 
 40 | In the examples above we have two streams:
 41 | 
 42 |     general:
 43 |         hello
 44 |         swimming turtles
 45 |         topic demonstration
 46 |         new streams
 47 | 
 48 |     python:
 49 |         hello
 50 |         stream events
 51 | 
 52 | We "sanitize" the directory names to avoid escaping issues
 53 | with spaces (hence the number prefix).  FWIW the number prefix
 54 | for streams corresponds to the Zulip stream id, whereas the topic
 55 | prefix is a random hash.  All that really matters is that they are
 56 | unique.
 57 | """
 58 | 
 59 | import json
 60 | 
 61 | from pathlib import Path
 62 | 
 63 | from .common import open_outfile
 64 | 
 65 | 
 66 | def read_zulip_stream_info(json_root):
 67 |     """
 68 |     stream_index.json
 69 | 
 70 |     This JSON goes two levels deep, showing every stream, and
 71 |     then within each stream, a bit of info for every topic in
 72 |     the stream.  To get actual messages within a topic, you go
 73 |     to other files deeper in the directory structure.
 74 |     """
 75 |     f = (json_root / Path("stream_index.json")).open("r", encoding="utf-8")
 76 |     stream_info = json.load(f)
 77 |     f.close()
 78 |     return stream_info
 79 | 
 80 | 
 81 | def read_zulip_messages_for_topic(
 82 |     json_root, sanitized_stream_name, sanitized_topic_name
 83 | ):
 84 |     """
 85 |     <stream>/<topic>.json
 86 | 
 87 |     This JSON has info for all the messags in a topic.
 88 |     """
 89 |     json_path = (
 90 |         json_root / Path(sanitized_stream_name) / Path(sanitized_topic_name + ".json")
 91 |     )
 92 |     f = json_path.open("r", encoding="utf-8")
 93 |     messages = json.load(f)
 94 |     f.close()
 95 |     return messages
 96 | 
 97 | 
 98 | def open_main_page(md_root):
 99 |     outfile = open_outfile(md_root, Path("index.html"), "w+")
100 |     return outfile
101 | 
102 | 
103 | def open_stream_topics_page(md_root, sanitized_stream_name):
104 |     directory = md_root / Path("stream/" + sanitized_stream_name)
105 |     outfile = open_outfile(directory, Path("index.html"), "w+")
106 |     return outfile
107 | 
108 | 
109 | def open_topic_messages_page(md_root, sanitized_stream_name, sanitized_topic_name):
110 |     directory = md_root / Path("stream/" + sanitized_stream_name + "/topic")
111 |     outfile = open_outfile(directory, Path(sanitized_topic_name + ".html"), "w+")
112 |     return outfile
113 | 


--------------------------------------------------------------------------------
/lib/html.py:
--------------------------------------------------------------------------------
  1 | """
  2 | All the functions in this file should produce pure HTML, as
  3 | opposed to Markdown or other similar languages.
  4 | 
  5 | Some folks want to work with systems that don't necessarily
  6 | support markdown (or deal with incompatibilities between
  7 | different flavors of markdown), so when possible, we should
  8 | strive for pure HTML in our output in the future.
  9 | 
 10 | (Producing pure HTML doesn't have to be a burden--we can
 11 | add helpers/converters as necessary.)
 12 | """
 13 | 
 14 | import html
 15 | 
 16 | from .date_helper import format_date1
 17 | 
 18 | from .url import (
 19 |     sanitize_stream,
 20 |     sanitize,
 21 | )
 22 | 
 23 | from .url import (
 24 |     archive_message_url,
 25 |     archive_stream_url,
 26 |     archive_topic_url,
 27 |     zulip_post_url,
 28 | )
 29 | 
 30 | from .zulip_data import (
 31 |     num_topics_string,
 32 |     sorted_streams,
 33 |     sorted_topics,
 34 |     topic_info_string,
 35 | )
 36 | 
 37 | 
 38 | def topic_page_links_html(
 39 |     site_url,
 40 |     html_root,
 41 |     zulip_url,
 42 |     sanitized_stream_name,
 43 |     sanitized_topic_name,
 44 |     stream_name,
 45 |     topic_name,
 46 | ):
 47 |     stream_url = archive_stream_url(site_url, html_root, sanitized_stream_name)
 48 |     topic_url = archive_topic_url(
 49 |         site_url, html_root, sanitized_stream_name, sanitized_topic_name
 50 |     )
 51 | 
 52 |     return f"""\
 53 | <h2>Stream: <a href="{html.escape(stream_url)}">{html.escape(stream_name)}</a></h2>
 54 | <h3>Topic: <a href="{html.escape(topic_url)}">{html.escape(topic_name)}</a></h3>
 55 | 
 56 | <hr>
 57 | 
 58 | <base href="{html.escape(zulip_url)}">
 59 | """
 60 | 
 61 | 
 62 | def format_message_html(
 63 |     site_url,
 64 |     html_root,
 65 |     zulip_url,
 66 |     zulip_icon_url,
 67 |     stream_name,
 68 |     stream_id,
 69 |     topic_name,
 70 |     msg,
 71 | ):
 72 |     msg_id = str(msg["id"])
 73 | 
 74 |     zulip_link_html = link_to_zulip_html(
 75 |         zulip_url,
 76 |         zulip_icon_url,
 77 |         stream_id,
 78 |         stream_name,
 79 |         topic_name,
 80 |         msg_id,
 81 |     )
 82 | 
 83 |     user_name = msg["sender_full_name"]
 84 |     date = format_date1(msg["timestamp"])
 85 |     msg_content_html = msg["content"]
 86 |     anchor_url = archive_message_url(
 87 |         site_url,
 88 |         html_root,
 89 |         sanitize_stream(stream_name, stream_id),
 90 |         sanitize(topic_name),
 91 |         msg_id,
 92 |     )
 93 |     anchor_html = '<a name="{0}"></a>'.format(html.escape(msg_id))
 94 |     out_html = f"""
 95 | {anchor_html}
 96 | <h4>{zulip_link_html} {html.escape(user_name)} <a href="{html.escape(anchor_url)}">({html.escape(date)})</a>:</h4>
 97 | {msg_content_html}
 98 | """
 99 |     return out_html
100 | 
101 | 
102 | def link_to_zulip_html(
103 |     zulip_url,
104 |     zulip_icon_url,
105 |     stream_id,
106 |     stream_name,
107 |     topic_name,
108 |     msg_id,
109 | ):
110 |     # format a link to the original post where you click on the Zulip icon
111 |     # (if it's available)
112 |     post_link = zulip_post_url(zulip_url, stream_id, stream_name, topic_name, msg_id)
113 |     if zulip_icon_url:
114 |         img_tag_html = f'<img src="{html.escape(zulip_icon_url)}" alt="view this post on Zulip" style="width:20px;height:20px;">'
115 |     else:
116 |         img_tag_html = ""
117 |     zulip_link_html = (
118 |         f'<a href="{html.escape(post_link)}" class="zl">{img_tag_html}</a>'
119 |     )
120 |     return zulip_link_html
121 | 
122 | 
123 | def last_updated_footer_html(stream_info):
124 |     last_updated = format_date1(stream_info["time"])
125 |     date_footer_html = f"\n<hr><p>Last updated: {html.escape(last_updated)} UTC</p>"
126 |     return date_footer_html
127 | 
128 | 
129 | def stream_list_page_html(streams):
130 |     content_html = f"""\
131 | <hr>
132 | 
133 | <h2>Streams:</h2>
134 | 
135 | {stream_list_html(streams)}
136 | """
137 |     return content_html
138 | 
139 | 
140 | def stream_list_html(streams):
141 |     """
142 |     produce a list like this:
143 | 
144 |     * stream_name (n topics)
145 |     * stream_name (n topics)
146 |     * stream_name (n topics)
147 |     """
148 | 
149 |     def item_html(stream_name, stream_data):
150 |         stream_id = stream_data["id"]
151 |         sanitized_name = sanitize_stream(stream_name, stream_id)
152 |         url = f"stream/{sanitized_name}/index.html"
153 |         stream_topic_data = stream_data["topic_data"]
154 |         num_topics = num_topics_string(stream_topic_data)
155 |         return f'<li> <a href="{html.escape(url)}">{html.escape(stream_name)}</a> ({html.escape(str(num_topics))}) </li>'
156 | 
157 |     the_list = "\n\n".join(
158 |         item_html(stream_name, streams[stream_name])
159 |         for stream_name in sorted_streams(streams)
160 |     )
161 |     return "<ul>\n" + the_list + "\n</ul>"
162 | 
163 | 
164 | def topic_list_page_html(stream_name, stream_url, topic_data):
165 |     content = f"""\
166 | <h2> Stream: <a href="{html.escape(stream_url)}">{html.escape(stream_name)}</a></h2>
167 | <hr>
168 | 
169 | <h3>Topics:</h3>
170 | 
171 | {topic_list_html(topic_data)}
172 | """
173 |     return content
174 | 
175 | 
176 | def topic_list_html(topic_data):
177 |     """
178 |     produce a list like this:
179 | 
180 |     * topic name (n messages, latest: <date>)
181 |     * topic name (n messages, latest: <date>)
182 |     * topic name (n messages, latest: <date>)
183 |     """
184 | 
185 |     def item_html(topic_name, message_data):
186 |         link_html = f'<a href="topic/{html.escape(sanitize(topic_name))}.html">{html.escape(topic_name)}</a>'
187 |         topic_info = topic_info_string(message_data)
188 |         return f"<li> {link_html} ({html.escape(topic_info)}) </li>"
189 | 
190 |     the_list_html = "\n".join(
191 |         item_html(topic_name, topic_data[topic_name])
192 |         for topic_name in sorted_topics(topic_data)
193 |     )
194 |     return "<ul>\n" + the_list_html + "\n</ul>"
195 | 


--------------------------------------------------------------------------------
/lib/populate.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This library helps populate a series of JSON files from a
  3 | running Zulip instance.
  4 | 
  5 | Conceptually it just moves data in one direction:
  6 | 
  7 |     Zulip -> file system (JSON files)
  8 | 
  9 | This is probably the most technical part of the archive codebase
 10 | for now.  Conceptually, it's just connecting to Zulip with the
 11 | Python API for Zulip and getting recent messages.
 12 | 
 13 | Some of the details are about getting incremental updates from
 14 | Zulip.  See `populate_incremental`, but the gist of it is that
 15 | we read `latest_id` from the JSON and then use that as the
 16 | `anchor` in the API request to Zulip.
 17 | 
 18 | About the data:
 19 | 
 20 |     The json format for stream_index.json is something like below:
 21 | 
 22 |     {
 23 |         'time': <the last time stream_index.json was updated>,
 24 |         'streams': {
 25 |             stream_name: {
 26 |                 'id': stream_id,
 27 |                 'latest_id': id of latest post in stream,
 28 |                 'topic_data': {
 29 |                     topic_name: {
 30 |                         topic_size: num posts in topic,
 31 |                         latest_date: time of latest post }}}}}
 32 | 
 33 |     stream_index.json is created in the top level of the JSON directory.
 34 | 
 35 |     This directory also contains a subdirectory for each archived stream.
 36 | 
 37 |     In each stream subdirectory, there is a json file for each topic in that stream.
 38 | 
 39 |     This json file is a list of message objects,
 40 |     as described at https://zulip.com/api/get-messages
 41 | """
 42 | 
 43 | import json
 44 | import time
 45 | from datetime import datetime
 46 | from pathlib import Path
 47 | from .common import (
 48 |     exit_immediately,
 49 |     open_outfile,
 50 | )
 51 | from .url import (
 52 |     sanitize_stream,
 53 |     sanitize,
 54 | )
 55 | 
 56 | 
 57 | def dump_json(js, outfile):
 58 |     json.dump(js, outfile, ensure_ascii=False, sort_keys=True, indent=4)
 59 | 
 60 | 
 61 | # Takes a list of messages. Returns a dict mapping topic names to lists of messages in that topic.
 62 | def separate_results(list):
 63 |     map = {}
 64 |     for m in list:
 65 |         if m["subject"] not in map:
 66 |             map[m["subject"]] = [m]
 67 |         else:
 68 |             map[m["subject"]].append(m)
 69 |     return map
 70 | 
 71 | 
 72 | # Retrieves all messages matching request from Zulip, starting at post id anchor.
 73 | # As recommended in the Zulip API docs, requests 1000 messages at a time.
 74 | # Returns a list of messages.
 75 | def request_all(client, request, anchor=0):
 76 |     request["anchor"] = anchor
 77 |     request["num_before"] = 0
 78 |     request["num_after"] = 1000
 79 |     response = safe_request(client.get_messages, request)
 80 |     msgs = response["messages"]
 81 |     while not response["found_newest"]:
 82 |         request["anchor"] = response["messages"][-1]["id"] + 1
 83 |         response = safe_request(client.get_messages, request)
 84 |         msgs = msgs + response["messages"]
 85 |     return msgs
 86 | 
 87 | 
 88 | # runs client.cmd(args). If the response is a rate limit error, waits
 89 | # the requested time and then retries the request.
 90 | def safe_request(cmd, *args, **kwargs):
 91 |     rsp = cmd(*args, **kwargs)
 92 |     while rsp["result"] == "error":
 93 |         if "retry-after" in rsp:
 94 |             print("timeout hit: {}".format(rsp["retry-after"]))
 95 |             time.sleep(float(rsp["retry-after"]) + 1)
 96 |             rsp = cmd(*args, **kwargs)
 97 |         else:
 98 |             exit_immediately(rsp["msg"])
 99 |     return rsp
100 | 
101 | 
def get_streams(client):
    """Return the "streams" list from `client.get_streams`.

    Fetches metadata on all streams the current user has access to (public,
    subscribed and web-public are all requested). Callers filter the result
    with their `is_valid_stream_name` predicate.
    """
    stream_filters = dict(
        include_public=True,
        include_subscribed=True,
        include_web_public=True,
    )
    response = safe_request(client.get_streams, **stream_filters)
    return response["streams"]
112 | 
113 | 
114 | # Retrieves all messages from Zulip and builds a cache at json_root.
def populate_all(
    client,
    json_root,
    is_valid_stream_name,
):
    """Download every message of every accepted stream and rebuild the cache.

    For each stream accepted by `is_valid_stream_name`, all topics are
    listed and all of each topic's messages are fetched and written with
    `dump_topic_messages`. Finally a stream index is written with
    `dump_stream_index`, recording for each stream its id, the highest
    message id seen, and per-topic message count / latest timestamp.
    """
    all_streams = get_streams(client)
    streams = [s for s in all_streams if is_valid_stream_name(s)]

    streams_data = {}

    for s in streams:
        stream_name = s["name"]
        stream_id = s["stream_id"]

        print(stream_name)

        topics = safe_request(client.get_stream_topics, stream_id)["topics"]

        latest_id = 0  # till we know better

        topic_data = {}

        for t in topics:
            topic_name = t["name"]

            request = {
                "narrow": [
                    {"operator": "stream", "operand": stream_name},
                    {"operator": "topic", "operand": topic_name},
                ],
                "client_gravatar": True,
                "apply_markdown": True,
            }

            messages = request_all(client, request)

            if not messages:
                # A listed topic can come back empty (for example if its
                # messages went away between the two requests). There is
                # nothing to archive, and messages[-1] would raise.
                continue

            last_message = messages[-1]

            topic_data[topic_name] = dict(
                size=len(messages), latest_date=last_message["timestamp"]
            )

            latest_id = max(latest_id, last_message["id"])

            dump_topic_messages(json_root, s, topic_name, messages)

        streams_data[stream_name] = dict(
            id=stream_id,
            latest_id=latest_id,
            topic_data=topic_data,
        )

    js = dict(streams=streams_data, time=time.time())
    dump_stream_index(json_root, js)
171 | 
172 | 
173 | # Retrieves only new messages from Zulip, based on the latest message id recorded for each stream at the last update.
174 | # Raises an exception if there is no index at json_root/stream_index.json
def populate_incremental(
    client,
    json_root,
    is_valid_stream_name,
):
    """Fetch only messages newer than the cached ones and merge them in.

    Reads `json_root/stream_index.json`; when it does not exist, an
    explanatory message is passed to `exit_immediately`. For each stream
    accepted by `is_valid_stream_name`, messages after the stream's recorded
    "latest_id" are requested, grouped by topic, appended to any existing
    per-topic JSON file, and the index entry for that topic is refreshed.
    The updated index is written back with a new "time" stamp.
    """
    streams = get_streams(client)
    stream_index = json_root / Path("stream_index.json")

    if not stream_index.exists():
        error_msg = """
    You are trying to incrementally update your index, but we cannot find
    a stream index at {}.

    Most likely, you have never built the index.  You can use the -t option
    of this script to build a full index one time.

    (It's also possible that you have built the index but modified the configuration
    or moved files in your file system.)
            """.format(
            stream_index
        )
        exit_immediately(error_msg)

    # Context managers guarantee the handles are closed even when the JSON
    # on disk is malformed and json.load raises.
    with stream_index.open("r", encoding="utf-8") as f:
        js = json.load(f)

    for s in (s for s in streams if is_valid_stream_name(s)):
        print(s["name"])
        if s["name"] not in js["streams"]:
            # First time we see this stream: start from an empty entry.
            js["streams"][s["name"]] = {
                "id": s["stream_id"],
                "latest_id": 0,
                "topic_data": {},
            }
        stream_entry = js["streams"][s["name"]]
        request = {
            "narrow": [{"operator": "stream", "operand": s["name"]}],
            "client_gravatar": True,
            "apply_markdown": True,
        }
        new_msgs = request_all(client, request, stream_entry["latest_id"] + 1)
        if len(new_msgs) > 0:
            stream_entry["latest_id"] = new_msgs[-1]["id"]
        nm = separate_results(new_msgs)
        for topic_name in nm:
            p = (
                json_root
                / Path(sanitize_stream(s["name"], s["stream_id"]))
                / Path(sanitize(topic_name) + ".json")
            )
            old = []
            if p.exists():
                with p.open("r", encoding="utf-8") as f:
                    old = json.load(f)
            m = nm[topic_name]
            stream_entry["topic_data"][topic_name] = {
                "size": len(m) + len(old),
                "latest_date": m[-1]["timestamp"],
            }
            dump_topic_messages(json_root, s, topic_name, old + m)

    js["time"] = time.time()
    dump_stream_index(json_root, js)
243 | 
244 | 
def dump_stream_index(json_root, js):
    """Write the stream index `js` to `stream_index.json` under `json_root`.

    Raises Exception("programming error") if `js` lacks the "streams" or
    "time" key; nothing is opened or written in that case.
    """
    if not ("streams" in js and "time" in js):
        raise Exception("programming error")

    out = open_outfile(json_root, Path("stream_index.json"), "w")
    try:
        dump_json(js, out)
    finally:
        # Close the handle even if serialization fails midway.
        out.close()
252 | 
253 | 
def dump_topic_messages(json_root, stream_data, topic_name, message_data):
    """Write one topic's messages to `<json_root>/<stream>/<topic>.json`.

    `stream_data` must provide "name" and "stream_id"; the directory and
    file names are derived with `sanitize_stream` and `sanitize`. Each
    message is reduced with `slim_message` before being serialized.
    """
    stream_name = stream_data["name"]
    stream_id = stream_data["stream_id"]
    sanitized_stream_name = sanitize_stream(stream_name, stream_id)
    stream_dir = json_root / Path(sanitized_stream_name)

    sanitized_topic_name = sanitize(topic_name)
    topic_fn = sanitized_topic_name + ".json"

    # Build the payload before opening the file for writing, so a failure
    # while slimming messages cannot leave a truncated topic file behind.
    msgs = [slim_message(m) for m in message_data]

    out = open_outfile(stream_dir, topic_fn, "w")
    try:
        dump_json(msgs, out)
    finally:
        # Close the handle even if serialization fails midway.
        out.close()
267 | 
268 | 
def slim_message(msg):
    """Return a copy of `msg` restricted to the fields the archive stores.

    Only "content", "id", "sender_full_name" and "timestamp" are kept, in
    the order they appear in `msg`; absent fields are simply omitted.
    """
    kept_fields = ("content", "id", "sender_full_name", "timestamp")
    slim = {}
    for key, value in msg.items():
        if key in kept_fields:
            slim[key] = value
    return slim
277 | 


--------------------------------------------------------------------------------
/lib/sitemap.py:
--------------------------------------------------------------------------------
 1 | from glob import iglob
 2 | from typing import Iterator
 3 | 
 4 | from xml_sitemap_writer import XMLSitemap
 5 | 
 6 | 
 7 | def build_sitemap(base_url: str, archive_dir_path: str, sitemap_write_dir_path: str):
 8 |     def iterate_html_files() -> Iterator[str]:
 9 |         # Iterator yields relative path like
10 |         # archive/stream/10-errors/topic/laptop.html
11 |         # TODO: Investigate when running in windows
12 |         # TODO: Must ensure that the relative URLs are valid
13 |         return iglob("**/*.html", root_dir=archive_dir_path, recursive=True)
14 | 
15 |     with XMLSitemap(sitemap_write_dir_path, base_url) as sitemap:
16 |         sitemap.add_urls(iterate_html_files())
17 | 


--------------------------------------------------------------------------------
/lib/url.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Sometimes it feels like 80% of the battle with creating a
 3 | static website is getting all the URLs correct.
 4 | 
 5 | These are some helpers.
 6 | 
 7 | Here are some naming conventions for URL pieces:
 8 | 
 9 |     zulip_url: https://example.zulip.com
10 |     site_url: https://example.zulip-archive.com
11 |     html_root: archive
12 | 
13 | And then URLs use Zulip stream/topics, which are sometimes
14 | "sanitized" to guarantee uniqueness and not have special characters:
15 | 
16 |     stream_id: 599
17 |     stream_name: general
18 |     topic_name: lunch
19 | 
20 |     sanitized_stream_name : 599-general
21 |     sanitized_topic_name: lunch
22 | """
23 | 
24 | import urllib.parse
25 | 
26 | 
def zulip_post_url(zulip_url, stream_id, stream_name, topic_name, post_id):
    """
    https://example.zulipchat.com/#narrow/stream/213222-general/topic/hello/near/179892604

    Builds the link back to a message in the Zulip web app. `zulip_url` is
    used as given and the "#narrow/..." fragment is appended directly to it.

    The stream and topic components are encoded with this module's
    `sanitize_stream` / `sanitize` helpers (described below as copied from
    Zulip's own URL encoding) rather than plain percent-quoting. Plain
    `urllib.parse.quote` leaves "/" and "." untouched, so a topic such as
    "a/b" would add a bogus path segment, and a literal "." would not be
    escaped even though the dot is the escape character in that encoding.
    """
    encoded_stream = sanitize_stream(stream_name, stream_id)
    encoded_topic = sanitize(topic_name)
    return (
        f"{zulip_url}#narrow/stream/{encoded_stream}"
        f"/topic/{encoded_topic}/near/{post_id}"
    )
35 | 
36 | 
def archive_stream_url(site_url, html_root, sanitized_stream_name):
    """
    http://127.0.0.1:4000/archive/stream/213222-general/index.html

    `html_root` is resolved against `site_url` with `urllib.parse.urljoin`,
    so the last path segment of `site_url` is replaced unless `site_url`
    ends with a slash.
    """
    archive_base = urllib.parse.urljoin(site_url, html_root)
    return "{}/stream/{}/index.html".format(archive_base, sanitized_stream_name)
43 | 
44 | 
def archive_topic_url(site_url, html_root, sanitized_stream_name, sanitized_topic_name):
    """
    http://127.0.0.1:4000/archive/stream/213222-general/topic/newstreams.html

    `html_root` is resolved against `site_url` with `urllib.parse.urljoin`
    before the stream and topic path segments are appended.
    """
    archive_base = urllib.parse.urljoin(site_url, html_root)
    return "{}/stream/{}/topic/{}.html".format(
        archive_base, sanitized_stream_name, sanitized_topic_name
    )
53 | 
54 | 
def archive_message_url(
    site_url, html_root, sanitized_stream_name, sanitized_topic_name, msg_id
):
    """
    http://127.0.0.1:4000/archive/stream/213222-general/topic/newstreams.html#1234567

    This is the topic page URL with `msg_id` appended as the fragment.
    """
    page_url = archive_topic_url(
        site_url, html_root, sanitized_stream_name, sanitized_topic_name
    )
    return "{}#{}".format(page_url, msg_id)
65 | 
66 | 
67 | ## String cleaning functions
68 | 
69 | 
def sanitize(s):
    """
    Sanitize the string to a safe string that can be used in URLs

    The string is fully percent-encoded (no character is exempted via
    `safe`), literal dots are escaped as well, and finally every "%" is
    rewritten as ".", e.g. "a b" -> "a.20b" and "a.b" -> "a.2Eb".

    This is copied from Zulip's core code:
    https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L8
    """
    percent_encoded = urllib.parse.quote(s, safe=b"")
    # Escape real dots first so they stay distinguishable once "%" itself
    # is turned into ".".
    dots_escaped = percent_encoded.replace(".", "%2E")
    return dots_escaped.replace("%", ".")
78 | 
79 | 
80 | # create a unique sanitized identifier for a stream
def sanitize_stream(stream_name, stream_id):
    """
    Encode streams for urls as something like 99-Foo-bar.

    Spaces in the name become dashes, the rest is passed through
    `sanitize`, and the stream id is prepended.

    This is copied from Zulip's core code:
    https://github.com/zulip/zulip/blob/de31114d700561f32139a63a0e5f33d5c30039b3/zerver/lib/url_encoding.py#L15
    """
    dashed_name = stream_name.replace(" ", "-")
    return "-".join((str(stream_id), sanitize(dashed_name)))
91 | 


--------------------------------------------------------------------------------
/lib/website.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module emits the content for your archive.
  3 | 
  4 | It emits HTML, and YAML, mostly by calling
  5 | into other modules.
  6 | 
  7 | As of April 2021, the generated html pages can be hosted simply with `python -m
  8 | http.server`.
  9 | 
 10 | This module is probably the most likely module to be forked if
 11 | you have unique requirements for how your archive should look.
 12 | 
 13 | If you are interested in porting this system away from Python to your
 14 | language of choice, this is probably the best place to start.
 15 | """
 16 | 
 17 | from pathlib import Path
 18 | import html
 19 | from shutil import copyfile, copytree
 20 | 
 21 | from .url import (
 22 |     sanitize_stream,
 23 |     sanitize,
 24 | )
 25 | 
 26 | from .files import (
 27 |     open_main_page,
 28 |     open_stream_topics_page,
 29 |     open_topic_messages_page,
 30 |     read_zulip_messages_for_topic,
 31 |     read_zulip_stream_info,
 32 | )
 33 | 
 34 | from .html import (
 35 |     format_message_html,
 36 |     last_updated_footer_html,
 37 |     topic_page_links_html,
 38 |     stream_list_page_html,
 39 |     topic_list_page_html,
 40 | )
 41 | 
 42 | from .url import (
 43 |     archive_stream_url,
 44 | )
 45 | 
 46 | 
 47 | def to_topic_page_head_html(title):
 48 |     return f'<html>\n<head><meta charset="utf-8"><title>{title}</title></head>\n'
 49 | 
 50 | 
 51 | def build_website(
 52 |     json_root,
 53 |     md_root,
 54 |     site_url,
 55 |     html_root,
 56 |     title,
 57 |     zulip_url,
 58 |     zulip_icon_url,
 59 |     repo_root,
 60 |     page_head_html,
 61 |     page_footer_html,
 62 | ):
 63 |     stream_info = read_zulip_stream_info(json_root)
 64 | 
 65 |     streams = stream_info["streams"]
 66 |     date_footer_html = last_updated_footer_html(stream_info)
 67 |     write_main_page(
 68 |         md_root,
 69 |         site_url,
 70 |         html_root,
 71 |         title,
 72 |         streams,
 73 |         date_footer_html,
 74 |         page_head_html,
 75 |         page_footer_html,
 76 |     )
 77 |     write_css(md_root)
 78 | 
 79 |     for stream_name in streams:
 80 |         print("building: ", stream_name)
 81 |         stream_data = streams[stream_name]
 82 |         topic_data = stream_data["topic_data"]
 83 | 
 84 |         write_stream_topics(
 85 |             md_root,
 86 |             site_url,
 87 |             html_root,
 88 |             title,
 89 |             stream_name,
 90 |             stream_data,
 91 |             date_footer_html,
 92 |             page_head_html,
 93 |             page_footer_html,
 94 |         )
 95 | 
 96 |         for topic_name in topic_data:
 97 |             write_topic_messages(
 98 |                 json_root,
 99 |                 md_root,
100 |                 site_url,
101 |                 html_root,
102 |                 title,
103 |                 zulip_url,
104 |                 zulip_icon_url,
105 |                 stream_name,
106 |                 streams[stream_name],
107 |                 topic_name,
108 |                 date_footer_html,
109 |                 page_head_html,
110 |                 page_footer_html,
111 |             )
112 | 
113 |     copytree(
114 |         str(Path(repo_root) / "assets"),
115 |         str(Path(md_root) / "assets"),
116 |         dirs_exist_ok=True,
117 |     )
118 | 
119 |     # Copy .nojekyll into md_root as well.
120 |     copyfile(str(Path(repo_root) / ".nojekyll"), str(Path(md_root) / ".nojekyll"))
121 | 
122 | 
123 | # writes the index page listing all streams.
124 | # `streams`: a dict mapping stream names to stream json objects as described in the header.
def write_main_page(
    md_root,
    site_url,
    html_root,
    title,
    streams,
    date_footer_html,
    page_head_html,
    page_footer_html,
):
    """
    The main page in our website lists streams:

        Streams:

        general (70 topics)
        announce (42 topics)

    The page is written as `page_head_html`, the stream list built by
    `stream_list_page_html(streams)`, `date_footer_html` and
    `page_footer_html`, in that order. `site_url`, `html_root` and `title`
    are accepted but not used here.
    """
    # Render the content first so a rendering failure cannot leave a
    # freshly opened, empty page behind.
    content_html = stream_list_page_html(streams)

    outfile = open_main_page(md_root)
    try:
        outfile.write(page_head_html)
        outfile.write(content_html)
        outfile.write(date_footer_html)
        outfile.write(page_footer_html)
    finally:
        # Always release the handle, even if one of the writes raises.
        outfile.close()
152 | 
153 | 
def write_stream_topics(
    md_root,
    site_url,
    html_root,
    title,
    stream_name,
    stream,
    date_footer_html,
    page_head_html,
    page_footer_html,
):
    """
    A stream page lists all topics for the stream:

        Stream: social

        Topics:
            lunch (4 messages)
            happy hour (1 message)

    `stream` is the index entry for the stream; its "id" and "topic_data"
    keys are read. `title` is accepted but not used here.
    """

    sanitized_stream_name = sanitize_stream(stream_name, stream["id"])
    stream_url = archive_stream_url(site_url, html_root, sanitized_stream_name)
    topic_data = stream["topic_data"]

    # Render the content first so a rendering failure cannot leave a
    # freshly opened, empty page behind.
    content_html = topic_list_page_html(stream_name, stream_url, topic_data)

    outfile = open_stream_topics_page(md_root, sanitized_stream_name)
    try:
        outfile.write(page_head_html)
        outfile.write(content_html)
        outfile.write(date_footer_html)
        outfile.write(page_footer_html)
    finally:
        # Always release the handle, even if one of the writes raises.
        outfile.close()
189 | 
190 | 
def write_topic_messages(
    json_root,
    md_root,
    site_url,
    html_root,
    title,
    zulip_url,
    zulip_icon_url,
    stream_name,
    stream,
    topic_name,
    date_footer_html,
    page_head_html,
    page_footer_html,
):
    """
    Writes the topics page, which lists all messages
    for one particular topic within a stream:

    Stream: social
    Topic: lunch

    Alice:
        I want pizza!

    Bob:
        No, let's get tacos!

    The messages are read from the JSON cache under `json_root`, and the
    page is written under `md_root`. `page_head_html` is accepted but not
    used: a topic-specific head is emitted instead.
    """
    stream_id = stream["id"]

    sanitized_stream_name = sanitize_stream(stream_name, stream_id)
    sanitized_topic_name = sanitize(topic_name)

    messages = read_zulip_messages_for_topic(
        json_root, sanitized_stream_name, sanitized_topic_name
    )

    topic_links = topic_page_links_html(
        site_url,
        html_root,
        zulip_url,
        sanitized_stream_name,
        sanitized_topic_name,
        stream_name,
        topic_name,
    )

    # We use a topic-specific title instead of `page_head_html` to improve
    # search engine indexing.
    head_html = to_topic_page_head_html(
        html.escape(topic_name) + " · " + html.escape(stream_name) + " · " + title
    )
    stylesheet_html = f'\n<head><link href="{html.escape(site_url)}/style.css" rel="stylesheet"></head>\n'

    # Everything that can be prepared up front is ready at this point; open
    # the page last and make sure the handle is released even if formatting
    # or writing one of the messages raises.
    outfile = open_topic_messages_page(
        md_root,
        sanitized_stream_name,
        sanitized_topic_name,
    )
    try:
        outfile.write(head_html)
        outfile.write(topic_links)
        outfile.write(stylesheet_html)

        for msg in messages:
            msg_html = format_message_html(
                site_url,
                html_root,
                zulip_url,
                zulip_icon_url,
                stream_name,
                stream_id,
                topic_name,
                msg,
            )
            outfile.write(msg_html)
            outfile.write("\n\n")

        outfile.write(date_footer_html)
        outfile.write(page_footer_html)
    finally:
        outfile.close()
273 | 
274 | 
def write_css(md_root):
    """Copy `style.css` from the current working directory into `md_root`.

    `md_root` must support the `/` path operator (e.g. a `pathlib.Path`).
    """
    stylesheet_name = "style.css"
    copyfile(stylesheet_name, md_root / stylesheet_name)
277 | 


--------------------------------------------------------------------------------
/lib/zulip_data.py:
--------------------------------------------------------------------------------
 1 | """
 2 | The functions here should be specific to how we store
 3 | Zulip data, without getting specific about HTML/Markdown
 4 | syntax.
 5 | 
 6 | The goal here is to have some functions that are reusable
 7 | for folks who may want to emit differently structured
 8 | HTML or markdown.
 9 | """
10 | 
11 | from .date_helper import format_date1
12 | 
13 | 
def sorted_streams(streams):
    """
    Return the stream names ordered by number of topics, largest first.

    `streams` maps a stream name to a dict with a "topic_data" collection.
    Streams with the same number of topics keep their original relative
    order.
    """

    def topic_count(stream_name):
        return len(streams[stream_name]["topic_data"])

    return sorted(streams, key=topic_count, reverse=True)
20 | 
21 | 
def sorted_topics(topic_data):
    """
    Return the topic names ordered by "latest_date", newest first.

    `topic_data` maps a topic name to a dict with a "latest_date" value.
    Topics with equal dates keep their original relative order.
    """

    def last_update(topic_name):
        return topic_data[topic_name]["latest_date"]

    return sorted(topic_data, key=last_update, reverse=True)
30 | 
31 | 
def num_topics_string(stream_topic_data):
    """
    Describe how many topics a stream has.

    example: "5 topics" (or "1 topic" when there is exactly one)
    """
    count = len(stream_topic_data)
    noun = "topic" if count == 1 else "topics"
    return "{} {}".format(count, noun)
39 | 
40 | 
def topic_info_string(message_data):
    """
    n messages, latest: <date>

    `message_data` supplies "size" (the message count) and "latest_date",
    which is rendered with `format_date1`.
    """
    count = message_data["size"]
    noun = "message" if count == 1 else "messages"
    formatted_date = format_date1(message_data["latest_date"])
    return "{} {}, latest: {}".format(count, noun, formatted_date)
50 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyyaml==5.2
2 | xml-sitemap-writer==0.6.0
3 | zulip==0.8.2
4 | 


--------------------------------------------------------------------------------
/style.css:
--------------------------------------------------------------------------------
1 | .msg { margin-left: 2em; }
2 | 


--------------------------------------------------------------------------------
/tests/testCommon.py:
--------------------------------------------------------------------------------
 1 | # For convenience, just run the tests in the repo root directory.
 2 | import sys
 3 | 
 4 | sys.path.append("lib")
 5 | 
 6 | import common
 7 | import url
 8 | 
 9 | 
class Settings:
    """Minimal stand-in for a settings module: keyword args become attributes."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])
14 | 
15 | 
def assert_equal(v1, v2):
    """Raise AssertionError, after printing both values, when v1 != v2."""
    if not (v1 != v2):
        return
    for line in ("mismatch", v1, v2):
        print(line)
    raise AssertionError
22 | 
23 | 
def test_sanitize():
    """Check the encodings produced by url.sanitize_stream and url.sanitize."""
    # (stream name, stream id, expected sanitized stream name)
    stream_cases = [
        ("foo bar", 7, "7-foo-bar"),
        ("foo/bar/turtle[🐢]", 7, "7-foo.2Fbar.2Fturtle.5B.F0.9F.90.A2.5D"),
    ]
    for name, stream_id, expected in stream_cases:
        assert_equal(
            url.sanitize_stream(stream_name=name, stream_id=stream_id),
            expected,
        )

    # (raw topic name, expected sanitized topic name)
    topic_cases = [
        ("pick a place for lunch *", "pick.20a.20place.20for.20lunch.20.2A"),
        (
            "!!cute-turlte/tortoise (🐢)?",
            ".21.21cute-turlte.2Ftortoise.20.28.F0.9F.90.A2.29.3F",
        ),
        ('"the mighty turtle 🐢"', ".22the.20mighty.20turtle.20.F0.9F.90.A2.22"),
    ]
    for raw, expected in topic_cases:
        assert_equal(url.sanitize(raw), expected)
46 | 
47 | 
def test_validator():
    """Exercise common.stream_validator with include/exclude settings."""

    def stream(name, public, web_public):
        # Returns a minimalist stream dictionary.
        return {"name": name, "invite_only": not public, "is_web_public": web_public}

    def check(expectations, **settings):
        # Build one validator from `settings` and compare its verdict for
        # each ((name, public, web_public), expected) pair, in order.
        validator = common.stream_validator(Settings(**settings))
        for (name, public, web_public), expected in expectations:
            assert_equal(validator(stream(name, public, web_public)), expected)

    # Test wildcard operator for public streams.
    for k in ["*", "public:*"]:
        check(
            [
                (("foo", True, False), True),
                (("foo", True, True), True),
                (("bar", False, False), False),
                (("bar", False, True), False),
            ],
            included_streams=[k],
        )

    # Test web-public
    check(
        [
            (("foo", True, False), False),
            (("foo", True, True), True),
            (("bar", False, False), False),
            (("bar", False, True), True),
        ],
        included_streams=["web-public:*"],
    )

    check(
        [
            (("foo", True, True), True),
            (("bar", True, True), True),
            (("baz", True, True), False),
        ],
        included_streams=["foo", "bar"],
    )

    # Test exclude.
    check(
        [
            (("good", True, True), True),
            (("bad", True, True), False),
            (("worse", True, True), False),
        ],
        included_streams=["*"],
        excluded_streams=["bad", "worse"],
    )

    # edge case: excluded takes precedence over included
    check(
        [(("foo", False, False), False)],
        included_streams=["foo"],
        excluded_streams=["foo"],
    )

    check(
        [
            (("foo", True, True), False),
            (("bar", False, False), False),
        ],
        included_streams=["baz"],
        excluded_streams=["bar"],
    )
92 | 
93 | 
# Run every test in order when this file is executed as a script.
if __name__ == "__main__":
    for run_test in (test_sanitize, test_validator):
        run_test()
97 | 


--------------------------------------------------------------------------------