├── .ci ├── end2end │ ├── .dockerignore │ ├── Dockerfile │ ├── build_and_run.sh │ └── scripts │ │ ├── build_and_run_tests.sh │ │ ├── setup_chrome.sh │ │ ├── setup_firefox.sh │ │ └── setup_node.sh ├── fake-systemd │ └── systemctl ├── github-ci-compat ├── release ├── release-uv └── run ├── .dockerignore ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.org ├── LICENSE ├── README.org ├── benchmarks └── 20231115.org ├── ci └── run-github-locally ├── conftest.py ├── doc ├── DEVELOPMENT.org ├── GUIDE.org ├── PRIVACY.org ├── SOURCES.org ├── TROUBLESHOOTING.org ├── addons-mozilla-org.org └── config.py ├── docker ├── .gitignore ├── docker_files │ ├── Dockerfile │ ├── Dockerfile-indexer │ ├── docker-compose.yaml │ ├── indexer-config.py.example │ └── indexer-entrypoint.sh ├── get-some-data.sh ├── init.sh └── start.sh ├── extension ├── .ci │ └── build ├── .editorconfig ├── MANUAL-TESTS.org ├── TODO.org ├── __mocks__ │ ├── browser.js │ └── dom-form-serializer.js ├── amo-metadata.json ├── babel.config.cjs ├── build ├── eslint.config.js ├── generate_manifest.js ├── jest.config.cjs ├── old │ ├── flow-typed │ │ └── webextension-polyfill.js │ ├── patcher.js │ └── webpack.config.js ├── package-lock.json ├── package.json ├── rollup.config.js ├── src │ ├── api.ts │ ├── background.ts │ ├── background_chrome_mv2.js │ ├── common.ts │ ├── compat.ts │ ├── display.ts │ ├── filterlist.ts │ ├── images │ │ ├── generate │ │ ├── ic_blacklisted_48.png │ │ ├── ic_blue_48.png │ │ ├── ic_boring_48.png │ │ ├── ic_error.png │ │ ├── ic_not_visited_48.png │ │ ├── ic_relatives_48.png │ │ ├── ic_visited_48.png │ │ └── source_48.svg │ ├── normalise.ts │ ├── notifications.ts │ ├── options.ts │ ├── options_page.css │ ├── options_page.html │ ├── options_page.ts │ ├── search.html │ ├── search.ts │ ├── selenium_bridge.js │ ├── showvisited.css │ ├── showvisited.js │ ├── sidebar-outer.css │ ├── sidebar.css │ ├── sidebar.ts │ ├── sources.ts │ ├── toastify.css │ └── toastify.js ├── tests │ ├── anchorme.test.js │ ├── common.test.js │ ├── defensify.test.js │ ├── integration.test.js │ └── test.html └── tsconfig.json ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── scripts ├── backup-phone-history.sh ├── browser_history.py └── promnesia ├── src └── promnesia │ ├── __init__.py │ ├── __main__.py │ ├── cannon.py │ ├── common.py │ ├── compare.py │ ├── compat.py │ ├── config.py │ ├── database │ ├── common.py │ ├── dump.py │ └── load.py │ ├── extract.py │ ├── logging.py │ ├── misc │ ├── __init__.pyi │ ├── config_example.py │ └── install_server.py │ ├── py.typed │ ├── server.py │ ├── sources │ ├── auto.py │ ├── auto_logseq.py │ ├── auto_obsidian.py │ ├── browser.py │ ├── browser_legacy.py │ ├── demo.py │ ├── fbmessenger.py │ ├── filetypes.py │ ├── github.py │ ├── guess.py │ ├── hackernews.py │ ├── hpi.py │ ├── html.py │ ├── hypothesis.py │ ├── instapaper.py │ ├── markdown.py │ ├── org.py │ ├── plaintext.py │ ├── pocket.py │ ├── reddit.py │ ├── roamresearch.py │ ├── rss.py │ ├── shellcmd.py │ ├── signal.py │ ├── smscalls.py │ ├── stackexchange.py │ ├── takeout.py │ ├── takeout_legacy.py │ ├── telegram.py │ ├── telegram_legacy.py │ ├── twitter.py │ ├── vcs.py │ ├── viber.py │ ├── website.py │ └── zulip.py │ ├── sqlite.py │ └── tests │ ├── __init__.py │ ├── common.py │ ├── server_helper.py │ ├── sources │ ├── __init__.py │ ├── test_auto.py │ ├── test_filetypes.py │ ├── test_hypothesis.py │ ├── test_org.py │ ├── test_plaintext.py │ ├── test_shellcmd.py │ └── test_takeout.py │ ├── test_cannon.py │ ├── 
test_cli.py │ ├── test_compare.py │ ├── test_config.py │ ├── test_db_dump.py │ ├── test_extract.py │ ├── test_extract_urls.py │ ├── test_indexer.py │ ├── test_server.py │ ├── test_traverse.py │ └── utils.py ├── tests ├── addon.py ├── addon_helper.py ├── common.py ├── convert_screencast.py ├── demos.py ├── end2end_test.py ├── install_and_run ├── record.py ├── testdata │ ├── auto │ │ ├── orgs │ │ │ ├── file.org │ │ │ ├── file2.org │ │ │ ├── file3.org │ │ │ ├── file4.org │ │ │ └── file5.org │ │ └── pocket.json │ ├── custom │ │ ├── file1.txt │ │ └── file2.txt │ ├── logseq-graph │ │ ├── logseq │ │ │ └── config.edn │ │ └── pages │ │ │ └── Note.md │ ├── normalise │ │ └── ff.txt │ ├── obsidian-vault │ │ ├── .obsidian │ │ │ └── app.json │ │ └── Note.md │ ├── takeout-20150518T000000Z.zip │ ├── takeout │ │ └── Takeout │ │ │ └── My Activity │ │ │ └── Chrome │ │ │ └── MyActivity.html │ ├── test_config.py │ ├── test_multiple_page_updates │ │ └── index.html │ ├── traverse │ │ ├── ignoreme.txt │ │ ├── ignoreme2 │ │ │ └── notrealignored.txt │ │ ├── imhere.txt │ │ └── imhere2 │ │ │ └── real.txt │ └── weird.txt └── webdriver_utils.py └── tox.ini /.ci/end2end/.dockerignore: -------------------------------------------------------------------------------- 1 | build_and_run.sh 2 | -------------------------------------------------------------------------------- /.ci/end2end/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:latest 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | COPY scripts /scripts 6 | 7 | RUN /scripts/setup_firefox.sh \ 8 | && /scripts/setup_chrome.sh \ 9 | && /scripts/setup_node.sh \ 10 | && apt install --yes pipx git \ 11 | # using python docs as a source of some html test data 12 | # need to prevent dpkg from excluding doc files... 13 | && sed -i '/usr.share.doc/d' /etc/dpkg/dpkg.cfg.d/excludes && apt install --yes python3-doc \ 14 | && apt clean \ 15 | && mkdir /promnesia 16 | 17 | WORKDIR /promnesia 18 | 19 | ENTRYPOINT ["/scripts/build_and_run_tests.sh"] 20 | -------------------------------------------------------------------------------- /.ci/end2end/build_and_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | # TODO assert we're running under github ci? 5 | # since this setup is kinda elaborate and can be somewhat unsafe to run blindly 6 | 7 | # supposed to be called from promnesia repository root 8 | [ -e src/promnesia ] 9 | [ -e extension/src ] 10 | 11 | PROMNESIA_SRC="$(pwd)" 12 | 13 | cd .ci/end2end 14 | 15 | IMAGE='promnesia_end2end_tests' 16 | 17 | docker build -t "$IMAGE" . 18 | 19 | # NOTE: dev/shm mount to prevent crashes during headless chrome 20 | docker run -v /dev/shm:/dev/shm --mount "type=bind,src=$PROMNESIA_SRC,dst=/promnesia_source,readonly=true" -e CI "$IMAGE" "$@" 21 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/build_and_run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | # Seems wrong to keep the whole repository in docker build context. 5 | # So instead, we mount the repository inside the container (into /promnesia_source) 6 | # (as read only to avoid messing up host files and crapping with caches etc.) 7 | # However to actually run tests we do need a writable directory.. 
8 | # So we copy the repo to the actual working dir here 9 | 10 | # ugh, kinda annoying -- not sure how to update source files when we change them on the host system... 11 | cp -R -T /promnesia_source /promnesia 12 | extension/.ci/build 13 | 14 | git init # todo ??? otherwise setuptools-scm fails to detect the version... 15 | 16 | # eh. kinda annoying to jump over so many venv layer here... 17 | # but docker runs as root and it doesn't like pip install uv now 18 | # even if you pass --break-system-packages, then subsequent uv invocation also fails 19 | pipx run uv tool run --with=tox-uv tox -e end2end -- "$@" 20 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_chrome.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | 6 | apt install --yes wget 7 | 8 | install -d -m 0755 /etc/apt/keyrings 9 | wget -q https://dl.google.com/linux/linux_signing_key.pub -O- | tee /etc/apt/keyrings/linux_signing_key.pub.asc > /dev/null 10 | echo "deb [signed-by=/etc/apt/keyrings/linux_signing_key.pub.asc] https://dl.google.com/linux/chrome/deb/ stable main" | tee -a /etc/apt/sources.list.d/google-chrome.list > /dev/null 11 | 12 | apt update 13 | 14 | apt install --yes google-chrome-stable 15 | 16 | # sadly latest version of chrome/chromedriver isn't working due to some bugs with iframes (see install_custom_chrome) 17 | 18 | # remove the actual chrome to get it out of the way (we do want dependencies though) 19 | apt remove --yes google-chrome-stable 20 | ! which google-chrome # check there is no binary (in case of virtual packages or whatever) 21 | 22 | function install_custom_chrome() { 23 | ## this installs last revision that was actually working (1110897) or 113.0.5623.0 24 | ## see https://bugs.chromium.org/p/chromedriver/issues/detail?id=4440 25 | apt install --yes unzip 26 | 27 | mkdir /tmp/chrome 28 | 29 | wget -q 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1110897%2Fchrome-linux.zip?generation=1677589092014487&alt=media' \ 30 | -O /tmp/chrome/chrome-linux.zip 31 | unzip /tmp/chrome/chrome-linux.zip -d /tmp/chrome 32 | ln -sf /tmp/chrome/chrome-linux/chrome /usr/bin/google-chrome 33 | 34 | wget -q 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1110897%2Fchromedriver_linux64.zip?generation=1677589097630198&alt=media' \ 35 | -O /tmp/chrome/chromedriver_linux64.zip 36 | unzip /tmp/chrome/chromedriver_linux64.zip -d /tmp/chrome 37 | ln -sf /tmp/chrome/chromedriver_linux64/chromedriver /usr/bin/chromedriver 38 | } 39 | 40 | install_custom_chrome 41 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_firefox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | 6 | apt install --yes wget 7 | 8 | # NOTE: these days ubuntu provisions firefox via snap, and it's a nightmare to make it work with webdriver 9 | # so we force it to use a regular package (following these instructions https://askubuntu.com/a/1510872/427470) 10 | install -d -m 0755 /etc/apt/keyrings 11 | wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- | tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null 12 | echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | 
tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null 13 | 14 | # prevent snap version from overriding: 15 | echo ' 16 | Package: * 17 | Pin: origin packages.mozilla.org 18 | Pin-Priority: 1000 19 | ' | tee /etc/apt/preferences.d/mozilla 20 | # to check: -- should not show anything mentioning snap 21 | # apt install --verbose-versions --dry-run firefox 22 | 23 | apt update 24 | 25 | apt install --yes firefox 26 | # NOTE: selenium should download the corresponding geckodriver itself via selenium_manager 27 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | apt install --yes curl 6 | curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 7 | apt install --yes nodejs 8 | -------------------------------------------------------------------------------- /.ci/fake-systemd/systemctl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # dummy systemctl implementation that's capable of running the service and nothing else 3 | 4 | import argparse 5 | from pathlib import Path 6 | from subprocess import Popen 7 | import sys 8 | 9 | 10 | def main(): 11 | args = sys.argv[1:] 12 | args = [x for x in args if not x.startswith('--')] 13 | 14 | print(args) 15 | cmd = args[0] 16 | if cmd != 'start': 17 | return 18 | 19 | name = args[1] 20 | sdir = Path('~/.config/systemd/user').expanduser() 21 | unit = sdir / name 22 | 23 | contents = unit.read_text() 24 | 25 | ES = 'ExecStart=' 26 | command = None 27 | for line in contents.splitlines(): 28 | if line.startswith(ES): 29 | command = line[len(ES):] 30 | break 31 | assert command is not None, contents 32 | 33 | # after that it will be inherited by init 34 | print('Running: ' + command) 35 | Popen(command, shell=True) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /.ci/github-ci-compat: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eux 2 | 3 | # install sudo if it's missing 4 | # probably means that we're running under local docker.. 5 | if ! which sudo; then 6 | apt update 7 | apt -y install sudo 8 | fi 9 | 10 | # make up for differences between ubuntu:focal and github action image... 11 | sudo apt -y install python3.12 python3.12-dev 12 | sudo apt -y install python3-pip python3-setuptools 13 | 14 | # otherwise setuptools doesn't work.. 15 | sudo apt -y install git 16 | 17 | # jq wants it?? 18 | sudo apt -y install dh-autoreconf -------------------------------------------------------------------------------- /.ci/release: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Run [[file:scripts/release][scripts/release]] to deploy Python package onto [[https://pypi.org][PyPi]] and [[https://test.pypi.org][test PyPi]]. 4 | 5 | The script expects the =TWINE_PASSWORD= environment variable to contain the [[https://pypi.org/help/#apitoken][PyPi token]] (not the password!). 6 | 7 | The script can be run manually. 8 | It's also running as the =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].
Packages are deployed on: 9 | - every master commit, onto test pypi 10 | - every new tag, onto production pypi 11 | 12 | You'll need to set =TWINE_PASSWORD= and =TWINE_PASSWORD_TEST= in [[https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets#creating-encrypted-secrets][secrets]] 13 | for Github Actions deployment to work. 14 | ''' 15 | 16 | import os 17 | import sys 18 | from pathlib import Path 19 | from subprocess import check_call 20 | import shutil 21 | 22 | is_ci = os.environ.get('CI') is not None 23 | 24 | def main() -> None: 25 | import argparse 26 | p = argparse.ArgumentParser() 27 | p.add_argument('--test', action='store_true', help='use test pypi') 28 | args = p.parse_args() 29 | 30 | extra = [] 31 | if args.test: 32 | extra.extend(['--repository', 'testpypi']) 33 | 34 | root = Path(__file__).absolute().parent.parent 35 | os.chdir(root) # just in case 36 | 37 | if is_ci: 38 | # see https://github.com/actions/checkout/issues/217 39 | check_call('git fetch --prune --unshallow'.split()) 40 | 41 | dist = root / 'dist' 42 | if dist.exists(): 43 | shutil.rmtree(dist) 44 | 45 | check_call(['python3', '-m', 'build']) 46 | 47 | TP = 'TWINE_PASSWORD' 48 | password = os.environ.get(TP) 49 | if password is None: 50 | print(f"WARNING: no {TP} passed", file=sys.stderr) 51 | import pip_secrets 52 | password = pip_secrets.token_test if args.test else pip_secrets.token # meh 53 | 54 | check_call([ 55 | 'python3', '-m', 'twine', 56 | 'upload', *dist.iterdir(), 57 | *extra, 58 | ], env={ 59 | 'TWINE_USERNAME': '__token__', 60 | TP: password, 61 | **os.environ, 62 | }) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /.ci/release-uv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]]. 4 | 5 | - running manually 6 | 7 | You'll need =UV_PUBLISH_TOKEN= env variable 8 | 9 | - running on Github Actions 10 | 11 | Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi 12 | 13 | It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. 14 | Packages are deployed on: 15 | - every master commit, onto test pypi 16 | - every new tag, onto production pypi 17 | ''' 18 | 19 | UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN' 20 | 21 | import argparse 22 | import os 23 | import shutil 24 | from pathlib import Path 25 | from subprocess import check_call 26 | 27 | is_ci = os.environ.get('CI') is not None 28 | 29 | def main() -> None: 30 | p = argparse.ArgumentParser() 31 | p.add_argument('--use-test-pypi', action='store_true') 32 | args = p.parse_args() 33 | 34 | publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else [] 35 | 36 | root = Path(__file__).absolute().parent.parent 37 | os.chdir(root) # just in case 38 | 39 | if is_ci: 40 | # see https://github.com/actions/checkout/issues/217 41 | check_call('git fetch --prune --unshallow'.split()) 42 | 43 | # TODO ok, for now uv won't remove dist dir if it already exists 44 | # https://github.com/astral-sh/uv/issues/10293 45 | dist = root / 'dist' 46 | if dist.exists(): 47 | shutil.rmtree(dist) 48 | 49 | # todo what is --force-pep517? 
50 | check_call(['uv', 'build']) 51 | 52 | if not is_ci: 53 | # CI relies on trusted publishers so doesn't need env variable 54 | assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed' 55 | 56 | check_call(['uv', 'publish', *publish_url]) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | # TODO: pyjq is not necessary anymore? will keep CI deps just in case I guess 19 | PYJQ_DEPS=('autoconf' 'automake' 'libtool') # see https://github.com/mwilliamson/jq.py#installation 20 | case "$OSTYPE" in 21 | darwin*) 22 | # macos 23 | brew install "${PYJQ_DEPS[@]}" 24 | 25 | # TODO hmm. this should be in setup.py? 26 | brew install libmagic # for python-magic 27 | ;; 28 | cygwin* | msys* | win*) 29 | # windows 30 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 31 | tox_cmd='run' 32 | ;; 33 | *) 34 | # must be linux? 35 | sudo apt update 36 | 37 | # TODO also need to warn from readme?? 38 | sudo apt install "${PYJQ_DEPS[@]}" python3-dev 39 | ;; 40 | esac 41 | fi 42 | 43 | # NOTE: expects uv installed 44 | uv tool run --with tox-uv tox $tox_cmd "$@" 45 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | # org-mode test data 2 | [submodule "tests/testdata/ox-hugo"] 3 | path = tests/testdata/ox-hugo 4 | url = https://github.com/kaushalmodi/ox-hugo.git 5 | # hypothesis module + testdata 6 | [submodule "tests/testdata/hypexport"] 7 | path = tests/testdata/hypexport 8 | url = https://github.com/karlicoss/hypexport.git 9 | -------------------------------------------------------------------------------- /CHANGELOG.org: -------------------------------------------------------------------------------- 1 | * =v1.0.20210415= 2 | 3 | Thanks @ankostis, @purarue, @gms8994, @Cobertos and others for changes! 4 | 5 | ** general 6 | - *deprecate*: if you have =import promnesia= in the config you should switch it to =import promnesia.common= 7 | 8 | see https://github.com/karlicoss/promnesia/pull/225 9 | This brings us closer towards making promnesia a namespaced package to allow for better extensibility. 10 | - better Windows support https://github.com/karlicoss/promnesia/pull/197 11 | 12 | ** indexer 13 | - *new*: 'update' style indexing is now the default https://github.com/karlicoss/promnesia/pull/211 14 | 15 | It means that database won't be emptied before reindexing, so if you only index a single datasource, the data for other datasources will be untouched. 
16 | If you want the previous behaviour, you can use =--overwrite= 17 | - fixes for race conditions during 'update' style indexing https://github.com/karlicoss/promnesia/pull/220 18 | - minor cannon enhancements 19 | 20 | ** server 21 | - fix deprecation in sqlalchemy API https://github.com/karlicoss/promnesia/pull/221 22 | 23 | ** sources 24 | 25 | - *new*: viber data source (local desktop database) 26 | - https://github.com/karlicoss/promnesia/pull/204 27 | - https://github.com/karlicoss/promnesia/pull/208 28 | - https://github.com/karlicoss/promnesia/pull/224 29 | - *new*: safari browser data https://github.com/karlicoss/promnesia/pull/207 30 | - *new*: stackexchange source https://github.com/karlicoss/promnesia/pull/189 31 | - auto indexer: better directory pruning https://github.com/karlicoss/promnesia/pull/209 32 | - telegram: enhancements to opt out of non-http link extraction 33 | - firefox: handle Fenix databases properly https://github.com/karlicoss/promnesia/pull/227 34 | - hypothesis: 35 | - extract tags https://github.com/karlicoss/promnesia/pull/199 36 | - extract URLs from annotation text https://github.com/karlicoss/promnesia/pull/222 37 | 38 | * for older versions, see https://github.com/karlicoss/promnesia/releases 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmarks/20231115.org: -------------------------------------------------------------------------------- 1 | Running on @karlicoss desktop PC, =python3.10= (under docker) 2 | 3 | - dumping via cachew NTBinder, using regular sqlalchemy insert statement 4 | 5 | #+begin_example 6 | $ python3 -m pytest --import-mode=importlib --pyargs promnesia.tests -s -k 'gc_off and benchmark and 100000' 7 | 12.90s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 8 | PASSED src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 9 | #+end_example 10 | 11 | 12 | - dumping via cachew NTBinder, using dbengine to insert directly -- massive speedup (added a test with 1M visits too) 13 | #+begin_example 14 | $ python3 -m pytest --import-mode=importlib --pyargs promnesia.tests -s -k 'gc_off and benchmark and 100000' 15 | 0.85s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 16 | 8.23s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-1000000] 17 | #+end_example 18 | -------------------------------------------------------------------------------- /ci/run-github-locally: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eux 2 | 3 | cd "$(dirname "$0")" 4 | 5 | cd .. 6 | 7 | act -P ubuntu-latest=ubuntu:bionic "$@" 8 | 9 | # you can docker exec -it /bin/bash into the container and debug there 10 | 11 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly 2 | # without it, pytest can't discover the package root for some reason 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more 4 | 5 | import os 6 | import pathlib 7 | from typing import Optional 8 | 9 | import _pytest.main 10 | import _pytest.pathlib 11 | 12 | # we consider all dirs in repo/ to be namespace packages 13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src' 14 | assert root_dir.exists(), root_dir 15 | 16 | # TODO assert it contains package name?? maybe get it via setuptools.. 17 | 18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()] 19 | 20 | # resolve_package_path is called from _pytest.pathlib.import_path 21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem 22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path 23 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]: 24 | result = path # search from the test file upwards 25 | for parent in result.parents: 26 | if str(parent) in namespace_pkg_dirs: 27 | return parent 28 | if os.name == 'nt': 29 | # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx 30 | if path.name == 'conftest.py': 31 | return resolve_pkg_path_orig(path) 32 | raise RuntimeError("Couldn't determine path for ", path) 33 | _pytest.pathlib.resolve_package_path = resolve_package_path 34 | 35 | 36 | # without patching, the orig function returns just a package name for some reason 37 | # (I think it's used as a sort of fallback) 38 | # so we need to point it at the absolute path properly 39 | # not sure what are the consequences.. 
maybe it wouldn't be able to run against installed packages? not sure.. 40 | search_pypath_orig = _pytest.main.search_pypath 41 | def search_pypath(module_name: str) -> str: 42 | mpath = root_dir / module_name.replace('.', os.sep) 43 | if not mpath.is_dir(): 44 | mpath = mpath.with_suffix('.py') 45 | assert mpath.exists(), mpath # just in case 46 | return str(mpath) 47 | _pytest.main.search_pypath = search_pypath 48 | -------------------------------------------------------------------------------- /doc/PRIVACY.org: -------------------------------------------------------------------------------- 1 | *TLDR: the Promnesia extension isn't collecting any browser data and isn't transmitting your data to any external site. No telemetry is collected or sent either.* 2 | 3 | * How Promnesia works 4 | See [[file:../README.org#how-does-it-work]["How does it work"]] for the longer explanation, but in summary: 5 | 6 | - promnesia indexer runs against the files on your computer, and stores the results in the intermediate database (also on your computer) 7 | - promnesia server runs on your computer and exposes a local port (=13131= by default), so it can communicate with the browser extension 8 | - promnesia extension runs in your browser and requests various data from the promnesia server 9 | 10 | *By default this all happens only within your system*. The extension works in local-first/offline mode, which is one of the main goals. 11 | (However, if you want, you can set it up on an external domain, with a reverse proxy or something like that). 12 | 13 | * What data is requested from the promnesia server 14 | 15 | The extension only makes a few kinds of requests to the Promnesia server at the moment. 16 | 17 | - ~search/search_around~: sends whatever you typed in the search box 18 | - ~visits~: sends the current tab URL to the server (unless it's excluded) 19 | - ~visited~: sends all URLs on the current page to the server (except the excluded ones) 20 | 21 | None of the information above is kept by the server; it *only reads the data from the database*. 22 | It is, however, possible that some of it is retained in the console logs; that would depend on how exactly you're running the server and the log retention policy of your system. 23 | 24 | # for fuck's sake, github doesn't support file:GUIDE.org::#excludelist link... 25 | # so it's either broken in emacs or in org-mode. fucking hell 26 | For maximum privacy you can use the [[file:GUIDE.org#excludelist][excludelist]] feature; this will prevent Promnesia from processing the page/domain. 27 | 28 | # TODO if you want to back up your browser history and feed it into promnesia (e.g. to overcome the 90 days limit etc) 29 | 30 | ** External requests 31 | 32 | - if you have [[file:GUIDE.org#excludelist][external excludelist URLs]] in the extension settings, they will be updated now and then 33 | 34 | These excludelists are downloaded and cached locally, so *no information about when and what you're browsing gets out*.
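If you want to verify this behaviour for yourself, you can talk to the local server directly and observe that nothing leaves your machine. Below is a rough, illustrative sketch of the kind of request the extension makes (it assumes the default =localhost:13131= setup; the exact endpoint payloads can differ between versions, so treat it as an assumption and check =src/promnesia/server.py= for the actual request shape):

#+begin_src bash
# ask the local promnesia server what it knows about a URL --
# roughly what the extension's 'visits' request does; everything stays on localhost
curl -s -X POST 'http://localhost:13131/visits' \
     -H 'Content-Type: application/json' \
     -d '{"url": "https://example.com"}'
#+end_src

The other endpoints listed above work the same way: plain local HTTP calls that only read from the database built by the indexer.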
35 | 36 | 37 | * Extension permissions 38 | - =file/http/https=: the extension is meant to work on any page, hence such a broad scope 39 | this is necessary for webNavigation callbacks to work properly and update icon/sidebar 40 | 41 | - =storage=: for settings 42 | - =webNavigation=: receiving page status updates so the extension kicks in on page loading 43 | - =contextMenus=: context menu actions 44 | 45 | These permissions are required at the moment, but there is an [[https://github.com/karlicoss/promnesia/issues/97][issue]] for work on possibly making them optional. 46 | 47 | - =notifications=: showing notifications 48 | 49 | - =history=: to use local browsing history 50 | 51 | Local history isn't strictly required, so we could omit this if people prefer. 52 | - =bookmarks=: used as one of the sources 53 | 54 | It can already be toggled in the settings, so the permission could be dynamic too 55 | 56 | * Security 57 | While I have some reasonable understanding of security, I'm no expert, so I would be very grateful if you flag potential issues or [[https://github.com/karlicoss/promnesia/issues/14][go through the code]] (especially extension). 58 | 59 | There is some ongoing work on Dockerizing: [[https://github.com/karlicoss/promnesia/issues/55][issues/55]]. 60 | -------------------------------------------------------------------------------- /doc/addons-mozilla-org.org: -------------------------------------------------------------------------------- 1 | Sometimes the extension is flagged for review and you're asked to provide unminified source code and build instructions. 2 | 3 | To provide the source code, run: =git archive master --output=promnesia-source.zip= 4 | 5 | You can also point them at https://github.com/karlicoss/promnesia/tree/master/extension 6 | 7 | The build instructions assume that the zip file with source code is in =/path/to/promnesia-source.zip= (on the HOST system). 8 | *Make sure to replace it with the actual path to the source code zip file.* 9 | 10 | 11 | * Building addon 12 | 13 | To build you need *Ubuntu 24.04/Noble* and *Node 20*. The easiest way to build cleanly would be a Docker container: 14 | 15 | #+begin_src 16 | # on the HOST system: cleanup previous container -- if it didn't exist in the first place, it will show an error; ignore it 17 | docker rm -f promnesia_build 18 | 19 | # on the HOST system: create the container 20 | docker create --name promnesia_build -it ubuntu:noble /bin/bash 21 | 22 | # on the HOST system: put the sources into the container 23 | docker cp /path/to/promnesia-source.zip promnesia_build:/promnesia.zip 24 | 25 | # on the HOST system: start the container 26 | docker start -i promnesia_build 27 | 28 | #+end_src 29 | 30 | After that, build the addon (run these commands INSIDE the container if you choose to do it with Docker): 31 | 32 | #+begin_src 33 | $ apt update && apt install -y git curl unzip 34 | $ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 35 | $ DEBIAN_FRONTEND=noninteractive apt install -y nodejs 36 | $ unzip promnesia.zip -d promnesia 37 | $ cd promnesia 38 | $ npm install 39 | $ ./build --firefox --release --lint --publish=skip 40 | #+end_src 41 | 42 | The final artifact will be in =/promnesia/dist/artifacts/firefox/promnesia-.zip= (INSIDE the container). 43 | 44 | If you need to get it back onto the HOST system (e.g. to test in the browser), run on the HOST system (e.g. in a separate terminal): 45 | 46 | #+begin_src 47 | docker cp promnesia_build:/promnesia/dist/artifacts/firefox/promnesia-.zip .
48 | #+end_src 49 | 50 | This will copy it into the current directory on the HOST system. 51 | 52 | # TODO: how to keep the instructions consistent with the CI? 53 | 54 | -------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | user_data/ 2 | -------------------------------------------------------------------------------- /docker/docker_files/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN mkdir /user_data \ 4 | && mkdir /usr/src/promnesia 5 | 6 | WORKDIR /usr/src/promnesia 7 | COPY src/ . 8 | COPY setup.py /usr/src/ 9 | 10 | #RUN python /usr/src/setup.py #LookupError: setuptools-scm was unable to detect version for '/usr/src/promnesia'. 11 | 12 | RUN pip install --no-cache-dir more_itertools pytz sqlalchemy cachew \ 13 | appdirs urlextract python-magic \ 14 | tzlocal \ 15 | logzero HPI beautifulsoup4 lxml mistletoe orgparse dataset fastapi uvicorn 16 | 17 | ENV PPATH=/usr/src/promnesia:${PPATH} 18 | VOLUME /user_data 19 | 20 | EXPOSE 13131 21 | CMD ["python", "-m", "promnesia", "serve", "--db", "/user_data/promnesia.sqlite", "--port", "13131"] 22 | -------------------------------------------------------------------------------- /docker/docker_files/Dockerfile-indexer: -------------------------------------------------------------------------------- 1 | FROM promnesia:latest 2 | 3 | RUN apt-get update && apt-get install -y cron 4 | 5 | COPY docker/docker_files/indexer-entrypoint.sh / 6 | ENTRYPOINT ["/indexer-entrypoint.sh"] 7 | -------------------------------------------------------------------------------- /docker/docker_files/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | 3 | services: 4 | server: 5 | image: promnesia 6 | build: 7 | context: ../../ 8 | dockerfile: docker/docker_files/Dockerfile 9 | # Uncomment to enable persistent volumes 10 | volumes: 11 | - ../user_data:/user_data 12 | ports: 13 | - "127.0.0.1:13131:13131" 14 | restart: always 15 | indexer: 16 | depends_on: 17 | - server 18 | image: promnesia-indexer 19 | build: 20 | context: ../../ 21 | dockerfile: docker/docker_files/Dockerfile-indexer 22 | # Uncomment to enable persistent volumes 23 | volumes: 24 | - ../user_data:/user_data 25 | environment: 26 | # run once every minute 27 | CRONTAB: "0-59/1 * * * * cd /usr/src/promnesia && /usr/local/bin/python -m promnesia index --config /user_data/indexer-config.py" 28 | -------------------------------------------------------------------------------- /docker/docker_files/indexer-config.py.example: -------------------------------------------------------------------------------- 1 | from promnesia import Source 2 | from promnesia.sources import auto 3 | 4 | # todo: we should probably have separate docker volumes for sources and for config/db 5 | 6 | SOURCES = [ 7 | Source( 8 | auto.index, 9 | '/user_data/source1/', 10 | ), 11 | # Source( 12 | # auto.index, 13 | # '/source2/', 14 | # ) 15 | ] 16 | 17 | OUTPUT_DIR = '/user_data/' 18 | 19 | # this is not supported yet. also, it should probably be named something other than MIME_HANDLER.
20 | #import os 21 | #MIME_HANDLER = 'editor://' + os.path.realpath(os.path.dirname(os.path.realpath(__file__)) + '../') 22 | -------------------------------------------------------------------------------- /docker/docker_files/indexer-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # note: https://lostindetails.com/articles/How-to-run-cron-inside-Docker 4 | # note: CRONTAB is set in docker-compose.yaml. 5 | 6 | echo "${CRONTAB} > /proc/1/fd/1 2>/proc/1/fd/2" | crontab - 7 | cron -f 8 | -------------------------------------------------------------------------------- /docker/get-some-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | cd user_data/ 6 | mkdir source1 7 | cd source1 8 | echo "i like https://github.com/karlicoss/promnesia." >> my_notes.txt 9 | git clone https://github.com/karlicoss/exobrain 10 | git clone https://github.com/koo5/notes 11 | 12 | -------------------------------------------------------------------------------- /docker/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | mkdir user_data 5 | cp docker_files/indexer-config.py.example user_data/indexer-config.py 6 | ./get-some-data.sh 7 | 8 | # the config file will be periodically reloaded by the indexer process, and data sources will be periodically re-indexed. 9 | 10 | -------------------------------------------------------------------------------- /docker/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | docker-compose -f docker_files/docker-compose.yaml build && docker-compose -f docker_files/docker-compose.yaml up 5 | 6 | 7 | -------------------------------------------------------------------------------- /extension/.ci/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | cd "$(dirname "$0")" 5 | cd ../.. 
# git root 6 | 7 | cd extension 8 | 9 | npm install 10 | 11 | FAILED=0 12 | 13 | npm run eslint || FAILED=1 14 | npm run test || FAILED=1 15 | 16 | for browser in 'firefox' 'chrome'; do 17 | ./build --target "$browser" "$@" 18 | done 19 | 20 | exit "$FAILED" 21 | -------------------------------------------------------------------------------- /extension/.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | end_of_line = lf 9 | charset = utf-8 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | -------------------------------------------------------------------------------- /extension/MANUAL-TESTS.org: -------------------------------------------------------------------------------- 1 | These are tests that aren't yet in end2end/extension unit tests 2 | 3 | * [2023-01-06 Fri 00:18] check that linkifying is working 4 | - go to https://wiki.openhumans.org/wiki/Personal_Science_Wiki 5 | - open sidebar 6 | - [[https://wiki.openhumans.org/wiki/Personal_Science_Wiki][Personal Science Wiki]] should be in contexts 7 | - *expected*: https://wiki.openhumans.org/wiki/Personal_Science_Wiki is highlighted as URL 8 | -------------------------------------------------------------------------------- /extension/TODO.org: -------------------------------------------------------------------------------- 1 | #+FILETAGS: promnesia 2 | 3 | * Building 4 | apparently needs `npm install` done [once?] 5 | ** TODO [2019-06-13 Thu 19:37] web-ext thing https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/web-ext_command_reference#web-ext_sign 6 | in the addon folder 7 | ** web-ext lint (TODO file:/// permission ; eval) 8 | ** web-ext build --overwrite-dest TODO --self-hosted? 9 | ** DONE learn how to sign https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/web-ext_command_reference#web-ext_sign 10 | CLOSED: [2019-08-10 Sat 12:11] 11 | :LOGBOOK: 12 | - State "DONE" from "TODO" [2019-08-10 Sat 12:11] 13 | :END: 14 | 15 | * Testing scenarios 16 | ** [#A] [2019-04-19 Fri 22:38] open new visited tab via a link (have a html page for that) 17 | 18 | Pinboard: public bookmarks for tswaterman 19 | https://pinboard.in/u:tswaterman/ 20 | Comment: 21 | shit. it doesn't trigger if I open new tab form the link, but seems to work fine if I create new tab and enter the url.. 22 | *** [2019-04-21 Sun 09:52] 23 | onCreated with url set 24 | onDomContentLoaded 25 | onUpdated with status complete 26 | 27 | ** refreshing the tab 28 | multiple onDomContentLoaded 29 | onUpdated with status complete 30 | 31 | ** click url in the same tab 32 | onUpdated with url set 33 | onDomContentLoaded 34 | onUpdated with status complete 35 | (that works now) 36 | 37 | ** url changes automatically in the same tab (e.g. 
youtube) 38 | ------ tested on YT watch later 39 | onUpdated with url set 40 | NO onDomContentLoaded 41 | onUpdated with status complete 42 | 43 | 44 | ** open new empty tab, type in url, enter 45 | first, when you open the tab: 46 | onCreated with empty tab 47 | onUpdated with url set 48 | onUpdated with complete 49 | then when you type in the url: 50 | onUpdated with url set 51 | onDomContentLoaded 52 | onUpdated with status complete 53 | 54 | 55 | ** tab open triggered via external link handling 56 | onCreated with url set 57 | onDomContentLoaded 58 | onUpdated with status complete 59 | ** [2019-04-21 Sun 10:16] ok, overall onUpdated(complete) seems the most reliable. the only a bit annoying thing is that it happens late sometimes 60 | I guess I could keep some sort of cache? not sure.. 61 | ** TODO [2019-08-10 Sat 12:11] move them to end2end test 62 | -------------------------------------------------------------------------------- /extension/__mocks__/browser.js: -------------------------------------------------------------------------------- 1 | // this is to prevent tests failing on importing browser polyfill 2 | // see https://stackoverflow.com/questions/73809020/cant-mock-webextension-polyfill-for-jest-unit-tests 3 | import { jest } from "@jest/globals" 4 | 5 | const mockBrowser = { 6 | history: { 7 | getVisits: jest.fn(), 8 | search : jest.fn(), 9 | }, 10 | bookmarks: { 11 | getTree: jest.fn(), 12 | }, 13 | storage: { 14 | sync: { 15 | // meh. 16 | get: (name, res) => { 17 | res({'options': { 18 | host: 'http://badhost:43210', // some random port, to cause it fail 19 | }}) 20 | } 21 | }, 22 | }, 23 | runtime: { 24 | lastError: null, 25 | getManifest : () => { return {version: 'whatever'} }, 26 | getPlatformInfo: async () => {}, 27 | }, 28 | } 29 | 30 | export default mockBrowser 31 | -------------------------------------------------------------------------------- /extension/__mocks__/dom-form-serializer.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | serialize: null, 3 | deserialize: null, 4 | } 5 | -------------------------------------------------------------------------------- /extension/amo-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": { 3 | "approval_notes": " 4 | You can find up-to-date extension code here https://github.com/karlicoss/promnesia/tree/master/extension 5 | 6 | The build instructions assume that the zip file with source code is in =/path/to/promnesia-source.zip= (on the HOST system). 7 | *Make sure to replace it with the actual path to the source code zip file.* 8 | 9 | To build you need *Ubuntu 24.04/Noble* and *Node 20*. 
The easiest way to build cleanly would be a Docker container: 10 | 11 | ``` 12 | # on the HOST system: cleanup previous container -- if it didn't exist in the first place, it will show an error; ignore it 13 | docker rm -f promnesia_build 14 | 15 | # on the HOST system: create the container 16 | docker create --name promnesia_build -it ubuntu:noble /bin/bash 17 | 18 | # on the HOST system: put the sources into the container 19 | docker cp /path/to/promnesia-source.zip promnesia_build:/promnesia.zip 20 | 21 | # on the HOST system: start the container 22 | docker start -i promnesia_build 23 | ``` 24 | 25 | After that, build the addon (run these commands INSIDE the container if you choose to do it with Docker): 26 | 27 | ``` 28 | $ apt update && apt install -y git curl unzip 29 | $ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 30 | $ DEBIAN_FRONTEND=noninteractive apt install -y nodejs 31 | $ unzip promnesia.zip -d promnesia 32 | $ cd promnesia 33 | $ npm install 34 | $ ./build --firefox --release --lint --publish=skip 35 | ``` 36 | 37 | 38 | The final artifact will be in =/promnesia/dist/artifacts/firefox/promnesia-.zip= (INSIDE the container). 39 | 40 | If you need to get it back onto the HOST system (e.g. to test in the browser), run on the HOST system (e.g. in a separate terminal): 41 | 42 | docker cp promnesia_build:/promnesia/dist/artifacts/firefox/promnesia-.zip . 43 | 44 | This will copy it into the current directory on the HOST system. 45 | " 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /extension/babel.config.cjs: -------------------------------------------------------------------------------- 1 | const presets = [ 2 | // this is necessary for jest? otherwise it can't import modules.. 3 | // ugh... I don't understand tbh, seems that even without preset-env, webpack respects browserlist?? 4 | // and looks like without preset-env the code is cleaner??? 5 | // but whatever, the difference is minor and I don't have energy to investigate now.. 6 | '@babel/preset-env', 7 | 8 | // also necessary for jest? otherwise fails to import typescript 9 | '@babel/preset-typescript', 10 | ] 11 | const plugins = [] 12 | 13 | // if (process.env["ENV"] === "prod") { 14 | // plugins.push(...); 15 | // } 16 | 17 | module.exports = { presets, plugins } 18 | -------------------------------------------------------------------------------- /extension/eslint.config.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | import globals from 'globals' 3 | import eslint from '@eslint/js' 4 | import tseslint from 'typescript-eslint' 5 | 6 | 7 | export default tseslint.config( 8 | eslint.configs.recommended, 9 | ...tseslint.configs.recommended, // TODO recommendedTypeChecked?? 10 | { 11 | rules: { 12 | '@typescript-eslint/no-explicit-any': 'off', 13 | '@typescript-eslint/ban-ts-comment': 'off', 14 | '@typescript-eslint/no-unused-vars': [ 15 | 'error', 16 | { 17 | "argsIgnorePattern": "^_", 18 | "varsIgnorePattern": "^_", 19 | "caughtErrorsIgnorePattern": "^_", 20 | }, 21 | ], 22 | }, 23 | languageOptions: { 24 | globals: { 25 | // necessary for document. window.
etc variables to work 26 | ...globals.browser, 27 | ...globals.webextensions, 28 | }, 29 | }, 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /extension/old/patcher.js: -------------------------------------------------------------------------------- 1 | // borrowed from https://github.com/newying61/node-module-patch-source-loader/blob/master/loader.js 2 | const loaderUtils = require('loader-utils') 3 | 4 | module.exports.default = function (source) { 5 | const options = this.getOptions() 6 | const patches = options.patches; 7 | for (const patch of patches) { 8 | let res = source.replace(patch.code, patch.newCode) 9 | /* TODO crap, apparently it overwrites inplace, so need to restore? 10 | * e.g. like here.. https://github.com/tugboatcoding/rewrite-source-webpack-plugin/blob/master/src/index.js */ 11 | if (res == source) { 12 | if (!res.includes(patch.newCode)) { // might be already patched 13 | throw Error(`Patch ${JSON.stringify(patch)} had no effect`) 14 | } 15 | } 16 | source = res 17 | } 18 | return source 19 | } 20 | -------------------------------------------------------------------------------- /extension/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Promnesia", 3 | "version": "1.3.1", 4 | "version_name": "released on 2024.06.06", 5 | "description": "Recall which pages you already visited, why and in which context", 6 | "scripts": { 7 | "test": "jest", 8 | "build": "rollup --config", 9 | "eslint": "eslint src", 10 | "web-ext": "web-ext", 11 | "release:cws": "chrome-webstore-upload" 12 | }, 13 | "browserslist": [ 14 | "defaults and supports es6-module" 15 | ], 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/karlicoss/promnesia.git" 19 | }, 20 | "author": "Dmitrii Gerasimov ", 21 | "license": "MIT", 22 | "bugs": { 23 | "url": "https://github.com/karlicoss/promnesia/issues" 24 | }, 25 | "homepage": "https://github.com/karlicoss/promnesia#readme", 26 | "dependencies": { 27 | "@codemirror/lang-css": "^6.0.1", 28 | "@codemirror/lang-javascript": "^6.1.2", 29 | "anchorme": "^3.0.8", 30 | "codemirror": "^6.0.1", 31 | "tippy.js": "^6.3.7", 32 | "webext-options-sync": "^4.2.1" 33 | }, 34 | "devDependencies": { 35 | "@babel/core": "^7.24.5", 36 | "@babel/eslint-parser": "^7.24.5", 37 | "@babel/preset-env": "^7.24.5", 38 | "@babel/preset-typescript": "^7.24.1", 39 | "@eslint/js": "^9.3.0", 40 | "@rollup/plugin-commonjs": "^25.0.8", 41 | "@rollup/plugin-node-resolve": "^15.2.3", 42 | "@rollup/plugin-replace": "^5.0.5", 43 | "@rollup/plugin-typescript": "^11.1.6", 44 | "@types/webextension-polyfill": "^0.10.7", 45 | "chrome-webstore-upload-cli": "^3.1.0", 46 | "eslint": "^8.57.0", 47 | "globals": "^15.3.0", 48 | "jest": "^29.5.0", 49 | "jest-environment-jsdom": "^29.5.0", 50 | "jest-fetch-mock": "^3.0.3", 51 | "node-fetch": "^3.3.2", 52 | "rollup": "^4.18.0", 53 | "rollup-plugin-copy": "^3.5.0", 54 | "tslib": "^2.6.2", 55 | "typescript": "^5.4.5", 56 | "typescript-eslint": "^7.10.0", 57 | "web-ext": "^8.0.0", 58 | "webextension-polyfill": "^0.12.0" 59 | }, 60 | "type": "module" 61 | } 62 | -------------------------------------------------------------------------------- /extension/src/background_chrome_mv2.js: -------------------------------------------------------------------------------- 1 | // hack to support ES moudle background page in chrome with manifest v2 2 | // see https://stackoverflow.com/a/71081597/706389 3 | (async() => { 4 | await 
import ('./background.js'); 5 | })() 6 | -------------------------------------------------------------------------------- /extension/src/compat.ts: -------------------------------------------------------------------------------- 1 | import browser from "webextension-polyfill" 2 | import type {Scripting} from "webextension-polyfill" 3 | 4 | import {assert} from './common' 5 | 6 | 7 | export async function executeScript(injection: Scripting.ScriptInjection): Promise { 8 | /** 9 | * In firefox, executeScript sets error property, whereas in chrome it just throws 10 | * (see https://issues.chromium.org/issues/40205757) 11 | * For consistency, this wrapper throws in all cases instead 12 | */ 13 | const results = await browser.scripting.executeScript(injection) 14 | assert(results.length == 1) 15 | const [{result, error}] = results 16 | if (error != null) { 17 | if (error instanceof Error) { 18 | throw error 19 | } else { 20 | throw new Error(`Error during executeScript: ${error}`) 21 | } 22 | } 23 | return result 24 | } 25 | -------------------------------------------------------------------------------- /extension/src/images/generate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | from tempfile import NamedTemporaryFile 4 | from subprocess import check_call 5 | 6 | def main(): 7 | size = 48 8 | source = Path(__file__).parent / f'source_{size}.svg' 9 | src_color = '#00ff00' 10 | src_svg = source.read_text() 11 | 12 | spec = [ 13 | (f'ic_visited_{size}.png' , '#00ff00'), 14 | (f'ic_relatives_{size}.png' , '#00cc99'), 15 | (f'ic_not_visited_{size}.png', '#999999'), 16 | (f'ic_boring_{size}.png' , '#550000'), 17 | (f'ic_blue_{size}.png' , '#6666ff'), 18 | (f'ic_blacklisted_{size}.png', '#000000'), 19 | (f'ic_error.png' , '#ff0000'), 20 | ] 21 | for fname, color in spec: 22 | svg = src_svg.replace(src_color, color) 23 | out = Path(__file__).parent / fname 24 | 25 | with NamedTemporaryFile(suffix='.svg') as ntf: 26 | path = Path(ntf.name) 27 | path.write_text(svg) 28 | 29 | check_call(['inkscape', '-z', '-e', str(out), str(path)]) 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /extension/src/images/ic_blacklisted_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_blacklisted_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_blue_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_blue_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_boring_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_boring_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_error.png 
-------------------------------------------------------------------------------- /extension/src/images/ic_not_visited_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_not_visited_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_relatives_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_relatives_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_visited_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_visited_48.png -------------------------------------------------------------------------------- /extension/src/images/source_48.svg: -------------------------------------------------------------------------------- 1 | 2 | 18 | 20 | 21 | 23 | image/svg+xml 24 | 26 | 27 | 28 | 29 | 31 | 51 | 55 | 59 | 60 | -------------------------------------------------------------------------------- /extension/src/normalise.ts: -------------------------------------------------------------------------------- 1 | import type {Url} from './common' 2 | 3 | // TODO should probably be merged with common or something... 4 | 5 | const 6 | STRIP_RULES = [ 7 | [[RegExp('.*') , RegExp('^\\w+://' )]], 8 | [[RegExp('.*') , RegExp('(www|ww|amp)\\.' )]], 9 | [[RegExp('.*') , RegExp('[&#].*$' )]], 10 | [[RegExp('.*') , RegExp('/$' )]], 11 | ] 12 | // TODO perhaps that should be semi-configurable 13 | 14 | // TODO maybe use that normalisation library and then adjust query params etc 15 | 16 | /* 17 | I think, most common usecases are: 18 | - blacklisting whole domain (e.g. for privacy reasons, like bank/etc or if something is broken) 19 | - blacklisting specific pages (e.g. reddit/twitter/fb main page so it doesn't result it too many child contexts) 20 | For that current approach is fine. 21 | */ 22 | 23 | // TODO careful about dots etc? 24 | 25 | export function normalise_url(url: string): string { 26 | let cur = url; 27 | STRIP_RULES.forEach((rules: Array>) => { // meh impure foreach.. 28 | for (const rule of rules) { 29 | const target: RegExp = rule[0]! 30 | const reg: RegExp | null = rule[1] 31 | if (cur.search(target) >= 0) { 32 | if (reg != null) { 33 | cur = cur.replace(reg, ''); 34 | } 35 | break; 36 | } 37 | } 38 | }); 39 | return cur; 40 | } 41 | 42 | const _re = RegExp('^(www|ww|amp)\\.' 
) 43 | export function normaliseHostname(url: string): string { 44 | return url.replace(_re, ''); 45 | } 46 | 47 | 48 | export function normalisedURLHostname(url: Url): string { 49 | const _hostname = new URL(url).hostname; 50 | const hostname = normaliseHostname(_hostname); 51 | return hostname; 52 | } 53 | -------------------------------------------------------------------------------- /extension/src/options_page.css: -------------------------------------------------------------------------------- 1 | body { 2 | width: 800px; 3 | } 4 | 5 | textarea { 6 | width: 100%; 7 | } 8 | 9 | a { 10 | text-decoration: none; 11 | } 12 | 13 | /* some magic to make it scrollabel on too much content, yet not always expand to max-height */ 14 | .CodeMirror { 15 | height: auto !important; 16 | } 17 | .CodeMirror-scroll { 18 | max-height: 25em; /* */ 19 | } 20 | /* */ 21 | 22 | 23 | input:invalid { 24 | background-color: red; 25 | } 26 | 27 | .description { 28 | margin-left: 1em; 29 | color: #777777; 30 | white-space: pre; 31 | } 32 | 33 | hr { 34 | height: 1px; 35 | background-color: #ccc; 36 | border: none; 37 | } 38 | 39 | pre { 40 | margin: 0 0 0 0; 41 | } 42 | 43 | label { 44 | display: block; 45 | } 46 | 47 | /* ugh, at least in chrome default browser style didn't work??? */ 48 | input:indeterminate { 49 | box-shadow: inset 0 0 6px 0px grey; 50 | } 51 | -------------------------------------------------------------------------------- /extension/src/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Search 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 |
14 | 15 |
16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /extension/src/selenium_bridge.js: -------------------------------------------------------------------------------- 1 | // only used during tests 2 | 3 | // hack to hook into the extension... https://stackoverflow.com/a/38554438/706389 4 | for (const x of [ 5 | 'selenium-bridge-_execute_action', 6 | 'selenium-bridge-_execute_browser_action', 7 | 'selenium-bridge-mark_visited', 8 | 'selenium-bridge-search', 9 | ]) { 10 | document.addEventListener(x, () => { 11 | chrome.runtime.sendMessage(x) 12 | }) 13 | } 14 | -------------------------------------------------------------------------------- /extension/src/showvisited.css: -------------------------------------------------------------------------------- 1 | /* TODO use variables */ 2 | 3 | /* actual popup with the metadata */ 4 | .promnesia-visited-popup { 5 | display: block; /* otherwise border etc don't work? */ 6 | white-space: pre-wrap; /* keep whitespace intact */ 7 | 8 | background: #e6e6e6fa; /* same color as in sidebar */ 9 | color: black; /* to prevent it inheriting white text colour in dark mode */ 10 | 11 | outline: solid 1px; 12 | padding: 2px; 13 | /* not sure about initial?? */ 14 | font-weight: initial; 15 | font-size: initial; 16 | } 17 | 18 | .promnesia-visited-popup .promnesia-visited-popup-link { 19 | display: inline-block; 20 | text-decoration: none; 21 | padding-top: 0px; 22 | padding-bottom: 0px; 23 | } 24 | 25 | .promnesia-visited-popup .context { 26 | display: block; 27 | /* color: black; */ 28 | background: lightyellow; 29 | margin: 2px; 30 | } 31 | 32 | /* TODO need to use these styles in tippy */ 33 | .promnesia-visited-popup .datetime { 34 | display: inline-block; 35 | float: right; 36 | padding-left: 0.5em; 37 | } 38 | 39 | 40 | /* TODO not really sure about !important... */ 41 | .promnesia-visited.promnesia-eye { 42 | /* note: outline is defined in sidebar.css */ 43 | background-repeat: no-repeat !important; /* repeats by default */ 44 | background-position-x: right !important; /* left by default */ 45 | 46 | /* same as text size, we just want a small icon */ 47 | /* using rem to prevent it from scaling too much on navigation elements etc */ 48 | background-size: '1rem' !important; 49 | } 50 | -------------------------------------------------------------------------------- /extension/src/sidebar-outer.css: -------------------------------------------------------------------------------- 1 | /* Resets for sites https://github.com/karlicoss/promnesia/issues/102 */ 2 | body.promnesia-sidebar-active { 3 | transform: none; 4 | } 5 | .promnesia-sidebar-active #promnesia-frame { 6 | margin: 0; 7 | padding: 0; 8 | max-width: none; 9 | max-height: none; 10 | border: 0; 11 | border-left: 1px solid #999; 12 | } 13 | 14 | #promnesia-frame { 15 | /* e.g. 
left should result in: left initial, right 0px, top 0px, bot 0px, height 100%, width 30% 16 | need to set irrelevant dimensions to 0px, otherwise it misbehaves 17 | */ 18 | 19 | /* NOTE: default is --right: 1, set via javascript in sidebar.js */ 20 | 21 | --size: 30%; 22 | 23 | --is-v: var(--left, var(--right )); 24 | --is-h: var(--top , var(--bottom)); 25 | 26 | --init-left : var(--left , var(--top , var(--bottom))); 27 | --init-right : var(--right , var(--top , var(--bottom))); 28 | --init-top : var(--top , var(--left, var(--right ))); 29 | --init-bottom: var(--bottom, var(--left, var(--right ))); 30 | 31 | 32 | left : calc(var(--init-left ) * 0px); 33 | right : calc(var(--init-right ) * 0px); 34 | top : calc(var(--init-top ) * 0px); 35 | bottom: calc(var(--init-bottom) * 0px); 36 | 37 | --whelper: calc(var(--is-v) * var(--size)); 38 | --hhelper: calc(var(--is-h) * var(--size)); 39 | width : var(--whelper, 100%); 40 | height: var(--hhelper, 100%); 41 | 42 | position: fixed; 43 | z-index: 2147483647; 44 | 45 | display: none; 46 | } 47 | 48 | .promnesia { 49 | padding-left : calc(var(--left) * var(--size)); 50 | padding-right : calc(var(--right) * var(--size)); 51 | padding-top : calc(var(--top) * var(--size)); 52 | padding-bottom: calc(var(--bottom) * var(--size)); 53 | } 54 | 55 | 56 | /* TODO expose this in settings ? */ 57 | .promnesia-highlight { 58 | background-color: #ffff6688; 59 | } 60 | 61 | 62 | .nonselectable { 63 | -webkit-touch-callout: none; 64 | -webkit-user-select: none; 65 | -khtml-user-select: none; 66 | -moz-user-select: none; 67 | -ms-user-select: none; 68 | -o-user-select: none; 69 | user-select: none; 70 | } 71 | 72 | .promnesia-highlight-reference { 73 | color: orange; 74 | font-weight: bold; 75 | margin: 0.1em; 76 | 77 | white-space: nowrap; /* prevent from breaking numbers onto newlines */ 78 | bottom: 1em; 79 | } 80 | 81 | 82 | div.toastify { 83 | /* eh. just a quick hack to make close button appear on the top instead of bottom */ 84 | /* otherwise if the notification is too long, it might be hidden */ 85 | display: flex; 86 | } 87 | -------------------------------------------------------------------------------- /extension/src/toastify.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Toastify js 1.3.2 3 | * https://github.com/apvarun/toastify-js 4 | * @license MIT licensed 5 | * 6 | * Copyright (C) 2018 Varun A P 7 | */ 8 | 9 | .toastify { 10 | padding: 12px 20px; 11 | color: #ffffff; 12 | display: inline-block; 13 | box-shadow: 0 3px 6px -1px rgba(0, 0, 0, 0.12), 0 10px 36px -4px rgba(77, 96, 232, 0.3); 14 | background: -webkit-linear-gradient(315deg, #73a5ff, #5477f5); 15 | background: linear-gradient(135deg, #73a5ff, #5477f5); 16 | position: fixed; 17 | opacity: 0; 18 | transition: all 0.4s cubic-bezier(0.215, 0.61, 0.355, 1); 19 | border-radius: 2px; 20 | cursor: pointer; 21 | text-decoration: none; 22 | max-width: calc(50% - 20px); 23 | z-index: 2147483647; 24 | 25 | white-space: pre-wrap; 26 | } 27 | 28 | .toastify.on { 29 | opacity: 1; 30 | } 31 | 32 | .toast-close { 33 | opacity: 0.4; 34 | padding: 0 5px; 35 | } 36 | 37 | .toastify-right { 38 | right: 15px; 39 | } 40 | 41 | .toastify-left { 42 | left: 15px; 43 | } 44 | 45 | .toastify-top { 46 | top: -150px; 47 | } 48 | 49 | .toastify-bottom { 50 | bottom: -150px; 51 | } 52 | 53 | .toastify-rounded { 54 | border-radius: 25px; 55 | } 56 | 57 | .toastify-avatar { 58 | width: 1.5em; 59 | height: 1.5em; 60 | margin: 0 5px; 61 | border-radius: 2px; 62 | } 63 | 64 | @media only screen and (max-width: 360px) { 65 | .toastify-right, .toastify-left { 66 | margin-left: auto; 67 | margin-right: auto; 68 | left: 0; 69 | right: 0; 70 | max-width: fit-content; 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /extension/tests/anchorme.test.js: -------------------------------------------------------------------------------- 1 | import anchorme from "anchorme" 2 | 3 | test('detects org-mode links correctly', () => { 4 | const res = anchorme.list(` 5 | without the fix, if you have text like this: 6 | 7 | [[https://wiki.openhumans.org/wiki/Personal_Science_Wiki][Personal Science Wiki]] 8 | 9 | - also delete the min.js file because I'm not sure how to patch it -- to prevent using it by accident 10 | `).map(o => o.string) 11 | expect(res).toStrictEqual(['https://wiki.openhumans.org/wiki/Personal_Science_Wiki']) 12 | }) 13 | -------------------------------------------------------------------------------- /extension/tests/common.test.js: -------------------------------------------------------------------------------- 1 | import {_fmt} from '../src/display' 2 | 3 | test('formats visit date/time', () => { 4 | // NOTE: under Node env there might not be necessary locales (e.g. if you're running in Docker) 5 | // can check with: Intl.DateTimeFormat('en-GB').resolvedOptions().locale 6 | // e.g. 
it might resolve to incmplete locale like 'en' 7 | const dd = new Date('05 Jun 2020 05:58:00') // deliberately no timezone, it's pointless without the backend anyway 8 | const [ds, ts] = _fmt(dd) 9 | expect(ds).toBe('5 Jun 2020') 10 | expect(ts).toBe('05:58') 11 | }) 12 | 13 | import {format_duration} from '../src/common' 14 | 15 | test('formats duration', () => { 16 | expect(format_duration(40)).toBe('40 seconds'); 17 | expect(format_duration(124)).toBe('2 minutes'); 18 | expect(format_duration(24 * 60 * 60 + 95 * 60 + 20)).toBe('25 hours 35 minutes'); 19 | }); 20 | 21 | 22 | import {Visits} from '../src/common' 23 | import {makeFakeVisits} from '../src/api' 24 | 25 | test('visits', () => { 26 | for (const vis of [ 27 | [], 28 | makeFakeVisits(1).visits, 29 | makeFakeVisits(10).visits, 30 | [new Error('some error')], 31 | [new Error('other error'), ...makeFakeVisits(2).visits], 32 | ]) { 33 | const v = new Visits('http://test', 'http://test', vis) 34 | const vv = Visits.fromJObject(v.toJObject()) 35 | expect(v).toStrictEqual(vv) 36 | } 37 | 38 | // test for more elaborate error handling, make sure it preserves stack 39 | // apparently Error comparison doesn't do anything to the stack.. 40 | for (const vis of [ 41 | [function () { 42 | const err = new Error('some message') 43 | err.stack = 'stack1\nstack2' 44 | return err 45 | }()], 46 | ]) { 47 | const v = new Visits('http://test', 'http://test', vis) 48 | const vv = Visits.fromJObject(v.toJObject()) 49 | const e = vv.visits[0] 50 | expect(e.stack).toStrictEqual('stack1\nstack2') 51 | } 52 | }) 53 | 54 | import {normalise_url} from '../src/normalise' 55 | 56 | test('normalises', () => { 57 | expect(normalise_url('https://www.youtube.com/playlist?list=PLWz5rJ2EKKc9CBxr3BVjPTPoDPLdPIFCE/')).toBe('youtube.com/playlist?list=PLWz5rJ2EKKc9CBxr3BVjPTPoDPLdPIFCE'); 58 | }); 59 | 60 | 61 | import {normalisedURLHostname} from '../src/normalise' 62 | test('normalisedURLHostname', () => { 63 | expect(normalisedURLHostname('https://www.reddit.com/whatever')).toBe('reddit.com'); 64 | expect(normalisedURLHostname('file:///usr/share/doc/python3/html/index.html')).toBe(''); 65 | }); 66 | 67 | 68 | import {Filterlist} from '../src/filterlist' 69 | 70 | 71 | test('filterlists', async () => { 72 | // TODO make tests literate so they contribute to help docs? 
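// Rough summary of the filterlist syntax exercised by the assertions below
// (inferred from this test rather than an authoritative spec):
//   mail.google.com         -- a bare hostname blocks the whole domain
//   https://vk.com          -- a full URL blocks that exact page only
//   /github.com/issues.*/   -- a /regex/ entry blocks anything matching the regular expression
//   //comment.com           -- lines starting with '//' are comments and are ignored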
73 | const bl_string = ` 74 | mail.google.com 75 | https://vk.com 76 | **github.com/issues** 77 | /github.com/issues.*/ 78 | 79 | //comment.com 80 | 81 | https://reddit.com/ 82 | 83 | ` 84 | 85 | const b = new Filterlist({filterlist: bl_string, urllists_json: '[]'}) 86 | 87 | // TODO eh, doesn't work with links without schema; not sure if it's ok 88 | expect(await b.contains('http://instagram.com/')).toBe(null) 89 | 90 | // whole domain is blocked 91 | expect(await b.contains('https://mail.google.com/mail/u/0/#inbox')).toContain('domain') 92 | 93 | 94 | // specific page is blocked 95 | expect(await b.contains('https://vk.com' )).toContain('exact page') 96 | expect(await b.contains('https://vk.com/')).toContain('exact page') 97 | expect(await b.contains('https://vk.com/user/whatever')).toBe(null) 98 | expect(await b.contains('https://reddit.com')).toContain('exact page') 99 | 100 | // wildcard blockig 101 | expect(await b.contains('http://github.com/')).toBe(null) 102 | expect(await b.contains('http://github.com/issues/hello/123')).toContain('regex') 103 | 104 | // TODO later, doesn't work ATM 105 | // expect(b.contains('http://github.com/issues/hello/123', bl)).toContain('wildcard'); 106 | 107 | expect(await b.contains('123456')).toBe('invalid URL') 108 | expect(await b.contains('http://comment.com')).toBe(null) 109 | }) 110 | -------------------------------------------------------------------------------- /extension/tests/defensify.test.js: -------------------------------------------------------------------------------- 1 | // import {defensify} from '../src/notifications.js'; 2 | 3 | async function inner() { 4 | throw 'some_error'; 5 | } 6 | 7 | async function outer() { 8 | console.warn('before inner async'); 9 | await inner(); 10 | console.warn('after inner async'); 11 | } 12 | 13 | 14 | function inner2() { 15 | throw err; 16 | } 17 | 18 | function outer2() { 19 | console.warn('before inner'); 20 | inner2(); 21 | console.warn('after inner'); 22 | } 23 | 24 | test('defensify', async () => { 25 | // await alalal(); 26 | // await defensify(alalal)(); 27 | console.log("HIHIH"); 28 | // outer2(); // ok, stack is preserved 29 | // await outer(); // ugh. stack is lost... 30 | // const dd = new Date(0); 31 | // expect(format_dt(dd)).toMatch(/Jan 1 1970/); 32 | }); 33 | -------------------------------------------------------------------------------- /extension/tests/integration.test.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Ugh FFS. 3 | * NODE_OPTIONS=--experimental-vm-modules npm run test is working much better with ES6 imports/node dependenceis 4 | * but it segfaults every other time 5 | * https://github.com/nodejs/node/issues/35889 6 | */ 7 | import mockBrowser from "../__mocks__/browser" 8 | global.chrome = mockBrowser 9 | 10 | 11 | test('options', async () => { 12 | const {setOptions, getOptions} = await import ('../src/options') 13 | // shouldn't crash at least.. 14 | const opts = await getOptions() 15 | }) 16 | // TODO could check options migrations? 17 | 18 | import fetch from 'node-fetch' 19 | global.fetch = fetch 20 | 21 | 22 | test('visits', async() => { 23 | const {backend, makeFakeVisits} = await import ('../src/api') 24 | 25 | // const opts = await getOptions() 26 | // opts.host = host: 'http//bad.host', 27 | 28 | // TODO have a defensive and offensive modes? 
29 | // but defensive for network errors makes def makes sense anyway 30 | const vis = await backend.visits('http://123.com') 31 | expect(vis).toBeInstanceOf(Error) 32 | expect(vis.message).toMatch(/request .* failed/) 33 | }) 34 | 35 | 36 | // meh. 37 | mockBrowser.history.getVisits.mockImplementation(async (obj) => []) 38 | mockBrowser.history.search .mockImplementation(async (obj) => []) 39 | mockBrowser.bookmarks.getTree.mockImplementation(async () => [{ 40 | children: [{ 41 | url: 'http:whatever.com/', 42 | dateAdded: 16 * 10 ** 8 * 1000, 43 | }], 44 | }]) 45 | 46 | test('visits_allsources', async() => { 47 | const {allsources} = await import('../src/sources') 48 | 49 | const vis = await allsources.visits('https://whatever.com/') 50 | expect(vis.visits).toHaveLength(2) 51 | expect(vis.normalised_url).toStrictEqual('whatever.com') 52 | }) 53 | 54 | 55 | test('search_works', async () => { 56 | const {allsources} = await import('../src/sources') 57 | 58 | // at least shouldn't crash 59 | const res = await allsources.search('https://123.coom') 60 | const [e] = res.visits 61 | expect(e.message).toMatch(/request .* failed/) 62 | }) 63 | 64 | test('search_defensive', async() => { 65 | const {backend} = await import ('../src/api') 66 | const {MultiSource, bookmarks, thisbrowser} = await import ('../src/sources') 67 | 68 | 69 | // precondition: some error in processing history api, e.g. it's unavailable or something 70 | mockBrowser.history.search.mockImplementation(async (q) => null) 71 | mockBrowser.bookmarks.getTree.mockImplementation(async () => null) 72 | 73 | // TODO wtf?? for some reason default order (backend, browser, bookmarks) causes 74 | // 'Promise rejection was handled asynchronously' 75 | // I wonder if it's some issue with node fetch implementation... or just node version?? 76 | // for some reason different order works :shrug: 77 | 78 | const res = await new MultiSource(thisbrowser, bookmarks, backend) 79 | .search('http://whatever.com') 80 | 81 | console.error(res.visits) 82 | const [e1, e2, e3] = res.visits 83 | // eh. fragile, but at least makes sure we test exactly the thing we want 84 | expect(e1.message).toMatch(/is not iterable/) 85 | expect(e2.message).toMatch(/Cannot read propert/) 86 | expect(e3.message).toMatch(/request .* failed/) 87 | }) 88 | 89 | 90 | import fetchMock from 'jest-fetch-mock' 91 | // TODO use it as a fixture.. 
92 | // beforeEach(() => { 93 | // fetch.resetMocks() 94 | // }) 95 | 96 | test('visits_badresponse', async() => { 97 | const {backend} = await import ('../src/api') 98 | 99 | fetchMock.enableMocks() 100 | fetchMock.mockResponse('bad!') 101 | const res = await backend.visits('http://mock.com') 102 | expect(res).toBeInstanceOf(Error) 103 | }) 104 | 105 | 106 | test('visited', async() => { 107 | const {backend} = await import ('../src/api') 108 | const {fake} = await import ('../src/api') 109 | 110 | fetchMock.enableMocks() 111 | const [v] = fake.apiVisits(1) 112 | { 113 | fetchMock.mockOnce(`[null, ${JSON.stringify(v)}]`) 114 | const r = await backend.visited(['http://link1', 'http://link2']) 115 | expect(r).not.toBeInstanceOf(Error) 116 | const [r1, r2] = r 117 | expect(r1).toEqual(null) 118 | expect(r2.tags).toEqual(['fake']) 119 | } 120 | 121 | { 122 | // the backend is also allowed to return boolean values (for 'was visited'/'was not visited') 123 | // in addition, this was legacy behaviour 124 | fetchMock.mockOnce(`[false, true, null]`) 125 | let r = await backend.visited(['http://link1', 'http://link2', 'http://link3']) 126 | const [r1, r2, r3] = r 127 | expect(r1).toEqual(null) 128 | expect(r2).not.toEqual(null) 129 | expect(r3).toEqual(null) 130 | } 131 | }) 132 | -------------------------------------------------------------------------------- /extension/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "src/", 4 | "noImplicitAny": true, 5 | "target": "es2022", 6 | "strict": true, 7 | "allowJs": true, 8 | 9 | // this is necessary to allos importing as import './whatever' (instead of explicit extension) 10 | "moduleResolution": "bundler", 11 | // esnext is necessary, otherwise bundler module resolution can't be used? 12 | "module": "esnext", 13 | 14 | // without it, emacs (LSP?) complains when editing files.. not sure if impacts actual code generation? 15 | "lib": ["es6", "dom"], 16 | }, 17 | "include": [ 18 | "./src/**/*.ts" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | pretty = True 3 | show_error_context = True 4 | show_column_numbers = True 5 | show_error_end = True 6 | 7 | check_untyped_defs = True 8 | 9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html 10 | warn_redundant_casts = True 11 | strict_equality = True 12 | warn_unused_ignores = True 13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable 14 | 15 | 16 | # not sure why mypy started discovering it (since 0.800??) 17 | [mypy-hypothesis] 18 | ignore_missing_imports = True 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by setuptools_scm 4 | name = "promnesia" 5 | dependencies = [ 6 | "appdirs", # for portable user directories detection 7 | "tzlocal", # guessling local timezone 8 | "more_itertools", 9 | "typing-extensions", 10 | "pytz", 11 | "sqlalchemy>=2.0", # DB api 12 | 13 | ## 14 | # NOTE: ideally we don't need to install them by default? 15 | # i.e. 
server and indexer can run on different hosts/containers etc 16 | # keeping here for backwards compatibility for now 17 | "promnesia[indexer]", 18 | "promnesia[server]", 19 | ## 20 | ] 21 | requires-python = ">=3.9" 22 | 23 | ## these need to be set if you're planning to upload to pypi 24 | description = "Enhancement of your browsing history" 25 | license = {file = "LICENSE"} 26 | authors = [ 27 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 28 | ] 29 | maintainers = [ 30 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 31 | ] 32 | [project.urls] 33 | Homepage = "https://github.com/karlicoss/promnesia" 34 | ## 35 | 36 | 37 | [project.optional-dependencies] 38 | indexer = [ 39 | # indexer only dependencies 40 | "urlextract", 41 | ] 42 | server = [ 43 | # server only dependencies 44 | "fastapi", 45 | "uvicorn[standard]", 46 | ] 47 | optional = [ 48 | # dependencies that bring some bells & whistles 49 | "logzero" , # pretty colored logging 50 | "python-magic", # better mimetype decetion 51 | ] 52 | HPI = [ 53 | # dependencies for https://github.com/karlicoss/HPI 54 | "HPI", # pypi version 55 | # TODO add notes how to override with github version? 56 | ] 57 | html = [ 58 | # dependencies for sources.html 59 | "beautifulsoup4", # extracting links from the page 60 | "lxml" , # bs4 backend 61 | ] 62 | markdown = [ 63 | # dependencies for sources.html 64 | "mistletoe", 65 | ] 66 | org = [ 67 | # dependencies for sources.org 68 | "orgparse>=0.3.0", 69 | ] 70 | telegram = [ 71 | # used to depend on 'dataset', keeping for backwards compatibility 72 | ] 73 | all = [ 74 | "promnesia[optional,HPI,html,markdown,org]", 75 | ] 76 | 77 | [dependency-groups] 78 | testing = [ 79 | "pytest", 80 | "ruff", 81 | "mypy", 82 | "lxml", # for mypy coverage 83 | 84 | "hypothesis", 85 | 86 | "loguru", # used in addon_helper... not sure if should just use promnesia's logger? 87 | 88 | "psutil", "types-psutil", 89 | "requests", "types-requests", 90 | 91 | ## other mypy stubs 92 | "types-pytz" , 93 | "types-requests" , # used in tests 94 | "types-beautifulsoup4", # NOTE: not needed after the <4.13.0 pinning is resolved 95 | ## 96 | 97 | # todo hmm ideally would be in corresponding testing-... sections 98 | # but we don't split separate mypy for end2end tests... 
so idk 99 | "selenium" , # browser automations 100 | "types-PyAutoGUI" 101 | ] 102 | testing-end2end = [ 103 | "click" , # confirmations for end2end test (might remove dependency) 104 | "pytest-timeout", # for PYTEST_TIMEOUT env variable 105 | "pytest-xdist" , # not used atm, but helpful to parallelise end2end tests 106 | ] 107 | testing-gui = [ 108 | # pyautogui seems problematic, wheels often fail to build under windows 109 | # we don't use it in CI, so keep in a separate extras section 110 | "pyautogui", # for keyboard automation during end2end tests 111 | ] 112 | 113 | 114 | [project.scripts] 115 | promnesia = "promnesia.__main__:main" 116 | 117 | 118 | [build-system] 119 | requires = ["setuptools", "setuptools-scm"] 120 | build-backend = "setuptools.build_meta" 121 | 122 | [tool.setuptools_scm] 123 | version_scheme = "python-simplified-semver" 124 | local_scheme = "dirty-tag" 125 | 126 | # workaround for error during uv publishing 127 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 128 | [tool.setuptools] 129 | license-files = [] 130 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code 3 | python_files = *.py 4 | norecursedirs = tests/testdata/* 5 | addopts = 6 | # -rap to print tests summary even when they are successful 7 | -rap 8 | --verbose 9 | 10 | # otherwise it won't discover doctests 11 | --doctest-modules 12 | 13 | # show all test durations (unless they are too short) 14 | --durations=0 15 | -------------------------------------------------------------------------------- /scripts/backup-phone-history.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | BACKUP_DIR="$1" 4 | 5 | backup_file () { 6 | file="$1" 7 | to="$2" 8 | fname="$(basename "$file")" 9 | timestamp=$(date -d "@$(stat -c %Y "$file")" +'%Y%m%d%H%M%S') 10 | tdir="$to/$timestamp" 11 | mkdir -p "$tdir" 12 | cp "$file" "$tdir/$fname" 13 | } 14 | 15 | 16 | backup_chrome () { 17 | backup_file '/data/data/com.android.chrome/app_chrome/Default/History' "$BACKUP_DIR/chrome" 18 | } 19 | 20 | backup_firefox () { 21 | backup_file '/data/data/org.mozilla.firefox/files/mozilla/'*.default/'browser.db' "$BACKUP_DIR/firefox" 22 | } 23 | 24 | 25 | backup_firefox 26 | backup_chrome 27 | -------------------------------------------------------------------------------- /scripts/browser_history.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | DEPRECATION = 'NOTE: this is DEPRECATED! Please use https://github.com/purarue/browserexport instead' 3 | 4 | from datetime import datetime, timezone 5 | from pathlib import Path 6 | from subprocess import check_output 7 | import filecmp 8 | import logging 9 | import warnings 10 | import sys 11 | 12 | warnings.warn(DEPRECATION, DeprecationWarning) 13 | 14 | Browser = str 15 | 16 | CHROME = 'chrome' 17 | FIREFOX = 'firefox' 18 | 19 | def get_logger(): 20 | return logging.getLogger('browser-history') 21 | 22 | 23 | # TODO kython? 24 | # TODO the with key? 
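# A quick illustration of the `only` helper defined below (hypothetical values, not real data):
#   only([42])    -> 42
#   only([1, 2])  -> raises RuntimeError: Expected a single value: [1, 2]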
25 | def only(it): 26 | values = list(it) 27 | if len(values) == 1: 28 | return values[0] 29 | raise RuntimeError(f'Expected a single value: {values}') 30 | 31 | 32 | def get_path(browser: Browser, profile: str='*') -> Path: 33 | if browser == 'chrome': 34 | bpath = Path('~/.config/google-chrome').expanduser() 35 | dbs = bpath.glob(profile + '/History') 36 | elif browser == 'firefox': 37 | bpath = Path('~/.mozilla/firefox/').expanduser() 38 | dbs = bpath.glob(profile + '/places.sqlite') 39 | else: 40 | raise RuntimeError(f'Unexpected browser {browser}') 41 | ldbs = list(dbs) 42 | if len(ldbs) == 1: 43 | return ldbs[0] 44 | raise RuntimeError(f'Expected single database, got {ldbs}. Perhaps you want to use --profile argument?') 45 | 46 | 47 | 48 | def test_get_path(): 49 | get_path('chrome') 50 | get_path('firefox', profile='*-release') 51 | 52 | 53 | def atomic_copy(src: Path, dest: Path): 54 | """ 55 | Supposed to handle cases where the file is changed while we were copying it. 56 | """ 57 | import shutil 58 | 59 | differs = True 60 | while differs: 61 | res = shutil.copy(src, dest) 62 | differs = not filecmp.cmp(str(src), str(res)) 63 | 64 | 65 | def format_dt(dt: datetime) -> str: 66 | return dt.strftime('%Y%m%d%H%M%S') 67 | 68 | 69 | def backup_history(browser: Browser, to: Path, profile: str='*', pattern=None) -> Path: 70 | assert to.is_dir() 71 | logger = get_logger() 72 | 73 | now = format_dt(datetime.now(tz=timezone.utc)) 74 | 75 | path = get_path(browser, profile=profile) 76 | 77 | pattern = path.stem + '-{}' + path.suffix if pattern is None else pattern 78 | fname = pattern.format(now) 79 | 80 | 81 | res = to / fname 82 | logger.info('backing up to %s', res) 83 | # if your chrome is open, database would normally be locked, so you can't just make a snapshot 84 | # so we'll just copy it till it converge. bit paranoid, but should work 85 | atomic_copy(path, res) 86 | logger.info('done!') 87 | return res 88 | 89 | 90 | def test_backup_history(tmp_path): 91 | tdir = Path(tmp_path) 92 | backup_history(CHROME, tdir) 93 | backup_history(FIREFOX, tdir, profile='*-release') 94 | 95 | 96 | def guess_db_date(db: Path) -> str: 97 | maxvisit = check_output([ 98 | 'sqlite3', 99 | '-csv', 100 | db, 101 | 'SELECT max(datetime(((visits.visit_time/1000000)-11644473600), "unixepoch")) FROM visits;' 102 | ]).decode('utf8').strip().strip('"') 103 | return format_dt(datetime.strptime(maxvisit, "%Y-%m-%d %H:%M:%S")) 104 | 105 | 106 | def test_guess(tmp_path): 107 | tdir = Path(tmp_path) 108 | db = backup_history(CHROME, tdir) 109 | guess_db_date(db) 110 | 111 | 112 | def main(): 113 | logger = get_logger() 114 | import argparse 115 | p = argparse.ArgumentParser() 116 | p.add_argument('--browser', type=Browser, required=True) 117 | p.add_argument('--profile', type=str, default='*', help='Use to pick the correct profile to back up. If unspecified, will assume a single profile') 118 | p.add_argument('--to', type=Path, required=True) 119 | args = p.parse_args() 120 | 121 | # TODO do I need pattern?? 122 | backup_history(browser=args.browser, to=args.to, profile=args.profile) 123 | 124 | warnings.warn(DEPRECATION, DeprecationWarning) 125 | logger.error("This script is DEPRECATED! 
Exiting with error code so that the use notices") 126 | sys.exit(44) 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /scripts/promnesia: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # this script runs promnesia from the current repository (instead of the installed version) 3 | 4 | set -eu 5 | 6 | GIT_ROOT_DIR="$(cd "$(dirname "$0")" && git rev-parse --show-toplevel)" 7 | 8 | if [[ -n "${PYTHONPATH:=}" ]]; then 9 | PPATH=":$PYTHONPATH" 10 | else 11 | PPATH="" 12 | fi 13 | 14 | PPATH="$GIT_ROOT_DIR/src$PPATH" 15 | 16 | export DIRTY_RUN='YES' # ugh. 17 | 18 | if command -v python3 &> /dev/null; then 19 | # Note: python3 in Windows used "py" in command line. So $PY_BIN should be just "py" 20 | PY_BIN="python3" 21 | else 22 | PY_BIN="python" # warn? 23 | fi 24 | 25 | PYTHONPATH="$PPATH" exec "$PY_BIN" -m promnesia "$@" 26 | -------------------------------------------------------------------------------- /src/promnesia/__init__.py: -------------------------------------------------------------------------------- 1 | # add deprecation warning so eventually this may converted to a namespace package? 2 | import warnings 3 | 4 | from .common import ( # noqa: F401 5 | Context, 6 | DbVisit, 7 | Loc, 8 | PathIsh, 9 | Res, 10 | Results, 11 | Source, 12 | Visit, 13 | last, 14 | ) 15 | 16 | # TODO think again about it -- what are the pros and cons? 17 | warnings.warn("DEPRECATED! Please import directly from 'promnesia.common', e.g. 'from promnesia.common import Visit, Source, Results'", DeprecationWarning) 18 | -------------------------------------------------------------------------------- /src/promnesia/compare.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # TODO perhaps make it external script? 4 | import argparse 5 | import logging 6 | import sys 7 | from collections.abc import Iterator, Sequence 8 | from pathlib import Path 9 | from typing import TypeVar 10 | 11 | from .common import DbVisit, PathWithMtime, Url 12 | from .database.load import row_to_db_visit 13 | 14 | # TODO include latest too? 15 | # from cconfig import ignore, filtered 16 | 17 | def get_logger(): 18 | return logging.getLogger('promnesia-db-changes') 19 | 20 | # TODO return error depending on severity? 
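# A rough illustration of eliminate_by (defined just below), with made-up inputs:
#   eliminate_by([1, 1, 2], [1, 3], key=str) -> onlya={1, 2}, common={1}, onlyb={3}
# items with the same key are matched up pairwise, so the extra 1 on the left ends up in onlya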
21 | 22 | 23 | T = TypeVar('T') 24 | 25 | def eliminate_by(sa: Sequence[T], sb: Sequence[T], key): 26 | def make_dict(s: Sequence[T]) -> dict[str, list[T]]: 27 | res: dict[str, list[T]] = {} 28 | for a in s: 29 | k = key(a) 30 | ll = res.get(k, None) 31 | if ll is None: 32 | ll = [] 33 | res[k] = ll 34 | ll.append(a) 35 | return res 36 | da = make_dict(sa) 37 | db = make_dict(sb) 38 | ka = set(da.keys()) 39 | kb = set(db.keys()) 40 | onlya: set[T] = set() 41 | common: set[T] = set() 42 | onlyb: set[T] = set() 43 | for k in ka.union(kb): 44 | la = da.get(k, []) 45 | lb = db.get(k, []) 46 | common.update(la[:min(len(la), len(lb))]) 47 | if len(la) > len(lb): 48 | onlya.update(la[len(lb):]) 49 | if len(lb) > len(la): 50 | onlyb.update(lb[len(la):]) 51 | 52 | return onlya, common, onlyb 53 | 54 | 55 | def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=True) -> list[DbVisit]: 56 | logger = get_logger() 57 | logger.info('comparing between: %s', between) 58 | 59 | errors: list[DbVisit] = [] 60 | 61 | umap: dict[Url, list[DbVisit]] = {} 62 | for a in after: 63 | url = a.norm_url 64 | xx = umap.get(url, []) # TODO canonify here? 65 | xx.append(a) 66 | umap[url] = xx 67 | 68 | def reg_error(b): 69 | errors.append(b) 70 | if log: 71 | logger.error('between %s missing %s', between, b) 72 | print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr) 73 | 74 | 75 | # the idea is that we eliminate items simultaneously from both sets 76 | eliminations = [ 77 | ('identity' , lambda x: x), 78 | ('without dt' , lambda x: x._replace(src='', dt='')), 79 | ('without context' , lambda x: x._replace(src='', context='', locator='')), 80 | ('without dt and context' , lambda x: x._replace(src='', dt='', context='', locator='')), 81 | ] 82 | for ename, ekey in eliminations: 83 | logger.info('eliminating by %s', ename) 84 | logger.info('before: %d, after: %d', len(before), len(after)) 85 | before, common, after = eliminate_by(before, after, key=ekey) 86 | logger.info('common: %d, before: %d, after: %d', len(common), len(before), len(after)) 87 | 88 | logger.info('removing explicitly ignored items') 89 | # before = filtered(before, between=between, umap=umap) 90 | logger.info('before: %d', len(before)) 91 | 92 | for b in before: 93 | reg_error(b) 94 | 95 | return errors 96 | 97 | def setup_parser(p): 98 | # TODO better name? 99 | p.add_argument('--intermediate-dir', type=Path) 100 | p.add_argument('--last', type=int, default=2) 101 | p.add_argument('--all', action='store_const', const=0, dest='last') 102 | p.add_argument('paths', nargs='*') 103 | 104 | 105 | def get_files(args): 106 | if len(args.paths) == 0: 107 | int_dir = args.intermediate_dir 108 | assert int_dir.exists() 109 | files = sorted(int_dir.glob('*.sqlite*')) 110 | files = files[-args.last:] 111 | else: 112 | files = [Path(p) for p in args.paths] 113 | return files 114 | 115 | 116 | def main(): 117 | p = argparse.ArgumentParser() 118 | setup_parser(p) 119 | args = p.parse_args() 120 | files = get_files(args) 121 | 122 | errors = list(compare_files(*files)) 123 | if len(errors) > 0: 124 | sys.exit(1) 125 | 126 | 127 | def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]: 128 | assert len(files) > 0 129 | 130 | logger = get_logger() 131 | logger.info('comparing %s', files) 132 | 133 | last = None 134 | last_dts = None 135 | for f in files: 136 | logger.info('processing %r', f) 137 | name = f.name 138 | this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions.. 
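# e.g. a dump named '20231115.sqlite' (hypothetical) gives this_dts == '20231115';
# for a double extension like '.sqlite.gz' only the part before the first dot is kept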
139 | 140 | from promnesia.server import _get_stuff # TODO ugh 141 | engine, table = _get_stuff(PathWithMtime.make(f)) 142 | 143 | with engine.connect() as conn: 144 | vis = [row_to_db_visit(row) for row in conn.execute(table.select())] 145 | 146 | if last is not None: 147 | between = f'{last_dts}:{this_dts}' 148 | errs = compare(last, vis, between=between, log=log) 149 | for e in errs: 150 | yield between, e 151 | last = vis 152 | last_dts = this_dts 153 | 154 | if __name__ == '__main__': 155 | main() 156 | 157 | -------------------------------------------------------------------------------- /src/promnesia/compat.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if not TYPE_CHECKING: 4 | ## we used to have compat fixes here for these for python3.7 5 | ## keeping in case any sources depended on compat functions 6 | from subprocess import PIPE, Popen, check_call, check_output, run # noqa: F401 7 | from typing import Literal, Protocol # noqa: F401 8 | ## 9 | 10 | # todo deprecate properly 11 | def removeprefix(text: str, prefix: str) -> str: 12 | return text.removeprefix(prefix) 13 | -------------------------------------------------------------------------------- /src/promnesia/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import importlib.util 5 | import os 6 | import warnings 7 | from collections.abc import Iterable 8 | from pathlib import Path 9 | from types import ModuleType 10 | from typing import Callable, NamedTuple, Union 11 | 12 | from .common import DbVisit, PathIsh, Res, Source, default_cache_dir, default_output_dir 13 | 14 | HookT = Callable[[Res[DbVisit]], Iterable[Res[DbVisit]]] 15 | 16 | 17 | ModuleName = str 18 | 19 | # something that can be converted into a proper Source 20 | ConfigSource = Union[Source, ModuleName, ModuleType] 21 | 22 | 23 | class Config(NamedTuple): 24 | # TODO remove default from sources once migrated 25 | SOURCES: list[ConfigSource] = [] 26 | 27 | # if not specified, uses user data dir 28 | OUTPUT_DIR: PathIsh | None = None 29 | 30 | CACHE_DIR: PathIsh | None = '' 31 | FILTERS: list[str] = [] 32 | 33 | HOOK: HookT | None = None 34 | 35 | # 36 | # NOTE: INDEXERS is deprecated, use SOURCES instead 37 | INDEXERS: list[ConfigSource] = [] 38 | #MIME_HANDLER: Optional[str] = None # TODO 39 | 40 | @property 41 | def sources(self) -> Iterable[Res[Source]]: 42 | if len(self.INDEXERS) > 0: 43 | warnings.warn("'INDEXERS' is deprecated. Please use 'SOURCES'!", DeprecationWarning) 44 | 45 | raw = self.SOURCES + self.INDEXERS 46 | 47 | if len(raw) == 0: 48 | raise RuntimeError("Please specify SOURCES in the config! See https://github.com/karlicoss/promnesia#setup for more information") 49 | 50 | for r in raw: 51 | if isinstance(r, ModuleName): 52 | try: 53 | r = importlib.import_module(r) 54 | except ModuleNotFoundError as e: 55 | # todo better error reporting? 
56 | yield e 57 | continue 58 | 59 | if isinstance(r, Source): 60 | yield r 61 | else: 62 | # otherwise Source object can take care of the module we passed 63 | # (see SourceIsh) 64 | yield Source(r) 65 | 66 | @property 67 | def cache_dir(self) -> Path | None: 68 | # TODO we used to use this for cachew, but it's best to rely on HPI modules etc to cofigure this 69 | # keeping just in case for now 70 | cd = self.CACHE_DIR 71 | cpath: Path | None 72 | if cd is None: 73 | cpath = None # means 'disabled' in cachew 74 | elif cd == '': # meh.. but need to make it None friendly.. 75 | cpath = default_cache_dir() 76 | else: 77 | cpath = Path(cd) 78 | if cpath is not None: 79 | cpath.mkdir(exist_ok=True, parents=True) 80 | return cpath 81 | 82 | # TODO also tmp dir -- perhaps should be in cache or at least possible to specify in config? not sure if useful 83 | @property 84 | def output_dir(self) -> Path: 85 | odir = self.OUTPUT_DIR 86 | opath = default_output_dir() if odir is None else Path(odir) 87 | opath.mkdir(exist_ok=True, parents=True) 88 | return opath 89 | 90 | @property 91 | def db(self) -> Path: 92 | return self.output_dir / 'promnesia.sqlite' 93 | 94 | @property 95 | def hook(self) -> HookT | None: 96 | return self.HOOK 97 | 98 | instance: Config | None = None 99 | 100 | 101 | def has() -> bool: 102 | return instance is not None 103 | 104 | def get() -> Config: 105 | assert instance is not None, "Expected config to be set, but it's not" 106 | return instance 107 | 108 | 109 | def load_from(config_file: Path) -> None: 110 | global instance 111 | instance = import_config(config_file) 112 | 113 | 114 | def reset() -> None: 115 | global instance 116 | assert instance is not None 117 | instance = None 118 | 119 | 120 | def import_config(config_file: PathIsh) -> Config: 121 | p = Path(config_file) 122 | 123 | # todo just exec?? 124 | name = p.stem 125 | spec = importlib.util.spec_from_file_location(name, p); assert spec is not None 126 | mod = importlib.util.module_from_spec(spec); assert mod is not None 127 | loader = spec.loader; assert loader is not None 128 | loader.exec_module(mod) 129 | 130 | d = {} 131 | for f in Config._fields: 132 | if hasattr(mod, f): 133 | d[f] = getattr(mod, f) 134 | return Config(**d) 135 | 136 | 137 | # TODO: ugh. this causes warnings to be repeated multiple times... need to reuse the pool or something.. 138 | def use_cores() -> int | None: 139 | ''' 140 | Somewhat experimental. 141 | For now only used in sources.auto, perhaps later will be shared among the other indexers. 142 | ''' 143 | # most likely needs to be some sort of pipeline thing? 144 | cs = os.environ.get('PROMNESIA_CORES', None) 145 | if cs is None: 146 | return None 147 | try: 148 | return int(cs) 149 | except ValueError: # any other value means 'use all 150 | return 0 151 | 152 | 153 | def extra_fd_args() -> list[str]: 154 | ''' 155 | Not sure where it belongs yet... so via env variable for now 156 | Can be used to pass --ignore-file parameter 157 | ''' 158 | v = os.environ.get('PROMNESIA_FD_EXTRA_ARGS', '') 159 | extra = v.split() # eh, hopefully splitting that way is ok... 
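# e.g. PROMNESIA_FD_EXTRA_ARGS='--ignore-file /path/to/fdignore' results in ['--ignore-file', '/path/to/fdignore']
# (the flag comes from the docstring above; the path is just an example value)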
160 | return extra 161 | -------------------------------------------------------------------------------- /src/promnesia/database/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Sequence 4 | from datetime import datetime 5 | 6 | from sqlalchemy import ( 7 | Column, 8 | Integer, 9 | String, 10 | ) 11 | 12 | # TODO maybe later move DbVisit here completely? 13 | # kinda an issue that it's technically an "api" because hook in config can patch up DbVisit 14 | from ..common import DbVisit, Loc 15 | 16 | 17 | def get_columns() -> Sequence[Column]: 18 | # fmt: off 19 | res: Sequence[Column] = [ 20 | Column('norm_url' , String()), 21 | Column('orig_url' , String()), 22 | Column('dt' , String()), 23 | Column('locator_title', String()), 24 | Column('locator_href' , String()), 25 | Column('src' , String()), 26 | Column('context' , String()), 27 | Column('duration' , Integer()) 28 | ] 29 | # fmt: on 30 | assert len(res) == len(DbVisit._fields) + 1 # +1 because Locator is 'flattened' 31 | return res 32 | 33 | 34 | def db_visit_to_row(v: DbVisit) -> tuple: 35 | # ugh, very hacky... 36 | # we want to make sure the resulting tuple only consists of simple types 37 | # so we can use dbengine directly 38 | dt_s = v.dt.isoformat() 39 | row = ( 40 | v.norm_url, 41 | v.orig_url, 42 | dt_s, 43 | v.locator.title, 44 | v.locator.href, 45 | v.src, 46 | v.context, 47 | v.duration, 48 | ) 49 | return row 50 | 51 | 52 | def row_to_db_visit(row: Sequence) -> DbVisit: 53 | (norm_url, orig_url, dt_s, locator_title, locator_href, src, context, duration) = row 54 | dt_s = dt_s.split()[0] # backwards compatibility: previously it could be a string separated with tz name 55 | dt = datetime.fromisoformat(dt_s) 56 | return DbVisit( 57 | norm_url=norm_url, 58 | orig_url=orig_url, 59 | dt=dt, 60 | locator=Loc( 61 | title=locator_title, 62 | href=locator_href, 63 | ), 64 | src=src, 65 | context=context, 66 | duration=duration, 67 | ) 68 | -------------------------------------------------------------------------------- /src/promnesia/database/load.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | from sqlalchemy import ( 6 | Engine, 7 | Index, 8 | MetaData, 9 | Table, 10 | create_engine, 11 | exc, 12 | ) 13 | 14 | from .common import DbVisit, get_columns, row_to_db_visit 15 | 16 | DbStuff = tuple[Engine, Table] 17 | 18 | 19 | def get_db_stuff(db_path: Path) -> DbStuff: 20 | assert db_path.exists(), db_path 21 | # todo how to open read only? 22 | # actually not sure if we can since we are creating an index here 23 | engine = create_engine(f'sqlite:///{db_path}') # , echo=True) 24 | 25 | meta = MetaData() 26 | table = Table('visits', meta, *get_columns()) 27 | 28 | idx = Index('index_norm_url', table.c.norm_url) 29 | try: 30 | idx.create(bind=engine) 31 | except exc.OperationalError as e: 32 | if 'already exists' in str(e): 33 | # meh, but no idea how to check it properly... 34 | pass 35 | else: 36 | raise e 37 | 38 | # NOTE: apparently it's ok to open connection on every request? 
at least my comparisons didn't show anything 39 | return engine, table 40 | 41 | 42 | def get_all_db_visits(db_path: Path) -> list[DbVisit]: 43 | # NOTE: this is pretty inefficient if the DB is huge 44 | # mostly intended for tests 45 | engine, table = get_db_stuff(db_path) 46 | query = table.select() 47 | with engine.connect() as conn: 48 | res = [row_to_db_visit(row) for row in conn.execute(query)] 49 | engine.dispose() 50 | return res 51 | -------------------------------------------------------------------------------- /src/promnesia/extract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from collections.abc import Iterable, Sequence 5 | from functools import lru_cache 6 | 7 | from .cannon import CanonifyException 8 | from .common import ( 9 | DbVisit, 10 | Filter, 11 | Res, 12 | Results, 13 | Source, 14 | SourceName, 15 | Url, 16 | Visit, 17 | logger, 18 | ) 19 | 20 | DEFAULT_FILTERS = ( 21 | r'^chrome-\w+://', 22 | r'chrome://newtab', 23 | r'chrome://apps', 24 | r'chrome://history', 25 | r'^about:', 26 | r'^blob:', 27 | r'^view-source:', 28 | 29 | r'^content:', 30 | ) 31 | 32 | 33 | # TODO maybe move these to configs? 34 | @lru_cache(1) #meh, not sure what would happen under tests? 35 | def filters() -> Sequence[Filter]: 36 | from . import config 37 | 38 | flt = list(DEFAULT_FILTERS) 39 | if config.has(): # meeeh... 40 | cfg = config.get() 41 | flt.extend(cfg.FILTERS) 42 | return tuple(make_filter(f) for f in flt) 43 | 44 | 45 | def extract_visits(source: Source, *, src: SourceName) -> Iterable[Res[DbVisit]]: 46 | extractor = source.extractor 47 | logger.info('extracting via %s ...', source.description) 48 | 49 | try: 50 | vit: Results = extractor() 51 | except Exception as e: 52 | # todo critical error? 53 | # cause that means error during binding extractor args 54 | logger.exception(e) 55 | yield e 56 | return 57 | 58 | handled: set[Visit] = set() 59 | try: 60 | for p in vit: 61 | if isinstance(p, Exception): 62 | # todo not sure if need it at all? 63 | # parts = ['indexer emitted exception\n'] 64 | # eh, exception type is ignored by format_exception completely, apparently?? 65 | # parts.extend(traceback.format_exception(Exception, p, p.__traceback__)) 66 | # logger.error(''.join(parts)) 67 | yield p 68 | continue 69 | 70 | if p in handled: # no need to emit duplicates 71 | continue 72 | handled.add(p) 73 | 74 | yield from as_db_visit(p, src=src) 75 | except Exception as e: 76 | # todo critical error? 77 | logger.exception(e) 78 | yield e 79 | 80 | 81 | logger.info('extracting via %s: got %d visits', source.description, len(handled)) 82 | 83 | 84 | def as_db_visit(v: Visit, *, src: SourceName) -> Iterable[Res[DbVisit]]: 85 | if filtered(v.url): 86 | return 87 | res = DbVisit.make(v, src=src) 88 | if isinstance(res, CanonifyException): 89 | # todo not sure if need this log? either way maybe get rid of canonify exception and just yield up 90 | logger.error('error while canonnifying %s... 
ignoring', v) 91 | logger.exception(res) 92 | yield res 93 | 94 | 95 | def filtered(url: Url) -> bool: 96 | return any(f(url) for f in filters()) 97 | 98 | 99 | def make_filter(thing: str | Filter) -> Filter: 100 | if isinstance(thing, str): 101 | rc = re.compile(thing) 102 | def filter_(u: str) -> bool: 103 | return rc.search(u) is not None 104 | return filter_ 105 | else: # must be predicate 106 | return thing 107 | -------------------------------------------------------------------------------- /src/promnesia/misc/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/misc/__init__.pyi -------------------------------------------------------------------------------- /src/promnesia/misc/config_example.py: -------------------------------------------------------------------------------- 1 | from promnesia.common import Source 2 | from promnesia.sources import auto 3 | 4 | ''' 5 | List of sources to use. 6 | 7 | You can specify your own, add more sources, etc. 8 | See https://github.com/karlicoss/promnesia#setup for more information 9 | ''' 10 | SOURCES = [ 11 | Source( 12 | auto.index, 13 | # just some arbitrary directory with plaintext files 14 | '/usr/share/vim/', 15 | ) 16 | ] 17 | -------------------------------------------------------------------------------- /src/promnesia/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/py.typed -------------------------------------------------------------------------------- /src/promnesia/sources/auto_logseq.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import urllib.parse 3 | 4 | 5 | def logseq_replacer(path: str, root: str) -> str: 6 | if not path.startswith("editor://") or not (path.endswith((".md", ".org"))): 7 | return path 8 | 9 | graph = os.path.basename(root) # noqa: PTH119 10 | page_name = os.path.basename(path).rsplit('.', 1)[0] # noqa: PTH119 11 | encoded_page_name = urllib.parse.quote(page_name) 12 | 13 | uri = f"logseq://graph/{graph}?page={encoded_page_name}" 14 | 15 | return uri 16 | -------------------------------------------------------------------------------- /src/promnesia/sources/auto_obsidian.py: -------------------------------------------------------------------------------- 1 | def obsidian_replacer(p: str, r: str) -> str: 2 | if not p.startswith("editor://") or not p.endswith('.md'): 3 | return p 4 | 5 | path = p.split('/', 2)[-1] 6 | 7 | uri = f"obsidian://{path}" 8 | return uri 9 | -------------------------------------------------------------------------------- /src/promnesia/sources/browser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers. 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | import re 8 | import warnings 9 | from collections.abc import Iterator 10 | from typing import TYPE_CHECKING, Any 11 | 12 | from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger 13 | 14 | 15 | def index(p: PathIsh | None = None) -> Results: 16 | from . 
import hpi # noqa: F401,I001 17 | 18 | if p is None: 19 | from my.browser.all import history 20 | yield from _index_new(history()) 21 | return 22 | 23 | warnings.warn( 24 | f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. ' 25 | f'See https://github.com/purarue/browserexport#hpi .' 26 | f'Will try to hack path to browser databases {p} into HPI config.' 27 | ) 28 | try: 29 | yield from _index_new_with_adhoc_config(path=p) 30 | except Exception as e: 31 | logger.exception(e) 32 | warnings.warn("Hacking my.config.browser.export didn't work. You probably need to update HPI.") 33 | else: 34 | return 35 | 36 | logger.warning("Falling back onto legacy promnesia.sources.browser_legacy module") 37 | yield from _index_old(path=p) 38 | 39 | 40 | def _index_old(*, path: PathIsh) -> Results: 41 | from . import browser_legacy 42 | 43 | yield from browser_legacy.index(path) 44 | 45 | 46 | def _index_new_with_adhoc_config(*, path: PathIsh) -> Results: 47 | from . import hpi # noqa: F401,I001 48 | 49 | ## previously, it was possible to index be called with multiple different db search paths 50 | ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time 51 | ## so we hack cachew path so it's different for each call 52 | from my.core.core_config import config as hpi_core_config 53 | hpi_cache_dir = hpi_core_config.get_cache_dir() 54 | sanitized_path = re.sub(r'\W', '_', str(path)) 55 | cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path 56 | ## 57 | 58 | from my.core.common import Paths, classproperty, get_files 59 | class config: 60 | class core: 61 | cache_dir = cache_override 62 | 63 | class browser: 64 | class export: 65 | @classproperty 66 | def export_path(cls) -> Paths: 67 | return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)]) 68 | 69 | from my.core.cfg import tmp_config 70 | with tmp_config(modules='my.browser.export|my.core.core_config', config=config): 71 | from my.browser.export import history 72 | yield from _index_new(history()) 73 | 74 | 75 | if TYPE_CHECKING: 76 | from browserexport.merge import Visit as BrowserMergeVisit 77 | else: 78 | BrowserMergeVisit = Any 79 | 80 | 81 | def _index_new(history: Iterator[BrowserMergeVisit]) -> Results: 82 | for v in history: 83 | desc: str | None = None 84 | duration: Second | None = None 85 | metadata = v.metadata 86 | if metadata is not None: 87 | desc = metadata.title 88 | duration = metadata.duration 89 | yield Visit( 90 | url=v.url, 91 | dt=v.dt, 92 | locator=Loc(title=desc or v.url, href=v.url), 93 | duration=duration, 94 | ) 95 | -------------------------------------------------------------------------------- /src/promnesia/sources/demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A dummy source, used for testing 3 | Generates a sequence of fake evenly separated visits 4 | ''' 5 | 6 | from __future__ import annotations 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from promnesia.common import Loc, Results, Visit 11 | 12 | IsoFormatDt = str 13 | Seconds = int 14 | 15 | 16 | # TODO allow passing isoformat string as base_dt? 17 | # and maybe something similar as delta? 
start with seconds maybe 18 | def index( 19 | count: int = 100, 20 | *, 21 | base_dt: datetime | IsoFormatDt = datetime.min + timedelta(days=5000), 22 | delta: timedelta | Seconds = timedelta(hours=1), 23 | ) -> Results: 24 | 25 | base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt) 26 | delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta) 27 | 28 | # todo with some errors too? 29 | # todo use data generation library suggested for HPI? 30 | for i in range(count): 31 | yield Visit( 32 | url=f'https://demo.com/page{i}.html', 33 | dt=base_dt_ + delta_ * i, 34 | locator=Loc.make('demo'), 35 | ) 36 | # todo add context? 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/fbmessenger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for the messages data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | from my.fbmessenger import messages 11 | 12 | for m in messages(): 13 | if isinstance(m, Exception): 14 | yield m 15 | continue 16 | text = m.text 17 | if text is None: 18 | continue 19 | urls = extract_urls(text) 20 | if len(urls) == 0: 21 | continue 22 | 23 | # TODO m.author would be niceneeds to be implemented in fbmessenger model 24 | loc = Loc.make( 25 | title=f'chat with {m.thread.name}', 26 | # eh, not all threads have nicknames, and not sure how to extract reliably 27 | href=f'https://www.messenger.com/t/{m.thread.thread_id}', 28 | ) 29 | for u in urls: 30 | yield Visit( 31 | url=u, 32 | dt=m.dt, 33 | context=m.text, 34 | locator=loc, 35 | ) 36 | 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/filetypes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterable, Sequence 4 | from functools import lru_cache 5 | from pathlib import Path 6 | from typing import Callable, NamedTuple, Union 7 | 8 | from ..common import Results, Url 9 | 10 | # TODO doesn't really belong here... 11 | Ctx = Sequence[str] 12 | 13 | class EUrl(NamedTuple): 14 | url: Url 15 | ctx: Ctx # TODO ctx here is more like a Loc 16 | ### 17 | 18 | 19 | # keys are mime types + extensions 20 | Ex = Callable[[Path], Union[Results, Iterable[EUrl]]] 21 | # None means unhandled 22 | TYPE2IDX: dict[str, Ex | None] = {} 23 | # NOTE: there are some types in auto.py at the moment... it's a bit messy 24 | 25 | 26 | # TYPE2IDX only contains the 'prefixes', to speed up the lookup we are using cache.. 27 | @lru_cache(None) 28 | def type2idx(t: str) -> Ex | None: 29 | if len(t) == 0: 30 | return None # just in case? 31 | # first try exact match 32 | e = TYPE2IDX.get(t, None) 33 | if e is not None: 34 | return e 35 | t = t.strip('.') 36 | e = TYPE2IDX.get(t, None) 37 | if e is not None: 38 | return e 39 | # otherwise, try prefixes? 40 | for k, v in TYPE2IDX.items(): 41 | if t.strip('.').startswith(k): 42 | return v 43 | return None 44 | 45 | # for now source code just indexed with grep, not sure if it's good enough? 46 | # if not, some fanceir library could be used... 47 | # e.g. https://github.com/karlicoss/promnesia/pull/152/commits/c2f00eb4ee4018b02c9bf3966a036db69a43373d 48 | 49 | # TODO use this list? 
50 | # https://github.com/GerritCodeReview/gerrit/blob/master/resources/com/google/gerrit/server/mime/mime-types.properties 51 | # later these might do something clever, e.g. stripping off code comments etc? 52 | CODE = { 53 | 'text/x-java', 54 | 'text/x-tex', 55 | 'text/x-sh', 56 | 'text/x-haskell', 57 | 'text/x-perl', 58 | 'text/x-python', 'text/x-script.python', 59 | 'text/x-chdr', 60 | 'text/x-csrc', 61 | 'text/x-c', 62 | 'text/x-c++', 63 | 'text/x-makefile', 64 | 'text/troff', 65 | 'text/x-asm', 66 | 'text/x-objective-c', 67 | 'text/x-lisp', 68 | 'text/vnd.graphviz', 69 | 'text/x-diff', # patch files 70 | 'text/x-php', 71 | 'text/x-lilypond', 72 | 73 | # these didn't have a mime type, or were mistyped? 74 | 'css', 75 | 'el', 76 | 'rs', 77 | 'go', 78 | 'hs', # mistyped on osx 79 | 'hpp', # mistyped on osx 80 | 81 | 'edn', # clojure data 82 | 83 | '.ts', # most likely typescript.. otherwise determined as text/vnd.trolltech.linguist mime 84 | '.js', 85 | } 86 | # TODO discover more extensions with mimetypes library? 87 | 88 | 89 | BINARY = ''' 90 | # epub was failing to detect via mime on CI for some reason.. 91 | epub 92 | inode/x-empty 93 | .sqlite 94 | # comment 95 | application/ 96 | image/ 97 | audio/ 98 | video/ 99 | ''' 100 | 101 | handle_later = lambda *_args, **_kwargs: () 102 | 103 | def ignore(*_args, **_kwargs): 104 | # TODO log (once?) 105 | yield from () 106 | 107 | 108 | for x in BINARY.splitlines(): 109 | x = x.strip() 110 | if len(x) == 0 or x[0] == '#': 111 | continue 112 | TYPE2IDX[x] = ignore 113 | 114 | 115 | TYPE2IDX.update({ 116 | '.xslx': ignore, 117 | '.vcf' : ignore, 118 | 'message/rfc822': ignore, # ?? 119 | 120 | # todo ignore all fonts? 121 | 'font/woff2': ignore, 122 | 'font/woff': ignore, 123 | 'text/x-Algol68': ignore, # ugh some license file had this?? maybe always index text/ as text? 124 | 'text/x-bytecode.python': ignore, # todo ignore all x-bytecode? 125 | 'text/calendar': ignore, 126 | 127 | # TODO not sure what to do about these.. 128 | 'application/octet-stream': handle_later, 129 | 'application/zip' : handle_later, 130 | 'application/x-tar' : handle_later, 131 | 'application/gzip' : handle_later, 132 | }) 133 | 134 | 135 | # TODO use some existing file for initial gitignore.. 136 | IGNORE = [ 137 | '.idea', 138 | 'venv', 139 | '.git', 140 | '.eggs', 141 | '.mypy_cache', 142 | '.pytest_cache', 143 | 'node_modules', 144 | '__pycache__', 145 | '.tox', 146 | '.stack-work', 147 | 148 | # TODO not sure about these: 149 | '.gitignore', 150 | '.babelrc', 151 | ] 152 | 153 | -------------------------------------------------------------------------------- /src/promnesia/sources/github.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] github module 3 | ''' 4 | from __future__ import annotations 5 | 6 | # Note: requires the 'mistletoe' module if you enable render_markdown 7 | from promnesia.common import Loc, Results, Visit, iter_urls, logger 8 | 9 | 10 | def index(*, render_markdown: bool = False) -> Results: 11 | from . import hpi # noqa: F401,I001 12 | from my.github.all import events 13 | 14 | if render_markdown: 15 | try: 16 | from .markdown import TextParser, extract_from_text 17 | except ImportError as import_err: 18 | logger.exception(import_err) 19 | logger.critical("Could not import markdown module to render github body markdown. 
Try 'python3 -m pip install mistletoe'") 20 | render_markdown = False 21 | 22 | for e in events(): 23 | if isinstance(e, Exception): 24 | yield e 25 | continue 26 | if e.link is None: 27 | continue 28 | 29 | # if enabled, convert the (markdown) body to HTML 30 | context: str | None = e.body 31 | if e.body is not None and render_markdown: 32 | context = TextParser(e.body)._doc_ashtml() # type: ignore[possibly-undefined] 33 | 34 | # locator should link back to this event 35 | loc = Loc.make(title=e.summary, href=e.link) 36 | 37 | # visit which links back to this event in particular 38 | yield Visit( 39 | url=e.link, 40 | dt=e.dt, 41 | context=context, 42 | locator=loc, 43 | ) 44 | 45 | for url in iter_urls(e.summary): 46 | yield Visit( 47 | url=url, 48 | dt=e.dt, 49 | context=context, 50 | locator=loc, 51 | ) 52 | 53 | if e.body is None: 54 | continue 55 | 56 | # extract any links found in the body 57 | # 58 | # Note: this set gets reset every event, is here to 59 | # prevent duplicates between URLExtract and the markdown parser 60 | emitted: set[str] = set() 61 | for url in iter_urls(e.body): 62 | if url in emitted: 63 | continue 64 | yield Visit( 65 | url=url, 66 | dt=e.dt, 67 | context=context, 68 | locator=loc, 69 | ) 70 | emitted.add(url) 71 | 72 | # extract from markdown links like [link text](https://...) 73 | # incase URLExtract missed any somehow 74 | if render_markdown: 75 | for res in extract_from_text(e.body): # type: ignore[possibly-undefined] 76 | if isinstance(res, Exception): 77 | yield res 78 | continue 79 | if res.url in emitted: 80 | continue 81 | yield Visit( 82 | url=res.url, 83 | dt=e.dt, 84 | context=context, 85 | locator=loc, 86 | ) 87 | emitted.add(res.url) 88 | -------------------------------------------------------------------------------- /src/promnesia/sources/guess.py: -------------------------------------------------------------------------------- 1 | # TODO eh. confusing how guess and auto are different... 2 | # maybe merge them later? 3 | from collections.abc import Iterable 4 | from typing import Any 5 | 6 | from ..common import Extraction, PathIsh 7 | 8 | 9 | def is_git_repo(p: str) -> bool: 10 | if '://github.com/' in p: 11 | return True 12 | return False 13 | 14 | 15 | def is_website(p: str) -> bool: 16 | if p.startswith('http'): 17 | return True 18 | return False 19 | 20 | 21 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 22 | ps = str(path) 23 | # TODO better url detection 24 | 25 | index_: Any # meh 26 | if is_git_repo(ps): 27 | from . import vcs 28 | index_ = vcs.index 29 | elif is_website(ps): 30 | from . import website 31 | index_ = website.index 32 | else: 33 | from . import auto 34 | index_ = auto.index 35 | yield from index_(path, *args, **kwargs) 36 | -------------------------------------------------------------------------------- /src/promnesia/sources/hackernews.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerNews items. 3 | ''' 4 | 5 | import textwrap 6 | 7 | from promnesia.common import Loc, Results, Visit 8 | 9 | 10 | def index() -> Results: 11 | from . 
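# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# How sources/guess.py (above) decides which indexer to dispatch to: github URLs go
# to sources.vcs, other http(s) URLs to sources.website, anything else to
# sources.auto. The helper name is made up, for illustration only.
def _guess_dispatch_example() -> None:
    from promnesia.sources.guess import is_git_repo, is_website

    assert is_git_repo('https://github.com/karlicoss/promnesia')   # -> vcs.index
    assert is_website('https://example.com/some/page')             # -> website.index
    assert not is_git_repo('/home/user/notes')
    assert not is_website('/home/user/notes')                      # -> auto.index
# ---------------------------------------------------------------------------------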
import hpi # noqa: F401,I001 12 | from my.hackernews import dogsheep 13 | 14 | for item in dogsheep.items(): 15 | if isinstance(item, Exception): 16 | yield item 17 | continue 18 | hn_url = item.permalink 19 | title = "hackernews" 20 | if item.title: 21 | title = item.title 22 | elif item.text_html: 23 | title = item.text_html 24 | title = textwrap.shorten( 25 | title, width=79, placeholder="…", 26 | break_long_words=True) 27 | # The locator is always the HN story. If the story is a link (as 28 | # opposed to a text post), we insert a visit such that the link 29 | # will point back to the corresponding HN story. 30 | loc = Loc.make(title=title, href=hn_url) 31 | urls = [hn_url] 32 | if item.url is not None: 33 | urls.append(item.url) 34 | for url in urls: 35 | yield Visit( 36 | url=url, 37 | dt=item.created, 38 | locator=loc, 39 | context=title, 40 | ) 41 | -------------------------------------------------------------------------------- /src/promnesia/sources/hpi.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Just a helper for a more humane error message when importing my.* dependencies 3 | ''' 4 | 5 | from promnesia.common import logger 6 | 7 | try: 8 | import my # noqa: F401 9 | except ImportError as e: 10 | logger.exception(e) 11 | logger.critical("Failed during 'import my'. You probably need to install & configure HPI package first (see 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org')") 12 | -------------------------------------------------------------------------------- /src/promnesia/sources/html.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extracts links from HTML files 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | from collections.abc import Iterator 8 | from pathlib import Path 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | from promnesia.common import Loc, PathIsh, Results, Visit, file_mtime 13 | 14 | # TODO present error summary in the very end; import errors -- makes sense to show 15 | # TODO on some exceptions, request a fallback to text? 16 | 17 | 18 | Url = tuple[str, str] 19 | 20 | 21 | def extract_urls_from_html(s: str) -> Iterator[Url]: 22 | """ 23 | Helper method to extract URLs from any HTML, so this could 24 | potentially be used by other modules 25 | """ 26 | soup = BeautifulSoup(s, 'lxml') 27 | for a in soup.find_all('a'): 28 | href = a.attrs.get('href') 29 | if href is None or ('://' not in href): 30 | # second condition means relative link 31 | continue 32 | text = a.text 33 | yield (href, text) 34 | 35 | 36 | def extract_from_file(fname: PathIsh) -> Results: 37 | ts = file_mtime(fname) 38 | 39 | for href, text in extract_urls_from_html(Path(fname).read_text(errors='replace')): 40 | yield Visit( 41 | url=href, 42 | dt=ts, 43 | locator=Loc.file(fname), 44 | context=text, 45 | ) 46 | -------------------------------------------------------------------------------- /src/promnesia/sources/hypothesis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myhypothesis][hypothesis]] module 3 | """ 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls, join_tags 6 | 7 | 8 | def index() -> Results: 9 | from . 
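# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What extract_urls_from_html (in sources/html.py above) yields: (href, link text)
# pairs for anchors with absolute URLs; relative links are skipped. Requires
# bs4/lxml, just like the module itself. The helper name is made up.
def _extract_urls_from_html_example() -> None:
    from promnesia.sources.html import extract_urls_from_html

    snippet = '<p><a href="https://example.com">example</a> <a href="/relative">skip me</a></p>'
    assert list(extract_urls_from_html(snippet)) == [('https://example.com', 'example')]
# ---------------------------------------------------------------------------------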
import hpi # noqa: F401,I001 10 | import my.hypothesis as hyp 11 | 12 | for h in hyp.highlights(): 13 | if isinstance(h, Exception): 14 | yield h 15 | continue 16 | hl = h.highlight 17 | ann = h.annotation 18 | tags = h.tags 19 | cparts = [] 20 | if hl is not None: 21 | cparts.append(hl) 22 | if ann is not None: 23 | cparts.append(f"comment: {ann}") 24 | if tags: 25 | cparts.append(join_tags(tags)) 26 | visit = Visit( 27 | url=h.url, 28 | dt=h.created, 29 | context="\n\n".join(cparts), 30 | locator=Loc.make( 31 | title="hypothesis", 32 | href=h.hyp_link, 33 | ), 34 | ) 35 | 36 | yield visit 37 | 38 | in_text_visits = ( 39 | (hl, "highlighted"), 40 | (ann, "comment"), 41 | ) 42 | for text, part_name in in_text_visits: 43 | if text and text.strip(): 44 | urls = extract_urls(text) 45 | for url in urls: 46 | yield visit._replace( 47 | url=url, 48 | locator=visit.locator._replace(title=f"hypothesis-{part_name}"), 49 | ) 50 | -------------------------------------------------------------------------------- /src/promnesia/sources/instapaper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myinstapaper][instapaper]] module 3 | ''' 4 | from promnesia.common import Loc, Results, Visit 5 | 6 | 7 | def index() -> Results: 8 | from . import hpi # noqa: F401,I001 9 | import my.instapaper as ip 10 | 11 | for p in ip.pages(): 12 | bm = p.bookmark 13 | hls = p.highlights 14 | 15 | if len(hls) == 0: 16 | yield Visit( 17 | url=bm.url, 18 | dt=bm.dt, 19 | context=None, 20 | locator=Loc.make(title='instapaper', href=bm.instapaper_link), 21 | ) 22 | else: 23 | for hl in p.highlights: 24 | cparts = [hl.text] 25 | if hl.note is not None: 26 | cparts.append('comment: ' + hl.note) 27 | yield Visit( 28 | url=bm.url, 29 | dt=hl.dt, 30 | context='\n'.join(cparts), 31 | locator=Loc.make(title='instapaper', href=hl.instapaper_link), 32 | ) 33 | -------------------------------------------------------------------------------- /src/promnesia/sources/markdown.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import NamedTuple 6 | 7 | import mistletoe # type: ignore[import-untyped] 8 | import mistletoe.block_token as BT # type: ignore[import-untyped] 9 | from mistletoe.html_renderer import HTMLRenderer # type: ignore[import-untyped] 10 | from mistletoe.span_token import AutoLink, Link # type: ignore[import-untyped] 11 | 12 | from promnesia.common import ( 13 | Extraction, 14 | Loc, 15 | PathIsh, 16 | Res, 17 | Url, 18 | Visit, 19 | file_mtime, 20 | logger, 21 | ) 22 | 23 | renderer = HTMLRenderer() 24 | 25 | 26 | block_tokens = tuple(getattr(BT, name) for name in BT.__all__) 27 | 28 | 29 | class Parsed(NamedTuple): 30 | url: Url 31 | context: str | None 32 | 33 | 34 | Result = Res[Parsed] 35 | 36 | 37 | # the fuck... 38 | # 39 | # from mistletoe import Document 40 | # d = Document(''' 41 | # # heading 42 | # ## sub 43 | # ## sub2 44 | # ''') 45 | # d.children[0].content 46 | # Out[13]: 'sub2' 47 | 48 | # meh, but for now fine I guess 49 | HTML_MARKER = '!html ' 50 | 51 | 52 | def _ashtml(block) -> str: 53 | res = renderer.render(block) 54 | if res.startswith('

<p>') and res.endswith('</p>

'): 55 | res = res[3:-4] # meh, but for now fine 56 | return res 57 | 58 | 59 | class Parser: 60 | def __init__(self, path: Path): 61 | self.doc = mistletoe.Document(path.read_text()) 62 | 63 | def _extract(self, cur, last_block) -> Iterator[Parsed]: 64 | if not isinstance(cur, (AutoLink, Link)): 65 | # hopefully that's all?? 66 | return 67 | 68 | url = cur.target 69 | # TODO fuck. it doesn't preserve line numbers/positions in text??? 70 | 71 | # ugh. It can't output markdown.. https://github.com/miyuchina/mistletoe/issues/4 72 | context = None if last_block is None else HTML_MARKER + _ashtml(last_block) 73 | yield Parsed(url=url, context=context) 74 | 75 | def _walk(self, cur, last_block) -> Iterator[Result]: 76 | if isinstance(cur, block_tokens): 77 | last_block = cur 78 | 79 | try: 80 | yield from self._extract(cur, last_block) 81 | except Exception as e: 82 | logger.exception(e) 83 | yield e 84 | 85 | # keeping getattr for compatibility in older versions of mistletoe, it was optional 86 | children = getattr(cur, 'children', None) 87 | if children is None: 88 | return 89 | for c in children: 90 | yield from self._walk(c, last_block=last_block) 91 | 92 | def walk(self) -> Iterator[Result]: 93 | yield from self._walk(self.doc, last_block=None) 94 | 95 | 96 | def extract_from_file(fname: PathIsh) -> Iterator[Extraction]: 97 | path = Path(fname) 98 | fallback_dt = file_mtime(path) 99 | 100 | p = Parser(path) 101 | for r in p.walk(): 102 | if isinstance(r, Exception): 103 | yield r 104 | else: 105 | yield Visit( 106 | url=r.url, 107 | dt=fallback_dt, 108 | locator=Loc.file(fname), # TODO line number 109 | context=r.context, 110 | ) 111 | 112 | 113 | class TextParser(Parser): 114 | ''' 115 | Used to extract links/render markdown from text, e.g. reddit/github comments 116 | Instead of chunking blocks like for files, this returns the entire 117 | message rendered as the context 118 | ''' 119 | 120 | def __init__(self, text: str) -> None: 121 | self.doc = mistletoe.Document(text) 122 | 123 | def _doc_ashtml(self): 124 | ''' 125 | cached html representation of the entire html message/document 126 | ''' 127 | if not hasattr(self, '_html'): 128 | self._html = HTML_MARKER + _ashtml(self.doc) 129 | return self._html 130 | 131 | def _extract(self, cur, last_block=None) -> Iterator[Parsed]: # noqa: ARG002 132 | if not isinstance(cur, (AutoLink, Link)): 133 | return 134 | 135 | yield Parsed(url=cur.target, context=self._doc_ashtml()) 136 | 137 | 138 | def extract_from_text(text: str) -> Iterator[Result]: 139 | ''' 140 | assume this is rendering something like a github/reddit markdown message 141 | use the entire contents of the comment/body as the context 142 | ''' 143 | # note: returns Result (link/context), not Visit 144 | # the callee function has to insert dt/duration etc. 145 | yield from TextParser(text).walk() 146 | -------------------------------------------------------------------------------- /src/promnesia/sources/plaintext.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import lru_cache 4 | from pathlib import Path 5 | 6 | from promnesia.common import PathIsh, _is_windows 7 | 8 | # https://linux-and-mac-hacks.blogspot.co.uk/2013/04/use-grep-and-regular-expressions-to.html 9 | _URL_REGEX = r'\b(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]' 10 | 11 | if _is_windows: 12 | # wtf? for some reason on windows (in cmd.exe specificaly) \b isn't working... 
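# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What the _URL_REGEX above is meant to capture, and where it stops: the final
# character class excludes trailing punctuation such as ',' or '.', so it is not
# swallowed into the URL. The helper name is made up, for illustration only.
def _url_regex_example() -> None:
    import re

    from promnesia.sources.plaintext import _URL_REGEX

    m = re.search(_URL_REGEX, 'see https://example.com/page?a=1, maybe later')
    assert m is not None
    assert m.group(0) == 'https://example.com/page?a=1'   # trailing comma excluded
# ---------------------------------------------------------------------------------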
13 | # this will make the regex a bit less precise, but not end of the world 14 | _URL_REGEX = _URL_REGEX.removeprefix(r'\b') 15 | 16 | 17 | @lru_cache 18 | def _has_grep() -> bool: 19 | import shutil 20 | return shutil.which('grep') is not None 21 | 22 | 23 | Command = list[str] 24 | 25 | 26 | _GREP_ARGS: Command = [ 27 | '--color=never', 28 | '-H', # always show filename TODO not sure if works on osx 29 | '-n', # print line numbers (to restore context) 30 | '-I', # ignore binaries 31 | ] 32 | 33 | if not _is_windows: 34 | # exclude-dir not working on windows 35 | _GREP_ARGS += [ 36 | '--exclude-dir=".git"', 37 | ] 38 | 39 | # NOTE: grep/findstr exit with code 1 on no matches... 40 | # we hack around it in shellcmd module (search 'grep') 41 | def _grep(*, paths: list[str], recursive: bool) -> Command: 42 | return [ 43 | 'grep', 44 | *(['-r'] if recursive else []), 45 | *_GREP_ARGS, 46 | '-E', # 'extended' syntax 47 | _URL_REGEX, 48 | *paths, 49 | ] 50 | 51 | def _findstr(*, path: str, recursive: bool) -> Command: 52 | return [ 53 | 'findstr', 54 | '/S', 55 | '/P', 56 | '/N', 57 | 'https*://', 58 | path + (r'\*' if recursive else ''), 59 | ] 60 | 61 | 62 | # TODO unify these if it works?? 63 | def _extract_from_dir(path: str) -> Command: 64 | if _has_grep(): 65 | return _grep( 66 | paths=[path], 67 | recursive=True, 68 | ) 69 | elif _is_windows: 70 | return _findstr(path=path, recursive=True) 71 | else: 72 | raise RuntimeError("no grep; don't know which search tool to use!") 73 | 74 | 75 | def _extract_from_file(path: str) -> Command: 76 | if _is_windows and not _has_grep(): 77 | return _findstr(path=path, recursive=False) 78 | 79 | return _grep( 80 | paths=[path], 81 | recursive=False, 82 | ) 83 | 84 | 85 | def extract_from_path(path: PathIsh) -> Command: 86 | pp = Path(path) 87 | 88 | if pp.is_dir(): # TODO handle archives here??? 89 | return _extract_from_dir(str(pp)) 90 | 91 | if any(pp.suffix == ex for ex in ( 92 | '.xz', 93 | '.bz2', 94 | '.gz', 95 | '.zip', 96 | )): 97 | # todo should be debug? 98 | # or should delete it completely, feels like unpacking archives here is a bit too much 99 | raise RuntimeError(f"Archives aren't supported yet: {path}") 100 | # logger.info(f"Extracting from compressed file {path}") 101 | # import lzma 102 | # from tempfile import NamedTemporaryFile 103 | # # TODO hopefully, no collisions 104 | # import os.path 105 | # fname = os.path.join(tdir.name, os.path.basename(path)) 106 | # with open(fname, 'wb') as fo: 107 | # with lzma.open(path, 'r') as cf: 108 | # fo.write(cf.read()) 109 | # return _extract_from_file(fname) 110 | 111 | r = _extract_from_file(str(pp)) 112 | return r 113 | -------------------------------------------------------------------------------- /src/promnesia/sources/pocket.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Pocket highlights & bookmarks 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | from my.pocket import articles 11 | 12 | # TODO use docstring from my. module? E.g. describing which pocket format is expected 13 | 14 | for a in articles(): 15 | title = a.json.get('resolved_title', None) or a.json.get('given_title', 'pocket') 16 | loc = Loc.make(title=title, href=a.pocket_link) 17 | # Add a reverse locator so that the Promnesia browser extension shows a 18 | # link on the Pocket page back to the original URL. 
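# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# The intent of the unused _loc_rev below: keep two locators per article, one shown
# on the original page (pointing back to Pocket) and one that would be shown on the
# Pocket page (pointing back to the article). The URLs and the helper name here are
# made up, for illustration only.
def _pocket_locators_example() -> None:
    from promnesia.common import Loc

    loc = Loc.make(title='some article', href='https://getpocket.com/read/123')   # forward: to Pocket
    loc_rev = Loc.make(title='some article', href='https://example.com/article')  # reverse: to the article
    assert loc.href != loc_rev.href
# ---------------------------------------------------------------------------------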
19 | # FIXME need to actually use it 20 | _loc_rev = Loc.make(title=title, href=a.url) 21 | hls = a.highlights 22 | excerpt = a.json.get('excerpt', None) 23 | if len(hls) == 0: 24 | yield Visit( 25 | url=a.url, 26 | dt=a.added, 27 | context=excerpt, 28 | locator=loc, 29 | ) 30 | for hl in hls: 31 | yield Visit( 32 | url=a.url, 33 | dt=hl.created, 34 | context=hl.text, 35 | locator=loc, 36 | ) 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/roamresearch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Roam Research data 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | import my.roamresearch as RR 10 | roam = RR.roam() 11 | for node in roam.traverse(): 12 | yield from _collect(node) 13 | 14 | 15 | def _collect(node: 'RoamNode') -> Results: 16 | title = node.title 17 | body = node.body or '' 18 | if title is None: 19 | # most notes don't have title, so we just take the first line instead.. 20 | lines = body.splitlines(keepends=True) 21 | if len(lines) > 0: 22 | title = lines[0] 23 | body = ''.join(lines) 24 | title = title or '' 25 | 26 | full = title + '\n' + body 27 | 28 | urls = extract_urls(full) 29 | if len(urls) == 0: 30 | return 31 | 32 | loc = Loc.make( 33 | title=node.path, 34 | href=node.permalink, 35 | ) 36 | for u in urls: 37 | yield Visit( 38 | url=u, 39 | dt=node.created, 40 | context=body, 41 | locator=loc, 42 | ) 43 | 44 | 45 | import typing 46 | 47 | if typing.TYPE_CHECKING: 48 | import my.roamresearch as RR 49 | RoamNode = RR.Node 50 | -------------------------------------------------------------------------------- /src/promnesia/sources/rss.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data. 3 | ''' 4 | 5 | from datetime import datetime 6 | 7 | import pytz 8 | 9 | from promnesia.common import Loc, Results, Visit 10 | 11 | # arbitrary, 2011-11-04 00:05:23.283+00:00 12 | default_datetime = datetime.fromtimestamp(1320365123, tz=pytz.utc) 13 | # TODO FIXME allow for visit not to have datetime? 14 | # I.e. even having context is pretty good! 15 | 16 | def index() -> Results: 17 | from my.rss.all import subscriptions 18 | 19 | for feed in subscriptions(): 20 | # TODO locator should be optional too? although could use direct link in the rss reader interface 21 | locator = Loc.make(title='my.rss') 22 | yield Visit( 23 | url=feed.url, 24 | dt=feed.created_at or default_datetime, 25 | context='RSS subscription', # TODO use 'provider', etc? 26 | locator=locator, 27 | ) 28 | -------------------------------------------------------------------------------- /src/promnesia/sources/shellcmd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Greps out URLs from an arbitrary shell command results. 
3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import os 8 | import re 9 | import warnings 10 | from collections.abc import Sequence 11 | from datetime import datetime 12 | from subprocess import PIPE, run 13 | 14 | from promnesia.common import ( 15 | Loc, 16 | PathIsh, 17 | Results, 18 | Visit, 19 | _is_windows, 20 | extract_urls, 21 | file_mtime, 22 | now_tz, 23 | ) 24 | 25 | from .plaintext import _has_grep 26 | 27 | 28 | def index(command: str | Sequence[PathIsh]) -> Results: 29 | cmd: Sequence[PathIsh] 30 | cmds: str 31 | if isinstance(command, str): 32 | cmds = command 33 | warnings.warn("Passing string as a command is very fragile('{command}'). Please use list instead.") 34 | cmd = command.split(' ') 35 | else: 36 | cmds = ' '.join(map(str, command)) 37 | cmd = command 38 | 39 | # ugh... on windows grep does something nasty? e.g: 40 | # grep --color=never -r -H -n -I -E http 'D:\\a\\promnesia\\promnesia\\tests\\testdata\\custom' 41 | # D:\a\promnesia\promnesia\tests\testdata\custom/file1.txt:1:Right, so this points at http://google.com 42 | # so part of the path has fwd slashes, part has bwd slashes... 43 | needs_windows_grep_patching = _has_grep() and _is_windows 44 | 45 | def handle_line(line: str) -> Results: 46 | # grep dumps this as 47 | # /path/to/file:lineno:rest 48 | # note: on Windows, path contains : after the disk name.. 49 | m = re.search(r'(.*?):(\d+?):(.*)', line) 50 | if m is None: 51 | # todo warn maybe? 52 | fname = None 53 | lineno = None 54 | else: 55 | fname = m.group(1) 56 | lineno = int(m.group(2)) 57 | line = m.group(3) 58 | 59 | if fname is not None and needs_windows_grep_patching: 60 | fname = fname.replace('/', os.sep) 61 | 62 | urls = extract_urls(line) 63 | if len(urls) == 0: 64 | return 65 | 66 | context = line 67 | 68 | ts: datetime 69 | loc: Loc 70 | if fname is not None: 71 | ts = file_mtime(fname) 72 | loc = Loc.file(fname, line=lineno) 73 | else: 74 | ts = now_tz() 75 | loc = Loc.make(cmds) 76 | for url in urls: 77 | yield Visit( 78 | url=url, 79 | dt=ts, 80 | locator=loc, 81 | context=context, 82 | ) 83 | 84 | r = run(cmd, stdout=PIPE, check=False) 85 | if r.returncode > 0: 86 | if not (cmd[0] in {'grep', 'findstr'} and r.returncode == 1): # ugh. grep returns 1 on no matches... 87 | r.check_returncode() 88 | output = r.stdout 89 | assert output is not None 90 | lines = [line.decode('utf-8') for line in output.splitlines()] 91 | for line in lines: 92 | try: 93 | yield from handle_line(line) 94 | except Exception as e: 95 | yield e 96 | -------------------------------------------------------------------------------- /src/promnesia/sources/smscalls.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] smscalls module 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | from . 
import hpi # noqa: F401,I001 10 | from my.smscalls import messages 11 | 12 | for m in messages(): 13 | 14 | if isinstance(m, Exception): 15 | yield m 16 | continue 17 | 18 | urls = extract_urls(m.message) 19 | if len(urls) == 0: 20 | continue 21 | 22 | if m.who is None: 23 | loc = Loc(title=f"SMS with {m.phone_number}") 24 | else: 25 | loc = Loc(title=f"SMS with {m.who} ({m.phone_number})") 26 | 27 | for u in urls: 28 | yield Visit( 29 | url=u, 30 | dt=m.dt, 31 | context=m.message, 32 | locator=loc, 33 | ) 34 | -------------------------------------------------------------------------------- /src/promnesia/sources/stackexchange.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | import my.stackexchange.gdpr as G 11 | 12 | for v in G.votes(): 13 | if isinstance(v, Exception): 14 | yield v 15 | else: 16 | yield Visit( 17 | url=v.link, 18 | dt=v.when, 19 | context='voted', # todo use the votetype? although maybe worth ignoring downvotes 20 | # or, downvotes could have 'negative' ranking or something 21 | locator=Loc.make(title='voted', href=v.link) 22 | ) 23 | -------------------------------------------------------------------------------- /src/promnesia/sources/takeout_legacy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from promnesia.common import Loc, Results, Visit, logger 4 | 5 | 6 | # TODO make an iterator, insert in db as we go? handle errors gracefully? 7 | def index() -> Results: 8 | from . import hpi # noqa: F401,I001 9 | from my.google.takeout.paths import get_takeouts 10 | 11 | takeouts = list(get_takeouts()) 12 | # TODO if no takeouts, raise? 13 | # although could raise a warning on top level, when source emitted no takeouts 14 | 15 | # TODO youtube? 16 | google_activities = [read_google_activity(t) for t in takeouts] 17 | search_activities = [read_search_activity(t) for t in takeouts] 18 | browser_histories = [read_browser_history_json(t) for t in takeouts] 19 | 20 | key = lambda v: (v.dt, v.url) 21 | return chain( 22 | unique_everseen(chain(*google_activities), key=key), 23 | unique_everseen(chain(*search_activities), key=key), 24 | unique_everseen(chain(*browser_histories), key=key), 25 | ) 26 | 27 | 28 | 29 | import json 30 | from collections.abc import Iterable 31 | from datetime import datetime 32 | from itertools import chain 33 | from pathlib import Path 34 | 35 | import pytz 36 | from more_itertools import unique_everseen 37 | 38 | from promnesia import config 39 | 40 | try: 41 | from cachew import cachew 42 | except ModuleNotFoundError as me: 43 | if me.name != 'cachew': 44 | raise me 45 | # this module is legacy anyway, so just make it defensive 46 | def cachew(*args, **kwargs): # type: ignore[no-redef] 47 | return lambda f: f 48 | 49 | 50 | # TODO use CPath? Could encapsulate a path within an archive *or* within a directory 51 | TakeoutPath = Path 52 | 53 | 54 | def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]: 55 | # FIXME switch to actual kompress? and use CPath? 56 | from my.core.kompress import kexists 57 | 58 | # TODO glob 59 | # TODO not sure about windows path separators?? 
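# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# For kind='Chrome/MyActivity.html' the line just below builds
# 'Takeout/My Activity/Chrome/MyActivity.html' inside the takeout archive.
# The (dt, url) deduplication used by index() above boils down to more_itertools'
# unique_everseen keeping the first occurrence. The helper name is made up.
def _dedup_example() -> None:
    from more_itertools import unique_everseen

    visits = [
        ('2020-01-01', 'https://a'),
        ('2020-01-01', 'https://a'),   # duplicate across takeouts -> dropped
        ('2020-01-02', 'https://a'),
    ]
    assert list(unique_everseen(visits)) == [
        ('2020-01-01', 'https://a'),
        ('2020-01-02', 'https://a'),
    ]
# ---------------------------------------------------------------------------------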
60 | spath = 'Takeout/My Activity/' + kind 61 | if not kexists(takeout, spath): 62 | logger.warning(f"{spath} is not present in {takeout}... skipping") 63 | return 64 | logger.info('processing %s %s', takeout, kind) 65 | 66 | locator = Loc.file(spath) 67 | from my.google.takeout.html import read_html 68 | for dt, url, _title in read_html(takeout, spath): 69 | yield Visit( 70 | url=url, 71 | dt=dt, 72 | locator=locator, 73 | debug=kind, 74 | ) 75 | 76 | def _cpath(suffix: str): 77 | def fun(takeout: TakeoutPath): 78 | cache_dir = config.get().cache_dir 79 | if cache_dir is None: 80 | return None 81 | # doesn't need a nontrivial hash function, timestsamp is encoded in name 82 | return cache_dir / (takeout.name + '_' + suffix + '.cache') 83 | return fun 84 | 85 | 86 | # todo caching should this be HPI responsibility? 87 | # todo set global cachew logging on init? 88 | @cachew(cache_path=_cpath('google_activity') , logger=logger) 89 | def read_google_activity(takeout: TakeoutPath) -> Iterable[Visit]: 90 | return _read_myactivity_html(takeout, 'Chrome/MyActivity.html') 91 | 92 | @cachew(cache_path=_cpath('search_activity') , logger=logger) 93 | def read_search_activity(takeout: TakeoutPath) -> Iterable[Visit]: 94 | return _read_myactivity_html(takeout, 'Search/MyActivity.html') 95 | 96 | # TODO add this to tests? 97 | @cachew(cache_path=_cpath('browser_activity'), logger=logger) 98 | def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]: 99 | from my.core.kompress import kexists, kopen 100 | # not sure if this deserves moving to HPI? it's pretty trivial for now 101 | spath = 'Takeout/Chrome/BrowserHistory.json' 102 | 103 | if not kexists(takeout, spath): 104 | logger.warning(f"{spath} is not present in {takeout}... skipping") 105 | return 106 | logger.info('processing %s %s', takeout, spath) 107 | 108 | # TODO couls also add spath? 109 | locator = Loc.file(takeout) 110 | 111 | # TODO this should be supported by HPI now? 112 | 113 | j = None 114 | with kopen(takeout, spath) as fo: # TODO iterative parser? 115 | j = json.load(fo) 116 | 117 | hist = j['Browser History'] 118 | for item in hist: 119 | url = item['url'] 120 | time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc) 121 | # TODO any more interesitng info? 122 | yield Visit( 123 | url=url, 124 | dt=time, 125 | locator=locator, 126 | debug='Chrome/BrowserHistory.json', 127 | ) 128 | 129 | -------------------------------------------------------------------------------- /src/promnesia/sources/telegram.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from urllib.parse import unquote # TODO mm, make it easier to rememember to use... 5 | 6 | from promnesia.common import Loc, PathIsh, Results, Visit, extract_urls, logger 7 | 8 | 9 | def index(database: PathIsh | None=None, *, http_only: bool=False, with_extra_media_info: bool=False) -> Results: 10 | if database is None: 11 | # fully relying on HPI 12 | yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info) 13 | return 14 | 15 | warnings.warn( 16 | f'Passing paths to promnesia.sources.telegram is deprecated, you should setup my.telegram.telegram_backup instead. ' 17 | f'Will try to hack database path {database} into HPI config.' 
18 | ) 19 | try: 20 | yield from _index_new_with_adhoc_config(database=database, http_only=http_only, with_extra_media_info=with_extra_media_info) 21 | except Exception as e: 22 | logger.exception(e) 23 | warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.") 24 | else: 25 | return 26 | 27 | logger.warning("Falling back onto promnesia.sources.telegram_legacy module") 28 | yield from _index_legacy(database=database, http_only=http_only) 29 | 30 | 31 | def _index_legacy(*, database: PathIsh, http_only: bool) -> Results: 32 | from . import telegram_legacy 33 | yield from telegram_legacy.index(database=database, http_only=http_only) 34 | 35 | 36 | def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results: 37 | from . import hpi # noqa: F401,I001 38 | 39 | class config: 40 | class telegram: 41 | class telegram_backup: 42 | export_path: PathIsh = database 43 | 44 | from my.core.cfg import tmp_config 45 | with tmp_config(modules='my.telegram.telegram_backup', config=config): 46 | yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info) 47 | 48 | 49 | def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results: 50 | from . import hpi # noqa: F401,I001 51 | from my.telegram.telegram_backup import messages 52 | 53 | extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None 54 | for m in messages( 55 | with_extra_media_info=with_extra_media_info, 56 | extra_where=extra_where, 57 | ): 58 | text = m.text 59 | 60 | urls = extract_urls(text) 61 | extra_media_info = m.extra_media_info 62 | if extra_media_info is not None: 63 | urls.extend(extract_urls(extra_media_info)) 64 | 65 | if len(urls) == 0: 66 | continue 67 | 68 | dt = m.time 69 | sender = m.sender.name 70 | chat = m.chat 71 | 72 | cname = chat.name if chat.name is not None else str(chat.id) 73 | 74 | locator = Loc.make( 75 | title=f"chat with {cname}", 76 | href=m.permalink, 77 | ) 78 | context = f'{sender}: {text}' 79 | 80 | for u in urls: 81 | yield Visit( 82 | url=unquote(u), 83 | dt=dt, 84 | context=context, 85 | locator=locator, 86 | ) 87 | -------------------------------------------------------------------------------- /src/promnesia/sources/telegram_legacy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | import sqlite3 8 | from pathlib import Path 9 | from textwrap import dedent 10 | from typing import TypeVar 11 | from urllib.parse import unquote # TODO mm, make it easier to rememember to use... 
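# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# Why unquote (imported above) is applied to extracted URLs before yielding Visits:
# links in messages are often percent-encoded. The helper name is made up.
def _unquote_example() -> None:
    from urllib.parse import unquote

    assert unquote('https://example.com/some%20page') == 'https://example.com/some page'
# ---------------------------------------------------------------------------------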
12 | 13 | from promnesia.common import ( 14 | Loc, 15 | PathIsh, 16 | Results, 17 | Visit, 18 | echain, 19 | extract_urls, 20 | from_epoch, 21 | ) 22 | 23 | from ..sqlite import sqlite_connection 24 | 25 | T = TypeVar("T") 26 | 27 | 28 | def unwrap(res: T | Exception) -> T: 29 | if isinstance(res, Exception): 30 | raise res 31 | return res 32 | 33 | 34 | def index(database: PathIsh, *, http_only: bool=False) -> Results: 35 | """ 36 | :param database: 37 | the path of the sqlite generated by the _telegram_backup_ java program 38 | :param http_only: 39 | when true, do not collect IP-addresses and `python.py` strings 40 | """ 41 | path = Path(database) 42 | assert path.is_file(), path 43 | 44 | def make_query(text_query: str) -> str: 45 | extra_criteria = "AND (M.has_media == 1 OR text LIKE '%http%')" if http_only else "" 46 | return dedent( 47 | f""" 48 | WITH entities AS ( 49 | SELECT 'dialog' as type 50 | , id 51 | , coalesce(username, id) as handle 52 | , coalesce(first_name || " " || last_name 53 | , username 54 | , id 55 | ) as display_name FROM users 56 | UNION 57 | SELECT 'group' as type 58 | , id 59 | , id as handle 60 | , coalesce(name, id) as display_name FROM chats 61 | ) 62 | SELECT src.display_name AS chatname 63 | , src.handle AS chat 64 | , snd.display_name AS sender 65 | , M.time AS time 66 | , {text_query} AS text 67 | , M.message_id AS mid 68 | FROM messages AS M 69 | /* chat types are 'dialog' (1-1), 'group' and 'supergroup' */ 70 | /* this is abit hacky way to handle all groups in one go */ 71 | LEFT JOIN entities AS src ON M.source_id = src.id AND src.type = (CASE M.source_type WHEN 'supergroup' THEN 'group' ELSE M.source_type END) 72 | LEFT JOIN entities AS snd ON M.sender_id = snd.id AND snd.type = 'dialog' 73 | WHERE 74 | M.message_type NOT IN ('service_message', 'empty_message') 75 | {extra_criteria} 76 | ORDER BY time; 77 | """) 78 | 79 | with sqlite_connection(path, immutable=True, row_factory='row') as db: 80 | # TODO yield error if chatname or chat or smth else is null? 81 | for row in db.execute(make_query('M.text')): 82 | try: 83 | yield from _handle_row(row) 84 | except Exception as ex: 85 | yield echain(RuntimeError(f'While handling {row}'), ex) 86 | 87 | # old (also 'stable') version doesn't have 'json' column yet... 88 | messages_columns = [d[0] for d in db.execute('SELECT * FROM messages').description] 89 | # todo hmm what is 'markup_json'?? 90 | if 'json' in messages_columns: 91 | for row in db.execute(make_query("json_extract(json, '$.media.webpage.description')")): 92 | try: 93 | yield from _handle_row(row) 94 | except Exception as ex: 95 | yield echain(RuntimeError(f'While handling {row}'), ex) 96 | 97 | 98 | def _handle_row(row: sqlite3.Row) -> Results: 99 | text = row['text'] 100 | if text is None: 101 | return 102 | urls = extract_urls(text) 103 | if len(urls) == 0: 104 | return 105 | dt = from_epoch(row['time']) 106 | mid: str = unwrap(row['mid']) 107 | 108 | # TODO perhaps we could be defensive with null sender/chat etc and still emit the Visit 109 | sender: str = unwrap(row['sender']) 110 | chatname: str = unwrap(row['chatname']) 111 | chat: str = unwrap(row['chat']) 112 | 113 | in_context = f'https://t.me/{chat}/{mid}' 114 | for u in urls: 115 | # https://www.reddit.com/r/Telegram/comments/6ufwi3/link_to_a_specific_message_in_a_channel_possible/ 116 | # hmm, only seems to work on mobile app, but better than nothing... 
117 | yield Visit( 118 | url=unquote(u), 119 | dt=dt, 120 | context=f"{sender}: {text}", 121 | locator=Loc.make( 122 | title=f"chat with {chatname}", 123 | href=in_context, 124 | ), 125 | ) 126 | -------------------------------------------------------------------------------- /src/promnesia/sources/twitter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Twitter data. 3 | ''' 4 | 5 | from collections.abc import Iterable 6 | 7 | from promnesia.common import Loc, Res, Results, Visit, extract_urls, logger 8 | 9 | 10 | def index() -> Results: 11 | from . import hpi # noqa: F401,I001 12 | import my.twitter.all as tw 13 | from my.twitter.archive import Tweet # todo extract to common or something? 14 | 15 | # TODO hmm. tweets themselves are sort of visits? not sure if they should contribute.. 16 | processed = 0 17 | tweets: Iterable[Res[Tweet]] = tw.tweets() 18 | for t in tweets: 19 | if isinstance(t, Exception): 20 | yield t 21 | continue 22 | 23 | processed += 1 24 | try: 25 | urls = t.urls 26 | except Exception as e: # just in case.. 27 | yield e 28 | urls = [] 29 | 30 | if len(urls) == 0: 31 | # if entities haven't detected anything it usually means RT or reply in my case, so worth trying again to extract 32 | # e.g. replies from json twitter takeouts don't seem to have entities set 33 | urls = extract_urls(t.text) 34 | # t.co refers to the retweeted tweet, so perhaps not very meaningful 35 | urls = [u for u in urls if '/t.co/' not in u] 36 | 37 | loc = Loc.make(title='twitter', href=t.permalink) 38 | for u in urls: 39 | yield Visit( 40 | url=u, 41 | dt=t.created_at, 42 | context=t.text, 43 | locator=loc, 44 | ) 45 | logger.info('processed %d tweets', processed) 46 | 47 | 48 | # ok, so it doesn't necessarily have everything in entities, eg. 49 | # { 50 | # "retweeted" : false, 51 | # "source" : "Twitter Web Client", 52 | # "entities" : { 53 | # "hashtags" : [ ], 54 | # "symbols" : [ ], 55 | # "user_mentions" : [ ], 56 | # "urls" : [ ] 57 | # }, 58 | # "full_text" : "http://old.slackware.ru/article.ghtml?ID=544 Забавно =)", 59 | # ... 60 | # } 61 | -------------------------------------------------------------------------------- /src/promnesia/sources/vcs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clones & indexes Git repositories (via sources.auto) 3 | ''' 4 | from __future__ import annotations 5 | 6 | import re 7 | from collections.abc import Iterable 8 | 9 | # TODO not sure if worth exposing... could be just handled by auto or something?) 10 | from pathlib import Path 11 | from subprocess import check_call 12 | 13 | from ..common import Extraction, PathIsh, get_tmpdir, slugify 14 | 15 | 16 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 17 | repo = str(path) 18 | 19 | # TODO this looks pretty horrible as a context name 20 | # perhaps pass context here since we know it should be github repo? 21 | tp = Path(get_tmpdir().name) / slugify(repo) 22 | # note: https://bugs.python.org/issue33617 , it doesn't like Path here on Windows 23 | check_call(['git', 'clone', repo, str(tp)]) 24 | 25 | def replacer(p: PathIsh, prefix: str=str(tp), repo: str=repo) -> str: 26 | ps = str(p) 27 | # TODO prefix is a bit misleading 28 | pos = ps.find(prefix) 29 | if pos == -1: 30 | # TODO not sure if should happen... 31 | return ps 32 | # TODO ugh. 
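# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What the replacer here is meant to produce: a path inside the temporary clone
# (with a ':line' suffix) mapped back to a github blame URL, matching the example
# mentioned in the comment below. The helper name is made up, for illustration only.
def _replacer_example() -> None:
    import re

    prefix = '/tmp/clone'
    repo = 'https://github.com/davidgasquez/handbook'
    ps = '/tmp/clone/README.md:25'
    rest = ps[ps.find(prefix) + len(prefix):]
    rest = re.sub(r':(\d+)$', r'#L\1', rest)   # patch the line number into an #L anchor
    assert repo + '/blame/master' + rest == 'https://github.com/davidgasquez/handbook/blame/master/README.md#L25'
# ---------------------------------------------------------------------------------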
seems that blame view https://github.com/davidgasquez/handbook/blame/master/README.md#L25 is the most reliable 33 | # in raw mode can't jump onto line, when markdown is renderend can't jump either 34 | rest = ps[pos + len(prefix):] 35 | rest = re.sub(r':(\d+)$', r'#L\1', rest) # patch line number... 36 | return repo + '/blame/master' + rest 37 | 38 | # TODO doesn't work for git: 39 | # TODO think about something more generic... this isn't too sustainable 40 | # TODO not sure if context should be local or github?... 41 | 42 | from . import auto 43 | yield from auto.index(tp, *args, replacer=replacer, **kwargs) 44 | -------------------------------------------------------------------------------- /src/promnesia/sources/website.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clones a website with wget and indexes via sources.auto 3 | ''' 4 | 5 | import re 6 | from collections.abc import Iterable 7 | from pathlib import Path 8 | from subprocess import run 9 | 10 | from promnesia.common import Extraction, PathIsh, get_logger, get_tmpdir, slugify 11 | 12 | 13 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 14 | logger = get_logger() 15 | url = str(path) 16 | 17 | # TODO better context name 18 | tp = Path(get_tmpdir().name) / slugify(url) 19 | 20 | # TODO careful, set some hard limit on data size? use --quota? 21 | # https://www.linuxjournal.com/content/downloading-entire-web-site-wget 22 | 23 | cmd = [ 24 | 'wget', '--directory-prefix', str(tp), 25 | '--no-verbose', 26 | '--recursive', 27 | '-A', 'html,html,txt', # TODO eh, ideally would use mime type I guess... 28 | '--no-parent', 29 | url, 30 | ] 31 | # TODO follow sitemap? e.g. gwern 32 | logger.info(' '.join(cmd)) 33 | res = run(cmd, check=False) 34 | 35 | if res.returncode == 8: 36 | # man wget: 8 means server error (e.g. broken link) 37 | yield RuntimeError('Encountered server error(s) during downloading') 38 | else: 39 | # rest of the errors are a bit more critical.. 40 | res.check_returncode() 41 | 42 | def replacer(p: PathIsh, prefix: str=str(tp), url: str=url) -> str: 43 | ps = str(p) 44 | pos = ps.find(prefix) 45 | if pos == -1: 46 | return ps 47 | rest = ps[pos + len(prefix):] 48 | # now this should look kinda like /domain.tld/rest (due to the way wget downloads stuff) 49 | rest = re.sub(r'/.*?/', '/', rest) 50 | return url + rest 51 | 52 | # TODO create a file that maps prefix? 53 | # TODO ugh. it creates a directory with a domain... how to map it to http/https properly? 54 | 55 | # TODO smarter html handling 56 | from . import auto 57 | yield from auto.index(tp, *args, replacer=replacer, **kwargs) 58 | -------------------------------------------------------------------------------- /src/promnesia/sources/zulip.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Zulip data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, iter_urls 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | import my.zulip.organization as Z 11 | 12 | for m in Z.messages(): 13 | if isinstance(m, Exception): 14 | yield m 15 | continue 16 | loc = Loc.make(title=f'{m.sender.full_name} mentioned', href=m.permalink) 17 | # todo if syntax is markdown, could extract title as well? 18 | content = m.content 19 | for u in iter_urls(content, syntax='markdown'): 20 | yield Visit( 21 | url=u, 22 | dt=m.sent, 23 | # TODO render as markdown? 
24 | context=content, 25 | locator=loc, 26 | ) 27 | -------------------------------------------------------------------------------- /src/promnesia/sqlite.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sqlite3 4 | from collections.abc import Iterator 5 | from contextlib import contextmanager 6 | from typing import Any, Callable, Literal, Union 7 | 8 | from .common import PathIsh 9 | 10 | # NOTE: copy pasted from HPI 11 | 12 | SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] 13 | 14 | def dict_factory(cursor, row): 15 | fields = [column[0] for column in cursor.description] 16 | return dict(zip(fields, row)) 17 | 18 | 19 | Factory = Union[SqliteRowFactory, Literal['row', 'dict']] 20 | 21 | @contextmanager 22 | def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Factory | None=None) -> Iterator[sqlite3.Connection]: 23 | dbp = f'file:{db}' 24 | # https://www.sqlite.org/draft/uri.html#uriimmutable 25 | if immutable: 26 | dbp = f'{dbp}?immutable=1' 27 | row_factory_: Any = None 28 | if row_factory is not None: 29 | if callable(row_factory): 30 | row_factory_ = row_factory 31 | elif row_factory == 'row': 32 | row_factory_ = sqlite3.Row 33 | elif row_factory == 'dict': 34 | row_factory_ = dict_factory 35 | else: 36 | raise RuntimeError("should not happen") 37 | 38 | conn = sqlite3.connect(dbp, uri=True) 39 | try: 40 | conn.row_factory = row_factory_ 41 | with conn: 42 | yield conn 43 | finally: 44 | # Connection context manager isn't actually closing the connection, only keeps transaction 45 | conn.close() 46 | -------------------------------------------------------------------------------- /src/promnesia/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/tests/__init__.py -------------------------------------------------------------------------------- /src/promnesia/tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gc 4 | import inspect 5 | import os 6 | import socket 7 | import sys 8 | from collections.abc import Iterator 9 | from contextlib import closing, contextmanager 10 | from pathlib import Path 11 | from textwrap import dedent 12 | from typing import NoReturn, TypeVar 13 | 14 | import pytest 15 | 16 | from ..common import Res, _is_windows 17 | 18 | 19 | def under_ci() -> bool: 20 | return 'CI' in os.environ 21 | 22 | 23 | def throw(x: Exception) -> NoReturn: 24 | ''' 25 | like raise, but can be an expression... 26 | ''' 27 | raise x 28 | 29 | 30 | @pytest.fixture 31 | def gc_control(*, gc_on: bool): 32 | if gc_on: 33 | # no need to do anything, should be on by default 34 | yield 35 | return 36 | 37 | gc.disable() 38 | try: 39 | yield 40 | finally: 41 | gc.enable() 42 | 43 | 44 | running_on_ci = 'CI' in os.environ 45 | 46 | 47 | GIT_ROOT = Path(__file__).absolute().parent.parent.parent.parent 48 | TESTDATA = GIT_ROOT / 'tests/testdata' 49 | 50 | 51 | def get_testdata(path: str) -> Path: 52 | assert TESTDATA.is_dir() 53 | res = TESTDATA / path 54 | if not res.exists(): 55 | raise RuntimeError(f"'{res}' not found! 
You propably need to run 'git submodule update --init --recursive'") 56 | return TESTDATA / path 57 | 58 | 59 | @contextmanager 60 | def tmp_popen(*args, **kwargs): 61 | import psutil 62 | 63 | with psutil.Popen(*args, **kwargs) as p: 64 | try: 65 | yield p 66 | finally: 67 | for c in p.children(recursive=True): 68 | c.kill() 69 | p.kill() 70 | p.wait() 71 | 72 | 73 | # meh 74 | def promnesia_bin(*args): 75 | # not sure it's a good idea to diverge, but not sure if there's a better way either? 76 | # ugh. on windows there is no bash so can't use the script 77 | # whatever... 78 | if under_ci() or _is_windows: 79 | # should be able to use the installed version 80 | return [sys.executable, '-m', 'promnesia', *args] 81 | else: 82 | # use version from the repository 83 | root = Path(__file__).parent.parent.parent.parent 84 | pm = root / 'scripts/promnesia' 85 | return [pm, *args] 86 | 87 | 88 | # meh... not great 89 | @pytest.fixture 90 | def reset_filters(): 91 | from .. import extract 92 | 93 | extract.filters.cache_clear() 94 | try: 95 | yield 96 | finally: 97 | extract.filters.cache_clear() 98 | 99 | 100 | # TODO could be a TypeGuard from 3.10 101 | V = TypeVar('V') 102 | 103 | 104 | def unwrap(r: Res[V]) -> V: 105 | assert not isinstance(r, Exception), r 106 | return r 107 | 108 | 109 | def write_config(path: Path, gen, **kwargs) -> None: 110 | output_dir = path.parent 111 | cfg_src = dedent('\n'.join(inspect.getsource(gen).splitlines()[1:])) + f"\nOUTPUT_DIR = r'{output_dir}'" 112 | for k, v in kwargs.items(): 113 | assert k in cfg_src, k 114 | cfg_src = cfg_src.replace(k, repr(str(v))) # meh 115 | path.write_text(cfg_src) 116 | 117 | 118 | @contextmanager 119 | def free_port() -> Iterator[int]: 120 | # this is a generator to make sure there are no race conditions between the time we call this and launch program 121 | # 122 | # also some relevant articles about this 'technique' 123 | # - https://eklitzke.org/binding-on-port-zero 124 | # - https://idea.popcount.org/2014-04-03-bind-before-connect 125 | # - https://blog.cloudflare.com/the-quantum-state-of-a-tcp-port 126 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 127 | if sys.platform == 'linux': 128 | # Ok, so from what I've been reading, SO_REUSEADDR should only be necessary in the program that reuses the port 129 | # However, this answer (or man socket) claims we need it on both sites in Linux? see https://superuser.com/a/587955/300795 130 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 131 | # also not sure where REUSEADDR is set in uvicorn (e.g. here reuse_address isn't passed?) 
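# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# The bind-to-port-0 trick free_port() relies on: the OS assigns a free ephemeral
# port, which is then read back via getsockname(). The helper name is made up.
def _free_port_example() -> None:
    import socket
    from contextlib import closing

    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))              # port 0 -> let the OS pick one
        port = s.getsockname()[1]
    assert 0 < port < 65536
# ---------------------------------------------------------------------------------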
132 | # https://github.com/encode/uvicorn/blob/6d666d99a285153bc4613e811543c39eca57054a/uvicorn/server.py#L162C37-L162C50 133 | # but from strace looks like it is called somewhere :shrug: 134 | 135 | # assign euphemeral port 136 | # see table in 137 | # https://stackoverflow.com/questions/14388706/how-do-so-reuseaddr-and-so-reuseport-differ/14388707#14388707 138 | # we rely on server binding to localhost later (or anything except 0.0.0.0 really) 139 | s.bind(('', 0)) 140 | 141 | port = s.getsockname()[1] 142 | yield port 143 | -------------------------------------------------------------------------------- /src/promnesia/tests/server_helper.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import time 5 | from collections.abc import Iterator 6 | from contextlib import contextmanager 7 | from dataclasses import dataclass 8 | from pathlib import Path 9 | from typing import Any 10 | 11 | import psutil 12 | import requests 13 | 14 | from ..common import PathIsh 15 | from .common import free_port, promnesia_bin, tmp_popen 16 | 17 | 18 | @dataclass 19 | class Helper: 20 | host: str 21 | port: str 22 | process: psutil.Popen 23 | 24 | def get(self, path: str): 25 | # check it's alive first so the error is cleaner 26 | assert self.process.poll() is None, self.process 27 | return requests.get(f'http://{self.host}:{self.port}' + path) 28 | 29 | def post(self, path: str, *, json: dict[str, Any] | None = None): 30 | assert self.process.poll() is None, self.process 31 | return requests.post(f'http://{self.host}:{self.port}' + path, json=json) 32 | 33 | 34 | @contextmanager 35 | def run_server(db: PathIsh | None = None, *, timezone: str | None = None) -> Iterator[Helper]: 36 | # TODO not sure, perhaps best to use a thread or something? 37 | # but for some tests makes more sense to test in a separate process 38 | with free_port() as pp: 39 | # ugh. under docker 'localhost' tries to bind it to ipv6 (::1) for some reason??? 40 | host = '0.0.0.0' if Path('/.dockerenv').exists() else 'localhost' 41 | port = str(pp) 42 | args = [ 43 | 'serve', 44 | '--host', host, 45 | '--quiet', 46 | '--port', port, 47 | *([] if timezone is None else ['--timezone', timezone]), 48 | *([] if db is None else ['--db' , str(db)]), 49 | ] 50 | with tmp_popen(promnesia_bin(*args)) as server_process: 51 | server = Helper(host=host, port=port, process=server_process) 52 | 53 | # wait till ready 54 | for _ in range(50): 55 | try: 56 | server.get('/status').json() 57 | break 58 | except: 59 | time.sleep(0.1) 60 | else: 61 | raise RuntimeError("Cooldn't connect to '{st}' after 50 attempts") 62 | print(f"Started server up, db: {db}", file=sys.stderr) 63 | 64 | yield server 65 | 66 | # TODO use logger! 
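# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# Typical use of run_server() from a test; /status is the same endpoint the
# readiness loop above polls. Assumes it returns a JSON object. The helper name is
# made up, for illustration only.
def _run_server_example() -> None:
    with run_server() as server:                 # starts 'promnesia serve' on a free port
        status = server.get('/status').json()
        assert isinstance(status, dict)
# ---------------------------------------------------------------------------------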
67 | print("Done with the server", file=sys.stderr) 68 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/tests/sources/__init__.py -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_auto.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import groupby 3 | 4 | from ...sources import auto 5 | from ..common import get_testdata, throw 6 | 7 | sa2464 = 'https://www.scottaaronson.com/blog/?p=2464' 8 | 9 | _JSON_URLS = { 10 | 'https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/', 11 | sa2464, 12 | } 13 | 14 | 15 | def makemap(visits): 16 | key = lambda v: v.url 17 | 18 | def it(): 19 | vit = (throw(v) if isinstance(v, Exception) else v for v in visits) 20 | for k, g in groupby(sorted(vit, key=key), key=key): 21 | yield k, sorted(g) 22 | 23 | return dict(it()) 24 | 25 | 26 | def test_json() -> None: 27 | mm = makemap(auto.index(get_testdata('auto'), ignored='*/orgs/*')) 28 | assert mm.keys() == _JSON_URLS 29 | 30 | # TODO not sure if they deserve separate visits.. 31 | [v1, v2] = mm[sa2464] 32 | assert v1.context == 'list::yyy::given_url' 33 | # todo not sure if editor:// work on Windows 34 | assert v1.locator.href.startswith('editor://') 35 | assert v1.locator.href.endswith('pocket.json') 36 | # TODO line number? 37 | 38 | 39 | def test_auto() -> None: 40 | mm = makemap(auto.index(get_testdata('auto'))) 41 | org_link = 'https://www.youtube.com/watch?v=rHIkrotSwcc' 42 | assert { 43 | *_JSON_URLS, 44 | org_link, 45 | }.issubset(mm.keys()) 46 | 47 | [v] = mm[org_link] 48 | assert v.locator.title == 'orgs' + os.sep + 'file.org:14' # meh 49 | assert v.locator.href.endswith('file.org:14') 50 | assert "xxx /r/cpp" in v.context 51 | assert "I've enjoyed [Chandler Carruth's" in v.context 52 | 53 | 54 | def test_obsidian() -> None: 55 | mm = makemap(auto.index(get_testdata('obsidian-vault'))) 56 | example_url = 'https://example.com' 57 | [v] = mm[example_url] 58 | assert v.locator.href.startswith('obsidian://') 59 | 60 | 61 | def test_logseq() -> None: 62 | mm = makemap(auto.index(get_testdata('logseq-graph'))) 63 | example_url = 'https://example.com' 64 | [v] = mm[example_url] 65 | assert v.locator.href.startswith('logseq://') 66 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_filetypes.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from ...common import PathIsh 4 | from ...common import _is_windows as windows 5 | from ...sources.auto import by_path 6 | 7 | 8 | def handled(p: PathIsh) -> bool: 9 | idx, m = by_path(Path(p)) 10 | return idx is not None 11 | # ideally these won't hit libmagic path (would try to open the file and cause FileNotFoundError) 12 | 13 | 14 | def test_filetypes() -> None: 15 | # test media 16 | for ext in 'avi mp4 mp3 webm'.split() + ([] if windows else 'mkv'.split()): 17 | assert handled('file.' + ext) 18 | 19 | # images 20 | for ext in 'gif jpg png jpeg'.split(): 21 | assert handled('file.' + ext) 22 | 23 | # TODO more granual checks that these are ignored? 
24 | # binaries 25 | for ext in 'o sqlite'.split() + ([] if windows else 'class jar'.split()): 26 | assert handled('file.' + ext) 27 | 28 | # these might have potentially some links 29 | for ext in [ 30 | 'svg', 31 | 'pdf', 'epub', 'ps', 32 | 'doc', 'ppt', 'xsl', 33 | # seriously, windows doesn't know about docx??? 34 | *([] if windows else 'docx pptx xlsx'.split()), 35 | *([] if windows else 'ods odt rtf'.split()), 36 | ] + ([] if windows else 'djvu'.split()): 37 | assert handled('file.' + ext) 38 | 39 | # source code 40 | for ext in 'rs tex el js sh hs pl h py hpp c go css'.split() + ([] if windows else 'java cpp'.split()): 41 | assert handled('file.' + ext) 42 | 43 | assert handled('x.html') 44 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_hypothesis.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from my.core.cfg import tmp_config 4 | 5 | from ...__main__ import do_index 6 | from ...database.load import get_all_db_visits 7 | from ..common import get_testdata, write_config 8 | 9 | 10 | def index_hypothesis(tmp_path: Path) -> None: 11 | def cfg() -> None: 12 | from promnesia.common import Source 13 | from promnesia.sources import hypothesis 14 | 15 | SOURCES = [Source(hypothesis.index, name='hyp')] # noqa: F841 16 | 17 | cfg_path = tmp_path / 'config.py' 18 | write_config(cfg_path, cfg) 19 | 20 | class hpi_config: 21 | class hypothesis: 22 | export_path = get_testdata('hypexport/testdata') / 'netrights-dashboard-mockup/data/*.json' 23 | 24 | with tmp_config(modules='my.hypothesis', config=hpi_config): 25 | do_index(cfg_path) 26 | 27 | 28 | def test_hypothesis(tmp_path: Path) -> None: 29 | index_hypothesis(tmp_path) 30 | 31 | visits = get_all_db_visits(tmp_path / 'promnesia.sqlite') 32 | assert len(visits) > 100 33 | 34 | [vis] = [x for x in visits if 'fundamental fact of evolution' in (x.context or '')] 35 | 36 | assert vis.norm_url == 'wired.com/2017/04/the-myth-of-a-superhuman-ai' 37 | assert vis.orig_url == 'https://www.wired.com/2017/04/the-myth-of-a-superhuman-ai/' 38 | assert vis.locator.href == 'https://hyp.is/_Z9ccmVZEeexBOO7mToqdg/www.wired.com/2017/04/the-myth-of-a-superhuman-ai/' 39 | assert 'misconception about evolution is fueling misconception about AI' in (vis.context or '') # contains notes as well 40 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_org.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ...common import Visit 4 | from ...sources.org import extract_from_file 5 | from ..common import get_testdata, throw 6 | 7 | 8 | def delrf(s: str | None) -> str | None: 9 | if s is None: 10 | return None 11 | # meh.. not sure how ot handle this properly, ideally should be via pytest? 12 | # not sure if should just do it in the indexer? e.g. extension might not like it 13 | return s.replace('\r', '') 14 | 15 | 16 | def test_org_indexer() -> None: 17 | [_, cpp, cozy] = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file.org'))] 18 | 19 | assert cpp.url == 'https://www.youtube.com/watch?v=rHIkrotSwcc' 20 | # TODO not sure about filetags? 21 | exp = ''' 22 | xxx /r/cpp :cpp:programming: 23 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 24 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 
25 | 26 | '''.lstrip() 27 | assert delrf(cpp.context) == exp 28 | 29 | assert cozy.url == 'https://twitter.com/Mappletons/status/1255221220263563269' 30 | 31 | 32 | def test_org_indexer_2() -> None: 33 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file3.org'))] 34 | 35 | assert len(items) == 6 36 | assert items[0].url == 'https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i' 37 | assert items[1].url == 'https://link.com' 38 | assert items[-2].url == 'https://en.wikipedia.org/wiki/Resilio_Sync' 39 | # TODO shit def need org specific url extractor (and then extract from everything remaining) 40 | # assert results[-1].url == 'https://en.wikipedia.org/wiki/InterPlanetary_File_System' 41 | 42 | 43 | def test_heading() -> None: 44 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file2.org'))] 45 | assert {i.url for i in items} == { 46 | 'https://en.wikipedia.org/wiki/Computational_topology', 47 | 'http://graphics.stanford.edu/courses/cs468-09-fall/', 48 | 'https://en.wikipedia.org/wiki/Triangulation_(topology)', 49 | 'https://en.wikipedia.org/wiki/Digital_manifold', 50 | } 51 | 52 | 53 | def test_url_in_properties() -> None: 54 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file4.org'))] 55 | 56 | assert len(items) == 2, items 57 | assert items[0].url == 'https://example.org/ref_example' 58 | assert items[1].url == 'http://example.org/a_test' 59 | 60 | 61 | def test_5() -> None: 62 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file5.org'))] 63 | 64 | assert len(items) == 0 # shouldn't crash at least 65 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_plaintext.py: -------------------------------------------------------------------------------- 1 | from ...common import Source 2 | from ...extract import extract_visits 3 | from ...sources import plaintext, shellcmd 4 | from ..common import get_testdata, unwrap 5 | 6 | 7 | def test_plaintext_path_extractor() -> None: 8 | visits = list( 9 | extract_visits( 10 | Source( 11 | shellcmd.index, 12 | plaintext.extract_from_path(get_testdata('custom')), 13 | ), 14 | src='whatever', 15 | ) 16 | ) 17 | assert {unwrap(v).orig_url for v in visits} == { 18 | 'http://google.com', 19 | 'http://google.com/', 20 | 'http://some-weird-domain.xyz/whatever', 21 | 'https://google.com', 22 | 'http://what.about.this.link', 23 | } 24 | 25 | [wa] = [v for v in visits if unwrap(v).orig_url == 'http://what.about.this.link'] 26 | f2 = get_testdata('custom') / 'file2.txt' 27 | assert unwrap(wa).locator.href == f'editor://{f2}:3' # occurs line 3 28 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_shellcmd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ...common import Source, _is_windows 4 | from ...extract import extract_visits 5 | from ...sources import shellcmd 6 | from ..common import get_testdata 7 | 8 | 9 | @pytest.mark.skipif(_is_windows, reason="no grep on windows") 10 | def test_via_grep() -> None: 11 | 12 | visits = list( 13 | extract_visits( 14 | Source( 15 | shellcmd.index, 16 | # meh. maybe should deprecate plain string here... 
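# [editor's note: not part of the original test] the "plain string" the comment above refers to is a
# raw shell command handed to shellcmd.index; test_plaintext.py above exercises the same source via
# plaintext.extract_from_path(...) instead of a hand-written grep invocation, which is presumably the
# preferred spelling if the plain-string form ever gets deprecated.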
17 | r"""grep -Eo -r --no-filename (http|https)://\S+ """ + str(get_testdata('custom')), 18 | ), 19 | src='whatever', 20 | ) 21 | ) 22 | # TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode) 23 | assert len(visits) == 5 24 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_takeout.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import pytest 4 | from my.core.cfg import tmp_config 5 | 6 | from ...common import Source 7 | from ...extract import extract_visits 8 | from ...sources import takeout 9 | from ..common import get_testdata, unwrap 10 | 11 | 12 | # TODO apply in conftest so it's used in all tests? 13 | @pytest.fixture 14 | def no_cachew(): 15 | from my.core.cachew import disabled_cachew 16 | 17 | with disabled_cachew(): 18 | yield 19 | 20 | 21 | # todo testing this logic probably belongs to hpi or google_takeout_export, but whatever 22 | def test_takeout_directory(no_cachew) -> None: 23 | class config: 24 | class google: 25 | takeout_path = get_testdata('takeout') 26 | 27 | with tmp_config(modules='my.google.takeout.*', config=config): 28 | visits = list(extract_visits(Source(takeout.index), src='takeout')) 29 | 30 | assert len(visits) == 3 31 | assert all(unwrap(v).dt.tzinfo is not None for v in visits) 32 | 33 | 34 | def test_takeout_zip(no_cachew) -> None: 35 | class config: 36 | class google: 37 | takeout_path = get_testdata('takeout-20150518T000000Z.zip') 38 | 39 | with tmp_config(modules='my.google.takeout.*', config=config): 40 | visits = list(extract_visits(Source(takeout.index), src='takeout')) 41 | 42 | assert len(visits) == 3 43 | assert all(unwrap(v).dt.tzinfo is not None for v in visits) 44 | 45 | [vis] = [v for v in visits if unwrap(v).norm_url == 'takeout.google.com/settings/takeout'] 46 | 47 | edt = datetime( 48 | year=2018, 49 | month=9, 50 | day=18, 51 | hour=5, 52 | minute=48, 53 | second=23, 54 | tzinfo=timezone.utc, 55 | ) 56 | assert unwrap(vis).dt == edt 57 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pytest 5 | import requests 6 | 7 | from ..common import _is_windows 8 | from .common import get_testdata, promnesia_bin, tmp_popen 9 | 10 | ox_hugo_data = get_testdata('ox-hugo/test/site') 11 | 12 | 13 | def test_demo() -> None: 14 | if _is_windows: 15 | # for some reason fails to connect to server.. 16 | # not sure maybe something with port choice idk 17 | pytest.skip("TODO broken on Windows") 18 | 19 | with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)): 20 | # TODO why does it want post?? 21 | time.sleep(2) # meh.. need a generic helper to wait till ready... 
22 | res = {} 23 | for _attempt in range(30): 24 | time.sleep(1) 25 | try: 26 | res = requests.post( 27 | "http://localhost:16789/search", 28 | json={'url': "https://github.com/kaushalmodi/ox-hugo/issues"}, 29 | ).json() 30 | break 31 | except: 32 | continue 33 | else: 34 | raise RuntimeError("Couldn't connect to the server") 35 | vis = res['visits'] 36 | assert len(vis) > 50, vis 37 | mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)] 38 | orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))] 39 | assert len(mds) == 1 40 | assert len(orgs) == 1 41 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_compare.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | from ..compare import compare_files 5 | from .utils import index_urls 6 | 7 | 8 | def test_compare(tmp_path: Path) -> None: 9 | idx = index_urls({ 10 | 'https://example.com': None, 11 | 'https://en.wikipedia.org/wiki/Saturn_V': None, 12 | 'https://plato.stanford.edu/entries/qualia': None, 13 | }) 14 | idx(tmp_path) 15 | db = tmp_path / 'promnesia.sqlite' 16 | old_db = tmp_path / 'promnesia-old.sqlite' 17 | shutil.move(str(db), str(old_db)) 18 | 19 | idx2 = index_urls({ 20 | 'https://example.com': None, 21 | 'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None, 22 | 'https://en.wikipedia.org/wiki/Saturn_V': None, 23 | 'https://plato.stanford.edu/entries/qualia': None, 24 | }) 25 | idx2(tmp_path) 26 | 27 | # should not crash, as there are more links in the new database 28 | assert len(list(compare_files(old_db, db))) == 0 29 | 30 | assert len(list(compare_files(db, old_db))) == 1 31 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_extract.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import pytest 4 | from more_itertools import ilen 5 | 6 | from ..common import DbVisit, Loc, Source, Visit 7 | from ..extract import extract_visits 8 | from .common import ( 9 | gc_control, # noqa: F401 10 | get_testdata, 11 | running_on_ci, 12 | unwrap, 13 | ) 14 | 15 | 16 | def test_with_error() -> None: 17 | class ExtractionError(Exception): 18 | pass 19 | 20 | def indexer(): 21 | yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever')) 22 | yield ExtractionError() 23 | yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever')) 24 | 25 | [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever') 26 | assert isinstance(v1, DbVisit) 27 | assert isinstance(e, Exception) 28 | assert isinstance(v2, DbVisit) 29 | 30 | 31 | def test_urls_are_normalised() -> None: 32 | # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db 33 | # TODO maybe this should be a separate test which takes DbVisit.make separately? 
34 | # especially to decouple from shellcmd source 35 | from ..sources import shellcmd 36 | from ..sources.plaintext import extract_from_path 37 | 38 | visits = list( 39 | extract_visits( 40 | source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))), 41 | src='whatever', 42 | ) 43 | ) 44 | assert len(visits) == 7 45 | 46 | assert {unwrap(v).norm_url for v in visits} == { 47 | 'hi.com', 48 | 'reddit.com/post', 49 | 'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay', 50 | 'youtube.com/watch?v=XXlZfc1TrD0', 51 | 'youtube.com/watch?v=XXlZfc1Tr11', 52 | } 53 | 54 | 55 | @pytest.mark.parametrize('count', [99, 100_000, 1_000_000]) 56 | @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) 57 | def test_benchmark(count: int, gc_control) -> None: 58 | # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation 59 | # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark 60 | if count > 99 and running_on_ci: 61 | pytest.skip("test would be too slow on CI, only meant to run manually") 62 | 63 | from ..sources import demo 64 | 65 | source = Source(demo.index, count=count) 66 | 67 | total = ilen(extract_visits(source=source, src='whatever')) 68 | assert total == count # sanity check 69 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_extract_urls.py: -------------------------------------------------------------------------------- 1 | from ..common import extract_urls 2 | 3 | 4 | def test_extract_simple() -> None: 5 | lines = """ 6 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 7 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 8 | """.strip() 9 | assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'} 10 | 11 | 12 | def test_extract_2() -> None: 13 | text = '''‍♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения. 14 | Если для вас это новое словосочетание, начните с книжки 15 | "Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах 16 | lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics 17 | ''' 18 | assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'} 19 | 20 | 21 | def test_extract_md() -> None: 22 | lines = ''' 23 | Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you! 24 | ''' 25 | assert set(extract_urls(lines)) == { 26 | 'addons.mozilla.org', 27 | 'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/', 28 | 'https://github.com/karlicoss/grasp', 29 | } 30 | 31 | 32 | # just random links to test multiline/whitespace behaviour 33 | def test_extract_3() -> None: 34 | lines = ''' 35 | python.org/one.html ?? 
https://python.org/two.html some extra text 36 | 37 | whatever.org 38 | ''' 39 | assert set(extract_urls(lines, syntax='org')) == { 40 | 'python.org/one.html', 41 | 'https://python.org/two.html', 42 | 'whatever.org', 43 | } 44 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_traverse.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from ..common import traverse 4 | from .common import get_testdata 5 | 6 | testDataPath = get_testdata('traverse') 7 | 8 | 9 | # Patch shutil.which so it always returns false (when trying to which fdfind, etc) 10 | # so that it falls back to find 11 | @patch('promnesia.common.shutil.which', return_value=False) 12 | def test_traverse_ignore_find(patched) -> None: 13 | ''' 14 | traverse() with `find` but ignore some stuff 15 | ''' 16 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 17 | 18 | assert paths == {testDataPath / 'imhere2/real.txt', testDataPath / 'imhere.txt'} 19 | 20 | 21 | def test_traverse_ignore_fdfind(): 22 | ''' 23 | traverse() with `fdfind` but ignore some stuff 24 | ''' 25 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 26 | 27 | assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'} 28 | 29 | 30 | # TODO: It would be nice to test the implementation directly without having to do this 31 | # weird patching in the future 32 | @patch('promnesia.common._is_windows', new_callable=lambda: True) 33 | def test_traverse_ignore_windows(patched) -> None: 34 | ''' 35 | traverse() with python when _is_windows is true but ignore some stuff 36 | ''' 37 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 38 | 39 | assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'} 40 | -------------------------------------------------------------------------------- /src/promnesia/tests/utils.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Mapping, Sequence 2 | from datetime import datetime, timedelta 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from ..common import Loc, Source, Visit 7 | from ..database.dump import visits_to_sqlite 8 | from ..extract import extract_visits 9 | 10 | # TODO a bit shit... why did I make it dict at first?? 
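# [editor's note: illustrative aside, not part of the original file] Urls below accepts either a
# mapping from url to (optional) context, or a sequence of (url, context) pairs -- index_urls()
# normalises both via `list(urls.items()) if isinstance(urls, dict) else urls`. Both of these are
# valid (values are made up):
#
#     index_urls({'https://example.com': 'some context', 'https://other.org': None})
#     index_urls([('https://example.com', 'ctx one'), ('https://example.com', 'ctx two')])
#
# the sequence form can express repeated urls, which a dict cannot -- likely the reason behind the
# "why did I make it dict at first??" comment above.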
11 | Urls = Union[ 12 | Mapping[str, Optional[str]], 13 | Sequence[tuple[str, Optional[str]]], 14 | ] 15 | 16 | 17 | def index_urls(urls: Urls, *, source_name: str = 'test'): 18 | uuu = list(urls.items()) if isinstance(urls, dict) else urls 19 | 20 | def idx(tmp_path: Path) -> None: 21 | def indexer(): 22 | for i, (url, ctx) in enumerate(uuu): 23 | yield Visit( 24 | url=url, 25 | dt=datetime.min + timedelta(days=5000) + timedelta(hours=i), 26 | locator=Loc.make('test'), 27 | context=ctx, 28 | ) 29 | 30 | db_visits = extract_visits(source=Source(indexer), src=source_name) 31 | errors = visits_to_sqlite(vit=db_visits, overwrite_db=True, _db_path=tmp_path / 'promnesia.sqlite') 32 | 33 | assert len(errors) == 0, errors 34 | 35 | return idx 36 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import sys 5 | import time 6 | from contextlib import contextmanager 7 | from functools import wraps 8 | from pathlib import Path 9 | from typing import Iterator, TypeVar 10 | 11 | import pytest 12 | import requests 13 | 14 | from promnesia.tests.common import free_port 15 | 16 | 17 | def has_x() -> bool: 18 | # meh, not very portable, but good enough for now 19 | return 'DISPLAY' in os.environ 20 | 21 | 22 | def under_ci() -> bool: 23 | return 'CI' in os.environ 24 | 25 | 26 | def skip_if_ci(reason): 27 | return pytest.mark.skipif(under_ci(), reason=reason) 28 | 29 | 30 | def uses_x(f): 31 | @skip_if_ci('Uses X server') 32 | @wraps(f) 33 | def ff(*args, **kwargs): 34 | return f(*args, **kwargs) 35 | 36 | return ff 37 | 38 | 39 | @contextmanager 40 | def tmp_popen(*args, **kwargs): 41 | import psutil 42 | 43 | with psutil.Popen(*args, **kwargs) as p: 44 | try: 45 | yield p 46 | finally: 47 | for c in p.children(recursive=True): 48 | c.kill() 49 | p.kill() 50 | p.wait() 51 | 52 | 53 | @contextmanager 54 | def local_http_server(path: Path) -> Iterator[str]: 55 | address = '127.0.0.1' 56 | with ( 57 | free_port() as port, 58 | tmp_popen([sys.executable, '-m', 'http.server', '--directory', path, '--bind', address, str(port)]) as popen, 59 | ): 60 | endpoint = f'http://{address}:{port}' 61 | 62 | # meh.. but not sure if there is a better way to find out whether it's ready to serve requests 63 | for _attempt in range(50): 64 | try: 65 | requests.get(endpoint) 66 | except: 67 | time.sleep(0.05) 68 | continue 69 | else: 70 | break 71 | yield endpoint 72 | 73 | 74 | T = TypeVar('T') 75 | 76 | 77 | def notnone(x: T | None) -> T: 78 | assert x is not None 79 | return x 80 | -------------------------------------------------------------------------------- /tests/convert_screencast.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | from subprocess import check_call 5 | 6 | 7 | def convert(path: Path): 8 | suf = '.mp4' 9 | if path.suffix == suf: 10 | # makes it easier for shell globbing... 11 | path = path.with_suffix('') 12 | 13 | inp = path.with_suffix(suf) 14 | assert inp.exists(), inp 15 | subs = path.with_suffix('.ssa') 16 | webm = path.with_suffix('.webm') 17 | 18 | 19 | # jeez... https://video.stackexchange.com/a/28276/29090 20 | # otherwise quiality sucks, e.g. letters are grainy 21 | # 22 | # ok, nice guide.. 
https://gist.github.com/Vestride/278e13915894821e1d6f#convert-to-webm 23 | # 24 | passfile = path.with_suffix(".pass0") 25 | for stage in [ 26 | f'-b:v 0 -crf 30 -pass 1 -passlogfile {passfile} -an -f webm /dev/null', 27 | f'-b:v 0 -crf 30 -pass 2 -passlogfile {passfile} {webm}' if all( 28 | x not in str(inp) for x in ( 29 | # fucking hell, it segfaults... 30 | 'child-visits-2', 31 | 'highlights', 32 | )) else str(webm), 33 | ]: 34 | check_call([ 35 | 'ffmpeg', 36 | # TODO display banner if running interactively?? 37 | # '-hide_banner', '-loglevel', 'panic', # less spam 38 | '-y', # allow overwrite 39 | '-i', inp, 40 | '-vf', f"ass={subs}", 41 | *stage.split(), 42 | ]) # TODO cwd?? 43 | 44 | 45 | if __name__ == '__main__': 46 | paths = list(map(Path, sys.argv[1:])) 47 | from concurrent.futures import ThreadPoolExecutor 48 | with ThreadPoolExecutor() as pool: 49 | for _ in pool.map(convert, paths): 50 | # need to force the iterator 51 | pass 52 | -------------------------------------------------------------------------------- /tests/install_and_run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | from subprocess import check_call, check_output 4 | import time 5 | import json 6 | import os 7 | from tempfile import TemporaryDirectory 8 | 9 | 10 | # TODO reuse example config? 11 | CONFIG = """ 12 | 13 | OUTPUT_DIR = {output_dir} 14 | 15 | SOURCES = [ 16 | 'promnesia.sources.demo', 17 | ] 18 | 19 | """ 20 | 21 | def systemctl(*args): 22 | check_call([ 23 | 'systemctl', '--no-pager', '--user', *args, 24 | ]) 25 | 26 | 27 | # TODO do it in pipenv? 28 | def run(tdir: Path): 29 | cfg = CONFIG.format(output_dir=f'"{tdir}"') 30 | cfg_file = tdir / 'config.py' 31 | cfg_file.write_text(cfg) 32 | 33 | 34 | promnesia = Path(__file__).absolute().parent.parent / 'scripts/promnesia' 35 | 36 | check_call([promnesia, 'index', '--config', cfg_file]) 37 | 38 | check_call([ 39 | promnesia, 'install-server', 40 | '--name' , 'promnesia-test', # should add .serice arg 41 | '--db', str(tdir / 'promnesia.sqlite'), 42 | '--timezone', 'Europe/Moscow', 43 | '--port', '17777', # TODO get free port? 44 | ]) 45 | 46 | response = None 47 | for x in range(10): 48 | time.sleep(1) 49 | try: 50 | response = json.loads(check_output([ 51 | 'curl', 'localhost:17777/status', '--data', '', 52 | ]).decode('utf8')) 53 | break 54 | except Exception as e: 55 | print(str(e)) 56 | assert response is not None 57 | 58 | response = json.loads(check_output([ 59 | 'curl', 'localhost:17777/status', '--data', '', 60 | ]).decode('utf8')) 61 | 62 | print(response) 63 | assert response['db'] == str(tdir / 'promnesia.sqlite') 64 | 65 | time.sleep(1) 66 | systemctl('is-active', 'promnesia-test.service') 67 | print("Test succeeded!") 68 | 69 | # TODO prompt for cleanup? 
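# [editor's note: illustrative sketch, not part of the original script] the prompt the TODO above
# asks for could be as simple as (made-up wording):
#
#     if input("stop and disable promnesia-test.service? [y/N] ").strip().lower() != 'y':
#         return
#
# for now the script always cleans up unconditionally: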
70 | systemctl('stop' , 'promnesia-test.service') 71 | systemctl('disable', 'promnesia-test.service') 72 | 73 | 74 | def main(): 75 | with TemporaryDirectory() as tdir: 76 | run(Path(tdir)) 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /tests/record.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from pathlib import Path 3 | import re 4 | from time import sleep 5 | from subprocess import Popen, check_output 6 | from typing import Optional, List, Union 7 | 8 | # TODO decorator that records a video if a certain env var/flag is set (pass a custom name too) 9 | 10 | @contextmanager 11 | def hotkeys(geometry: Optional[str]=None): 12 | # TODO kill in advance?? 13 | ctx = Popen([ 14 | 'screenkey', 15 | '--no-detach', 16 | '--key-mode', 'composed', 17 | '--scr', '0', 18 | '--timeout', '2', 19 | '--bg-color', '#000000', 20 | '--font-color', '#ffffff', 21 | '--font-size', 'large', 22 | '--opacity', '0.6', 23 | # TODO hmm. it has --persist arg, but no --no-persist?? 24 | *([] if geometry is None else ['-g', geometry]), 25 | ]) 26 | with ctx as p: 27 | try: 28 | yield p 29 | finally: 30 | p.kill() 31 | 32 | 33 | 34 | @contextmanager 35 | def record(output: Optional[Path]=None, wid: Optional[str]=None, quality: Optional[str]=None): 36 | assert output is not None, "TODO use tmp file or current dir??" 37 | # TODO to fullscreen if None? 38 | assert wid is not None 39 | 40 | 41 | # ugh. no idea wtf is happening here... why is position 2,90?? 42 | # wmctrl -i -r 230686723 -e '0,0,0,400,400' 43 | # xdotool getwindowgeometry 230686723 44 | # Window 230686723 45 | # Position: 2,90 (screen: 0) 46 | # Geometry: 400x400 47 | # Position + Geometry don't add up to the screen size. fuck. 48 | # 49 | # ok, xwininfo seems more reliable 50 | # 51 | # xwininfo -id $(xdotool getactivewindow)' 52 | out = check_output(['xwininfo', '-id', wid]).decode('utf8').replace('\n', ' ') 53 | m = re.search(r'geometry (\d+)x(\d+)[+-](\d+)[+-](\d+)', out) 54 | assert m is not None, out 55 | w, h, x, y = m.groups() 56 | 57 | # fuck. 58 | titlebar = 32 59 | 60 | # fuck x 2 61 | margin = 28 62 | 63 | cmd: List[Union[Path, str]] = [ 64 | 'ffmpeg', 65 | '-hide_banner', '-loglevel', 'panic', # less spam in the terminal 66 | '-f', 'x11grab', 67 | '-y', 68 | '-r', '30', 69 | '-s', f'{w}x{titlebar + int(h)}', 70 | '-i', f':0.0+{x},{margin + int(y)}', 71 | output, 72 | ] 73 | # TODO not sure if need converter script 74 | # TODO double check there are no ffmpeg processes remaining? 75 | # maybe, set timeout? 76 | 77 | with Popen(cmd) as p: 78 | # early check 79 | sleep(0.5) 80 | assert p.poll() is None, f'{cmd} died!' 81 | 82 | try: 83 | yield p 84 | finally: 85 | assert p.poll() is None, f'{cmd} died!' 
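# [editor's note: not part of the original file] terminate() below sends SIGTERM rather than kill(),
# presumably so ffmpeg gets a chance to flush and finalize the output file; the wait(timeout=10)
# that follows guards against it hanging indefinitely.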
86 | 87 | p.terminate() 88 | p.wait(timeout=10) 89 | 90 | 91 | # https://stackoverflow.com/a/52669454/706389 92 | CURSOR_SCRIPT = ''' 93 | function enableCursor() { 94 | var seleniumFollowerImg = document.createElement("img"); 95 | seleniumFollowerImg.setAttribute('src', 'data:image/png;base64,' 96 | + 'iVBORw0KGgoAAAANSUhEUgAAABQAAAAeCAQAAACGG/bgAAAAAmJLR0QA/4ePzL8AAAAJcEhZcwAA' 97 | + 'HsYAAB7GAZEt8iwAAAAHdElNRQfgAwgMIwdxU/i7AAABZklEQVQ4y43TsU4UURSH8W+XmYwkS2I0' 98 | + '9CRKpKGhsvIJjG9giQmliHFZlkUIGnEF7KTiCagpsYHWhoTQaiUUxLixYZb5KAAZZhbunu7O/PKf' 99 | + 'e+fcA+/pqwb4DuximEqXhT4iI8dMpBWEsWsuGYdpZFttiLSSgTvhZ1W/SvfO1CvYdV1kPghV68a3' 100 | + '0zzUWZH5pBqEui7dnqlFmLoq0gxC1XfGZdoLal2kea8ahLoqKXNAJQBT2yJzwUTVt0bS6ANqy1ga' 101 | + 'VCEq/oVTtjji4hQVhhnlYBH4WIJV9vlkXLm+10R8oJb79Jl1j9UdazJRGpkrmNkSF9SOz2T71s7M' 102 | + 'SIfD2lmmfjGSRz3hK8l4w1P+bah/HJLN0sys2JSMZQB+jKo6KSc8vLlLn5ikzF4268Wg2+pPOWW6' 103 | + 'ONcpr3PrXy9VfS473M/D7H+TLmrqsXtOGctvxvMv2oVNP+Av0uHbzbxyJaywyUjx8TlnPY2YxqkD' 104 | + 'dAAAAABJRU5ErkJggg=='); 105 | seleniumFollowerImg.setAttribute('id', 'selenium_mouse_follower'); 106 | seleniumFollowerImg.setAttribute('style', 'position: absolute; z-index: 99999999999; pointer-events: none; left:0; top:0'); 107 | document.body.appendChild(seleniumFollowerImg); 108 | document.onmousemove = function (e) { 109 | document.getElementById("selenium_mouse_follower").style.left = e.pageX + 'px'; 110 | document.getElementById("selenium_mouse_follower").style.top = e.pageY + 'px'; 111 | }; 112 | }; 113 | 114 | enableCursor(); 115 | ''' 116 | 117 | 118 | # https://stackoverflow.com/a/987376/706389 119 | SELECT_SCRIPT = ''' 120 | function selectText(node) { 121 | if (document.body.createTextRange) { 122 | const range = document.body.createTextRange(); 123 | range.moveToElementText(node); 124 | range.select(); 125 | } else if (window.getSelection) { 126 | const selection = window.getSelection(); 127 | const range = document.createRange(); 128 | range.selectNodeContents(node); 129 | selection.removeAllRanges(); 130 | selection.addRange(range); 131 | } else { 132 | console.warn("Could not select text in node: Unsupported browser."); 133 | } 134 | } 135 | ''' 136 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file.org: -------------------------------------------------------------------------------- 1 | * TODO [#C] figure out 2 | :PROPERTIES: 3 | :CREATED: [2018-08-06 Mon 22:52] 4 | :END: 5 | 6 | most important 7 | 8 | * [2019-05-10 Fri 17:20] [[https://reddit.com/r/bodyweightfitness/comments/bl7nyy/how_i_learned_to_handstand_about_5_minutes_ago/][How I learned to handstand about 5 minutes ago, after trying for around a year. A surprising method you maybe haven't tried.]] /r/bodyweightfitness 9 | 10 | This whole time I've been trying to keep myself up, when you're really supposed to be preventing the fall. This exercise gets you to use the strongest muscles in this exercise (shoulders) to prevent your torso falling down. Whereas I think previously I, and a lot of people, would be trying to balance the body mostly with the hands, and the position of the legs, if that makes sense. 11 | 12 | Anyway, hope it helps someone. 13 | 14 | * TODO [#C] [2019-10-16 Wed 08:28] xxx /r/cpp :programming:cpp: 15 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 16 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 17 | 18 | 19 | * something... 
20 | 21 | - one 22 | - zwei 23 | [[https://twitter.com/Mappletons/status/1255221220263563269][cozyweb]] 24 | - drei 25 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file2.org: -------------------------------------------------------------------------------- 1 | #+FILETAGS: topology 2 | 3 | simulations/visualisations of fundamental group 4 | 5 | https://en.wikipedia.org/wiki/Computational_topology 6 | 7 | http://graphics.stanford.edu/courses/cs468-09-fall/ 8 | hmm wonder if that does it. they mention triangulation. 9 | 10 | https://en.wikipedia.org/wiki/Triangulation_(topology) 11 | https://en.wikipedia.org/wiki/Digital_manifold 12 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file3.org: -------------------------------------------------------------------------------- 1 | * [2016-05-14 Sat 15:33] [[https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i][sc4s2cg comments on How you use your android to the maximum?]] :android: 2 | 3 | * something 4 | https://link.com 5 | 6 | * [2019-05-14 Tue 20:26] [[https://www.instapaper.com/read/1193274157][ip]] [[https://blog.andymatuschak.org/post/169043084412/successful-habits-through-smoothly-ratcheting][Successful habits through smoothly ratcheting targets]] 7 | 8 | 9 | * fewf 10 | 11 | ** [2019-05-03 Fri 08:29] apparently [[https://en.wikipedia.org/wiki/Resilio_Sync][Resilio Sync]] exists, but it's proprietary, nothing else I know of or resulting from quick googling 12 | ** [2019-06-13 Thu 19:45] [[https://en.wikipedia.org/wiki/InterPlanetary_File_System][IPFS]] looks close, but appparently not user friendly yet 13 | 14 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file4.org: -------------------------------------------------------------------------------- 1 | :PROPERTIES: 2 | :ID: 1554c141-9567-4345-99d9-7c5e2853dbaa 3 | :ROAM_REFS: https://example.org/ref_example 4 | :END: 5 | #+title: A Ref Example 6 | 7 | We need [[http://example.org/a_test][a test]]! 
8 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file5.org: -------------------------------------------------------------------------------- 1 | * sexp in property 2 | :PROPERTIES: 3 | :CREATED: <%%(diary-date 03 25 2023)> 4 | :END: 5 | -------------------------------------------------------------------------------- /tests/testdata/auto/pocket.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": 1, 3 | "complete": 1, 4 | "list": { 5 | "xxx": { 6 | "given_url": "https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/", 7 | "given_title": "Struggles with the Continuum (Part 2) | Azimuth", 8 | "favorite": "0", 9 | "status": "0", 10 | "sort_id": 1, 11 | "resolved_title": "Struggles with the Continuum (Part 2)" 12 | }, 13 | "yyy": { 14 | "given_url": "https://www.scottaaronson.com/blog/?p=2464", 15 | "given_title": "Bell inequality violation finally done right", 16 | "favorite": "0", 17 | "sort_id": 2, 18 | "resolved_title": "Bell inequality violation finally done right", 19 | "resolved_url": "https://www.scottaaronson.com/blog/?p=2464", 20 | "excerpt": "A few weeks ago, Hensen et al., of the Delft University of Technology and Barcelona, Spain, put out a paper reporting the first experiment that violates the Bell inequality in a way that closes off the two main loopholes simultaneously: the locality and detection loopholes.", 21 | "is_article": "1" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/testdata/custom/file1.txt: -------------------------------------------------------------------------------- 1 | Right, so this points at http://google.com 2 | 3 | 4 | something something 5 | 6 | trailing slash maybe? http://google.com/ ? 7 | 8 | 9 | whoops! http://some-weird-domain.xyz/whatever 10 | -------------------------------------------------------------------------------- /tests/testdata/custom/file2.txt: -------------------------------------------------------------------------------- 1 | And this points at https://google.com . Whoa, so secure! 2 | 3 | and http://what.about.this.link? Really, you should add a space before ?, :123: to confuse grep but what if you didnt? 4 | 5 | this should be ignored chrome-extension://bfhcmckmbimgclmdomhanencdoefcnio/options_page.html since it's an internal brwoser link 6 | -------------------------------------------------------------------------------- /tests/testdata/logseq-graph/logseq/config.edn: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/testdata/logseq-graph/pages/Note.md: -------------------------------------------------------------------------------- 1 | This is a test note with a [link](https://example.com). 2 | -------------------------------------------------------------------------------- /tests/testdata/normalise/ff.txt: -------------------------------------------------------------------------------- 1 | http://hi.com 2 | 3 | 4 | http://reddit.com/post 5 | 6 | http://reddit.com/post&stupid_param=whatever 7 | 8 | 9 | # this v isn't detected at the moment because of the typo in http. Not sure if it ever will be? 
10 | htpp://reddit.com/post?whoops 11 | 12 | http://www.argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay 13 | 14 | http://www.argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay?storeId=10151&GiftMode=Reset&langId=110&krypto=DfKzD/frV1rz5gXFhfkTSOJ/+Fphcd/Mx/H5+m5Jfbp5UlOUllPqDwFbO94lNbtFaEXhWQ7bVqIl\nbqhTqO1zcQ7FXphHXYAO3bbj07XbbDf40pQX5mQFACOPRF0LPibBG6yqBP0RDWQWUl7vcgTmqA== 15 | 16 | 17 | https://www.youtube.com/watch?v=XXlZfc1TrD0 18 | 19 | https://www.youtube.com/watch?v=XXlZfc1Tr11 20 | -------------------------------------------------------------------------------- /tests/testdata/obsidian-vault/.obsidian/app.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/testdata/obsidian-vault/Note.md: -------------------------------------------------------------------------------- 1 | This is a test note with a [link](https://example.com). 2 | -------------------------------------------------------------------------------- /tests/testdata/takeout-20150518T000000Z.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/tests/testdata/takeout-20150518T000000Z.zip -------------------------------------------------------------------------------- /tests/testdata/takeout/Takeout/My Activity/Chrome/MyActivity.html: -------------------------------------------------------------------------------- 1 | My Activity History
[markup stripped in this dump] The fixture is an HTML activity list with three Chrome "My Activity" entries, each tagged "Products: Search": Visited https://productforums.google.com/forum/ (Jan 31, 2018, 10:54:50 PM); Visited http://www.adobe.com/creativecloud.html (Feb 8, 2017, 12:32:39 AM); Searched for "adobe creative cloud" (Feb 8, 2017, 12:32:36 AM, Locations: From your home: https://google.com/maps?q=25.800819,-80.186310).
48 | -------------------------------------------------------------------------------- /tests/testdata/test_config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from promnesia.common import Source 4 | from promnesia.sources.plaintext import extract_from_path 5 | import promnesia.sources.shellcmd as shellcmd # type: ignore 6 | import promnesia.sources.takeout as takeout # type: ignore 7 | 8 | 9 | def index_takeout(): 10 | class user_config: 11 | takeout_path = 'tests/testdata/takeout-20150518T000000Z.zip' 12 | 13 | import my.config 14 | my.config.google = user_config # type: ignore 15 | 16 | yield from takeout.index() 17 | 18 | 19 | class Sources: 20 | 21 | TAKEOUT = Source(index_takeout, name='takeout') 22 | 23 | PLAIN = Source( 24 | shellcmd.index, 25 | extract_from_path('tests/testdata/custom'), 26 | name='test', 27 | ) 28 | 29 | 30 | SOURCES = [ 31 | Sources.PLAIN, 32 | Sources.TAKEOUT, 33 | ] 34 | 35 | # todo ugh, this shouldn't really be collected by pytest... 36 | -------------------------------------------------------------------------------- /tests/testdata/test_multiple_page_updates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14 | 15 | 16 | 17 | 18 | 19 |

My First Heading 20 | My first paragraph.

21 | 22 | link to promnesia 23 | 24 | link to promnesia issues 25 | 26 | link to HN 27 | 28 | another link to promnesia 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/testdata/traverse/ignoreme.txt: -------------------------------------------------------------------------------- 1 | jaiofjeoriheoirjg -------------------------------------------------------------------------------- /tests/testdata/traverse/ignoreme2/notrealignored.txt: -------------------------------------------------------------------------------- 1 | notrealignores -------------------------------------------------------------------------------- /tests/testdata/traverse/imhere.txt: -------------------------------------------------------------------------------- 1 | imhere.txt -------------------------------------------------------------------------------- /tests/testdata/traverse/imhere2/real.txt: -------------------------------------------------------------------------------- 1 | jdfioja -------------------------------------------------------------------------------- /tests/testdata/weird.txt: -------------------------------------------------------------------------------- 1 | https://urbandictionary.com/define.php?term=Belgian%20Whistle 2 | right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow 3 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests-core,tests-all,mypy-core,mypy-misc 5 | # NOTE: we don't run end2end by default since it requires elaborate setup 6 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 7 | # hack to prevent .tox from crapping to the project directory 8 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 9 | 10 | [testenv] 11 | # TODO how to get package name from setuptools? 12 | package_name = "promnesia" 13 | passenv = 14 | # useful for tests to know they are running under ci 15 | CI 16 | CI_* 17 | # respect user's cache dirs to prevent tox from crapping into project dir 18 | PYTHONPYCACHEPREFIX 19 | MYPY_CACHE_DIR 20 | RUFF_CACHE_DIR 21 | # 22 | MY_CONFIG 23 | # by default we don't run browser tests to avoid confusing people when they run locally 24 | # but we want them on CI, so we allow to pass through the variable when we do want to run them 25 | WITH_BROWSER_TESTS 26 | # todo ugh this is all so confusing... need to simplify 27 | usedevelop = true # for some reason tox seems to ignore "-e ." in deps section?? 28 | uv_seed = true # seems necessary so uv creates separate venvs per tox env? 29 | setenv = 30 | HPI_MODULE_INSTALL_USE_UV=true 31 | 32 | 33 | [testenv:ruff] 34 | dependency_groups = testing 35 | commands = 36 | {envpython} -m ruff check src/ 37 | 38 | 39 | # just the very core tests with minimal dependencies 40 | [testenv:tests-core] 41 | dependency_groups = testing 42 | deps = 43 | -e .[markdown] 44 | # NOTE: markdown is only used for test_cli... might be nice to decouple 45 | commands = 46 | # posargs allow test filtering, e.g. tox ... 
-- -k test_name 47 | {envpython} -m pytest \ 48 | --pyargs {[testenv]package_name} \ 49 | # note: sources are tested in tests-all 50 | --ignore src/promnesia/sources \ 51 | --ignore src/promnesia/tests/sources \ 52 | {posargs} 53 | 54 | 55 | [testenv:tests-all] 56 | dependency_groups = testing 57 | deps = 58 | -e .[all,HPI,org] 59 | beautifulsoup4<4.13.0 # FIXME temporary hack until https://github.com/purarue/google_takeout_parser/pull/81 is merged 60 | uv # for hpi module install 61 | commands = 62 | # used in some tests 63 | {envpython} -m my.core module install \ 64 | my.google.takeout.parser \ 65 | my.hypothesis 66 | {envpython} -m pytest \ 67 | --pyargs {[testenv]package_name} \ 68 | {posargs} 69 | 70 | 71 | [testenv:end2end] 72 | setenv = 73 | WITH_BROWSER_TESTS=true 74 | PYTEST_TIMEOUT=120 75 | dependency_groups = 76 | testing 77 | testing-end2end 78 | deps = 79 | -e .[HPI] 80 | uv # for hpi module install 81 | commands = 82 | {envpython} -m my.core module install my.hypothesis 83 | {envpython} -m pytest \ 84 | # TODO noconftest is hack due to end2end tests being in a separate dir 85 | # ideally need to just move it inside the package as well 86 | --noconftest \ 87 | tests/end2end_test.py \ 88 | {posargs} 89 | 90 | 91 | [testenv:mypy-core] 92 | dependency_groups = testing 93 | commands = 94 | {envpython} -m mypy --no-install-types \ 95 | # note: sources are tested separately, below 96 | -p {[testenv]package_name} --exclude 'sources/*' \ 97 | # txt report is a bit more convenient to view on CI 98 | --txt-report .coverage.mypy-core \ 99 | --html-report .coverage.mypy-core \ 100 | {posargs} 101 | 102 | 103 | [testenv:mypy-misc] 104 | dependency_groups = testing 105 | deps = 106 | -e .[HPI,org,markdown] # todo install from HPI[all] or something? 107 | beautifulsoup4<4.13.0 # FIXME temporary hack until https://github.com/purarue/google_takeout_parser/pull/81 is merged 108 | uv # for hpi module install 109 | commands = 110 | {envpython} -m my.core module install \ 111 | my.github.ghexport \ 112 | my.hypothesis \ 113 | my.instapaper \ 114 | my.pocket \ 115 | my.reddit \ 116 | my.fbmessenger \ 117 | my.google.takeout.parser \ 118 | my.browser.export 119 | 120 | {envpython} -m mypy --no-install-types \ 121 | -p {[testenv]package_name}.sources \ 122 | # txt report is a bit more convenient to view on CI 123 | --txt-report .coverage.mypy-misc \ 124 | --html-report .coverage.mypy-misc \ 125 | {posargs} 126 | 127 | # ugh. a bit crap to run it separately 128 | # but first will need to move tests inside the package if we want otherwise? 129 | # and I recall it was problematic at times.. 130 | {envpython} -m mypy --no-install-types \ 131 | tests --exclude 'testdata/*' \ 132 | {posargs} 133 | --------------------------------------------------------------------------------