├── .ci ├── end2end │ ├── .dockerignore │ ├── Dockerfile │ ├── build_and_run.sh │ └── scripts │ │ ├── build_and_run_tests.sh │ │ ├── setup_chrome.sh │ │ ├── setup_firefox.sh │ │ └── setup_node.sh ├── fake-systemd │ └── systemctl ├── github-ci-compat ├── release ├── release-uv └── run ├── .dockerignore ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .gitmodules ├── CHANGELOG.org ├── LICENSE ├── README.org ├── benchmarks └── 20231115.org ├── ci └── run-github-locally ├── conftest.py ├── doc ├── DEVELOPMENT.org ├── GUIDE.org ├── PRIVACY.org ├── SOURCES.org ├── TROUBLESHOOTING.org ├── addons-mozilla-org.org └── config.py ├── docker ├── .gitignore ├── docker_files │ ├── Dockerfile │ ├── Dockerfile-indexer │ ├── docker-compose.yaml │ ├── indexer-config.py.example │ └── indexer-entrypoint.sh ├── get-some-data.sh ├── init.sh └── start.sh ├── extension ├── .ci │ └── build ├── .editorconfig ├── MANUAL-TESTS.org ├── TODO.org ├── __mocks__ │ ├── browser.js │ └── dom-form-serializer.js ├── amo-metadata.json ├── babel.config.cjs ├── build ├── eslint.config.js ├── generate_manifest.js ├── jest.config.cjs ├── old │ ├── flow-typed │ │ └── webextension-polyfill.js │ ├── patcher.js │ └── webpack.config.js ├── package-lock.json ├── package.json ├── rollup.config.js ├── src │ ├── api.ts │ ├── background.ts │ ├── background_chrome_mv2.js │ ├── common.ts │ ├── compat.ts │ ├── display.ts │ ├── filterlist.ts │ ├── images │ │ ├── generate │ │ ├── ic_blacklisted_48.png │ │ ├── ic_blue_48.png │ │ ├── ic_boring_48.png │ │ ├── ic_error.png │ │ ├── ic_not_visited_48.png │ │ ├── ic_relatives_48.png │ │ ├── ic_visited_48.png │ │ └── source_48.svg │ ├── normalise.ts │ ├── notifications.ts │ ├── options.ts │ ├── options_page.css │ ├── options_page.html │ ├── options_page.ts │ ├── search.html │ ├── search.ts │ ├── selenium_bridge.js │ ├── showvisited.css │ ├── showvisited.js │ ├── sidebar-outer.css │ ├── sidebar.css │ ├── sidebar.ts │ ├── sources.ts │ ├── toastify.css │ └── toastify.js ├── tests │ ├── anchorme.test.js │ ├── common.test.js │ ├── defensify.test.js │ ├── integration.test.js │ └── test.html └── tsconfig.json ├── mypy.ini ├── pyproject.toml ├── pytest.ini ├── ruff.toml ├── scripts ├── backup-phone-history.sh ├── browser_history.py └── promnesia ├── src └── promnesia │ ├── __init__.py │ ├── __main__.py │ ├── cannon.py │ ├── common.py │ ├── compare.py │ ├── compat.py │ ├── config.py │ ├── database │ ├── common.py │ ├── dump.py │ └── load.py │ ├── extract.py │ ├── logging.py │ ├── misc │ ├── __init__.pyi │ ├── config_example.py │ └── install_server.py │ ├── py.typed │ ├── server.py │ ├── sources │ ├── auto.py │ ├── auto_logseq.py │ ├── auto_obsidian.py │ ├── browser.py │ ├── browser_legacy.py │ ├── demo.py │ ├── fbmessenger.py │ ├── filetypes.py │ ├── github.py │ ├── guess.py │ ├── hackernews.py │ ├── hpi.py │ ├── html.py │ ├── hypothesis.py │ ├── instapaper.py │ ├── markdown.py │ ├── org.py │ ├── plaintext.py │ ├── pocket.py │ ├── reddit.py │ ├── roamresearch.py │ ├── rss.py │ ├── shellcmd.py │ ├── signal.py │ ├── smscalls.py │ ├── stackexchange.py │ ├── takeout.py │ ├── takeout_legacy.py │ ├── telegram.py │ ├── telegram_legacy.py │ ├── twitter.py │ ├── vcs.py │ ├── viber.py │ ├── website.py │ └── zulip.py │ ├── sqlite.py │ └── tests │ ├── __init__.py │ ├── common.py │ ├── server_helper.py │ ├── sources │ ├── __init__.py │ ├── test_auto.py │ ├── test_filetypes.py │ ├── test_hypothesis.py │ ├── test_org.py │ ├── test_plaintext.py │ ├── test_shellcmd.py │ └── test_takeout.py │ ├── test_cannon.py │ ├── 
test_cli.py │ ├── test_compare.py │ ├── test_config.py │ ├── test_db_dump.py │ ├── test_extract.py │ ├── test_extract_urls.py │ ├── test_indexer.py │ ├── test_server.py │ ├── test_traverse.py │ └── utils.py ├── tests ├── addon.py ├── addon_helper.py ├── common.py ├── convert_screencast.py ├── demos.py ├── end2end_test.py ├── install_and_run ├── record.py ├── testdata │ ├── auto │ │ ├── orgs │ │ │ ├── file.org │ │ │ ├── file2.org │ │ │ ├── file3.org │ │ │ ├── file4.org │ │ │ └── file5.org │ │ └── pocket.json │ ├── custom │ │ ├── file1.txt │ │ └── file2.txt │ ├── logseq-graph │ │ ├── logseq │ │ │ └── config.edn │ │ └── pages │ │ │ └── Note.md │ ├── normalise │ │ └── ff.txt │ ├── obsidian-vault │ │ ├── .obsidian │ │ │ └── app.json │ │ └── Note.md │ ├── takeout-20150518T000000Z.zip │ ├── takeout │ │ └── Takeout │ │ │ └── My Activity │ │ │ └── Chrome │ │ │ └── MyActivity.html │ ├── test_config.py │ ├── test_multiple_page_updates │ │ └── index.html │ ├── traverse │ │ ├── ignoreme.txt │ │ ├── ignoreme2 │ │ │ └── notrealignored.txt │ │ ├── imhere.txt │ │ └── imhere2 │ │ │ └── real.txt │ └── weird.txt └── webdriver_utils.py └── tox.ini /.ci/end2end/.dockerignore: -------------------------------------------------------------------------------- 1 | build_and_run.sh 2 | -------------------------------------------------------------------------------- /.ci/end2end/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:latest 2 | 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | COPY scripts /scripts 6 | 7 | RUN /scripts/setup_firefox.sh \ 8 | && /scripts/setup_chrome.sh \ 9 | && /scripts/setup_node.sh \ 10 | && apt install --yes pipx git \ 11 | # using python docs as a source of some html test data 12 | # need to prevent dpkg from excluding doc files... 13 | && sed -i '/usr.share.doc/d' /etc/dpkg/dpkg.cfg.d/excludes && apt install --yes python3-doc \ 14 | && apt clean \ 15 | && mkdir /promnesia 16 | 17 | WORKDIR /promnesia 18 | 19 | ENTRYPOINT ["/scripts/build_and_run_tests.sh"] 20 | -------------------------------------------------------------------------------- /.ci/end2end/build_and_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | # TODO assert we're running under github ci? 5 | # since this setup is kinda elaborate and can be somewhat unsafe to run blindly 6 | 7 | # supposed to be called from promnesia repository root 8 | [ -e src/promnesia ] 9 | [ -e extension/src ] 10 | 11 | PROMNESIA_SRC="$(pwd)" 12 | 13 | cd .ci/end2end 14 | 15 | IMAGE='promnesia_end2end_tests' 16 | 17 | docker build -t "$IMAGE" . 18 | 19 | # NOTE: dev/shm mount to prevent crashes during headless chrome 20 | docker run -v /dev/shm:/dev/shm --mount "type=bind,src=$PROMNESIA_SRC,dst=/promnesia_source,readonly=true" -e CI "$IMAGE" "$@" 21 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/build_and_run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | # Seems wrong to keep the whole repository in docker build context. 5 | # So instead, we mount the repository inside the container (into /promnesia_source) 6 | # (as read only to avoid messing up host files and crapping with caches etc.) 7 | # However to actually run tests we do need a writable directory.. 
8 | # So we copy the repo to the actual working dir here 9 | 10 | # ugh, kinda annoying -- not sure how to update source files when we change them on the host system... 11 | cp -R -T /promnesia_source /promnesia 12 | extension/.ci/build 13 | 14 | git init # todo ??? otherwise setuptools-scm fails to detect the version... 15 | 16 | # eh. kinda annoying to jump over so many venv layer here... 17 | # but docker runs as root and it doesn't like pip install uv now 18 | # even if you pass --break-system-packages, then subsequent uv invocation also fails 19 | pipx run uv tool run --with=tox-uv tox -e end2end -- "$@" 20 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_chrome.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | 6 | apt install --yes wget 7 | 8 | install -d -m 0755 /etc/apt/keyrings 9 | wget -q https://dl.google.com/linux/linux_signing_key.pub -O- | tee /etc/apt/keyrings/linux_signing_key.pub.asc > /dev/null 10 | echo "deb [signed-by=/etc/apt/keyrings/linux_signing_key.pub.asc] https://dl.google.com/linux/chrome/deb/ stable main" | tee -a /etc/apt/sources.list.d/google-chrome.list > /dev/null 11 | 12 | apt update 13 | 14 | apt install --yes google-chrome-stable 15 | 16 | # sadly latest version of chrome/chromedriver isn't working due to some bugs with iframes (see install_custom_chrome) 17 | 18 | # remove the actual chrome to get it out of the way (we do want dependencies though) 19 | apt remove --yes google-chrome-stable 20 | ! which google-chrome # check there is no binary (in case of virtual packages or whatever) 21 | 22 | function install_custom_chrome() { 23 | ## this installs last revision that was actually working (1110897) or 113.0.5623.0 24 | ## see https://bugs.chromium.org/p/chromedriver/issues/detail?id=4440 25 | apt install --yes unzip 26 | 27 | mkdir /tmp/chrome 28 | 29 | wget -q 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1110897%2Fchrome-linux.zip?generation=1677589092014487&alt=media' \ 30 | -O /tmp/chrome/chrome-linux.zip 31 | unzip /tmp/chrome/chrome-linux.zip -d /tmp/chrome 32 | ln -sf /tmp/chrome/chrome-linux/chrome /usr/bin/google-chrome 33 | 34 | wget -q 'https://www.googleapis.com/download/storage/v1/b/chromium-browser-snapshots/o/Linux_x64%2F1110897%2Fchromedriver_linux64.zip?generation=1677589097630198&alt=media' \ 35 | -O /tmp/chrome/chromedriver_linux64.zip 36 | unzip /tmp/chrome/chromedriver_linux64.zip -d /tmp/chrome 37 | ln -sf /tmp/chrome/chromedriver_linux64/chromedriver /usr/bin/chromedriver 38 | } 39 | 40 | install_custom_chrome 41 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_firefox.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | 6 | apt install --yes wget 7 | 8 | # NOTE: these days ubuntu provisions firefox via snap, and it's a nightmare to make it work with webdriver 9 | # so we force it to use a regular package (following these instructions https://askubuntu.com/a/1510872/427470) 10 | install -d -m 0755 /etc/apt/keyrings 11 | wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- | tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null 12 | echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | 
tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null 13 | 14 | # prevent snap version from overriding: 15 | echo ' 16 | Package: * 17 | Pin: origin packages.mozilla.org 18 | Pin-Priority: 1000 19 | ' | tee /etc/apt/preferences.d/mozilla 20 | # to check: -- should not show anything mentioning snap 21 | # apt install --verbose-versions --dry-run firefox 22 | 23 | apt update 24 | 25 | apt install --yes firefox 26 | # NOTE: selenium should download the corresponding geckodriver itself via selenium_manager 27 | -------------------------------------------------------------------------------- /.ci/end2end/scripts/setup_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux -o pipefail 3 | 4 | apt update --yes 5 | apt install --yes curl 6 | curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 7 | apt install --yes nodejs 8 | -------------------------------------------------------------------------------- /.ci/fake-systemd/systemctl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # dummy systemctl implementation that's capable of running the service and nothing else 3 | 4 | import argparse 5 | from pathlib import Path 6 | from subprocess import Popen 7 | import sys 8 | 9 | 10 | def main(): 11 | args = sys.argv[1:] 12 | args = [x for x in args if not x.startswith('--')] 13 | 14 | print(args) 15 | cmd = args[0] 16 | if cmd != 'start': 17 | return 18 | 19 | name = args[1] 20 | sdir = Path('~/.config/systemd/user').expanduser() 21 | unit = sdir / name 22 | 23 | contents = unit.read_text() 24 | 25 | ES = 'ExecStart=' 26 | command = None 27 | for line in contents.splitlines(): 28 | if line.startswith(ES): 29 | command = line[len(ES):] 30 | break 31 | assert command is not None, contents 32 | 33 | # after that it will be inherited by init 34 | print('Running: ' + command) 35 | Popen(command, shell=True) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /.ci/github-ci-compat: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eux 2 | 3 | # install sudo if it's missing 4 | # probably means that we're running under local docker.. 5 | if ! which sudo; then 6 | apt update 7 | apt -y install sudo 8 | fi 9 | 10 | # make up for differences between ubuntu:focal and github action image... 11 | sudo apt -y install python3.12 python3.12-dev 12 | sudo apt -y install python3-pip python3-setuptools 13 | 14 | # otherwise setuptools doesn't work.. 15 | sudo apt -y install git 16 | 17 | # jq wants it?? 18 | sudo apt -y install dh-autoreconf -------------------------------------------------------------------------------- /.ci/release: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Run [[file:scripts/release][scripts/release]] to deploy Python package onto [[https://pypi.org][PyPi]] and [[https://test.pypi.org][test PyPi]]. 4 | 5 | The script expects the =TWINE_PASSWORD= environment variable to contain the [[https://pypi.org/help/#apitoken][PyPi token]] (not the password!). 6 | 7 | The script can be run manually. 8 | It's also running as the =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]].
Packages are deployed on: 9 | - every master commit, onto test pypi 10 | - every new tag, onto production pypi 11 | 12 | You'll need to set =TWINE_PASSWORD= and =TWINE_PASSWORD_TEST= in [[https://help.github.com/en/actions/configuring-and-managing-workflows/creating-and-storing-encrypted-secrets#creating-encrypted-secrets][secrets]] 13 | for Github Actions deployment to work. 14 | ''' 15 | 16 | import os 17 | import sys 18 | from pathlib import Path 19 | from subprocess import check_call 20 | import shutil 21 | 22 | is_ci = os.environ.get('CI') is not None 23 | 24 | def main() -> None: 25 | import argparse 26 | p = argparse.ArgumentParser() 27 | p.add_argument('--test', action='store_true', help='use test pypi') 28 | args = p.parse_args() 29 | 30 | extra = [] 31 | if args.test: 32 | extra.extend(['--repository', 'testpypi']) 33 | 34 | root = Path(__file__).absolute().parent.parent 35 | os.chdir(root) # just in case 36 | 37 | if is_ci: 38 | # see https://github.com/actions/checkout/issues/217 39 | check_call('git fetch --prune --unshallow'.split()) 40 | 41 | dist = root / 'dist' 42 | if dist.exists(): 43 | shutil.rmtree(dist) 44 | 45 | check_call(['python3', '-m', 'build']) 46 | 47 | TP = 'TWINE_PASSWORD' 48 | password = os.environ.get(TP) 49 | if password is None: 50 | print(f"WARNING: no {TP} passed", file=sys.stderr) 51 | import pip_secrets 52 | password = pip_secrets.token_test if args.test else pip_secrets.token # meh 53 | 54 | check_call([ 55 | 'python3', '-m', 'twine', 56 | 'upload', *dist.iterdir(), 57 | *extra, 58 | ], env={ 59 | 'TWINE_USERNAME': '__token__', 60 | TP: password, 61 | **os.environ, 62 | }) 63 | 64 | 65 | if __name__ == '__main__': 66 | main() 67 | -------------------------------------------------------------------------------- /.ci/release-uv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' 3 | Deploys Python package onto [[https://pypi.org][PyPi]] or [[https://test.pypi.org][test PyPi]]. 4 | 5 | - running manually 6 | 7 | You'll need =UV_PUBLISH_TOKEN= env variable 8 | 9 | - running on Github Actions 10 | 11 | Instead of env variable, relies on configuring github as Trusted publisher (https://docs.pypi.org/trusted-publishers/) -- both for test and regular pypi 12 | 13 | It's running as =pypi= job in [[file:.github/workflows/main.yml][Github Actions config]]. 14 | Packages are deployed on: 15 | - every master commit, onto test pypi 16 | - every new tag, onto production pypi 17 | ''' 18 | 19 | UV_PUBLISH_TOKEN = 'UV_PUBLISH_TOKEN' 20 | 21 | import argparse 22 | import os 23 | import shutil 24 | from pathlib import Path 25 | from subprocess import check_call 26 | 27 | is_ci = os.environ.get('CI') is not None 28 | 29 | def main() -> None: 30 | p = argparse.ArgumentParser() 31 | p.add_argument('--use-test-pypi', action='store_true') 32 | args = p.parse_args() 33 | 34 | publish_url = ['--publish-url', 'https://test.pypi.org/legacy/'] if args.use_test_pypi else [] 35 | 36 | root = Path(__file__).absolute().parent.parent 37 | os.chdir(root) # just in case 38 | 39 | if is_ci: 40 | # see https://github.com/actions/checkout/issues/217 41 | check_call('git fetch --prune --unshallow'.split()) 42 | 43 | # TODO ok, for now uv won't remove dist dir if it already exists 44 | # https://github.com/astral-sh/uv/issues/10293 45 | dist = root / 'dist' 46 | if dist.exists(): 47 | shutil.rmtree(dist) 48 | 49 | # todo what is --force-pep517? 
50 | check_call(['uv', 'build']) 51 | 52 | if not is_ci: 53 | # CI relies on trusted publishers so doesn't need env variable 54 | assert UV_PUBLISH_TOKEN in os.environ, f'no {UV_PUBLISH_TOKEN} passed' 55 | 56 | check_call(['uv', 'publish', *publish_url]) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /.ci/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | cd "$(dirname "$0")" 5 | cd .. # git root 6 | 7 | if ! command -v sudo; then 8 | # CI or Docker sometimes doesn't have it, so useful to have a dummy 9 | function sudo { 10 | "$@" 11 | } 12 | fi 13 | 14 | # --parallel-live to show outputs while it's running 15 | tox_cmd='run-parallel --parallel-live' 16 | if [ -n "${CI-}" ]; then 17 | # install OS specific stuff here 18 | # TODO: pyjq is not necessary anymore? will keep CI deps just in case I guess 19 | PYJQ_DEPS=('autoconf' 'automake' 'libtool') # see https://github.com/mwilliamson/jq.py#installation 20 | case "$OSTYPE" in 21 | darwin*) 22 | # macos 23 | brew install "${PYJQ_DEPS[@]}" 24 | 25 | # TODO hmm. this should be in setup.py? 26 | brew install libmagic # for python-magic 27 | ;; 28 | cygwin* | msys* | win*) 29 | # windows 30 | # ugh. parallel stuff seems super flaky under windows, some random failures, "file used by other process" and crap like that 31 | tox_cmd='run' 32 | ;; 33 | *) 34 | # must be linux? 35 | sudo apt update 36 | 37 | # TODO also need to warn from readme?? 38 | sudo apt install "${PYJQ_DEPS[@]}" python3-dev 39 | ;; 40 | esac 41 | fi 42 | 43 | # NOTE: expects uv installed 44 | uv tool run --with tox-uv tox $tox_cmd "$@" 45 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | # org-mode test data 2 | [submodule "tests/testdata/ox-hugo"] 3 | path = tests/testdata/ox-hugo 4 | url = https://github.com/kaushalmodi/ox-hugo.git 5 | # hypothesis module + testdata 6 | [submodule "tests/testdata/hypexport"] 7 | path = tests/testdata/hypexport 8 | url = https://github.com/karlicoss/hypexport.git 9 | -------------------------------------------------------------------------------- /CHANGELOG.org: -------------------------------------------------------------------------------- 1 | * =v1.0.20210415= 2 | 3 | Thanks @ankostis, @purarue, @gms8994, @Cobertos and others for changes! 4 | 5 | ** general 6 | - *deprecate*: if you have =import promnesia= in the config you should switch it to =import promnesia.common= 7 | 8 | see https://github.com/karlicoss/promnesia/pull/225 9 | This brings us closer towards making promnesia a namespaced package to allow for better extensibility. 10 | - better Windows support https://github.com/karlicoss/promnesia/pull/197 11 | 12 | ** indexer 13 | - *new*: 'update' style indexing is now the default https://github.com/karlicoss/promnesia/pull/211 14 | 15 | It means that database won't be emptied before reindexing, so if you only index a single datasource, the data for other datasources will be untouched. 
16 | If you want the previous behaviour, you can use =--overwrite= 17 | - fixes for race conditions during 'update' style indexing https://github.com/karlicoss/promnesia/pull/220 18 | - minor cannon enhancements 19 | 20 | ** server 21 | - fix deprecation in sqlalchemy API https://github.com/karlicoss/promnesia/pull/221 22 | 23 | ** sources 24 | 25 | - *new*: viber data source (local desktop database) 26 | - https://github.com/karlicoss/promnesia/pull/204 27 | - https://github.com/karlicoss/promnesia/pull/208 28 | - https://github.com/karlicoss/promnesia/pull/224 29 | - *new*: safari browser data https://github.com/karlicoss/promnesia/pull/207 30 | - *new*: stackexchange source https://github.com/karlicoss/promnesia/pull/189 31 | - auto indexer: better directory pruning https://github.com/karlicoss/promnesia/pull/209 32 | - telegram: enhancements to opt out of non-http link extraction 33 | - firefox: handle Fenix databases properly https://github.com/karlicoss/promnesia/pull/227 34 | - hypothesis: 35 | - extract tags https://github.com/karlicoss/promnesia/pull/199 36 | - extract URLs from annotation text https://github.com/karlicoss/promnesia/pull/222 37 | 38 | * for older versions, see https://github.com/karlicoss/promnesia/releases 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Dmitrii Gerasimov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmarks/20231115.org: -------------------------------------------------------------------------------- 1 | Running on @karlicoss desktop PC, =python3.10= (under docker) 2 | 3 | - dumping via cachew NTBinder, using regular sqlalchemy insert statement 4 | 5 | #+begin_example 6 | $ python3 -m pytest --import-mode=importlib --pyargs promnesia.tests -s -k 'gc_off and benchmark and 100000' 7 | 12.90s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 8 | PASSED src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 9 | #+end_example 10 | 11 | 12 | - dumping via cachew NTBinder, using dbengine to insert directly -- massive speedup (added a test with 1M visits too) 13 | #+begin_example 14 | $ python3 -m pytest --import-mode=importlib --pyargs promnesia.tests -s -k 'gc_off and benchmark and 100000' 15 | 0.85s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-100000] 16 | 8.23s call src/promnesia/tests/test_db_dump.py::test_benchmark_visits_dumping[gc_off-1000000] 17 | #+end_example 18 | -------------------------------------------------------------------------------- /ci/run-github-locally: -------------------------------------------------------------------------------- 1 | #!/bin/bash -eux 2 | 3 | cd "$(dirname "$0")" 4 | 5 | cd .. 6 | 7 | act -P ubuntu-latest=ubuntu:bionic "$@" 8 | 9 | # you can docker exec -it /bin/bash into the container and debug there 10 | 11 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | # this is a hack to monkey patch pytest so it handles tests inside namespace packages without __init__.py properly 2 | # without it, pytest can't discover the package root for some reason 3 | # also see https://github.com/karlicoss/pytest_namespace_pkgs for more 4 | 5 | import os 6 | import pathlib 7 | from typing import Optional 8 | 9 | import _pytest.main 10 | import _pytest.pathlib 11 | 12 | # we consider all dirs in repo/ to be namespace packages 13 | root_dir = pathlib.Path(__file__).absolute().parent.resolve() / 'src' 14 | assert root_dir.exists(), root_dir 15 | 16 | # TODO assert it contains package name?? maybe get it via setuptools.. 17 | 18 | namespace_pkg_dirs = [str(d) for d in root_dir.iterdir() if d.is_dir()] 19 | 20 | # resolve_package_path is called from _pytest.pathlib.import_path 21 | # takes a full abs path to the test file and needs to return the path to the 'root' package on the filesystem 22 | resolve_pkg_path_orig = _pytest.pathlib.resolve_package_path 23 | def resolve_package_path(path: pathlib.Path) -> Optional[pathlib.Path]: 24 | result = path # search from the test file upwards 25 | for parent in result.parents: 26 | if str(parent) in namespace_pkg_dirs: 27 | return parent 28 | if os.name == 'nt': 29 | # ??? for some reason on windows it is trying to call this against conftest? but not on linux/osx 30 | if path.name == 'conftest.py': 31 | return resolve_pkg_path_orig(path) 32 | raise RuntimeError("Couldn't determine path for ", path) 33 | _pytest.pathlib.resolve_package_path = resolve_package_path 34 | 35 | 36 | # without patching, the orig function returns just a package name for some reason 37 | # (I think it's used as a sort of fallback) 38 | # so we need to point it at the absolute path properly 39 | # not sure what are the consequences.. 
maybe it wouldn't be able to run against installed packages? not sure.. 40 | search_pypath_orig = _pytest.main.search_pypath 41 | def search_pypath(module_name: str) -> str: 42 | mpath = root_dir / module_name.replace('.', os.sep) 43 | if not mpath.is_dir(): 44 | mpath = mpath.with_suffix('.py') 45 | assert mpath.exists(), mpath # just in case 46 | return str(mpath) 47 | _pytest.main.search_pypath = search_pypath 48 | -------------------------------------------------------------------------------- /doc/PRIVACY.org: -------------------------------------------------------------------------------- 1 | *TLDR: the Promnesia extension isn't collecting any browser data and isn't transmitting your data to any external site. No telemetry is collected or sent either.* 2 | 3 | * How Promnesia works 4 | See [[file:../README.org#how-does-it-work]["How does it work"]] for the longer explanation, but in summary: 5 | 6 | - promnesia indexer runs against the files on your computer, and stores the results in the intermediate database (also on your computer) 7 | - promnesia server runs on your computer and exposes a local port (=13131= by default), so it can communicate with the browser extension 8 | - promnesia extension runs in your browser and requests various data from the promnesia server 9 | 10 | *By default this all happens only within your system*. The extension works in local-first/offline mode, which is one of the main goals. 11 | (However, if you want, you can set it up on an external domain, with a reverse proxy or something like that). 12 | 13 | * What data is requested from the promnesia server 14 | 15 | The extension only makes a few kinds of requests to the Promnesia server at the moment. 16 | 17 | - ~search/search_around~: sends whatever you typed in the search box 18 | - ~visits~: sends the current tab URL to the server (unless it's excluded) 19 | - ~visited~: sends all URLs on the current page to the server (except the excluded ones) 20 | 21 | None of the information above is kept by the server; it *only reads the data from the database*. 22 | It is, however, possible that some of it is retained in the console logs; that would depend on how exactly you're running the server and the log retention policy of your system. 23 | 24 | # for fuck's sake, github doesn't support file:GUIDE.org::#excludelist link... 25 | # so it's either broken in emacs or in org-mode. fucking hell 26 | For maximum privacy you can use the [[file:GUIDE.org#excludelist][excludelist]] feature; this will prevent Promnesia from processing the page/domain. 27 | 28 | # TODO if you want to back up your browser history and feed it into promnesia (e.g. to overcome the 90 days limit etc) 29 | 30 | ** External requests 31 | 32 | - if you have [[file:GUIDE.org#excludelist][external excludelist URLs]] in the extension settings, they will be updated now and then 33 | 34 | These excludelists are downloaded and cached locally, so *no information about when and what you're browsing gets out*.
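If you want to verify this behaviour for yourself, you can talk to the local server directly and observe that nothing leaves your machine. Below is a rough, illustrative sketch of the kind of request the extension makes (it assumes the default =localhost:13131= setup; the exact endpoint payloads can differ between versions, so treat it as an assumption and check =src/promnesia/server.py= for the actual request shape):

#+begin_src bash
# ask the local promnesia server what it knows about a URL --
# roughly what the extension's 'visits' request does; everything stays on localhost
curl -s -X POST 'http://localhost:13131/visits' \
     -H 'Content-Type: application/json' \
     -d '{"url": "https://example.com"}'
#+end_src

The other endpoints listed above work the same way: plain local HTTP calls that only read from the database built by the indexer.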
35 | 36 | 37 | * Extension permissions 38 | - =file/http/https=: the extension is meant to work on any page, hence such a broad scope 39 | this is necessary for webNavigation callbacks to work properly and update icon/sidebar 40 | 41 | - =storage=: for settings 42 | - =webNavigation=: receiving page status updates so the extension kicks in on page loading 43 | - =contextMenus=: context menu actions 44 | 45 | These permissions are required at the moment, but there is an [[https://github.com/karlicoss/promnesia/issues/97][issue]] for work on possibly making them optional. 46 | 47 | - =notifications=: showing notifications 48 | 49 | - =history=: to use local browsing history 50 | 51 | Local history isn't strictly required, so we could omit this if people prefer. 52 | - =bookmarks=: used as one of the sources 53 | 54 | It can already be toggled in the settings, so the permission could be dynamic too 55 | 56 | * Security 57 | While I have some reasonable understanding of security, I'm no expert, so I would be very grateful if you flag potential issues or [[https://github.com/karlicoss/promnesia/issues/14][go through the code]] (especially extension). 58 | 59 | There is some ongoing work on Dockerizing: [[https://github.com/karlicoss/promnesia/issues/55][issues/55]]. 60 | -------------------------------------------------------------------------------- /doc/addons-mozilla-org.org: -------------------------------------------------------------------------------- 1 | Sometimes the extension is flagged for review and you're asked to provide unminified source code and build instructions. 2 | 3 | To provide the source code, run: =git archive master --output=promnesia-source.zip= 4 | 5 | You can also point them at https://github.com/karlicoss/promnesia/tree/master/extension 6 | 7 | The build instructions assume that the zip file with source code is in =/path/to/promnesia-source.zip= (on the HOST system). 8 | *Make sure to replace it with the actual path to the source code zip file.* 9 | 10 | 11 | * Building addon 12 | 13 | To build you need *Ubuntu 24.04/Noble* and *Node 20*. The easiest way to build cleanly would be a Docker container: 14 | 15 | #+begin_src 16 | # on the HOST system: cleanup previous container -- if it didn't exist in the first place, it will show an error; ignore it 17 | docker rm -f promnesia_build 18 | 19 | # on the HOST system: create the container 20 | docker create --name promnesia_build -it ubuntu:noble /bin/bash 21 | 22 | # on the HOST system: put the sources into the container 23 | docker cp /path/to/promnesia-source.zip promnesia_build:/promnesia.zip 24 | 25 | # on the HOST system: start the container 26 | docker start -i promnesia_build 27 | 28 | #+end_src 29 | 30 | After that, build the addon (run these commands INSIDE the container if you choose to do it with Docker): 31 | 32 | #+begin_src 33 | $ apt update && apt install -y git curl unzip 34 | $ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 35 | $ DEBIAN_FRONTEND=noninteractive apt install -y nodejs 36 | $ unzip promnesia.zip -d promnesia 37 | $ cd promnesia 38 | $ npm install 39 | $ ./build --firefox --release --lint --publish=skip 40 | #+end_src 41 | 42 | The final artifact will be in =/promnesia/dist/artifacts/firefox/promnesia-.zip= (INSIDE the container). 43 | 44 | If you need to get it back onto the HOST system (e.g. to test in the browser), run on the HOST system (e.g. in a separate terminal): 45 | 46 | #+begin_src 47 | docker cp promnesia_build:/promnesia/dist/artifacts/firefox/promnesia-.zip .
48 | #+end_src 49 | 50 | This will copy it into the current directory on the HOST system. 51 | 52 | # TODO: how to keep the instructions consistent with the CI? 53 | 54 | -------------------------------------------------------------------------------- /docker/.gitignore: -------------------------------------------------------------------------------- 1 | user_data/ 2 | -------------------------------------------------------------------------------- /docker/docker_files/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN mkdir /user_data \ 4 | && mkdir /usr/src/promnesia 5 | 6 | WORKDIR /usr/src/promnesia 7 | COPY src/ . 8 | COPY setup.py /usr/src/ 9 | 10 | #RUN python /usr/src/setup.py #LookupError: setuptools-scm was unable to detect version for '/usr/src/promnesia'. 11 | 12 | RUN pip install --no-cache-dir more_itertools pytz sqlalchemy cachew \ 13 | appdirs urlextract python-magic \ 14 | tzlocal \ 15 | logzero HPI beautifulsoup4 lxml mistletoe orgparse dataset fastapi uvicorn 16 | 17 | ENV PPATH=/usr/src/promnesia:${PPATH} 18 | VOLUME /user_data 19 | 20 | EXPOSE 13131 21 | CMD ["python", "-m", "promnesia", "serve", "--db", "/user_data/promnesia.sqlite", "--port", "13131"] 22 | -------------------------------------------------------------------------------- /docker/docker_files/Dockerfile-indexer: -------------------------------------------------------------------------------- 1 | FROM promnesia:latest 2 | 3 | RUN apt-get update && apt-get install -y cron 4 | 5 | COPY docker/docker_files/indexer-entrypoint.sh / 6 | ENTRYPOINT ["/indexer-entrypoint.sh"] 7 | -------------------------------------------------------------------------------- /docker/docker_files/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | 3 | services: 4 | server: 5 | image: promnesia 6 | build: 7 | context: ../../ 8 | dockerfile: docker/docker_files/Dockerfile 9 | # Uncomment to enable persistent volumes 10 | volumes: 11 | - ../user_data:/user_data 12 | ports: 13 | - "127.0.0.1:13131:13131" 14 | restart: always 15 | indexer: 16 | depends_on: 17 | - server 18 | image: promnesia-indexer 19 | build: 20 | context: ../../ 21 | dockerfile: docker/docker_files/Dockerfile-indexer 22 | # Uncomment to enable persistent volumes 23 | volumes: 24 | - ../user_data:/user_data 25 | environment: 26 | # run once every minute 27 | CRONTAB: "0-59/1 * * * * cd /usr/src/promnesia && /usr/local/bin/python -m promnesia index --config /user_data/indexer-config.py" 28 | -------------------------------------------------------------------------------- /docker/docker_files/indexer-config.py.example: -------------------------------------------------------------------------------- 1 | from promnesia import Source 2 | from promnesia.sources import auto 3 | 4 | # todo: we should probably have separate docker volumes for sources and for config/db 5 | 6 | SOURCES = [ 7 | Source( 8 | auto.index, 9 | '/user_data/source1/', 10 | ), 11 | # Source( 12 | # auto.index, 13 | # '/source2/', 14 | # ) 15 | ] 16 | 17 | OUTPUT_DIR = '/user_data/' 18 | 19 | # this is not supported yet. also, it should probably be named something other than MIME_HANDLER.
20 | #import os 21 | #MIME_HANDLER = 'editor://' + os.path.realpath(os.path.dirname(os.path.realpath(__file__)) + '../') 22 | -------------------------------------------------------------------------------- /docker/docker_files/indexer-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # note: https://lostindetails.com/articles/How-to-run-cron-inside-Docker 4 | # note: CRONTAB is set in docker-compose.yaml. 5 | 6 | echo "${CRONTAB} > /proc/1/fd/1 2>/proc/1/fd/2" | crontab - 7 | cron -f 8 | -------------------------------------------------------------------------------- /docker/get-some-data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | 5 | cd user_data/ 6 | mkdir source1 7 | cd source1 8 | echo "i like https://github.com/karlicoss/promnesia." >> my_notes.txt 9 | git clone https://github.com/karlicoss/exobrain 10 | git clone https://github.com/koo5/notes 11 | 12 | -------------------------------------------------------------------------------- /docker/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | mkdir user_data 5 | cp docker_files/indexer-config.py.example user_data/indexer-config.py 6 | ./get-some-data.sh 7 | 8 | # the config file will be periodically reloaded by the indexer process, and data sources will be periodically re-indexed. 9 | 10 | -------------------------------------------------------------------------------- /docker/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd "$(dirname "$0")" 4 | docker-compose -f docker_files/docker-compose.yaml build && docker-compose -f docker_files/docker-compose.yaml up 5 | 6 | 7 | -------------------------------------------------------------------------------- /extension/.ci/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | cd "$(dirname "$0")" 5 | cd ../.. 
# git root 6 | 7 | cd extension 8 | 9 | npm install 10 | 11 | FAILED=0 12 | 13 | npm run eslint || FAILED=1 14 | npm run test || FAILED=1 15 | 16 | for browser in 'firefox' 'chrome'; do 17 | ./build --target "$browser" "$@" 18 | done 19 | 20 | exit "$FAILED" 21 | -------------------------------------------------------------------------------- /extension/.editorconfig: -------------------------------------------------------------------------------- 1 | # editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | end_of_line = lf 9 | charset = utf-8 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | -------------------------------------------------------------------------------- /extension/MANUAL-TESTS.org: -------------------------------------------------------------------------------- 1 | These are tests that aren't yet in end2end/extension unit tests 2 | 3 | * [2023-01-06 Fri 00:18] check that linkifying is working 4 | - go to https://wiki.openhumans.org/wiki/Personal_Science_Wiki 5 | - open sidebar 6 | - [[https://wiki.openhumans.org/wiki/Personal_Science_Wiki][Personal Science Wiki]] should be in contexts 7 | - *expected*: https://wiki.openhumans.org/wiki/Personal_Science_Wiki is highlighted as URL 8 | -------------------------------------------------------------------------------- /extension/TODO.org: -------------------------------------------------------------------------------- 1 | #+FILETAGS: promnesia 2 | 3 | * Building 4 | apparently needs `npm install` done [once?] 5 | ** TODO [2019-06-13 Thu 19:37] web-ext thing https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/web-ext_command_reference#web-ext_sign 6 | in the addon folder 7 | ** web-ext lint (TODO file:/// permission ; eval) 8 | ** web-ext build --overwrite-dest TODO --self-hosted? 9 | ** DONE learn how to sign https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/web-ext_command_reference#web-ext_sign 10 | CLOSED: [2019-08-10 Sat 12:11] 11 | :LOGBOOK: 12 | - State "DONE" from "TODO" [2019-08-10 Sat 12:11] 13 | :END: 14 | 15 | * Testing scenarios 16 | ** [#A] [2019-04-19 Fri 22:38] open new visited tab via a link (have a html page for that) 17 | 18 | Pinboard: public bookmarks for tswaterman 19 | https://pinboard.in/u:tswaterman/ 20 | Comment: 21 | shit. it doesn't trigger if I open new tab form the link, but seems to work fine if I create new tab and enter the url.. 22 | *** [2019-04-21 Sun 09:52] 23 | onCreated with url set 24 | onDomContentLoaded 25 | onUpdated with status complete 26 | 27 | ** refreshing the tab 28 | multiple onDomContentLoaded 29 | onUpdated with status complete 30 | 31 | ** click url in the same tab 32 | onUpdated with url set 33 | onDomContentLoaded 34 | onUpdated with status complete 35 | (that works now) 36 | 37 | ** url changes automatically in the same tab (e.g. 
youtube) 38 | ------ tested on YT watch later 39 | onUpdated with url set 40 | NO onDomContentLoaded 41 | onUpdated with status complete 42 | 43 | 44 | ** open new empty tab, type in url, enter 45 | first, when you open the tab: 46 | onCreated with empty tab 47 | onUpdated with url set 48 | onUpdated with complete 49 | then when you type in the url: 50 | onUpdated with url set 51 | onDomContentLoaded 52 | onUpdated with status complete 53 | 54 | 55 | ** tab open triggered via external link handling 56 | onCreated with url set 57 | onDomContentLoaded 58 | onUpdated with status complete 59 | ** [2019-04-21 Sun 10:16] ok, overall onUpdated(complete) seems the most reliable. the only a bit annoying thing is that it happens late sometimes 60 | I guess I could keep some sort of cache? not sure.. 61 | ** TODO [2019-08-10 Sat 12:11] move them to end2end test 62 | -------------------------------------------------------------------------------- /extension/__mocks__/browser.js: -------------------------------------------------------------------------------- 1 | // this is to prevent tests failing on importing browser polyfill 2 | // see https://stackoverflow.com/questions/73809020/cant-mock-webextension-polyfill-for-jest-unit-tests 3 | import { jest } from "@jest/globals" 4 | 5 | const mockBrowser = { 6 | history: { 7 | getVisits: jest.fn(), 8 | search : jest.fn(), 9 | }, 10 | bookmarks: { 11 | getTree: jest.fn(), 12 | }, 13 | storage: { 14 | sync: { 15 | // meh. 16 | get: (name, res) => { 17 | res({'options': { 18 | host: 'http://badhost:43210', // some random port, to cause it fail 19 | }}) 20 | } 21 | }, 22 | }, 23 | runtime: { 24 | lastError: null, 25 | getManifest : () => { return {version: 'whatever'} }, 26 | getPlatformInfo: async () => {}, 27 | }, 28 | } 29 | 30 | export default mockBrowser 31 | -------------------------------------------------------------------------------- /extension/__mocks__/dom-form-serializer.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | serialize: null, 3 | deserialize: null, 4 | } 5 | -------------------------------------------------------------------------------- /extension/amo-metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": { 3 | "approval_notes": " 4 | You can find up-to-date extension code here https://github.com/karlicoss/promnesia/tree/master/extension 5 | 6 | The build instructions assume that the zip file with source code is in =/path/to/promnesia-source.zip= (on the HOST system). 7 | *Make sure to replace it with the actual path to the source code zip file.* 8 | 9 | To build you need *Ubuntu 24.04/Noble* and *Node 20*. 
The easiest way to build cleanly would be a Docker container: 10 | 11 | ``` 12 | # on the HOST system: cleanup previous container -- if it didn't exist in the first place, it will show an error; ignore it 13 | docker rm -f promnesia_build 14 | 15 | # on the HOST system: create the container 16 | docker create --name promnesia_build -it ubuntu:noble /bin/bash 17 | 18 | # on the HOST system: put the sources into the container 19 | docker cp /path/to/promnesia-source.zip promnesia_build:/promnesia.zip 20 | 21 | # on the HOST system: start the container 22 | docker start -i promnesia_build 23 | ``` 24 | 25 | After that, build the addon (run these commands INSIDE the container if you choose to do it with Docker): 26 | 27 | ``` 28 | $ apt update && apt install -y git curl unzip 29 | $ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - 30 | $ DEBIAN_FRONTEND=noninteractive apt install -y nodejs 31 | $ unzip promnesia.zip -d promnesia 32 | $ cd promnesia 33 | $ npm install 34 | $ ./build --firefox --release --lint --publish=skip 35 | ``` 36 | 37 | 38 | The final artifact will be in =/promnesia/dist/artifacts/firefox/promnesia-.zip= (INSIDE the container). 39 | 40 | If you need to get it back onto the HOST system (e.g. to test in the browser), run on the HOST system (e.g. in a separate terminal): 41 | 42 | docker cp promnesia_build:/promnesia/dist/artifacts/firefox/promnesia-.zip . 43 | 44 | This will copy it into the current directory on the HOST system. 45 | " 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /extension/babel.config.cjs: -------------------------------------------------------------------------------- 1 | const presets = [ 2 | // this is necessary for jest? otherwise it can't import modules.. 3 | // ugh... I don't understand tbh, seems that even without preset-env, webpack respects browserlist?? 4 | // and looks like without preset-env the code is cleaner??? 5 | // but whatever, the difference is minor and I don't have energy to investigate now.. 6 | '@babel/preset-env', 7 | 8 | // also necessary for jest? otherwise fails to import typescript 9 | '@babel/preset-typescript', 10 | ] 11 | const plugins = [] 12 | 13 | // if (process.env["ENV"] === "prod") { 14 | // plugins.push(...); 15 | // } 16 | 17 | module.exports = { presets, plugins } 18 | -------------------------------------------------------------------------------- /extension/eslint.config.js: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | import globals from 'globals' 3 | import eslint from '@eslint/js' 4 | import tseslint from 'typescript-eslint' 5 | 6 | 7 | export default tseslint.config( 8 | eslint.configs.recommended, 9 | ...tseslint.configs.recommended, // TODO recommendedTypeChecked?? 10 | { 11 | rules: { 12 | '@typescript-eslint/no-explicit-any': 'off', 13 | '@typescript-eslint/ban-ts-comment': 'off', 14 | '@typescript-eslint/no-unused-vars': [ 15 | 'error', 16 | { 17 | "argsIgnorePattern": "^_", 18 | "varsIgnorePattern": "^_", 19 | "caughtErrorsIgnorePattern": "^_", 20 | }, 21 | ], 22 | }, 23 | languageOptions: { 24 | globals: { 25 | // necessary for document. window.
etc variables to work 26 | ...globals.browser, 27 | ...globals.webextensions, 28 | }, 29 | }, 30 | }, 31 | ) 32 | -------------------------------------------------------------------------------- /extension/old/patcher.js: -------------------------------------------------------------------------------- 1 | // borrowed from https://github.com/newying61/node-module-patch-source-loader/blob/master/loader.js 2 | const loaderUtils = require('loader-utils') 3 | 4 | module.exports.default = function (source) { 5 | const options = this.getOptions() 6 | const patches = options.patches; 7 | for (const patch of patches) { 8 | let res = source.replace(patch.code, patch.newCode) 9 | /* TODO crap, apparently it overwrites inplace, so need to restore? 10 | * e.g. like here.. https://github.com/tugboatcoding/rewrite-source-webpack-plugin/blob/master/src/index.js */ 11 | if (res == source) { 12 | if (!res.includes(patch.newCode)) { // might be already patched 13 | throw Error(`Patch ${JSON.stringify(patch)} had no effect`) 14 | } 15 | } 16 | source = res 17 | } 18 | return source 19 | } 20 | -------------------------------------------------------------------------------- /extension/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Promnesia", 3 | "version": "1.3.1", 4 | "version_name": "released on 2024.06.06", 5 | "description": "Recall which pages you already visited, why and in which context", 6 | "scripts": { 7 | "test": "jest", 8 | "build": "rollup --config", 9 | "eslint": "eslint src", 10 | "web-ext": "web-ext", 11 | "release:cws": "chrome-webstore-upload" 12 | }, 13 | "browserslist": [ 14 | "defaults and supports es6-module" 15 | ], 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/karlicoss/promnesia.git" 19 | }, 20 | "author": "Dmitrii Gerasimov ", 21 | "license": "MIT", 22 | "bugs": { 23 | "url": "https://github.com/karlicoss/promnesia/issues" 24 | }, 25 | "homepage": "https://github.com/karlicoss/promnesia#readme", 26 | "dependencies": { 27 | "@codemirror/lang-css": "^6.0.1", 28 | "@codemirror/lang-javascript": "^6.1.2", 29 | "anchorme": "^3.0.8", 30 | "codemirror": "^6.0.1", 31 | "tippy.js": "^6.3.7", 32 | "webext-options-sync": "^4.2.1" 33 | }, 34 | "devDependencies": { 35 | "@babel/core": "^7.24.5", 36 | "@babel/eslint-parser": "^7.24.5", 37 | "@babel/preset-env": "^7.24.5", 38 | "@babel/preset-typescript": "^7.24.1", 39 | "@eslint/js": "^9.3.0", 40 | "@rollup/plugin-commonjs": "^25.0.8", 41 | "@rollup/plugin-node-resolve": "^15.2.3", 42 | "@rollup/plugin-replace": "^5.0.5", 43 | "@rollup/plugin-typescript": "^11.1.6", 44 | "@types/webextension-polyfill": "^0.10.7", 45 | "chrome-webstore-upload-cli": "^3.1.0", 46 | "eslint": "^8.57.0", 47 | "globals": "^15.3.0", 48 | "jest": "^29.5.0", 49 | "jest-environment-jsdom": "^29.5.0", 50 | "jest-fetch-mock": "^3.0.3", 51 | "node-fetch": "^3.3.2", 52 | "rollup": "^4.18.0", 53 | "rollup-plugin-copy": "^3.5.0", 54 | "tslib": "^2.6.2", 55 | "typescript": "^5.4.5", 56 | "typescript-eslint": "^7.10.0", 57 | "web-ext": "^8.0.0", 58 | "webextension-polyfill": "^0.12.0" 59 | }, 60 | "type": "module" 61 | } 62 | -------------------------------------------------------------------------------- /extension/src/background_chrome_mv2.js: -------------------------------------------------------------------------------- 1 | // hack to support ES moudle background page in chrome with manifest v2 2 | // see https://stackoverflow.com/a/71081597/706389 3 | (async() => { 4 | await 
import ('./background.js'); 5 | })() 6 | -------------------------------------------------------------------------------- /extension/src/compat.ts: -------------------------------------------------------------------------------- 1 | import browser from "webextension-polyfill" 2 | import type {Scripting} from "webextension-polyfill" 3 | 4 | import {assert} from './common' 5 | 6 | 7 | export async function executeScript(injection: Scripting.ScriptInjection): Promise { 8 | /** 9 | * In firefox, executeScript sets error property, whereas in chrome it just throws 10 | * (see https://issues.chromium.org/issues/40205757) 11 | * For consistency, this wrapper throws in all cases instead 12 | */ 13 | const results = await browser.scripting.executeScript(injection) 14 | assert(results.length == 1) 15 | const [{result, error}] = results 16 | if (error != null) { 17 | if (error instanceof Error) { 18 | throw error 19 | } else { 20 | throw new Error(`Error during executeScript: ${error}`) 21 | } 22 | } 23 | return result 24 | } 25 | -------------------------------------------------------------------------------- /extension/src/images/generate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | from tempfile import NamedTemporaryFile 4 | from subprocess import check_call 5 | 6 | def main(): 7 | size = 48 8 | source = Path(__file__).parent / f'source_{size}.svg' 9 | src_color = '#00ff00' 10 | src_svg = source.read_text() 11 | 12 | spec = [ 13 | (f'ic_visited_{size}.png' , '#00ff00'), 14 | (f'ic_relatives_{size}.png' , '#00cc99'), 15 | (f'ic_not_visited_{size}.png', '#999999'), 16 | (f'ic_boring_{size}.png' , '#550000'), 17 | (f'ic_blue_{size}.png' , '#6666ff'), 18 | (f'ic_blacklisted_{size}.png', '#000000'), 19 | (f'ic_error.png' , '#ff0000'), 20 | ] 21 | for fname, color in spec: 22 | svg = src_svg.replace(src_color, color) 23 | out = Path(__file__).parent / fname 24 | 25 | with NamedTemporaryFile(suffix='.svg') as ntf: 26 | path = Path(ntf.name) 27 | path.write_text(svg) 28 | 29 | check_call(['inkscape', '-z', '-e', str(out), str(path)]) 30 | 31 | if __name__ == '__main__': 32 | main() 33 | -------------------------------------------------------------------------------- /extension/src/images/ic_blacklisted_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_blacklisted_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_blue_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_blue_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_boring_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_boring_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_error.png 
-------------------------------------------------------------------------------- /extension/src/images/ic_not_visited_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_not_visited_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_relatives_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_relatives_48.png -------------------------------------------------------------------------------- /extension/src/images/ic_visited_48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/extension/src/images/ic_visited_48.png -------------------------------------------------------------------------------- /extension/src/images/source_48.svg: -------------------------------------------------------------------------------- 1 | 2 | 18 | 20 | 21 | 23 | image/svg+xml 24 | 26 | 27 | 28 | 29 | 31 | 51 | 55 | 59 | 60 | -------------------------------------------------------------------------------- /extension/src/normalise.ts: -------------------------------------------------------------------------------- 1 | import type {Url} from './common' 2 | 3 | // TODO should probably be merged with common or something... 4 | 5 | const 6 | STRIP_RULES = [ 7 | [[RegExp('.*') , RegExp('^\\w+://' )]], 8 | [[RegExp('.*') , RegExp('(www|ww|amp)\\.' )]], 9 | [[RegExp('.*') , RegExp('[&#].*$' )]], 10 | [[RegExp('.*') , RegExp('/$' )]], 11 | ] 12 | // TODO perhaps that should be semi-configurable 13 | 14 | // TODO maybe use that normalisation library and then adjust query params etc 15 | 16 | /* 17 | I think, most common usecases are: 18 | - blacklisting whole domain (e.g. for privacy reasons, like bank/etc or if something is broken) 19 | - blacklisting specific pages (e.g. reddit/twitter/fb main page so it doesn't result it too many child contexts) 20 | For that current approach is fine. 21 | */ 22 | 23 | // TODO careful about dots etc? 24 | 25 | export function normalise_url(url: string): string { 26 | let cur = url; 27 | STRIP_RULES.forEach((rules: Array>) => { // meh impure foreach.. 28 | for (const rule of rules) { 29 | const target: RegExp = rule[0]! 30 | const reg: RegExp | null = rule[1] 31 | if (cur.search(target) >= 0) { 32 | if (reg != null) { 33 | cur = cur.replace(reg, ''); 34 | } 35 | break; 36 | } 37 | } 38 | }); 39 | return cur; 40 | } 41 | 42 | const _re = RegExp('^(www|ww|amp)\\.' 
) 43 | export function normaliseHostname(url: string): string { 44 | return url.replace(_re, ''); 45 | } 46 | 47 | 48 | export function normalisedURLHostname(url: Url): string { 49 | const _hostname = new URL(url).hostname; 50 | const hostname = normaliseHostname(_hostname); 51 | return hostname; 52 | } 53 | -------------------------------------------------------------------------------- /extension/src/options_page.css: -------------------------------------------------------------------------------- 1 | body { 2 | width: 800px; 3 | } 4 | 5 | textarea { 6 | width: 100%; 7 | } 8 | 9 | a { 10 | text-decoration: none; 11 | } 12 | 13 | /* some magic to make it scrollabel on too much content, yet not always expand to max-height */ 14 | .CodeMirror { 15 | height: auto !important; 16 | } 17 | .CodeMirror-scroll { 18 | max-height: 25em; /* */ 19 | } 20 | /* */ 21 | 22 | 23 | input:invalid { 24 | background-color: red; 25 | } 26 | 27 | .description { 28 | margin-left: 1em; 29 | color: #777777; 30 | white-space: pre; 31 | } 32 | 33 | hr { 34 | height: 1px; 35 | background-color: #ccc; 36 | border: none; 37 | } 38 | 39 | pre { 40 | margin: 0 0 0 0; 41 | } 42 | 43 | label { 44 | display: block; 45 | } 46 | 47 | /* ugh, at least in chrome default browser style didn't work??? */ 48 | input:indeterminate { 49 | box-shadow: inset 0 0 6px 0px grey; 50 | } 51 | -------------------------------------------------------------------------------- /extension/src/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Search 6 | 7 | 8 | 9 | 10 |
11 | 12 | 13 |
14 | 15 |
16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /extension/src/selenium_bridge.js: -------------------------------------------------------------------------------- 1 | // only used during tests 2 | 3 | // hack to hook into the extension... https://stackoverflow.com/a/38554438/706389 4 | for (const x of [ 5 | 'selenium-bridge-_execute_action', 6 | 'selenium-bridge-_execute_browser_action', 7 | 'selenium-bridge-mark_visited', 8 | 'selenium-bridge-search', 9 | ]) { 10 | document.addEventListener(x, () => { 11 | chrome.runtime.sendMessage(x) 12 | }) 13 | } 14 | -------------------------------------------------------------------------------- /extension/src/showvisited.css: -------------------------------------------------------------------------------- 1 | /* TODO use variables */ 2 | 3 | /* actual popup with the metadata */ 4 | .promnesia-visited-popup { 5 | display: block; /* otherwise border etc don't work? */ 6 | white-space: pre-wrap; /* keep whitespace intact */ 7 | 8 | background: #e6e6e6fa; /* same color as in sidebar */ 9 | color: black; /* to prevent it inheriting white text colour in dark mode */ 10 | 11 | outline: solid 1px; 12 | padding: 2px; 13 | /* not sure about initial?? */ 14 | font-weight: initial; 15 | font-size: initial; 16 | } 17 | 18 | .promnesia-visited-popup .promnesia-visited-popup-link { 19 | display: inline-block; 20 | text-decoration: none; 21 | padding-top: 0px; 22 | padding-bottom: 0px; 23 | } 24 | 25 | .promnesia-visited-popup .context { 26 | display: block; 27 | /* color: black; */ 28 | background: lightyellow; 29 | margin: 2px; 30 | } 31 | 32 | /* TODO need to use these styles in tippy */ 33 | .promnesia-visited-popup .datetime { 34 | display: inline-block; 35 | float: right; 36 | padding-left: 0.5em; 37 | } 38 | 39 | 40 | /* TODO not really sure about !important... */ 41 | .promnesia-visited.promnesia-eye { 42 | /* note: outline is defined in sidebar.css */ 43 | background-repeat: no-repeat !important; /* repeats by default */ 44 | background-position-x: right !important; /* left by default */ 45 | 46 | /* same as text size, we just want a small icon */ 47 | /* using rem to prevent it from scaling too much on navigation elements etc */ 48 | background-size: '1rem' !important; 49 | } 50 | -------------------------------------------------------------------------------- /extension/src/sidebar-outer.css: -------------------------------------------------------------------------------- 1 | /* Resets for sites https://github.com/karlicoss/promnesia/issues/102 */ 2 | body.promnesia-sidebar-active { 3 | transform: none; 4 | } 5 | .promnesia-sidebar-active #promnesia-frame { 6 | margin: 0; 7 | padding: 0; 8 | max-width: none; 9 | max-height: none; 10 | border: 0; 11 | border-left: 1px solid #999; 12 | } 13 | 14 | #promnesia-frame { 15 | /* e.g. 
left should result in: left initial, right 0px, top 0px, bot 0px, height 100%, width 30% 16 | need to set irrelevant dimensions to 0px, otherwise it misbehaves 17 | */ 18 | 19 | /* NOTE: default is --right: 1, set via javascript in sidebar.js */ 20 | 21 | --size: 30%; 22 | 23 | --is-v: var(--left, var(--right )); 24 | --is-h: var(--top , var(--bottom)); 25 | 26 | --init-left : var(--left , var(--top , var(--bottom))); 27 | --init-right : var(--right , var(--top , var(--bottom))); 28 | --init-top : var(--top , var(--left, var(--right ))); 29 | --init-bottom: var(--bottom, var(--left, var(--right ))); 30 | 31 | 32 | left : calc(var(--init-left ) * 0px); 33 | right : calc(var(--init-right ) * 0px); 34 | top : calc(var(--init-top ) * 0px); 35 | bottom: calc(var(--init-bottom) * 0px); 36 | 37 | --whelper: calc(var(--is-v) * var(--size)); 38 | --hhelper: calc(var(--is-h) * var(--size)); 39 | width : var(--whelper, 100%); 40 | height: var(--hhelper, 100%); 41 | 42 | position: fixed; 43 | z-index: 2147483647; 44 | 45 | display: none; 46 | } 47 | 48 | .promnesia { 49 | padding-left : calc(var(--left) * var(--size)); 50 | padding-right : calc(var(--right) * var(--size)); 51 | padding-top : calc(var(--top) * var(--size)); 52 | padding-bottom: calc(var(--bottom) * var(--size)); 53 | } 54 | 55 | 56 | /* TODO expose this in settings ? */ 57 | .promnesia-highlight { 58 | background-color: #ffff6688; 59 | } 60 | 61 | 62 | .nonselectable { 63 | -webkit-touch-callout: none; 64 | -webkit-user-select: none; 65 | -khtml-user-select: none; 66 | -moz-user-select: none; 67 | -ms-user-select: none; 68 | -o-user-select: none; 69 | user-select: none; 70 | } 71 | 72 | .promnesia-highlight-reference { 73 | color: orange; 74 | font-weight: bold; 75 | margin: 0.1em; 76 | 77 | white-space: nowrap; /* prevent from breaking numbers onto newlines */ 78 | bottom: 1em; 79 | } 80 | 81 | 82 | div.toastify { 83 | /* eh. just a quick hack to make close button appear on the top instead of bottom */ 84 | /* otherwise if the notification is too long, it might be hidden */ 85 | display: flex; 86 | } 87 | -------------------------------------------------------------------------------- /extension/src/toastify.css: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Toastify js 1.3.2 3 | * https://github.com/apvarun/toastify-js 4 | * @license MIT licensed 5 | * 6 | * Copyright (C) 2018 Varun A P 7 | */ 8 | 9 | .toastify { 10 | padding: 12px 20px; 11 | color: #ffffff; 12 | display: inline-block; 13 | box-shadow: 0 3px 6px -1px rgba(0, 0, 0, 0.12), 0 10px 36px -4px rgba(77, 96, 232, 0.3); 14 | background: -webkit-linear-gradient(315deg, #73a5ff, #5477f5); 15 | background: linear-gradient(135deg, #73a5ff, #5477f5); 16 | position: fixed; 17 | opacity: 0; 18 | transition: all 0.4s cubic-bezier(0.215, 0.61, 0.355, 1); 19 | border-radius: 2px; 20 | cursor: pointer; 21 | text-decoration: none; 22 | max-width: calc(50% - 20px); 23 | z-index: 2147483647; 24 | 25 | white-space: pre-wrap; 26 | } 27 | 28 | .toastify.on { 29 | opacity: 1; 30 | } 31 | 32 | .toast-close { 33 | opacity: 0.4; 34 | padding: 0 5px; 35 | } 36 | 37 | .toastify-right { 38 | right: 15px; 39 | } 40 | 41 | .toastify-left { 42 | left: 15px; 43 | } 44 | 45 | .toastify-top { 46 | top: -150px; 47 | } 48 | 49 | .toastify-bottom { 50 | bottom: -150px; 51 | } 52 | 53 | .toastify-rounded { 54 | border-radius: 25px; 55 | } 56 | 57 | .toastify-avatar { 58 | width: 1.5em; 59 | height: 1.5em; 60 | margin: 0 5px; 61 | border-radius: 2px; 62 | } 63 | 64 | @media only screen and (max-width: 360px) { 65 | .toastify-right, .toastify-left { 66 | margin-left: auto; 67 | margin-right: auto; 68 | left: 0; 69 | right: 0; 70 | max-width: fit-content; 71 | } 72 | } 73 | 74 | -------------------------------------------------------------------------------- /extension/tests/anchorme.test.js: -------------------------------------------------------------------------------- 1 | import anchorme from "anchorme" 2 | 3 | test('detects org-mode links correctly', () => { 4 | const res = anchorme.list(` 5 | without the fix, if you have text like this: 6 | 7 | [[https://wiki.openhumans.org/wiki/Personal_Science_Wiki][Personal Science Wiki]] 8 | 9 | - also delete the min.js file because I'm not sure how to patch it -- to prevent using it by accident 10 | `).map(o => o.string) 11 | expect(res).toStrictEqual(['https://wiki.openhumans.org/wiki/Personal_Science_Wiki']) 12 | }) 13 | -------------------------------------------------------------------------------- /extension/tests/common.test.js: -------------------------------------------------------------------------------- 1 | import {_fmt} from '../src/display' 2 | 3 | test('formats visit date/time', () => { 4 | // NOTE: under Node env there might not be necessary locales (e.g. if you're running in Docker) 5 | // can check with: Intl.DateTimeFormat('en-GB').resolvedOptions().locale 6 | // e.g. 
it might resolve to incmplete locale like 'en' 7 | const dd = new Date('05 Jun 2020 05:58:00') // deliberately no timezone, it's pointless without the backend anyway 8 | const [ds, ts] = _fmt(dd) 9 | expect(ds).toBe('5 Jun 2020') 10 | expect(ts).toBe('05:58') 11 | }) 12 | 13 | import {format_duration} from '../src/common' 14 | 15 | test('formats duration', () => { 16 | expect(format_duration(40)).toBe('40 seconds'); 17 | expect(format_duration(124)).toBe('2 minutes'); 18 | expect(format_duration(24 * 60 * 60 + 95 * 60 + 20)).toBe('25 hours 35 minutes'); 19 | }); 20 | 21 | 22 | import {Visits} from '../src/common' 23 | import {makeFakeVisits} from '../src/api' 24 | 25 | test('visits', () => { 26 | for (const vis of [ 27 | [], 28 | makeFakeVisits(1).visits, 29 | makeFakeVisits(10).visits, 30 | [new Error('some error')], 31 | [new Error('other error'), ...makeFakeVisits(2).visits], 32 | ]) { 33 | const v = new Visits('http://test', 'http://test', vis) 34 | const vv = Visits.fromJObject(v.toJObject()) 35 | expect(v).toStrictEqual(vv) 36 | } 37 | 38 | // test for more elaborate error handling, make sure it preserves stack 39 | // apparently Error comparison doesn't do anything to the stack.. 40 | for (const vis of [ 41 | [function () { 42 | const err = new Error('some message') 43 | err.stack = 'stack1\nstack2' 44 | return err 45 | }()], 46 | ]) { 47 | const v = new Visits('http://test', 'http://test', vis) 48 | const vv = Visits.fromJObject(v.toJObject()) 49 | const e = vv.visits[0] 50 | expect(e.stack).toStrictEqual('stack1\nstack2') 51 | } 52 | }) 53 | 54 | import {normalise_url} from '../src/normalise' 55 | 56 | test('normalises', () => { 57 | expect(normalise_url('https://www.youtube.com/playlist?list=PLWz5rJ2EKKc9CBxr3BVjPTPoDPLdPIFCE/')).toBe('youtube.com/playlist?list=PLWz5rJ2EKKc9CBxr3BVjPTPoDPLdPIFCE'); 58 | }); 59 | 60 | 61 | import {normalisedURLHostname} from '../src/normalise' 62 | test('normalisedURLHostname', () => { 63 | expect(normalisedURLHostname('https://www.reddit.com/whatever')).toBe('reddit.com'); 64 | expect(normalisedURLHostname('file:///usr/share/doc/python3/html/index.html')).toBe(''); 65 | }); 66 | 67 | 68 | import {Filterlist} from '../src/filterlist' 69 | 70 | 71 | test('filterlists', async () => { 72 | // TODO make tests literate so they contribute to help docs? 
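// Rough summary of the filterlist syntax exercised by the assertions below
// (inferred from this test rather than an authoritative spec):
//   mail.google.com         -- a bare hostname blocks the whole domain
//   https://vk.com          -- a full URL blocks that exact page only
//   /github.com/issues.*/   -- a /regex/ entry blocks anything matching the regular expression
//   //comment.com           -- lines starting with '//' are comments and are ignored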
73 | const bl_string = ` 74 | mail.google.com 75 | https://vk.com 76 | **github.com/issues** 77 | /github.com/issues.*/ 78 | 79 | //comment.com 80 | 81 | https://reddit.com/ 82 | 83 | ` 84 | 85 | const b = new Filterlist({filterlist: bl_string, urllists_json: '[]'}) 86 | 87 | // TODO eh, doesn't work with links without schema; not sure if it's ok 88 | expect(await b.contains('http://instagram.com/')).toBe(null) 89 | 90 | // whole domain is blocked 91 | expect(await b.contains('https://mail.google.com/mail/u/0/#inbox')).toContain('domain') 92 | 93 | 94 | // specific page is blocked 95 | expect(await b.contains('https://vk.com' )).toContain('exact page') 96 | expect(await b.contains('https://vk.com/')).toContain('exact page') 97 | expect(await b.contains('https://vk.com/user/whatever')).toBe(null) 98 | expect(await b.contains('https://reddit.com')).toContain('exact page') 99 | 100 | // wildcard blockig 101 | expect(await b.contains('http://github.com/')).toBe(null) 102 | expect(await b.contains('http://github.com/issues/hello/123')).toContain('regex') 103 | 104 | // TODO later, doesn't work ATM 105 | // expect(b.contains('http://github.com/issues/hello/123', bl)).toContain('wildcard'); 106 | 107 | expect(await b.contains('123456')).toBe('invalid URL') 108 | expect(await b.contains('http://comment.com')).toBe(null) 109 | }) 110 | -------------------------------------------------------------------------------- /extension/tests/defensify.test.js: -------------------------------------------------------------------------------- 1 | // import {defensify} from '../src/notifications.js'; 2 | 3 | async function inner() { 4 | throw 'some_error'; 5 | } 6 | 7 | async function outer() { 8 | console.warn('before inner async'); 9 | await inner(); 10 | console.warn('after inner async'); 11 | } 12 | 13 | 14 | function inner2() { 15 | throw err; 16 | } 17 | 18 | function outer2() { 19 | console.warn('before inner'); 20 | inner2(); 21 | console.warn('after inner'); 22 | } 23 | 24 | test('defensify', async () => { 25 | // await alalal(); 26 | // await defensify(alalal)(); 27 | console.log("HIHIH"); 28 | // outer2(); // ok, stack is preserved 29 | // await outer(); // ugh. stack is lost... 30 | // const dd = new Date(0); 31 | // expect(format_dt(dd)).toMatch(/Jan 1 1970/); 32 | }); 33 | -------------------------------------------------------------------------------- /extension/tests/integration.test.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Ugh FFS. 3 | * NODE_OPTIONS=--experimental-vm-modules npm run test is working much better with ES6 imports/node dependenceis 4 | * but it segfaults every other time 5 | * https://github.com/nodejs/node/issues/35889 6 | */ 7 | import mockBrowser from "../__mocks__/browser" 8 | global.chrome = mockBrowser 9 | 10 | 11 | test('options', async () => { 12 | const {setOptions, getOptions} = await import ('../src/options') 13 | // shouldn't crash at least.. 14 | const opts = await getOptions() 15 | }) 16 | // TODO could check options migrations? 17 | 18 | import fetch from 'node-fetch' 19 | global.fetch = fetch 20 | 21 | 22 | test('visits', async() => { 23 | const {backend, makeFakeVisits} = await import ('../src/api') 24 | 25 | // const opts = await getOptions() 26 | // opts.host = host: 'http//bad.host', 27 | 28 | // TODO have a defensive and offensive modes? 
29 | // but defensive for network errors makes def makes sense anyway 30 | const vis = await backend.visits('http://123.com') 31 | expect(vis).toBeInstanceOf(Error) 32 | expect(vis.message).toMatch(/request .* failed/) 33 | }) 34 | 35 | 36 | // meh. 37 | mockBrowser.history.getVisits.mockImplementation(async (obj) => []) 38 | mockBrowser.history.search .mockImplementation(async (obj) => []) 39 | mockBrowser.bookmarks.getTree.mockImplementation(async () => [{ 40 | children: [{ 41 | url: 'http:whatever.com/', 42 | dateAdded: 16 * 10 ** 8 * 1000, 43 | }], 44 | }]) 45 | 46 | test('visits_allsources', async() => { 47 | const {allsources} = await import('../src/sources') 48 | 49 | const vis = await allsources.visits('https://whatever.com/') 50 | expect(vis.visits).toHaveLength(2) 51 | expect(vis.normalised_url).toStrictEqual('whatever.com') 52 | }) 53 | 54 | 55 | test('search_works', async () => { 56 | const {allsources} = await import('../src/sources') 57 | 58 | // at least shouldn't crash 59 | const res = await allsources.search('https://123.coom') 60 | const [e] = res.visits 61 | expect(e.message).toMatch(/request .* failed/) 62 | }) 63 | 64 | test('search_defensive', async() => { 65 | const {backend} = await import ('../src/api') 66 | const {MultiSource, bookmarks, thisbrowser} = await import ('../src/sources') 67 | 68 | 69 | // precondition: some error in processing history api, e.g. it's unavailable or something 70 | mockBrowser.history.search.mockImplementation(async (q) => null) 71 | mockBrowser.bookmarks.getTree.mockImplementation(async () => null) 72 | 73 | // TODO wtf?? for some reason default order (backend, browser, bookmarks) causes 74 | // 'Promise rejection was handled asynchronously' 75 | // I wonder if it's some issue with node fetch implementation... or just node version?? 76 | // for some reason different order works :shrug: 77 | 78 | const res = await new MultiSource(thisbrowser, bookmarks, backend) 79 | .search('http://whatever.com') 80 | 81 | console.error(res.visits) 82 | const [e1, e2, e3] = res.visits 83 | // eh. fragile, but at least makes sure we test exactly the thing we want 84 | expect(e1.message).toMatch(/is not iterable/) 85 | expect(e2.message).toMatch(/Cannot read propert/) 86 | expect(e3.message).toMatch(/request .* failed/) 87 | }) 88 | 89 | 90 | import fetchMock from 'jest-fetch-mock' 91 | // TODO use it as a fixture.. 
92 | // beforeEach(() => { 93 | // fetch.resetMocks() 94 | // }) 95 | 96 | test('visits_badresponse', async() => { 97 | const {backend} = await import ('../src/api') 98 | 99 | fetchMock.enableMocks() 100 | fetchMock.mockResponse('bad!') 101 | const res = await backend.visits('http://mock.com') 102 | expect(res).toBeInstanceOf(Error) 103 | }) 104 | 105 | 106 | test('visited', async() => { 107 | const {backend} = await import ('../src/api') 108 | const {fake} = await import ('../src/api') 109 | 110 | fetchMock.enableMocks() 111 | const [v] = fake.apiVisits(1) 112 | { 113 | fetchMock.mockOnce(`[null, ${JSON.stringify(v)}]`) 114 | const r = await backend.visited(['http://link1', 'http://link2']) 115 | expect(r).not.toBeInstanceOf(Error) 116 | const [r1, r2] = r 117 | expect(r1).toEqual(null) 118 | expect(r2.tags).toEqual(['fake']) 119 | } 120 | 121 | { 122 | // the backend is also allowed to return boolean values (for 'was visited'/'was not visited') 123 | // in addition, this was legacy behaviour 124 | fetchMock.mockOnce(`[false, true, null]`) 125 | let r = await backend.visited(['http://link1', 'http://link2', 'http://link3']) 126 | const [r1, r2, r3] = r 127 | expect(r1).toEqual(null) 128 | expect(r2).not.toEqual(null) 129 | expect(r3).toEqual(null) 130 | } 131 | }) 132 | -------------------------------------------------------------------------------- /extension/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "src/", 4 | "noImplicitAny": true, 5 | "target": "es2022", 6 | "strict": true, 7 | "allowJs": true, 8 | 9 | // this is necessary to allos importing as import './whatever' (instead of explicit extension) 10 | "moduleResolution": "bundler", 11 | // esnext is necessary, otherwise bundler module resolution can't be used? 12 | "module": "esnext", 13 | 14 | // without it, emacs (LSP?) complains when editing files.. not sure if impacts actual code generation? 15 | "lib": ["es6", "dom"], 16 | }, 17 | "include": [ 18 | "./src/**/*.ts" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | pretty = True 3 | show_error_context = True 4 | show_column_numbers = True 5 | show_error_end = True 6 | 7 | check_untyped_defs = True 8 | 9 | # see https://mypy.readthedocs.io/en/stable/error_code_list2.html 10 | warn_redundant_casts = True 11 | strict_equality = True 12 | warn_unused_ignores = True 13 | enable_error_code = deprecated,redundant-expr,possibly-undefined,truthy-bool,truthy-iterable,ignore-without-code,unused-awaitable 14 | 15 | 16 | # not sure why mypy started discovering it (since 0.800??) 17 | [mypy-hypothesis] 18 | ignore_missing_imports = True 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # see https://github.com/karlicoss/pymplate for up-to-date reference 2 | [project] 3 | dynamic = ["version"] # version is managed by setuptools_scm 4 | name = "promnesia" 5 | dependencies = [ 6 | "appdirs", # for portable user directories detection 7 | "tzlocal", # guessling local timezone 8 | "more_itertools", 9 | "typing-extensions", 10 | "pytz", 11 | "sqlalchemy>=2.0", # DB api 12 | 13 | ## 14 | # NOTE: ideally we don't need to install them by default? 15 | # i.e. 
server and indexer can run on different hosts/containers etc 16 | # keeping here for backwards compatibility for now 17 | "promnesia[indexer]", 18 | "promnesia[server]", 19 | ## 20 | ] 21 | requires-python = ">=3.9" 22 | 23 | ## these need to be set if you're planning to upload to pypi 24 | description = "Enhancement of your browsing history" 25 | license = {file = "LICENSE"} 26 | authors = [ 27 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 28 | ] 29 | maintainers = [ 30 | {name = "Dima Gerasimov (@karlicoss)", email = "karlicoss@gmail.com"}, 31 | ] 32 | [project.urls] 33 | Homepage = "https://github.com/karlicoss/promnesia" 34 | ## 35 | 36 | 37 | [project.optional-dependencies] 38 | indexer = [ 39 | # indexer only dependencies 40 | "urlextract", 41 | ] 42 | server = [ 43 | # server only dependencies 44 | "fastapi", 45 | "uvicorn[standard]", 46 | ] 47 | optional = [ 48 | # dependencies that bring some bells & whistles 49 | "logzero" , # pretty colored logging 50 | "python-magic", # better mimetype decetion 51 | ] 52 | HPI = [ 53 | # dependencies for https://github.com/karlicoss/HPI 54 | "HPI", # pypi version 55 | # TODO add notes how to override with github version? 56 | ] 57 | html = [ 58 | # dependencies for sources.html 59 | "beautifulsoup4", # extracting links from the page 60 | "lxml" , # bs4 backend 61 | ] 62 | markdown = [ 63 | # dependencies for sources.html 64 | "mistletoe", 65 | ] 66 | org = [ 67 | # dependencies for sources.org 68 | "orgparse>=0.3.0", 69 | ] 70 | telegram = [ 71 | # used to depend on 'dataset', keeping for backwards compatibility 72 | ] 73 | all = [ 74 | "promnesia[optional,HPI,html,markdown,org]", 75 | ] 76 | 77 | [dependency-groups] 78 | testing = [ 79 | "pytest", 80 | "ruff", 81 | "mypy", 82 | "lxml", # for mypy coverage 83 | 84 | "hypothesis", 85 | 86 | "loguru", # used in addon_helper... not sure if should just use promnesia's logger? 87 | 88 | "psutil", "types-psutil", 89 | "requests", "types-requests", 90 | 91 | ## other mypy stubs 92 | "types-pytz" , 93 | "types-requests" , # used in tests 94 | "types-beautifulsoup4", # NOTE: not needed after the <4.13.0 pinning is resolved 95 | ## 96 | 97 | # todo hmm ideally would be in corresponding testing-... sections 98 | # but we don't split separate mypy for end2end tests... 
so idk 99 | "selenium" , # browser automations 100 | "types-PyAutoGUI" 101 | ] 102 | testing-end2end = [ 103 | "click" , # confirmations for end2end test (might remove dependency) 104 | "pytest-timeout", # for PYTEST_TIMEOUT env variable 105 | "pytest-xdist" , # not used atm, but helpful to parallelise end2end tests 106 | ] 107 | testing-gui = [ 108 | # pyautogui seems problematic, wheels often fail to build under windows 109 | # we don't use it in CI, so keep in a separate extras section 110 | "pyautogui", # for keyboard automation during end2end tests 111 | ] 112 | 113 | 114 | [project.scripts] 115 | promnesia = "promnesia.__main__:main" 116 | 117 | 118 | [build-system] 119 | requires = ["setuptools", "setuptools-scm"] 120 | build-backend = "setuptools.build_meta" 121 | 122 | [tool.setuptools_scm] 123 | version_scheme = "python-simplified-semver" 124 | local_scheme = "dirty-tag" 125 | 126 | # workaround for error during uv publishing 127 | # see https://github.com/astral-sh/uv/issues/9513#issuecomment-2519527822 128 | [tool.setuptools] 129 | license-files = [] 130 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | # discover files that don't follow test_ naming. Useful to keep tests along with the source code 3 | python_files = *.py 4 | norecursedirs = tests/testdata/* 5 | addopts = 6 | # -rap to print tests summary even when they are successful 7 | -rap 8 | --verbose 9 | 10 | # otherwise it won't discover doctests 11 | --doctest-modules 12 | 13 | # show all test durations (unless they are too short) 14 | --durations=0 15 | -------------------------------------------------------------------------------- /scripts/backup-phone-history.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -eu 3 | BACKUP_DIR="$1" 4 | 5 | backup_file () { 6 | file="$1" 7 | to="$2" 8 | fname="$(basename "$file")" 9 | timestamp=$(date -d "@$(stat -c %Y "$file")" +'%Y%m%d%H%M%S') 10 | tdir="$to/$timestamp" 11 | mkdir -p "$tdir" 12 | cp "$file" "$tdir/$fname" 13 | } 14 | 15 | 16 | backup_chrome () { 17 | backup_file '/data/data/com.android.chrome/app_chrome/Default/History' "$BACKUP_DIR/chrome" 18 | } 19 | 20 | backup_firefox () { 21 | backup_file '/data/data/org.mozilla.firefox/files/mozilla/'*.default/'browser.db' "$BACKUP_DIR/firefox" 22 | } 23 | 24 | 25 | backup_firefox 26 | backup_chrome 27 | -------------------------------------------------------------------------------- /scripts/browser_history.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | DEPRECATION = 'NOTE: this is DEPRECATED! Please use https://github.com/purarue/browserexport instead' 3 | 4 | from datetime import datetime, timezone 5 | from pathlib import Path 6 | from subprocess import check_output 7 | import filecmp 8 | import logging 9 | import warnings 10 | import sys 11 | 12 | warnings.warn(DEPRECATION, DeprecationWarning) 13 | 14 | Browser = str 15 | 16 | CHROME = 'chrome' 17 | FIREFOX = 'firefox' 18 | 19 | def get_logger(): 20 | return logging.getLogger('browser-history') 21 | 22 | 23 | # TODO kython? 24 | # TODO the with key? 
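# A quick illustration of the `only` helper defined below (hypothetical values, not real data):
#   only([42])    -> 42
#   only([1, 2])  -> raises RuntimeError: Expected a single value: [1, 2]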
25 | def only(it): 26 | values = list(it) 27 | if len(values) == 1: 28 | return values[0] 29 | raise RuntimeError(f'Expected a single value: {values}') 30 | 31 | 32 | def get_path(browser: Browser, profile: str='*') -> Path: 33 | if browser == 'chrome': 34 | bpath = Path('~/.config/google-chrome').expanduser() 35 | dbs = bpath.glob(profile + '/History') 36 | elif browser == 'firefox': 37 | bpath = Path('~/.mozilla/firefox/').expanduser() 38 | dbs = bpath.glob(profile + '/places.sqlite') 39 | else: 40 | raise RuntimeError(f'Unexpected browser {browser}') 41 | ldbs = list(dbs) 42 | if len(ldbs) == 1: 43 | return ldbs[0] 44 | raise RuntimeError(f'Expected single database, got {ldbs}. Perhaps you want to use --profile argument?') 45 | 46 | 47 | 48 | def test_get_path(): 49 | get_path('chrome') 50 | get_path('firefox', profile='*-release') 51 | 52 | 53 | def atomic_copy(src: Path, dest: Path): 54 | """ 55 | Supposed to handle cases where the file is changed while we were copying it. 56 | """ 57 | import shutil 58 | 59 | differs = True 60 | while differs: 61 | res = shutil.copy(src, dest) 62 | differs = not filecmp.cmp(str(src), str(res)) 63 | 64 | 65 | def format_dt(dt: datetime) -> str: 66 | return dt.strftime('%Y%m%d%H%M%S') 67 | 68 | 69 | def backup_history(browser: Browser, to: Path, profile: str='*', pattern=None) -> Path: 70 | assert to.is_dir() 71 | logger = get_logger() 72 | 73 | now = format_dt(datetime.now(tz=timezone.utc)) 74 | 75 | path = get_path(browser, profile=profile) 76 | 77 | pattern = path.stem + '-{}' + path.suffix if pattern is None else pattern 78 | fname = pattern.format(now) 79 | 80 | 81 | res = to / fname 82 | logger.info('backing up to %s', res) 83 | # if your chrome is open, database would normally be locked, so you can't just make a snapshot 84 | # so we'll just copy it till it converge. bit paranoid, but should work 85 | atomic_copy(path, res) 86 | logger.info('done!') 87 | return res 88 | 89 | 90 | def test_backup_history(tmp_path): 91 | tdir = Path(tmp_path) 92 | backup_history(CHROME, tdir) 93 | backup_history(FIREFOX, tdir, profile='*-release') 94 | 95 | 96 | def guess_db_date(db: Path) -> str: 97 | maxvisit = check_output([ 98 | 'sqlite3', 99 | '-csv', 100 | db, 101 | 'SELECT max(datetime(((visits.visit_time/1000000)-11644473600), "unixepoch")) FROM visits;' 102 | ]).decode('utf8').strip().strip('"') 103 | return format_dt(datetime.strptime(maxvisit, "%Y-%m-%d %H:%M:%S")) 104 | 105 | 106 | def test_guess(tmp_path): 107 | tdir = Path(tmp_path) 108 | db = backup_history(CHROME, tdir) 109 | guess_db_date(db) 110 | 111 | 112 | def main(): 113 | logger = get_logger() 114 | import argparse 115 | p = argparse.ArgumentParser() 116 | p.add_argument('--browser', type=Browser, required=True) 117 | p.add_argument('--profile', type=str, default='*', help='Use to pick the correct profile to back up. If unspecified, will assume a single profile') 118 | p.add_argument('--to', type=Path, required=True) 119 | args = p.parse_args() 120 | 121 | # TODO do I need pattern?? 122 | backup_history(browser=args.browser, to=args.to, profile=args.profile) 123 | 124 | warnings.warn(DEPRECATION, DeprecationWarning) 125 | logger.error("This script is DEPRECATED! 
Exiting with error code so that the use notices") 126 | sys.exit(44) 127 | 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /scripts/promnesia: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # this script runs promnesia from the current repository (instead of the installed version) 3 | 4 | set -eu 5 | 6 | GIT_ROOT_DIR="$(cd "$(dirname "$0")" && git rev-parse --show-toplevel)" 7 | 8 | if [[ -n "${PYTHONPATH:=}" ]]; then 9 | PPATH=":$PYTHONPATH" 10 | else 11 | PPATH="" 12 | fi 13 | 14 | PPATH="$GIT_ROOT_DIR/src$PPATH" 15 | 16 | export DIRTY_RUN='YES' # ugh. 17 | 18 | if command -v python3 &> /dev/null; then 19 | # Note: python3 in Windows used "py" in command line. So $PY_BIN should be just "py" 20 | PY_BIN="python3" 21 | else 22 | PY_BIN="python" # warn? 23 | fi 24 | 25 | PYTHONPATH="$PPATH" exec "$PY_BIN" -m promnesia "$@" 26 | -------------------------------------------------------------------------------- /src/promnesia/__init__.py: -------------------------------------------------------------------------------- 1 | # add deprecation warning so eventually this may converted to a namespace package? 2 | import warnings 3 | 4 | from .common import ( # noqa: F401 5 | Context, 6 | DbVisit, 7 | Loc, 8 | PathIsh, 9 | Res, 10 | Results, 11 | Source, 12 | Visit, 13 | last, 14 | ) 15 | 16 | # TODO think again about it -- what are the pros and cons? 17 | warnings.warn("DEPRECATED! Please import directly from 'promnesia.common', e.g. 'from promnesia.common import Visit, Source, Results'", DeprecationWarning) 18 | -------------------------------------------------------------------------------- /src/promnesia/compare.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | # TODO perhaps make it external script? 4 | import argparse 5 | import logging 6 | import sys 7 | from collections.abc import Iterator, Sequence 8 | from pathlib import Path 9 | from typing import TypeVar 10 | 11 | from .common import DbVisit, PathWithMtime, Url 12 | from .database.load import row_to_db_visit 13 | 14 | # TODO include latest too? 15 | # from cconfig import ignore, filtered 16 | 17 | def get_logger(): 18 | return logging.getLogger('promnesia-db-changes') 19 | 20 | # TODO return error depending on severity? 
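# A rough illustration of eliminate_by (defined just below), with made-up inputs:
#   eliminate_by([1, 1, 2], [1, 3], key=str) -> onlya={1, 2}, common={1}, onlyb={3}
# items with the same key are matched up pairwise, so the extra 1 on the left ends up in onlya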
21 | 22 | 23 | T = TypeVar('T') 24 | 25 | def eliminate_by(sa: Sequence[T], sb: Sequence[T], key): 26 | def make_dict(s: Sequence[T]) -> dict[str, list[T]]: 27 | res: dict[str, list[T]] = {} 28 | for a in s: 29 | k = key(a) 30 | ll = res.get(k, None) 31 | if ll is None: 32 | ll = [] 33 | res[k] = ll 34 | ll.append(a) 35 | return res 36 | da = make_dict(sa) 37 | db = make_dict(sb) 38 | ka = set(da.keys()) 39 | kb = set(db.keys()) 40 | onlya: set[T] = set() 41 | common: set[T] = set() 42 | onlyb: set[T] = set() 43 | for k in ka.union(kb): 44 | la = da.get(k, []) 45 | lb = db.get(k, []) 46 | common.update(la[:min(len(la), len(lb))]) 47 | if len(la) > len(lb): 48 | onlya.update(la[len(lb):]) 49 | if len(lb) > len(la): 50 | onlyb.update(lb[len(la):]) 51 | 52 | return onlya, common, onlyb 53 | 54 | 55 | def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=True) -> list[DbVisit]: 56 | logger = get_logger() 57 | logger.info('comparing between: %s', between) 58 | 59 | errors: list[DbVisit] = [] 60 | 61 | umap: dict[Url, list[DbVisit]] = {} 62 | for a in after: 63 | url = a.norm_url 64 | xx = umap.get(url, []) # TODO canonify here? 65 | xx.append(a) 66 | umap[url] = xx 67 | 68 | def reg_error(b): 69 | errors.append(b) 70 | if log: 71 | logger.error('between %s missing %s', between, b) 72 | print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr) 73 | 74 | 75 | # the idea is that we eliminate items simultaneously from both sets 76 | eliminations = [ 77 | ('identity' , lambda x: x), 78 | ('without dt' , lambda x: x._replace(src='', dt='')), 79 | ('without context' , lambda x: x._replace(src='', context='', locator='')), 80 | ('without dt and context' , lambda x: x._replace(src='', dt='', context='', locator='')), 81 | ] 82 | for ename, ekey in eliminations: 83 | logger.info('eliminating by %s', ename) 84 | logger.info('before: %d, after: %d', len(before), len(after)) 85 | before, common, after = eliminate_by(before, after, key=ekey) 86 | logger.info('common: %d, before: %d, after: %d', len(common), len(before), len(after)) 87 | 88 | logger.info('removing explicitly ignored items') 89 | # before = filtered(before, between=between, umap=umap) 90 | logger.info('before: %d', len(before)) 91 | 92 | for b in before: 93 | reg_error(b) 94 | 95 | return errors 96 | 97 | def setup_parser(p): 98 | # TODO better name? 99 | p.add_argument('--intermediate-dir', type=Path) 100 | p.add_argument('--last', type=int, default=2) 101 | p.add_argument('--all', action='store_const', const=0, dest='last') 102 | p.add_argument('paths', nargs='*') 103 | 104 | 105 | def get_files(args): 106 | if len(args.paths) == 0: 107 | int_dir = args.intermediate_dir 108 | assert int_dir.exists() 109 | files = sorted(int_dir.glob('*.sqlite*')) 110 | files = files[-args.last:] 111 | else: 112 | files = [Path(p) for p in args.paths] 113 | return files 114 | 115 | 116 | def main(): 117 | p = argparse.ArgumentParser() 118 | setup_parser(p) 119 | args = p.parse_args() 120 | files = get_files(args) 121 | 122 | errors = list(compare_files(*files)) 123 | if len(errors) > 0: 124 | sys.exit(1) 125 | 126 | 127 | def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]: 128 | assert len(files) > 0 129 | 130 | logger = get_logger() 131 | logger.info('comparing %s', files) 132 | 133 | last = None 134 | last_dts = None 135 | for f in files: 136 | logger.info('processing %r', f) 137 | name = f.name 138 | this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions.. 
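# e.g. a dump named '20231115.sqlite' (hypothetical) gives this_dts == '20231115';
# for a double extension like '.sqlite.gz' only the part before the first dot is kept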
139 | 140 | from promnesia.server import _get_stuff # TODO ugh 141 | engine, table = _get_stuff(PathWithMtime.make(f)) 142 | 143 | with engine.connect() as conn: 144 | vis = [row_to_db_visit(row) for row in conn.execute(table.select())] 145 | 146 | if last is not None: 147 | between = f'{last_dts}:{this_dts}' 148 | errs = compare(last, vis, between=between, log=log) 149 | for e in errs: 150 | yield between, e 151 | last = vis 152 | last_dts = this_dts 153 | 154 | if __name__ == '__main__': 155 | main() 156 | 157 | -------------------------------------------------------------------------------- /src/promnesia/compat.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING 2 | 3 | if not TYPE_CHECKING: 4 | ## we used to have compat fixes here for these for python3.7 5 | ## keeping in case any sources depended on compat functions 6 | from subprocess import PIPE, Popen, check_call, check_output, run # noqa: F401 7 | from typing import Literal, Protocol # noqa: F401 8 | ## 9 | 10 | # todo deprecate properly 11 | def removeprefix(text: str, prefix: str) -> str: 12 | return text.removeprefix(prefix) 13 | -------------------------------------------------------------------------------- /src/promnesia/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib 4 | import importlib.util 5 | import os 6 | import warnings 7 | from collections.abc import Iterable 8 | from pathlib import Path 9 | from types import ModuleType 10 | from typing import Callable, NamedTuple, Union 11 | 12 | from .common import DbVisit, PathIsh, Res, Source, default_cache_dir, default_output_dir 13 | 14 | HookT = Callable[[Res[DbVisit]], Iterable[Res[DbVisit]]] 15 | 16 | 17 | ModuleName = str 18 | 19 | # something that can be converted into a proper Source 20 | ConfigSource = Union[Source, ModuleName, ModuleType] 21 | 22 | 23 | class Config(NamedTuple): 24 | # TODO remove default from sources once migrated 25 | SOURCES: list[ConfigSource] = [] 26 | 27 | # if not specified, uses user data dir 28 | OUTPUT_DIR: PathIsh | None = None 29 | 30 | CACHE_DIR: PathIsh | None = '' 31 | FILTERS: list[str] = [] 32 | 33 | HOOK: HookT | None = None 34 | 35 | # 36 | # NOTE: INDEXERS is deprecated, use SOURCES instead 37 | INDEXERS: list[ConfigSource] = [] 38 | #MIME_HANDLER: Optional[str] = None # TODO 39 | 40 | @property 41 | def sources(self) -> Iterable[Res[Source]]: 42 | if len(self.INDEXERS) > 0: 43 | warnings.warn("'INDEXERS' is deprecated. Please use 'SOURCES'!", DeprecationWarning) 44 | 45 | raw = self.SOURCES + self.INDEXERS 46 | 47 | if len(raw) == 0: 48 | raise RuntimeError("Please specify SOURCES in the config! See https://github.com/karlicoss/promnesia#setup for more information") 49 | 50 | for r in raw: 51 | if isinstance(r, ModuleName): 52 | try: 53 | r = importlib.import_module(r) 54 | except ModuleNotFoundError as e: 55 | # todo better error reporting? 
56 | yield e 57 | continue 58 | 59 | if isinstance(r, Source): 60 | yield r 61 | else: 62 | # otherwise Source object can take care of the module we passed 63 | # (see SourceIsh) 64 | yield Source(r) 65 | 66 | @property 67 | def cache_dir(self) -> Path | None: 68 | # TODO we used to use this for cachew, but it's best to rely on HPI modules etc to cofigure this 69 | # keeping just in case for now 70 | cd = self.CACHE_DIR 71 | cpath: Path | None 72 | if cd is None: 73 | cpath = None # means 'disabled' in cachew 74 | elif cd == '': # meh.. but need to make it None friendly.. 75 | cpath = default_cache_dir() 76 | else: 77 | cpath = Path(cd) 78 | if cpath is not None: 79 | cpath.mkdir(exist_ok=True, parents=True) 80 | return cpath 81 | 82 | # TODO also tmp dir -- perhaps should be in cache or at least possible to specify in config? not sure if useful 83 | @property 84 | def output_dir(self) -> Path: 85 | odir = self.OUTPUT_DIR 86 | opath = default_output_dir() if odir is None else Path(odir) 87 | opath.mkdir(exist_ok=True, parents=True) 88 | return opath 89 | 90 | @property 91 | def db(self) -> Path: 92 | return self.output_dir / 'promnesia.sqlite' 93 | 94 | @property 95 | def hook(self) -> HookT | None: 96 | return self.HOOK 97 | 98 | instance: Config | None = None 99 | 100 | 101 | def has() -> bool: 102 | return instance is not None 103 | 104 | def get() -> Config: 105 | assert instance is not None, "Expected config to be set, but it's not" 106 | return instance 107 | 108 | 109 | def load_from(config_file: Path) -> None: 110 | global instance 111 | instance = import_config(config_file) 112 | 113 | 114 | def reset() -> None: 115 | global instance 116 | assert instance is not None 117 | instance = None 118 | 119 | 120 | def import_config(config_file: PathIsh) -> Config: 121 | p = Path(config_file) 122 | 123 | # todo just exec?? 124 | name = p.stem 125 | spec = importlib.util.spec_from_file_location(name, p); assert spec is not None 126 | mod = importlib.util.module_from_spec(spec); assert mod is not None 127 | loader = spec.loader; assert loader is not None 128 | loader.exec_module(mod) 129 | 130 | d = {} 131 | for f in Config._fields: 132 | if hasattr(mod, f): 133 | d[f] = getattr(mod, f) 134 | return Config(**d) 135 | 136 | 137 | # TODO: ugh. this causes warnings to be repeated multiple times... need to reuse the pool or something.. 138 | def use_cores() -> int | None: 139 | ''' 140 | Somewhat experimental. 141 | For now only used in sources.auto, perhaps later will be shared among the other indexers. 142 | ''' 143 | # most likely needs to be some sort of pipeline thing? 144 | cs = os.environ.get('PROMNESIA_CORES', None) 145 | if cs is None: 146 | return None 147 | try: 148 | return int(cs) 149 | except ValueError: # any other value means 'use all 150 | return 0 151 | 152 | 153 | def extra_fd_args() -> list[str]: 154 | ''' 155 | Not sure where it belongs yet... so via env variable for now 156 | Can be used to pass --ignore-file parameter 157 | ''' 158 | v = os.environ.get('PROMNESIA_FD_EXTRA_ARGS', '') 159 | extra = v.split() # eh, hopefully splitting that way is ok... 
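# e.g. PROMNESIA_FD_EXTRA_ARGS='--ignore-file /path/to/fdignore' results in ['--ignore-file', '/path/to/fdignore']
# (the flag comes from the docstring above; the path is just an example value)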
160 | return extra 161 | -------------------------------------------------------------------------------- /src/promnesia/database/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Sequence 4 | from datetime import datetime 5 | 6 | from sqlalchemy import ( 7 | Column, 8 | Integer, 9 | String, 10 | ) 11 | 12 | # TODO maybe later move DbVisit here completely? 13 | # kinda an issue that it's technically an "api" because hook in config can patch up DbVisit 14 | from ..common import DbVisit, Loc 15 | 16 | 17 | def get_columns() -> Sequence[Column]: 18 | # fmt: off 19 | res: Sequence[Column] = [ 20 | Column('norm_url' , String()), 21 | Column('orig_url' , String()), 22 | Column('dt' , String()), 23 | Column('locator_title', String()), 24 | Column('locator_href' , String()), 25 | Column('src' , String()), 26 | Column('context' , String()), 27 | Column('duration' , Integer()) 28 | ] 29 | # fmt: on 30 | assert len(res) == len(DbVisit._fields) + 1 # +1 because Locator is 'flattened' 31 | return res 32 | 33 | 34 | def db_visit_to_row(v: DbVisit) -> tuple: 35 | # ugh, very hacky... 36 | # we want to make sure the resulting tuple only consists of simple types 37 | # so we can use dbengine directly 38 | dt_s = v.dt.isoformat() 39 | row = ( 40 | v.norm_url, 41 | v.orig_url, 42 | dt_s, 43 | v.locator.title, 44 | v.locator.href, 45 | v.src, 46 | v.context, 47 | v.duration, 48 | ) 49 | return row 50 | 51 | 52 | def row_to_db_visit(row: Sequence) -> DbVisit: 53 | (norm_url, orig_url, dt_s, locator_title, locator_href, src, context, duration) = row 54 | dt_s = dt_s.split()[0] # backwards compatibility: previously it could be a string separated with tz name 55 | dt = datetime.fromisoformat(dt_s) 56 | return DbVisit( 57 | norm_url=norm_url, 58 | orig_url=orig_url, 59 | dt=dt, 60 | locator=Loc( 61 | title=locator_title, 62 | href=locator_href, 63 | ), 64 | src=src, 65 | context=context, 66 | duration=duration, 67 | ) 68 | -------------------------------------------------------------------------------- /src/promnesia/database/load.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | from sqlalchemy import ( 6 | Engine, 7 | Index, 8 | MetaData, 9 | Table, 10 | create_engine, 11 | exc, 12 | ) 13 | 14 | from .common import DbVisit, get_columns, row_to_db_visit 15 | 16 | DbStuff = tuple[Engine, Table] 17 | 18 | 19 | def get_db_stuff(db_path: Path) -> DbStuff: 20 | assert db_path.exists(), db_path 21 | # todo how to open read only? 22 | # actually not sure if we can since we are creating an index here 23 | engine = create_engine(f'sqlite:///{db_path}') # , echo=True) 24 | 25 | meta = MetaData() 26 | table = Table('visits', meta, *get_columns()) 27 | 28 | idx = Index('index_norm_url', table.c.norm_url) 29 | try: 30 | idx.create(bind=engine) 31 | except exc.OperationalError as e: 32 | if 'already exists' in str(e): 33 | # meh, but no idea how to check it properly... 34 | pass 35 | else: 36 | raise e 37 | 38 | # NOTE: apparently it's ok to open connection on every request? 
at least my comparisons didn't show anything 39 | return engine, table 40 | 41 | 42 | def get_all_db_visits(db_path: Path) -> list[DbVisit]: 43 | # NOTE: this is pretty inefficient if the DB is huge 44 | # mostly intended for tests 45 | engine, table = get_db_stuff(db_path) 46 | query = table.select() 47 | with engine.connect() as conn: 48 | res = [row_to_db_visit(row) for row in conn.execute(query)] 49 | engine.dispose() 50 | return res 51 | -------------------------------------------------------------------------------- /src/promnesia/extract.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import re 4 | from collections.abc import Iterable, Sequence 5 | from functools import lru_cache 6 | 7 | from .cannon import CanonifyException 8 | from .common import ( 9 | DbVisit, 10 | Filter, 11 | Res, 12 | Results, 13 | Source, 14 | SourceName, 15 | Url, 16 | Visit, 17 | logger, 18 | ) 19 | 20 | DEFAULT_FILTERS = ( 21 | r'^chrome-\w+://', 22 | r'chrome://newtab', 23 | r'chrome://apps', 24 | r'chrome://history', 25 | r'^about:', 26 | r'^blob:', 27 | r'^view-source:', 28 | 29 | r'^content:', 30 | ) 31 | 32 | 33 | # TODO maybe move these to configs? 34 | @lru_cache(1) #meh, not sure what would happen under tests? 35 | def filters() -> Sequence[Filter]: 36 | from . import config 37 | 38 | flt = list(DEFAULT_FILTERS) 39 | if config.has(): # meeeh... 40 | cfg = config.get() 41 | flt.extend(cfg.FILTERS) 42 | return tuple(make_filter(f) for f in flt) 43 | 44 | 45 | def extract_visits(source: Source, *, src: SourceName) -> Iterable[Res[DbVisit]]: 46 | extractor = source.extractor 47 | logger.info('extracting via %s ...', source.description) 48 | 49 | try: 50 | vit: Results = extractor() 51 | except Exception as e: 52 | # todo critical error? 53 | # cause that means error during binding extractor args 54 | logger.exception(e) 55 | yield e 56 | return 57 | 58 | handled: set[Visit] = set() 59 | try: 60 | for p in vit: 61 | if isinstance(p, Exception): 62 | # todo not sure if need it at all? 63 | # parts = ['indexer emitted exception\n'] 64 | # eh, exception type is ignored by format_exception completely, apparently?? 65 | # parts.extend(traceback.format_exception(Exception, p, p.__traceback__)) 66 | # logger.error(''.join(parts)) 67 | yield p 68 | continue 69 | 70 | if p in handled: # no need to emit duplicates 71 | continue 72 | handled.add(p) 73 | 74 | yield from as_db_visit(p, src=src) 75 | except Exception as e: 76 | # todo critical error? 77 | logger.exception(e) 78 | yield e 79 | 80 | 81 | logger.info('extracting via %s: got %d visits', source.description, len(handled)) 82 | 83 | 84 | def as_db_visit(v: Visit, *, src: SourceName) -> Iterable[Res[DbVisit]]: 85 | if filtered(v.url): 86 | return 87 | res = DbVisit.make(v, src=src) 88 | if isinstance(res, CanonifyException): 89 | # todo not sure if need this log? either way maybe get rid of canonify exception and just yield up 90 | logger.error('error while canonnifying %s... 
ignoring', v) 91 | logger.exception(res) 92 | yield res 93 | 94 | 95 | def filtered(url: Url) -> bool: 96 | return any(f(url) for f in filters()) 97 | 98 | 99 | def make_filter(thing: str | Filter) -> Filter: 100 | if isinstance(thing, str): 101 | rc = re.compile(thing) 102 | def filter_(u: str) -> bool: 103 | return rc.search(u) is not None 104 | return filter_ 105 | else: # must be predicate 106 | return thing 107 | -------------------------------------------------------------------------------- /src/promnesia/misc/__init__.pyi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/misc/__init__.pyi -------------------------------------------------------------------------------- /src/promnesia/misc/config_example.py: -------------------------------------------------------------------------------- 1 | from promnesia.common import Source 2 | from promnesia.sources import auto 3 | 4 | ''' 5 | List of sources to use. 6 | 7 | You can specify your own, add more sources, etc. 8 | See https://github.com/karlicoss/promnesia#setup for more information 9 | ''' 10 | SOURCES = [ 11 | Source( 12 | auto.index, 13 | # just some arbitrary directory with plaintext files 14 | '/usr/share/vim/', 15 | ) 16 | ] 17 | -------------------------------------------------------------------------------- /src/promnesia/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/py.typed -------------------------------------------------------------------------------- /src/promnesia/sources/auto_logseq.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import urllib.parse 3 | 4 | 5 | def logseq_replacer(path: str, root: str) -> str: 6 | if not path.startswith("editor://") or not (path.endswith((".md", ".org"))): 7 | return path 8 | 9 | graph = os.path.basename(root) # noqa: PTH119 10 | page_name = os.path.basename(path).rsplit('.', 1)[0] # noqa: PTH119 11 | encoded_page_name = urllib.parse.quote(page_name) 12 | 13 | uri = f"logseq://graph/{graph}?page={encoded_page_name}" 14 | 15 | return uri 16 | -------------------------------------------------------------------------------- /src/promnesia/sources/auto_obsidian.py: -------------------------------------------------------------------------------- 1 | def obsidian_replacer(p: str, r: str) -> str: 2 | if not p.startswith("editor://") or not p.endswith('.md'): 3 | return p 4 | 5 | path = p.split('/', 2)[-1] 6 | 7 | uri = f"obsidian://{path}" 8 | return uri 9 | -------------------------------------------------------------------------------- /src/promnesia/sources/browser.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers. 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | import re 8 | import warnings 9 | from collections.abc import Iterator 10 | from typing import TYPE_CHECKING, Any 11 | 12 | from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger 13 | 14 | 15 | def index(p: PathIsh | None = None) -> Results: 16 | from . 
import hpi # noqa: F401,I001 17 | 18 | if p is None: 19 | from my.browser.all import history 20 | yield from _index_new(history()) 21 | return 22 | 23 | warnings.warn( 24 | f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. ' 25 | f'See https://github.com/purarue/browserexport#hpi .' 26 | f'Will try to hack path to browser databases {p} into HPI config.' 27 | ) 28 | try: 29 | yield from _index_new_with_adhoc_config(path=p) 30 | except Exception as e: 31 | logger.exception(e) 32 | warnings.warn("Hacking my.config.browser.export didn't work. You probably need to update HPI.") 33 | else: 34 | return 35 | 36 | logger.warning("Falling back onto legacy promnesia.sources.browser_legacy module") 37 | yield from _index_old(path=p) 38 | 39 | 40 | def _index_old(*, path: PathIsh) -> Results: 41 | from . import browser_legacy 42 | 43 | yield from browser_legacy.index(path) 44 | 45 | 46 | def _index_new_with_adhoc_config(*, path: PathIsh) -> Results: 47 | from . import hpi # noqa: F401,I001 48 | 49 | ## previously, it was possible to index be called with multiple different db search paths 50 | ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time 51 | ## so we hack cachew path so it's different for each call 52 | from my.core.core_config import config as hpi_core_config 53 | hpi_cache_dir = hpi_core_config.get_cache_dir() 54 | sanitized_path = re.sub(r'\W', '_', str(path)) 55 | cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path 56 | ## 57 | 58 | from my.core.common import Paths, classproperty, get_files 59 | class config: 60 | class core: 61 | cache_dir = cache_override 62 | 63 | class browser: 64 | class export: 65 | @classproperty 66 | def export_path(cls) -> Paths: 67 | return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)]) 68 | 69 | from my.core.cfg import tmp_config 70 | with tmp_config(modules='my.browser.export|my.core.core_config', config=config): 71 | from my.browser.export import history 72 | yield from _index_new(history()) 73 | 74 | 75 | if TYPE_CHECKING: 76 | from browserexport.merge import Visit as BrowserMergeVisit 77 | else: 78 | BrowserMergeVisit = Any 79 | 80 | 81 | def _index_new(history: Iterator[BrowserMergeVisit]) -> Results: 82 | for v in history: 83 | desc: str | None = None 84 | duration: Second | None = None 85 | metadata = v.metadata 86 | if metadata is not None: 87 | desc = metadata.title 88 | duration = metadata.duration 89 | yield Visit( 90 | url=v.url, 91 | dt=v.dt, 92 | locator=Loc(title=desc or v.url, href=v.url), 93 | duration=duration, 94 | ) 95 | -------------------------------------------------------------------------------- /src/promnesia/sources/demo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A dummy source, used for testing 3 | Generates a sequence of fake evenly separated visits 4 | ''' 5 | 6 | from __future__ import annotations 7 | 8 | from datetime import datetime, timedelta 9 | 10 | from promnesia.common import Loc, Results, Visit 11 | 12 | IsoFormatDt = str 13 | Seconds = int 14 | 15 | 16 | # TODO allow passing isoformat string as base_dt? 17 | # and maybe something similar as delta? 
start with seconds maybe 18 | def index( 19 | count: int = 100, 20 | *, 21 | base_dt: datetime | IsoFormatDt = datetime.min + timedelta(days=5000), 22 | delta: timedelta | Seconds = timedelta(hours=1), 23 | ) -> Results: 24 | 25 | base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt) 26 | delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta) 27 | 28 | # todo with some errors too? 29 | # todo use data generation library suggested for HPI? 30 | for i in range(count): 31 | yield Visit( 32 | url=f'https://demo.com/page{i}.html', 33 | dt=base_dt_ + delta_ * i, 34 | locator=Loc.make('demo'), 35 | ) 36 | # todo add context? 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/fbmessenger.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for the messages data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | from my.fbmessenger import messages 11 | 12 | for m in messages(): 13 | if isinstance(m, Exception): 14 | yield m 15 | continue 16 | text = m.text 17 | if text is None: 18 | continue 19 | urls = extract_urls(text) 20 | if len(urls) == 0: 21 | continue 22 | 23 | # TODO m.author would be niceneeds to be implemented in fbmessenger model 24 | loc = Loc.make( 25 | title=f'chat with {m.thread.name}', 26 | # eh, not all threads have nicknames, and not sure how to extract reliably 27 | href=f'https://www.messenger.com/t/{m.thread.thread_id}', 28 | ) 29 | for u in urls: 30 | yield Visit( 31 | url=u, 32 | dt=m.dt, 33 | context=m.text, 34 | locator=loc, 35 | ) 36 | 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/filetypes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterable, Sequence 4 | from functools import lru_cache 5 | from pathlib import Path 6 | from typing import Callable, NamedTuple, Union 7 | 8 | from ..common import Results, Url 9 | 10 | # TODO doesn't really belong here... 11 | Ctx = Sequence[str] 12 | 13 | class EUrl(NamedTuple): 14 | url: Url 15 | ctx: Ctx # TODO ctx here is more like a Loc 16 | ### 17 | 18 | 19 | # keys are mime types + extensions 20 | Ex = Callable[[Path], Union[Results, Iterable[EUrl]]] 21 | # None means unhandled 22 | TYPE2IDX: dict[str, Ex | None] = {} 23 | # NOTE: there are some types in auto.py at the moment... it's a bit messy 24 | 25 | 26 | # TYPE2IDX only contains the 'prefixes', to speed up the lookup we are using cache.. 27 | @lru_cache(None) 28 | def type2idx(t: str) -> Ex | None: 29 | if len(t) == 0: 30 | return None # just in case? 31 | # first try exact match 32 | e = TYPE2IDX.get(t, None) 33 | if e is not None: 34 | return e 35 | t = t.strip('.') 36 | e = TYPE2IDX.get(t, None) 37 | if e is not None: 38 | return e 39 | # otherwise, try prefixes? 40 | for k, v in TYPE2IDX.items(): 41 | if t.strip('.').startswith(k): 42 | return v 43 | return None 44 | 45 | # for now source code just indexed with grep, not sure if it's good enough? 46 | # if not, some fanceir library could be used... 47 | # e.g. https://github.com/karlicoss/promnesia/pull/152/commits/c2f00eb4ee4018b02c9bf3966a036db69a43373d 48 | 49 | # TODO use this list? 
50 | # https://github.com/GerritCodeReview/gerrit/blob/master/resources/com/google/gerrit/server/mime/mime-types.properties 51 | # later these might do something clever, e.g. stripping off code comments etc? 52 | CODE = { 53 | 'text/x-java', 54 | 'text/x-tex', 55 | 'text/x-sh', 56 | 'text/x-haskell', 57 | 'text/x-perl', 58 | 'text/x-python', 'text/x-script.python', 59 | 'text/x-chdr', 60 | 'text/x-csrc', 61 | 'text/x-c', 62 | 'text/x-c++', 63 | 'text/x-makefile', 64 | 'text/troff', 65 | 'text/x-asm', 66 | 'text/x-objective-c', 67 | 'text/x-lisp', 68 | 'text/vnd.graphviz', 69 | 'text/x-diff', # patch files 70 | 'text/x-php', 71 | 'text/x-lilypond', 72 | 73 | # these didn't have a mime type, or were mistyped? 74 | 'css', 75 | 'el', 76 | 'rs', 77 | 'go', 78 | 'hs', # mistyped on osx 79 | 'hpp', # mistyped on osx 80 | 81 | 'edn', # clojure data 82 | 83 | '.ts', # most likely typescript.. otherwise determined as text/vnd.trolltech.linguist mime 84 | '.js', 85 | } 86 | # TODO discover more extensions with mimetypes library? 87 | 88 | 89 | BINARY = ''' 90 | # epub was failing to detect via mime on CI for some reason.. 91 | epub 92 | inode/x-empty 93 | .sqlite 94 | # comment 95 | application/ 96 | image/ 97 | audio/ 98 | video/ 99 | ''' 100 | 101 | handle_later = lambda *_args, **_kwargs: () 102 | 103 | def ignore(*_args, **_kwargs): 104 | # TODO log (once?) 105 | yield from () 106 | 107 | 108 | for x in BINARY.splitlines(): 109 | x = x.strip() 110 | if len(x) == 0 or x[0] == '#': 111 | continue 112 | TYPE2IDX[x] = ignore 113 | 114 | 115 | TYPE2IDX.update({ 116 | '.xslx': ignore, 117 | '.vcf' : ignore, 118 | 'message/rfc822': ignore, # ?? 119 | 120 | # todo ignore all fonts? 121 | 'font/woff2': ignore, 122 | 'font/woff': ignore, 123 | 'text/x-Algol68': ignore, # ugh some license file had this?? maybe always index text/ as text? 124 | 'text/x-bytecode.python': ignore, # todo ignore all x-bytecode? 125 | 'text/calendar': ignore, 126 | 127 | # TODO not sure what to do about these.. 128 | 'application/octet-stream': handle_later, 129 | 'application/zip' : handle_later, 130 | 'application/x-tar' : handle_later, 131 | 'application/gzip' : handle_later, 132 | }) 133 | 134 | 135 | # TODO use some existing file for initial gitignore.. 136 | IGNORE = [ 137 | '.idea', 138 | 'venv', 139 | '.git', 140 | '.eggs', 141 | '.mypy_cache', 142 | '.pytest_cache', 143 | 'node_modules', 144 | '__pycache__', 145 | '.tox', 146 | '.stack-work', 147 | 148 | # TODO not sure about these: 149 | '.gitignore', 150 | '.babelrc', 151 | ] 152 | 153 | -------------------------------------------------------------------------------- /src/promnesia/sources/github.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] github module 3 | ''' 4 | from __future__ import annotations 5 | 6 | # Note: requires the 'mistletoe' module if you enable render_markdown 7 | from promnesia.common import Loc, Results, Visit, iter_urls, logger 8 | 9 | 10 | def index(*, render_markdown: bool = False) -> Results: 11 | from . import hpi # noqa: F401,I001 12 | from my.github.all import events 13 | 14 | if render_markdown: 15 | try: 16 | from .markdown import TextParser, extract_from_text 17 | except ImportError as import_err: 18 | logger.exception(import_err) 19 | logger.critical("Could not import markdown module to render github body markdown. 
Try 'python3 -m pip install mistletoe'") 20 | render_markdown = False 21 | 22 | for e in events(): 23 | if isinstance(e, Exception): 24 | yield e 25 | continue 26 | if e.link is None: 27 | continue 28 | 29 | # if enabled, convert the (markdown) body to HTML 30 | context: str | None = e.body 31 | if e.body is not None and render_markdown: 32 | context = TextParser(e.body)._doc_ashtml() # type: ignore[possibly-undefined] 33 | 34 | # locator should link back to this event 35 | loc = Loc.make(title=e.summary, href=e.link) 36 | 37 | # visit which links back to this event in particular 38 | yield Visit( 39 | url=e.link, 40 | dt=e.dt, 41 | context=context, 42 | locator=loc, 43 | ) 44 | 45 | for url in iter_urls(e.summary): 46 | yield Visit( 47 | url=url, 48 | dt=e.dt, 49 | context=context, 50 | locator=loc, 51 | ) 52 | 53 | if e.body is None: 54 | continue 55 | 56 | # extract any links found in the body 57 | # 58 | # Note: this set gets reset every event, is here to 59 | # prevent duplicates between URLExtract and the markdown parser 60 | emitted: set[str] = set() 61 | for url in iter_urls(e.body): 62 | if url in emitted: 63 | continue 64 | yield Visit( 65 | url=url, 66 | dt=e.dt, 67 | context=context, 68 | locator=loc, 69 | ) 70 | emitted.add(url) 71 | 72 | # extract from markdown links like [link text](https://...) 73 | # incase URLExtract missed any somehow 74 | if render_markdown: 75 | for res in extract_from_text(e.body): # type: ignore[possibly-undefined] 76 | if isinstance(res, Exception): 77 | yield res 78 | continue 79 | if res.url in emitted: 80 | continue 81 | yield Visit( 82 | url=res.url, 83 | dt=e.dt, 84 | context=context, 85 | locator=loc, 86 | ) 87 | emitted.add(res.url) 88 | -------------------------------------------------------------------------------- /src/promnesia/sources/guess.py: -------------------------------------------------------------------------------- 1 | # TODO eh. confusing how guess and auto are different... 2 | # maybe merge them later? 3 | from collections.abc import Iterable 4 | from typing import Any 5 | 6 | from ..common import Extraction, PathIsh 7 | 8 | 9 | def is_git_repo(p: str) -> bool: 10 | if '://github.com/' in p: 11 | return True 12 | return False 13 | 14 | 15 | def is_website(p: str) -> bool: 16 | if p.startswith('http'): 17 | return True 18 | return False 19 | 20 | 21 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 22 | ps = str(path) 23 | # TODO better url detection 24 | 25 | index_: Any # meh 26 | if is_git_repo(ps): 27 | from . import vcs 28 | index_ = vcs.index 29 | elif is_website(ps): 30 | from . import website 31 | index_ = website.index 32 | else: 33 | from . import auto 34 | index_ = auto.index 35 | yield from index_(path, *args, **kwargs) 36 | -------------------------------------------------------------------------------- /src/promnesia/sources/hackernews.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerNews items. 3 | ''' 4 | 5 | import textwrap 6 | 7 | from promnesia.common import Loc, Results, Visit 8 | 9 | 10 | def index() -> Results: 11 | from . 
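# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# How sources/guess.py (above) decides which indexer to dispatch to: github URLs go
# to sources.vcs, other http(s) URLs to sources.website, anything else to
# sources.auto. The helper name is made up, for illustration only.
def _guess_dispatch_example() -> None:
    from promnesia.sources.guess import is_git_repo, is_website

    assert is_git_repo('https://github.com/karlicoss/promnesia')   # -> vcs.index
    assert is_website('https://example.com/some/page')             # -> website.index
    assert not is_git_repo('/home/user/notes')
    assert not is_website('/home/user/notes')                      # -> auto.index
# ---------------------------------------------------------------------------------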
import hpi # noqa: F401,I001 12 | from my.hackernews import dogsheep 13 | 14 | for item in dogsheep.items(): 15 | if isinstance(item, Exception): 16 | yield item 17 | continue 18 | hn_url = item.permalink 19 | title = "hackernews" 20 | if item.title: 21 | title = item.title 22 | elif item.text_html: 23 | title = item.text_html 24 | title = textwrap.shorten( 25 | title, width=79, placeholder="…", 26 | break_long_words=True) 27 | # The locator is always the HN story. If the story is a link (as 28 | # opposed to a text post), we insert a visit such that the link 29 | # will point back to the corresponding HN story. 30 | loc = Loc.make(title=title, href=hn_url) 31 | urls = [hn_url] 32 | if item.url is not None: 33 | urls.append(item.url) 34 | for url in urls: 35 | yield Visit( 36 | url=url, 37 | dt=item.created, 38 | locator=loc, 39 | context=title, 40 | ) 41 | -------------------------------------------------------------------------------- /src/promnesia/sources/hpi.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Just a helper for a more humane error message when importing my.* dependencies 3 | ''' 4 | 5 | from promnesia.common import logger 6 | 7 | try: 8 | import my # noqa: F401 9 | except ImportError as e: 10 | logger.exception(e) 11 | logger.critical("Failed during 'import my'. You probably need to install & configure HPI package first (see 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org')") 12 | -------------------------------------------------------------------------------- /src/promnesia/sources/html.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extracts links from HTML files 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | from collections.abc import Iterator 8 | from pathlib import Path 9 | 10 | from bs4 import BeautifulSoup 11 | 12 | from promnesia.common import Loc, PathIsh, Results, Visit, file_mtime 13 | 14 | # TODO present error summary in the very end; import errors -- makes sense to show 15 | # TODO on some exceptions, request a fallback to text? 16 | 17 | 18 | Url = tuple[str, str] 19 | 20 | 21 | def extract_urls_from_html(s: str) -> Iterator[Url]: 22 | """ 23 | Helper method to extract URLs from any HTML, so this could 24 | potentially be used by other modules 25 | """ 26 | soup = BeautifulSoup(s, 'lxml') 27 | for a in soup.find_all('a'): 28 | href = a.attrs.get('href') 29 | if href is None or ('://' not in href): 30 | # second condition means relative link 31 | continue 32 | text = a.text 33 | yield (href, text) 34 | 35 | 36 | def extract_from_file(fname: PathIsh) -> Results: 37 | ts = file_mtime(fname) 38 | 39 | for href, text in extract_urls_from_html(Path(fname).read_text(errors='replace')): 40 | yield Visit( 41 | url=href, 42 | dt=ts, 43 | locator=Loc.file(fname), 44 | context=text, 45 | ) 46 | -------------------------------------------------------------------------------- /src/promnesia/sources/hypothesis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myhypothesis][hypothesis]] module 3 | """ 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls, join_tags 6 | 7 | 8 | def index() -> Results: 9 | from . 
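# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What extract_urls_from_html (in sources/html.py above) yields: (href, link text)
# pairs for anchors with absolute URLs; relative links are skipped. Requires
# bs4/lxml, just like the module itself. The helper name is made up.
def _extract_urls_from_html_example() -> None:
    from promnesia.sources.html import extract_urls_from_html

    snippet = '<p><a href="https://example.com">example</a> <a href="/relative">skip me</a></p>'
    assert list(extract_urls_from_html(snippet)) == [('https://example.com', 'example')]
# ---------------------------------------------------------------------------------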
import hpi # noqa: F401,I001 10 | import my.hypothesis as hyp 11 | 12 | for h in hyp.highlights(): 13 | if isinstance(h, Exception): 14 | yield h 15 | continue 16 | hl = h.highlight 17 | ann = h.annotation 18 | tags = h.tags 19 | cparts = [] 20 | if hl is not None: 21 | cparts.append(hl) 22 | if ann is not None: 23 | cparts.append(f"comment: {ann}") 24 | if tags: 25 | cparts.append(join_tags(tags)) 26 | visit = Visit( 27 | url=h.url, 28 | dt=h.created, 29 | context="\n\n".join(cparts), 30 | locator=Loc.make( 31 | title="hypothesis", 32 | href=h.hyp_link, 33 | ), 34 | ) 35 | 36 | yield visit 37 | 38 | in_text_visits = ( 39 | (hl, "highlighted"), 40 | (ann, "comment"), 41 | ) 42 | for text, part_name in in_text_visits: 43 | if text and text.strip(): 44 | urls = extract_urls(text) 45 | for url in urls: 46 | yield visit._replace( 47 | url=url, 48 | locator=visit.locator._replace(title=f"hypothesis-{part_name}"), 49 | ) 50 | -------------------------------------------------------------------------------- /src/promnesia/sources/instapaper.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myinstapaper][instapaper]] module 3 | ''' 4 | from promnesia.common import Loc, Results, Visit 5 | 6 | 7 | def index() -> Results: 8 | from . import hpi # noqa: F401,I001 9 | import my.instapaper as ip 10 | 11 | for p in ip.pages(): 12 | bm = p.bookmark 13 | hls = p.highlights 14 | 15 | if len(hls) == 0: 16 | yield Visit( 17 | url=bm.url, 18 | dt=bm.dt, 19 | context=None, 20 | locator=Loc.make(title='instapaper', href=bm.instapaper_link), 21 | ) 22 | else: 23 | for hl in p.highlights: 24 | cparts = [hl.text] 25 | if hl.note is not None: 26 | cparts.append('comment: ' + hl.note) 27 | yield Visit( 28 | url=bm.url, 29 | dt=hl.dt, 30 | context='\n'.join(cparts), 31 | locator=Loc.make(title='instapaper', href=hl.instapaper_link), 32 | ) 33 | -------------------------------------------------------------------------------- /src/promnesia/sources/markdown.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections.abc import Iterator 4 | from pathlib import Path 5 | from typing import NamedTuple 6 | 7 | import mistletoe # type: ignore[import-untyped] 8 | import mistletoe.block_token as BT # type: ignore[import-untyped] 9 | from mistletoe.html_renderer import HTMLRenderer # type: ignore[import-untyped] 10 | from mistletoe.span_token import AutoLink, Link # type: ignore[import-untyped] 11 | 12 | from promnesia.common import ( 13 | Extraction, 14 | Loc, 15 | PathIsh, 16 | Res, 17 | Url, 18 | Visit, 19 | file_mtime, 20 | logger, 21 | ) 22 | 23 | renderer = HTMLRenderer() 24 | 25 | 26 | block_tokens = tuple(getattr(BT, name) for name in BT.__all__) 27 | 28 | 29 | class Parsed(NamedTuple): 30 | url: Url 31 | context: str | None 32 | 33 | 34 | Result = Res[Parsed] 35 | 36 | 37 | # the fuck... 38 | # 39 | # from mistletoe import Document 40 | # d = Document(''' 41 | # # heading 42 | # ## sub 43 | # ## sub2 44 | # ''') 45 | # d.children[0].content 46 | # Out[13]: 'sub2' 47 | 48 | # meh, but for now fine I guess 49 | HTML_MARKER = '!html ' 50 | 51 | 52 | def _ashtml(block) -> str: 53 | res = renderer.render(block) 54 | if res.startswith('

<p>') and res.endswith('</p>

'): 55 | res = res[3:-4] # meh, but for now fine 56 | return res 57 | 58 | 59 | class Parser: 60 | def __init__(self, path: Path): 61 | self.doc = mistletoe.Document(path.read_text()) 62 | 63 | def _extract(self, cur, last_block) -> Iterator[Parsed]: 64 | if not isinstance(cur, (AutoLink, Link)): 65 | # hopefully that's all?? 66 | return 67 | 68 | url = cur.target 69 | # TODO fuck. it doesn't preserve line numbers/positions in text??? 70 | 71 | # ugh. It can't output markdown.. https://github.com/miyuchina/mistletoe/issues/4 72 | context = None if last_block is None else HTML_MARKER + _ashtml(last_block) 73 | yield Parsed(url=url, context=context) 74 | 75 | def _walk(self, cur, last_block) -> Iterator[Result]: 76 | if isinstance(cur, block_tokens): 77 | last_block = cur 78 | 79 | try: 80 | yield from self._extract(cur, last_block) 81 | except Exception as e: 82 | logger.exception(e) 83 | yield e 84 | 85 | # keeping getattr for compatibility in older versions of mistletoe, it was optional 86 | children = getattr(cur, 'children', None) 87 | if children is None: 88 | return 89 | for c in children: 90 | yield from self._walk(c, last_block=last_block) 91 | 92 | def walk(self) -> Iterator[Result]: 93 | yield from self._walk(self.doc, last_block=None) 94 | 95 | 96 | def extract_from_file(fname: PathIsh) -> Iterator[Extraction]: 97 | path = Path(fname) 98 | fallback_dt = file_mtime(path) 99 | 100 | p = Parser(path) 101 | for r in p.walk(): 102 | if isinstance(r, Exception): 103 | yield r 104 | else: 105 | yield Visit( 106 | url=r.url, 107 | dt=fallback_dt, 108 | locator=Loc.file(fname), # TODO line number 109 | context=r.context, 110 | ) 111 | 112 | 113 | class TextParser(Parser): 114 | ''' 115 | Used to extract links/render markdown from text, e.g. reddit/github comments 116 | Instead of chunking blocks like for files, this returns the entire 117 | message rendered as the context 118 | ''' 119 | 120 | def __init__(self, text: str) -> None: 121 | self.doc = mistletoe.Document(text) 122 | 123 | def _doc_ashtml(self): 124 | ''' 125 | cached html representation of the entire html message/document 126 | ''' 127 | if not hasattr(self, '_html'): 128 | self._html = HTML_MARKER + _ashtml(self.doc) 129 | return self._html 130 | 131 | def _extract(self, cur, last_block=None) -> Iterator[Parsed]: # noqa: ARG002 132 | if not isinstance(cur, (AutoLink, Link)): 133 | return 134 | 135 | yield Parsed(url=cur.target, context=self._doc_ashtml()) 136 | 137 | 138 | def extract_from_text(text: str) -> Iterator[Result]: 139 | ''' 140 | assume this is rendering something like a github/reddit markdown message 141 | use the entire contents of the comment/body as the context 142 | ''' 143 | # note: returns Result (link/context), not Visit 144 | # the callee function has to insert dt/duration etc. 145 | yield from TextParser(text).walk() 146 | -------------------------------------------------------------------------------- /src/promnesia/sources/plaintext.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from functools import lru_cache 4 | from pathlib import Path 5 | 6 | from promnesia.common import PathIsh, _is_windows 7 | 8 | # https://linux-and-mac-hacks.blogspot.co.uk/2013/04/use-grep-and-regular-expressions-to.html 9 | _URL_REGEX = r'\b(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]*[-A-Za-z0-9+&@#/%=~_|]' 10 | 11 | if _is_windows: 12 | # wtf? for some reason on windows (in cmd.exe specificaly) \b isn't working... 
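# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What the _URL_REGEX above is meant to capture, and where it stops: the final
# character class excludes trailing punctuation such as ',' or '.', so it is not
# swallowed into the URL. The helper name is made up, for illustration only.
def _url_regex_example() -> None:
    import re

    from promnesia.sources.plaintext import _URL_REGEX

    m = re.search(_URL_REGEX, 'see https://example.com/page?a=1, maybe later')
    assert m is not None
    assert m.group(0) == 'https://example.com/page?a=1'   # trailing comma excluded
# ---------------------------------------------------------------------------------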
13 | # this will make the regex a bit less precise, but not end of the world 14 | _URL_REGEX = _URL_REGEX.removeprefix(r'\b') 15 | 16 | 17 | @lru_cache 18 | def _has_grep() -> bool: 19 | import shutil 20 | return shutil.which('grep') is not None 21 | 22 | 23 | Command = list[str] 24 | 25 | 26 | _GREP_ARGS: Command = [ 27 | '--color=never', 28 | '-H', # always show filename TODO not sure if works on osx 29 | '-n', # print line numbers (to restore context) 30 | '-I', # ignore binaries 31 | ] 32 | 33 | if not _is_windows: 34 | # exclude-dir not working on windows 35 | _GREP_ARGS += [ 36 | '--exclude-dir=".git"', 37 | ] 38 | 39 | # NOTE: grep/findstr exit with code 1 on no matches... 40 | # we hack around it in shellcmd module (search 'grep') 41 | def _grep(*, paths: list[str], recursive: bool) -> Command: 42 | return [ 43 | 'grep', 44 | *(['-r'] if recursive else []), 45 | *_GREP_ARGS, 46 | '-E', # 'extended' syntax 47 | _URL_REGEX, 48 | *paths, 49 | ] 50 | 51 | def _findstr(*, path: str, recursive: bool) -> Command: 52 | return [ 53 | 'findstr', 54 | '/S', 55 | '/P', 56 | '/N', 57 | 'https*://', 58 | path + (r'\*' if recursive else ''), 59 | ] 60 | 61 | 62 | # TODO unify these if it works?? 63 | def _extract_from_dir(path: str) -> Command: 64 | if _has_grep(): 65 | return _grep( 66 | paths=[path], 67 | recursive=True, 68 | ) 69 | elif _is_windows: 70 | return _findstr(path=path, recursive=True) 71 | else: 72 | raise RuntimeError("no grep; don't know which search tool to use!") 73 | 74 | 75 | def _extract_from_file(path: str) -> Command: 76 | if _is_windows and not _has_grep(): 77 | return _findstr(path=path, recursive=False) 78 | 79 | return _grep( 80 | paths=[path], 81 | recursive=False, 82 | ) 83 | 84 | 85 | def extract_from_path(path: PathIsh) -> Command: 86 | pp = Path(path) 87 | 88 | if pp.is_dir(): # TODO handle archives here??? 89 | return _extract_from_dir(str(pp)) 90 | 91 | if any(pp.suffix == ex for ex in ( 92 | '.xz', 93 | '.bz2', 94 | '.gz', 95 | '.zip', 96 | )): 97 | # todo should be debug? 98 | # or should delete it completely, feels like unpacking archives here is a bit too much 99 | raise RuntimeError(f"Archives aren't supported yet: {path}") 100 | # logger.info(f"Extracting from compressed file {path}") 101 | # import lzma 102 | # from tempfile import NamedTemporaryFile 103 | # # TODO hopefully, no collisions 104 | # import os.path 105 | # fname = os.path.join(tdir.name, os.path.basename(path)) 106 | # with open(fname, 'wb') as fo: 107 | # with lzma.open(path, 'r') as cf: 108 | # fo.write(cf.read()) 109 | # return _extract_from_file(fname) 110 | 111 | r = _extract_from_file(str(pp)) 112 | return r 113 | -------------------------------------------------------------------------------- /src/promnesia/sources/pocket.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Pocket highlights & bookmarks 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | from my.pocket import articles 11 | 12 | # TODO use docstring from my. module? E.g. describing which pocket format is expected 13 | 14 | for a in articles(): 15 | title = a.json.get('resolved_title', None) or a.json.get('given_title', 'pocket') 16 | loc = Loc.make(title=title, href=a.pocket_link) 17 | # Add a reverse locator so that the Promnesia browser extension shows a 18 | # link on the Pocket page back to the original URL. 
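# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# The intent of the unused _loc_rev below: keep two locators per article, one shown
# on the original page (pointing back to Pocket) and one that would be shown on the
# Pocket page (pointing back to the article). The URLs and the helper name here are
# made up, for illustration only.
def _pocket_locators_example() -> None:
    from promnesia.common import Loc

    loc = Loc.make(title='some article', href='https://getpocket.com/read/123')   # forward: to Pocket
    loc_rev = Loc.make(title='some article', href='https://example.com/article')  # reverse: to the article
    assert loc.href != loc_rev.href
# ---------------------------------------------------------------------------------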
19 | # FIXME need to actually use it 20 | _loc_rev = Loc.make(title=title, href=a.url) 21 | hls = a.highlights 22 | excerpt = a.json.get('excerpt', None) 23 | if len(hls) == 0: 24 | yield Visit( 25 | url=a.url, 26 | dt=a.added, 27 | context=excerpt, 28 | locator=loc, 29 | ) 30 | for hl in hls: 31 | yield Visit( 32 | url=a.url, 33 | dt=hl.created, 34 | context=hl.text, 35 | locator=loc, 36 | ) 37 | -------------------------------------------------------------------------------- /src/promnesia/sources/roamresearch.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Roam Research data 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | import my.roamresearch as RR 10 | roam = RR.roam() 11 | for node in roam.traverse(): 12 | yield from _collect(node) 13 | 14 | 15 | def _collect(node: 'RoamNode') -> Results: 16 | title = node.title 17 | body = node.body or '' 18 | if title is None: 19 | # most notes don't have title, so we just take the first line instead.. 20 | lines = body.splitlines(keepends=True) 21 | if len(lines) > 0: 22 | title = lines[0] 23 | body = ''.join(lines) 24 | title = title or '' 25 | 26 | full = title + '\n' + body 27 | 28 | urls = extract_urls(full) 29 | if len(urls) == 0: 30 | return 31 | 32 | loc = Loc.make( 33 | title=node.path, 34 | href=node.permalink, 35 | ) 36 | for u in urls: 37 | yield Visit( 38 | url=u, 39 | dt=node.created, 40 | context=body, 41 | locator=loc, 42 | ) 43 | 44 | 45 | import typing 46 | 47 | if typing.TYPE_CHECKING: 48 | import my.roamresearch as RR 49 | RoamNode = RR.Node 50 | -------------------------------------------------------------------------------- /src/promnesia/sources/rss.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data. 3 | ''' 4 | 5 | from datetime import datetime 6 | 7 | import pytz 8 | 9 | from promnesia.common import Loc, Results, Visit 10 | 11 | # arbitrary, 2011-11-04 00:05:23.283+00:00 12 | default_datetime = datetime.fromtimestamp(1320365123, tz=pytz.utc) 13 | # TODO FIXME allow for visit not to have datetime? 14 | # I.e. even having context is pretty good! 15 | 16 | def index() -> Results: 17 | from my.rss.all import subscriptions 18 | 19 | for feed in subscriptions(): 20 | # TODO locator should be optional too? although could use direct link in the rss reader interface 21 | locator = Loc.make(title='my.rss') 22 | yield Visit( 23 | url=feed.url, 24 | dt=feed.created_at or default_datetime, 25 | context='RSS subscription', # TODO use 'provider', etc? 26 | locator=locator, 27 | ) 28 | -------------------------------------------------------------------------------- /src/promnesia/sources/shellcmd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Greps out URLs from an arbitrary shell command results. 
3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | import os 8 | import re 9 | import warnings 10 | from collections.abc import Sequence 11 | from datetime import datetime 12 | from subprocess import PIPE, run 13 | 14 | from promnesia.common import ( 15 | Loc, 16 | PathIsh, 17 | Results, 18 | Visit, 19 | _is_windows, 20 | extract_urls, 21 | file_mtime, 22 | now_tz, 23 | ) 24 | 25 | from .plaintext import _has_grep 26 | 27 | 28 | def index(command: str | Sequence[PathIsh]) -> Results: 29 | cmd: Sequence[PathIsh] 30 | cmds: str 31 | if isinstance(command, str): 32 | cmds = command 33 | warnings.warn("Passing string as a command is very fragile('{command}'). Please use list instead.") 34 | cmd = command.split(' ') 35 | else: 36 | cmds = ' '.join(map(str, command)) 37 | cmd = command 38 | 39 | # ugh... on windows grep does something nasty? e.g: 40 | # grep --color=never -r -H -n -I -E http 'D:\\a\\promnesia\\promnesia\\tests\\testdata\\custom' 41 | # D:\a\promnesia\promnesia\tests\testdata\custom/file1.txt:1:Right, so this points at http://google.com 42 | # so part of the path has fwd slashes, part has bwd slashes... 43 | needs_windows_grep_patching = _has_grep() and _is_windows 44 | 45 | def handle_line(line: str) -> Results: 46 | # grep dumps this as 47 | # /path/to/file:lineno:rest 48 | # note: on Windows, path contains : after the disk name.. 49 | m = re.search(r'(.*?):(\d+?):(.*)', line) 50 | if m is None: 51 | # todo warn maybe? 52 | fname = None 53 | lineno = None 54 | else: 55 | fname = m.group(1) 56 | lineno = int(m.group(2)) 57 | line = m.group(3) 58 | 59 | if fname is not None and needs_windows_grep_patching: 60 | fname = fname.replace('/', os.sep) 61 | 62 | urls = extract_urls(line) 63 | if len(urls) == 0: 64 | return 65 | 66 | context = line 67 | 68 | ts: datetime 69 | loc: Loc 70 | if fname is not None: 71 | ts = file_mtime(fname) 72 | loc = Loc.file(fname, line=lineno) 73 | else: 74 | ts = now_tz() 75 | loc = Loc.make(cmds) 76 | for url in urls: 77 | yield Visit( 78 | url=url, 79 | dt=ts, 80 | locator=loc, 81 | context=context, 82 | ) 83 | 84 | r = run(cmd, stdout=PIPE, check=False) 85 | if r.returncode > 0: 86 | if not (cmd[0] in {'grep', 'findstr'} and r.returncode == 1): # ugh. grep returns 1 on no matches... 87 | r.check_returncode() 88 | output = r.stdout 89 | assert output is not None 90 | lines = [line.decode('utf-8') for line in output.splitlines()] 91 | for line in lines: 92 | try: 93 | yield from handle_line(line) 94 | except Exception as e: 95 | yield e 96 | -------------------------------------------------------------------------------- /src/promnesia/sources/smscalls.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] smscalls module 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, extract_urls 6 | 7 | 8 | def index() -> Results: 9 | from . 
import hpi # noqa: F401,I001 10 | from my.smscalls import messages 11 | 12 | for m in messages(): 13 | 14 | if isinstance(m, Exception): 15 | yield m 16 | continue 17 | 18 | urls = extract_urls(m.message) 19 | if len(urls) == 0: 20 | continue 21 | 22 | if m.who is None: 23 | loc = Loc(title=f"SMS with {m.phone_number}") 24 | else: 25 | loc = Loc(title=f"SMS with {m.who} ({m.phone_number})") 26 | 27 | for u in urls: 28 | yield Visit( 29 | url=u, 30 | dt=m.dt, 31 | context=m.message, 32 | locator=loc, 33 | ) 34 | -------------------------------------------------------------------------------- /src/promnesia/sources/stackexchange.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | import my.stackexchange.gdpr as G 11 | 12 | for v in G.votes(): 13 | if isinstance(v, Exception): 14 | yield v 15 | else: 16 | yield Visit( 17 | url=v.link, 18 | dt=v.when, 19 | context='voted', # todo use the votetype? although maybe worth ignoring downvotes 20 | # or, downvotes could have 'negative' ranking or something 21 | locator=Loc.make(title='voted', href=v.link) 22 | ) 23 | -------------------------------------------------------------------------------- /src/promnesia/sources/takeout_legacy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from promnesia.common import Loc, Results, Visit, logger 4 | 5 | 6 | # TODO make an iterator, insert in db as we go? handle errors gracefully? 7 | def index() -> Results: 8 | from . import hpi # noqa: F401,I001 9 | from my.google.takeout.paths import get_takeouts 10 | 11 | takeouts = list(get_takeouts()) 12 | # TODO if no takeouts, raise? 13 | # although could raise a warning on top level, when source emitted no takeouts 14 | 15 | # TODO youtube? 16 | google_activities = [read_google_activity(t) for t in takeouts] 17 | search_activities = [read_search_activity(t) for t in takeouts] 18 | browser_histories = [read_browser_history_json(t) for t in takeouts] 19 | 20 | key = lambda v: (v.dt, v.url) 21 | return chain( 22 | unique_everseen(chain(*google_activities), key=key), 23 | unique_everseen(chain(*search_activities), key=key), 24 | unique_everseen(chain(*browser_histories), key=key), 25 | ) 26 | 27 | 28 | 29 | import json 30 | from collections.abc import Iterable 31 | from datetime import datetime 32 | from itertools import chain 33 | from pathlib import Path 34 | 35 | import pytz 36 | from more_itertools import unique_everseen 37 | 38 | from promnesia import config 39 | 40 | try: 41 | from cachew import cachew 42 | except ModuleNotFoundError as me: 43 | if me.name != 'cachew': 44 | raise me 45 | # this module is legacy anyway, so just make it defensive 46 | def cachew(*args, **kwargs): # type: ignore[no-redef] 47 | return lambda f: f 48 | 49 | 50 | # TODO use CPath? Could encapsulate a path within an archive *or* within a directory 51 | TakeoutPath = Path 52 | 53 | 54 | def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]: 55 | # FIXME switch to actual kompress? and use CPath? 56 | from my.core.kompress import kexists 57 | 58 | # TODO glob 59 | # TODO not sure about windows path separators?? 
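# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# For kind='Chrome/MyActivity.html' the line just below builds
# 'Takeout/My Activity/Chrome/MyActivity.html' inside the takeout archive.
# The (dt, url) deduplication used by index() above boils down to more_itertools'
# unique_everseen keeping the first occurrence. The helper name is made up.
def _dedup_example() -> None:
    from more_itertools import unique_everseen

    visits = [
        ('2020-01-01', 'https://a'),
        ('2020-01-01', 'https://a'),   # duplicate across takeouts -> dropped
        ('2020-01-02', 'https://a'),
    ]
    assert list(unique_everseen(visits)) == [
        ('2020-01-01', 'https://a'),
        ('2020-01-02', 'https://a'),
    ]
# ---------------------------------------------------------------------------------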
60 | spath = 'Takeout/My Activity/' + kind 61 | if not kexists(takeout, spath): 62 | logger.warning(f"{spath} is not present in {takeout}... skipping") 63 | return 64 | logger.info('processing %s %s', takeout, kind) 65 | 66 | locator = Loc.file(spath) 67 | from my.google.takeout.html import read_html 68 | for dt, url, _title in read_html(takeout, spath): 69 | yield Visit( 70 | url=url, 71 | dt=dt, 72 | locator=locator, 73 | debug=kind, 74 | ) 75 | 76 | def _cpath(suffix: str): 77 | def fun(takeout: TakeoutPath): 78 | cache_dir = config.get().cache_dir 79 | if cache_dir is None: 80 | return None 81 | # doesn't need a nontrivial hash function, timestsamp is encoded in name 82 | return cache_dir / (takeout.name + '_' + suffix + '.cache') 83 | return fun 84 | 85 | 86 | # todo caching should this be HPI responsibility? 87 | # todo set global cachew logging on init? 88 | @cachew(cache_path=_cpath('google_activity') , logger=logger) 89 | def read_google_activity(takeout: TakeoutPath) -> Iterable[Visit]: 90 | return _read_myactivity_html(takeout, 'Chrome/MyActivity.html') 91 | 92 | @cachew(cache_path=_cpath('search_activity') , logger=logger) 93 | def read_search_activity(takeout: TakeoutPath) -> Iterable[Visit]: 94 | return _read_myactivity_html(takeout, 'Search/MyActivity.html') 95 | 96 | # TODO add this to tests? 97 | @cachew(cache_path=_cpath('browser_activity'), logger=logger) 98 | def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]: 99 | from my.core.kompress import kexists, kopen 100 | # not sure if this deserves moving to HPI? it's pretty trivial for now 101 | spath = 'Takeout/Chrome/BrowserHistory.json' 102 | 103 | if not kexists(takeout, spath): 104 | logger.warning(f"{spath} is not present in {takeout}... skipping") 105 | return 106 | logger.info('processing %s %s', takeout, spath) 107 | 108 | # TODO couls also add spath? 109 | locator = Loc.file(takeout) 110 | 111 | # TODO this should be supported by HPI now? 112 | 113 | j = None 114 | with kopen(takeout, spath) as fo: # TODO iterative parser? 115 | j = json.load(fo) 116 | 117 | hist = j['Browser History'] 118 | for item in hist: 119 | url = item['url'] 120 | time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc) 121 | # TODO any more interesitng info? 122 | yield Visit( 123 | url=url, 124 | dt=time, 125 | locator=locator, 126 | debug='Chrome/BrowserHistory.json', 127 | ) 128 | 129 | -------------------------------------------------------------------------------- /src/promnesia/sources/telegram.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | from urllib.parse import unquote # TODO mm, make it easier to rememember to use... 5 | 6 | from promnesia.common import Loc, PathIsh, Results, Visit, extract_urls, logger 7 | 8 | 9 | def index(database: PathIsh | None=None, *, http_only: bool=False, with_extra_media_info: bool=False) -> Results: 10 | if database is None: 11 | # fully relying on HPI 12 | yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info) 13 | return 14 | 15 | warnings.warn( 16 | f'Passing paths to promnesia.sources.telegram is deprecated, you should setup my.telegram.telegram_backup instead. ' 17 | f'Will try to hack database path {database} into HPI config.' 
18 | ) 19 | try: 20 | yield from _index_new_with_adhoc_config(database=database, http_only=http_only, with_extra_media_info=with_extra_media_info) 21 | except Exception as e: 22 | logger.exception(e) 23 | warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.") 24 | else: 25 | return 26 | 27 | logger.warning("Falling back onto promnesia.sources.telegram_legacy module") 28 | yield from _index_legacy(database=database, http_only=http_only) 29 | 30 | 31 | def _index_legacy(*, database: PathIsh, http_only: bool) -> Results: 32 | from . import telegram_legacy 33 | yield from telegram_legacy.index(database=database, http_only=http_only) 34 | 35 | 36 | def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results: 37 | from . import hpi # noqa: F401,I001 38 | 39 | class config: 40 | class telegram: 41 | class telegram_backup: 42 | export_path: PathIsh = database 43 | 44 | from my.core.cfg import tmp_config 45 | with tmp_config(modules='my.telegram.telegram_backup', config=config): 46 | yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info) 47 | 48 | 49 | def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results: 50 | from . import hpi # noqa: F401,I001 51 | from my.telegram.telegram_backup import messages 52 | 53 | extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None 54 | for m in messages( 55 | with_extra_media_info=with_extra_media_info, 56 | extra_where=extra_where, 57 | ): 58 | text = m.text 59 | 60 | urls = extract_urls(text) 61 | extra_media_info = m.extra_media_info 62 | if extra_media_info is not None: 63 | urls.extend(extract_urls(extra_media_info)) 64 | 65 | if len(urls) == 0: 66 | continue 67 | 68 | dt = m.time 69 | sender = m.sender.name 70 | chat = m.chat 71 | 72 | cname = chat.name if chat.name is not None else str(chat.id) 73 | 74 | locator = Loc.make( 75 | title=f"chat with {cname}", 76 | href=m.permalink, 77 | ) 78 | context = f'{sender}: {text}' 79 | 80 | for u in urls: 81 | yield Visit( 82 | url=unquote(u), 83 | dt=dt, 84 | context=context, 85 | locator=locator, 86 | ) 87 | -------------------------------------------------------------------------------- /src/promnesia/sources/telegram_legacy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data 3 | ''' 4 | 5 | from __future__ import annotations 6 | 7 | import sqlite3 8 | from pathlib import Path 9 | from textwrap import dedent 10 | from typing import TypeVar 11 | from urllib.parse import unquote # TODO mm, make it easier to rememember to use... 
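# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# Why unquote (imported above) is applied to extracted URLs before yielding Visits:
# links in messages are often percent-encoded. The helper name is made up.
def _unquote_example() -> None:
    from urllib.parse import unquote

    assert unquote('https://example.com/some%20page') == 'https://example.com/some page'
# ---------------------------------------------------------------------------------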
12 | 13 | from promnesia.common import ( 14 | Loc, 15 | PathIsh, 16 | Results, 17 | Visit, 18 | echain, 19 | extract_urls, 20 | from_epoch, 21 | ) 22 | 23 | from ..sqlite import sqlite_connection 24 | 25 | T = TypeVar("T") 26 | 27 | 28 | def unwrap(res: T | Exception) -> T: 29 | if isinstance(res, Exception): 30 | raise res 31 | return res 32 | 33 | 34 | def index(database: PathIsh, *, http_only: bool=False) -> Results: 35 | """ 36 | :param database: 37 | the path of the sqlite generated by the _telegram_backup_ java program 38 | :param http_only: 39 | when true, do not collect IP-addresses and `python.py` strings 40 | """ 41 | path = Path(database) 42 | assert path.is_file(), path 43 | 44 | def make_query(text_query: str) -> str: 45 | extra_criteria = "AND (M.has_media == 1 OR text LIKE '%http%')" if http_only else "" 46 | return dedent( 47 | f""" 48 | WITH entities AS ( 49 | SELECT 'dialog' as type 50 | , id 51 | , coalesce(username, id) as handle 52 | , coalesce(first_name || " " || last_name 53 | , username 54 | , id 55 | ) as display_name FROM users 56 | UNION 57 | SELECT 'group' as type 58 | , id 59 | , id as handle 60 | , coalesce(name, id) as display_name FROM chats 61 | ) 62 | SELECT src.display_name AS chatname 63 | , src.handle AS chat 64 | , snd.display_name AS sender 65 | , M.time AS time 66 | , {text_query} AS text 67 | , M.message_id AS mid 68 | FROM messages AS M 69 | /* chat types are 'dialog' (1-1), 'group' and 'supergroup' */ 70 | /* this is abit hacky way to handle all groups in one go */ 71 | LEFT JOIN entities AS src ON M.source_id = src.id AND src.type = (CASE M.source_type WHEN 'supergroup' THEN 'group' ELSE M.source_type END) 72 | LEFT JOIN entities AS snd ON M.sender_id = snd.id AND snd.type = 'dialog' 73 | WHERE 74 | M.message_type NOT IN ('service_message', 'empty_message') 75 | {extra_criteria} 76 | ORDER BY time; 77 | """) 78 | 79 | with sqlite_connection(path, immutable=True, row_factory='row') as db: 80 | # TODO yield error if chatname or chat or smth else is null? 81 | for row in db.execute(make_query('M.text')): 82 | try: 83 | yield from _handle_row(row) 84 | except Exception as ex: 85 | yield echain(RuntimeError(f'While handling {row}'), ex) 86 | 87 | # old (also 'stable') version doesn't have 'json' column yet... 88 | messages_columns = [d[0] for d in db.execute('SELECT * FROM messages').description] 89 | # todo hmm what is 'markup_json'?? 90 | if 'json' in messages_columns: 91 | for row in db.execute(make_query("json_extract(json, '$.media.webpage.description')")): 92 | try: 93 | yield from _handle_row(row) 94 | except Exception as ex: 95 | yield echain(RuntimeError(f'While handling {row}'), ex) 96 | 97 | 98 | def _handle_row(row: sqlite3.Row) -> Results: 99 | text = row['text'] 100 | if text is None: 101 | return 102 | urls = extract_urls(text) 103 | if len(urls) == 0: 104 | return 105 | dt = from_epoch(row['time']) 106 | mid: str = unwrap(row['mid']) 107 | 108 | # TODO perhaps we could be defensive with null sender/chat etc and still emit the Visit 109 | sender: str = unwrap(row['sender']) 110 | chatname: str = unwrap(row['chatname']) 111 | chat: str = unwrap(row['chat']) 112 | 113 | in_context = f'https://t.me/{chat}/{mid}' 114 | for u in urls: 115 | # https://www.reddit.com/r/Telegram/comments/6ufwi3/link_to_a_specific_message_in_a_channel_possible/ 116 | # hmm, only seems to work on mobile app, but better than nothing... 
117 | yield Visit( 118 | url=unquote(u), 119 | dt=dt, 120 | context=f"{sender}: {text}", 121 | locator=Loc.make( 122 | title=f"chat with {chatname}", 123 | href=in_context, 124 | ), 125 | ) 126 | -------------------------------------------------------------------------------- /src/promnesia/sources/twitter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Twitter data. 3 | ''' 4 | 5 | from collections.abc import Iterable 6 | 7 | from promnesia.common import Loc, Res, Results, Visit, extract_urls, logger 8 | 9 | 10 | def index() -> Results: 11 | from . import hpi # noqa: F401,I001 12 | import my.twitter.all as tw 13 | from my.twitter.archive import Tweet # todo extract to common or something? 14 | 15 | # TODO hmm. tweets themselves are sort of visits? not sure if they should contribute.. 16 | processed = 0 17 | tweets: Iterable[Res[Tweet]] = tw.tweets() 18 | for t in tweets: 19 | if isinstance(t, Exception): 20 | yield t 21 | continue 22 | 23 | processed += 1 24 | try: 25 | urls = t.urls 26 | except Exception as e: # just in case.. 27 | yield e 28 | urls = [] 29 | 30 | if len(urls) == 0: 31 | # if entities haven't detected anything it usually means RT or reply in my case, so worth trying again to extract 32 | # e.g. replies from json twitter takeouts don't seem to have entities set 33 | urls = extract_urls(t.text) 34 | # t.co refers to the retweeted tweet, so perhaps not very meaningful 35 | urls = [u for u in urls if '/t.co/' not in u] 36 | 37 | loc = Loc.make(title='twitter', href=t.permalink) 38 | for u in urls: 39 | yield Visit( 40 | url=u, 41 | dt=t.created_at, 42 | context=t.text, 43 | locator=loc, 44 | ) 45 | logger.info('processed %d tweets', processed) 46 | 47 | 48 | # ok, so it doesn't necessarily have everything in entities, eg. 49 | # { 50 | # "retweeted" : false, 51 | # "source" : "Twitter Web Client", 52 | # "entities" : { 53 | # "hashtags" : [ ], 54 | # "symbols" : [ ], 55 | # "user_mentions" : [ ], 56 | # "urls" : [ ] 57 | # }, 58 | # "full_text" : "http://old.slackware.ru/article.ghtml?ID=544 Забавно =)", 59 | # ... 60 | # } 61 | -------------------------------------------------------------------------------- /src/promnesia/sources/vcs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clones & indexes Git repositories (via sources.auto) 3 | ''' 4 | from __future__ import annotations 5 | 6 | import re 7 | from collections.abc import Iterable 8 | 9 | # TODO not sure if worth exposing... could be just handled by auto or something?) 10 | from pathlib import Path 11 | from subprocess import check_call 12 | 13 | from ..common import Extraction, PathIsh, get_tmpdir, slugify 14 | 15 | 16 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 17 | repo = str(path) 18 | 19 | # TODO this looks pretty horrible as a context name 20 | # perhaps pass context here since we know it should be github repo? 21 | tp = Path(get_tmpdir().name) / slugify(repo) 22 | # note: https://bugs.python.org/issue33617 , it doesn't like Path here on Windows 23 | check_call(['git', 'clone', repo, str(tp)]) 24 | 25 | def replacer(p: PathIsh, prefix: str=str(tp), repo: str=repo) -> str: 26 | ps = str(p) 27 | # TODO prefix is a bit misleading 28 | pos = ps.find(prefix) 29 | if pos == -1: 30 | # TODO not sure if should happen... 31 | return ps 32 | # TODO ugh. 
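# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# What the replacer here is meant to produce: a path inside the temporary clone
# (with a ':line' suffix) mapped back to a github blame URL, matching the example
# mentioned in the comment below. The helper name is made up, for illustration only.
def _replacer_example() -> None:
    import re

    prefix = '/tmp/clone'
    repo = 'https://github.com/davidgasquez/handbook'
    ps = '/tmp/clone/README.md:25'
    rest = ps[ps.find(prefix) + len(prefix):]
    rest = re.sub(r':(\d+)$', r'#L\1', rest)   # patch the line number into an #L anchor
    assert repo + '/blame/master' + rest == 'https://github.com/davidgasquez/handbook/blame/master/README.md#L25'
# ---------------------------------------------------------------------------------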
seems that blame view https://github.com/davidgasquez/handbook/blame/master/README.md#L25 is the most reliable 33 | # in raw mode can't jump onto line, when markdown is renderend can't jump either 34 | rest = ps[pos + len(prefix):] 35 | rest = re.sub(r':(\d+)$', r'#L\1', rest) # patch line number... 36 | return repo + '/blame/master' + rest 37 | 38 | # TODO doesn't work for git: 39 | # TODO think about something more generic... this isn't too sustainable 40 | # TODO not sure if context should be local or github?... 41 | 42 | from . import auto 43 | yield from auto.index(tp, *args, replacer=replacer, **kwargs) 44 | -------------------------------------------------------------------------------- /src/promnesia/sources/website.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clones a website with wget and indexes via sources.auto 3 | ''' 4 | 5 | import re 6 | from collections.abc import Iterable 7 | from pathlib import Path 8 | from subprocess import run 9 | 10 | from promnesia.common import Extraction, PathIsh, get_logger, get_tmpdir, slugify 11 | 12 | 13 | def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]: 14 | logger = get_logger() 15 | url = str(path) 16 | 17 | # TODO better context name 18 | tp = Path(get_tmpdir().name) / slugify(url) 19 | 20 | # TODO careful, set some hard limit on data size? use --quota? 21 | # https://www.linuxjournal.com/content/downloading-entire-web-site-wget 22 | 23 | cmd = [ 24 | 'wget', '--directory-prefix', str(tp), 25 | '--no-verbose', 26 | '--recursive', 27 | '-A', 'html,html,txt', # TODO eh, ideally would use mime type I guess... 28 | '--no-parent', 29 | url, 30 | ] 31 | # TODO follow sitemap? e.g. gwern 32 | logger.info(' '.join(cmd)) 33 | res = run(cmd, check=False) 34 | 35 | if res.returncode == 8: 36 | # man wget: 8 means server error (e.g. broken link) 37 | yield RuntimeError('Encountered server error(s) during downloading') 38 | else: 39 | # rest of the errors are a bit more critical.. 40 | res.check_returncode() 41 | 42 | def replacer(p: PathIsh, prefix: str=str(tp), url: str=url) -> str: 43 | ps = str(p) 44 | pos = ps.find(prefix) 45 | if pos == -1: 46 | return ps 47 | rest = ps[pos + len(prefix):] 48 | # now this should look kinda like /domain.tld/rest (due to the way wget downloads stuff) 49 | rest = re.sub(r'/.*?/', '/', rest) 50 | return url + rest 51 | 52 | # TODO create a file that maps prefix? 53 | # TODO ugh. it creates a directory with a domain... how to map it to http/https properly? 54 | 55 | # TODO smarter html handling 56 | from . import auto 57 | yield from auto.index(tp, *args, replacer=replacer, **kwargs) 58 | -------------------------------------------------------------------------------- /src/promnesia/sources/zulip.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Uses [[https://github.com/karlicoss/HPI][HPI]] for Zulip data. 3 | ''' 4 | 5 | from promnesia.common import Loc, Results, Visit, iter_urls 6 | 7 | 8 | def index() -> Results: 9 | from . import hpi # noqa: F401,I001 10 | import my.zulip.organization as Z 11 | 12 | for m in Z.messages(): 13 | if isinstance(m, Exception): 14 | yield m 15 | continue 16 | loc = Loc.make(title=f'{m.sender.full_name} mentioned', href=m.permalink) 17 | # todo if syntax is markdown, could extract title as well? 18 | content = m.content 19 | for u in iter_urls(content, syntax='markdown'): 20 | yield Visit( 21 | url=u, 22 | dt=m.sent, 23 | # TODO render as markdown? 
24 | context=content, 25 | locator=loc, 26 | ) 27 | -------------------------------------------------------------------------------- /src/promnesia/sqlite.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sqlite3 4 | from collections.abc import Iterator 5 | from contextlib import contextmanager 6 | from typing import Any, Callable, Literal, Union 7 | 8 | from .common import PathIsh 9 | 10 | # NOTE: copy pasted from HPI 11 | 12 | SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any] 13 | 14 | def dict_factory(cursor, row): 15 | fields = [column[0] for column in cursor.description] 16 | return dict(zip(fields, row)) 17 | 18 | 19 | Factory = Union[SqliteRowFactory, Literal['row', 'dict']] 20 | 21 | @contextmanager 22 | def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Factory | None=None) -> Iterator[sqlite3.Connection]: 23 | dbp = f'file:{db}' 24 | # https://www.sqlite.org/draft/uri.html#uriimmutable 25 | if immutable: 26 | dbp = f'{dbp}?immutable=1' 27 | row_factory_: Any = None 28 | if row_factory is not None: 29 | if callable(row_factory): 30 | row_factory_ = row_factory 31 | elif row_factory == 'row': 32 | row_factory_ = sqlite3.Row 33 | elif row_factory == 'dict': 34 | row_factory_ = dict_factory 35 | else: 36 | raise RuntimeError("should not happen") 37 | 38 | conn = sqlite3.connect(dbp, uri=True) 39 | try: 40 | conn.row_factory = row_factory_ 41 | with conn: 42 | yield conn 43 | finally: 44 | # Connection context manager isn't actually closing the connection, only keeps transaction 45 | conn.close() 46 | -------------------------------------------------------------------------------- /src/promnesia/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/tests/__init__.py -------------------------------------------------------------------------------- /src/promnesia/tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gc 4 | import inspect 5 | import os 6 | import socket 7 | import sys 8 | from collections.abc import Iterator 9 | from contextlib import closing, contextmanager 10 | from pathlib import Path 11 | from textwrap import dedent 12 | from typing import NoReturn, TypeVar 13 | 14 | import pytest 15 | 16 | from ..common import Res, _is_windows 17 | 18 | 19 | def under_ci() -> bool: 20 | return 'CI' in os.environ 21 | 22 | 23 | def throw(x: Exception) -> NoReturn: 24 | ''' 25 | like raise, but can be an expression... 26 | ''' 27 | raise x 28 | 29 | 30 | @pytest.fixture 31 | def gc_control(*, gc_on: bool): 32 | if gc_on: 33 | # no need to do anything, should be on by default 34 | yield 35 | return 36 | 37 | gc.disable() 38 | try: 39 | yield 40 | finally: 41 | gc.enable() 42 | 43 | 44 | running_on_ci = 'CI' in os.environ 45 | 46 | 47 | GIT_ROOT = Path(__file__).absolute().parent.parent.parent.parent 48 | TESTDATA = GIT_ROOT / 'tests/testdata' 49 | 50 | 51 | def get_testdata(path: str) -> Path: 52 | assert TESTDATA.is_dir() 53 | res = TESTDATA / path 54 | if not res.exists(): 55 | raise RuntimeError(f"'{res}' not found! 
You propably need to run 'git submodule update --init --recursive'") 56 | return TESTDATA / path 57 | 58 | 59 | @contextmanager 60 | def tmp_popen(*args, **kwargs): 61 | import psutil 62 | 63 | with psutil.Popen(*args, **kwargs) as p: 64 | try: 65 | yield p 66 | finally: 67 | for c in p.children(recursive=True): 68 | c.kill() 69 | p.kill() 70 | p.wait() 71 | 72 | 73 | # meh 74 | def promnesia_bin(*args): 75 | # not sure it's a good idea to diverge, but not sure if there's a better way either? 76 | # ugh. on windows there is no bash so can't use the script 77 | # whatever... 78 | if under_ci() or _is_windows: 79 | # should be able to use the installed version 80 | return [sys.executable, '-m', 'promnesia', *args] 81 | else: 82 | # use version from the repository 83 | root = Path(__file__).parent.parent.parent.parent 84 | pm = root / 'scripts/promnesia' 85 | return [pm, *args] 86 | 87 | 88 | # meh... not great 89 | @pytest.fixture 90 | def reset_filters(): 91 | from .. import extract 92 | 93 | extract.filters.cache_clear() 94 | try: 95 | yield 96 | finally: 97 | extract.filters.cache_clear() 98 | 99 | 100 | # TODO could be a TypeGuard from 3.10 101 | V = TypeVar('V') 102 | 103 | 104 | def unwrap(r: Res[V]) -> V: 105 | assert not isinstance(r, Exception), r 106 | return r 107 | 108 | 109 | def write_config(path: Path, gen, **kwargs) -> None: 110 | output_dir = path.parent 111 | cfg_src = dedent('\n'.join(inspect.getsource(gen).splitlines()[1:])) + f"\nOUTPUT_DIR = r'{output_dir}'" 112 | for k, v in kwargs.items(): 113 | assert k in cfg_src, k 114 | cfg_src = cfg_src.replace(k, repr(str(v))) # meh 115 | path.write_text(cfg_src) 116 | 117 | 118 | @contextmanager 119 | def free_port() -> Iterator[int]: 120 | # this is a generator to make sure there are no race conditions between the time we call this and launch program 121 | # 122 | # also some relevant articles about this 'technique' 123 | # - https://eklitzke.org/binding-on-port-zero 124 | # - https://idea.popcount.org/2014-04-03-bind-before-connect 125 | # - https://blog.cloudflare.com/the-quantum-state-of-a-tcp-port 126 | with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 127 | if sys.platform == 'linux': 128 | # Ok, so from what I've been reading, SO_REUSEADDR should only be necessary in the program that reuses the port 129 | # However, this answer (or man socket) claims we need it on both sites in Linux? see https://superuser.com/a/587955/300795 130 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 131 | # also not sure where REUSEADDR is set in uvicorn (e.g. here reuse_address isn't passed?) 
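# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# The bind-to-port-0 trick free_port() relies on: the OS assigns a free ephemeral
# port, which is then read back via getsockname(). The helper name is made up.
def _free_port_example() -> None:
    import socket
    from contextlib import closing

    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(('', 0))              # port 0 -> let the OS pick one
        port = s.getsockname()[1]
    assert 0 < port < 65536
# ---------------------------------------------------------------------------------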
132 | # https://github.com/encode/uvicorn/blob/6d666d99a285153bc4613e811543c39eca57054a/uvicorn/server.py#L162C37-L162C50 133 | # but from strace looks like it is called somewhere :shrug: 134 | 135 | # assign euphemeral port 136 | # see table in 137 | # https://stackoverflow.com/questions/14388706/how-do-so-reuseaddr-and-so-reuseport-differ/14388707#14388707 138 | # we rely on server binding to localhost later (or anything except 0.0.0.0 really) 139 | s.bind(('', 0)) 140 | 141 | port = s.getsockname()[1] 142 | yield port 143 | -------------------------------------------------------------------------------- /src/promnesia/tests/server_helper.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import time 5 | from collections.abc import Iterator 6 | from contextlib import contextmanager 7 | from dataclasses import dataclass 8 | from pathlib import Path 9 | from typing import Any 10 | 11 | import psutil 12 | import requests 13 | 14 | from ..common import PathIsh 15 | from .common import free_port, promnesia_bin, tmp_popen 16 | 17 | 18 | @dataclass 19 | class Helper: 20 | host: str 21 | port: str 22 | process: psutil.Popen 23 | 24 | def get(self, path: str): 25 | # check it's alive first so the error is cleaner 26 | assert self.process.poll() is None, self.process 27 | return requests.get(f'http://{self.host}:{self.port}' + path) 28 | 29 | def post(self, path: str, *, json: dict[str, Any] | None = None): 30 | assert self.process.poll() is None, self.process 31 | return requests.post(f'http://{self.host}:{self.port}' + path, json=json) 32 | 33 | 34 | @contextmanager 35 | def run_server(db: PathIsh | None = None, *, timezone: str | None = None) -> Iterator[Helper]: 36 | # TODO not sure, perhaps best to use a thread or something? 37 | # but for some tests makes more sense to test in a separate process 38 | with free_port() as pp: 39 | # ugh. under docker 'localhost' tries to bind it to ipv6 (::1) for some reason??? 40 | host = '0.0.0.0' if Path('/.dockerenv').exists() else 'localhost' 41 | port = str(pp) 42 | args = [ 43 | 'serve', 44 | '--host', host, 45 | '--quiet', 46 | '--port', port, 47 | *([] if timezone is None else ['--timezone', timezone]), 48 | *([] if db is None else ['--db' , str(db)]), 49 | ] 50 | with tmp_popen(promnesia_bin(*args)) as server_process: 51 | server = Helper(host=host, port=port, process=server_process) 52 | 53 | # wait till ready 54 | for _ in range(50): 55 | try: 56 | server.get('/status').json() 57 | break 58 | except: 59 | time.sleep(0.1) 60 | else: 61 | raise RuntimeError("Cooldn't connect to '{st}' after 50 attempts") 62 | print(f"Started server up, db: {db}", file=sys.stderr) 63 | 64 | yield server 65 | 66 | # TODO use logger! 
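# ---- illustrative sketch (added for clarity; not part of the original repo) ----
# Typical use of run_server() from a test; /status is the same endpoint the
# readiness loop above polls. Assumes it returns a JSON object. The helper name is
# made up, for illustration only.
def _run_server_example() -> None:
    with run_server() as server:                 # starts 'promnesia serve' on a free port
        status = server.get('/status').json()
        assert isinstance(status, dict)
# ---------------------------------------------------------------------------------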
67 | print("Done with the server", file=sys.stderr) 68 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/src/promnesia/tests/sources/__init__.py -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_auto.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import groupby 3 | 4 | from ...sources import auto 5 | from ..common import get_testdata, throw 6 | 7 | sa2464 = 'https://www.scottaaronson.com/blog/?p=2464' 8 | 9 | _JSON_URLS = { 10 | 'https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/', 11 | sa2464, 12 | } 13 | 14 | 15 | def makemap(visits): 16 | key = lambda v: v.url 17 | 18 | def it(): 19 | vit = (throw(v) if isinstance(v, Exception) else v for v in visits) 20 | for k, g in groupby(sorted(vit, key=key), key=key): 21 | yield k, sorted(g) 22 | 23 | return dict(it()) 24 | 25 | 26 | def test_json() -> None: 27 | mm = makemap(auto.index(get_testdata('auto'), ignored='*/orgs/*')) 28 | assert mm.keys() == _JSON_URLS 29 | 30 | # TODO not sure if they deserve separate visits.. 31 | [v1, v2] = mm[sa2464] 32 | assert v1.context == 'list::yyy::given_url' 33 | # todo not sure if editor:// work on Windows 34 | assert v1.locator.href.startswith('editor://') 35 | assert v1.locator.href.endswith('pocket.json') 36 | # TODO line number? 37 | 38 | 39 | def test_auto() -> None: 40 | mm = makemap(auto.index(get_testdata('auto'))) 41 | org_link = 'https://www.youtube.com/watch?v=rHIkrotSwcc' 42 | assert { 43 | *_JSON_URLS, 44 | org_link, 45 | }.issubset(mm.keys()) 46 | 47 | [v] = mm[org_link] 48 | assert v.locator.title == 'orgs' + os.sep + 'file.org:14' # meh 49 | assert v.locator.href.endswith('file.org:14') 50 | assert "xxx /r/cpp" in v.context 51 | assert "I've enjoyed [Chandler Carruth's" in v.context 52 | 53 | 54 | def test_obsidian() -> None: 55 | mm = makemap(auto.index(get_testdata('obsidian-vault'))) 56 | example_url = 'https://example.com' 57 | [v] = mm[example_url] 58 | assert v.locator.href.startswith('obsidian://') 59 | 60 | 61 | def test_logseq() -> None: 62 | mm = makemap(auto.index(get_testdata('logseq-graph'))) 63 | example_url = 'https://example.com' 64 | [v] = mm[example_url] 65 | assert v.locator.href.startswith('logseq://') 66 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_filetypes.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from ...common import PathIsh 4 | from ...common import _is_windows as windows 5 | from ...sources.auto import by_path 6 | 7 | 8 | def handled(p: PathIsh) -> bool: 9 | idx, m = by_path(Path(p)) 10 | return idx is not None 11 | # ideally these won't hit libmagic path (would try to open the file and cause FileNotFoundError) 12 | 13 | 14 | def test_filetypes() -> None: 15 | # test media 16 | for ext in 'avi mp4 mp3 webm'.split() + ([] if windows else 'mkv'.split()): 17 | assert handled('file.' + ext) 18 | 19 | # images 20 | for ext in 'gif jpg png jpeg'.split(): 21 | assert handled('file.' + ext) 22 | 23 | # TODO more granual checks that these are ignored? 
24 | # binaries 25 | for ext in 'o sqlite'.split() + ([] if windows else 'class jar'.split()): 26 | assert handled('file.' + ext) 27 | 28 | # these might have potentially some links 29 | for ext in [ 30 | 'svg', 31 | 'pdf', 'epub', 'ps', 32 | 'doc', 'ppt', 'xsl', 33 | # seriously, windows doesn't know about docx??? 34 | *([] if windows else 'docx pptx xlsx'.split()), 35 | *([] if windows else 'ods odt rtf'.split()), 36 | ] + ([] if windows else 'djvu'.split()): 37 | assert handled('file.' + ext) 38 | 39 | # source code 40 | for ext in 'rs tex el js sh hs pl h py hpp c go css'.split() + ([] if windows else 'java cpp'.split()): 41 | assert handled('file.' + ext) 42 | 43 | assert handled('x.html') 44 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_hypothesis.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from my.core.cfg import tmp_config 4 | 5 | from ...__main__ import do_index 6 | from ...database.load import get_all_db_visits 7 | from ..common import get_testdata, write_config 8 | 9 | 10 | def index_hypothesis(tmp_path: Path) -> None: 11 | def cfg() -> None: 12 | from promnesia.common import Source 13 | from promnesia.sources import hypothesis 14 | 15 | SOURCES = [Source(hypothesis.index, name='hyp')] # noqa: F841 16 | 17 | cfg_path = tmp_path / 'config.py' 18 | write_config(cfg_path, cfg) 19 | 20 | class hpi_config: 21 | class hypothesis: 22 | export_path = get_testdata('hypexport/testdata') / 'netrights-dashboard-mockup/data/*.json' 23 | 24 | with tmp_config(modules='my.hypothesis', config=hpi_config): 25 | do_index(cfg_path) 26 | 27 | 28 | def test_hypothesis(tmp_path: Path) -> None: 29 | index_hypothesis(tmp_path) 30 | 31 | visits = get_all_db_visits(tmp_path / 'promnesia.sqlite') 32 | assert len(visits) > 100 33 | 34 | [vis] = [x for x in visits if 'fundamental fact of evolution' in (x.context or '')] 35 | 36 | assert vis.norm_url == 'wired.com/2017/04/the-myth-of-a-superhuman-ai' 37 | assert vis.orig_url == 'https://www.wired.com/2017/04/the-myth-of-a-superhuman-ai/' 38 | assert vis.locator.href == 'https://hyp.is/_Z9ccmVZEeexBOO7mToqdg/www.wired.com/2017/04/the-myth-of-a-superhuman-ai/' 39 | assert 'misconception about evolution is fueling misconception about AI' in (vis.context or '') # contains notes as well 40 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_org.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from ...common import Visit 4 | from ...sources.org import extract_from_file 5 | from ..common import get_testdata, throw 6 | 7 | 8 | def delrf(s: str | None) -> str | None: 9 | if s is None: 10 | return None 11 | # meh.. not sure how ot handle this properly, ideally should be via pytest? 12 | # not sure if should just do it in the indexer? e.g. extension might not like it 13 | return s.replace('\r', '') 14 | 15 | 16 | def test_org_indexer() -> None: 17 | [_, cpp, cozy] = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file.org'))] 18 | 19 | assert cpp.url == 'https://www.youtube.com/watch?v=rHIkrotSwcc' 20 | # TODO not sure about filetags? 21 | exp = ''' 22 | xxx /r/cpp :cpp:programming: 23 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 24 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 
25 | 26 | '''.lstrip() 27 | assert delrf(cpp.context) == exp 28 | 29 | assert cozy.url == 'https://twitter.com/Mappletons/status/1255221220263563269' 30 | 31 | 32 | def test_org_indexer_2() -> None: 33 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file3.org'))] 34 | 35 | assert len(items) == 6 36 | assert items[0].url == 'https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i' 37 | assert items[1].url == 'https://link.com' 38 | assert items[-2].url == 'https://en.wikipedia.org/wiki/Resilio_Sync' 39 | # TODO shit def need org specific url extractor (and then extract from everything remaining) 40 | # assert results[-1].url == 'https://en.wikipedia.org/wiki/InterPlanetary_File_System' 41 | 42 | 43 | def test_heading() -> None: 44 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file2.org'))] 45 | assert {i.url for i in items} == { 46 | 'https://en.wikipedia.org/wiki/Computational_topology', 47 | 'http://graphics.stanford.edu/courses/cs468-09-fall/', 48 | 'https://en.wikipedia.org/wiki/Triangulation_(topology)', 49 | 'https://en.wikipedia.org/wiki/Digital_manifold', 50 | } 51 | 52 | 53 | def test_url_in_properties() -> None: 54 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file4.org'))] 55 | 56 | assert len(items) == 2, items 57 | assert items[0].url == 'https://example.org/ref_example' 58 | assert items[1].url == 'http://example.org/a_test' 59 | 60 | 61 | def test_5() -> None: 62 | items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file5.org'))] 63 | 64 | assert len(items) == 0 # shouldn't crash at least 65 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_plaintext.py: -------------------------------------------------------------------------------- 1 | from ...common import Source 2 | from ...extract import extract_visits 3 | from ...sources import plaintext, shellcmd 4 | from ..common import get_testdata, unwrap 5 | 6 | 7 | def test_plaintext_path_extractor() -> None: 8 | visits = list( 9 | extract_visits( 10 | Source( 11 | shellcmd.index, 12 | plaintext.extract_from_path(get_testdata('custom')), 13 | ), 14 | src='whatever', 15 | ) 16 | ) 17 | assert {unwrap(v).orig_url for v in visits} == { 18 | 'http://google.com', 19 | 'http://google.com/', 20 | 'http://some-weird-domain.xyz/whatever', 21 | 'https://google.com', 22 | 'http://what.about.this.link', 23 | } 24 | 25 | [wa] = [v for v in visits if unwrap(v).orig_url == 'http://what.about.this.link'] 26 | f2 = get_testdata('custom') / 'file2.txt' 27 | assert unwrap(wa).locator.href == f'editor://{f2}:3' # occurs line 3 28 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_shellcmd.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ...common import Source, _is_windows 4 | from ...extract import extract_visits 5 | from ...sources import shellcmd 6 | from ..common import get_testdata 7 | 8 | 9 | @pytest.mark.skipif(_is_windows, reason="no grep on windows") 10 | def test_via_grep() -> None: 11 | 12 | visits = list( 13 | extract_visits( 14 | Source( 15 | shellcmd.index, 16 | # meh. maybe should deprecate plain string here... 
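# [editor's note: not part of the original test] the "plain string" the comment above refers to is a
# raw shell command handed to shellcmd.index; test_plaintext.py above exercises the same source via
# plaintext.extract_from_path(...) instead of a hand-written grep invocation, which is presumably the
# preferred spelling if the plain-string form ever gets deprecated.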
17 | r"""grep -Eo -r --no-filename (http|https)://\S+ """ + str(get_testdata('custom')), 18 | ), 19 | src='whatever', 20 | ) 21 | ) 22 | # TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode) 23 | assert len(visits) == 5 24 | -------------------------------------------------------------------------------- /src/promnesia/tests/sources/test_takeout.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import pytest 4 | from my.core.cfg import tmp_config 5 | 6 | from ...common import Source 7 | from ...extract import extract_visits 8 | from ...sources import takeout 9 | from ..common import get_testdata, unwrap 10 | 11 | 12 | # TODO apply in conftest so it's used in all tests? 13 | @pytest.fixture 14 | def no_cachew(): 15 | from my.core.cachew import disabled_cachew 16 | 17 | with disabled_cachew(): 18 | yield 19 | 20 | 21 | # todo testing this logic probably belongs to hpi or google_takeout_export, but whatever 22 | def test_takeout_directory(no_cachew) -> None: 23 | class config: 24 | class google: 25 | takeout_path = get_testdata('takeout') 26 | 27 | with tmp_config(modules='my.google.takeout.*', config=config): 28 | visits = list(extract_visits(Source(takeout.index), src='takeout')) 29 | 30 | assert len(visits) == 3 31 | assert all(unwrap(v).dt.tzinfo is not None for v in visits) 32 | 33 | 34 | def test_takeout_zip(no_cachew) -> None: 35 | class config: 36 | class google: 37 | takeout_path = get_testdata('takeout-20150518T000000Z.zip') 38 | 39 | with tmp_config(modules='my.google.takeout.*', config=config): 40 | visits = list(extract_visits(Source(takeout.index), src='takeout')) 41 | 42 | assert len(visits) == 3 43 | assert all(unwrap(v).dt.tzinfo is not None for v in visits) 44 | 45 | [vis] = [v for v in visits if unwrap(v).norm_url == 'takeout.google.com/settings/takeout'] 46 | 47 | edt = datetime( 48 | year=2018, 49 | month=9, 50 | day=18, 51 | hour=5, 52 | minute=48, 53 | second=23, 54 | tzinfo=timezone.utc, 55 | ) 56 | assert unwrap(vis).dt == edt 57 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import pytest 5 | import requests 6 | 7 | from ..common import _is_windows 8 | from .common import get_testdata, promnesia_bin, tmp_popen 9 | 10 | ox_hugo_data = get_testdata('ox-hugo/test/site') 11 | 12 | 13 | def test_demo() -> None: 14 | if _is_windows: 15 | # for some reason fails to connect to server.. 16 | # not sure maybe something with port choice idk 17 | pytest.skip("TODO broken on Windows") 18 | 19 | with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)): 20 | # TODO why does it want post?? 21 | time.sleep(2) # meh.. need a generic helper to wait till ready... 
22 | res = {} 23 | for _attempt in range(30): 24 | time.sleep(1) 25 | try: 26 | res = requests.post( 27 | "http://localhost:16789/search", 28 | json={'url': "https://github.com/kaushalmodi/ox-hugo/issues"}, 29 | ).json() 30 | break 31 | except: 32 | continue 33 | else: 34 | raise RuntimeError("Couldn't connect to the server") 35 | vis = res['visits'] 36 | assert len(vis) > 50, vis 37 | mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)] 38 | orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))] 39 | assert len(mds) == 1 40 | assert len(orgs) == 1 41 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_compare.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | from ..compare import compare_files 5 | from .utils import index_urls 6 | 7 | 8 | def test_compare(tmp_path: Path) -> None: 9 | idx = index_urls({ 10 | 'https://example.com': None, 11 | 'https://en.wikipedia.org/wiki/Saturn_V': None, 12 | 'https://plato.stanford.edu/entries/qualia': None, 13 | }) 14 | idx(tmp_path) 15 | db = tmp_path / 'promnesia.sqlite' 16 | old_db = tmp_path / 'promnesia-old.sqlite' 17 | shutil.move(str(db), str(old_db)) 18 | 19 | idx2 = index_urls({ 20 | 'https://example.com': None, 21 | 'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None, 22 | 'https://en.wikipedia.org/wiki/Saturn_V': None, 23 | 'https://plato.stanford.edu/entries/qualia': None, 24 | }) 25 | idx2(tmp_path) 26 | 27 | # should not crash, as there are more links in the new database 28 | assert len(list(compare_files(old_db, db))) == 0 29 | 30 | assert len(list(compare_files(db, old_db))) == 1 31 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_extract.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | 3 | import pytest 4 | from more_itertools import ilen 5 | 6 | from ..common import DbVisit, Loc, Source, Visit 7 | from ..extract import extract_visits 8 | from .common import ( 9 | gc_control, # noqa: F401 10 | get_testdata, 11 | running_on_ci, 12 | unwrap, 13 | ) 14 | 15 | 16 | def test_with_error() -> None: 17 | class ExtractionError(Exception): 18 | pass 19 | 20 | def indexer(): 21 | yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever')) 22 | yield ExtractionError() 23 | yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever')) 24 | 25 | [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever') 26 | assert isinstance(v1, DbVisit) 27 | assert isinstance(e, Exception) 28 | assert isinstance(v2, DbVisit) 29 | 30 | 31 | def test_urls_are_normalised() -> None: 32 | # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db 33 | # TODO maybe this should be a separate test which takes DbVisit.make separately? 
34 | # especially to decouple from shellcmd source 35 | from ..sources import shellcmd 36 | from ..sources.plaintext import extract_from_path 37 | 38 | visits = list( 39 | extract_visits( 40 | source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))), 41 | src='whatever', 42 | ) 43 | ) 44 | assert len(visits) == 7 45 | 46 | assert {unwrap(v).norm_url for v in visits} == { 47 | 'hi.com', 48 | 'reddit.com/post', 49 | 'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay', 50 | 'youtube.com/watch?v=XXlZfc1TrD0', 51 | 'youtube.com/watch?v=XXlZfc1Tr11', 52 | } 53 | 54 | 55 | @pytest.mark.parametrize('count', [99, 100_000, 1_000_000]) 56 | @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off']) 57 | def test_benchmark(count: int, gc_control) -> None: 58 | # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation 59 | # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark 60 | if count > 99 and running_on_ci: 61 | pytest.skip("test would be too slow on CI, only meant to run manually") 62 | 63 | from ..sources import demo 64 | 65 | source = Source(demo.index, count=count) 66 | 67 | total = ilen(extract_visits(source=source, src='whatever')) 68 | assert total == count # sanity check 69 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_extract_urls.py: -------------------------------------------------------------------------------- 1 | from ..common import extract_urls 2 | 3 | 4 | def test_extract_simple() -> None: 5 | lines = """ 6 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 7 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 8 | """.strip() 9 | assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'} 10 | 11 | 12 | def test_extract_2() -> None: 13 | text = '''‍♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения. 14 | Если для вас это новое словосочетание, начните с книжки 15 | "Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах 16 | lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics 17 | ''' 18 | assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'} 19 | 20 | 21 | def test_extract_md() -> None: 22 | lines = ''' 23 | Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you! 24 | ''' 25 | assert set(extract_urls(lines)) == { 26 | 'addons.mozilla.org', 27 | 'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/', 28 | 'https://github.com/karlicoss/grasp', 29 | } 30 | 31 | 32 | # just random links to test multiline/whitespace behaviour 33 | def test_extract_3() -> None: 34 | lines = ''' 35 | python.org/one.html ?? 
https://python.org/two.html some extra text 36 | 37 | whatever.org 38 | ''' 39 | assert set(extract_urls(lines, syntax='org')) == { 40 | 'python.org/one.html', 41 | 'https://python.org/two.html', 42 | 'whatever.org', 43 | } 44 | -------------------------------------------------------------------------------- /src/promnesia/tests/test_traverse.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | from ..common import traverse 4 | from .common import get_testdata 5 | 6 | testDataPath = get_testdata('traverse') 7 | 8 | 9 | # Patch shutil.which so it always returns false (when trying to which fdfind, etc) 10 | # so that it falls back to find 11 | @patch('promnesia.common.shutil.which', return_value=False) 12 | def test_traverse_ignore_find(patched) -> None: 13 | ''' 14 | traverse() with `find` but ignore some stuff 15 | ''' 16 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 17 | 18 | assert paths == {testDataPath / 'imhere2/real.txt', testDataPath / 'imhere.txt'} 19 | 20 | 21 | def test_traverse_ignore_fdfind(): 22 | ''' 23 | traverse() with `fdfind` but ignore some stuff 24 | ''' 25 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 26 | 27 | assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'} 28 | 29 | 30 | # TODO: It would be nice to test the implementation directly without having to do this 31 | # weird patching in the future 32 | @patch('promnesia.common._is_windows', new_callable=lambda: True) 33 | def test_traverse_ignore_windows(patched) -> None: 34 | ''' 35 | traverse() with python when _is_windows is true but ignore some stuff 36 | ''' 37 | paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2'])) 38 | 39 | assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'} 40 | -------------------------------------------------------------------------------- /src/promnesia/tests/utils.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Mapping, Sequence 2 | from datetime import datetime, timedelta 3 | from pathlib import Path 4 | from typing import Optional, Union 5 | 6 | from ..common import Loc, Source, Visit 7 | from ..database.dump import visits_to_sqlite 8 | from ..extract import extract_visits 9 | 10 | # TODO a bit shit... why did I make it dict at first?? 
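# [editor's note: illustrative aside, not part of the original file] Urls below accepts either a
# mapping from url to (optional) context, or a sequence of (url, context) pairs -- index_urls()
# normalises both via `list(urls.items()) if isinstance(urls, dict) else urls`. Both of these are
# valid (values are made up):
#
#     index_urls({'https://example.com': 'some context', 'https://other.org': None})
#     index_urls([('https://example.com', 'ctx one'), ('https://example.com', 'ctx two')])
#
# the sequence form can express repeated urls, which a dict cannot -- likely the reason behind the
# "why did I make it dict at first??" comment above.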
11 | Urls = Union[ 12 | Mapping[str, Optional[str]], 13 | Sequence[tuple[str, Optional[str]]], 14 | ] 15 | 16 | 17 | def index_urls(urls: Urls, *, source_name: str = 'test'): 18 | uuu = list(urls.items()) if isinstance(urls, dict) else urls 19 | 20 | def idx(tmp_path: Path) -> None: 21 | def indexer(): 22 | for i, (url, ctx) in enumerate(uuu): 23 | yield Visit( 24 | url=url, 25 | dt=datetime.min + timedelta(days=5000) + timedelta(hours=i), 26 | locator=Loc.make('test'), 27 | context=ctx, 28 | ) 29 | 30 | db_visits = extract_visits(source=Source(indexer), src=source_name) 31 | errors = visits_to_sqlite(vit=db_visits, overwrite_db=True, _db_path=tmp_path / 'promnesia.sqlite') 32 | 33 | assert len(errors) == 0, errors 34 | 35 | return idx 36 | -------------------------------------------------------------------------------- /tests/common.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | import sys 5 | import time 6 | from contextlib import contextmanager 7 | from functools import wraps 8 | from pathlib import Path 9 | from typing import Iterator, TypeVar 10 | 11 | import pytest 12 | import requests 13 | 14 | from promnesia.tests.common import free_port 15 | 16 | 17 | def has_x() -> bool: 18 | # meh, not very portable, but good enough for now 19 | return 'DISPLAY' in os.environ 20 | 21 | 22 | def under_ci() -> bool: 23 | return 'CI' in os.environ 24 | 25 | 26 | def skip_if_ci(reason): 27 | return pytest.mark.skipif(under_ci(), reason=reason) 28 | 29 | 30 | def uses_x(f): 31 | @skip_if_ci('Uses X server') 32 | @wraps(f) 33 | def ff(*args, **kwargs): 34 | return f(*args, **kwargs) 35 | 36 | return ff 37 | 38 | 39 | @contextmanager 40 | def tmp_popen(*args, **kwargs): 41 | import psutil 42 | 43 | with psutil.Popen(*args, **kwargs) as p: 44 | try: 45 | yield p 46 | finally: 47 | for c in p.children(recursive=True): 48 | c.kill() 49 | p.kill() 50 | p.wait() 51 | 52 | 53 | @contextmanager 54 | def local_http_server(path: Path) -> Iterator[str]: 55 | address = '127.0.0.1' 56 | with ( 57 | free_port() as port, 58 | tmp_popen([sys.executable, '-m', 'http.server', '--directory', path, '--bind', address, str(port)]) as popen, 59 | ): 60 | endpoint = f'http://{address}:{port}' 61 | 62 | # meh.. but not sure if there is a better way to find out whether it's ready to serve requests 63 | for _attempt in range(50): 64 | try: 65 | requests.get(endpoint) 66 | except: 67 | time.sleep(0.05) 68 | continue 69 | else: 70 | break 71 | yield endpoint 72 | 73 | 74 | T = TypeVar('T') 75 | 76 | 77 | def notnone(x: T | None) -> T: 78 | assert x is not None 79 | return x 80 | -------------------------------------------------------------------------------- /tests/convert_screencast.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | from pathlib import Path 4 | from subprocess import check_call 5 | 6 | 7 | def convert(path: Path): 8 | suf = '.mp4' 9 | if path.suffix == suf: 10 | # makes it easier for shell globbing... 11 | path = path.with_suffix('') 12 | 13 | inp = path.with_suffix(suf) 14 | assert inp.exists(), inp 15 | subs = path.with_suffix('.ssa') 16 | webm = path.with_suffix('.webm') 17 | 18 | 19 | # jeez... https://video.stackexchange.com/a/28276/29090 20 | # otherwise quiality sucks, e.g. letters are grainy 21 | # 22 | # ok, nice guide.. 
https://gist.github.com/Vestride/278e13915894821e1d6f#convert-to-webm 23 | # 24 | passfile = path.with_suffix(".pass0") 25 | for stage in [ 26 | f'-b:v 0 -crf 30 -pass 1 -passlogfile {passfile} -an -f webm /dev/null', 27 | f'-b:v 0 -crf 30 -pass 2 -passlogfile {passfile} {webm}' if all( 28 | x not in str(inp) for x in ( 29 | # fucking hell, it segfaults... 30 | 'child-visits-2', 31 | 'highlights', 32 | )) else str(webm), 33 | ]: 34 | check_call([ 35 | 'ffmpeg', 36 | # TODO display banner if running interactively?? 37 | # '-hide_banner', '-loglevel', 'panic', # less spam 38 | '-y', # allow overwrite 39 | '-i', inp, 40 | '-vf', f"ass={subs}", 41 | *stage.split(), 42 | ]) # TODO cwd?? 43 | 44 | 45 | if __name__ == '__main__': 46 | paths = list(map(Path, sys.argv[1:])) 47 | from concurrent.futures import ThreadPoolExecutor 48 | with ThreadPoolExecutor() as pool: 49 | for _ in pool.map(convert, paths): 50 | # need to force the iterator 51 | pass 52 | -------------------------------------------------------------------------------- /tests/install_and_run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | from subprocess import check_call, check_output 4 | import time 5 | import json 6 | import os 7 | from tempfile import TemporaryDirectory 8 | 9 | 10 | # TODO reuse example config? 11 | CONFIG = """ 12 | 13 | OUTPUT_DIR = {output_dir} 14 | 15 | SOURCES = [ 16 | 'promnesia.sources.demo', 17 | ] 18 | 19 | """ 20 | 21 | def systemctl(*args): 22 | check_call([ 23 | 'systemctl', '--no-pager', '--user', *args, 24 | ]) 25 | 26 | 27 | # TODO do it in pipenv? 28 | def run(tdir: Path): 29 | cfg = CONFIG.format(output_dir=f'"{tdir}"') 30 | cfg_file = tdir / 'config.py' 31 | cfg_file.write_text(cfg) 32 | 33 | 34 | promnesia = Path(__file__).absolute().parent.parent / 'scripts/promnesia' 35 | 36 | check_call([promnesia, 'index', '--config', cfg_file]) 37 | 38 | check_call([ 39 | promnesia, 'install-server', 40 | '--name' , 'promnesia-test', # should add .serice arg 41 | '--db', str(tdir / 'promnesia.sqlite'), 42 | '--timezone', 'Europe/Moscow', 43 | '--port', '17777', # TODO get free port? 44 | ]) 45 | 46 | response = None 47 | for x in range(10): 48 | time.sleep(1) 49 | try: 50 | response = json.loads(check_output([ 51 | 'curl', 'localhost:17777/status', '--data', '', 52 | ]).decode('utf8')) 53 | break 54 | except Exception as e: 55 | print(str(e)) 56 | assert response is not None 57 | 58 | response = json.loads(check_output([ 59 | 'curl', 'localhost:17777/status', '--data', '', 60 | ]).decode('utf8')) 61 | 62 | print(response) 63 | assert response['db'] == str(tdir / 'promnesia.sqlite') 64 | 65 | time.sleep(1) 66 | systemctl('is-active', 'promnesia-test.service') 67 | print("Test succeeded!") 68 | 69 | # TODO prompt for cleanup? 
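# [editor's note: illustrative sketch, not part of the original script] the prompt the TODO above
# asks for could be as simple as (made-up wording):
#
#     if input("stop and disable promnesia-test.service? [y/N] ").strip().lower() != 'y':
#         return
#
# for now the script always cleans up unconditionally: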
70 | systemctl('stop' , 'promnesia-test.service') 71 | systemctl('disable', 'promnesia-test.service') 72 | 73 | 74 | def main(): 75 | with TemporaryDirectory() as tdir: 76 | run(Path(tdir)) 77 | 78 | 79 | if __name__ == '__main__': 80 | main() 81 | -------------------------------------------------------------------------------- /tests/record.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from pathlib import Path 3 | import re 4 | from time import sleep 5 | from subprocess import Popen, check_output 6 | from typing import Optional, List, Union 7 | 8 | # TODO decorator that records a video if a certain env var/flag is set (pass a custom name too) 9 | 10 | @contextmanager 11 | def hotkeys(geometry: Optional[str]=None): 12 | # TODO kill in advance?? 13 | ctx = Popen([ 14 | 'screenkey', 15 | '--no-detach', 16 | '--key-mode', 'composed', 17 | '--scr', '0', 18 | '--timeout', '2', 19 | '--bg-color', '#000000', 20 | '--font-color', '#ffffff', 21 | '--font-size', 'large', 22 | '--opacity', '0.6', 23 | # TODO hmm. it has --persist arg, but no --no-persist?? 24 | *([] if geometry is None else ['-g', geometry]), 25 | ]) 26 | with ctx as p: 27 | try: 28 | yield p 29 | finally: 30 | p.kill() 31 | 32 | 33 | 34 | @contextmanager 35 | def record(output: Optional[Path]=None, wid: Optional[str]=None, quality: Optional[str]=None): 36 | assert output is not None, "TODO use tmp file or current dir??" 37 | # TODO to fullscreen if None? 38 | assert wid is not None 39 | 40 | 41 | # ugh. no idea wtf is happening here... why is position 2,90?? 42 | # wmctrl -i -r 230686723 -e '0,0,0,400,400' 43 | # xdotool getwindowgeometry 230686723 44 | # Window 230686723 45 | # Position: 2,90 (screen: 0) 46 | # Geometry: 400x400 47 | # Position + Geometry don't add up to the screen size. fuck. 48 | # 49 | # ok, xwininfo seems more reliable 50 | # 51 | # xwininfo -id $(xdotool getactivewindow)' 52 | out = check_output(['xwininfo', '-id', wid]).decode('utf8').replace('\n', ' ') 53 | m = re.search(r'geometry (\d+)x(\d+)[+-](\d+)[+-](\d+)', out) 54 | assert m is not None, out 55 | w, h, x, y = m.groups() 56 | 57 | # fuck. 58 | titlebar = 32 59 | 60 | # fuck x 2 61 | margin = 28 62 | 63 | cmd: List[Union[Path, str]] = [ 64 | 'ffmpeg', 65 | '-hide_banner', '-loglevel', 'panic', # less spam in the terminal 66 | '-f', 'x11grab', 67 | '-y', 68 | '-r', '30', 69 | '-s', f'{w}x{titlebar + int(h)}', 70 | '-i', f':0.0+{x},{margin + int(y)}', 71 | output, 72 | ] 73 | # TODO not sure if need converter script 74 | # TODO double check there are no ffmpeg processes remaining? 75 | # maybe, set timeout? 76 | 77 | with Popen(cmd) as p: 78 | # early check 79 | sleep(0.5) 80 | assert p.poll() is None, f'{cmd} died!' 81 | 82 | try: 83 | yield p 84 | finally: 85 | assert p.poll() is None, f'{cmd} died!' 
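# [editor's note: not part of the original file] terminate() below sends SIGTERM rather than kill(),
# presumably so ffmpeg gets a chance to flush and finalize the output file; the wait(timeout=10)
# that follows guards against it hanging indefinitely.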
86 | 87 | p.terminate() 88 | p.wait(timeout=10) 89 | 90 | 91 | # https://stackoverflow.com/a/52669454/706389 92 | CURSOR_SCRIPT = ''' 93 | function enableCursor() { 94 | var seleniumFollowerImg = document.createElement("img"); 95 | seleniumFollowerImg.setAttribute('src', 'data:image/png;base64,' 96 | + 'iVBORw0KGgoAAAANSUhEUgAAABQAAAAeCAQAAACGG/bgAAAAAmJLR0QA/4ePzL8AAAAJcEhZcwAA' 97 | + 'HsYAAB7GAZEt8iwAAAAHdElNRQfgAwgMIwdxU/i7AAABZklEQVQ4y43TsU4UURSH8W+XmYwkS2I0' 98 | + '9CRKpKGhsvIJjG9giQmliHFZlkUIGnEF7KTiCagpsYHWhoTQaiUUxLixYZb5KAAZZhbunu7O/PKf' 99 | + 'e+fcA+/pqwb4DuximEqXhT4iI8dMpBWEsWsuGYdpZFttiLSSgTvhZ1W/SvfO1CvYdV1kPghV68a3' 100 | + '0zzUWZH5pBqEui7dnqlFmLoq0gxC1XfGZdoLal2kea8ahLoqKXNAJQBT2yJzwUTVt0bS6ANqy1ga' 101 | + 'VCEq/oVTtjji4hQVhhnlYBH4WIJV9vlkXLm+10R8oJb79Jl1j9UdazJRGpkrmNkSF9SOz2T71s7M' 102 | + 'SIfD2lmmfjGSRz3hK8l4w1P+bah/HJLN0sys2JSMZQB+jKo6KSc8vLlLn5ikzF4268Wg2+pPOWW6' 103 | + 'ONcpr3PrXy9VfS473M/D7H+TLmrqsXtOGctvxvMv2oVNP+Av0uHbzbxyJaywyUjx8TlnPY2YxqkD' 104 | + 'dAAAAABJRU5ErkJggg=='); 105 | seleniumFollowerImg.setAttribute('id', 'selenium_mouse_follower'); 106 | seleniumFollowerImg.setAttribute('style', 'position: absolute; z-index: 99999999999; pointer-events: none; left:0; top:0'); 107 | document.body.appendChild(seleniumFollowerImg); 108 | document.onmousemove = function (e) { 109 | document.getElementById("selenium_mouse_follower").style.left = e.pageX + 'px'; 110 | document.getElementById("selenium_mouse_follower").style.top = e.pageY + 'px'; 111 | }; 112 | }; 113 | 114 | enableCursor(); 115 | ''' 116 | 117 | 118 | # https://stackoverflow.com/a/987376/706389 119 | SELECT_SCRIPT = ''' 120 | function selectText(node) { 121 | if (document.body.createTextRange) { 122 | const range = document.body.createTextRange(); 123 | range.moveToElementText(node); 124 | range.select(); 125 | } else if (window.getSelection) { 126 | const selection = window.getSelection(); 127 | const range = document.createRange(); 128 | range.selectNodeContents(node); 129 | selection.removeAllRanges(); 130 | selection.addRange(range); 131 | } else { 132 | console.warn("Could not select text in node: Unsupported browser."); 133 | } 134 | } 135 | ''' 136 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file.org: -------------------------------------------------------------------------------- 1 | * TODO [#C] figure out 2 | :PROPERTIES: 3 | :CREATED: [2018-08-06 Mon 22:52] 4 | :END: 5 | 6 | most important 7 | 8 | * [2019-05-10 Fri 17:20] [[https://reddit.com/r/bodyweightfitness/comments/bl7nyy/how_i_learned_to_handstand_about_5_minutes_ago/][How I learned to handstand about 5 minutes ago, after trying for around a year. A surprising method you maybe haven't tried.]] /r/bodyweightfitness 9 | 10 | This whole time I've been trying to keep myself up, when you're really supposed to be preventing the fall. This exercise gets you to use the strongest muscles in this exercise (shoulders) to prevent your torso falling down. Whereas I think previously I, and a lot of people, would be trying to balance the body mostly with the hands, and the position of the legs, if that makes sense. 11 | 12 | Anyway, hope it helps someone. 13 | 14 | * TODO [#C] [2019-10-16 Wed 08:28] xxx /r/cpp :programming:cpp: 15 | I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_]( 16 | https://www.youtube.com/watch?v=rHIkrotSwcc) very much. 17 | 18 | 19 | * something... 
20 | 21 | - one 22 | - zwei 23 | [[https://twitter.com/Mappletons/status/1255221220263563269][cozyweb]] 24 | - drei 25 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file2.org: -------------------------------------------------------------------------------- 1 | #+FILETAGS: topology 2 | 3 | simulations/visualisations of fundamental group 4 | 5 | https://en.wikipedia.org/wiki/Computational_topology 6 | 7 | http://graphics.stanford.edu/courses/cs468-09-fall/ 8 | hmm wonder if that does it. they mention triangulation. 9 | 10 | https://en.wikipedia.org/wiki/Triangulation_(topology) 11 | https://en.wikipedia.org/wiki/Digital_manifold 12 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file3.org: -------------------------------------------------------------------------------- 1 | * [2016-05-14 Sat 15:33] [[https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i][sc4s2cg comments on How you use your android to the maximum?]] :android: 2 | 3 | * something 4 | https://link.com 5 | 6 | * [2019-05-14 Tue 20:26] [[https://www.instapaper.com/read/1193274157][ip]] [[https://blog.andymatuschak.org/post/169043084412/successful-habits-through-smoothly-ratcheting][Successful habits through smoothly ratcheting targets]] 7 | 8 | 9 | * fewf 10 | 11 | ** [2019-05-03 Fri 08:29] apparently [[https://en.wikipedia.org/wiki/Resilio_Sync][Resilio Sync]] exists, but it's proprietary, nothing else I know of or resulting from quick googling 12 | ** [2019-06-13 Thu 19:45] [[https://en.wikipedia.org/wiki/InterPlanetary_File_System][IPFS]] looks close, but appparently not user friendly yet 13 | 14 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file4.org: -------------------------------------------------------------------------------- 1 | :PROPERTIES: 2 | :ID: 1554c141-9567-4345-99d9-7c5e2853dbaa 3 | :ROAM_REFS: https://example.org/ref_example 4 | :END: 5 | #+title: A Ref Example 6 | 7 | We need [[http://example.org/a_test][a test]]! 
8 | -------------------------------------------------------------------------------- /tests/testdata/auto/orgs/file5.org: -------------------------------------------------------------------------------- 1 | * sexp in property 2 | :PROPERTIES: 3 | :CREATED: <%%(diary-date 03 25 2023)> 4 | :END: 5 | -------------------------------------------------------------------------------- /tests/testdata/auto/pocket.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": 1, 3 | "complete": 1, 4 | "list": { 5 | "xxx": { 6 | "given_url": "https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/", 7 | "given_title": "Struggles with the Continuum (Part 2) | Azimuth", 8 | "favorite": "0", 9 | "status": "0", 10 | "sort_id": 1, 11 | "resolved_title": "Struggles with the Continuum (Part 2)" 12 | }, 13 | "yyy": { 14 | "given_url": "https://www.scottaaronson.com/blog/?p=2464", 15 | "given_title": "Bell inequality violation finally done right", 16 | "favorite": "0", 17 | "sort_id": 2, 18 | "resolved_title": "Bell inequality violation finally done right", 19 | "resolved_url": "https://www.scottaaronson.com/blog/?p=2464", 20 | "excerpt": "A few weeks ago, Hensen et al., of the Delft University of Technology and Barcelona, Spain, put out a paper reporting the first experiment that violates the Bell inequality in a way that closes off the two main loopholes simultaneously: the locality and detection loopholes.", 21 | "is_article": "1" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/testdata/custom/file1.txt: -------------------------------------------------------------------------------- 1 | Right, so this points at http://google.com 2 | 3 | 4 | something something 5 | 6 | trailing slash maybe? http://google.com/ ? 7 | 8 | 9 | whoops! http://some-weird-domain.xyz/whatever 10 | -------------------------------------------------------------------------------- /tests/testdata/custom/file2.txt: -------------------------------------------------------------------------------- 1 | And this points at https://google.com . Whoa, so secure! 2 | 3 | and http://what.about.this.link? Really, you should add a space before ?, :123: to confuse grep but what if you didnt? 4 | 5 | this should be ignored chrome-extension://bfhcmckmbimgclmdomhanencdoefcnio/options_page.html since it's an internal brwoser link 6 | -------------------------------------------------------------------------------- /tests/testdata/logseq-graph/logseq/config.edn: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/testdata/logseq-graph/pages/Note.md: -------------------------------------------------------------------------------- 1 | This is a test note with a [link](https://example.com). 2 | -------------------------------------------------------------------------------- /tests/testdata/normalise/ff.txt: -------------------------------------------------------------------------------- 1 | http://hi.com 2 | 3 | 4 | http://reddit.com/post 5 | 6 | http://reddit.com/post&stupid_param=whatever 7 | 8 | 9 | # this v isn't detected at the moment because of the typo in http. Not sure if it ever will be? 
10 | htpp://reddit.com/post?whoops 11 | 12 | http://www.argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay 13 | 14 | http://www.argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay?storeId=10151&GiftMode=Reset&langId=110&krypto=DfKzD/frV1rz5gXFhfkTSOJ/+Fphcd/Mx/H5+m5Jfbp5UlOUllPqDwFbO94lNbtFaEXhWQ7bVqIl\nbqhTqO1zcQ7FXphHXYAO3bbj07XbbDf40pQX5mQFACOPRF0LPibBG6yqBP0RDWQWUl7vcgTmqA== 15 | 16 | 17 | https://www.youtube.com/watch?v=XXlZfc1TrD0 18 | 19 | https://www.youtube.com/watch?v=XXlZfc1Tr11 20 | -------------------------------------------------------------------------------- /tests/testdata/obsidian-vault/.obsidian/app.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tests/testdata/obsidian-vault/Note.md: -------------------------------------------------------------------------------- 1 | This is a test note with a [link](https://example.com). 2 | -------------------------------------------------------------------------------- /tests/testdata/takeout-20150518T000000Z.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/karlicoss/promnesia/75ea4a55b9b5c49125c3fd72197e42785ad604d6/tests/testdata/takeout-20150518T000000Z.zip -------------------------------------------------------------------------------- /tests/testdata/takeout/Takeout/My Activity/Chrome/MyActivity.html: -------------------------------------------------------------------------------- 1 | My Activity History
[markup stripped in this dump] The fixture is an HTML activity list with three Chrome "My Activity" entries, each tagged "Products: Search": Visited https://productforums.google.com/forum/ (Jan 31, 2018, 10:54:50 PM); Visited http://www.adobe.com/creativecloud.html (Feb 8, 2017, 12:32:39 AM); Searched for "adobe creative cloud" (Feb 8, 2017, 12:32:36 AM, Locations: From your home: https://google.com/maps?q=25.800819,-80.186310).
48 | -------------------------------------------------------------------------------- /tests/testdata/test_config.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from promnesia.common import Source 4 | from promnesia.sources.plaintext import extract_from_path 5 | import promnesia.sources.shellcmd as shellcmd # type: ignore 6 | import promnesia.sources.takeout as takeout # type: ignore 7 | 8 | 9 | def index_takeout(): 10 | class user_config: 11 | takeout_path = 'tests/testdata/takeout-20150518T000000Z.zip' 12 | 13 | import my.config 14 | my.config.google = user_config # type: ignore 15 | 16 | yield from takeout.index() 17 | 18 | 19 | class Sources: 20 | 21 | TAKEOUT = Source(index_takeout, name='takeout') 22 | 23 | PLAIN = Source( 24 | shellcmd.index, 25 | extract_from_path('tests/testdata/custom'), 26 | name='test', 27 | ) 28 | 29 | 30 | SOURCES = [ 31 | Sources.PLAIN, 32 | Sources.TAKEOUT, 33 | ] 34 | 35 | # todo ugh, this shouldn't really be collected by pytest... 36 | -------------------------------------------------------------------------------- /tests/testdata/test_multiple_page_updates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14 | 15 | 16 | 17 | 18 | 19 |

My First Heading 20 | My first paragraph.

21 | 22 | link to promnesia 23 | 24 | link to promnesia issues 25 | 26 | link to HN 27 | 28 | another link to promnesia 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/testdata/traverse/ignoreme.txt: -------------------------------------------------------------------------------- 1 | jaiofjeoriheoirjg -------------------------------------------------------------------------------- /tests/testdata/traverse/ignoreme2/notrealignored.txt: -------------------------------------------------------------------------------- 1 | notrealignores -------------------------------------------------------------------------------- /tests/testdata/traverse/imhere.txt: -------------------------------------------------------------------------------- 1 | imhere.txt -------------------------------------------------------------------------------- /tests/testdata/traverse/imhere2/real.txt: -------------------------------------------------------------------------------- 1 | jdfioja -------------------------------------------------------------------------------- /tests/testdata/weird.txt: -------------------------------------------------------------------------------- 1 | https://urbandictionary.com/define.php?term=Belgian%20Whistle 2 | right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow 3 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.21 3 | # relies on the correct version of Python installed 4 | envlist = ruff,tests-core,tests-all,mypy-core,mypy-misc 5 | # NOTE: we don't run end2end by default since it requires elaborate setup 6 | # https://github.com/tox-dev/tox/issues/20#issuecomment-247788333 7 | # hack to prevent .tox from crapping to the project directory 8 | toxworkdir = {env:TOXWORKDIR_BASE:}{toxinidir}/.tox 9 | 10 | [testenv] 11 | # TODO how to get package name from setuptools? 12 | package_name = "promnesia" 13 | passenv = 14 | # useful for tests to know they are running under ci 15 | CI 16 | CI_* 17 | # respect user's cache dirs to prevent tox from crapping into project dir 18 | PYTHONPYCACHEPREFIX 19 | MYPY_CACHE_DIR 20 | RUFF_CACHE_DIR 21 | # 22 | MY_CONFIG 23 | # by default we don't run browser tests to avoid confusing people when they run locally 24 | # but we want them on CI, so we allow to pass through the variable when we do want to run them 25 | WITH_BROWSER_TESTS 26 | # todo ugh this is all so confusing... need to simplify 27 | usedevelop = true # for some reason tox seems to ignore "-e ." in deps section?? 28 | uv_seed = true # seems necessary so uv creates separate venvs per tox env? 29 | setenv = 30 | HPI_MODULE_INSTALL_USE_UV=true 31 | 32 | 33 | [testenv:ruff] 34 | dependency_groups = testing 35 | commands = 36 | {envpython} -m ruff check src/ 37 | 38 | 39 | # just the very core tests with minimal dependencies 40 | [testenv:tests-core] 41 | dependency_groups = testing 42 | deps = 43 | -e .[markdown] 44 | # NOTE: markdown is only used for test_cli... might be nice to decouple 45 | commands = 46 | # posargs allow test filtering, e.g. tox ... 
-- -k test_name 47 | {envpython} -m pytest \ 48 | --pyargs {[testenv]package_name} \ 49 | # note: sources are tested in tests-all 50 | --ignore src/promnesia/sources \ 51 | --ignore src/promnesia/tests/sources \ 52 | {posargs} 53 | 54 | 55 | [testenv:tests-all] 56 | dependency_groups = testing 57 | deps = 58 | -e .[all,HPI,org] 59 | beautifulsoup4<4.13.0 # FIXME temporary hack until https://github.com/purarue/google_takeout_parser/pull/81 is merged 60 | uv # for hpi module install 61 | commands = 62 | # used in some tests 63 | {envpython} -m my.core module install \ 64 | my.google.takeout.parser \ 65 | my.hypothesis 66 | {envpython} -m pytest \ 67 | --pyargs {[testenv]package_name} \ 68 | {posargs} 69 | 70 | 71 | [testenv:end2end] 72 | setenv = 73 | WITH_BROWSER_TESTS=true 74 | PYTEST_TIMEOUT=120 75 | dependency_groups = 76 | testing 77 | testing-end2end 78 | deps = 79 | -e .[HPI] 80 | uv # for hpi module install 81 | commands = 82 | {envpython} -m my.core module install my.hypothesis 83 | {envpython} -m pytest \ 84 | # TODO noconftest is hack due to end2end tests being in a separate dir 85 | # ideally need to just move it inside the package as well 86 | --noconftest \ 87 | tests/end2end_test.py \ 88 | {posargs} 89 | 90 | 91 | [testenv:mypy-core] 92 | dependency_groups = testing 93 | commands = 94 | {envpython} -m mypy --no-install-types \ 95 | # note: sources are tested separately, below 96 | -p {[testenv]package_name} --exclude 'sources/*' \ 97 | # txt report is a bit more convenient to view on CI 98 | --txt-report .coverage.mypy-core \ 99 | --html-report .coverage.mypy-core \ 100 | {posargs} 101 | 102 | 103 | [testenv:mypy-misc] 104 | dependency_groups = testing 105 | deps = 106 | -e .[HPI,org,markdown] # todo install from HPI[all] or something? 107 | beautifulsoup4<4.13.0 # FIXME temporary hack until https://github.com/purarue/google_takeout_parser/pull/81 is merged 108 | uv # for hpi module install 109 | commands = 110 | {envpython} -m my.core module install \ 111 | my.github.ghexport \ 112 | my.hypothesis \ 113 | my.instapaper \ 114 | my.pocket \ 115 | my.reddit \ 116 | my.fbmessenger \ 117 | my.google.takeout.parser \ 118 | my.browser.export 119 | 120 | {envpython} -m mypy --no-install-types \ 121 | -p {[testenv]package_name}.sources \ 122 | # txt report is a bit more convenient to view on CI 123 | --txt-report .coverage.mypy-misc \ 124 | --html-report .coverage.mypy-misc \ 125 | {posargs} 126 | 127 | # ugh. a bit crap to run it separately 128 | # but first will need to move tests inside the package if we want otherwise? 129 | # and I recall it was problematic at times.. 130 | {envpython} -m mypy --no-install-types \ 131 | tests --exclude 'testdata/*' \ 132 | {posargs} 133 | --------------------------------------------------------------------------------