├── .flake8 ├── .github └── pull_request_template.md ├── .gitignore ├── .gitlab-ci.yml ├── .gitlab ├── changelog_config.yml └── merge_request_templates │ └── default.md ├── .gitmodules ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── debian ├── changelog ├── compat ├── control ├── rules ├── sosse-crawler.service ├── sosse-uwsgi.service ├── sosse.conf ├── sosse.install ├── sosse.postinst ├── uwsgi.ini └── uwsgi.params ├── doc ├── Makefile ├── build_changelog.sh ├── build_check.sh ├── get_artifacts.py ├── make.bat ├── requirements.txt └── source │ ├── _extensions │ └── code_blocks.py │ ├── _static │ └── style.css │ ├── _templates │ ├── base.html │ └── page.html │ ├── admin_ui.rst │ ├── administration.rst │ ├── cli.rst │ ├── conf.py │ ├── config_file.rst │ ├── cookies.rst │ ├── crawl │ ├── analytics.rst │ ├── crawlers.rst │ ├── feeds.rst │ ├── new_url.rst │ ├── policies.rst │ ├── queue.rst │ └── recursion_depth.rst │ ├── crawl_guidelines.rst │ ├── documents.rst │ ├── domain_settings.rst │ ├── excluded_urls.rst │ ├── guides.rst │ ├── guides │ ├── archive.rst │ ├── authentication.rst │ ├── authentication_browser_inspect.png │ ├── captcha.rst │ ├── download.rst │ ├── feed_website_monitor.rst │ └── search.rst │ ├── index.rst │ ├── install.rst │ ├── install │ ├── database.rst.template │ ├── debian.rst │ ├── debian_upgrades.rst │ ├── docker.rst │ ├── docker_compose.rst │ ├── docker_compose_upgrades.rst │ ├── docker_upgrades.rst │ ├── pip.rst │ └── pip_upgrades.rst │ ├── introduction.rst │ ├── permissions.rst │ ├── screenshots.rst │ ├── search_engines.rst │ ├── tags.rst │ ├── user │ ├── archive.rst │ ├── history.rst │ ├── profile.rst │ ├── rest_api.rst │ ├── search.rst │ ├── shortcut_list.rst │ └── shortcuts.rst │ ├── user_doc.rst │ └── webhooks.rst ├── docker-compose.yml ├── docker ├── Makefile ├── Makefile.common ├── README.md ├── debian-base │ ├── Dockerfile │ └── Makefile ├── debian-pkg │ ├── Dockerfile │ └── Makefile ├── debian-test │ ├── Dockerfile │ └── Makefile ├── debian │ ├── Dockerfile │ └── Makefile ├── doc │ ├── Dockerfile │ ├── Makefile │ └── requirements-rtd.txt ├── docker │ ├── Dockerfile │ └── Makefile ├── pg_run.sh ├── pip-base │ ├── Dockerfile │ └── Makefile ├── pip-compose │ ├── Dockerfile │ └── Makefile ├── pip-release │ ├── Dockerfile │ └── Makefile ├── pip-test │ ├── Dockerfile │ └── Makefile └── run.sh ├── package-lock.json ├── package.json ├── pyproject.toml ├── requirements.txt ├── se ├── __init__.py ├── about.py ├── add_to_queue.py ├── admin.py ├── analytics.py ├── apps.py ├── archive.py ├── atom.py ├── browser.py ├── browser_chromium.py ├── browser_firefox.py ├── browser_request.py ├── browser_selenium.py ├── cookie.py ├── cookies_import.py ├── crawl_policy.py ├── crawl_queue.py ├── crawlers.py ├── document.py ├── document_meta.py ├── domain_setting.py ├── download.py ├── favicon.py ├── history.py ├── html.py ├── html_asset.py ├── html_cache.py ├── html_snapshot.py ├── login.py ├── management │ ├── __init__.py │ └── commands │ │ ├── __init__.py │ │ ├── clear_html_archive.py │ │ ├── crawl.py │ │ ├── default_admin.py │ │ ├── default_conf.py │ │ ├── delete_documents.py │ │ ├── extract_doc.py │ │ ├── generate_secret.py │ │ ├── load_se.py │ │ └── update_se.py ├── migrations │ ├── 0001_initial.py │ ├── 0002_search_vector.py │ ├── 0003_sosse_1_1_0.py │ ├── 0004_sosse_1_2_0.py │ ├── 0005_sosse_1_3_0.py │ ├── 0006_sosse_1_4_0.py │ ├── 0007_sosse_1_5_0.py │ ├── 0008_sosse_1_6_0.py │ ├── 0009_sosse_1_7_0.py │ ├── 0010_sosse_1_8_0.py │ ├── 0011_sosse_1_9_0.py │ ├── 0012_sosse_1_10_0.py │ ├── 0013_sosse_1_11_0.py │ ├── 0014_sosse_1_12_0.py │ ├── 0015_sosse_1_13_0.py │ └── __init__.py ├── models.py ├── online.py ├── opensearch.py ├── page.py ├── profile.py ├── resources.py ├── rest_api.py ├── rest_permissions.py ├── screenshot.py ├── search.py ├── search_form.py ├── search_redirect.py ├── static │ └── se │ │ ├── admin-base.css │ │ ├── admin-change_form.js │ │ ├── admin-forms.css │ │ ├── admin-webhooks.js │ │ ├── admin.js │ │ ├── agpl-logo.svg │ │ ├── analytics.js │ │ ├── base.js │ │ ├── discord-symbol.svg │ │ ├── github-mark.png │ │ ├── gitlab-tanuki-scalability.svg │ │ ├── icon-atom.svg │ │ ├── icon-clear.svg │ │ ├── icon-cog.svg │ │ ├── icon-search.svg │ │ ├── icon-trash.svg │ │ ├── icon-user.svg │ │ ├── index.js │ │ ├── logo.png │ │ ├── logo.svg │ │ ├── screenshot.js │ │ ├── style.css │ │ └── tags.js ├── tag.py ├── tag_field.py ├── tags.py ├── tags_list.py ├── templates │ ├── admin │ │ ├── add_to_queue.html │ │ ├── analytics.html │ │ ├── app_list.html │ │ ├── base.html │ │ ├── base_site.html │ │ ├── change_form.html │ │ ├── change_list.html │ │ ├── change_list_object_tools.html │ │ ├── cookies_import.html │ │ ├── crawl_policy_desc.html │ │ ├── crawl_queue.html │ │ ├── crawl_queue_content.html │ │ ├── crawlers.html │ │ ├── crawlers_content.html │ │ ├── delete_confirmation.html │ │ ├── delete_selected_confirmation.html │ │ ├── index.html │ │ └── object_history.html │ ├── registration │ │ ├── password_change_done.html │ │ ├── password_change_form.html │ │ ├── password_reset_complete.html │ │ ├── password_reset_confirm.html │ │ ├── password_reset_done.html │ │ └── password_reset_form.html │ └── se │ │ ├── about.html │ │ ├── archive.html │ │ ├── base.html │ │ ├── base_fold.html │ │ ├── base_raw.html │ │ ├── components │ │ ├── modal.html │ │ ├── tag.html │ │ ├── tag_action.html │ │ └── tags_list.html │ │ ├── download.html │ │ ├── embed.html │ │ ├── feed.html │ │ ├── history.html │ │ ├── home_browse.html │ │ ├── html_excluded.html │ │ ├── info_fallback.html │ │ ├── main_menu.html │ │ ├── opensearch.xml │ │ ├── pagination.html │ │ ├── profile.html │ │ ├── resources.html │ │ ├── screenshot_full.html │ │ ├── search.html │ │ ├── search_redirect.html │ │ ├── tags.html │ │ ├── unknown_url.html │ │ ├── words.html │ │ └── www.html ├── test_add_to_queue.py ├── test_admin.py ├── test_atom.py ├── test_browser.py ├── test_commands.py ├── test_cookie.py ├── test_cookies_import.py ├── test_crawl.py ├── test_crawl_policy.py ├── test_document.py ├── test_document_index.py ├── test_document_meta.py ├── test_func_document_meta.py ├── test_functionals.py ├── test_html_cache.py ├── test_html_snapshot.py ├── test_misc.py ├── test_mock.py ├── test_parser.py ├── test_redirect.py ├── test_requests.py ├── test_rest_api.py ├── test_rest_permissions.py ├── test_search.py ├── test_tag.py ├── test_url.py ├── test_views.py ├── test_views_mixin.py ├── test_webhook.py ├── url.py ├── utils.py ├── views.py ├── webhook.py ├── words.py └── www.py ├── sosse-admin ├── sosse ├── __init__.py ├── conf.py ├── search_engines.json ├── settings.py ├── sosse_admin.py ├── test_runner.py ├── urls.py └── wsgi.py ├── swagger-initializer.js └── tests ├── build_doc.sh ├── cookies.json ├── coverage.patch ├── doc_test.sh ├── docker_run.sh ├── document-ja.json ├── opensearch.xml ├── pages ├── browser_detect_js.html ├── browser_detect_no_js.html ├── css_in_js.html ├── nav_elements.html ├── test.jpg ├── test.mp4 ├── test.pdf ├── test.png ├── test.wav └── test.zip ├── release.md ├── robotframework ├── config.yaml ├── docs │ ├── 01_firstrun.robot │ ├── 02_crawl.robot │ ├── 03_user.robot │ └── __init__.robot ├── guide_auth.tar.gz ├── guide_download.tar.gz ├── guide_feed_website_monitor.json ├── guides │ ├── 01_download.robot │ ├── 02_monitor_website.robot │ ├── 03_search_policy.robot │ ├── 04_auth.robot │ └── __init__.robot ├── home_docs.json ├── home_favicon.json ├── requirements.txt ├── start.sh ├── tests │ ├── 01_tags.robot │ ├── 02_webhooks.robot │ ├── 03_add_to_queue.robot │ ├── 04_crawl_policies.robot │ ├── __init__.robot │ ├── common.robot │ ├── crawl_policy.robot │ ├── documents.robot │ ├── tags.robot │ └── webhooks.robot ├── vrt.sh └── vrt │ ├── 01_vrt.robot │ └── __init__.robot ├── run_tests.sh ├── searchhistory.json ├── test_app.sh ├── vrt.md └── wait_for_pg.sh /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # E203: whitespace before ‘,’, ‘;’, or ‘:’ -> conflicts with black 3 | # E501: line too long 4 | # W503: line break before binary operator 5 | # W504: line break after binary operator 6 | ignore=E203,E501,W503,W504 7 | exclude=migrations,tests,se/deps 8 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | Please include a summary of the changes you’ve made. If applicable, explain why the changes were necessary. 4 | 5 | ### Related Issue(s) 6 | 7 | Closes (if applicable) 8 | 9 | ### Checklist 10 | 11 | - [ ] My code follows the project's coding style. 12 | - [ ] I have tested my changes locally. 13 | - [ ] I have updated the documentation (if necessary). 14 | - [ ] I have added or updated tests (if applicable). 15 | 16 | ### Copyright Assignment 17 | 18 | - [ ] I confirm that the code is my original work and I am assigning copyright to the project. 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | doc/code_blocks.json 2 | doc/source/CHANGELOG.md 3 | doc/source/cli_generated.rst 4 | doc/source/config_file_generated.rst 5 | doc/source/install/database_debian_generated.rst 6 | doc/source/install/database_pip_generated.rst 7 | doc/source/user/shortcut_list_generated.rst 8 | docker/doc/requirements.txt 9 | docker/debian-base/control 10 | docker/debian-test/requirements.txt 11 | docker/debian-test/pre-commit-config.yaml 12 | docker/pip-release/pg_run.sh 13 | docker/pip-release/run.sh 14 | docker/pip-release/sosse.conf 15 | docker/pip-release/uwsgi.* 16 | docker/pip-compose/run.sh 17 | docker/pip-compose/sosse.conf 18 | docker/pip-compose/uwsgi.* 19 | se/static/swagger/ 20 | se/static/se/node_modules/ 21 | 22 | # genertated during VRT tests 23 | tests/robotframework/config.yaml 24 | -------------------------------------------------------------------------------- /.gitlab/changelog_config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | categories: 3 | feature: Features 4 | bug: Bug fixes 5 | security: Security fixes 6 | -------------------------------------------------------------------------------- /.gitlab/merge_request_templates/default.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | Please include a summary of the changes you’ve made. If applicable, explain why the changes were necessary. 4 | 5 | ### Related Issue(s) 6 | 7 | Closes (if applicable) 8 | 9 | ### Checklist 10 | 11 | - [ ] My code follows the project's coding style. 12 | - [ ] I have tested my changes locally. 13 | - [ ] I have updated the documentation (if necessary). 14 | - [ ] I have added or updated tests (if applicable). 15 | 16 | ### Copyright Assignment 17 | 18 | - [ ] I confirm that the code is my original work and I am assigning copyright to the project. 19 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "se/deps/linkpreview"] 2 | path = se/deps/linkpreview 3 | url = https://github.com/meyt/linkpreview/ 4 | [submodule "se/deps/fake-useragent"] 5 | path = se/deps/fake-useragent 6 | url = https://github.com/fake-useragent/fake-useragent.git 7 | [submodule "se/deps/unicode_mime_icons"] 8 | path = se/deps/unicode_mime_icons 9 | url = https://github.com/biolds/unicode_mime_icons.git 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | exclude: "^(se/templates/se/opensearch.xml|doc/source/_extensions/.*)$" 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | - id: end-of-file-fixer 9 | - id: check-ast 10 | - id: check-yaml 11 | - id: check-docstring-first 12 | - id: check-executables-have-shebangs 13 | - id: check-json 14 | - id: check-merge-conflict 15 | - id: check-shebang-scripts-are-executable 16 | - id: check-symlinks 17 | - id: check-toml 18 | - id: check-xml 19 | - id: check-yaml 20 | - id: mixed-line-ending 21 | - id: requirements-txt-fixer 22 | - id: trailing-whitespace 23 | - repo: https://github.com/asottile/pyupgrade 24 | rev: v3.20.0 25 | hooks: 26 | - id: pyupgrade 27 | - repo: https://github.com/astral-sh/ruff-pre-commit 28 | # Ruff version. 29 | rev: v0.11.11 30 | hooks: 31 | # Run the linter. 32 | - id: ruff 33 | args: [--fix] 34 | # Run the formatter. 35 | - id: ruff-format 36 | - repo: https://github.com/PyCQA/bandit 37 | rev: "1.8.3" 38 | hooks: 39 | - id: bandit 40 | exclude: "^(doc/.*|se/test_.*)$" 41 | - repo: https://github.com/biolds/docformatter 42 | rev: v1.7.6-alpha 43 | hooks: 44 | - id: docformatter 45 | - repo: https://github.com/PyCQA/doc8 46 | rev: "v1.1.2" 47 | hooks: 48 | - id: doc8 49 | - repo: https://github.com/PyCQA/isort 50 | rev: "6.0.1" 51 | hooks: 52 | - id: isort 53 | - repo: https://github.com/PyCQA/eradicate 54 | rev: "3.0.0" 55 | hooks: 56 | - id: eradicate 57 | - repo: https://github.com/asottile/yesqa 58 | rev: v1.5.0 59 | hooks: 60 | - id: yesqa 61 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | jobs: 15 | pre_build: 16 | - python3 doc/get_artifacts.py 17 | 18 | python: 19 | install: 20 | - requirements: doc/requirements.txt 21 | 22 | # Build documentation in the docs/ directory with Sphinx 23 | sphinx: 24 | configuration: doc/source/conf.py 25 | fail_on_warning: true 26 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM biolds/sosse:pip-base 2 | ARG PIP_INDEX_URL= 3 | ARG PIP_TRUSTED_HOST= 4 | RUN mkdir /root/sosse 5 | WORKDIR /root/sosse 6 | ADD requirements.txt . 7 | ADD pyproject.toml . 8 | ADD MANIFEST.in . 9 | ADD Makefile . 10 | ADD package.json . 11 | ADD swagger-initializer.js . 12 | ADD README.md . 13 | ADD se/ se/ 14 | ADD sosse/ sosse/ 15 | RUN apt-get update && apt-get install -y postgresql && apt-get clean 16 | RUN make install_js_deps 17 | RUN virtualenv /venv 18 | RUN /venv/bin/pip install ./ && /venv/bin/pip install uwsgi && /venv/bin/pip cache purge 19 | ADD debian/sosse.conf /etc/nginx/sites-enabled/default 20 | RUN mkdir -p /etc/sosse/ /etc/sosse_src/ /var/log/sosse /var/log/uwsgi /var/www/.cache /var/www/.mozilla 21 | ADD debian/uwsgi.* /etc/sosse_src/ 22 | RUN chown -R root:www-data /etc/sosse /etc/sosse_src && chmod 750 /etc/sosse_src/ && chmod 640 /etc/sosse_src/* 23 | RUN chown www-data:www-data /var/log/sosse /var/www/.cache /var/www/.mozilla 24 | ADD docker/run.sh docker/pg_run.sh / 25 | RUN chmod 755 /run.sh /pg_run.sh 26 | 27 | WORKDIR / 28 | USER postgres 29 | RUN /etc/init.d/postgresql start && \ 30 | (until pg_isready; do sleep 1; done) && \ 31 | psql --command "CREATE USER sosse WITH PASSWORD 'sosse';" && \ 32 | createdb -O sosse sosse && \ 33 | /etc/init.d/postgresql stop && \ 34 | tar -c -p -C / -f /tmp/postgres_sosse.tar.gz /var/lib/postgresql 35 | 36 | USER root 37 | CMD ["/usr/bin/bash", "/pg_run.sh"] 38 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include se/deps/fake-useragent/src/fake_useragent/data/* 2 | include se/deps/unicode_mime_icons/* 3 | include se/static/se/* 4 | include se/static/se/node_modules/@kurkle/color/dist/*.js 5 | include se/static/se/node_modules/chart.js/dist/*.js 6 | include se/static/se/node_modules/chart.js/dist/*/*.js 7 | include se/static/se/node_modules/chartjs-adapter-luxon/dist/*.min.js 8 | include se/static/se/node_modules/luxon/build/global/luxon.min.js 9 | include se/static/swagger/* 10 | include se/templates/admin/* 11 | include se/templates/se/* 12 | include se/templates/se/components/* 13 | include sosse/search_engines.json 14 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 10 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: sosse 2 | Section: admin 3 | Priority: optional 4 | Maintainer: Laurent DEFERT 5 | Build-Depends: debhelper (>=10), dh-python, python3, python3-setuptools 6 | Standards-Version: 4.0.0 7 | Homepage: https://gitlab.com/biolds1/sosse 8 | 9 | Package: sosse 10 | Architecture: any 11 | Depends: ${shlibs:Depends}, ${misc:Depends}, postgresql, nginx, uwsgi, uwsgi-plugin-python3, python3-cssutils, python3-django, python3-django-filters, python3-django-treebeard, python3-djangorestframework, python3-djangorestframework-spectacular, python3-requests, python3-bs4, python3-html5lib, python3-psycopg2, python3-django-uwsgi, python3-feedparser, python3-langdetect, python3-pil, python3-publicsuffix2, python3-psutil, python3-lxml, python3-magic, python3-defusedxml, python3-selenium, libjs-jquery, firefox-esr, chromium, chromium-driver, fonts-noto, unifont 12 | Description: Open Source Search Engine 13 | Open Source Search Engine 14 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | export DH_VERBOSE=1 4 | 5 | %: 6 | dh $@ --with python3 7 | 8 | override_dh_installinit: 9 | dh_installinit --name=sosse-uwsgi 10 | dh_installinit --name=sosse-crawler 11 | -------------------------------------------------------------------------------- /debian/sosse-crawler.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=SOSSE crawler 3 | After=syslog.target network.target postgresql.service 4 | 5 | [Service] 6 | ExecStartPre=-+mkdir -p /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/browser_config /var/www/.mozilla /var/www/.cache /var/log/sosse 7 | ExecStartPre=-+touch /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log /var/log/sosse/webhooks.log 8 | ExecStartPre=-+chown www-data:www-data /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/browser_config /var/www/.mozilla /var/www/.cache 9 | ExecStartPre=-+chown www-data:www-data -R /var/log/sosse/ 10 | ExecStart=/usr/bin/sosse-admin crawl 11 | User=www-data 12 | Group=www-data 13 | 14 | RuntimeDirectory= 15 | Restart=always 16 | KillSignal=SIGQUIT 17 | Type=simple 18 | NotifyAccess=all 19 | 20 | [Install] 21 | WantedBy=multi-user.target 22 | -------------------------------------------------------------------------------- /debian/sosse-uwsgi.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=SOSSE uWSGI server 3 | After=syslog.target network.target postgresql.service 4 | 5 | [Service] 6 | ExecStartPre=-+mkdir -p /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/browser_config /var/www/.mozilla /var/www/.cache /var/log/sosse 7 | ExecStartPre=-+touch /var/log/sosse/crawler.log /var/log/sosse/debug.log /var/log/sosse/main.log /var/log/sosse/webserver.log /var/log/sosse/webhooks.log 8 | ExecStartPre=-+chown www-data:www-data /run/sosse /var/lib/sosse/downloads /var/lib/sosse/screenshots /var/lib/sosse/html /var/lib/sosse/browser_config /var/www/.mozilla /var/www/.cache 9 | ExecStartPre=-+chown www-data:www-data -R /var/log/sosse/ 10 | ExecStart=/usr/bin/uwsgi --uid www-data --gid www-data --plugin python3 --ini /etc/sosse/uwsgi.ini \ 11 | --logto /var/log/uwsgi/sosse.log 12 | 13 | RuntimeDirectory= 14 | Restart=always 15 | KillSignal=SIGQUIT 16 | Type=notify 17 | NotifyAccess=all 18 | 19 | [Install] 20 | WantedBy=multi-user.target 21 | -------------------------------------------------------------------------------- /debian/sosse.install: -------------------------------------------------------------------------------- 1 | sosse usr/lib/python3.11/dist-packages 2 | se usr/lib/python3.11/dist-packages 3 | sosse-admin usr/bin 4 | debian/uwsgi.ini etc/sosse/ 5 | debian/uwsgi.params etc/sosse/ 6 | debian/sosse.conf etc/nginx/sites-available 7 | -------------------------------------------------------------------------------- /debian/uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | 3 | # Django-related settings 4 | # the base directory (full path) 5 | chdir = / 6 | # Django's wsgi file 7 | module = sosse.wsgi 8 | 9 | # process-related settings 10 | # master 11 | master = true 12 | # maximum number of worker processes 13 | processes = 10 14 | # the socket (use the full path to be safe 15 | socket = /run/sosse/uwsgi.sock 16 | # ... with appropriate permissions - may be needed 17 | # chmod-socket = 664 18 | # clear environment on exit 19 | vacuum = true 20 | -------------------------------------------------------------------------------- /debian/uwsgi.params: -------------------------------------------------------------------------------- 1 | 2 | uwsgi_param QUERY_STRING $query_string; 3 | uwsgi_param REQUEST_METHOD $request_method; 4 | uwsgi_param CONTENT_TYPE $content_type; 5 | uwsgi_param CONTENT_LENGTH $content_length; 6 | 7 | uwsgi_param REQUEST_URI $request_uri; 8 | uwsgi_param PATH_INFO $document_uri; 9 | uwsgi_param DOCUMENT_ROOT $document_root; 10 | uwsgi_param SERVER_PROTOCOL $server_protocol; 11 | uwsgi_param REQUEST_SCHEME $scheme; 12 | uwsgi_param HTTPS $https if_not_empty; 13 | 14 | uwsgi_param REMOTE_ADDR $remote_addr; 15 | uwsgi_param REMOTE_PORT $remote_port; 16 | uwsgi_param SERVER_PORT $server_port; 17 | uwsgi_param SERVER_NAME $server_name; 18 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/build_changelog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROJECT_NAME=biolds1/sosse 3 | PROJECT_URL=https://gitlab.com/$PROJECT_NAME 4 | 5 | cd "$(dirname $0)/.." 6 | echo '# Changelog' 7 | cat CHANGELOG.md | \ 8 | sed -e "s#(${PROJECT_NAME}@\([0-9a-f]\+\))#(${PROJECT_URL}/-/commit/\1)#g" \ 9 | -e "s#(${PROJECT_NAME}!\([0-9]\+\))#(${PROJECT_URL}/-/merge_requests/\1)#g" \ 10 | -------------------------------------------------------------------------------- /doc/build_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | files=$(find build/ -name '*.html') 4 | 5 | bad_files=0 6 | 7 | for file in $files; do 8 | if ! grep -q '\' "$file"; then 9 | echo "$file is missing Umami javascript snippet" >&2 10 | bad_files=1 11 | fi 12 | done 13 | 14 | exit "$bad_files" 15 | -------------------------------------------------------------------------------- /doc/get_artifacts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # Copyright 2022-2025 Laurent Defert 3 | # 4 | # This file is part of SOSSE. 5 | # 6 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 7 | # General Public License as published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 11 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 | # See the GNU Affero General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 15 | # If not, see . 16 | 17 | import os 18 | from io import BytesIO 19 | from zipfile import ZipFile 20 | 21 | import gitlab 22 | import requests 23 | 24 | PROJECT_ID = 41220530 25 | jobs_name = ["doc_gen", "functional_docs_chromium", "functional_tests_chromium", "functional_guides_chromium"] 26 | 27 | gl = gitlab.Gitlab() 28 | project = gl.projects.get(PROJECT_ID) 29 | 30 | commit_hash = os.environ["READTHEDOCS_GIT_COMMIT_HASH"] 31 | ref_name = os.environ["READTHEDOCS_GIT_IDENTIFIER"] 32 | pipeline = project.pipelines.list(ref=ref_name, sha=commit_hash)[0] 33 | 34 | print(f"Pipeline: {pipeline.id} {pipeline.status} {ref_name} {commit_hash}") 35 | 36 | for job in pipeline.jobs.list(get_all=True): 37 | if job.name in jobs_name: 38 | if job.status != "success": 39 | print(f"Job {job.web_url} did not succeed, state: {job.status}") 40 | exit(1) 41 | 42 | artifact = job.web_url + "/artifacts/download" 43 | 44 | print(f"Download artifact for {job.name} at {artifact}") 45 | req = requests.get(artifact) 46 | try: 47 | with ZipFile(BytesIO(req.content)) as zip_file: 48 | zip_file.extractall() 49 | except: # noqa 50 | with open("/tmp/artifact", "wb") as f: 51 | f.write(req.content) 52 | raise 53 | jobs_name.remove(job.name) 54 | 55 | if len(jobs_name): 56 | print("Job(s) %s not found" % (", ".join(jobs_name))) 57 | exit(1) 58 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | furo==2023.07.26 2 | myst-parser 3 | python-gitlab 4 | requests 5 | -------------------------------------------------------------------------------- /doc/source/_extensions/code_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright Enrico Zini 2 | # Copied from https://www.enricozini.org/blog/2020/python/checking-sphinx-code-blocks/ 3 | # Extract code blocks from sphinx 4 | 5 | from docutils.nodes import literal_block, Text 6 | import json 7 | 8 | found = [] 9 | 10 | 11 | def find_code(app, doctree, fromdocname): 12 | for node in doctree.traverse(literal_block): 13 | lang = node.attributes.get("language", "default") 14 | 15 | for subnode in node.traverse(Text): 16 | found.append({ 17 | "src": fromdocname, 18 | "lang": lang, 19 | "code": subnode, 20 | "source": node.source, 21 | "line": node.line, 22 | }) 23 | 24 | 25 | def output(app, exception): 26 | if exception is not None: 27 | return 28 | 29 | dest = app.config.test_code_output 30 | if dest is None: 31 | return 32 | 33 | with open(dest, "wt") as fd: 34 | json.dump(found, fd) 35 | 36 | 37 | def setup(app): 38 | app.add_config_value('test_code_output', None, '') 39 | 40 | app.connect('doctree-resolved', find_code) 41 | app.connect('build-finished', output) 42 | 43 | return { 44 | "version": '0.1', 45 | 'parallel_read_safe': True, 46 | 'parallel_write_safe': True, 47 | } 48 | -------------------------------------------------------------------------------- /doc/source/_static/style.css: -------------------------------------------------------------------------------- 1 | .sidebar-logo { 2 | width: 75px; 3 | text-align: left; 4 | margin: 0; 5 | } 6 | img.sosse-screenshot { 7 | box-shadow: 0px 0px 4px 4px #eee; 8 | margin-left: auto; 9 | margin-right: auto; 10 | display: block; 11 | height: 100%; 12 | object-fit: contain; 13 | } 14 | img.sosse-inline-screenshot { 15 | vertical-align: bottom; 16 | } 17 | -------------------------------------------------------------------------------- /doc/source/_templates/base.html: -------------------------------------------------------------------------------- 1 | {% extends "!base.html" %} 2 | 3 | {%- block site_meta -%} 4 | {{ super() }} 5 | {{ uma_script }} 6 | 22 | {%- endblock -%} 23 | -------------------------------------------------------------------------------- /doc/source/_templates/page.html: -------------------------------------------------------------------------------- 1 | {% extends "!page.html" %} 2 | 3 | {%- block footer -%} 4 |
5 | 6 | 🔍 7 | 8 | Search discussions 9 | 10 | 11 | 12 | 13 | 💬 14 | 15 | Discuss this page 16 | 17 | 18 | 19 | 20 | 🧑‍💼 21 | 22 | Get professional support 23 | 24 | 25 |
26 | {{ super() }} 27 | {%- endblock -%} 28 | -------------------------------------------------------------------------------- /doc/source/admin_ui.rst: -------------------------------------------------------------------------------- 1 | Administration interface 2 | ======================== 3 | 4 | To reach administration user interface, you first need to authenticate, by clicking the |user_menu_button| button, then 5 | selecting ``Log in``. 6 | 7 | .. |user_menu_button| image:: ../../tests/robotframework/screenshots/user_menu_button.png 8 | :class: sosse-inline-screenshot 9 | 10 | The default user name and password are both ``admin``. After submitting, the administration interface can be reached 11 | from the |conf_menu_button| menu, by selecting ``Administration``. 12 | 13 | .. |conf_menu_button| image:: ../../tests/robotframework/screenshots/conf_menu_button.png 14 | :class: sosse-inline-screenshot 15 | 16 | .. image:: ../../tests/robotframework/screenshots/admin_ui.png 17 | :class: sosse-screenshot 18 | -------------------------------------------------------------------------------- /doc/source/administration.rst: -------------------------------------------------------------------------------- 1 | Administration 2 | ============== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | admin_ui.rst 9 | crawl/new_url.rst 10 | crawl/queue.rst 11 | crawl/crawlers.rst 12 | crawl/analytics.rst 13 | crawl/policies.rst 14 | crawl/recursion_depth.rst 15 | crawl/feeds.rst 16 | documents.rst 17 | tags.rst 18 | domain_settings.rst 19 | cookies.rst 20 | webhooks.rst 21 | excluded_urls.rst 22 | search_engines.rst 23 | permissions.rst 24 | -------------------------------------------------------------------------------- /doc/source/cli.rst: -------------------------------------------------------------------------------- 1 | Command Line Interface 2 | ====================== 3 | 4 | SOSSE provides a ``sosse-admin`` command that is based on the 5 | `Django management command `_. 6 | It can be called with ``sosse-amdin help``, to list all commands available, and ``sosse-admin help `` to get 7 | help a specific command. The help for SOSSE specific commands is also provided below: 8 | 9 | .. include:: cli_generated.rst 10 | -------------------------------------------------------------------------------- /doc/source/config_file.rst: -------------------------------------------------------------------------------- 1 | Configuration file reference 2 | ============================ 3 | 4 | SOSSE can be configured through the configuration file ``/etc/sosse/sosse.conf``. Configuration variables are grouped in 5 | 3 sections, depending on which component they affect. Modyifing any of these option requires restarting the crawlers or 6 | the web interface. 7 | 8 | .. note:: 9 | Configuration options can also be set using environment variables by prefixing with ``SOSSE_``. 10 | For example, the proxy option of the crawler can be set by settings the ``SOSSE_PROXY`` environment variable. 11 | Envionment variable options have highher precedence than options from the configuration file. 12 | 13 | .. include:: config_file_generated.rst 14 | -------------------------------------------------------------------------------- /doc/source/cookies.rst: -------------------------------------------------------------------------------- 1 | 🍪 Cookies 2 | ========== 3 | 4 | Cookies stored by the crawlers can be seen from the :doc:`../admin_ui`, by clicking on ``Cookies``. 5 | 6 | .. image:: ../../tests/robotframework/screenshots/cookies_list.png 7 | :class: sosse-screenshot 8 | 9 | You can find which cookie applies to a specific web page by typing its URL in the search bar. 10 | 11 | Cookies import 12 | -------------- 13 | 14 | Cookies can be imported using the ``Import cookies`` link. 15 | 16 | .. image:: ../../tests/robotframework/screenshots/cookies_import.png 17 | :class: sosse-screenshot 18 | 19 | They should be entered using the `Netscape cookie format `_. 20 | Tools like `Cookie editor `_ can be used to export cookies from a browser in a compatible manner. 21 | -------------------------------------------------------------------------------- /doc/source/crawl/analytics.rst: -------------------------------------------------------------------------------- 1 | 📊 Analytics 2 | ============ 3 | 4 | .. image:: ../../../tests/robotframework/screenshots/analytics.png 5 | :class: sosse-screenshot 6 | 7 | The analytics page shows global information about indexed pages, it can be reached by clicking ``📊 Analytics`` from 8 | the |conf_menu_button| menu, or in the :doc:`../admin_ui`. 9 | 10 | .. |conf_menu_button| image:: ../../../tests/robotframework/screenshots/conf_menu_button.png 11 | :class: sosse-inline-screenshot 12 | -------------------------------------------------------------------------------- /doc/source/crawl/crawlers.rst: -------------------------------------------------------------------------------- 1 | 🕷 Crawlers 2 | =========== 3 | 4 | The crawlers page displays real-time information on crawlers processes. It can be accessed from 5 | the :doc:`../admin_ui`, by selecting ``🕷 Crawlers``. 6 | 7 | .. image:: ../../../tests/robotframework/screenshots/crawlers.png 8 | :class: sosse-screenshot 9 | -------------------------------------------------------------------------------- /doc/source/crawl/feeds.rst: -------------------------------------------------------------------------------- 1 | Atom and RSS feeds 2 | ================== 3 | 4 | SOSSE can crawl `Atom `_ and 5 | `RSS `_ feeds, this can be useful to crawl websites that are updated often, and skip 6 | already indexed pages. To index a syndication feed, it needs to be :doc:`queued explicitly `. 7 | 8 | .. note:: 9 | SOSSE crawler does not recurse into feeds declared in the ```` element of webpages. To crawl a feed, the URL of the XML feed must be added to the crawl queue manually. 10 | 11 | Caching for news aggregators 12 | ---------------------------- 13 | 14 | By crawling syndication feeds, SOSSE can be used as an offline archive for news aggregator 🐊 softwares. After the XML 15 | feed is indexed, archived pages from the feed can be registered in the aggregator using the 16 | :ref:`atom feed ` generated by SOSSE. This can be done using the 17 | :doc:`search parameters <../user/search>`: 18 | 19 | - Leave the keyword field empty 20 | - Set a search parameter to ``Keep`` ``Linked by url`` ``Equal to``, and use the URL of the XML feed as the value 21 | - Sort results by ``First crawled descending`` 22 | 23 | .. image:: ../../../tests/robotframework/screenshots/syndication_feed.png 24 | :class: sosse-screenshot 25 | -------------------------------------------------------------------------------- /doc/source/crawl/new_url.rst: -------------------------------------------------------------------------------- 1 | 🌐 Crawl a new URL 2 | ================== 3 | 4 | In the |conf_menu_button| menu, or in the :doc:`../admin_ui`, by clicking ``🌐 Crawl a new URL`` you can queue one or 5 | multiple URLs to be crawled when a worker is available. 6 | 7 | .. |conf_menu_button| image:: ../../../tests/robotframework/screenshots/conf_menu_button.png 8 | :class: sosse-inline-screenshot 9 | 10 | .. image:: ../../../tests/robotframework/screenshots/crawl_new_url.png 11 | :class: sosse-screenshot 12 | 13 | By default, only the URLs queued for crawling will be visited. The crawler will not recurse into discovered links unless 14 | explicitly configured. 15 | 16 | To control how pages are indexed and whether recursion occurs, update the relevant settings in :doc:`policies`. 17 | 18 | After submitting a URL, the next page shows the :doc:`queue`. 19 | -------------------------------------------------------------------------------- /doc/source/crawl/queue.rst: -------------------------------------------------------------------------------- 1 | ✔ Crawl queue 2 | ============= 3 | 4 | The crawl queue page displays real-time information on documents being crawled. It can be accessed from 5 | the |conf_menu_button| menu, or from the :doc:`../admin_ui`, by selecting ``✔ Crawl queue``. 6 | 7 | .. |conf_menu_button| image:: ../../../tests/robotframework/screenshots/conf_menu_button.png 8 | :class: sosse-inline-screenshot 9 | 10 | .. image:: ../../../tests/robotframework/screenshots/crawl_queue.png 11 | :class: sosse-screenshot 12 | -------------------------------------------------------------------------------- /doc/source/crawl/recursion_depth.rst: -------------------------------------------------------------------------------- 1 | Recursive crawling 2 | ================== 3 | 4 | SOSSE can crawl recursively all pages it finds, or the recursion level can be limited when crawling large websites or 5 | public sites. 6 | 7 | No limit recursion 8 | ------------------- 9 | 10 | Recursing with no limit is achieved by using a policy with a :ref:`Recursion ` set to 11 | ``Crawl all pages``. 12 | 13 | For example, a full domain can be extracted with 2 policies: 14 | 15 | * A policy for the domain with a ``URL regex`` that matches the domain, and ``Recursion`` set to ``Crawl all pages`` 16 | 17 | * A default policy with a ``Recursion`` set to ``Never crawl`` (the default) 18 | 19 | Limited recursion 20 | ----------------- 21 | 22 | Crawling pages up to a certain level can be simply achieved by setting the :ref:`Recursion ` to 23 | ``Depending on depth`` and setting the ``Recursion depth`` when :doc:`queueing the initial URL `. 24 | 25 | .. image:: ../../../tests/robotframework/screenshots/crawl_on_depth_add.png 26 | :class: sosse-screenshot 27 | 28 | Partial limited recursion 29 | ------------------------- 30 | 31 | A mixed approach is also possible, by setting a :ref:`Recursion ` to ``Depending on depth`` in 32 | one policy, and setting it to ``Crawl all pages`` in an other and a positive ``Recursion depth``. 33 | 34 | For example, one could crawl all Wikipedia, and crawl external links up to 2 levels with the following policies: 35 | 36 | * A policy for Wikipedia, with ``Recursion depth`` of 2: 37 | 38 | .. image:: ../../../tests/robotframework/screenshots/policy_all.png 39 | :class: sosse-screenshot 40 | 41 | * A default policy with a ``Depending on depth`` condition: 42 | 43 | .. image:: ../../../tests/robotframework/screenshots/policy_on_depth.png 44 | :class: sosse-screenshot 45 | -------------------------------------------------------------------------------- /doc/source/crawl_guidelines.rst: -------------------------------------------------------------------------------- 1 | Guidelines for Ethical Use 2 | ========================== 3 | 4 | When using a web crawler or scraper, it’s important to be responsible and ethical. Here are some quick tips to keep in 5 | mind: 6 | 7 | **Get Permission First** 8 | ------------------------ 9 | 10 | Before crawling a site, make sure you’re allowed to access it. Although your crawler may have the ability to ignore 11 | `robots.txt` or modify the `User-Agent`, **always respect the site owner’s preferences**: 12 | 13 | - Read the site’s terms of service to see if scraping is allowed. 14 | - If you’re unsure, consider reaching out to the site owner for permission. 15 | 16 | **Crawl Responsibly & Respect the Environment** 17 | ----------------------------------------------- 18 | 19 | Crawling can impact both website performance and the environment. Here’s how to do it responsibly: 20 | - **Avoid Overloading Servers**: Don’t make too many requests at once or crawl the same pages repeatedly. 21 | 22 | - **Use Data Dumps**: If available, use downloadable data dumps (e.g., `Kiwix `_) 23 | instead of crawling, which helps reduce server load and saves resources. 24 | 25 | - **Consider Environmental Impact**: Crawling consumes energy. Keep your crawls efficient—only collect the data you 26 | need, and avoid unnecessary large downloads like media files. 27 | 28 | - **Use APIs When Available**: If the website provides an API, prefer using it instead of crawling, as APIs are 29 | optimized for data access and reduce server load. 30 | 31 | - **Prefer Generating Scripts with AI**: When possible, use AI to generate scripts for structured data extraction 32 | rather than parsing unstructured pages, which can be less efficient and error-prone. 33 | 34 | **Respect the Web** 35 | ------------------- 36 | 37 | Ethical scraping is all about respect: 38 | 39 | - Be transparent and let site owners know if you're crawling their content. 40 | - Avoid scraping personal or sensitive information unless explicitly allowed. 41 | - Follow copyright laws and properly attribute sources. 42 | 43 | For more information, see `Is Web Scraping Legal? `_. 44 | -------------------------------------------------------------------------------- /doc/source/domain_settings.rst: -------------------------------------------------------------------------------- 1 | 🕸 Domain Settings 2 | ================== 3 | 4 | Domain level parameters can be reached from the :doc:`../admin_ui`, by clicking on ``Domain settings``. 5 | 6 | .. image:: ../../tests/robotframework/screenshots/domain_setting.png 7 | :class: sosse-screenshot 8 | 9 | Domain settings are automatically created during crawling, but can also be updated manually or created manually. 10 | 11 | Browse mode 12 | """"""""""" 13 | 14 | When the policy's :ref:`Default browse mode ` is set to ``Detect``, the ``Browse mode`` option of 15 | the domain define which browsing method to use. When its value is ``Detect``, the browsing mode is detected the next 16 | time the page is accessed, and this option is switched to either ``Chromium``, ``Firefox`` or ``Python Requests``. 17 | 18 | .. _domain_ignore_robots: 19 | 20 | Ignore robots.txt 21 | """"""""""""""""" 22 | 23 | By default the crawler will honor the ``robots.txt`` 🤖 of the domain and follow its rules depending on the 24 | :ref:`User Agent `. When enabled, this option will ignore any ``robots.txt`` rule and crawl 25 | pages of the domain unconditionally. 26 | 27 | Robots.txt status 28 | """"""""""""""""" 29 | 30 | One of: 31 | 32 | * ``Unknown``: the file has not been processed yet 33 | * ``Empty``: there is no ``robots.txt`` or it's empty 34 | * ``Loaded``: the file has been successfully loaded 35 | 36 | Robots.txt allow/disallow rules 37 | """"""""""""""""""""""""""""""" 38 | 39 | This contains the rules relevant to the crawlers :ref:`User Agent `. 40 | -------------------------------------------------------------------------------- /doc/source/excluded_urls.rst: -------------------------------------------------------------------------------- 1 | 🔗 Excluded URLs 2 | ================ 3 | 4 | The excluded URLs list can be reached from the :doc:`../admin_ui`, by clicking on ``Excluded URLs``. 5 | 6 | .. image:: ../../tests/robotframework/screenshots/excluded_url.png 7 | :class: sosse-screenshot 8 | 9 | This stores URLs that will always be skipped by the crawlers. 10 | -------------------------------------------------------------------------------- /doc/source/guides.rst: -------------------------------------------------------------------------------- 1 | Guides 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | crawl_guidelines.rst 9 | guides/search.rst 10 | guides/archive.rst 11 | guides/download.rst 12 | guides/feed_website_monitor.rst 13 | guides/authentication.rst 14 | guides/captcha.rst 15 | -------------------------------------------------------------------------------- /doc/source/guides/authentication_browser_inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/doc/source/guides/authentication_browser_inspect.png -------------------------------------------------------------------------------- /doc/source/guides/captcha.rst: -------------------------------------------------------------------------------- 1 | Dealing with Captchas 2 | ===================== 3 | 4 | User agent 5 | ---------- 6 | 7 | By default, the crawlers send HTTP requests with a ``SOSSE`` 8 | `User agent HTTP header `_ this can sometime lead websites to flag the 9 | crawler as a robot and display a Captcha. To mitigate this, SOSSE can use the 10 | `Fake user-agent `_ library to simulate a real browser user agent. 11 | This can be achieved with the following options in the configuration file: 12 | 13 | * :ref:`user_agent`: uncomment the option and make it empty 14 | * :ref:`fake_user_agent_browser`, 15 | :ref:`fake_user_agent_os`, 16 | :ref:`fake_user_agent_platform`: these control how the user agent is generated. 17 | It's probably best to set the ``fake_user_agent_platform`` to ``pc`` as some website may change there rendering on 18 | mobile platforms. 19 | 20 | Cookies 21 | ------- 22 | 23 | The captcha can be manually validated in a browser, then cookies can be exported and imported in SOSSE, see the 24 | :doc:`Cookies<../cookies>` documentation. 25 | -------------------------------------------------------------------------------- /doc/source/guides/search.rst: -------------------------------------------------------------------------------- 1 | Website Search 2 | ============== 3 | 4 | SOSSE allows you to crawl a website and search its pages for specific keywords. This process involves configuring 5 | a :doc:`Crawl Policy <../crawl/policies>` to define how the site is crawled, followed by searching for the desired 6 | content. 7 | 8 | Creating a Crawl Policy 9 | ----------------------- 10 | 11 | Crawl policies control how SOSSE accesses and logs website content. This section covers key settings; for full details, 12 | see the :doc:`Crawl Policies <../crawl/policies>` documentation. 13 | 14 | By default, the crawler processes only directly queued pages. Enabling recursion ensures linked pages are also crawled: 15 | 16 | - In the ``⚡ Crawl`` tab, enter a regular expression to match URLs for crawling. 17 | - In the ``🔖 Archive`` tab, disable ``Archive content`` if you only need to search pages without archiving. 18 | - In the ``🕑 Recurrence`` tab, adjust the crawl frequency as needed. 19 | 20 | .. note:: 21 | By default, SOSSE archives pages, detects if a browser is required for rendering, and adjusts crawl frequency based 22 | on site updates. Modify the policy to optimize crawl speed or reduce disk usage. 23 | 24 | .. image:: ../../../tests/robotframework/screenshots/guide_search_policy.png 25 | :class: sosse-screenshot 26 | 27 | Starting the Crawl 28 | ------------------ 29 | 30 | To begin crawling, go to the :doc:`Crawl a new URL <../crawl/new_url>` page and enter the site's homepage URL. 31 | 32 | Review the parameters, then click ``Confirm``. SOSSE will crawl the site and log pages matching the Crawl Policy. 33 | 34 | .. note:: 35 | If pages aren’t crawled as expected, check whether the site’s `robots.txt` file is blocking the crawler. 36 | *Bypass it only if authorized.* You can review this setting in the :doc:`../domain_settings` for the website. 37 | 38 | Searching the Website 39 | --------------------- 40 | 41 | Once crawling is complete, search for keywords directly from the homepage. 42 | 43 | For advanced search options, see the :doc:`search parameters <../user/search>` documentation. 44 | 45 | Additional Resources 46 | -------------------- 47 | 48 | - See :doc:`../crawl/recursion_depth` for advanced crawling strategies. 49 | - Explore the :doc:`../guides` for further assistance. 50 | -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | .. SOSSE documentation master file, created by 2 | sphinx-quickstart on Mon Apr 17 13:06:50 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | SOSSE's documentation! 7 | ====================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: 🐾 Contents: 12 | 13 | introduction.rst 14 | install.rst 15 | administration.rst 16 | guides.rst 17 | config_file.rst 18 | cli.rst 19 | user_doc.rst 20 | screenshots.rst 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | 25 | CHANGELOG.md 26 | 27 | Indices and tables 28 | ================== 29 | 30 | * :ref:`genindex` 31 | * :ref:`modindex` 32 | * :ref:`search` 33 | -------------------------------------------------------------------------------- /doc/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | SOSSE can be installed in a few different ways 🦨: 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Contents: 9 | 10 | install/debian.rst 11 | install/debian_upgrades.rst 12 | install/pip.rst 13 | install/pip_upgrades.rst 14 | install/docker.rst 15 | install/docker_upgrades.rst 16 | install/docker_compose.rst 17 | install/docker_compose_upgrades.rst 18 | -------------------------------------------------------------------------------- /doc/source/install/database.rst.template: -------------------------------------------------------------------------------- 1 | Database connection parameters can be changed in the ``/etc/sosse/sosse.conf`` file, you can find more information about each variable in the :doc:`../config_file`). 2 | 3 | Database creation 4 | """"""""""""""""" 5 | 6 | The PostgreSQL database can be created with the commands: 7 | 8 | .. code-block:: shell 9 | 10 | su - postgres -c "psql --command=\"CREATE USER sosse WITH PASSWORD 'CHANGE ME';\"" 11 | su - postgres -c "psql --command=\"CREATE DATABASE sosse OWNER sosse;\"" 12 | 13 | Replace ``sosse`` by an appropriate username and password, and set them in the ``/etc/sosse/sosse.conf`` configuration file. 14 | 15 | Database schema 16 | """"""""""""""" 17 | 18 | The initial database data can be injected with the following commands: 19 | 20 | .. code-block:: shell 21 | 22 | |sosse-admin| migrate 23 | |sosse-admin| update_se 24 | 25 | A default ``admin`` user with password ``admin`` can be created with: 26 | 27 | .. code-block:: shell 28 | 29 | |sosse-admin| default_admin 30 | -------------------------------------------------------------------------------- /doc/source/install/debian_upgrades.rst: -------------------------------------------------------------------------------- 1 | Debian upgrades 2 | =============== 3 | 4 | The Debian package installed following the :doc:`debian` documentation can be upgraded by running a regular: 5 | 6 | .. code-block:: shell 7 | 8 | apt-get upgrade 9 | 10 | It is recommended to make a backup of the database before upgrading. 11 | -------------------------------------------------------------------------------- /doc/source/install/docker.rst: -------------------------------------------------------------------------------- 1 | Running in Docker 2 | ================= 3 | 4 | The latest stable version of SOSSE can be run in docker with the command: 5 | 6 | .. code-block:: shell 7 | 8 | docker run -p 8005:80 --mount source=sosse_postgres,destination=/var/lib/postgresql \ 9 | --mount source=sosse_var,destination=/var/lib/sosse biolds/sosse:latest 10 | 11 | This would start an instance of SOSSE on port 8005, and would persist data in the ``sosse_postgres`` and 12 | ``sosse_var`` `Docker volumes `_. 13 | 14 | You may also locally mount other directories to access their content, with the following flags: 15 | 16 | * ``--volume $PWD/sosse-conf:/etc/sosse/``: mounting an empty directory as ``/etc/sosse/`` will create default 17 | configuration files in it. You can then edit them and restart Docker to make the changes effective. 18 | * ``--volume $PWD/sosse-log:/var/log/sosse/``: mounting this directory would let you access log files. 19 | 20 | Next steps 21 | ---------- 22 | 23 | You can now point your browser to connect to the port 8005 and log in with the user ``admin`` and the password 24 | ``admin``. For more information about the configuration, you can follow the :doc:`../administration` pages, 25 | or follow :doc:`../guides/search` to start indexing documents. 26 | -------------------------------------------------------------------------------- /doc/source/install/docker_compose.rst: -------------------------------------------------------------------------------- 1 | Running in Docker-compose 2 | ========================= 3 | 4 | To run the latest version of SOSSE with docker-compose, you need to download the latest version of the 5 | ``docker-compose.yml`` file from the SOSSE repository in a dedicated directory: 6 | 7 | .. code-block:: shell 8 | 9 | mkdir sosse 10 | cd sosse 11 | curl https://raw.githubusercontent.com/biolds/sosse/refs/heads/stable/docker-compose.yml > docker-compose.yml 12 | 13 | Review its content, then run the following command to start SOSSE: 14 | 15 | .. code-block:: shell 16 | 17 | docker-compose up -d 18 | 19 | By default, this would start an instance of SOSSE on port 8005. 20 | 21 | Next steps 22 | ---------- 23 | 24 | You can now point your browser to connect to the port 8005 and log in with the user ``admin`` and the password 25 | ``admin``. For more information about the configuration, you can follow the :doc:`../administration` pages, 26 | or follow :doc:`../guides/search` to start indexing documents. 27 | -------------------------------------------------------------------------------- /doc/source/install/docker_compose_upgrades.rst: -------------------------------------------------------------------------------- 1 | Docker-compose upgrades 2 | ======================= 3 | 4 | The Docker-compose version installed following the :doc:`docker_compose` documentation can be upgraded by running: 5 | 6 | .. code-block:: shell 7 | 8 | docker-compose pull 9 | docker compose down 10 | docker compose up -d --force-recreate 11 | 12 | It is recommended to make a backup of the database before upgrading. 13 | -------------------------------------------------------------------------------- /doc/source/install/docker_upgrades.rst: -------------------------------------------------------------------------------- 1 | Docker upgrades 2 | =============== 3 | 4 | The Docker version installed following the :doc:`docker` documentation can be upgraded by running: 5 | 6 | .. code-block:: shell 7 | 8 | docker pull biolds/sosse:latest 9 | 10 | It is recommended to make a backup of the database before upgrading. 11 | -------------------------------------------------------------------------------- /doc/source/install/pip_upgrades.rst: -------------------------------------------------------------------------------- 1 | Pip upgrades 2 | ============ 3 | 4 | The Pip packages installed following the :doc:`pip` documentation can be upgraded by running: 5 | 6 | .. code-block:: shell 7 | 8 | pip install --upgrade sosse 9 | 10 | It is recommended to make a backup of the database before upgrading. 11 | 12 | When the upgrade is done, the following commands need to be run to update the data: 13 | 14 | .. code-block:: shell 15 | 16 | sosse-admin collectstatic --noinput --clear 17 | sosse-admin migrate 18 | sosse-admin update_se 19 | -------------------------------------------------------------------------------- /doc/source/introduction.rst: -------------------------------------------------------------------------------- 1 | SOSSE Documentation 2 | =================== 3 | 4 | Welcome to the official SOSSE documentation page! Here, you'll find everything you need to get started, from 5 | installation guides to community support. Explore the links below to dive into the world of SOSSE and make the most out 6 | of the platform. 7 | 8 | 🌐 `Official Website `_ 9 | 10 | Visit the official website for the latest updates, announcements, and resources on SOSSE. Stay connected with all 11 | the details about the project. 12 | 13 | 🛠️ :doc:`Installation Guide ` 14 | 15 | Looking to install SOSSE? The installation guide will walk you through setting up SOSSE on your machine using various 16 | methods, including Docker and other configurations for persistence. 17 | 18 | 🌍 :doc:`Guidelines for Ethical Use ` 19 | 20 | When using SOSSE for web crawling or scraping, it's important to follow ethical guidelines and best practices to avoid 21 | overloading servers, violating site terms of service, or causing damage to the websites being crawled. Please review the 22 | ethical guidelines to ensure responsible usage. 23 | 24 | 📚 :doc:`Guides ` 25 | 26 | For detailed guides on key features like search, crawling, archiving, file downloads, and more, visit the SOSSE guides 27 | page. 28 | 29 | 📚 :doc:`Documentation Index ` 30 | 31 | For comprehensive documentation on all features, configurations, and usage, visit the SOSSE documentation index. This is 32 | your one-stop resource for learning everything about the platform. 33 | 34 | 💻 `GitHub Project Page `_ 35 | 36 | Check out the official SOSSE project on GitHub for access to the source code, issue tracking, and collaboration. 37 | Feel free to contribute, report bugs, or browse the code! 38 | 39 | 🎮 `Join the Discord Community `_ 40 | 41 | Join the SOSSE community on Discord! Whether you have questions, want to share your ideas, or need help with 42 | troubleshooting, our Discord server is the perfect place to connect with other users and contributors. 43 | 44 | Thank you for being a part of the SOSSE community! Whether you’re just getting started or need advanced help, all of 45 | these resources are here to assist you. 46 | -------------------------------------------------------------------------------- /doc/source/permissions.rst: -------------------------------------------------------------------------------- 1 | 👥 Permissions 2 | ============== 3 | 4 | Crawl Permissions 5 | ----------------- 6 | 7 | User management and group editing can be done from the :doc:`../admin_ui`, by clicking on ``Users`` or ``Groups``. 8 | Thanks to the `Django framework `_, fine-grained permissions can be defined by group and 9 | by user. 10 | 11 | .. image:: ../../tests/robotframework/screenshots/permissions.png 12 | :class: sosse-screenshot 13 | 14 | Permissions are set by the type of objects that can be modified through the :doc:`admin_ui`. Some of these permissions 15 | also grant access to other parts of the user interface: 16 | 17 | - ``Can add document``: Grants access to the :doc:`🌐 Crawl a new URL ` page. 18 | - ``Can change document``: Grants access to document actions such as ``Crawl now``, ``Remove from crawl queue``, 19 | ``Convert screens to JPEG``. 20 | - ``Can view crawler stats``: Grants access to the :doc:`✔ Crawl queue ` page and 21 | :doc:`🕷 Crawlers ` page. 22 | - ``Can change crawler stats``: Grants access to the ``Pause`` and ``Resume`` crawler buttons in the 23 | :doc:`✔ Crawl queue ` page and :doc:`🕷 Crawlers ` page. 24 | 25 | Search Permissions 26 | ------------------ 27 | 28 | By default, search requires users to be authenticated, but :ref:`anonymous searches ` 29 | can be enabled with the related option. 30 | -------------------------------------------------------------------------------- /doc/source/screenshots.rst: -------------------------------------------------------------------------------- 1 | Screenshots 2 | =========== 3 | 4 | .. figure:: ../../tests/robotframework/screenshots/search.png 5 | :class: sosse-screenshot 6 | 7 | :doc:`Search results ` 8 | 9 | .. raw:: html 10 | 11 |
12 |
13 | 14 | .. figure:: ../../tests/robotframework/screenshots/guide_download_archive_html.png 15 | :class: sosse-screenshot 16 | 17 | :doc:`Offline browsing ` 18 | 19 | .. raw:: html 20 | 21 |
22 |
23 | 24 | .. figure:: ../../tests/robotframework/screenshots/archive_download.png 25 | :class: sosse-screenshot 26 | 27 | :doc:`File scraping ` 28 | 29 | .. raw:: html 30 | 31 |
32 |
33 | 34 | .. figure:: ../../tests/robotframework/screenshots/analytics.png 35 | :class: sosse-screenshot 36 | 37 | :doc:`Index analytics ` 38 | 39 | .. raw:: html 40 | 41 |
42 |
43 | 44 | .. figure:: ../../tests/robotframework/screenshots/history.png 45 | :class: sosse-screenshot 46 | 47 | :doc:`Search history ` 48 | 49 | .. raw:: html 50 | 51 |
52 |
53 | 54 | .. figure:: ../../tests/robotframework/screenshots/crawl_queue.png 55 | :class: sosse-screenshot 56 | 57 | :doc:`Real-time crawling status ` 58 | 59 | .. raw:: html 60 | 61 |
62 |
63 | 64 | .. figure:: ../../tests/robotframework/screenshots/crawl_policy_decision_no_hilight.png 65 | :class: sosse-screenshot 66 | 67 | :doc:`Crawl Policies setup ` 68 | 69 | .. raw:: html 70 | 71 |
72 |
73 | 74 | .. figure:: ../../tests/robotframework/screenshots/browsable_home.png 75 | :class: sosse-screenshot 76 | 77 | :doc:`Archive browsing ` 78 | -------------------------------------------------------------------------------- /doc/source/search_engines.rst: -------------------------------------------------------------------------------- 1 | 🔍 External Search Engines 2 | ========================== 3 | 4 | The list of :doc:`user/shortcuts` can be reached from the :doc:`../admin_ui`, by clicking on ``Search engines``. 5 | 6 | .. image:: ../../tests/robotframework/screenshots/search_engines_list.png 7 | :class: sosse-screenshot 8 | 9 | New search engines can be added manually, or using the :ref:`CLI ` using an `Open Search Description `_ formatted file. 10 | 11 | .. image:: ../../tests/robotframework/screenshots/search_engine.png 12 | :class: sosse-screenshot 13 | 14 | In this form, the shortcut that will be used to redirect to the external search engine can be defined. If you add a search engine, please consider adding it to the list of `included search engines `_ and opening a Pull request (also works on `Github `_). 15 | -------------------------------------------------------------------------------- /doc/source/tags.rst: -------------------------------------------------------------------------------- 1 | ⭐ Tags 2 | ======= 3 | 4 | The tagging system allows for efficient searching and categorization of documents by associating them with tags. Tags 5 | can be assigned to documents during the crawling process based on :doc:`Crawl Policies `, or they can 6 | be manually added or edited in the :doc:`Archive page ` of Documents. 7 | 8 | Tags can be accessed by clicking **Tags** from the :doc:`../admin_ui`. 9 | 10 | .. image:: ../../tests/robotframework/screenshots/tags_list.png 11 | :class: sosse-screenshot 12 | 13 | Tags can be modified through the admin interface by selecting a tag and updating its properties: 14 | 15 | .. image:: ../../tests/robotframework/screenshots/edit_tag.png 16 | :class: sosse-screenshot 17 | 18 | Editable Fields: 19 | 20 | - Name: The label of the tag. 21 | - Parent: Allows organizing tags into a hierarchical structure by selecting a parent tag. 22 | - Documents: A link to the admin interface showing all documents associated with the tag. 23 | - Crawl Policies: A link to the admin interface showing all crawl policies that assign this tag. 24 | -------------------------------------------------------------------------------- /doc/source/user/archive.rst: -------------------------------------------------------------------------------- 1 | Offline browsing, archived pages 2 | ================================ 3 | 4 | Archived pages can be access from the search results, by clicking the ``archive`` link. 5 | 6 | .. image:: ../../../tests/robotframework/screenshots/archive_header.png 7 | :class: sosse-screenshot 8 | 9 | When the :doc:`Crawl Policy <../crawl/policies>` has ``🔖 Archive content`` or ``📷 Take screenshots`` enabled, 10 | the archive page shows the rendered content and links to other indexed pages can be clicked: 11 | 12 | .. image:: ../../../tests/robotframework/screenshots/archive_screenshot.png 13 | :class: sosse-screenshot 14 | 15 | The ``✏️ Text`` links to the text version of the page. The ``📚 Word weights`` shows the weight of 16 | stemmed words in the page, these are used to calculate the score of the page in the :doc:`search results `. 17 | -------------------------------------------------------------------------------- /doc/source/user/history.rst: -------------------------------------------------------------------------------- 1 | History 2 | ======= 3 | 4 | The history page shows the search history of the logged in user. 5 | 6 | .. image:: ../../../tests/robotframework/screenshots/history.png 7 | :class: sosse-screenshot 8 | 9 | Clicking the |delete_button| button deletes the search entry. 10 | 11 | .. |delete_button| image:: ../../../tests/robotframework/screenshots/history_delete.png 12 | :class: sosse-inline-screenshot 13 | 14 | Clicking |delete_all_button| button deletes the whole search history of the user. 15 | 16 | .. |delete_all_button| image:: ../../../tests/robotframework/screenshots/history_delete_all.png 17 | :class: sosse-inline-screenshot 18 | -------------------------------------------------------------------------------- /doc/source/user/profile.rst: -------------------------------------------------------------------------------- 1 | Profile 2 | =========== 3 | 4 | To reach the Profile user interface, click the |user_menu_button| button, then select ``Profile``. 5 | 6 | .. |user_menu_button| image:: ../../../tests/robotframework/screenshots/user_menu_button.png 7 | :class: sosse-inline-screenshot 8 | 9 | .. image:: ../../../tests/robotframework/screenshots/profile.png 10 | :class: sosse-screenshot 11 | 12 | Profile data is stored in the browser's 13 | `Local storage `_, so they are not shared across 14 | users, devices, or browsers. 15 | 16 | Theme 17 | ----- 18 | 19 | The theme option lets you choose light theme, dark theme or let it switch automatically depending on the browser 20 | configuration. 21 | 22 | Search terms parsing language 23 | ----------------------------- 24 | 25 | This defines the default language used to read the search terms typed in the search bar. SOSSE uses 26 | `PostgreSQL's Full Text Search `_ feature which uses 27 | this parameter to make searches more intelligent than simple word matches. 28 | 29 | Results by page 30 | --------------- 31 | 32 | The number of search result displayed in one page. 33 | 34 | .. _pref_principal_link: 35 | 36 | Search result main links point to the archive 37 | --------------------------------------------- 38 | 39 | When enabled, search result links point to the :doc:`archive versions ` of pages. ``source`` links are 40 | displayed to access original websites. 41 | 42 | When disabled, search result links point to original websites. ``archive`` links are displayed to access 43 | :doc:`archive versions `. 44 | 45 | .. _pref_online_mode: 46 | 47 | Online mode 48 | ----------- 49 | 50 | When :ref:`Online detection ` is set up, searching locally or online can be overridden. 51 | 52 | .. image:: ../../../tests/robotframework/screenshots/online_mode.png 53 | :class: sosse-screenshot 54 | 55 | Next to the user menu a dot displays the status of the online mode: 56 | 57 | .. image:: ../../../tests/robotframework/screenshots/online_mode_status.png 58 | :class: sosse-screenshot 59 | 60 | * Green for online 61 | * Orange for offline 62 | * Purple when ``Force online`` is selected 63 | * Blue when ``Force local`` is selected 64 | -------------------------------------------------------------------------------- /doc/source/user/rest_api.rst: -------------------------------------------------------------------------------- 1 | Rest API 2 | ======== 3 | 4 | .. image:: ../../../tests/robotframework/screenshots/swagger.png 5 | :class: sosse-screenshot 6 | 7 | A rest API is available, it can be explored with a `Swagger `_ user interface. To open it, click the |user_menu_button| button, then select ``Rest API``. 8 | 9 | .. |user_menu_button| image:: ../../../tests/robotframework/screenshots/user_menu_button.png 10 | :class: sosse-inline-screenshot 11 | -------------------------------------------------------------------------------- /doc/source/user/shortcut_list.rst: -------------------------------------------------------------------------------- 1 | Search Engine shortcut defaults 2 | =============================== 3 | 4 | You can find below the list of :doc:`shortcuts` defined by default. You can add new ones by following the 5 | :doc:`../search_engines` documentation. 6 | 7 | .. include:: shortcut_list_generated.rst 8 | -------------------------------------------------------------------------------- /doc/source/user/shortcuts.rst: -------------------------------------------------------------------------------- 1 | External search engine shortcuts 2 | ================================ 3 | 4 | .. image:: ../../../tests/robotframework/screenshots/shortcut.png 5 | :class: sosse-screenshot 6 | 7 | In the search bar, shortcuts can be used to search on external search engine. In the screenshot above, the search terms 8 | ``!b cats`` would redirect to the `Brave Search `_ search engine, searching for ``cats`` 🐈. 9 | 10 | The default list of shortcuts is available in the :doc:`shortcut_list` page, new search engines can be added in the 11 | :doc:`administration UI <../search_engines>`. 12 | 13 | The special character (``!`` by default) used to trigger the shortcut can be modified in the 14 | :ref:`configuration `. 15 | 16 | It is possible to make SOSSE redirect to an external search engine by default by setting the option 17 | :ref:`default_search_redirect `. In this case SOSSE internal searches can still be 18 | reached using the shortcut defined by :ref:`sosse_shortcut `. 19 | -------------------------------------------------------------------------------- /doc/source/user_doc.rst: -------------------------------------------------------------------------------- 1 | User documentation 2 | ================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | user/search.rst 9 | user/shortcuts.rst 10 | user/shortcut_list.rst 11 | user/profile.rst 12 | user/history.rst 13 | user/archive.rst 14 | user/rest_api.rst 15 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | sosse: 3 | image: biolds/sosse:pip-compose 4 | container_name: sosse_app 5 | depends_on: 6 | - postgres 7 | environment: 8 | # Available configuration variables can be found on https://sosse.readthedocs.io/en/stable/config_file.html 9 | # any option can be set by using the SOSSE_ prefix 10 | - SOSSE_DB_NAME=sosse_db 11 | - SOSSE_DB_USER=sosse_user 12 | - SOSSE_DB_PASS=sosse_password 13 | - SOSSE_DB_HOST=postgres 14 | ports: 15 | - "8000:80" 16 | volumes: 17 | - sosse_data:/var/lib/sosse 18 | restart: always 19 | 20 | postgres: 21 | image: postgres:latest 22 | container_name: sosse_db 23 | environment: 24 | POSTGRES_USER: sosse_user 25 | POSTGRES_PASSWORD: sosse_password 26 | POSTGRES_DB: sosse_db 27 | ports: 28 | - "5432:5432" 29 | volumes: 30 | - postgres_data:/var/lib/postgresql/data 31 | restart: always 32 | 33 | volumes: 34 | sosse_data: 35 | postgres_data: 36 | -------------------------------------------------------------------------------- /docker/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY=build 2 | 3 | build: 4 | docker pull debian:bookworm 5 | $(MAKE) -C debian-base build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 6 | $(MAKE) -C debian build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 7 | $(MAKE) -C debian-pkg build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 8 | $(MAKE) -C debian-test build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 9 | $(MAKE) -C pip-base build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 10 | $(MAKE) -C pip-compose build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 11 | $(MAKE) -C pip-release build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 12 | $(MAKE) -C pip-test build APT_PROXY=$(APT_PROXY) PIP_INDEX_URL=$(PIP_INDEX_URL) PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) 13 | $(MAKE) -C doc build 14 | $(MAKE) -C docker build 15 | 16 | push: 17 | @for a in $$(ls); do \ 18 | if [ -d $$a ]; then \ 19 | $(MAKE) -C $$a push; \ 20 | fi; \ 21 | done; 22 | -------------------------------------------------------------------------------- /docker/Makefile.common: -------------------------------------------------------------------------------- 1 | DOCKER_NAME=$(shell pwd | sed -e s_^.*/__) 2 | #APT_PROXY=http://192.168.3.2:3142/ 3 | #PIP_INDEX_URL=http://192.168.3.3:5000/index/ 4 | #PIP_TRUSTED_HOST=192.168.3.3 5 | 6 | .PHONY: _build push 7 | 8 | push: 9 | docker push biolds/sosse:$(DOCKER_NAME) 10 | 11 | _build: 12 | docker build --build-arg APT_PROXY=$(APT_PROXY) --build-arg PIP_INDEX_URL=$(PIP_INDEX_URL) --build-arg PIP_TRUSTED_HOST=$(PIP_TRUSTED_HOST) -t biolds/sosse:$(DOCKER_NAME) . 13 | 14 | %: 15 | $(MAKE) -f ../Makefile.common _$@ 16 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | - debian: Docker image using the Sosse Debian package, for testing purpose only 2 | - debian-pkg: image that builds the Debian package 3 | - debian-test FROM debian: image used in the Gitlab CI to run some tests (unit tests, static checks, etc.) 4 | - doc: image used to build the documentation (for testing only, the published doc is built on readthedoc) 5 | - docker FROM pip-test: image used to rebuild the Docker package to upgrade packages on Docker Hub 6 | - pip-base: base image for the pip-test and pip-release images 7 | - pip-compose: official image for Docker-compose 8 | - pip-release: official Docker image 9 | - pip-test: image used to test the pip package 10 | -------------------------------------------------------------------------------- /docker/debian-base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | ADD control / 3 | RUN apt-get update && \ 4 | grep ^Depends: /control | sed -e "s/.*},//" -e "s/,//g" | xargs apt-get install -y && \ 5 | apt-get clean autoclean && \ 6 | rm -rf /control /var/lib/cache /var/lib/log /usr/share/doc /usr/share/man 7 | -------------------------------------------------------------------------------- /docker/debian-base/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | 3 | .PHONY: build 4 | 5 | build: 6 | cp ../../debian/control . 7 | $(MAKE) -f ../Makefile.common _build 8 | -------------------------------------------------------------------------------- /docker/debian-pkg/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | ARG APT_PROXY= 3 | ARG PIP_INDEX_URL= 4 | ARG PIP_TRUSTED_HOST= 5 | RUN test -z "$APT_PROXY" || (echo "Acquire::http::Proxy \"$APT_PROXY\";" > /etc/apt/apt.conf.d/proxy.conf) 6 | RUN apt update && \ 7 | apt upgrade -y && \ 8 | apt install -y make build-essential python3-dev devscripts cdbs dh-python python3-setuptools curl gnupg2 npm && \ 9 | apt-get clean autoclean && \ 10 | apt-get autoremove --yes && \ 11 | rm -rf /var/lib/cache /var/lib/log /usr/share/doc /usr/share/man 12 | RUN test -z "$APT_PROXY" || rm /etc/apt/apt.conf.d/proxy.conf 13 | -------------------------------------------------------------------------------- /docker/debian-pkg/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | -------------------------------------------------------------------------------- /docker/debian-test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM biolds/sosse:debian 2 | ARG APT_PROXY= 3 | ARG PIP_INDEX_URL= 4 | ARG PIP_TRUSTED_HOST= 5 | RUN test -z "$APT_PROXY" || (echo "Acquire::http::Proxy \"$APT_PROXY\";" > /etc/apt/apt.conf.d/proxy.conf) 6 | RUN apt update 7 | RUN apt purge -y sosse 8 | # Remove python3-coverage after version 1.13 is released 9 | RUN apt install -y python3-coverage python3-virtualenv flake8 sudo jq make git rsync 10 | RUN /etc/init.d/postgresql start && \ 11 | su - postgres -c "psql --command 'ALTER USER sosse WITH SUPERUSER;'" && \ 12 | /etc/init.d/postgresql stop 13 | RUN git clone --depth=1 https://gitlab.com/biolds1/httpbin.git /root/httpbin && \ 14 | cd /root/httpbin/httpbin && \ 15 | python3 manage.py migrate && \ 16 | python3 manage.py shell -c "from django.contrib.auth.models import User ; u = User.objects.create(username='admin', is_superuser=True, is_staff=True) ; u.set_password('admin') ; u.save()" 17 | ADD requirements.txt /tmp 18 | RUN virtualenv /robotframework-venv && /robotframework-venv/bin/pip install -r /tmp/requirements.txt && /robotframework-venv/bin/pip cache purge 19 | RUN mkdir -p /var/lib/sosse/screenshots && git clone --depth=1 https://github.com/GurvanKervern/dummy-static-website /var/lib/sosse/screenshots/website 20 | RUN test -z "$APT_PROXY" || rm /etc/apt/apt.conf.d/proxy.conf 21 | 22 | # Pre-commit installation 23 | RUN virtualenv /pre-commit-venv && /pre-commit-venv/bin/pip install pre-commit 24 | RUN mkdir -p /tmp/pre-commit && cd /tmp/pre-commit && git init 25 | ADD pre-commit-config.yaml /tmp/pre-commit/.pre-commit-config.yaml 26 | RUN cd /tmp/pre-commit && \ 27 | /pre-commit-venv/bin/pre-commit autoupdate && \ 28 | /pre-commit-venv/bin/pre-commit run -a 29 | -------------------------------------------------------------------------------- /docker/debian-test/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | 3 | .PHONY: build 4 | 5 | build: 6 | cp ../../tests/robotframework/requirements.txt . 7 | cp ../../.pre-commit-config.yaml pre-commit-config.yaml 8 | $(MAKE) -f ../Makefile.common _build 9 | -------------------------------------------------------------------------------- /docker/debian/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | -------------------------------------------------------------------------------- /docker/doc/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | ARG APT_PROXY= 3 | ARG PIP_INDEX_URL= 4 | ARG PIP_TRUSTED_HOST= 5 | RUN test -z "$APT_PROXY" || (echo "Acquire::http::Proxy \"$APT_PROXY\";" > /etc/apt/apt.conf.d/proxy.conf) 6 | RUN apt update 7 | RUN apt upgrade -y 8 | RUN apt install -y virtualenv jq curl 9 | RUN virtualenv /opt/sosse-doc 10 | ADD requirements.txt requirements-rtd.txt /tmp/ 11 | RUN /opt/sosse-doc/bin/pip install -r /tmp/requirements.txt && /opt/sosse-doc/bin/pip install -r /tmp/requirements-rtd.txt 12 | RUN test -z "$APT_PROXY" || rm /etc/apt/apt.conf.d/proxy.conf 13 | -------------------------------------------------------------------------------- /docker/doc/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | 3 | .PHONY: build 4 | 5 | build: 6 | cp ../../doc/requirements.txt . 7 | $(MAKE) -f ../Makefile.common _build 8 | -------------------------------------------------------------------------------- /docker/doc/requirements-rtd.txt: -------------------------------------------------------------------------------- 1 | alabaster>=0.7,<0.8,!=0.7.5 2 | commonmark==0.9.1 3 | mock==1.0.1 4 | pillow 5 | readthedocs-sphinx-ext<2.3 6 | recommonmark==0.5.0 7 | sphinx 8 | sphinx-rtd-theme 9 | -------------------------------------------------------------------------------- /docker/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # inherits pip-test to build the pip pkg 2 | FROM biolds/sosse:pip-test 3 | RUN apt-get update 4 | RUN apt-get install -y ca-certificates make 5 | RUN install -m 0755 -d /etc/apt/keyrings 6 | RUN curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc 7 | RUN chmod a+r /etc/apt/keyrings/docker.asc 8 | RUN echo \ 9 | "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian \ 10 | $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ 11 | tee /etc/apt/sources.list.d/docker.list > /dev/null 12 | RUN apt-get update 13 | RUN apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin 14 | -------------------------------------------------------------------------------- /docker/docker/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | -------------------------------------------------------------------------------- /docker/pg_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | test -e /var/lib/postgresql/15 || tar -x -p -C / -f /tmp/postgres_sosse.tar.gz 3 | 4 | /etc/init.d/postgresql start 5 | 6 | export SOSSE_DB_HOST=localhost 7 | 8 | exec bash /run.sh 9 | -------------------------------------------------------------------------------- /docker/pip-base/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:bookworm 2 | ARG APT_PROXY= 3 | ARG PIP_INDEX_URL= 4 | ARG PIP_TRUSTED_HOST= 5 | RUN test -z "$APT_PROXY" || (echo "Acquire::http::Proxy \"$APT_PROXY\";" > /etc/apt/apt.conf.d/proxy.conf) 6 | RUN apt-get update 7 | RUN apt-get upgrade -y 8 | RUN apt-get install -y sudo python3-pip python3-dev python3-venv build-essential libpq-dev libmagic1 nginx chromium chromium-driver firefox-esr fonts-noto unifont virtualenv npm 9 | RUN test -z "$APT_PROXY" || rm /etc/apt/apt.conf.d/proxy.conf 10 | -------------------------------------------------------------------------------- /docker/pip-base/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | -------------------------------------------------------------------------------- /docker/pip-compose/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM biolds/sosse:pip-base 2 | ARG PIP_INDEX_URL= 3 | ARG PIP_TRUSTED_HOST= 4 | RUN apt-get update 5 | RUN apt-get install -y postgresql-client # for pg_isready 6 | RUN virtualenv /venv 7 | RUN /venv/bin/pip install sosse uwsgi && /venv/bin/pip cache purge 8 | RUN mkdir -p /etc/sosse/ /etc/sosse_src/ /var/log/sosse /var/log/uwsgi 9 | ADD uwsgi.* /etc/sosse_src/ 10 | ADD sosse.conf /etc/nginx/sites-enabled/default 11 | RUN chown -R root:www-data /etc/sosse /etc/sosse_src && chmod 750 /etc/sosse_src/ && chmod 640 /etc/sosse_src/* 12 | RUN mkdir /var/www/.cache /var/www/.mozilla 13 | RUN chown www-data:www-data /var/www/.cache /var/www/.mozilla 14 | ADD run.sh / 15 | RUN chmod +x /run.sh 16 | 17 | USER root 18 | CMD ["/usr/bin/bash", "/run.sh"] 19 | -------------------------------------------------------------------------------- /docker/pip-compose/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | 3 | .PHONY: build 4 | 5 | build: 6 | cp ../run.sh . 7 | cp ../../debian/uwsgi.* ../../debian/sosse.conf . 8 | $(MAKE) -f ../Makefile.common _build 9 | -------------------------------------------------------------------------------- /docker/pip-release/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM biolds/sosse:pip-compose 2 | ARG PIP_INDEX_URL= 3 | ARG PIP_TRUSTED_HOST= 4 | ADD run.sh pg_run.sh / 5 | RUN chmod +x /run.sh /pg_run.sh 6 | RUN apt-get update && apt-get install -y postgresql && apt-get clean 7 | 8 | WORKDIR / 9 | USER postgres 10 | RUN /etc/init.d/postgresql start && \ 11 | (until pg_isready; do sleep 1; done) && \ 12 | psql --command "CREATE USER sosse WITH PASSWORD 'sosse';" && \ 13 | createdb -O sosse sosse && \ 14 | /etc/init.d/postgresql stop && \ 15 | tar -c -p -C / -f /tmp/postgres_sosse.tar.gz /var/lib/postgresql 16 | 17 | USER root 18 | CMD ["/usr/bin/bash", "/pg_run.sh"] 19 | -------------------------------------------------------------------------------- /docker/pip-release/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | 3 | .PHONY: build 4 | 5 | build: 6 | cp ../pg_run.sh ../run.sh . 7 | cp ../../debian/uwsgi.* ../../debian/sosse.conf . 8 | $(MAKE) -f ../Makefile.common _build 9 | -------------------------------------------------------------------------------- /docker/pip-test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM biolds/sosse:pip-base 2 | ARG APT_PROXY= 3 | RUN test -z "$APT_PROXY" || (echo "Acquire::http::Proxy \"$APT_PROXY\";" > /etc/apt/apt.conf.d/proxy.conf) 4 | RUN apt update 5 | RUN apt install -y firefox-esr wget jq make git postgresql rsync curl python3-django python3-pil 6 | RUN test -z "$APT_PROXY" || rm /etc/apt/apt.conf.d/proxy.conf 7 | RUN wget https://github.com/mozilla/geckodriver/releases/download/v0.35.0/geckodriver-v0.35.0-linux64.tar.gz -O /tmp/gecko.tar.gz && \ 8 | tar xvzf /tmp/gecko.tar.gz && \ 9 | mv geckodriver /usr/local/bin/ 10 | RUN mkdir -p /var/lib/sosse/screenshots && git clone --depth=1 https://github.com/GurvanKervern/dummy-static-website /var/lib/sosse/screenshots/website 11 | RUN git clone --depth=1 https://gitlab.com/biolds1/httpbin.git /root/httpbin && \ 12 | cd /root/httpbin/httpbin && \ 13 | python3 manage.py migrate && \ 14 | python3 manage.py shell -c "from django.contrib.auth.models import User ; u = User.objects.create(username='admin', is_superuser=True, is_staff=True) ; u.set_password('admin') ; u.save()" 15 | -------------------------------------------------------------------------------- /docker/pip-test/Makefile: -------------------------------------------------------------------------------- 1 | include ../Makefile.common 2 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | until pg_isready --host "$SOSSE_DB_HOST" ; do 3 | sleep 1 4 | done 5 | 6 | test -e /etc/sosse/sosse.conf || /venv/bin/sosse-admin default_conf | sed -e "s/^#db_pass.*/db_pass=sosse/" -e "s/^#\(chromium_options=.*\)$/\\1 --no-sandbox --disable-dev-shm-usage/" >/etc/sosse_src/sosse.conf 7 | test -e /etc/sosse/sosse.conf || cp -p /etc/sosse_src/* /etc/sosse/ 8 | mkdir -p /run/sosse /var/log/sosse /var/lib/sosse/html/ 9 | touch /var/log/sosse/{debug.log,main.log,crawler.log,uwsgi.log,webserver.log,webhooks.log} 10 | chown -R www-data:www-data /run/sosse /var/log/sosse/ /var/lib/sosse 11 | 12 | /venv/bin/sosse-admin migrate 13 | /venv/bin/sosse-admin collectstatic --noinput 14 | /venv/bin/sosse-admin update_se 15 | /venv/bin/sosse-admin default_admin 16 | /venv/bin/uwsgi --uid www-data --gid www-data --ini /etc/sosse/uwsgi.ini --logto /var/log/sosse/uwsgi.log & 17 | /etc/init.d/nginx start 18 | sudo --preserve-env -u www-data /venv/bin/sosse-admin crawl & 19 | tail -F /var/log/sosse/crawler.log 20 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sosse", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "chart.js": "^4.4.2", 9 | "chartjs-adapter-luxon": "^1.3.1", 10 | "luxon": "^3.4.4", 11 | "swagger-ui-dist": "^5.12.0" 12 | } 13 | }, 14 | "node_modules/@kurkle/color": { 15 | "version": "0.3.2", 16 | "resolved": "https://registry.npmjs.org/@kurkle/color/-/color-0.3.2.tgz", 17 | "integrity": "sha512-fuscdXJ9G1qb7W8VdHi+IwRqij3lBkosAm4ydQtEmbY58OzHXqQhvlxqEkoz0yssNVn38bcpRWgA9PP+OGoisw==" 18 | }, 19 | "node_modules/chart.js": { 20 | "version": "4.4.2", 21 | "resolved": "https://registry.npmjs.org/chart.js/-/chart.js-4.4.2.tgz", 22 | "integrity": "sha512-6GD7iKwFpP5kbSD4MeRRRlTnQvxfQREy36uEtm1hzHzcOqwWx0YEHuspuoNlslu+nciLIB7fjjsHkUv/FzFcOg==", 23 | "dependencies": { 24 | "@kurkle/color": "^0.3.0" 25 | }, 26 | "engines": { 27 | "pnpm": ">=8" 28 | } 29 | }, 30 | "node_modules/chartjs-adapter-luxon": { 31 | "version": "1.3.1", 32 | "resolved": "https://registry.npmjs.org/chartjs-adapter-luxon/-/chartjs-adapter-luxon-1.3.1.tgz", 33 | "integrity": "sha512-yxHov3X8y+reIibl1o+j18xzrcdddCLqsXhriV2+aQ4hCR66IYFchlRXUvrJVoxglJ380pgytU7YWtoqdIgqhg==", 34 | "peerDependencies": { 35 | "chart.js": ">=3.0.0", 36 | "luxon": ">=1.0.0" 37 | } 38 | }, 39 | "node_modules/luxon": { 40 | "version": "3.4.4", 41 | "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.4.4.tgz", 42 | "integrity": "sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==", 43 | "engines": { 44 | "node": ">=12" 45 | } 46 | }, 47 | "node_modules/swagger-ui-dist": { 48 | "version": "5.12.0", 49 | "resolved": "https://registry.npmjs.org/swagger-ui-dist/-/swagger-ui-dist-5.12.0.tgz", 50 | "integrity": "sha512-Rt1xUpbHulJVGbiQjq9yy9/r/0Pg6TmpcG+fXTaMePDc8z5WUw4LfaWts5qcNv/8ewPvBIbY7DKq7qReIKNCCQ==" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "chart.js": "^4.4.2", 4 | "chartjs-adapter-luxon": "^1.3.1", 5 | "luxon": "^3.4.4", 6 | "swagger-ui-dist": "^5.12.0" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "sosse" 7 | authors = [{ name = "Laurent Defert", email = "laurent_defert@yahoo.fr" }] 8 | readme = "README.md" 9 | description = "Selenium Open Source Search Engine" 10 | requires-python = ">=3.9" 11 | keywords = ["search engine", "crawler"] 12 | license = "AGPL-3.0-only" 13 | classifiers = ["Framework :: Django", "Programming Language :: Python :: 3"] 14 | 15 | dynamic = ["version", "dependencies"] 16 | 17 | [tool.setuptools] 18 | packages = [ 19 | "se", 20 | "se.deps.linkpreview", 21 | "se.deps.linkpreview.linkpreview", 22 | "se.deps.linkpreview.linkpreview.preview", 23 | "se.deps.fake-useragent", 24 | "se.deps.fake-useragent.src", 25 | "se.deps.fake-useragent.src.fake_useragent", 26 | "se.migrations", 27 | "se.management", 28 | "se.management.commands", 29 | "sosse", 30 | ] 31 | 32 | [tool.setuptools.package-data] 33 | se = ["*.html", "*.svg", "*.js", "*.css", "*.json"] 34 | 35 | [tool.setuptools.dynamic] 36 | version = { attr = "sosse.settings.SOSSE_VERSION_TAG" } 37 | dependencies = { file = ["requirements.txt"] } 38 | 39 | [tool.autopep8] 40 | max_line_length = 1000 41 | 42 | [tool.ruff] 43 | line-length = 120 44 | 45 | [tool.doc8] 46 | # Ignore include failure since we include generated files 47 | ignore = ["D000"] 48 | max-line-length = 120 49 | 50 | [tool.isort] 51 | profile = "black" 52 | 53 | [project.scripts] 54 | sosse-admin = "sosse.sosse_admin:main" 55 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | cssutils 3 | defusedxml 4 | django<4 5 | django-filter 6 | django-treebeard 7 | django-uwsgi 8 | djangorestframework 9 | drf-spectacular 10 | feedparser 11 | html5lib 12 | langdetect 13 | lxml 14 | markdown 15 | pillow 16 | psutil 17 | psycopg2-binary 18 | publicsuffix2 19 | python-magic 20 | requests 21 | selenium<4.9 22 | -------------------------------------------------------------------------------- /se/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/__init__.py -------------------------------------------------------------------------------- /se/about.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from .views import UserView 17 | 18 | 19 | class AboutView(UserView): 20 | template_name = "se/about.html" 21 | title = "About" 22 | -------------------------------------------------------------------------------- /se/analytics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from .models import WorkerStats 17 | from .views import AdminView 18 | 19 | 20 | class AnalyticsView(AdminView): 21 | template_name = "admin/analytics.html" 22 | permission_required = set() 23 | title = "Analytics" 24 | 25 | def get_context_data(self): 26 | context = super().get_context_data() 27 | if self.request.user.has_perm("se.view_crawlerstats"): 28 | context["crawlers_count"] = WorkerStats.objects.count() 29 | return context 30 | -------------------------------------------------------------------------------- /se/apps.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.apps import AppConfig 17 | from django.contrib.admin.apps import AdminConfig 18 | 19 | 20 | class SEConfig(AppConfig): 21 | name = "se" 22 | verbose_name = "Crawling" 23 | default_auto_field = "django.db.models.AutoField" 24 | 25 | 26 | class SEAdminConfig(AdminConfig): 27 | default_site = "se.admin.get_admin" 28 | -------------------------------------------------------------------------------- /se/crawlers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | 17 | from .models import WorkerStats 18 | from .views import AdminView 19 | 20 | 21 | class CrawlersOperationMixin: 22 | def get_permission_required(self): 23 | if self.request.method == "POST": 24 | return {"se.change_crawlerstats"} 25 | return super().get_permission_required() 26 | 27 | def post(self, request): 28 | if "pause" in request.POST: 29 | WorkerStats.objects.update(state="paused") 30 | if "resume" in request.POST: 31 | WorkerStats.objects.update(state="running") 32 | WorkerStats.wake_up() 33 | return self.get(request) 34 | 35 | 36 | class CrawlersContentView(AdminView): 37 | template_name = "admin/crawlers_content.html" 38 | permission_required = "se.view_crawlerstats" 39 | admin_site = None 40 | 41 | def __init__(self, *args, **kwargs): 42 | self.admin_site = kwargs.pop("admin_site") 43 | super().__init__(*args, **kwargs) 44 | 45 | def get_context_data(self, **kwargs): 46 | context = super().get_context_data(**kwargs) 47 | crawlers = WorkerStats.live_state() 48 | running_count = [c for c in crawlers if c.state != "exited"] 49 | return context | { 50 | "crawlers": WorkerStats.live_state(), 51 | "running_count": running_count, 52 | "pause": WorkerStats.objects.filter(state="paused").count() == 0, 53 | } 54 | 55 | 56 | class CrawlersView(CrawlersOperationMixin, CrawlersContentView): 57 | title = "Crawlers" 58 | template_name = "admin/crawlers.html" 59 | -------------------------------------------------------------------------------- /se/download.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | import os 17 | from urllib.parse import unquote 18 | 19 | from django.conf import settings 20 | from django.views.generic import TemplateView 21 | 22 | from .archive import ArchiveMixin 23 | from .html_asset import HTMLAsset 24 | from .utils import mimetype_icon 25 | from .views import RedirectException 26 | 27 | 28 | class DownloadView(ArchiveMixin, TemplateView): 29 | template_name = "se/download.html" 30 | view_name = "download" 31 | 32 | def get_context_data(self, *args, **kwargs) -> dict: 33 | url = self._url_from_request() 34 | asset = HTMLAsset.objects.filter(url=url).order_by("download_date").last() 35 | 36 | if not asset or not os.path.exists(settings.SOSSE_HTML_SNAPSHOT_DIR + asset.filename): 37 | raise RedirectException(self.doc.get_absolute_url()) 38 | 39 | asset_path = settings.SOSSE_HTML_SNAPSHOT_DIR + asset.filename 40 | 41 | filename = url.rstrip("/").rsplit("/", 1)[1] 42 | filename = unquote(filename) 43 | if "." in filename: 44 | filename = filename.rsplit(".", 1)[0] 45 | 46 | extension = asset.filename.rsplit(".", 1)[1] 47 | filename = f"{filename}.{extension}" 48 | 49 | context = super().get_context_data() 50 | return context | { 51 | "url": self.request.build_absolute_uri(settings.SOSSE_HTML_SNAPSHOT_URL) + asset.filename, 52 | "filename": filename, 53 | "filesize": os.path.getsize(asset_path), 54 | "icon": mimetype_icon(self.doc.mimetype), 55 | "mimebase": self.doc.mimetype.split("/", 1)[0], 56 | } 57 | -------------------------------------------------------------------------------- /se/favicon.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.http import HttpResponse 17 | from django.shortcuts import get_object_or_404 18 | from django.views.generic import View 19 | 20 | from .models import FavIcon 21 | from .views import SosseLoginRequiredMixin 22 | 23 | 24 | class FavIconView(View, SosseLoginRequiredMixin): 25 | def get(self, request, favicon_id): 26 | fav = get_object_or_404(FavIcon, id=favicon_id) 27 | return HttpResponse(fav.content, content_type=fav.mimetype) 28 | -------------------------------------------------------------------------------- /se/login.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | 17 | from django.conf import settings 18 | from django.contrib.auth import REDIRECT_FIELD_NAME 19 | from django.contrib.auth.mixins import UserPassesTestMixin 20 | from django.contrib.auth.views import LoginView 21 | 22 | 23 | class SosseLoginRequiredMixin(UserPassesTestMixin): 24 | login_url = None 25 | redirect_field_name = REDIRECT_FIELD_NAME 26 | 27 | def test_func(self): 28 | if settings.SOSSE_ANONYMOUS_SEARCH: 29 | return True 30 | return self.request.user.is_authenticated 31 | 32 | 33 | class SELoginView(LoginView): 34 | template_name = "admin/login.html" 35 | -------------------------------------------------------------------------------- /se/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/management/__init__.py -------------------------------------------------------------------------------- /se/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/management/commands/__init__.py -------------------------------------------------------------------------------- /se/management/commands/clear_html_archive.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.core.management.base import BaseCommand 17 | 18 | from ...html_asset import HTMLAsset 19 | 20 | 21 | class Command(BaseCommand): 22 | help = "Clears archived HTML snapshots." 23 | 24 | def handle(self, *args, **options): 25 | self.stdout.write("Clearing archive, please wait...") 26 | HTMLAsset.objects.update(download_date=None) 27 | self.stdout.write("Done.") 28 | -------------------------------------------------------------------------------- /se/management/commands/default_admin.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | import sys 17 | 18 | from django.contrib.auth.models import User 19 | from django.core.management.base import BaseCommand 20 | 21 | 22 | class Command(BaseCommand): 23 | help = "Creates a default ``admin`` superuser with ``admin`` password,\ndoes nothing if at least one user already exists in the database." 24 | 25 | def handle(self, *args, **options): 26 | if User.objects.count() != 0: 27 | self.stdout.write("The database already has a user, skipping default user creation") 28 | sys.exit(0) 29 | 30 | user = User.objects.create(username="admin", is_superuser=True, is_staff=True, is_active=True) 31 | user.set_password("admin") 32 | user.save() 33 | self.stdout.write('Default user "admin", with password "admin" was created') 34 | -------------------------------------------------------------------------------- /se/management/commands/default_conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.core.management.base import BaseCommand 17 | 18 | from sosse.conf import Conf 19 | 20 | 21 | class Command(BaseCommand): 22 | help = "Outputs default configuration file to stdout." 23 | 24 | def handle(self, *args, **options): 25 | self.stdout.write(Conf.generate_default()) 26 | -------------------------------------------------------------------------------- /se/management/commands/generate_secret.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.core.management.base import BaseCommand 17 | from django.core.management.utils import get_random_secret_key 18 | 19 | 20 | class Command(BaseCommand): 21 | help = "Generates a secret key to set in the configuration." 22 | doc = "Generates a secret key that can be used in the :ref:`Configuration file `." 23 | 24 | def handle(self, *args, **options): 25 | # Escape % to avoid value interpolation in the conf file 26 | # (https://docs.python.org/3/library/configparser.html#interpolation-of-values) 27 | self.stdout.write(get_random_secret_key().replace("%", "%%")) 28 | -------------------------------------------------------------------------------- /se/management/commands/load_se.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.core.management.base import BaseCommand 17 | 18 | from ...models import SearchEngine 19 | 20 | 21 | class Command(BaseCommand): 22 | help = "Loads a search engine definition from an OpenSearch Description formatted XML file." 23 | doc = """Loads a :doc:`user/shortcuts` from an `OpenSearch Description `_ formatted XML file. 24 | 25 | Most search engines provide such a file, defined in the HTML of their web page. 26 | It can be found inside a ```` element below the ```` tag, for example `Brave Search `_ defines it as: 27 | 28 | .. code-block:: html 29 | 30 | 31 | """ 32 | 33 | def add_arguments(self, parser): 34 | parser.add_argument( 35 | "opensearch_file", 36 | nargs=1, 37 | type=str, 38 | help="OpenSearch Description formatted XML file.", 39 | ) 40 | 41 | def handle(self, *args, **options): 42 | SearchEngine.parse_xml_file(options["opensearch_file"][0]) 43 | -------------------------------------------------------------------------------- /se/migrations/0003_sosse_1_1_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.12 on 2023-05-29 08:59 17 | 18 | from django.db import migrations, models 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0002_search_vector"), 24 | ] 25 | 26 | operations = [ 27 | migrations.AddField( 28 | model_name="document", 29 | name="show_on_homepage", 30 | field=models.BooleanField(default=False, help_text="Display this document on the homepage"), 31 | ), 32 | ] 33 | -------------------------------------------------------------------------------- /se/migrations/0004_sosse_1_2_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.19 on 2023-07-04 12:28 17 | 18 | from django.db import migrations, models 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0003_sosse_1_1_0"), 24 | ] 25 | 26 | operations = [ 27 | migrations.AddField( 28 | model_name="crawlpolicy", 29 | name="remove_nav_elements", 30 | field=models.CharField( 31 | choices=[("yes", "Yes"), ("no", "No")], 32 | default="yes", 33 | help_text="Remove navigation related elements", 34 | max_length=4, 35 | ), 36 | ), 37 | ] 38 | -------------------------------------------------------------------------------- /se/migrations/0007_sosse_1_5_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.19 on 2023-08-26 19:20 17 | 18 | from django.db import migrations, models 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0006_sosse_1_4_0"), 24 | ] 25 | 26 | operations = [ 27 | migrations.AlterField( 28 | model_name="crawlpolicy", 29 | name="auth_login_url_re", 30 | field=models.TextField( 31 | blank=True, 32 | help_text="A redirection to an URL matching the regexp will trigger authentication", 33 | null=True, 34 | verbose_name="Login URL regexp", 35 | ), 36 | ), 37 | ] 38 | -------------------------------------------------------------------------------- /se/migrations/0010_sosse_1_8_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.19 on 2023-11-11 21:28 17 | 18 | from django.db import migrations, models 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0009_sosse_1_7_0"), 24 | ] 25 | 26 | operations = [ 27 | migrations.AddField( 28 | model_name="excludedurl", 29 | name="starting_with", 30 | field=models.BooleanField( 31 | default=False, 32 | help_text="Exclude all urls starting with the url pattern", 33 | ), 34 | ), 35 | migrations.AddField( 36 | model_name="link", 37 | name="in_nav", 38 | field=models.BooleanField(default=False), 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /se/migrations/0011_sosse_1_9_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.19 on 2024-01-22 11:26 17 | 18 | from django.db import migrations 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0010_sosse_1_8_0"), 24 | ] 25 | 26 | operations = [ 27 | migrations.RenameField( 28 | model_name="crawlpolicy", 29 | old_name="condition", 30 | new_name="recursion", 31 | ), 32 | migrations.RenameField( 33 | model_name="crawlpolicy", 34 | old_name="crawl_depth", 35 | new_name="recursion_depth", 36 | ), 37 | ] 38 | -------------------------------------------------------------------------------- /se/migrations/0012_sosse_1_10_0.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | # Generated by Django 3.2.19 on 2024-06-30 09:45 17 | 18 | from django.db import migrations, models 19 | 20 | 21 | class Migration(migrations.Migration): 22 | dependencies = [ 23 | ("se", "0011_sosse_1_9_0"), 24 | ] 25 | 26 | operations = [ 27 | migrations.AddField( 28 | model_name="crawlpolicy", 29 | name="hide_documents", 30 | field=models.BooleanField(default=False, help_text="Hide documents from search results"), 31 | ), 32 | migrations.AddField( 33 | model_name="document", 34 | name="hidden", 35 | field=models.BooleanField(default=False, help_text="Hide this document from search results"), 36 | ), 37 | migrations.AddField( 38 | model_name="crawlpolicy", 39 | name="enabled", 40 | field=models.BooleanField(default=True), 41 | ), 42 | ] 43 | -------------------------------------------------------------------------------- /se/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/migrations/__init__.py -------------------------------------------------------------------------------- /se/opensearch.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.views.generic import TemplateView 17 | 18 | 19 | class OpensearchView(TemplateView): 20 | template_name = "se/opensearch.xml" 21 | content_type = "application/xml" 22 | 23 | def get_context_data(self, **kwargs): 24 | context = super().get_context_data(**kwargs) 25 | return context | {"url": self.request.build_absolute_uri("/").rstrip("/")} 26 | -------------------------------------------------------------------------------- /se/profile.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | import json 17 | 18 | from .document import Document 19 | from .views import UserView 20 | 21 | 22 | class ProfileView(UserView): 23 | template_name = "se/profile.html" 24 | title = "Profile" 25 | 26 | def get_context_data(self, **kwargs): 27 | context = super().get_context_data(**kwargs) 28 | return context | {"supported_langs": json.dumps(Document.get_supported_lang_dict())} 29 | -------------------------------------------------------------------------------- /se/resources.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from .views import UserView 17 | 18 | 19 | class ResourcesView(UserView): 20 | template_name = "se/resources.html" 21 | title = "Resources" 22 | -------------------------------------------------------------------------------- /se/rest_permissions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2024 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.conf import settings 17 | from rest_framework import permissions 18 | 19 | 20 | class LoginRequiredPermission(permissions.BasePermission): 21 | def has_permission(self, request, _): 22 | if settings.SOSSE_ANONYMOUS_SEARCH: 23 | return True 24 | return request.user.is_authenticated 25 | 26 | 27 | class IsSuperUserOrStaff(permissions.BasePermission): 28 | def has_permission(self, request, _): 29 | return request.user and (request.user.is_superuser or request.user.is_staff) 30 | 31 | 32 | class DjangoModelPermissionsRW(permissions.DjangoModelPermissions): 33 | """Permission checking class that checks Django model permissions. 34 | 35 | Contrary to DjangoModelPermissions, this class also checks for read 36 | permissions. 37 | """ 38 | 39 | perms_map = permissions.DjangoModelPermissions.perms_map | { 40 | "GET": ["%(app_label)s.view_%(model_name)s"], 41 | "HEAD": ["%(app_label)s.view_%(model_name)s"], 42 | "OPTIONS": ["%(app_label)s.view_%(model_name)s"], 43 | } 44 | -------------------------------------------------------------------------------- /se/screenshot.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.conf import settings 17 | from django.views.generic import TemplateView 18 | 19 | from .archive import ArchiveMixin 20 | 21 | 22 | class ScreenshotView(ArchiveMixin, TemplateView): 23 | template_name = "se/embed.html" 24 | view_name = "screenshot" 25 | 26 | def get_context_data(self, *args, **kwargs): 27 | context = super().get_context_data(*args, **kwargs) 28 | return context | { 29 | "url": self.request.build_absolute_uri("/screenshot_full/") + self._url_from_request(), 30 | "allow_scripts": True, 31 | } 32 | 33 | 34 | class ScreenshotFullView(ArchiveMixin, TemplateView): 35 | template_name = "se/screenshot_full.html" 36 | view_name = "screenshot_full" 37 | 38 | def get_context_data(self, *args, **kwargs): 39 | context = super().get_context_data() 40 | return context | { 41 | "screenshot": settings.SOSSE_SCREENSHOTS_URL + "/" + self.doc.image_name(), 42 | "screenshot_size": self.doc.screenshot_size.split("x"), 43 | "screenshot_format": self.doc.screenshot_format, 44 | "screenshot_mime": ("image/png" if self.doc.screenshot_format == "png" else "image/jpeg"), 45 | "links": self.doc.links_to.filter(screen_pos__isnull=False).order_by("link_no"), 46 | "screens": range(self.doc.screenshot_count), 47 | } 48 | -------------------------------------------------------------------------------- /se/search_redirect.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from urllib.parse import quote_plus 17 | 18 | from django.conf import settings 19 | from django.views.generic import TemplateView 20 | 21 | from .login import SosseLoginRequiredMixin 22 | 23 | 24 | class SearchRedirectView(SosseLoginRequiredMixin, TemplateView): 25 | template_name = "se/search_redirect.html" 26 | 27 | def get_context_data(self, **kwargs): 28 | context = super().get_context_data(**kwargs) 29 | return context | { 30 | "url": self.request.build_absolute_uri("/"), 31 | "q": quote_plus(self.request.GET.get("q", "")), 32 | "settings": settings, 33 | } 34 | -------------------------------------------------------------------------------- /se/static/se/admin-webhooks.js: -------------------------------------------------------------------------------- 1 | function test_webhook() { 2 | var webhookData = {}; 3 | var form = document.getElementById("webhook_form"); 4 | 5 | form.querySelectorAll("input, select, textarea").forEach(function (input) { 6 | if (input.name && input.id.substr(0, 3) === "id_") { 7 | webhookData[input.name] = input.value; 8 | } 9 | }); 10 | 11 | var resultDiv = document.getElementById("webhook_test_result"); 12 | if (!resultDiv) { 13 | resultDiv = document.createElement("div"); 14 | resultDiv.id = "webhook_test_result"; 15 | resultDiv.style.width = "100%"; 16 | resultDiv.style.marginTop = "10px"; 17 | 18 | var webhookTestField = document.getElementById("webhook_test_button"); 19 | webhookTestField.parentElement.appendChild(resultDiv); 20 | } 21 | 22 | resultDiv.innerHTML = "Processing request..."; 23 | var payload = JSON.stringify(webhookData); 24 | 25 | fetch("/api/webhook/test_trigger/?as_html=1", { 26 | method: "POST", 27 | headers: { 28 | "Content-Type": "application/json", 29 | "X-CSRFToken": document.querySelector("[name=csrfmiddlewaretoken]").value, 30 | }, 31 | body: payload, 32 | }) 33 | .then((response) => { 34 | console.log(response); 35 | if (response.status !== 200) { 36 | throw new Error( 37 | `HTTP error! status: ${response.status} : ${response.statusText}`, 38 | ); 39 | } 40 | response.text().then((body) => { 41 | resultDiv.innerHTML = body; 42 | }); 43 | }) 44 | .catch((error) => { 45 | resultDiv.value = `Error: ${error}`; 46 | console.error("Error:", error); 47 | }); 48 | } 49 | -------------------------------------------------------------------------------- /se/static/se/discord-symbol.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /se/static/se/github-mark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/static/se/github-mark.png -------------------------------------------------------------------------------- /se/static/se/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/se/static/se/logo.png -------------------------------------------------------------------------------- /se/static/se/screenshot.js: -------------------------------------------------------------------------------- 1 | let images, links; 2 | 3 | function resize() { 4 | // width with implicit margin 5 | const w_width = document.body.getBoundingClientRect().width; 6 | const ratio = w_width / screen_width; 7 | 8 | for (let i = 0; i < images.length; i++) { 9 | const img = images[i]; 10 | img.style.width = `${screen_width * ratio}px`; 11 | } 12 | 13 | for (let i = 0; i < links.length; i++) { 14 | const link = links[i]; 15 | [elemLeft, elemTop, elemWidth, elemHeight] = link.dataset.loc.split(","); 16 | link.style.left = elemLeft * ratio + "px"; 17 | link.style.top = elemTop * ratio + "px"; 18 | link.style.width = elemWidth * ratio + "px"; 19 | link.style.height = elemHeight * ratio + "px"; 20 | } 21 | } 22 | 23 | document.addEventListener("DOMContentLoaded", function (event) { 24 | links = document.querySelectorAll("#screenshots > a"); 25 | images = document.querySelectorAll("#screenshots > img"); 26 | 27 | window.addEventListener("resize", function () { 28 | resize(); 29 | }); 30 | 31 | resize(); 32 | 33 | // Work-around in case the initial resize() was done while no image was loaded yet 34 | setTimeout(resize, 300); 35 | }); 36 | -------------------------------------------------------------------------------- /se/templates/admin/base_site.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/base.html' %} 2 | {% load static %} 3 | 4 | {% block title %}SOSSE · Configuration{% endblock %} 5 | -------------------------------------------------------------------------------- /se/templates/admin/change_form.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/change_form.html" %} 2 | {% load i18n admin_urls static admin_modify %} 3 | 4 | {% block breadcrumbs %} 5 | 10 | {% endblock %} 11 | 12 | {% block extrahead %} 13 | {{ block.super }} 14 | 15 | {% endblock %} 16 | 17 | {% block extrastyle %} 18 | {{ block.super }} 19 | 20 | {% endblock %} 21 | 22 | {% block content %} 23 | {{ block.super }} 24 | {% include "se/components/modal.html" with id="tags" title=tags_edit_title %} 25 | {% endblock %} 26 | 27 | {% block object-tools %} 28 | 29 |
30 | {% csrf_token %} 31 | {% for action in actions %} 32 | {% if action.0 and action.0 != 'delete_selected' %} 33 | 34 | {% endif %} 35 | {% endfor %} 36 |
37 |
38 | {% endblock %} 39 | -------------------------------------------------------------------------------- /se/templates/admin/change_list_object_tools.html: -------------------------------------------------------------------------------- 1 | {% load i18n admin_urls %} 2 | 3 | {% block object-tools-items %} 4 | {% if has_add_permission %} 5 |
  • 6 | {% url cl.opts|admin_urlname:'add' as add_url %} 7 | 8 | {% blocktranslate with cl.opts.verbose_name as name %}Add {{ name }}{% endblocktranslate %} 9 | 10 |
  • 11 | {% if cl.opts.verbose_name == 'cookie' %} 12 |
  • 13 | 14 | Import cookies 15 | 16 |
  • 17 | {% endif %} 18 | {% endif %} 19 | {% endblock %} 20 | -------------------------------------------------------------------------------- /se/templates/admin/cookies_import.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/base.html" %} 2 | {% block content %} 3 |
    4 | {% csrf_token %} 5 | {% if form.non_field_errors %} 6 |
      7 | {% for error in form.non_field_errors %} 8 |
    • {{ error }}
    • 9 | {% endfor %} 10 |
    11 | {% endif %} 12 |

    {{ form.cookies.label_tag }}

    13 |

    {{ form.cookies }}

    14 |

    {{ form.cookies.errors }}

    15 |

    {{ form.cookies_file.label_tag }}

    16 |

    {{ form.cookies_file }}

    17 |

    {{ form.cookies_file.errors }}

    18 |

    19 |
    20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /se/templates/admin/crawl_queue.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/base.html" %} 2 | {% block js %} 3 | function refresh () { 4 | fetch('{% url 'admin:crawl_queue_content' %}').then(response => { 5 | if (response.ok) { 6 | response.text().then(content => { 7 | const contentDiv = document.getElementById('content'); 8 | contentDiv.innerHTML = content; 9 | }); 10 | } 11 | }); 12 | } 13 | 14 | setInterval(refresh, {{ settings.SOSSE_CRAWL_STATUS_AUTOREFRESH }} * 1000); 15 | {% endblock %} 16 | 17 | {% block css %} 18 | input.card { 19 | height: 72px; 20 | margin-top: 0px; 21 | color: var(--text); 22 | } 23 | 24 | .card { 25 | height: 70px; 26 | font-size: large; 27 | } 28 | .green_bg { 29 | background-color: #d9fabe; 30 | } 31 | 32 | #result_list th:nth-child(1), #result_list td:nth-child(1) { 33 | width: 20px; 34 | padding: 0px; 35 | vertical-align: middle; 36 | text-align: center; 37 | } 38 | 39 | #result_list td:nth-child(1) img { 40 | height: 16px; 41 | width: 16px; 42 | } 43 | 44 | #result_list th:nth-child(2), #result_list td:nth-child(2) { 45 | white-space: nowrap; 46 | overflow: hidden; 47 | max-width: 0; 48 | width: 35%; 49 | text-overflow: ellipsis; 50 | } 51 | 52 | #result_list th:nth-child(3), #result_list td:nth-child(3) { 53 | white-space: nowrap; 54 | overflow: hidden; 55 | max-width: 0; 56 | width: 35%; 57 | text-overflow: ellipsis; 58 | } 59 | 60 | body.dark-mode #result_list tr.running { 61 | background-color: #9cd78b; 62 | font-weight: bold; 63 | } 64 | 65 | body.dark-mode #result_list tr.pending:nth-child(2n) { 66 | background-color: #1e350a; 67 | } 68 | 69 | body.dark-mode #result_list tr.pending:nth-child(2n+1) { 70 | background-color: #245000; 71 | } 72 | 73 | body.light-mode #result_list tr.running { 74 | background-color: #d9fabe; 75 | font-weight: bold; 76 | } 77 | 78 | body.light-mode #result_list tr.pending:nth-child(2n) { 79 | background-color: #ecffdc; 80 | } 81 | 82 | body.light-mode #result_list tr.pending:nth-child(2n+1) { 83 | background-color: #dceecd; 84 | } 85 | {% endblock %} 86 | 87 | {% block content %} 88 | {% include "admin/crawl_queue_content.html" %} 89 | {% endblock %} 90 | -------------------------------------------------------------------------------- /se/templates/admin/crawlers.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/base.html" %} 2 | {% block js %} 3 | function refresh () { 4 | fetch('{% url 'admin:crawlers_content' %}').then(response => { 5 | if (response.ok) { 6 | response.text().then(content => { 7 | const contentDiv = document.getElementById('content'); 8 | contentDiv.innerHTML = content; 9 | }); 10 | } 11 | }); 12 | } 13 | 14 | setInterval(refresh, {{ settings.SOSSE_CRAWL_STATUS_AUTOREFRESH }} * 1000); 15 | {% endblock %} 16 | 17 | {% block css %} 18 | {% endblock %} 19 | 20 | {% block content %} 21 | {% include "admin/crawlers_content.html" %} 22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /se/templates/admin/crawlers_content.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | {% if crawlers %} 3 |

    4 | {{ running_count|length }} crawler{{ running_count|pluralize }} running. 5 |

    6 |

    7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% for crawler in crawlers %} 17 | 18 | 19 | 20 | 21 | 30 | 37 | 38 | {% endfor %} 39 | 40 |
    No
    PID
    State
    URL
    {{ crawler.worker_no }}{{ crawler.pid }}{{ crawler.state }} 22 | {% if crawler.doc.id %} 23 | 24 | {% if crawler.doc.favicon and not crawler.doc.favicon.missing %} 25 | icon 26 | {% endif %} 27 | 28 | {% endif %} 29 | 31 | {% if crawler.doc.id %} 32 | 33 | {{ crawler.doc.url }} 34 | 35 | {% endif %} 36 |
    41 |

    42 | 43 | {% if 'se.change_crawlerstats' in perms %} 44 |

    45 |

    46 | {% csrf_token %} 47 | {% if pause %} 48 | 49 | {% else %} 50 | 51 | {% endif %} 52 |
    53 |

    54 | {% endif %} 55 | {% else %} 56 | No crawlers running. 57 | {% endif %} 58 | -------------------------------------------------------------------------------- /se/templates/admin/delete_confirmation.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/delete_confirmation.html" %} 2 | {% load i18n admin_urls static %} 3 | 4 | {% block breadcrumbs %} 5 | 11 | {% endblock %} 12 | -------------------------------------------------------------------------------- /se/templates/admin/delete_selected_confirmation.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/delete_selected_confirmation.html" %} 2 | {% load i18n l10n admin_urls static %} 3 | 4 | {% block breadcrumbs %} 5 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /se/templates/admin/index.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/index.html" %} 2 | 3 | {% block sidebar %} 4 | {% endblock %} 5 | -------------------------------------------------------------------------------- /se/templates/admin/object_history.html: -------------------------------------------------------------------------------- 1 | {% extends "admin/object_history.html" %} 2 | {% load i18n admin_urls %} 3 | 4 | {% block breadcrumbs %} 5 | 11 | {% endblock %} 12 | -------------------------------------------------------------------------------- /se/templates/registration/password_change_done.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_change_done.html" %} 2 | {% load i18n %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/registration/password_change_form.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_change_form.html" %} 2 | {% load i18n %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/registration/password_reset_complete.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_reset_complete.html" %} 2 | {% load i18n %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/registration/password_reset_confirm.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_reset_confirm.html" %} 2 | {% load i18n static %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/registration/password_reset_done.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_reset_done.html" %} 2 | {% load i18n %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/registration/password_reset_form.html: -------------------------------------------------------------------------------- 1 | {% extends "registration/password_reset_form.html" %} 2 | {% load i18n static %} 3 | 4 | {% block breadcrumbs %} 5 | 9 | {% endblock %} 10 | -------------------------------------------------------------------------------- /se/templates/se/archive.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | {% include "se/components/modal.html" with id="tags" title=tags_edit_title %} 3 | 4 | {% block head %} 5 | 6 | 7 | {# Token used by Javscript code to make ``fetch`` requests #} 8 | 9 | {% endblock %} 10 | 11 | 52 | -------------------------------------------------------------------------------- /se/templates/se/base.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base_raw.html" %} 2 | {% load static %} 3 | 4 | {% block body_base %} 5 |
    6 |
    7 | 10 | {% if search_form %} 11 |
    12 |
    13 | {% for field in search_form.hidden_fields %} 14 | {{ field }} 15 | {% endfor %} 16 | 17 | 19 |
    20 |
    21 | {% endif %} 22 | 23 | {% if title and not hide_title %} 24 |

    {{ title }}

    25 | {% endif %} 26 | {% include "se/main_menu.html" %} 27 | {% block top_bar %} 28 | {% endblock %} 29 |
    30 | 31 | {% block body_no_margin %} 32 |
    33 | {% block body %} 34 | {% endblock %} 35 |
    36 | {% endblock %} 37 | 38 |
    39 |

    40 | © 2022-2025 41 | Sosse 42 | · About 43 |

    44 |
    45 |
    46 | {% endblock %} 47 | -------------------------------------------------------------------------------- /se/templates/se/base_fold.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | {% load static %} 3 | 4 | {% block css %} 5 | body { 6 | overflow: hidden; 7 | } 8 | #top_bar { 9 | top: 0; 10 | left: 0; 11 | right: 0; 12 | 13 | z-index: 999; 14 | position: absolute; 15 | background-color: var(--bg); 16 | display: none; 17 | } 18 | iframe { 19 | border: 0; 20 | height: 100%; 21 | width: 100%; 22 | position: fixed; 23 | left: 0; 24 | right: 0; 25 | bottom: 0; 26 | top: 0; 27 | } 28 | #fold_button { 29 | position: absolute; 30 | left: 50%; 31 | top: 2px; 32 | padding: 3px; 33 | z-index: 1000; 34 | } 35 | #fold_button > img { 36 | vertical-align: top; 37 | width: 16px; 38 | height: 16px; 39 | } 40 | {% endblock %} 41 | 42 | {% block js %} 43 | function fold_switch() { 44 | const top_bar = document.getElementById('top_bar'); 45 | const fold_button = document.getElementById('fold_button'); 46 | 47 | if (top_bar.style.display === 'block') { 48 | top_bar.style.display = 'none'; 49 | fold_button.innerHTML = fold_button.innerHTML.replace('Hide', 'Show') 50 | } else { 51 | top_bar.style.display = 'block'; 52 | fold_button.innerHTML = fold_button.innerHTML.replace('Show', 'Hide') 53 | const search_input = document.getElementById('id_q'); 54 | search_input.focus(); 55 | } 56 | } 57 | {% endblock %} 58 | 59 | {% block top_bar %} 60 | {% include "se/archive.html" %} 61 | {% endblock %} 62 | 63 | {% block body_no_margin %} 64 | 68 | {% endblock %} 69 | -------------------------------------------------------------------------------- /se/templates/se/base_raw.html: -------------------------------------------------------------------------------- 1 | {% load i18n static %} 2 | {% get_current_language as LANGUAGE_CODE %}{% get_current_language_bidi as LANGUAGE_BIDI %} 3 | 4 | 5 | 6 | 7 | 8 | {% if favicon %} 9 | 10 | {% else %} 11 | 12 | {% endif %} 13 | SOSSE{% if title %} · {{ head_title|default:title }}{% endif %} 14 | 15 | 16 | {% block head %} 17 | {% endblock %} 18 | 19 | 20 | 21 | 25 | 26 | 30 | 31 | 32 | 33 | 59 | {% block body_base %} 60 | {% endblock %} 61 | 62 | 63 | -------------------------------------------------------------------------------- /se/templates/se/components/modal.html: -------------------------------------------------------------------------------- 1 | 2 | 10 | -------------------------------------------------------------------------------- /se/templates/se/components/tag.html: -------------------------------------------------------------------------------- 1 | 2 | {% if tag.href %} 3 | 4 | {{ tag.name }} 5 | 6 | {% else %} 7 | {{ tag.name }} 8 | {% if tag.clear_href or on_delete %} 9 | 10 | {% endif %} 11 | {% if with_counters %} 12 | - 13 | {% endif %} 14 | {% endif %} 15 | 16 | -------------------------------------------------------------------------------- /se/templates/se/components/tag_action.html: -------------------------------------------------------------------------------- 1 | 12 | -------------------------------------------------------------------------------- /se/templates/se/components/tags_list.html: -------------------------------------------------------------------------------- 1 |
    2 | {% for tag in model_tags %} 3 | {% include "se/components/tag.html" with suffix="-select" classes="tag-select" %} 4 | {% endfor %} 5 | {% if 'se.change_document' in perms or django_admin %} 6 | {% include "se/components/tag_action.html" with id="edit_tags" text="⭐ Edit" onclick=tags_edit_onclick %} 7 | 8 | {% endif %} 9 |
    10 | -------------------------------------------------------------------------------- /se/templates/se/download.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | {% load static %} 3 | 4 | {% block top_bar %} 5 | {% include "se/archive.html" %} 6 | {% endblock %} 7 | 8 | {% block body %} 9 |

    {{ icon }} {{ filename }} ({{ filesize|filesizeformat }})

    10 | {{ doc.mimetype }} 11 |
    12 | 📥 13 | Download 14 |
    15 | 16 |
    17 | {% if mimebase == 'image' %} 18 | 19 | {% elif mimebase == 'video' %} 20 |
    25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /se/templates/se/embed.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base_fold.html" %} 2 | {% load static %} 3 | 4 | {% block js %} 5 | {{ block.super }} 6 | 7 | function frame_loaded() { 8 | const iframe = document.getElementsByTagName('iframe'); 9 | const links = iframe[0].contentWindow.document.getElementsByTagName('a'); 10 | for (let i = 0; i < links.length; i++) { 11 | const link = links[i]; 12 | link.onclick = function() { 13 | window.top.location.href = link.getAttribute('href'); 14 | }; 15 | } 16 | } 17 | {% endblock %} 18 | 19 | {% block body_no_margin %} 20 | {{ block.super }} 21 | 22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /se/templates/se/feed.html: -------------------------------------------------------------------------------- 1 | {% load i18n static %} 2 | {% get_current_language as LANGUAGE_CODE %}{% get_current_language_bidi as LANGUAGE_BIDI %} 3 | 4 | 5 | 6 | 7 | 8 | {% if feed.feed.icon %} 9 | 10 | {% endif %} 11 | {% if feed.feed.title %} 12 | {{ feed.feed.title }} 13 | {% endif %} 14 | 15 | 16 | {% if 'title' in feed.feed %} 17 |

    18 | {% if 'link' in feed.feed %} 19 | 20 | {% endif %} 21 | {{ feed.feed.title }} 22 | {% if feed.feed.link %} 23 | 24 | {% endif %} 25 |

    26 | {% endif %} 27 | {% if feed.feed.description %} 28 |

    29 | {{ feed.feed.description }} 30 |

    31 | {% endif %} 32 |

    33 |

      34 | {% for entry in feed.entries %} 35 |
    • 36 | {% if entry.updated_datetime %} 37 | {{ entry.updated_datetime|date:"SHORT_DATETIME_FORMAT" }} 38 | {% endif %} 39 | {{ entry.title }} 40 |
    • 41 | {% endfor %} 42 |
    43 |

    44 | 45 | 46 | -------------------------------------------------------------------------------- /se/templates/se/home_browse.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 30 | -------------------------------------------------------------------------------- /se/templates/se/html_excluded.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | 3 | {% block body %} 4 |

    5 | This file was not {% if method == 'mimetype' %}saved{% else %}downloaded{% endif %} because its {{ method }} matched the exclusion regex of assets to download. 6 |

    7 | {% if crawl_policy and 'se.view_crawlpolicy' in perms %} 8 |

    9 | The crawl policy {{ crawl_policy }} can be modified to make this file available. 10 |

    11 | {% endif %} 12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /se/templates/se/info_fallback.html: -------------------------------------------------------------------------------- 1 | {% if doc and not doc.crawl_last %} 2 |

    3 | This page has not been crawled yet. 4 |

    5 | {% elif doc and doc.robotstxt_rejected %} 6 |

    7 | Crawling this page was rejected by a robots.txt rule. 8 |

    9 |

    10 | You can ignore this robots.txt policy in the Domain settings for {{ doc.default_domain_setting.domain }}. 11 |

    12 | {% elif doc and doc.error %} 13 |

    14 | An error occured while crawling this page: 15 |

    16 |             {{ doc.error }}
    17 |         
    18 |

    19 | {% endif %} 20 | -------------------------------------------------------------------------------- /se/templates/se/main_menu.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 34 | {% if user.is_active and user.is_staff %} 35 | 44 | {% endif %} 45 | -------------------------------------------------------------------------------- /se/templates/se/opensearch.xml: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 4 | SOSSE 5 | SOSSE search engine 6 | UTF-8 7 | {{ url }}{% static 'se/logo.png' %} 8 | 9 | 10 | -------------------------------------------------------------------------------- /se/templates/se/pagination.html: -------------------------------------------------------------------------------- 1 | {% if paginated.paginator.num_pages %} 2 | 20 | {% endif %} 21 | -------------------------------------------------------------------------------- /se/templates/se/screenshot_full.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base_raw.html" %} 2 | {% load static %} 3 | 4 | {% block head %} 5 | 6 | {% for screen in screens %} 7 | 8 | {% endfor %} 9 | {% endblock %} 10 | 11 | {% block js %} 12 | const screen_width = {{ screenshot_size.0 }}; 13 | const screen_height = {{ screenshot_size.1 }}; 14 | {% endblock %} 15 | 16 | {% block css %} 17 | #screenshots > img { 18 | margin-top: -5px; 19 | } 20 | .img_link { 21 | position: absolute; 22 | margin-top: -4px; 23 | } 24 | .img_link:hover { 25 | position: absolute; 26 | box-shadow: 0px 0px 4px 4px #91baff; 27 | margin: -4px 0px 0px -2px; 28 | padding: 0px 8px 4px 0px; 29 | } 30 | {% endblock %} 31 | 32 | {% block top_bar %} 33 | {% include "se/archive.html" %} 34 | {% endblock %} 35 | 36 | {% block body_base %} 37 | {% if doc.screenshot_count %} 38 |
    39 | {% for link in links %} 40 | 41 | {% endfor %} 42 | {% for screen in screens %} 43 | 44 | {% endfor %} 45 |
    46 | {% else %} 47 |
    48 | {% include "se/info_fallback.html" %} 49 |
    50 | {% endif %} 51 | {% endblock %} 52 | -------------------------------------------------------------------------------- /se/templates/se/search_redirect.html: -------------------------------------------------------------------------------- 1 | {% load i18n static %} 2 | 3 | 4 | 5 | 6 | 7 | {% if favicon %} 8 | 9 | {% else %} 10 | 11 | {% endif %} 12 | SOSSE 13 | 16 | 17 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /se/templates/se/tags.html: -------------------------------------------------------------------------------- 1 | 2 |
    3 | {% if not tags %} 4 | No tags exist yet. 5 | {% if change_permission %} 6 |
    7 | {% include "se/components/tag_action.html" with text="✏️ Create" href=create_tag_href %} 8 |
    9 | {% endif %} 10 | {% else %} 11 | {% if change_permission %} 12 |
    13 | {% include "se/components/tag_action.html" with text="📝 Edit" href=view_tags_href %} 14 |
    15 |
    16 | {% include "se/components/tag_action.html" with text="✏️ Create" href=create_tag_href %} 17 |
    18 | {% endif %} 19 | Selected: 20 |
    21 | {% for tag in tags %} 22 | {% include "se/components/tag.html" with suffix="-edit" on_delete=tag.js_add_tag_onclick classes="tag-select" bold=True %} 23 | {% endfor %} 24 | {% include "se/components/tag_action.html" with id="clear_selected_tags" text="⨉ Clear" onclick="clear_tags()" %} 25 |
    26 |
    27 | {% endif %} 28 |
    29 | 30 |
    31 | {% for tag in root_tags %} 32 | {# one div per root tag to make the grid layout #} 33 |
    34 | {% for child in tag.descendants %} 35 |
    {# this div makes sure the tag takes the full width of the grid layout's panel #} 36 | {% include "se/components/tag.html" with tag=child with_padding=True with_counters=True onclick=child.js_add_tag_onclick cursor_pointer=True %} 37 |
    38 | {% endfor %} 39 |
    40 | {% endfor %} 41 |
    42 | 43 |
    44 | 45 | 46 |
    47 | -------------------------------------------------------------------------------- /se/templates/se/unknown_url.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | {% block top_bar %} 3 | 10 | {% endblock %} 11 | {% block body %} 12 |

    13 | 🚫 This page has not been indexed yet. 14 |

    15 |

    16 |

    17 | {% if 'se.add_document' in perms %} 18 |
    19 | {% csrf_token %} 20 | 21 | 22 |
    23 | {% endif %} 24 | {% endblock %} 25 | -------------------------------------------------------------------------------- /se/templates/se/words.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | {% load static %} 3 | 4 | {% block top_bar %} 5 | {% include "se/archive.html" %} 6 | {% endblock %} 7 | 8 | {% block body %} 9 |
      10 |
    • {{ lang }} detected
    • 11 |
    • The document has {{ doc.content.split|length }} words
    • 12 |
    • The document has {{ doc.vector.split|length }} unique words
    • 13 |
    14 |
    15 |
      16 | {% for word, weights in words %} 17 |
    • {{ word }}: {{ weights }}
    • 18 | {% endfor %} 19 |
    20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /se/templates/se/www.html: -------------------------------------------------------------------------------- 1 | {% extends "se/base.html" %} 2 | 3 | {% block top_bar %} 4 | {% include "se/archive.html" %} 5 | {% endblock %} 6 | 7 | {% block body %} 8 | {% if content %} 9 | {{ content }} 10 | {% else %} 11 | {% include "se/info_fallback.html" %} 12 | {% endif %} 13 | {% endblock %} 14 | -------------------------------------------------------------------------------- /se/test_admin.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.test import TransactionTestCase 17 | 18 | from .admin import DocumentOrphanFilter 19 | from .document import Document 20 | from .models import Link 21 | 22 | 23 | class TestAdmin(TransactionTestCase): 24 | def test_document_orphan_filter(self): 25 | orphan = Document.objects.wo_content().create(url="http://orphan") 26 | parent = Document.objects.wo_content().create(url="http://parent") 27 | child = Document.objects.wo_content().create(url="http://child") 28 | Link.objects.create(doc_from=parent, doc_to=child, pos=0, link_no=0) 29 | 30 | redirect_src = Document.objects.wo_content().create( 31 | url="http://redirect_src", redirect_url="http://redirect_dst" 32 | ) 33 | redirect_dst = Document.objects.wo_content().create(url="http://redirect_dst") 34 | 35 | doc_filter = DocumentOrphanFilter(None, {"orphan": "full"}, Document, None) 36 | orphaned = doc_filter.queryset(None, Document.objects.wo_content().all()) 37 | self.assertEqual(list(orphaned), [orphan]) 38 | 39 | doc_filter = DocumentOrphanFilter(None, {"orphan": "no_parent"}, Document, None) 40 | orphaned = doc_filter.queryset(None, Document.objects.wo_content().all()) 41 | self.assertEqual(list(orphaned), [orphan, parent, redirect_src]) 42 | 43 | doc_filter = DocumentOrphanFilter(None, {"orphan": "no_children"}, Document, None) 44 | orphaned = doc_filter.queryset(None, Document.objects.wo_content().all()) 45 | self.assertEqual(list(orphaned), [orphan, child, redirect_dst]) 46 | -------------------------------------------------------------------------------- /se/test_commands.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.core.management import call_command 17 | from django.test import TransactionTestCase 18 | 19 | from .document import Document 20 | 21 | 22 | class CommandsTest(TransactionTestCase): 23 | def setUp(self): 24 | Document.objects.wo_content().create(url="http://test/") 25 | 26 | def test_delete_document_match(self): 27 | self.assertEqual(Document.objects.count(), 1) 28 | call_command("delete_documents", "http://test") 29 | self.assertEqual(Document.objects.count(), 0) 30 | 31 | def test_delete_document_no_match(self): 32 | self.assertEqual(Document.objects.count(), 1) 33 | call_command("delete_documents", "http://no_test") 34 | self.assertEqual(Document.objects.count(), 1) 35 | 36 | def test_delete_document_dry_run(self): 37 | self.assertEqual(Document.objects.count(), 1) 38 | with self.assertRaises(SystemExit): 39 | call_command("delete_documents", "--dry-run", "http://test") 40 | self.assertEqual(Document.objects.count(), 1) 41 | -------------------------------------------------------------------------------- /se/test_misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.test import TransactionTestCase, override_settings 17 | 18 | from .document import Document 19 | from .domain_setting import DomainSetting 20 | 21 | ROBOTS_TXT = """ 22 | # Test robots.txt 23 | user-agent: * 24 | allow: /allow/* 25 | disallow: /disallow/* 26 | """ 27 | 28 | 29 | class MiscTest(TransactionTestCase): 30 | def test_robots_txt(self): 31 | domain = DomainSetting.objects.create(domain="127.0.0.1") 32 | domain._parse_robotstxt(ROBOTS_TXT) 33 | self.assertEqual(domain.robots_allow, "/allow/.*") 34 | self.assertEqual(domain.robots_disallow, "/disallow/.*") 35 | 36 | domain.robots_ua_hash = DomainSetting.ua_hash() 37 | domain.robots_status = DomainSetting.ROBOTS_LOADED 38 | domain.save() 39 | 40 | self.assertTrue(domain.robots_authorized("http://127.0.0.1/allow/aa")) 41 | self.assertFalse(domain.robots_authorized("http://127.0.0.1/disallow/aa")) 42 | 43 | @override_settings(SOSSE_LINKS_NO_REFERRER=True) 44 | @override_settings(SOSSE_LINKS_NEW_TAB=True) 45 | def test_external_link(self): 46 | doc = Document(url="http://test/") 47 | self.assertEqual( 48 | doc.get_source_link(), 49 | '🌍 Source', 50 | ) 51 | 52 | @override_settings(SOSSE_LINKS_NO_REFERRER=False) 53 | @override_settings(SOSSE_LINKS_NEW_TAB=False) 54 | def test_external_link_no_opt(self): 55 | doc = Document(url="http://test/") 56 | self.assertEqual(doc.get_source_link(), '🌍 Source') 57 | -------------------------------------------------------------------------------- /se/test_requests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | import requests 17 | from django.test import TransactionTestCase 18 | 19 | from .browser_request import BrowserRequest 20 | 21 | 22 | class RequestsTest(TransactionTestCase): 23 | def _get(self, s, url): 24 | params = BrowserRequest._requests_params() 25 | params["allow_redirects"] = True 26 | return s.get(url, **params) 27 | 28 | def test_10_cookie_set(self): 29 | s = requests.Session() 30 | self._get(s, "http://127.0.0.1:8000/cookies/set?test_key=test_value") 31 | cookies = list(s.cookies) 32 | self.assertEqual(len(cookies), 1) 33 | self.assertEqual(cookies[0].name, "test_key") 34 | self.assertEqual(cookies[0].value, "test_value") 35 | self.assertEqual(cookies[0].domain, "127.0.0.1") 36 | return s 37 | 38 | def test_20_cookie_delete(self): 39 | s = self.test_10_cookie_set() 40 | self._get(s, "http://127.0.0.1:8000/cookies/delete?test_key") 41 | cookies = list(s.cookies) 42 | self.assertEqual(cookies, []) 43 | -------------------------------------------------------------------------------- /se/words.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | 16 | from django.views.generic import TemplateView 17 | 18 | from .archive import ArchiveMixin 19 | 20 | 21 | class WordsView(ArchiveMixin, TemplateView): 22 | template_name = "se/words.html" 23 | view_name = "words" 24 | 25 | def get_context_data(self, *args, **kwargs): 26 | words = [] 27 | for w in self.doc.vector.split(): 28 | word, weights = w.split(":", 1) 29 | word = word.strip("'") 30 | words.append((word, weights)) 31 | 32 | context = super().get_context_data(*args, **kwargs) 33 | return context | {"words": words, "lang": self.doc.lang_flag(True)} 34 | -------------------------------------------------------------------------------- /sosse-admin: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname $0)" 3 | exec python3 -m sosse.sosse_admin "$@" 4 | -------------------------------------------------------------------------------- /sosse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/sosse/__init__.py -------------------------------------------------------------------------------- /sosse/sosse_admin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright 2022-2025 Laurent Defert 3 | # 4 | # This file is part of SOSSE. 5 | # 6 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 7 | # General Public License as published by the Free Software Foundation, either version 3 of the 8 | # License, or (at your option) any later version. 9 | # 10 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 11 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 12 | # See the GNU Affero General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 15 | # If not, see . 16 | """Django's command-line utility for administrative tasks.""" 17 | 18 | import os 19 | import sys 20 | from pathlib import Path 21 | 22 | 23 | def main(): 24 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sosse.settings") 25 | try: 26 | from django.core.management import execute_from_command_line 27 | except ImportError as exc: 28 | raise ImportError( 29 | "Couldn't import Django. Are you sure it's installed and " 30 | "available on your PYTHONPATH environment variable? Did you " 31 | "forget to activate a virtual environment?" 32 | ) from exc 33 | 34 | linkpreview = Path(__file__).parent.parent / "se/deps/linkpreview" 35 | sys.path.insert(0, str(linkpreview)) 36 | fake_useragent = Path(__file__).parent.parent / "se/deps/fake-useragent/src" 37 | sys.path.insert(0, str(fake_useragent)) 38 | execute_from_command_line(sys.argv) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /sosse/wsgi.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022-2025 Laurent Defert 2 | # 3 | # This file is part of SOSSE. 4 | # 5 | # SOSSE is free software: you can redistribute it and/or modify it under the terms of the GNU Affero 6 | # General Public License as published by the Free Software Foundation, either version 3 of the 7 | # License, or (at your option) any later version. 8 | # 9 | # SOSSE is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 10 | # the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 11 | # See the GNU Affero General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU Affero General Public License along with SOSSE. 14 | # If not, see . 15 | """WSGI config for sosse project. 16 | 17 | It exposes the WSGI callable as a module-level variable named ``application``. 18 | 19 | For more information on this file, see 20 | https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/ 21 | """ 22 | 23 | import os 24 | import sys 25 | from pathlib import Path 26 | 27 | from django.core.wsgi import get_wsgi_application 28 | 29 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sosse.settings") 30 | 31 | linkpreview = Path(__file__).parent.parent / "se/deps/linkpreview" 32 | sys.path.insert(0, str(linkpreview)) 33 | fake_useragent = Path(__file__).parent.parent / "se/deps/fake-useragent/src" 34 | sys.path.insert(0, str(fake_useragent)) 35 | 36 | application = get_wsgi_application() 37 | -------------------------------------------------------------------------------- /swagger-initializer.js: -------------------------------------------------------------------------------- 1 | window.onload = function() { 2 | // 3 | 4 | // the following lines will be replaced by docker/configurator, when it runs in a docker-container 5 | window.ui = SwaggerUIBundle({ 6 | url: "/api/schema", 7 | dom_id: '#swagger-ui', 8 | deepLinking: true, 9 | presets: [ 10 | SwaggerUIBundle.presets.apis, 11 | SwaggerUIStandalonePreset 12 | ], 13 | plugins: [ 14 | SwaggerUIBundle.plugins.DownloadUrl 15 | ], 16 | layout: "BaseLayout" 17 | }); 18 | 19 | // 20 | }; 21 | -------------------------------------------------------------------------------- /tests/build_doc.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | cd "$(dirname "$0")/.." 3 | rm -rf doc/build 4 | ./sosse-admin extract_doc cli > doc/source/cli_generated.rst 5 | ./sosse-admin extract_doc conf > doc/source/config_file_generated.rst 6 | ./sosse-admin extract_doc se > doc/source/user/shortcut_list_generated.rst 7 | make -C doc html 8 | -------------------------------------------------------------------------------- /tests/cookies.json: -------------------------------------------------------------------------------- 1 | [{"model": "se.cookie", "pk": 43, "fields": {"domain": "127.0.0.1", "domain_cc": "127.0.0.1", "inc_subdomain": true, "name": "_gat", "value": "1", "path": "/", "expires": "2023-04-27T12:06:54Z", "secure": false, "same_site": "Lax", "http_only": false}}, {"model": "se.cookie", "pk": 44, "fields": {"domain": "127.0.0.1", "domain_cc": "127.0.0.1", "inc_subdomain": true, "name": "_gid", "value": "GA1.1.1966991879.1682597154", "path": "/", "expires": "2023-04-28T12:05:57Z", "secure": false, "same_site": "Lax", "http_only": false}}, {"model": "se.cookie", "pk": 45, "fields": {"domain": "127.0.0.1", "domain_cc": "127.0.0.1", "inc_subdomain": true, "name": "_ga", "value": "GA1.1.236038837.1682597154", "path": "/", "expires": "2024-05-31T12:05:57Z", "secure": false, "same_site": "Lax", "http_only": false}}] 2 | -------------------------------------------------------------------------------- /tests/coverage.patch: -------------------------------------------------------------------------------- 1 | diff --git a/sosse/urls.py b/sosse/urls.py 2 | index f37a01f..dded493 100644 3 | --- a/sosse/urls.py 4 | +++ b/sosse/urls.py 5 | @@ -53,6 +53,15 @@ from se.tags_list import TagsListView 6 | from se.words import WordsView 7 | from se.www import WWWView 8 | 9 | + 10 | +def coverage_stop(request): 11 | + import uwsgi 12 | + from django.http import HttpResponse 13 | + 14 | + uwsgi.signal(1) 15 | + return HttpResponse("Coverage stopped\n") 16 | + 17 | + 18 | urlpatterns = [ 19 | path("admin/", admin.site.urls), 20 | path("", SearchView.as_view(), name="search"), 21 | @@ -85,4 +94,5 @@ urlpatterns = [ 22 | HTMLExcludedView.as_view(), 23 | name="html_excluded", 24 | ), 25 | + path("coverage_stop/", coverage_stop), 26 | ] 27 | diff --git a/sosse/wsgi.py b/sosse/wsgi.py 28 | index 4bb9baf..ae706ee 100644 29 | --- a/sosse/wsgi.py 30 | +++ b/sosse/wsgi.py 31 | @@ -33,4 +33,21 @@ sys.path.insert(0, str(linkpreview)) 32 | fake_useragent = Path(__file__).parent.parent / "se/deps/fake-useragent/src" 33 | sys.path.insert(0, str(fake_useragent)) 34 | 35 | +from coverage import Coverage 36 | +import uwsgi 37 | +import random 38 | + 39 | +rnd = random.randint(0, 1000000) 40 | +data_file = f"/tmp/coverage-{rnd}" 41 | +cov = Coverage(data_file=data_file, data_suffix=True, source=["se", "sosse"]) 42 | +cov.start() 43 | + 44 | + 45 | +def handle_sigterm(signum): 46 | + cov.stop() 47 | + cov.save() 48 | + 49 | + 50 | +uwsgi.register_signal(1, "workers", handle_sigterm) 51 | + 52 | application = get_wsgi_application() 53 | -------------------------------------------------------------------------------- /tests/doc_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | CODE_BLOCK_FILE="$1" 3 | DOC_SRC="$2" 4 | 5 | code_length="$(jq --arg SRC "$DOC_SRC" '[.[] | select(.src == $SRC and (.lang == "shell" or .lang == "default"))] | length' < "$CODE_BLOCK_FILE")" 6 | 7 | function code_no() { 8 | jq -r --arg IDX "$1" --arg SRC "$DOC_SRC" '[.[] | select(.src == $SRC and (.lang == "shell" or .lang == "default"))][$IDX|tonumber].code' < "$CODE_BLOCK_FILE" 9 | } 10 | 11 | function show_error() { 12 | f="$(jq -r --arg IDX "$1" --arg SRC "$DOC_SRC" '[.[] | select(.src == $SRC and (.lang == "shell" or .lang == "default"))][$IDX|tonumber].source' < "$CODE_BLOCK_FILE")" 13 | l="$(jq -r --arg IDX "$1" --arg SRC "$DOC_SRC" '[.[] | select(.src == $SRC and (.lang == "shell" or .lang == "default"))][$IDX|tonumber].line' < "$CODE_BLOCK_FILE")" 14 | echo "Failed on $f, line $l." >&2 15 | } 16 | 17 | if [ "$code_length" == "0" ] 18 | then 19 | echo "No step to perform" >&2 20 | exit 1 21 | fi 22 | 23 | block_no=0 24 | while [ "$block_no" -lt "$code_length" ] 25 | do 26 | code="$(code_no "$block_no")" 27 | if echo "$code" | grep -q "nano .*" 28 | then 29 | filename="$(echo "$code" | sed 's/^nano //')" 30 | 31 | block_no=$(($block_no + 1)) 32 | code_no "$block_no" > "$filename" 33 | echo "----- Writing $filename" 34 | else 35 | _code="$(echo "$code"|grep -v ^systemctl ||:)" 36 | if [ "$_code" != "" ] 37 | then 38 | IFS=$'\n'; for line in $_code 39 | do 40 | echo "----- $line" 41 | eval "$line" 42 | exit_status=$? 43 | if [ $exit_status != 0 ] 44 | then 45 | show_error "$block_no" 46 | exit 1 47 | fi 48 | done 49 | fi 50 | fi 51 | block_no=$(($block_no + 1)) 52 | done 53 | -------------------------------------------------------------------------------- /tests/docker_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | f="$(cat "$1")" 5 | cmd="" 6 | IFS=$'\n'; for line in $f 7 | do 8 | if test -n "$cmd" 9 | then 10 | cmd="$cmd"$'\n'"$line" 11 | if grep -q '\\$' <<< "$line" 12 | then 13 | continue 14 | fi 15 | elif grep -q ^RUN <<< "$line" 16 | then 17 | cmd="$(echo $line | sed -e 's/^RUN //')" 18 | if grep -q '\\$' <<< "$line" 19 | then 20 | continue 21 | fi 22 | else 23 | continue 24 | fi 25 | 26 | echo "---- $cmd" 27 | eval "$cmd" 28 | cmd="" 29 | done 30 | -------------------------------------------------------------------------------- /tests/document-ja.json: -------------------------------------------------------------------------------- 1 | [{"model": "se.document", "pk": 123, "fields": {"url": "http://127.0.0.1/screenshots/website/jp.html", "normalized_url": "127.0.0.1 screenshots website jp.html", "title": "http://127.0.0.1/screenshots/website/jp.html", "normalized_title": "http://127.0.0.1/screenshots/website/jp.html", "content": "こんにちは", "normalized_content": "こんにちは", "content_hash": "5b5ac65deebdf3ef2fd90df959f6e814", "vector": "'/screenshots/website/jp.html':2A '127.0.0.1':1A,3A 'jp.html':6A 'screenshot':4A 'websit':5A 'こんにちは':7C", "lang_iso_639_1": "ja", "vector_lang": "english", "mimetype": "text/html", "favicon": null, "robotstxt_rejected": false, "redirect_url": null, "too_many_redirects": false, "screenshot_count": 0, "screenshot_format": "jpg", "screenshot_size": "1920x1080", "crawl_first": "2023-04-29T15:43:57.854Z", "crawl_last": "2023-04-29T15:43:57.854Z", "crawl_next": null, "crawl_dt": null, "crawl_recurse": 0, "error": "", "error_hash": "", "worker_no": null}}] 2 | -------------------------------------------------------------------------------- /tests/opensearch.xml: -------------------------------------------------------------------------------- 1 | 2 | Twitter 3 | Twitter Search 4 | 5 | https://abs.twimg.com/favicons/favicon.ico 6 | UTF-8 7 | https://twitter.com/search-home 8 | 9 | -------------------------------------------------------------------------------- /tests/pages/browser_detect_js.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | test 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/pages/browser_detect_no_js.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | has no JS 4 | 5 | 6 | -------------------------------------------------------------------------------- /tests/pages/css_in_js.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | test 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/pages/nav_elements.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/pages/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/tests/pages/test.jpg -------------------------------------------------------------------------------- /tests/pages/test.mp4: -------------------------------------------------------------------------------- 1 | ftypisommoov mvhd -------------------------------------------------------------------------------- /tests/pages/test.pdf: -------------------------------------------------------------------------------- 1 | %PDF-1.7 2 | 1 0 obj 3 | << /Type /Catalog /Pages 2 0 R >> 4 | endobj 5 | 2 0 obj 6 | << /Type /Pages /Count 1 /Kids [3 0 R] >> 7 | endobj 8 | 3 0 obj 9 | << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >> 10 | endobj 11 | 4 0 obj 12 | << /Length 44 >> 13 | stream 14 | BT 15 | /F1 24 Tf 16 | 100 700 Td 17 | (Hello, World!) Tj 18 | ET 19 | endstream 20 | endobj 21 | xref 22 | 0 5 23 | 0000000000 65535 f 24 | 0000000010 00000 n 25 | 0000000079 00000 n 26 | 0000000175 00000 n 27 | 0000000272 00000 n 28 | trailer 29 | << /Size 5 /Root 1 0 R >> 30 | startxref 31 | 354 32 | %%EOF -------------------------------------------------------------------------------- /tests/pages/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/tests/pages/test.png -------------------------------------------------------------------------------- /tests/pages/test.wav: -------------------------------------------------------------------------------- 1 | RIFF$WAVEfmt data -------------------------------------------------------------------------------- /tests/pages/test.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/tests/pages/test.zip -------------------------------------------------------------------------------- /tests/release.md: -------------------------------------------------------------------------------- 1 | - make docker_git_build 2 | - test (docker run -p 8005:80 -e SOSSE_CRAWLER_COUNT=1 biolds/sosse:git) 3 | - debian update: version + changelog : dch -i (available in the devscripts deb package) 4 | - doc version update in doc/source/conf.py 5 | - update the CHANGELOG.md using the last changelog build it the main branch 6 | - commit as "v1.10.1 release" 7 | - MR 8 | - create tag "vX.X.X" 9 | - update the `stable` branch for the release (to update the `stable` version of readthedoc) 10 | - check RTD as the doc build can fail if Gitlab has concurrent builds running 11 | - pip release (this needs to be done before the docker step below) 12 | - clear `dist/` 13 | - download the artifacts of the `pip_pkg` step and unzip it in the root (it creates `dist/` with packages) 14 | - run `make pip_pkg_push` 15 | - debian packages 16 | - wget `` 17 | - cd /var/www/html/repo/apt/debian/ 18 | - reprepro -V --keepunreferencedfiles includedeb bookworm `` 19 | - docker build: 20 | - docker system prune -a 21 | - make docker_build 22 | - test (docker run -p 8005:80 -e SOSSE_CRAWLER_COUNT=1 biolds/sosse:latest, version no check) 23 | - make docker_push 24 | - make docker_release_push 25 | - docker tag biolds/sosse:latest biolds/sosse:X.X.X 26 | - docker push biolds/sosse:X.X.X 27 | - in case the README.md file was modified, update the description at https://hub.docker.com/repository/docker/biolds/sosse/general 28 | -------------------------------------------------------------------------------- /tests/robotframework/config.yaml: -------------------------------------------------------------------------------- 1 | SOSSE_ADMIN: sosse-admin 2 | -------------------------------------------------------------------------------- /tests/robotframework/docs/01_firstrun.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | 5 | | *Test Cases* | 6 | | First run checks 7 | | | SOSSE Go To | http://127.0.0.1/ 8 | | | SOSSE Capture Page Screenshot | clear_home.png 9 | | | SOSSE Go To | http://127.0.0.1/profile/ 10 | | | SOSSE Capture Page Screenshot | clear_profile.png 11 | | | SOSSE Go To | http://127.0.0.1/history/ 12 | | | SOSSE Capture Page Screenshot | clear_history.png 13 | | | SOSSE Go To | http://127.0.0.1/admin/password_change/ 14 | | | SOSSE Capture Page Screenshot | clear_pass_change.png 15 | | | SOSSE Go To | http://127.0.0.1/swagger/ 16 | | | SOSSE Capture Page Screenshot | clear_swagger.png 17 | | | SOSSE Go To | http://127.0.0.1/about/ 18 | | | SOSSE Capture Page Screenshot | clear_about.png 19 | | | SOSSE Go To | http://127.0.0.1/resources/ 20 | | | SOSSE Capture Page Screenshot | clear_resources.png 21 | | | SOSSE Go To | http://127.0.0.1/admin/ 22 | | | SOSSE Capture Page Screenshot | clear_admin.png 23 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/queue/ 24 | | | SOSSE Capture Page Screenshot | clear_admin_doc_queue.png 25 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/crawl_queue/ 26 | | | SOSSE Capture Page Screenshot | clear_admin_crawl_queue.png 27 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/crawlers/ 28 | | | SOSSE Capture Page Screenshot | clear_admin_crawlers.png 29 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/analytics/ 30 | | | SOSSE Capture Page Screenshot | clear_admin_analytics.png 31 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 32 | | | SOSSE Capture Page Screenshot | clear_admin_crawlpolicy.png 33 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/ 34 | | | SOSSE Capture Page Screenshot | clear_admin_document.png 35 | | | SOSSE Go To | http://127.0.0.1/admin/se/webhook/ 36 | | | SOSSE Capture Page Screenshot | clear_admin_webhook.png 37 | | | SOSSE Go To | http://127.0.0.1/admin/se/tag/ 38 | | | SOSSE Capture Page Screenshot | clear_admin_tag.png 39 | -------------------------------------------------------------------------------- /tests/robotframework/docs/__init__.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | | Suite Setup | Setup 5 | | Suite Teardown | Tear Down 6 | 7 | | *Keywords* | 8 | | Setup 9 | | | Set Selenium Timeout | 1 min 10 | | | Login 11 | | | Execute Javascript | window.onerror = function(errorMessage) { dialog(errorMessage); } 12 | 13 | | Tear Down 14 | | | Capture Page Screenshot 15 | | | Close All Browsers 16 | -------------------------------------------------------------------------------- /tests/robotframework/guide_auth.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/tests/robotframework/guide_auth.tar.gz -------------------------------------------------------------------------------- /tests/robotframework/guide_download.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/biolds/sosse/efe38e1b1dcb975fa8d77eeade941aa43339a1db/tests/robotframework/guide_download.tar.gz -------------------------------------------------------------------------------- /tests/robotframework/guide_feed_website_monitor.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "model": "se.crawlpolicy", 4 | "pk": 1, 5 | "fields": { 6 | "url_regex": "(default)", 7 | "url_regex_pg": ".*", 8 | "enabled": true, 9 | "recursion": "never", 10 | "mimetype_regex": ".*", 11 | "recursion_depth": 0, 12 | "keep_params": true, 13 | "hide_documents": false, 14 | "default_browse_mode": "selenium", 15 | "snapshot_html": true, 16 | "snapshot_exclude_url_re": "", 17 | "snapshot_exclude_mime_re": "", 18 | "snapshot_exclude_element_re": "", 19 | "thumbnail_mode": "prevscreen", 20 | "take_screenshots": false, 21 | "screenshot_format": "jpg", 22 | "remove_nav_elements": "idx", 23 | "script": "", 24 | "store_extern_links": false, 25 | "recrawl_freq": "adaptive", 26 | "recrawl_dt_min": "1 00:00:00", 27 | "recrawl_dt_max": "365 00:00:00", 28 | "hash_mode": "no_numbers", 29 | "recrawl_condition": "change", "auth_login_url_re": "", 30 | "auth_form_selector": "" 31 | } 32 | }, 33 | { 34 | "model": "se.crawlpolicy", 35 | "pk": 2, 36 | "fields": { 37 | "url_regex": "^https://my.broken-website.com/.*", 38 | "url_regex_pg": "^https://my.broken-website.com/.*", 39 | "enabled": true, 40 | "recursion": "always", 41 | "mimetype_regex": ".*", 42 | "recursion_depth": 0, 43 | "keep_params": true, 44 | "hide_documents": false, 45 | "default_browse_mode": "requests", 46 | "snapshot_html": false, 47 | "snapshot_exclude_url_re": "", 48 | "snapshot_exclude_mime_re": "", 49 | "snapshot_exclude_element_re": "", 50 | "thumbnail_mode": "none", 51 | "take_screenshots": false, 52 | "screenshot_format": "jpg", 53 | "remove_nav_elements": "idx", 54 | "script": "", 55 | "store_extern_links": false, 56 | "recrawl_freq": "constant", 57 | "recrawl_dt_min": "1 00:00:00", 58 | "recrawl_dt_max": null, 59 | "hash_mode": "no_numbers", 60 | "recrawl_condition": "change", "auth_login_url_re": "", 61 | "auth_form_selector": "" 62 | } 63 | } 64 | ] 65 | -------------------------------------------------------------------------------- /tests/robotframework/guides/01_download.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | 5 | | *Test Cases* | 6 | | Download 7 | # Kill the crawler before starting 8 | | | ${ret}= | Run Process | pkill | sosse-admin 9 | | | Log | ${ret.stdout} 10 | | | Log | ${ret.stderr} 11 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.models import Link ; Link.objects.all().delete() 12 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.document import Document ; Document.objects.wo_content().delete() 13 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.crawl_policy import CrawlPolicy ; CrawlPolicy.objects.all().delete() 14 | | | Run Command | ${SOSSE_ADMIN} | loaddata | ${CURDIR}/../guide_download/guide_download_dump.json | shell=True 15 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.document import Document ; from django.utils.timezone import now ; Document.objects.update(crawl_last\=now()) 16 | | | Run Command | rm | -rf | /var/lib/sosse/html | /var/lib/sosse/screenshots 17 | | | Run Command | mkdir | -p | /var/lib/sosse/ 18 | | | Run Command | tar | -x | -C | /var/lib/sosse/ | -f | ${CURDIR}/../guide_download/guide_download_html.tar 19 | | | Run Command | dd | if\=/dev/zero | of\=/var/lib/sosse/html/https,3A/www.gutenberg.org/cache/epub/75210/pg75210-images-3.epub_b9a445dff6.epub | count\=5000 20 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 21 | | | SOSSE Capture Page Screenshot | guide_download_crawl_policies.png 22 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/crawl_queue/ 23 | | | SOSSE Capture Page Screenshot | guide_download_crawl_queue.png 24 | | | SOSSE Go To | http://127.0.0.1/?q\=&doc_lang\=&s\=-crawl_first&ft1\=inc&ff1\=lby_url&fo1\=equal&fv1\=https%3A%2F%2Fwww.gutenberg.org%2Fcache%2Fepub%2Ffeeds%2Ftoday.rss&l\=fr&ps\=20&c\=1 25 | | | SOSSE Capture Page Screenshot | guide_download_view_library.png 26 | | | SOSSE Go To | http://127.0.0.1/html/https://www.gutenberg.org/ebooks/75218 27 | | | SOSSE Capture Page Screenshot | guide_download_archive_html.png 28 | | | SOSSE Go To | http://127.0.0.1/download/https://www.gutenberg.org/cache/epub/75210/pg75210-images-3.epub 29 | | | SOSSE Capture Page Screenshot | guide_download_archive_download.png 30 | -------------------------------------------------------------------------------- /tests/robotframework/guides/02_monitor_website.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | 5 | | *Test Cases* | 6 | | Monitor website 7 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.models import Link ; Link.objects.all().delete() 8 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.document import Document ; Document.objects.wo_content().delete() 9 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.crawl_policy import CrawlPolicy ; CrawlPolicy.objects.all().delete() 10 | | | Run Command | ${SOSSE_ADMIN} | loaddata | ${CURDIR}/../guide_feed_website_monitor.json | shell=True 11 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 12 | | | SOSSE Capture Page Screenshot | guide_feed_website_monitor_policies.png 13 | | | SOSSE Go To | http://127.0.0.1/?l\=fr&ps\=20&c\=1&o\=l&q\=&doc_lang\=&s\=-modified_date&ft1\=inc&ff1\=doc&fo1\=regexp&fv1\=%28Unavailable%7CGateway+Timeout%7CRequest+Timeout%29 14 | # Increase the bottom padding of the top bar to make the Atom dropdown visible 15 | | | Execute Javascript | const top_bar = document.getElementById('top_bar') 16 | | | Execute Javascript | top_bar.style.paddingBottom = '45px' 17 | | | Click Element | id=atom_button 18 | | | Capture Element Screenshot | id=top_bar | guide_feed_website_monitor_error_search.png 19 | -------------------------------------------------------------------------------- /tests/robotframework/guides/03_search_policy.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | 5 | | *Test Cases* | 6 | | Search website 7 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/add/ 8 | | | Input Text | id=id_url_regex | https://example.com/ 9 | | | SOSSE Capture Page Screenshot | guide_search_policy.png 10 | -------------------------------------------------------------------------------- /tests/robotframework/guides/04_auth.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | 5 | | *Test Cases* | 6 | | Authentication 7 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.models import Link ; Link.objects.all().delete() 8 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.document import Document ; Document.objects.wo_content().delete() 9 | | | Run Command | ${SOSSE_ADMIN} | shell | -c | from se.crawl_policy import CrawlPolicy ; CrawlPolicy.objects.all().delete() 10 | | | Run Command | ${SOSSE_ADMIN} | loaddata | ${CURDIR}/../guide_auth/guide_auth_dump.json | shell=True 11 | | | Run Command | rm | -rf | /var/lib/sosse/screenshots 12 | | | Run Command | mkdir | -p | /var/lib/sosse/ 13 | | | Run Command | tar | -x | -C | /var/lib/sosse/ | -f | ${CURDIR}/../guide_auth/guide_auth_html.tar 14 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 15 | | | Click Element | xpath=//table[@id='result_list']//a[contains(., '8083')] 16 | | | Click Link | 🔒 Authentication 17 | | | SOSSE Capture Page Screenshot | guide_authentication_auth.png 18 | | | SOSSE Go To | http://127.0.0.1/?l\=fr&ps\=20&q\=Bernard+Werber&s\=crawl_first&ft1\=inc&ff1\=url&fo1\=contain&fv1\=http%3A%2F%2F192.168.119.11%3A8083%2Fbook%2F# 19 | | | SOSSE Capture Page Screenshot | guide_authentication_search.png 20 | -------------------------------------------------------------------------------- /tests/robotframework/guides/__init__.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | | Suite Setup | Setup 5 | | Suite Teardown | Close All Browsers 6 | 7 | | *Keywords* | 8 | | Setup 9 | | | Set Selenium Timeout | 1 min 10 | | | Login 11 | | | Execute Javascript | window.onerror = function(errorMessage) { dialog(errorMessage); } 12 | -------------------------------------------------------------------------------- /tests/robotframework/requirements.txt: -------------------------------------------------------------------------------- 1 | pyyaml 2 | robotframework 3 | robotframework-seleniumlibrary 4 | selenium<4.17 5 | -------------------------------------------------------------------------------- /tests/robotframework/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_crawlpolicy;\"" 3 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_link;\"" 4 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_document;\"" 5 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_cookie;\"" 6 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_domainsetting;\"" 7 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_searchhistory;\"" 8 | exec /robotframework-venv/bin/robot --exitonerror --exitonfailure tests 9 | #exec /robotframework-venv/bin/robot --exitonerror --exitonfailure 02_*.robot 10 | -------------------------------------------------------------------------------- /tests/robotframework/tests/02_webhooks.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Library | String 4 | | Resource | common.robot 5 | | Resource | webhooks.robot 6 | 7 | | *Test Cases* | 8 | | Trigger non-saved webhook 9 | | | Clear Webhooks 10 | | | SOSSE Go To | http://127.0.0.1/admin/se/webhook/add/ 11 | | | Input Text | id=id_name | Test webhook 12 | | | Element Should Not Be Visible | id=webhook_test_result 13 | | | Click Element | id=webhook_test_button 14 | | | Wait Until Element Is Visible | id=webhook_test_result 15 | | | Wait Until Element Contains | id=webhook_test_result | Webhook configuration error 16 | | | Input Text | id=id_url | http://127.0.0.1:8000/post 17 | | | Click Element | id=webhook_test_button 18 | | | Wait Until Element Contains | id=webhook_test_result | 200 OK 19 | 20 | | Trigger saved webhook 21 | | | Clear Webhooks 22 | | | SOSSE Go To | http://127.0.0.1/admin/se/webhook/add/ 23 | | | Input Text | id=id_name | Test webhook 24 | | | Input Text | id=id_url | http://127.0.0.1:8000/post 25 | | | Click Element | xpath=//input[@value="Save"] 26 | | | ${loc}= | Get Location 27 | | | Should Be Equal | ${loc} | http://127.0.0.1/admin/se/webhook/ 28 | | | Click Link | Test webhook 29 | | | Element Should Not Be Visible | id=webhook_test_result 30 | | | Click Element | id=webhook_test_button 31 | | | Wait Until Element Is Visible | id=webhook_test_result 32 | | | Wait Until Element Contains | id=webhook_test_result | 200 OK 33 | -------------------------------------------------------------------------------- /tests/robotframework/tests/04_crawl_policies.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | common.robot 4 | | Resource | crawl_policy.robot 5 | | Test Setup | Clear Crawl Policies 6 | 7 | | *Test Cases* | 8 | | Duplicate policy 9 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 10 | | | Element Should Be Visible | xpath=//p[@class='paginator' and contains(., '1 Crawl Policy')] 11 | | | Click Element | id=action-toggle 12 | | | Select From List By Label | xpath=//select[@name='action'] | Duplicate 13 | | | Click Element | xpath=//button[text()='Go'] 14 | | | Element Should Be Visible | xpath=//p[@class='paginator' and contains(., '2 Crawl Policies')] 15 | | | Click Link | Copy of (default) 16 | -------------------------------------------------------------------------------- /tests/robotframework/tests/__init__.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Resource | ../tests/common.robot 4 | | Suite Setup | Setup 5 | | Suite Teardown | Tear Down 6 | 7 | | *Keywords* | 8 | | Setup 9 | | | Set Selenium Timeout | 1 min 10 | | | Login 11 | | | Execute Javascript | window.onerror = function(errorMessage) { dialog(errorMessage); } 12 | 13 | | Tear Down 14 | | | Capture Page Screenshot 15 | | | Close All Browsers 16 | -------------------------------------------------------------------------------- /tests/robotframework/tests/common.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | Process 3 | 4 | | *Keywords* | 5 | | Login 6 | | | Open Browser | http://127.0.0.1/ | browser=Firefox | options=add_argument("--headless") 7 | #| | Open Browser | http://127.0.0.1/ | browser=Chrome | options=add_argument("--no-sandbox");options=add_argument("--disable-dev-shm-usage");add_argument("--headless");add_argument('--enable-precise-memory-info');add_argument('--disable-default-apps') 8 | | | Set Window Size | 1024 | 768 9 | | | Set Screenshot Directory | screenshots/ 10 | | | Input Text | id=id_username | admin 11 | | | Input Text | id=id_password | admin 12 | | | Click Element | xpath=//form[@id='login-form']//input[@type='submit'] 13 | | | Wait Until Element Contains | id=menu_username | admin 14 | 15 | 16 | | Hilight | [Arguments] | @{kwargs} 17 | | | Wait Until element Is Visible | @{kwargs} 18 | | | ${elem}= | Get WebElement | @{kwargs} 19 | | | Execute Javascript | arguments[0].style = 'box-shadow: 0px 0px 4px 4px #91ffba; margin: 5px; padding: 4px 8px 0px 8px;' | ARGUMENTS | ${elem} 20 | 21 | | Scroll To Elem | [Arguments] | @{kwargs} 22 | | | Wait Until element Is Visible | @{kwargs} 23 | | | ${elem}= | Get WebElement | @{kwargs} 24 | | | Execute Javascript | window.scroll(0, 0) 25 | | | Execute Javascript | window.scroll(0, arguments[0].getBoundingClientRect().top - 10) | ARGUMENTS | ${elem} 26 | 27 | | Scroll To Bottom 28 | | | Execute Javascript | window.scroll(0, 0) 29 | | | Execute Javascript | window.scroll(0, document.body.scrollHeight) 30 | 31 | | Run Command | [Arguments] | @{args} | &{kwargs} 32 | | | ${ret}= | Run Process | @{args} | &{kwargs} 33 | | | Log | ${ret.stdout} 34 | | | Log | ${ret.stderr} 35 | | | Should Be Equal As Numbers | ${ret.rc} | 0 36 | | | RETURN | ${ret} 37 | 38 | | SOSSE Go To | [Arguments] | @{args} | &{kwargs} | 39 | | | Page Should Not Contain | Traceback | 40 | | | Go To | @{args} | &{kwargs} | 41 | 42 | | SOSSE Wait Until Page Contains | [Arguments] | @{args} | &{kwargs} | 43 | | | Page Should Not Contain | Traceback | 44 | | | Wait Until Page Contains | @{args} | &{kwargs} | 45 | 46 | | SOSSE Capture Page Screenshot | 47 | | | [Arguments] | @{args} | &{kwargs} | 48 | | | Page Should Not Contain | Traceback | 49 | | | Page Should Not Contain | Page not found | 50 | | | Capture Page Screenshot | @{args} | &{kwargs} | 51 | -------------------------------------------------------------------------------- /tests/robotframework/tests/crawl_policy.robot: -------------------------------------------------------------------------------- 1 | | *Keywords* | 2 | 3 | | Clear Crawl Policies 4 | | | SOSSE Go To | http://127.0.0.1/admin/se/crawlpolicy/ 5 | | | ${status} | ${has_docs}= | Run Keyword And Ignore Error | Element Text Should Not Be | id=changelist-form | 1 Crawl Policy 6 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | id=action-toggle 7 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//td[@class='action-checkbox']/input 8 | | | Run Keyword If | '${status}' == 'PASS' | Select From List By Label | xpath=//select[@name='action'] | Delete selected Crawl Policies 9 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//button[contains(., 'Go')] 10 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//input[@type='submit'] 11 | -------------------------------------------------------------------------------- /tests/robotframework/tests/documents.robot: -------------------------------------------------------------------------------- 1 | | *Keywords* | 2 | 3 | | Clear Documents 4 | | | SOSSE Go To | http://127.0.0.1/admin/se/document/ 5 | | | ${status} | ${has_docs}= | Run Keyword And Ignore Error | Element Text Should Not Be | id=changelist-form | 0 documents 6 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | id=action-toggle 7 | | | Run Keyword If | '${status}' == 'PASS' | Select From List By Label | xpath=//select[@name='action'] | Delete selected documents 8 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//button[contains(., 'Go')] 9 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//input[@type='submit'] 10 | -------------------------------------------------------------------------------- /tests/robotframework/tests/tags.robot: -------------------------------------------------------------------------------- 1 | | *Keywords* | 2 | 3 | | Create Tag | [Arguments] | ${name} | ${parent}=None 4 | | | SOSSE Go To | http://127.0.0.1/admin/se/tag/add/ 5 | | | Input Text | id=id_name | ${name} 6 | | | Run Keyword If | '${parent}' != 'None' | Select From List By Label | id=id__ref_node_id | ${parent} 7 | | | Click Element | xpath=//input[@value="Save"] 8 | 9 | | Clear Tags 10 | | | SOSSE Go To | http://127.0.0.1/admin/se/tag/ 11 | | | ${status} | ${has_tags}= | Run Keyword And Ignore Error | Element Text Should Not Be | id=changelist-form | 0 tags 12 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | id=action-toggle 13 | | | Run Keyword If | '${status}' == 'PASS' | Select From List By Label | xpath=//select[@name='action'] | Delete selected tags 14 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//button[contains(., 'Go')] 15 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//input[@type='submit'] 16 | 17 | | Create sample tags 18 | | | Clear Tags 19 | | | Create Tag | Hardware 20 | | | Create Tag | CPU | Hardware 21 | | | Create Tag | GPU | Hardware 22 | | | Create Tag | RAM | Hardware 23 | | | Create Tag | Storage | Hardware 24 | | | Create Tag | Motherboard | Hardware 25 | | | Create Tag | Power Supply | Hardware 26 | | 27 | | | Create Tag | Software 28 | | | Create Tag | Operating System | Software 29 | | | Create Tag | Programming Languages | Software 30 | | | Create Tag | Development Tools | Software 31 | | | Create Tag | Security Software | Software 32 | | 33 | | | Create Tag | Network & Connectivity 34 | | | Create Tag | Network Protocols | Network & Connectivity 35 | | | Create Tag | Internet Speed | Network & Connectivity 36 | | | Create Tag | WiFi Standards | Network & Connectivity 37 | | 38 | | | Create Tag | Peripheral 39 | | | Create Tag | Keyboard | Peripheral 40 | | | Create Tag | Mouse | Peripheral 41 | | | Create Tag | Monitor | Peripheral 42 | | 43 | | | Create Tag | General Usage 44 | | | Create Tag | Gaming PC | General Usage 45 | | | Create Tag | Workstation Build | General Usage 46 | | | Create Tag | AI | General Usage 47 | | | Create Tag | Budget Laptop | General Usage 48 | | | Create Tag | Custom PC Build | General Usage 49 | -------------------------------------------------------------------------------- /tests/robotframework/tests/webhooks.robot: -------------------------------------------------------------------------------- 1 | | *Keywords* | 2 | 3 | | Clear Webhooks 4 | | | SOSSE Go To | http://127.0.0.1/admin/se/webhook/ 5 | | | ${status} | ${has_docs}= | Run Keyword And Ignore Error | Element Text Should Not Be | id=changelist-form | 0 webhooks 6 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | id=action-toggle 7 | | | Run Keyword If | '${status}' == 'PASS' | Select From List By Label | xpath=//select[@name='action'] | Delete selected webhooks 8 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//button[contains(., 'Go')] 9 | | | Run Keyword If | '${status}' == 'PASS' | Click Element | xpath=//input[@type='submit'] 10 | -------------------------------------------------------------------------------- /tests/robotframework/vrt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_crawlpolicy;\"" 3 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_link;\"" 4 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_document;\"" 5 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_cookie;\"" 6 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_domainsetting;\"" 7 | su - postgres -c "psql -d sosse --command=\"DELETE FROM se_searchhistory;\"" 8 | 9 | export VRT_CIBUILDID=$(git rev-parse HEAD) 10 | export VRT_BRANCHNAME=master 11 | export VRT_APIURL=http://172.17.0.1:8080 12 | export VRT_PROJECT="Default project" 13 | export VRT_APIKEY=DEFAULTUSERAPIKEYTOBECHANGED 14 | export VRT_ENABLESOFTASSERT="false" 15 | export VRT_IGNORE_ERRORS="true" 16 | 17 | exec /rf-venv/bin/robot -V config.yaml \ 18 | --debug-file dbg-output \ 19 | -v VRT_CIBUILDID:$(git rev-parse HEAD) \ 20 | -v VRT_BRANCHNAME:master \ 21 | -v VRT_APIURL:http://172.17.0.1:8080 \ 22 | -v VRT_APIKEY:DEFAULTUSERAPIKEYTOBECHANGED \ 23 | -v VRT_PROJECT:Default\ project \ 24 | -v VRT_ENABLESOFTASSERT:false \ 25 | --exitonerror --exitonfailure vrt/ 26 | -------------------------------------------------------------------------------- /tests/robotframework/vrt/__init__.robot: -------------------------------------------------------------------------------- 1 | | *Settings* | 2 | | Library | SeleniumLibrary 3 | | Library | VRT 4 | | Resource | ../tests/common.robot 5 | | Suite Setup | Setup 6 | | Suite Teardown | Close All Browsers 7 | 8 | | *Keywords* | 9 | | Setup 10 | | | Login 11 | -------------------------------------------------------------------------------- /tests/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd "$(dirname $0)"/../ 3 | sudo -E -u www-data ./sosse-admin test -v3 "$@" 4 | -------------------------------------------------------------------------------- /tests/searchhistory.json: -------------------------------------------------------------------------------- 1 | [{"model": "se.searchhistory", "pk": 22, "fields": {"query": "website", "querystring": "l=en&q=website", "date": "2023-04-29T19:08:31.125Z", "user": 1}}, {"model": "se.searchhistory", "pk": 23, "fields": {"query": "!b cats", "querystring": "l=en&q=%21b+cats", "date": "2023-04-29T19:10:56.672Z", "user": 1}}, {"model": "se.searchhistory", "pk": 24, "fields": {"query": "!bi SOSSE", "querystring": "l=en&q=%21bi+SOSSE", "date": "2023-04-29T19:11:11.130Z", "user": 1}}] 2 | -------------------------------------------------------------------------------- /tests/test_app.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | set -e 3 | SERVER_DIR=/tmp/testserver 4 | SRC_SERVER_DIR=/tmp/srctestserver 5 | SOSSE_TEST_DIR="$(dirname "$0")" 6 | CURRENT_DIR="$(pwd)" 7 | 8 | if [ -d "$SERVER_DIR" ]; then 9 | echo "Test server already exists" >&2 10 | else 11 | if [ -d "$SRC_SERVER_DIR" ]; then 12 | cp -r "$SRC_SERVER_DIR" "$SERVER_DIR" 13 | else 14 | cd /tmp 15 | git clone --depth=1 https://gitlab.com/biolds1/httpbin.git $SERVER_DIR 16 | fi 17 | fi 18 | 19 | cd $SERVER_DIR/httpbin 20 | 21 | if [ ! -e /tmp/httpbin-db.sqlite3 ]; then 22 | python3 manage.py migrate 23 | python3 manage.py shell -c "from django.contrib.auth.models import User ; u = User.objects.create(username='admin', is_superuser=True, is_staff=True) ; u.set_password('admin') ; u.save()" 24 | fi 25 | 26 | if [ -e /.dockerenv ]; then 27 | python3 $SERVER_DIR/httpbin/manage.py runserver 0.0.0.0:8000 & 28 | else 29 | if [ ! -e /etc/systemd/system/django-test.service ]; then 30 | cat </etc/systemd/system/django-test.service 31 | [Unit] 32 | Description=TestServer 33 | 34 | [Service] 35 | ExecStart=/usr/bin/python3 $SERVER_DIR/httpbin/manage.py runserver 0.0.0.0:8000 36 | WorkingDirectory=$SERVER_DIR 37 | Restart=always 38 | RestartSec=10 39 | 40 | [Install] 41 | WantedBy=multi-user.target 42 | EOF 43 | 44 | systemctl daemon-reload 45 | systemctl enable django-test.service 46 | systemctl restart django-test.service 47 | else 48 | echo "Service already exist" >&2 49 | fi 50 | fi 51 | 52 | cd "$CURRENT_DIR" 53 | mkdir -p "$SERVER_DIR/httpbin/bin/static/" 54 | rsync -avz "$SOSSE_TEST_DIR/pages" "$SERVER_DIR/httpbin/bin/static/" 55 | -------------------------------------------------------------------------------- /tests/vrt.md: -------------------------------------------------------------------------------- 1 | ### Setup 2 | 3 | Install / run https://github.com/Visual-Regression-Tracker/Visual-Regression-Tracker 4 | 5 | ### Running 6 | 7 | ``` 8 | make vrt 9 | ``` 10 | -------------------------------------------------------------------------------- /tests/wait_for_pg.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | while true 3 | do 4 | su postgres -c 'psql --command="select 1"' && break 5 | sleep 1s 6 | done 7 | --------------------------------------------------------------------------------