├── .env.example ├── .env.markdown ├── .github └── workflows │ ├── create-release.yml │ └── run-tests.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.md ├── docker-compose.yml ├── docs ├── README.md └── history4feed.png ├── history4feed ├── __init__.py ├── app │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── autoschema.py │ ├── migrations │ │ ├── 0001_initial.py │ │ ├── 0002_feed_freshness_alter_feed_feed_type.py │ │ ├── 0003_alter_feed_description.py │ │ ├── 0004_alter_fulltextjob_status_alter_job_state.py │ │ ├── 0005_feed_datetime_modified.py │ │ └── __init__.py │ ├── models.py │ ├── openapi_params.py │ ├── serializers.py │ ├── settings.py │ ├── tests.py │ ├── utils.py │ └── views.py ├── asgi.py ├── h4fscripts │ ├── __init__.py │ ├── build_rss.py │ ├── celery.py │ ├── exceptions.py │ ├── h4f.py │ ├── sitemap_helpers.py │ ├── task_helper.py │ ├── wayback_helpers.py │ └── xml_utils.py ├── settings.py ├── urls.py └── wsgi.py ├── manage.py ├── pyproject.toml ├── requirements.txt ├── run.sh └── tests ├── README.md ├── __init__.py ├── requirements.txt ├── st ├── .env.schemathesis ├── __init__.py ├── hooks.py └── st.py ├── test_01_add_feeds.py ├── test_02_add_post.py ├── test_03_delete_post.py ├── test_04_delete_feed.py ├── test_05_post_filters.py ├── test_06_patch_feed.py ├── test_07_patch_post.py ├── test_99_delete_all_feeds.py └── utils.py /.env.example: -------------------------------------------------------------------------------- 1 | # POSTGRES 2 | POSTGRES_HOST= 3 | POSTGRES_PORT= 4 | POSTGRES_DB= 5 | POSTGRES_USER= 6 | POSTGRES_PASSWORD= 7 | #django settings 8 | DJANGO_SECRET= 9 | DJANGO_DEBUG= 10 | DJANGO_ALLOWED_HOSTS= 11 | DJANGO_CORS_ALLOW_ALL_ORIGINS= 12 | DJANGO_CORS_ALLOWED_ORIGINS= 13 | # CELERY 14 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP= 15 | # SCRAPE BACKFILL SETTINGS 16 | EARLIEST_SEARCH_DATE= 17 | # PROXY 18 | SCRAPFILE_APIKEY= 19 | # SETTINGS TO AVOID RATE LIMITS 20 | WAYBACK_SLEEP_SECONDS= 21 | WAYBACK_BACKOFF_TIME= 22 | REQUEST_RETRY_COUNT= 23 | # API SETTINGS 24 | DEFAULT_PAGE_SIZE= 25 | MAX_PAGE_SIZE= 26 | # SERPER 27 | SERPER_API_KEY= -------------------------------------------------------------------------------- /.env.markdown: -------------------------------------------------------------------------------- 1 | # Environmental file info 2 | 3 | If you're running in production, you should set these securely. 4 | 5 | However, if you just want to experiment, set the following values 6 | 7 | ## Django Settings 8 | 9 | These are all Django settings, defined in `history4feed/settings.py` 10 | 11 | * `DJANGO_SECRET`: `insecure_django_secret` 12 | * `DJANGO_DEBUG`: `True` 13 | * `DJANGO_ALLOWED_HOSTS`: BLANK 14 | * `DJANGO_CORS_ALLOW_ALL_ORIGINS`: `True` 15 | * `DJANGO_CORS_ALLOWED_ORIGINS`: LEAVE EMPTY 16 | 17 | ## Postgres Settings 18 | 19 | These are all Django settings, defined in `history4feed/settings.py` 20 | 21 | * `POSTGRES_HOST`: `pgdb` 22 | * `POSTGRES_PORT`: BLANK 23 | * `POSTGRES_DB`: `postgres` 24 | * `POSTGRES_USER`: `postgres` 25 | * `POSTGRES_PASSWORD`: `postgres` 26 | 27 | ## Celery settings 28 | 29 | * `CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP`: `1` 30 | 31 | ## history4feed API settings 32 | 33 | These define how the API behaves. 
34 | 35 | * `MAX_PAGE_SIZE`: `50` 36 | * This is the maximum number of results the API will ever return on a single page 37 | * `DEFAULT_PAGE_SIZE`: `50` 38 | * The default page size of results returned by the API 39 | 40 | ## Search Index Mode (Serper) 41 | 42 | Search index mode uses the [Serper API](https://serper.dev/) to scrape search results. 43 | 44 | * `SERPER_API_KEY` 45 | * [Get your key here](https://serper.dev/api-key). 46 | 47 | ## Scrape backfill settings 48 | 49 | * `EARLIEST_SEARCH_DATE`: `2020-01-01T00:00:00Z` 50 | * Determines how far history4feed will backfill posts for newly added feeds. e.g. `EARLIEST_SEARCH_DATE=2020-01-01T00:00:00Z` will import all posts with a publish date >= `2020-01-01T00:00:00Z` 51 | 52 | ## Proxy settings 53 | 54 | * `SCRAPFILE_APIKEY`: YOUR_API_KEY 55 | * We strongly recommend using the [ScrapFly](https://scrapfly.io/) proxy service with history4feed. Though we have no affiliation with them, it is the best proxy service we've tested, and we have therefore built support for it into history4feed. 56 | 57 | ## Settings to avoid rate limits if not using Scrapfly 58 | 59 | If you're not using a proxy, it is very likely you'll run into rate limits on the Wayback Machine and the blogs you're requesting the full text from. You should therefore consider the following options: 60 | 61 | * `WAYBACK_SLEEP_SECONDS`: `45` 62 | * This is useful when a large number of posts is returned. This sets the time between each request to get the full text of the article, to reduce servers blocking robotic requests. 63 | * `REQUEST_RETRY_COUNT`: `3` 64 | * This is useful when a large number of posts is returned. This sets the number of retries when a non-200 response is returned. -------------------------------------------------------------------------------- /.github/workflows/create-release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | run-name: creating release 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | create-release: 11 | runs-on: ubuntu-latest 12 | permissions: 13 | contents: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: "3.11" 20 | - name: Install pypa/build 21 | run: python3 -m pip install build --user 22 | 23 | - name: Build a binary wheel and a source tarball 24 | run: python3 -m build 25 | 26 | - name: Make release 27 | env: 28 | GITHUB_TOKEN: ${{ github.token }} 29 | run: | 30 | REF_NAME="${{ github.ref_name }}-$(date +"%Y-%m-%d-%H-%M-%S")" 31 | gh release create "$REF_NAME" --repo '${{ github.repository }}' --notes "" 32 | gh release upload "$REF_NAME" dist/** --repo '${{ github.repository }}' 33 | -------------------------------------------------------------------------------- /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | test-schema-thesis: 11 | runs-on: ubuntu-latest 12 | environment: test_pipeline 13 | 14 | steps: 15 | - name: Checkout repository 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 1 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: "3.11" 24 | 25 | 26 | - name: Set .env for docker-compose 27 | run: | 28 | echo "EARLIEST_SEARCH_DATE=$(date -u -d yesterday +'%Y-%m-%dT%H:%M:%SZ')" >> .env 29 | echo
"SCRAPFLY_APIKEY=${{secrets.SCRAPFLY_APIKEY}}" >> .env 30 | 31 | 32 | cat tests/st/.env.schemathesis >> .env 33 | 34 | echo ==== env file start ===== 35 | cat .env 36 | echo 37 | echo ==== env file end ===== 38 | 39 | 40 | - name: Start docker-compose 41 | uses: hoverkraft-tech/compose-action@v2.0.2 42 | with: 43 | compose-file: | 44 | docker-compose.yml 45 | compose-flags: 46 | --env-file .env 47 | -p h4f-action 48 | 49 | - name: Get IP addresses 50 | id: get_ip 51 | run: | 52 | IP_ADDRESS=$(docker network inspect -f '{{range.IPAM.Config}}{{.Gateway}}{{end}}' h4f-action_default) 53 | echo "ip_address=$IP_ADDRESS" >> "$GITHUB_OUTPUT" 54 | echo "IP_ADDRESS=$IP_ADDRESS" >> "$GITHUB_OUTPUT" 55 | echo "SERVICE_BASE_URL=http://$IP_ADDRESS:8002/" >> "$GITHUB_OUTPUT" 56 | cat "$GITHUB_OUTPUT" 57 | 58 | - name: Wait for server to start 59 | run: | 60 | RETRY_DELAY=3 61 | RETRY_COUNT=10 62 | echo "Waiting for server to start" 63 | curl --retry-delay $RETRY_DELAY --retry $RETRY_COUNT --retry-connrefused ${{ steps.get_ip.outputs.SERVICE_BASE_URL }} > /dev/null 64 | if [ $? -ne 0 ]; then 65 | echo "exiting after waiting $(( $RETRY_DELAY * $RETRY_COUNT )) seconds for server to start" 66 | exit 1 67 | fi 68 | 69 | 70 | 71 | - name: test all endpoints 1 72 | id: test-endpoints 73 | run: | 74 | pip install -r tests/requirements.txt 75 | export SERVICE_BASE_URL="${{ steps.get_ip.outputs.SERVICE_BASE_URL }}" 76 | 77 | pytest tests/ 78 | 79 | - name: run schemathesis 80 | uses: schemathesis/action@v1 81 | env: 82 | SCHEMATHESIS_HOOKS: tests.st.hooks 83 | with: 84 | schema: ${{ steps.get_ip.outputs.SERVICE_BASE_URL }}/api/schema/ 85 | checks: all 86 | wait-for-schema: '30' 87 | args: '--generation-allow-x00 false --show-trace' 88 | version: 3.39.15 89 | 90 | - name: test delete all 91 | id: test-endpoints-2 92 | run: | 93 | pip install -r tests/requirements.txt 94 | export DELETE_ALL_FEEDS=true 95 | export SERVICE_BASE_URL="${{ steps.get_ip.outputs.SERVICE_BASE_URL }}" 96 | pytest tests/test_99_delete_all_feeds.py 97 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | staticfiles 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
161 | #.idea/ 162 | 163 | config* 164 | 165 | # ignore venv in config 166 | 167 | history4feed-venv/ 168 | 169 | # ignore created dirs with generated data 170 | 171 | logs/ 172 | output/ 173 | 174 | # mac files 175 | .DS_Store 176 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | ENV PYTHONUNBUFFERED=1 3 | WORKDIR /usr/src/app 4 | COPY requirements.txt ./ 5 | RUN pip install -r requirements.txt 6 | 7 | COPY . /usr/src/app -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2020 DOGESEC (https://www.dogesec.com/) 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | readability-lxml = "*" 8 | python-dateutil = "*" 9 | brotlipy = "*" 10 | python-dotenv = "*" 11 | djangorestframework = "*" 12 | drf-spectacular = "*" 13 | lxml_html_clean = "*" 14 | celery = "*" 15 | redis = "*" 16 | psycopg2-binary = "*" 17 | gunicorn = "*" 18 | django-filter = "*" 19 | requests = "*" 20 | fake-useragent = "==1.5.1" 21 | 22 | [dev-packages] 23 | autopep8 = "*" 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # history4feed 2 | 3 | [![codecov](https://codecov.io/gh/muchdogesec/history4feed/graph/badge.svg?token=3Z5LELB8OP)](https://codecov.io/gh/muchdogesec/history4feed) 4 | 5 | ## Overview 6 | 7 | ![](docs/history4feed.png) 8 | 9 | It is common for feeds (RSS or ATOM) to only include a limited number of posts. I generally see the latest 3 - 5 posts of a blog in a feed. For blogs that have been operating for years, this means potentially thousands of posts are missed. 10 | 11 | There is no way to page through historic articles using an RSS or ATOM feed (they were not designed for this), which means the first poll of the feed will only contain the limited number of articles in the feed. This limit is defined by the blog owner. 12 | 13 | history4feed can be used to create a complete history for a blog and output it as an RSS feed. 14 | 15 | history4feed offers an API interface that: 16 | 17 | 1. takes an RSS / ATOM feed URL 2 | 18 | 2. downloads a Wayback Machine archive for the feed 19 | 3. identifies all unique blog posts in the historic feeds downloaded 20 | 4. downloads an HTML version of the article content on each page 21 | 5. stores the post record in the database 22 | 6. exposes the posts as JSON or XML RSS 23 | 24 | ## tl;dr 25 | 26 | [![history4feed](https://img.youtube.com/vi/z1ATbiecbg4/0.jpg)](https://www.youtube.com/watch?v=z1ATbiecbg4) 27 | 28 | [Watch the demo](https://www.youtube.com/watch?v=z1ATbiecbg4). 29 | 30 | ## Install 31 | 32 | ### Download and configure 33 | 34 | ```shell 35 | # clone the latest code 36 | git clone https://github.com/muchdogesec/history4feed 37 | ``` 38 | 39 | ### Configuration options 40 | 41 | history4feed has various settings that are defined in an `.env` file. 42 | 43 | To create a template for the file: 44 | 45 | ```shell 46 | cp .env.example .env 47 | ``` 48 | 49 | To see more information about how to set the variables, and what they do, read the `.env.markdown` file.
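For a quick local experiment, a minimal `.env` could look like the following. These values are only the insecure defaults suggested in `.env.markdown`, not something to run in production; keys not shown here (such as the proxy and Serper settings) can be left blank.

```shell
# POSTGRES
POSTGRES_HOST=pgdb
POSTGRES_DB=postgres
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
# DJANGO
DJANGO_SECRET=insecure_django_secret
DJANGO_DEBUG=True
DJANGO_CORS_ALLOW_ALL_ORIGINS=True
# CELERY
CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=1
# SCRAPE BACKFILL SETTINGS
EARLIEST_SEARCH_DATE=2020-01-01T00:00:00Z
# API SETTINGS
DEFAULT_PAGE_SIZE=50
MAX_PAGE_SIZE=50
```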
50 | 51 | ### Build the Docker Image 52 | 53 | ```shell 54 | sudo docker compose build 55 | ``` 56 | 57 | ### Start the server 58 | 59 | ```shell 60 | sudo docker compose up 61 | ``` 62 | 63 | ### Access the server 64 | 65 | The webserver (Django) should now be running on: http://127.0.0.1:8002/ 66 | 67 | You can access the Swagger UI for the API in a browser at: http://127.0.0.1:8002/api/schema/swagger-ui/ 68 | 69 | ## Useful supporting tools 70 | 71 | * [Full Text, Full Archive RSS Feeds for any Blog](https://www.dogesec.com/blog/full_text_rss_atom_blog_feeds/) 72 | * [An up-to-date list of threat intel blogs that post cyber threat intelligence research](https://github.com/muchdogesec/awesome_threat_intel_blogs) 73 | * [Donate to the Wayback Machine](https://archive.org/donate) 74 | 75 | ## Support 76 | 77 | [Minimal support provided via the DOGESEC community](https://community.dogesec.com/). 78 | 79 | ## License 80 | 81 | [Apache 2.0](/LICENSE). -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | django: 3 | image: history4feed 4 | build: . 5 | command: > 6 | bash -c " 7 | python manage.py collectstatic --no-input && 8 | python manage.py makemigrations && 9 | python manage.py migrate && 10 | gunicorn history4feed.wsgi:application --bind 0.0.0.0:8002 --reload 11 | " 12 | volumes: 13 | - .:/usr/src/app/ 14 | ports: 15 | - 8002:8002 16 | environment: 17 | - DEBUG=1 18 | - CELERY_BROKER_URL=redis://redis:6379/0 19 | env_file: 20 | - ./.env 21 | depends_on: 22 | pgdb: 23 | condition: service_healthy 24 | redis: 25 | condition: service_started 26 | celery: 27 | image: history4feed 28 | build: . 29 | command: > 30 | bash -c " 31 | celery -A history4feed.h4fscripts worker -l INFO 32 | " 33 | volumes: 34 | - .:/usr/src/app 35 | environment: 36 | - DEBUG=1 37 | - CELERY_BROKER_URL=redis://redis:6379/0 38 | - result_backend=redis://redis:6379/1 39 | env_file: 40 | - ./.env 41 | depends_on: 42 | - django 43 | - redis 44 | pgdb: 45 | image: postgres 46 | env_file: 47 | - ./.env 48 | volumes: 49 | - pgdata:/var/lib/postgresql/data/ 50 | healthcheck: 51 | test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"] 52 | interval: 10s 53 | retries: 5 54 | start_period: 30s 55 | timeout: 10s 56 | redis: 57 | image: "redis:alpine" 58 | volumes: 59 | pgdata: -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Basics of RSS 2 | 3 | RSS stands for Really Simple Syndication. Simply put, RSS is a standardized format using a computer (and human) readable format that shows what has changed for a website, and is especially used by blogs, podcasts, news sites, etc, for this reason. 4 | 5 | Here is a sample of an RSS feed from The Record by the Recorded Future team; `https://therecord.media/feed/`. 6 | 7 | Note, in many cases a blog will clearly show their RSS (or ATOM) feed URL, but not all. Whilst not all blogs have RSS feeds, if you open up a browser, navigate to the blog, and click view page source, you can usually find the feed address under the `link rel="alternate" type="application/rss+xml"` or `application/atom+xml` HTML tag. 8 | 9 | Here's an example... 
10 | 11 | ```shell 12 | curl "https://krebsonsecurity.com/" > demo_1.html 13 | ``` 14 | 15 | ```html 16 | <link rel="alternate" type="application/rss+xml" href="https://krebsonsecurity.com/feed/" /> 17 | <link rel="alternate" type="application/rss+xml" href="https://krebsonsecurity.com/comments/feed/" /> 18 | ``` 19 | 20 | Note, you might see more than one feed; above, one is for posts, the other for blog comments. 21 | 22 | It's not always that simple to detect the feed URL... 23 | 24 | The Recorded Future Record RSS feed; 25 | 26 | ```shell 27 | curl "https://therecord.media/news" > demo_2.html 28 | ``` 29 | 30 | Is nestled in custom properties... 31 | 32 | ```js 33 | "rssLink":{"id":12,"target":"_blank","externalUrl":"https://therecord.media/feed/" 34 | ``` 35 | 36 | Sometimes a feed will exist, but is not exposed in the HTML (in which case you can try and guess the URL pattern for it). Some blogs just have no feeds. 37 | 38 | In some cases, a blog will also have feeds per category (vs getting the entire blog, which you might not always want), which you can find using the category/tag/etc. URL. e.g. 39 | 40 | ```shell 41 | curl "https://blogs.infoblox.com/category/cyber-threat-intelligence/" > demo_3.html 42 | ``` 43 | 44 | ```html 45 | <link rel="alternate" type="application/rss+xml" href="https://blogs.infoblox.com/feed/" /> 46 | <link rel="alternate" type="application/rss+xml" href="https://blogs.infoblox.com/comments/feed/" /> 47 | <link rel="alternate" type="application/rss+xml" href="https://blogs.infoblox.com/category/cyber-threat-intelligence/feed/" /> 48 | ``` 49 | 50 | Generally an RSS feed has an XML structure containing at least the following items; 51 | 52 | ```xml 53 | <?xml version="1.0" encoding="UTF-8" ?> 54 | <rss version="2.0"> 55 | 56 | <channel> 57 | <title>W3Schools Home Page</title> 58 | <link>https://www.w3schools.com</link> 59 | <description>Free web building tutorials</description> 60 | <item> 61 | <title>RSS Tutorial</title> 62 | <link>https://www.w3schools.com/xml/xml_rss.asp</link> 63 | <description>New RSS tutorial on W3Schools</description> 64 | <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate> 65 | </item> 66 | <item> 67 | <title>XML Tutorial</title> 68 | <link>https://www.w3schools.com/xml</link> 69 | <description>New XML tutorial on W3Schools</description> 70 | <pubDate>Tue, 10 Jun 2003 11:34:12 GMT</pubDate> 71 | </item> 72 | 73 | </channel> 74 | </rss> 75 | ``` 76 | 77 | The `<channel>` tags capture the entire feed including metadata about the feed (`title`, `link`, and `description` in this case). There are many other optional elements that can be included in the `<channel>` tags, [as defined here](https://www.rssboard.org/rss-specification). 78 | 79 | Each article in the feed is defined inside each `<item>` tag with sub-elements, generally the most important being: 80 | 81 | * `title`: The title of the post / article 82 | * `link`: The URL of the post / article 83 | * `description`: The article content 84 | * `pubDate`: The date the article was published 85 | 86 | There are many other optional elements that can be included in the `<item>` tags, [as defined here](https://www.rssboard.org/rss-specification). 87 | 88 | ## Basics of ATOM 89 | 90 | Atom is a similar format to RSS and used for the same reasons. It is a slightly newer format than RSS (although almost 20 years old) and designed to cover some of the shortcomings of RSS. 91 | 92 | Here is a sample of an ATOM feed from the 0patch blog... 93 | 94 | ```shell 95 | curl "https://blog.0patch.com/" > demo_4.html 96 | ``` 97 | 98 | ```html 99 | <link rel="alternate" type="application/atom+xml" href="https://blog.0patch.com/feeds/posts/default" /> 100 | <link rel="alternate" type="application/rss+xml" href="https://blog.0patch.com/feeds/posts/default?alt=rss" /> 101 | <link rel="alternate" type="application/atom+xml" href="https://blog.0patch.com/feeds/comments/default" /> 102 | ``` 103 | 104 | Note, an RSS version is also available above; `application/rss+xml` vs `application/atom+xml`. 105 | 106 | An ATOM feed has a similar XML structure to RSS; however, you will notice some of the element names are different. 107 | 108 | ```xml 109 | <?xml version="1.0" encoding="utf-8"?> 110 | <feed xmlns="http://www.w3.org/2005/Atom"> 111 | 112 | <title>Example Feed</title> 113 | <link href="http://example.org/"/> 114 | <updated>2003-12-13T18:30:02Z</updated> 115 | <author> 116 | <name>John Doe</name> 117 | </author> 118 | <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id> 119 | 120 | <entry> 121 | <title>Atom-Powered Robots Run Amok</title> 122 | <link href="http://example.org/2003/12/13/atom03"/> 123 | <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> 124 | <updated>2003-12-13T18:30:02Z</updated> 125 | <published>2003-12-13T18:30:02Z</published> 126 | <summary>Something</summary> 127 | <content>Some text.</content> 128 | </entry> 129 | </feed> 130 | ``` 131 | 132 | The blog information is captured at the top of the document.
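As a rough illustration of reading such a feed (this snippet is not part of history4feed, and the feed URL is assumed to be the default Blogger ATOM feed for the 0patch blog referenced above), the entries can be listed with the Python standard library:

```python
import urllib.request
import xml.etree.ElementTree as ET

ATOM = "{http://www.w3.org/2005/Atom}"

# fetch the raw ATOM XML (any ATOM feed URL will do)
with urllib.request.urlopen("https://blog.0patch.com/feeds/posts/default") as response:
    feed = ET.fromstring(response.read())

# the feed-level metadata sits at the top of the document
print("feed title:", feed.findtext(f"{ATOM}title"))

# each <entry> is one post
for entry in feed.findall(f"{ATOM}entry"):
    print(entry.findtext(f"{ATOM}published"), entry.findtext(f"{ATOM}title"))
```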
133 | 134 | Each article in the feed is defined inside each `<entry>` tag with sub-elements, generally the most important being: 135 | 136 | * `title`: The title of the post / article 137 | * `id`: The UUID of the post 138 | * `link`: The URL of the post / article 139 | * `published`: The date the article was published 140 | * `content`: The article content 141 | 142 | There are many other optional elements that can be included in the `<entry>` tags, [as defined here](https://validator.w3.org/feed/docs/atom.html). 143 | 144 | ## The solution 145 | 146 | There are two ways I came up with to get historic posts from a blog; 147 | 148 | 1. Scrape the blog for historic posts. This is the most accurate way to do it, though given the different structure of blogs and websites, this can become complex, requiring a fair bit of manual scraping logic to be written for each blog you want to follow. 149 | 2. [Inspired by this Reddit thread](https://www.reddit.com/r/webscraping/comments/zxduid/python_library_to_scrape_rssfeeds_from/), use the Wayback Machine's archive. Often the Wayback Machine will have captured snapshots of a feed (though not always). For example, `https://therecord.media/feed/` has been captured [187 times between November 1, 2020 and August 12, 2022](https://web.archive.org/web/20220000000000*/https://therecord.media/feed/). 150 | 151 | Whilst the Wayback Machine will completely miss some blog archives (a particular problem for smaller sites that are less likely to be regularly indexed by the WBM), and potentially miss certain feed items where the RSS feed updates faster than the WBM re-indexes the site, I chose this approach as it is currently the most scalable way I could come up with to backfill history (and most of the requirements for my use-cases were from high profile sites with a fairly small publish rate). 152 | 153 | [Waybackpack](https://github.com/jsvine/waybackpack) is a command-line tool that lets you download the entire Wayback Machine archive for a given URL for this purpose. 154 | 155 | Here is an example of how to use it with The Record Feed; 156 | 157 | ```shell 158 | python3 -m venv tutorial_env 159 | source tutorial_env/bin/activate 160 | pip3 install waybackpack 161 | waybackpack https://therecord.media/feed/ -d ~/Downloads/therecord_media_feed --from-date 2015 --uniques-only 162 | ``` 163 | 164 | In the above command I am requesting all unique feed pages downloaded by the Wayback Machine (`--uniques-only`) from 2015 (`--from-date 2015`) from the feed URL (`https://therecord.media/feed/`). 165 | 166 | Which produces about 100 unique `index.html` files (where `index.html` is the actual RSS feed). They are nested in folders named with the index datetime (time captured by WBM) in the format `YYYYMMDDHHMMSS` like so; 167 | 168 | ``` 169 | 170 | ~/Downloads/therecord_media_feed 171 | ├── 20220808162900 172 | │ └── therecord.media 173 | │ └── feed 174 | │ └── index.html 175 | ├── 20220805213430 176 | │ └── therecord.media 177 | │ └── feed 178 | │ └── index.html 179 | ... 180 | └── 20201101220102 181 | └── therecord.media 182 | └── feed 183 | └── index.html 184 | 185 | ``` 186 | It is important to point out unique entries just mean the `index.html` files have at least one difference. That is to say, much of the file can actually be the same (and include the same articles). Also whilst saved as .html documents, the content is actually pure .xml.
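Below is a minimal sketch of the kind of de-duplication this enables. It is illustrative only (not history4feed's actual implementation) and assumes the waybackpack output directory shown above:

```python
import glob
import os
import xml.etree.ElementTree as ET

unique_items = {}

# every snapshot saved by waybackpack is an index.html file whose content is actually RSS XML
pattern = os.path.expanduser("~/Downloads/therecord_media_feed/*/therecord.media/feed/index.html")
for path in sorted(glob.glob(pattern)):
    root = ET.parse(path).getroot()
    # each <item> is one post; its <link> is used as the unique key
    for item in root.findall("./channel/item"):
        link = item.findtext("link")
        if link and link not in unique_items:
            unique_items[link] = item  # keep the first copy of each post seen

print(len(unique_items), "unique posts across all snapshots")
```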
187 | 188 | Take `20220808162900 > therecord.media > index.html` and `20220805213430 > therecord.media > index.html` 189 | 190 | Both of these files contain the same item; 191 | 192 | ```xml 193 | <item> 194 | <title>Twitter confirms January breach, urges pseudonymous accounts to not add email or phone number</title> 195 | <link>https://therecord.media/twitter-confirms-january-breach-urges-pseudonymous-accounts-to-not-add-email-or-phone-number/</link> 196 | ``` 197 | 198 | history4feed looks at all unique `<item>` elements in the downloaded `index.html` files to find the unique `<link>`s. 199 | 200 | Note, this blog is in RSS format. 201 | 202 | Here's another example, this time using an ATOM feed; 203 | 204 | ```shell 205 | waybackpack https://www.schneier.com/feed/atom/ -d ~/Downloads/schneier_feed --from-date 2015 --uniques-only 206 | ``` 207 | 208 | Looking at a snippet from one of the `index.html` files; 209 | 210 | ```xml 211 | <entry> 212 | <author> 213 | <name>Bruce Schneier</name> 214 | </author> 215 | <title><![CDATA[Friday Squid Blogging: Vegan Chili Squid]]></title> 216 | 217 | <id>https://www.schneier.com/?p=60711</id> 218 | <published>2021-01-04T16:50:54Z</published> 219 | <updated>2021-01-22T22:19:15Z</updated> 220 | ``` 221 | 222 | Here, history4feed looks at the `<id>` property value (ATOM) or `<link>` tags (RSS) for the articles in the feeds and passes them to readability-lxml. 235 | 236 | The result is then reprinted in the `description` or `content` field depending on feed type, overwriting the potentially partial content that it originally contained. 237 | 238 | Note, history4feed cannot detect if a feed is full or partial so will always request the full content for all items via readability-lxml, regardless of whether the feed content is partial or full. 239 | 240 | ## Dealing with encoding in post content 241 | 242 | For ATOM properties; 243 | 244 | * `title`: The title of the post / article 245 | * `content`: The article content 246 | 247 | And for RSS properties; 248 | 249 | * `title`: The title of the post / article 250 | * `description`: The article content 251 | 252 | The data is typically printed in one of three ways, either; 253 | 254 | * Encoded: e.g. contains `&gt;` vs `>` 255 | * Decoded Raw: standard HTML tags 256 | * Decoded CDATA: the actual Decoded Raw HTML is inside `<![CDATA[ ... ]]>` tags 257 | 258 | As an example, encoded 259 | 260 | ```html 261 | &lt;img src="https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg"&gt; 262 | ``` 263 | 264 | Which as decoded raw HTML looks as follows 265 | 266 | ```html 267 | <img src="https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg"> 268 | ``` 269 | 270 | Which as decoded CDATA looks like 271 | 272 | ```html 273 | <![CDATA[<img src="https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg">]]> 274 | ``` 275 | 276 | In the responses provided by history4feed, the XML endpoint will return encoded HTML, the JSON response will return decoded HTML. 277 | 278 | ## Live feed data (data not from WBM) 279 | 280 | In addition to the historical feed information pulled by the Wayback Machine, history4feed also includes the latest posts in the live feed URL. 281 | 282 | Live feed data always takes precedence. history4feed will remove duplicate entries found in the Wayback Machine response also present in the live feed, and will instead use the live feed version by default. 283 | 284 | ## Rebuilding the feed (for XML API output) 285 | 286 | history4feed stores data in the database as JSON. 287 | 288 | However, to support an RSS XML API endpoint (that can be used with a feed reader), history4feed converts all feeds and their content into a single RSS formatted XML file at request time. 289 | 290 | RSS is always the output, regardless of whether the input was ATOM or RSS.
291 | 292 | The RSS files for each feed contain a simplified header; 293 | 294 | ```xml 295 | <?xml version="1.0" encoding="UTF-8"?> 296 | <rss version="2.0"> 297 | <channel> 298 | <title>CHANNEL.TITLE (RSS) / FEED.TITLE (ATOM)</title> 299 | <description>CHANNEL.DESCRIPTION (RSS) / FEED.SUBTITLE (ATOM)</description> 300 | <link>FEED URL ENTERED BY USER</link> 301 | <lastBuildDate>SCRIPT EXECUTION TIME</lastBuildDate> 302 | <generator>https://www.github.com/history4feed</generator> 303 | 304 | 305 | 306 | ``` 307 | 308 | Each item to be printed between `<item>` tags is rebuilt as follows; 309 | 310 | ```xml 311 | <item> 312 | <title>CHANNEL.ITEM.TITLE (RSS) / FEED.ENTRY.TITLE (ATOM)</title> 313 | <description>CHANNEL.ITEM.DESCRIPTION (RSS) / FEED.ENTRY.CONTENT (ATOM) EITHER ENCODED OR DECODED BASED ON USER SETTING -- THIS IS THE FULL BLOG POST AFTER FULL TEXT EXTRACTED</description> 314 | <link>CHANNEL.ITEM.LINK (RSS) / FEED.ENTRY.LINK (ATOM)</link> 315 | <pubDate>CHANNEL.ITEM.PUBDATE (RSS) / FEED.ENTRY.PUBLISHED (ATOM)</pubDate> 316 | <author>CHANNEL.ITEM.AUTHOR (RSS) / FEED.ENTRY.AUTHOR (ATOM)</author> 317 | <category>CHANNEL.ITEM.CATEGORY [N] (RSS) / FEED.ENTRY.CATEGORY [N] (ATOM)</category> 318 | <category>CHANNEL.ITEM.CATEGORY [N] (RSS) / FEED.ENTRY.CATEGORY [N] (ATOM)</category> 319 | </item> 320 | ``` 321 | 322 | ## Dealing with feed validation on input 323 | 324 | ATOM feeds are XML documents. ATOM feeds can be validated by checking for the `<feed>` header tag, and RSS feeds by checking for the `<rss>` tag, in the header of the document. e.g. https://www.hackread.com/feed/ 327 | 328 | Feeds are validated to ensure they contain this data before any processing is carried out. 329 | 330 | For example, the source of https://github.com/signalscorps/history4feed/ does not show an RSS or ATOM feed, so would return an error. 331 | 332 | ## Dealing with IP throttling during full text requests 333 | 334 | Many sites will stop robotic requests to their content. As the full text function of history4feed relies on accessing each blog post individually, this can result in potentially thousands of requests to the Wayback Machine, which have a high risk of being blocked. 335 | 336 | history4feed has two potential workarounds to solve this problem; 337 | 338 | ### 1. Use a proxy 339 | 340 | history4feed supports the use of [ScrapFly](https://scrapfly.io/). 341 | 342 | This is a paid service ([with a free tier](https://scrapfly.io/pricing)). In my own research, it's the best proxy for web scraping. 343 | 344 | You will need to register for an account and grab your API key. 345 | 346 | Note, due to many sites blocking access to Russian IPs, the request includes the following proxy locations only; 347 | 348 | ```shell 349 | country=us,ca,mx,gb,fr,de,au,at,be,hr,cz,dk,ee,fi,ie,se,es,pt,nl 350 | ``` 351 | 352 | ### 2. Use inbuilt app settings 353 | 354 | It's best to request only what you need, and also slow down the rate at which the content is requested (so the requests look more like a human's). 355 | 356 | history4feed supports the following options; 357 | 358 | * sleep times: sets the time between each request to get the full post text 359 | * time range: an earliest and latest post time can be set, reducing the number of items returned in a single script run. Similarly, you can reduce the content by ignoring entries in the live feed. 360 | * retries: by default, when in full text mode history4feed will retry the page a certain number of times in case of error. If it still fails after the retry count is reached, the script will fail. You can change the retries as you require. 361 | 362 | ## A note on error handling 363 | 364 | Due to the way old feeds are pulled from WBM, it is likely some will now be deleted (404s). Similarly, the site might reject requests (403's -- see proxy use as a solution to this).
365 | 366 | history4feed will soft handle these errors and log the failure, including the HTTP status code and the particular URL that failed. You can view the logs for each run in the `logs/` directory. 367 | 368 | This means that if it's required you can go back and get this post manually. However, one limitation of soft error handling is you won't be able to do this using the same history4feed install though. -------------------------------------------------------------------------------- /docs/history4feed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/docs/history4feed.png -------------------------------------------------------------------------------- /history4feed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/__init__.py -------------------------------------------------------------------------------- /history4feed/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/app/__init__.py -------------------------------------------------------------------------------- /history4feed/app/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /history4feed/app/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class AppConfig(AppConfig): 5 | default_auto_field = 'django.db.models.BigAutoField' 6 | name = 'history4feed.app' 7 | label = 'history4feed' 8 | -------------------------------------------------------------------------------- /history4feed/app/autoschema.py: -------------------------------------------------------------------------------- 1 | from drf_spectacular.openapi import AutoSchema 2 | from rest_framework.serializers import Serializer 3 | from rest_framework.views import exception_handler 4 | from rest_framework.exceptions import ValidationError 5 | from django.core import exceptions 6 | from dogesec_commons.utils.autoschema import CustomAutoSchema 7 | 8 | class H4FSchema(CustomAutoSchema): 9 | def _is_list_view(self, serializer: Serializer | type[Serializer] | None = None) -> bool: 10 | if self.path.endswith("/xml/"): 11 | return True 12 | return super()._is_list_view(serializer) -------------------------------------------------------------------------------- /history4feed/app/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.9 on 2025-02-14 09:30 2 | 3 | import django.db.models.deletion 4 | import history4feed.app.models 5 | import uuid 6 | from django.db import migrations, models 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | initial = True 12 | 13 | dependencies = [ 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name='Category', 19 | fields=[ 20 | ('name', history4feed.app.models.SlugField(max_length=1000, primary_key=True, serialize=False)), 21 | ], 22 | ), 23 | migrations.CreateModel( 24 | name='Feed', 25 | fields=[ 26 | 
('id', models.UUIDField(help_text='UUID of feed generated by history4feed', primary_key=True, serialize=False)), 27 | ('title', models.CharField(help_text='found in the of RSS output. Is always kept up to date with the latest feed import values for this property.', max_length=1000)), 28 | ('description', models.CharField(help_text='found in the of RSS output. Is always kept up to date with the latest feed import values for this property.', max_length=10240)), 29 | ('url', models.URLField(help_text='\nThe URL of the RSS or ATOM feed\n\nNote this will be validated to ensure the feed is in the correct format.\n', max_length=1000, unique=True, validators=[history4feed.app.models.normalize_url])), 30 | ('earliest_item_pubdate', models.DateTimeField(help_text='pubdate of earliest post', null=True)), 31 | ('latest_item_pubdate', models.DateTimeField(help_text='pubdate of latest post', null=True)), 32 | ('datetime_added', models.DateTimeField(auto_now_add=True, help_text='date feed entry was added to database')), 33 | ('feed_type', models.CharField(choices=[('rss', 'Rss'), ('atom', 'Atom'), ('skeleton', 'Skeleton')], editable=False, help_text='type of feed', max_length=12)), 34 | ('pretty_url', models.URLField(default=None, max_length=1000, null=True)), 35 | ], 36 | ), 37 | migrations.CreateModel( 38 | name='Job', 39 | fields=[ 40 | ('id', models.UUIDField(default=uuid.uuid4, help_text='UUID of job', primary_key=True, serialize=False)), 41 | ('state', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('success', 'Success'), ('failed', 'Failed')], default='pending', help_text='state of the job', max_length=12)), 42 | ('run_datetime', models.DateTimeField(auto_now_add=True, help_text='time job was executed')), 43 | ('earliest_item_requested', models.DateTimeField(help_text='shows the earliest time for posts requested. Useful for when jobs are run to see if the time range it runs across is expected', null=True)), 44 | ('latest_item_requested', models.DateTimeField(help_text='shows the latest time for posts requested', null=True)), 45 | ('info', models.CharField(help_text='contains a useful summary of the job (e.g. number of posts retrieved, errors logged)', max_length=10240)), 46 | ('include_remote_blogs', models.BooleanField(default=False)), 47 | ('feed', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='history4feed.feed')), 48 | ], 49 | ), 50 | migrations.CreateModel( 51 | name='Post', 52 | fields=[ 53 | ('id', models.UUIDField(help_text='UUID of items generated by history4feed', primary_key=True, serialize=False)), 54 | ('datetime_added', models.DateTimeField(auto_now_add=True)), 55 | ('datetime_updated', models.DateTimeField(auto_now=True)), 56 | ('title', models.CharField(help_text='found in the element of feed output', max_length=1000)), 57 | ('description', models.CharField(blank=True, help_text='found in the element of feed output', max_length=2097152)), 58 | ('link', models.URLField(help_text='link to full article. 
found in the element of feed output', max_length=1000, validators=[history4feed.app.models.normalize_url])), 59 | ('pubdate', models.DateTimeField(help_text='date of publication.')), 60 | ('author', models.CharField(blank=True, help_text='author of the post', max_length=1000, null=True)), 61 | ('is_full_text', models.BooleanField(default=False, help_text='if full text has been retrieved')), 62 | ('content_type', models.CharField(default='plain/text', help_text='content type of the description', max_length=200)), 63 | ('added_manually', models.BooleanField(default=False)), 64 | ('deleted_manually', models.BooleanField(default=False, help_text='this post is hidden from user')), 65 | ('categories', models.ManyToManyField(blank=True, help_text='categories of the post', related_name='posts', to='history4feed.category')), 66 | ('feed', models.ForeignKey(help_text='feed id this item belongs too', on_delete=django.db.models.deletion.CASCADE, related_name='posts', to='history4feed.feed')), 67 | ], 68 | ), 69 | migrations.CreateModel( 70 | name='FulltextJob', 71 | fields=[ 72 | ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 73 | ('status', models.CharField(choices=[('retrieved', 'Retrieved'), ('skipped', 'Skipped'), ('failed', 'Failed'), ('retrieving', 'Retrieving')], default='retrieving', max_length=15)), 74 | ('error_str', models.CharField(blank=True, max_length=1500, null=True)), 75 | ('link', models.CharField(max_length=1500)), 76 | ('job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='fulltext_jobs', to='history4feed.job')), 77 | ('post', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='fulltext_jobs', to='history4feed.post')), 78 | ], 79 | ), 80 | migrations.AddConstraint( 81 | model_name='post', 82 | constraint=models.UniqueConstraint(fields=('link', 'feed'), name='unique_link_by_feed'), 83 | ), 84 | ] 85 | -------------------------------------------------------------------------------- /history4feed/app/migrations/0002_feed_freshness_alter_feed_feed_type.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1.6 on 2025-02-24 12:48 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('history4feed', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='feed', 15 | name='freshness', 16 | field=models.DateTimeField(default=None, null=True), 17 | ), 18 | migrations.AlterField( 19 | model_name='feed', 20 | name='feed_type', 21 | field=models.CharField(choices=[('rss', 'Rss'), ('atom', 'Atom'), ('skeleton', 'Skeleton'), ('search_index', 'Search Index')], editable=False, help_text='type of feed', max_length=12), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /history4feed/app/migrations/0003_alter_feed_description.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1.6 on 2025-03-28 10:32 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('history4feed', '0002_feed_freshness_alter_feed_feed_type'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='feed', 15 | name='description', 16 | field=models.CharField(default=None, help_text='found in the of RSS output. 
Is always kept up to date with the latest feed import values for this property.', max_length=10240, null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /history4feed/app/migrations/0004_alter_fulltextjob_status_alter_job_state.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1.6 on 2025-05-02 13:03 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('history4feed', '0003_alter_feed_description'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='fulltextjob', 15 | name='status', 16 | field=models.CharField(choices=[('retrieved', 'Retrieved'), ('skipped', 'Skipped'), ('cancelled', 'Cancelled'), ('failed', 'Failed'), ('retrieving', 'Retrieving')], default='retrieving', max_length=15), 17 | ), 18 | migrations.AlterField( 19 | model_name='job', 20 | name='state', 21 | field=models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('success', 'Success'), ('cancelled', 'Cancelled'), ('failed', 'Failed')], default='pending', help_text='state of the job', max_length=12), 22 | ), 23 | ] 24 | -------------------------------------------------------------------------------- /history4feed/app/migrations/0005_feed_datetime_modified.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1.6 on 2025-05-02 14:53 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('history4feed', '0004_alter_fulltextjob_status_alter_job_state'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='feed', 15 | name='datetime_modified', 16 | field=models.DateTimeField(default=None, help_text='date feed entry was edited in the database', null=True), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /history4feed/app/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/app/migrations/__init__.py -------------------------------------------------------------------------------- /history4feed/app/models.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | from typing import Iterable 3 | from urllib.parse import urlparse 4 | import uuid 5 | from .settings import history4feed_server_settings 6 | from django.db import models 7 | from rest_framework import validators 8 | from uuid import uuid4 9 | from django.utils.text import slugify 10 | import hyperlink 11 | from django.db.models import Min, Max 12 | from django.db.models import OuterRef, Subquery 13 | from django.db.models import F 14 | from django.utils import timezone 15 | 16 | POST_DESCRIPTION_MAX_LENGTH = 2 * 1024 * 1024 # 2MiB 17 | FEED_DESCRIPTION_MAX_LENGTH = 10*1024 # 10KiB 18 | 19 | class JobState(models.TextChoices): 20 | PENDING = "pending" 21 | RUNNING = "running" 22 | SUCCESS = "success" 23 | CANCELLED = "cancelled" 24 | FAILED = "failed" 25 | 26 | class FeedType(models.TextChoices): 27 | RSS = "rss" 28 | ATOM = "atom" 29 | SKELETON = "skeleton" 30 | SEARCH_INDEX = "search_index" 31 | 32 | # Create your models here. 
33 | 34 | class SlugField(models.CharField): 35 | def get_prep_value(self, value): 36 | return slugify(str(value)) 37 | 38 | class Category(models.Model): 39 | name = SlugField(max_length=1000, primary_key=True) 40 | 41 | 42 | def stix_id(url): 43 | return uuid.uuid5(uuid.UUID(str(history4feed_server_settings.HISTORY4FEED_NAMESPACE)), url) 44 | 45 | def normalize_url(url): 46 | try: 47 | u = hyperlink.parse(url) 48 | return u.normalize(url).to_text() 49 | except Exception as e: 50 | raise validators.ValidationError(f"URL normalization failed") 51 | 52 | AUTO_TITLE_TRAIL = "%^%*(%" 53 | def title_as_string(value: str): 54 | if value.endswith(AUTO_TITLE_TRAIL): 55 | value = value[:-len(AUTO_TITLE_TRAIL)] 56 | return value 57 | 58 | class Feed(models.Model): 59 | id = models.UUIDField(primary_key=True, help_text="UUID of feed generated by history4feed") 60 | title = models.CharField(max_length=1000, help_text="found in the of RSS output. Is always kept up to date with the latest feed import values for this property.") 61 | description = models.CharField(max_length=FEED_DESCRIPTION_MAX_LENGTH, help_text="found in the of RSS output. Is always kept up to date with the latest feed import values for this property.", null=True, default=None) 62 | url = models.URLField(max_length=1000, unique=True, help_text=dedent(""" 63 | The URL of the RSS or ATOM feed 64 | 65 | Note this will be validated to ensure the feed is in the correct format. 66 | """), validators=[normalize_url]) 67 | earliest_item_pubdate = models.DateTimeField(null=True, help_text="pubdate of earliest post") 68 | latest_item_pubdate = models.DateTimeField(null=True, help_text="pubdate of latest post") 69 | datetime_added = models.DateTimeField(auto_now_add=True, editable=False, help_text="date feed entry was added to database") 70 | datetime_modified = models.DateTimeField(default=None, null=True, help_text="date feed entry was edited in the database") 71 | feed_type = models.CharField(choices=FeedType.choices, max_length=12, null=False, editable=False, help_text="type of feed") 72 | pretty_url = models.URLField(max_length=1000, null=True, default=None) 73 | freshness = models.DateTimeField(null=True, default=None) 74 | 75 | def get_post_count(self): 76 | return self.posts.filter(deleted_manually=False).count() 77 | 78 | def save(self, *args, **kwargs) -> None: 79 | if not self.id: 80 | self.id = stix_id(self.url) 81 | self.earliest_item_pubdate, self.latest_item_pubdate = self.posts.aggregate(min=Min('pubdate'), max=Max('pubdate')).values() 82 | self.datetime_modified = self.datetime_modified or self.datetime_added 83 | return super().save(*args, **kwargs) 84 | 85 | def get_pretty_url(self): 86 | return self.pretty_url or self.url 87 | 88 | def set_title(self, title): 89 | if not self.title or self.title.endswith(AUTO_TITLE_TRAIL): 90 | self.title = title + AUTO_TITLE_TRAIL 91 | 92 | def set_description(self, description): 93 | if not self.description or self.description.endswith(AUTO_TITLE_TRAIL): 94 | self.description = description + AUTO_TITLE_TRAIL 95 | 96 | class Job(models.Model): 97 | id = models.UUIDField(primary_key=True, default=uuid4, help_text="UUID of job") 98 | state = models.CharField(choices=JobState.choices, max_length=12, default=JobState.PENDING, null=False, help_text="state of the job") 99 | run_datetime = models.DateTimeField(auto_now_add=True, editable=False, help_text="time job was executed") 100 | earliest_item_requested = models.DateTimeField(null=True, help_text="shows the earliest time for posts requested. 
Useful for when jobs are run to see if the time range it runs across is expected") 101 | latest_item_requested = models.DateTimeField(null=True, help_text="shows the latest time for posts requested") 102 | feed = models.ForeignKey(Feed, on_delete=models.CASCADE) 103 | info = models.CharField(max_length=FEED_DESCRIPTION_MAX_LENGTH, help_text="contains a useful summary of the job (e.g. number of posts retrieved, errors logged)") 104 | include_remote_blogs = models.BooleanField(default=False) 105 | 106 | def urls(self): 107 | retval = {} 108 | ft_job: FulltextJob = None 109 | for ft_job in self.fulltext_jobs.all(): 110 | retval[ft_job.status] = retval.get(ft_job.status, []) 111 | retval[ft_job.status].append(dict(url=ft_job.link, id=ft_job.post_id)) 112 | return retval 113 | 114 | def should_skip_post(self, post_link: str): 115 | return (not self.include_remote_blogs) and urlparse(self.feed.url).hostname.split('.')[-2:] != urlparse(post_link).hostname.split('.')[-2:] 116 | 117 | def cancel(self): 118 | if self.state in [JobState.PENDING, JobState.RUNNING]: 119 | self.state = JobState.CANCELLED 120 | self.save() 121 | return 122 | 123 | def is_cancelled(self): 124 | return self.state == JobState.CANCELLED 125 | 126 | 127 | class FullTextState(models.TextChoices): 128 | RETRIEVED = "retrieved" 129 | SKIPPED = "skipped" 130 | CANCELLED = "cancelled" 131 | FAILED = "failed" 132 | RETRIEVING = "retrieving" 133 | 134 | class Post(models.Model): 135 | id = models.UUIDField(primary_key=True, help_text="UUID of items generated by history4feed") 136 | datetime_added = models.DateTimeField(auto_now_add=True, editable=False) 137 | datetime_updated = models.DateTimeField(auto_now=True) 138 | title = models.CharField(max_length=1000, help_text="found in the element of feed output") 139 | description = models.CharField(max_length=POST_DESCRIPTION_MAX_LENGTH, blank=True, help_text="found in the element of feed output") 140 | link = models.URLField(max_length=1000, help_text="link to full article. 
found in the <link> element of feed output", validators=[normalize_url]) 141 | pubdate = models.DateTimeField(help_text="date of publication.") 142 | author = models.CharField(max_length=1000, help_text="author of the post", null=True, blank=True) 143 | categories = models.ManyToManyField(Category, related_name="posts", help_text="categories of the post", blank=True) 144 | feed = models.ForeignKey(Feed, on_delete=models.CASCADE, related_name="posts", help_text="feed id this item belongs to") 145 | is_full_text = models.BooleanField(default=False, help_text="if full text has been retrieved") 146 | content_type = models.CharField(default="plain/text", max_length=200, help_text="content type of the description") 147 | added_manually = models.BooleanField(default=False) 148 | deleted_manually = models.BooleanField(default=False, help_text="this post is hidden from user") 149 | 150 | class Meta: 151 | constraints = [ 152 | models.UniqueConstraint(fields=["link", "feed"], name="unique_link_by_feed"), 153 | ] 154 | 155 | def add_categories(self, categories): 156 | categories = [Category.objects.get_or_create(name=name)[0] for name in categories] 157 | self.categories.set(categories) 158 | 159 | 160 | def save(self, *args, **kwargs) -> None: 161 | if not self.id: 162 | pubdate = self.pubdate.strftime("%Y-%m-%dT%H:%M:%S.%fZ") 163 | self.id = stix_id(f"{self.feed.id}+{self.link}+{pubdate}") 164 | return super().save(*args, **kwargs) 165 | 166 | @classmethod 167 | def visible_posts(cls): 168 | return cls.objects.filter(deleted_manually=False).annotate( 169 | last_job_id=Subquery( 170 | FulltextJob.objects.filter( 171 | post_id=OuterRef('pk') 172 | ).order_by('-job__run_datetime') # order fulltext jobs by their parent job's run time, newest first 173 | .values('job__id')[:1] # keep only the id of the most recent job 174 | ) 175 | ) 176 | 177 | class FulltextJob(models.Model): 178 | post = models.ForeignKey(Post, on_delete=models.SET_NULL, null=True, related_name="fulltext_jobs") 179 | job = models.ForeignKey(Job, related_name="fulltext_jobs", on_delete=models.CASCADE) 180 | status = models.CharField(max_length=15, choices=FullTextState.choices, default=FullTextState.RETRIEVING) 181 | error_str = models.CharField(max_length=1500, null=True, blank=True) 182 | link = models.CharField(max_length=1500) 183 | 184 | 185 | 186 | def is_cancelled(self): 187 | return self.job.state == JobState.CANCELLED -------------------------------------------------------------------------------- /history4feed/app/openapi_params.py: -------------------------------------------------------------------------------- 1 | from drf_spectacular.utils import OpenApiParameter, OpenApiResponse, OpenApiExample 2 | from drf_spectacular.types import OpenApiTypes 3 | from textwrap import dedent 4 | from .serializers import PostSerializer 5 | 6 | 7 | FEED_ID_PARAM = OpenApiParameter( 8 | "feed_id", 9 | type=OpenApiTypes.UUID, 10 | description="The ID of the Feed. You can search for Feed IDs using the GET Feeds endpoints. e.g. `6c6e6448-04d4-42a3-9214-4f0f7d02694e`", 11 | location=OpenApiParameter.PATH, 12 | ) 13 | JOB_ID_PARAM = OpenApiParameter( 14 | "job_id", 15 | type=OpenApiTypes.UUID, 16 | description="The ID of the Job. You can search for Job IDs using the GET Jobs endpoints. e.g. `7db25a55-55e4-4bc5-b189-3e2ca4e304e5`", 17 | location=OpenApiParameter.PATH, 18 | ) 19 | POST_ID_PARAM = OpenApiParameter( 20 | "post_id", 21 | type=OpenApiTypes.UUID, 22 | description="The ID of the Post. 
You can search for Post IDs using the GET Posts endpoints for a specific Feed. e.g. `797e94b1-efdc-4e66-a748-f2b6a5896a89`", 23 | location=OpenApiParameter.PATH, 24 | ) 25 | 26 | 27 | XML_RESPONSE = OpenApiResponse( 28 | response=PostSerializer(many=True), 29 | description="", 30 | examples=[ 31 | OpenApiExample( 32 | "xml", 33 | value=dedent( 34 | """ 35 | 36 | 37 | 38 | Example CTI Blog 39 | 40 | 41 | https://cti.example.com/feed/ 42 | 2024-07-02T17:07:31+00:00 43 | 44 | DNS Probing Operation 45 | https://cti.example.com/blog/dns-probing-operation/ 46 | 2024-06-03T15:00:52+00:00 47 | <html></html> 48 | infoblox-threat-intel 49 | dns 50 | dns-intel 51 | dns-threat-intelligence 52 | malware 53 | 54 | John Doe (Admin) 55 | 56 | 57 | 58 | 59 | """ 60 | ), 61 | ) 62 | ], 63 | ) 64 | 65 | HTTP404_EXAMPLE = OpenApiExample("http-404", {"message": "resource not found", "code": 404}) 66 | HTTP400_EXAMPLE = OpenApiExample("http-400", {"message": "request not understood", "code": 400}) 67 | -------------------------------------------------------------------------------- /history4feed/app/serializers.py: -------------------------------------------------------------------------------- 1 | from rest_framework import serializers, validators, exceptions 2 | from .models import AUTO_TITLE_TRAIL, FEED_DESCRIPTION_MAX_LENGTH, Category, Feed, Post, Job, normalize_url, FeedType, title_as_string 3 | from django.db import models as django_models 4 | from django.utils.translation import gettext_lazy as _ 5 | 6 | class TitleField(serializers.CharField): 7 | def to_internal_value(self, data): 8 | return super().to_internal_value(data) 9 | def to_representation(self, value): 10 | return title_as_string(super().to_representation(value)) 11 | 12 | class InvalidFeed(exceptions.APIException): 13 | status_code = 406 14 | 15 | class FeedSerializer(serializers.ModelSerializer): 16 | count_of_posts = serializers.IntegerField(source='get_post_count', read_only=True, help_text="Number of posts in feed") 17 | include_remote_blogs = serializers.BooleanField(write_only=True, default=False) 18 | pretty_url = serializers.URLField(allow_null=True, required=False, help_text="This is a cosmetic URL. 
It is designed to show the actual blog link to browse to in a web browser (not the feed)") 19 | title = TitleField(required=False, max_length=256, allow_null=True, allow_blank=True) 20 | description = TitleField(required=False, max_length=FEED_DESCRIPTION_MAX_LENGTH, allow_null=True, allow_blank=True) 21 | use_search_index = serializers.BooleanField(default=False, write_only=True, help_text="should use search index instead") 22 | class Meta: 23 | model = Feed 24 | # fields = '__all__' 25 | exclude = ['freshness'] 26 | read_only_fields = ['id', 'earliest_item_pubdate', 'latest_item_pubdate', 'datetime_added', "datetime_modified"] 27 | 28 | def create(self, validated_data: dict): 29 | validated_data = validated_data.copy() 30 | validated_data.pop('include_remote_blogs', None) 31 | validated_data.pop('use_search_index', None) 32 | return super().create(validated_data) 33 | 34 | class SkeletonFeedSerializer(FeedSerializer): 35 | include_remote_blogs = None 36 | use_search_index = None 37 | title = serializers.CharField(required=True, help_text="title of feed") 38 | description = serializers.CharField(required=False, help_text="description of feed", allow_blank=True) 39 | feed_type = serializers.HiddenField(default=FeedType.SKELETON) 40 | 41 | class SearchIndexFeedSerializer(FeedSerializer): 42 | title = serializers.CharField(required=True, help_text="title of feed") 43 | description = serializers.CharField(required=True, help_text="description of feed") 44 | feed_type = serializers.HiddenField(default=FeedType.SEARCH_INDEX) 45 | 46 | 47 | class FeedCreatedJobSerializer(FeedSerializer): 48 | job_id = serializers.UUIDField(read_only=True, help_text="only returns with POST /feeds/") 49 | job_state = serializers.CharField(read_only=True, help_text="only returns with POST /feeds/") 50 | 51 | 52 | class PostListSerializer(serializers.ListSerializer): 53 | child = None 54 | 55 | @property 56 | def feed_id(self): 57 | return self.context.get('feed_id') 58 | 59 | 60 | def run_child_validation(self, data): 61 | """ 62 | Run validation on child serializer. 63 | You may need to override this method to support multiple updates. 
For example: 64 | 65 | self.child.instance = self.instance.get(pk=data['id']) 66 | self.child.initial_data = data 67 | return super().run_child_validation(data) 68 | """ 69 | data.setdefault('feed', self.feed_id) 70 | return self.child.run_validation(data) 71 | 72 | def create(self, validated_data: list[dict]): 73 | instances = [] 74 | for attrs in validated_data: 75 | feed_id = attrs.setdefault('feed_id', self.feed_id) 76 | instance = None 77 | try: 78 | instance = Post.objects.get(feed_id=feed_id, link=attrs['link']) 79 | except: 80 | pass 81 | if instance: 82 | instance = self.child.update(instance, attrs) 83 | else: 84 | instance = self.child.create(attrs) 85 | 86 | instances.append(instance) 87 | return instances 88 | 89 | class PostSerializer(serializers.ModelSerializer): 90 | # categories = serializers.ManyRelatedField() 91 | class Meta: 92 | list_serializer_class = PostListSerializer 93 | model = Post 94 | exclude = ['feed', 'deleted_manually'] 95 | read_only_fields = ["id", "datetime_updated", "datetime_added", "description", "is_full_text", "content_type", "added_manually"] 96 | 97 | 98 | def run_validation(self, data=...): 99 | if categories := data.get('categories'): 100 | data['categories'] = [Category.objects.get_or_create(name=name)[0].name for name in categories] 101 | return super().run_validation(data) 102 | 103 | class PostWithFeedIDSerializer(PostSerializer): 104 | feed_id = serializers.UUIDField() 105 | 106 | class PatchSerializer(serializers.Serializer): 107 | pass 108 | 109 | class FeedPatchSerializer(serializers.ModelSerializer): 110 | title = serializers.CharField(required=True, help_text="title of feed") 111 | description = serializers.CharField(required=True, help_text="description of feed") 112 | 113 | class Meta: 114 | model = Feed 115 | fields = ['title', 'description', 'pretty_url'] 116 | 117 | class FeedFetchSerializer(FeedPatchSerializer, FeedSerializer): 118 | class Meta: 119 | model = Feed 120 | fields = ['include_remote_blogs'] 121 | 122 | class PostCreateSerializer(PostSerializer): 123 | link = serializers.URLField(validators=[normalize_url]) 124 | class feed_class(serializers.HiddenField): 125 | def get_default(self): 126 | return self.context.get('feed_id') 127 | feed_id = feed_class(default=None) 128 | 129 | class Meta: 130 | list_serializer_class = PostListSerializer 131 | model = Post 132 | fields = ["title", "link", "pubdate", "author", "categories", "feed_id"] 133 | validators = [ 134 | validators.UniqueTogetherValidator( 135 | queryset=Post.visible_posts(), 136 | fields=('feed_id', 'link'), 137 | message='Post with link already exists in feed.', 138 | ) 139 | ] 140 | 141 | class PostPatchSerializer(PostSerializer): 142 | class Meta: 143 | model = Post 144 | fields = ["title", "pubdate", "author", "categories"] 145 | 146 | 147 | class CreatePostsSerializer(serializers.Serializer): 148 | posts = PostCreateSerializer(many=True, allow_empty=False) 149 | 150 | def create(self, validated_data): 151 | posts = [{**post, **self.save_kwargs} for post in validated_data["posts"]] 152 | 153 | return self.fields['posts'].create(posts) 154 | 155 | def save(self, **kwargs): 156 | self.save_kwargs = kwargs 157 | return super().save(**kwargs) 158 | 159 | 160 | 161 | class JobUrlStatusSerializer(serializers.Serializer): 162 | class joburlstatus(serializers.Serializer): 163 | url = serializers.URLField() 164 | id = serializers.UUIDField() 165 | retrieved = joburlstatus(many=True, default=[]) 166 | retrieving = joburlstatus(many=True, default=[]) 167 | skipped = 
joburlstatus(many=True, default=[]) 168 | failed = joburlstatus(many=True, default=[]) 169 | cancelled = joburlstatus(many=True, default=[]) 170 | 171 | class JobSerializer(serializers.ModelSerializer): 172 | count_of_items = serializers.IntegerField(read_only=True) 173 | feed_id = serializers.UUIDField(read_only=True, source='feed.id') 174 | urls = JobUrlStatusSerializer() 175 | class Meta: 176 | model = Job 177 | # fields = '__all__' 178 | exclude = ['feed'] 179 | 180 | class PostJobSerializer(JobSerializer): 181 | pass 182 | -------------------------------------------------------------------------------- /history4feed/app/settings.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from typing import Any, Dict, get_type_hints 3 | import uuid 4 | 5 | from django.conf import settings 6 | from rest_framework.settings import APISettings, perform_import, api_settings 7 | 8 | H4F_DEFAULTS: dict[str, any] = { 9 | 'SCRAPFLY_KEY': '', 10 | 'WAYBACK_SLEEP_SECONDS': 20, 11 | 'EARLIEST_SEARCH_DATE': datetime(2020, 1, 1, tzinfo=timezone.utc), 12 | 'REQUEST_RETRY_COUNT': 3, 13 | 'HISTORY4FEED_NAMESPACE': uuid.UUID("6c6e6448-04d4-42a3-9214-4f0f7d02694e"), 14 | "BRAVE_SEARCH_API_KEY": None 15 | } 16 | 17 | IMPORT_STRINGS = [ 18 | ] 19 | 20 | class History4FeedServerSettings(APISettings): 21 | SCRAPFLY_KEY: str 22 | WAYBACK_SLEEP_SECONDS: int 23 | EARLIEST_SEARCH_DATE: datetime 24 | REQUEST_RETRY_COUNT: int 25 | HISTORY4FEED_NAMESPACE : str|uuid.UUID 26 | BRAVE_SEARCH_API_KEY: str 27 | 28 | history4feed_server_settings = History4FeedServerSettings( 29 | user_settings=getattr(settings, 'HISTORY4FEED_SETTINGS', {}), # type: ignore 30 | defaults=H4F_DEFAULTS, # type: ignore 31 | import_strings=IMPORT_STRINGS, 32 | ) -------------------------------------------------------------------------------- /history4feed/app/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
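# NOTE: the comment above is Django's default stub; the app itself ships no unit tests
# here (the integration suite lives in tests/ at the repository root). The sketch below
# is purely illustrative and NOT part of the original codebase. It assumes the Feed
# model, FeedType choices and stix_id() helper defined in history4feed/app/models.py.
from .models import Feed, FeedType, stix_id


class FeedIdTestCase(TestCase):
    def test_feed_id_is_deterministic_uuid5(self):
        # Feed.save() derives the primary key from the feed url via stix_id() (a UUIDv5),
        # so the same url always maps to the same feed id.
        feed = Feed.objects.create(
            url="https://example.com/feed.xml",
            title="Example feed",
            feed_type=FeedType.RSS,
        )
        self.assertEqual(feed.id, stix_id("https://example.com/feed.xml"))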
4 | -------------------------------------------------------------------------------- /history4feed/app/utils.py: -------------------------------------------------------------------------------- 1 | from rest_framework import pagination, response, renderers 2 | from rest_framework.filters import OrderingFilter, BaseFilterBackend 3 | from django.utils.encoding import force_str 4 | from django.db.models import Q 5 | from datetime import datetime, UTC 6 | import typing 7 | from dogesec_commons.utils import Pagination, Ordering 8 | from django.utils import timezone 9 | from django.forms import DateTimeField 10 | from django_filters.rest_framework import filters 11 | 12 | class DatetimeFieldUTC(DateTimeField): 13 | def to_python(self, value): 14 | value = super().to_python(value) 15 | return value and value.astimezone(UTC) 16 | 17 | class DatetimeFilter(filters.Filter): 18 | field_class = DatetimeFieldUTC 19 | 20 | class MinMaxDateFilter(BaseFilterBackend): 21 | min_val = datetime.min 22 | max_value = datetime.max 23 | def get_fields(self, view): 24 | out = {} 25 | fields = getattr(view, 'minmax_date_fields', []) 26 | if not isinstance(fields, list): 27 | return out 28 | for field in fields: 29 | out[f"{field}_max"] = field 30 | out[f"{field}_min"] = field 31 | return out 32 | 33 | def parse_date(self, value): 34 | return DatetimeFieldUTC().to_python(value) 35 | 36 | def filter_queryset(self, request, queryset, view): 37 | valid_fields = self.get_fields(view) 38 | valid_params = [(k, v) for k, v in request.query_params.items() if k in valid_fields] 39 | queries = {} 40 | for param, value in valid_params: 41 | field_name = valid_fields[param] 42 | if param.endswith('_max'): 43 | queries[f"{field_name}__lte"] = self.parse_date(value) 44 | else: 45 | queries[f"{field_name}__gte"] = self.parse_date(value) 46 | return queryset.filter(Q(**queries)) 47 | 48 | def get_schema_operation_parameters(self, view): 49 | parameters = [] 50 | valid_fields = self.get_fields(view) 51 | for query_name, field_name in valid_fields.items(): 52 | _type = "Maximum" 53 | if query_name.endswith('min'): 54 | _type = "Minimum" 55 | parameter = { 56 | 'name': query_name, 57 | 'required': False, 58 | 'in': 'query', 59 | 'description': f"{_type} value of `{field_name}` to filter by in format `YYYY-MM-DD`.", 60 | 'schema': { 61 | 'type': 'string', 'format': 'date', 62 | }, 63 | } 64 | parameters.append(parameter) 65 | return parameters 66 | 67 | 68 | 69 | # use pagination to modify how xml/rss renders 70 | class XMLPostPagination(Pagination): 71 | def get_paginated_response_schema(self, schema): 72 | return { 73 | 'type': 'string', 74 | 'example': '' 75 | } 76 | 77 | def get_paginated_response(self, data): 78 | return response.Response(data, headers={ 79 | 'rss_page_size': self.get_page_size(self.request), 80 | 'rss_page_number': self.page.number, 81 | 'rss_page_results_count': len(self.page), 82 | 'rss_total_results_count': self.page.paginator.count, 83 | }, content_type="application/rss+xml; charset=UTF-8") 84 | 85 | def get_schema_operation_parameters(self, view): 86 | return super().get_schema_operation_parameters(view) 87 | 88 | class RSSRenderer(renderers.BaseRenderer): 89 | media_type = "application/rss+xml" 90 | format = "xml" 91 | 92 | def render(self, data, accepted_media_type=None, renderer_context=None): 93 | return data -------------------------------------------------------------------------------- /history4feed/app/views.py: -------------------------------------------------------------------------------- 1 | from 
django.shortcuts import get_object_or_404 2 | 3 | from .autoschema import H4FSchema 4 | 5 | from .openapi_params import ( 6 | HTTP400_EXAMPLE, 7 | HTTP404_EXAMPLE, 8 | JOB_ID_PARAM, 9 | FEED_ID_PARAM, 10 | POST_ID_PARAM, 11 | XML_RESPONSE, 12 | ) 13 | from .utils import ( 14 | DatetimeFilter, 15 | Ordering, 16 | Pagination, 17 | MinMaxDateFilter, 18 | RSSRenderer, 19 | XMLPostPagination, 20 | ) 21 | from dogesec_commons.utils.serializers import CommonErrorSerializer 22 | # from .openapi_params import FEED_PARAMS, POST_PARAMS 23 | 24 | from .serializers import CreatePostsSerializer, FeedCreatedJobSerializer, FeedFetchSerializer, FeedPatchSerializer, PostPatchSerializer, PostWithFeedIDSerializer, SearchIndexFeedSerializer, SkeletonFeedSerializer, PatchSerializer, PostJobSerializer, PostSerializer, FeedSerializer, JobSerializer, PostCreateSerializer 25 | from .models import AUTO_TITLE_TRAIL, FulltextJob, JobState, Post, Feed, Job, FeedType 26 | from rest_framework import ( 27 | viewsets, 28 | request, 29 | response, 30 | mixins, 31 | decorators, 32 | renderers, 33 | pagination, 34 | status, 35 | validators, 36 | ) 37 | from django.http import HttpResponse 38 | from ..h4fscripts import h4f, task_helper, build_rss 39 | from drf_spectacular.utils import ( 40 | extend_schema, 41 | extend_schema_view, 42 | OpenApiResponse, 43 | OpenApiExample, 44 | ) 45 | from drf_spectacular.types import OpenApiTypes 46 | from django_filters.rest_framework import ( 47 | DjangoFilterBackend, 48 | FilterSet, 49 | Filter, 50 | BaseCSVFilter, 51 | UUIDFilter, 52 | BaseInFilter, 53 | filters, 54 | ) 55 | from django.db.models import Count, Q, Subquery, OuterRef 56 | from datetime import datetime 57 | import textwrap 58 | from django.utils import timezone 59 | 60 | from history4feed.app import serializers 61 | 62 | from history4feed.app import utils 63 | 64 | from drf_spectacular.views import SpectacularAPIView 65 | 66 | class SchemaViewCached(SpectacularAPIView): 67 | _schema = None 68 | 69 | def _get_schema_response(self, request): 70 | version = self.api_version or request.version or self._get_version_parameter(request) 71 | if not self.__class__._schema: 72 | generator = self.generator_class(urlconf=self.urlconf, api_version=version, patterns=self.patterns) 73 | self.__class__._schema = generator.get_schema(request=request, public=self.serve_public) 74 | return response.Response( 75 | data=self.__class__._schema, 76 | headers={"Content-Disposition": f'inline; filename="{self._get_filename(request, version)}"'} 77 | ) 78 | 79 | class Response(response.Response): 80 | DEFAULT_HEADERS = { 81 | "Access-Control-Allow-Origin": "*", 82 | } 83 | CONTENT_TYPE = "application/json" 84 | 85 | def __init__( 86 | self, 87 | data=None, 88 | status=None, 89 | template_name=None, 90 | headers=None, 91 | exception=False, 92 | content_type=CONTENT_TYPE, 93 | ): 94 | headers = headers or {} 95 | headers.update(self.DEFAULT_HEADERS) 96 | super().__init__(data, status, template_name, headers, exception, content_type) 97 | 98 | 99 | class ErrorResp(Response): 100 | def __init__(self, status, title, details=None): 101 | super().__init__({"message": title, "code": status}, status=status) 102 | 103 | 104 | # Create your views here. 105 | 106 | @extend_schema_view( 107 | retrieve=extend_schema( 108 | summary="Get a Post", 109 | description=textwrap.dedent( 110 | """ 111 | This will return a single Post by its ID. It is useful if you only want to get the data for a single entry. 
112 | """ 113 | ), 114 | responses={ 115 | 200: PostWithFeedIDSerializer, 116 | 404: OpenApiResponse(CommonErrorSerializer, "Post not found", examples=[HTTP404_EXAMPLE]), 117 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 118 | }, 119 | ), 120 | list=extend_schema( 121 | summary="Search for Posts", 122 | description=textwrap.dedent( 123 | """ 124 | Search through Posts from all Blogs. Filter by the ones you're interested in. 125 | """ 126 | ), 127 | responses={ 128 | 200: PostWithFeedIDSerializer, 129 | 404: OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]), 130 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 131 | }, 132 | ), 133 | destroy=extend_schema( 134 | summary="Delete a Post by ID", 135 | description=textwrap.dedent( 136 | """ 137 | This will delete the post inside of the feed. Deleting the post will remove it forever and it will not be reindexed on subsequent feed updates. The only way to re-index it is to add it manually. 138 | """ 139 | ), 140 | ), 141 | reindex=extend_schema( 142 | summary="Update a Post in a Feed", 143 | description=textwrap.dedent( 144 | """ 145 | When blog posts are modified, the RSS or ATOM feeds or search results are not often updated with the new modification time. As such, fetching for blog will cause these updated posts to be missed. 146 | 147 | To ensure the post stored in the database matches the one currently published you can make a request to this endpoint using the Post ID to update it. 148 | 149 | This update will only change the content (`description`) stored for the Post. It will not update the `title`, `pubdate`, `author`, or `categories`. If you need to update these properties you can use the Update Post Metadata endpoint. 150 | 151 | **IMPORTANT**: This action will delete the original post as well as all the STIX SDO and SRO objects created during the processing of the original text. Mostly this is not an issue, however, if the post has been removed at source you will end up with an empty entry for this Post. 152 | 153 | The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint. 154 | """ 155 | ), 156 | responses={ 157 | 201: PostJobSerializer, 158 | 404: OpenApiResponse(CommonErrorSerializer, "post does not exist", examples=[HTTP404_EXAMPLE]), 159 | }, 160 | request=PatchSerializer, 161 | ), 162 | partial_update=extend_schema( 163 | summary="Update a Posts Metadata", 164 | description=textwrap.dedent( 165 | """ 166 | In most cases, the automatically indexed metadata (or user submitted metadata in the case of manually added Posts) will be fine. 167 | 168 | However, these may be occasions you want to change the values of the `title`, `pubdate`, `author`, or `categories` for a Post. 169 | 170 | The following key/values are accepted in the body of the request: 171 | 172 | * `pubdate` (required): The date of the blog post in the format `YYYY-MM-DD`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually. 173 | * `title` (required): history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually. 174 | * `author` (optional): the value to be stored for the author of the post. 175 | * `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`. 
176 | 177 | Only one key/value is required. If no values are passed, they will be remain unchanged from the current state. 178 | 179 | It is not possible to manually modify any other values for the Post object. You can update the post content using the Update a Post in A Feed endpoint. 180 | """ 181 | ), 182 | responses={ 183 | 201: PostSerializer, 184 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 185 | 404: OpenApiResponse(CommonErrorSerializer, "post does not exist", examples=[HTTP404_EXAMPLE]), 186 | }, 187 | request=PostPatchSerializer, 188 | ), 189 | 190 | ) 191 | class PostOnlyView(mixins.RetrieveModelMixin, mixins.ListModelMixin, viewsets.GenericViewSet): 192 | openapi_path_params = [POST_ID_PARAM] 193 | openapi_tags = ["Posts"] 194 | serializer_class = PostWithFeedIDSerializer 195 | lookup_url_kwarg = "post_id" 196 | pagination_class = Pagination("posts") 197 | filter_backends = [DjangoFilterBackend, Ordering, MinMaxDateFilter] 198 | ordering_fields = ["pubdate", "title", "datetime_updated", "datetime_added"] 199 | ordering = "pubdate_descending" 200 | minmax_date_fields = ["pubdate"] 201 | 202 | class filterset_class(FilterSet): 203 | feed_id = filters.BaseInFilter(help_text="Filter the results by one or more `feed_id`(s). e.g. `3f388179-4683-4495-889f-690c5de3ae7c`") 204 | title = Filter( 205 | help_text="Filter the content by the `title` of the post. Will search for titles that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.", 206 | lookup_expr="icontains", 207 | ) 208 | description = Filter( 209 | help_text="Filter by the content post `description`. Will search for descriptions that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.", 210 | lookup_expr="icontains", 211 | ) 212 | link = Filter( 213 | help_text="Filter the content by a posts `link`. Will search for links that contain the value entered. Search is wildcard so `dogesec` will return any URL that contains the string `dogesec`.", 214 | lookup_expr="icontains", 215 | ) 216 | job_id = Filter(help_text="Filter the results by the Job ID the Post was downloaded or updated in. e.g. `6606bd0c-9d9d-4ffd-81bb-81c9196ccfe6`", field_name="fulltext_jobs__job_id") 217 | job_state = filters.ChoiceFilter(choices=JobState.choices, help_text="Filter by job status") 218 | updated_after = DatetimeFilter(help_text="Only show posts with a `datetime_updated` after the time specified. It must be in `YYYY-MM-DD HH:MM[:ss[.uuuuuu]][TZ]`, e.g. `2020-01-01 00:00`", field_name="datetime_updated", lookup_expr="gt") 219 | 220 | def get_queryset(self): 221 | return Post.visible_posts() \ 222 | .annotate(job_state=Subquery(Job.objects.filter(pk=OuterRef('last_job_id')).values('state')[:1])) 223 | 224 | def partial_update(self, request, *args, **kwargs): 225 | instance = self.get_object() 226 | serializer = PostPatchSerializer(instance, data=request.data, partial=True) 227 | serializer.is_valid(raise_exception=True) 228 | serializer.save() 229 | 230 | if getattr(instance, '_prefetched_objects_cache', None): 231 | # If 'prefetch_related' has been applied to a queryset, we need to 232 | # forcibly invalidate the prefetch cache on the instance. 
233 | instance._prefetched_objects_cache = {} 234 | 235 | s = self.get_serializer(instance) 236 | return Response(s.data, status=status.HTTP_201_CREATED) 237 | 238 | @decorators.action(detail=True, methods=['PATCH']) 239 | def reindex(self, request, *args, **kwargs): 240 | post, job_obj = self.new_reindex_post_job(request) 241 | job_resp = JobSerializer(job_obj).data.copy() 242 | job_resp.update(post_id=post.id) 243 | return Response(job_resp, status=status.HTTP_201_CREATED) 244 | 245 | def new_reindex_post_job(self, request): 246 | s = PatchSerializer(data=request.data) 247 | s.is_valid(raise_exception=True) 248 | post: Post = self.get_object() 249 | job_obj = task_helper.new_patch_posts_job(post.feed, [post]) 250 | return post, job_obj 251 | 252 | def destroy(self, *args, **kwargs): 253 | obj = self.get_object() 254 | obj.deleted_manually = True 255 | obj.save() 256 | obj.feed.save() 257 | return Response(None, status=status.HTTP_204_NO_CONTENT) 258 | 259 | 260 | 261 | class FeedView(viewsets.ModelViewSet): 262 | openapi_tags = ["Feeds"] 263 | serializer_class = FeedSerializer 264 | queryset = Feed.objects.all() 265 | lookup_url_kwarg = "feed_id" 266 | pagination_class = Pagination("feeds") 267 | http_method_names = ["get", "post", "patch", "delete"] 268 | 269 | filter_backends = [DjangoFilterBackend, Ordering, MinMaxDateFilter] 270 | ordering_fields = [ 271 | "datetime_added", 272 | "title", 273 | "url", 274 | "count_of_posts", 275 | "earliest_item_pubdate", 276 | "latest_item_pubdate", 277 | ] 278 | ordering = ["-datetime_added"] 279 | minmax_date_fields = ["earliest_item_pubdate", "latest_item_pubdate"] 280 | 281 | class filterset_class(FilterSet): 282 | title = Filter( 283 | help_text="Filter by the content in feed title. Will search for titles that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.", 284 | lookup_expr="icontains", 285 | ) 286 | description = Filter( 287 | help_text="Filter by the content in feed description. Will search for descriptions that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.", 288 | lookup_expr="icontains", 289 | ) 290 | url = Filter( 291 | help_text="Filter by the content in a feeds URL. Will search for URLs that contain the value entered. Search is wildcard so `google` will match `google.com` and `google.co.uk`.", 292 | lookup_expr="icontains", 293 | ) 294 | id = BaseCSVFilter( 295 | help_text="Filter by feed id(s), comma-separated, e.g `6c6e6448-04d4-42a3-9214-4f0f7d02694e,2bce5b30-7014-4a5d-ade7-12913fe6ac36`", 296 | lookup_expr="in", 297 | ) 298 | feed_type = filters.MultipleChoiceFilter( 299 | help_text="Filter by `feed_type`", 300 | choices=FeedType.choices, 301 | ) 302 | 303 | 304 | def get_queryset(self): 305 | return Feed.objects.all().annotate(count_of_posts=Count("posts")) 306 | 307 | @extend_schema( 308 | summary="Create a New Feed", 309 | description=textwrap.dedent( 310 | """ 311 | Use this endpoint to create to a new feed. 312 | 313 | The following key/values are accepted in the body of the request: 314 | 315 | * `url` (required): a valid RSS or ATOM feed URL (if `use_search_index` = `false`) OR the URL of the blog (if `use_search_index` = `true`). 316 | * `include_remote_blogs` (required): is a boolean setting and will ask history4feed to ignore any feeds not on the same domain as the URL of the feed. Some RSS/ATOM feeds include remote posts from other sites (e.g. for a paid promotion). 
This setting (set to `false`) allows you to ignore remote posts that do not use the same domain as the `url` used. Generally you should set `include_remote_blogs` to `false`. The one exception is when things like feed aggregators (e.g. Feedburner) URLs are used, where the actual blog posts are not on the `feedburner.com` (or whatever) domain. In this case `include_remote_blogs` should be set to `true`. 317 | * `pretty_url` (optional): you can also include a secondary URL in the database. This is designed to be used to show the link to the blog (not the RSS/ATOM feed) so that a user can navigate to the blog in their browser. 318 | * `title` (optional): the title of the feed will be used if not passed. You can also manually pass the title of the blog here. 319 | * `description` (optional): the description of the feed will be used if not passed. You can also manually pass the description of the blog here. 320 | * `use_search_index` (optional, default is `false`): If the `url` is not a valid RSS or ATOM feed you must set this mode to `true`. Set to `true`, this mode uses search results that contain the base `url` passed vs. the RSS/ATOM feed entries (when this mode is set to `false`). This mode is only able to index results in Google Search, so it can miss some sites entirely where they are not indexed by Google. You must also pass a `title` and `description` when setting this mode to `true`. Note, you can use the skeleton endpoint to create a feed manually from a non-RSS/ATOM URL or where search results do not satisfy your use case. 321 | 322 | The `id` of a Feed is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used is the feed `url` (e.g. `https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-encoded.xml` would have the id `d1d96b71-c687-50db-9d2b-d0092d1d163a`). Therefore, you cannot add a URL that already exists; you must first delete it to add it again with new settings. 323 | 324 | Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used is `<feed_id>+<link>+<pubdate>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`). 325 | 326 | The response will return the Job information responsible for getting the requested data, which you can track using the `id` returned via the GET Jobs by ID endpoint. 
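For illustration only (this snippet is not part of the API itself), the ID scheme described above can be reproduced with Python's `uuid` module. The exact values also depend on the URL normalization history4feed applies before hashing, so treat the output as indicative:

```python
import uuid

namespace = uuid.UUID("6c6e6448-04d4-42a3-9214-4f0f7d02694e")

# Feed id: UUIDv5 of the (normalized) feed url
feed_id = uuid.uuid5(namespace, "https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-encoded.xml")

# Post id: UUIDv5 of "<feed_id>+<link>+<pubdate>"
post_id = uuid.uuid5(
    namespace,
    f"{feed_id}+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z",
)
```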
327 | """ 328 | ), 329 | responses={ 330 | 201: FeedCreatedJobSerializer, 331 | 400: OpenApiResponse(CommonErrorSerializer, "Bad request", examples=[HTTP400_EXAMPLE]), 332 | 406: OpenApiResponse(CommonErrorSerializer, "Invalid feed url", examples=[OpenApiExample(name="http-406", value={"detail": "invalid feed url", "code": 406})]), 333 | }, 334 | request=FeedSerializer, 335 | ) 336 | def create(self, request: request.Request, **kwargs): 337 | 338 | job_obj = self.new_create_job(request) 339 | resp_data = self.serializer_class(job_obj.feed).data.copy() 340 | resp_data.update( 341 | job_state=job_obj.state, 342 | job_id=job_obj.id, 343 | ) 344 | return Response(resp_data, status=status.HTTP_201_CREATED) 345 | 346 | def new_create_job(self, request: request.Request): 347 | feed_data = {} 348 | s = FeedSerializer(data=request.data) 349 | s.is_valid(raise_exception=True) 350 | if s.validated_data["use_search_index"]: 351 | s = SearchIndexFeedSerializer(data=request.data) 352 | s.is_valid(raise_exception=True) 353 | feed_data.update(feed_type=FeedType.SEARCH_INDEX) 354 | else: 355 | try: 356 | feed_data = h4f.parse_feed_from_url(s.data["url"]) 357 | except Exception as e: 358 | raise serializers.InvalidFeed(s.data["url"]) 359 | 360 | for k in ['title', 'description']: 361 | if v := s.validated_data.get(k): 362 | feed_data[k] = v 363 | elif v := feed_data.get(k): 364 | feed_data[k] = v + AUTO_TITLE_TRAIL 365 | 366 | s = FeedSerializer(data={**s.data, **feed_data}) 367 | s.is_valid(raise_exception=True) 368 | 369 | feed_obj: Feed = s.save(feed_type=feed_data['feed_type']) 370 | job_obj = task_helper.new_job(feed_obj, s.validated_data.get('include_remote_blogs', False)) 371 | return job_obj 372 | 373 | @extend_schema( 374 | summary="Create a New Skeleton Feed", 375 | description=textwrap.dedent( 376 | """ 377 | Sometimes it might be the case you want to curate a blog manually using various URLs from different blogs. This is what `skeleton` feeds are designed for, allowing you to create a skeleton feed and then add posts to it manually later on using the add post manually endpoint. 378 | 379 | The following key/values are accepted in the body of the request: 380 | 381 | * `url` (required): the URL to be attached to the feed. Needs to be a URL (because this is what feed ID is generated from), however does not need to be valid. 382 | * `pretty_url` (optional): you can also include a secondary URL in the database. This is designed to be used to show the link to the blog (not the RSS/ATOM) feed so that a user can navigate to the blog in their browser. 383 | * `title` (required): the title of the feed 384 | * `description` (optional): the description of the feed 385 | 386 | The response will return the created Feed object with the Feed `id`. 387 | """ 388 | ), 389 | responses={ 390 | 201: FeedSerializer, 391 | 400: OpenApiResponse(CommonErrorSerializer, "Bad request", examples=[HTTP400_EXAMPLE]), 392 | }, 393 | request=SkeletonFeedSerializer, 394 | ) 395 | @decorators.action(methods=['POST'], detail=False) 396 | def skeleton(self, request: request.Request, **kwargs): 397 | s = SkeletonFeedSerializer(data=request.data) 398 | s.is_valid(raise_exception=True) 399 | instance = s.save() 400 | return Response(FeedSerializer(instance).data, status=status.HTTP_201_CREATED) 401 | 402 | @extend_schema( 403 | parameters=[FEED_ID_PARAM], 404 | summary="Update a Feeds Metadata", 405 | request=FeedPatchSerializer, 406 | description=textwrap.dedent( 407 | """ 408 | Update the metadata of the Feed. 
409 | 410 | Note, it is not possible to update the `url` of the feed. You must delete the Feed and add it again to modify the `url`. 411 | 412 | The following key/values are accepted in the body of the request: 413 | 414 | * `title` (optional): update the `title` of the Feed 415 | * `description` (optional): update the `description` of the Feed 416 | * `pretty_url` (optional): update the `pretty_url` of the Feed 417 | 418 | Only one key/value is required in the request. For those not passed, the current value will remain unchanged. 419 | 420 | The response will contain the newly updated Feed object. 421 | 422 | Every time the feed is updated, the `datetime_modified` property in the Feed object will be updated accordingly. 423 | """ 424 | ), 425 | responses={ 426 | 201: FeedSerializer, 427 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 428 | (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]), 429 | }, 430 | ) 431 | def partial_update(self, request, *args, **kwargs): 432 | feed_obj: Feed = self.get_object() 433 | s = FeedPatchSerializer(feed_obj, data=request.data, partial=True) 434 | s.is_valid(raise_exception=True) 435 | s.save(datetime_modified=timezone.now()) 436 | return Response(self.serializer_class(feed_obj).data, status=status.HTTP_201_CREATED) 437 | 438 | @extend_schema( 439 | parameters=[FEED_ID_PARAM], 440 | summary="Fetch Updates for a Feed", 441 | request=FeedFetchSerializer, 442 | description=textwrap.dedent( 443 | """ 444 | Use this endpoint to check for new posts on this blog since the last post time. An update request will immediately trigger a job to get the posts published between the feed's `latest_item_pubdate` and the time you make the request to this endpoint. 445 | 446 | The following key/values are accepted in the body of the request: 447 | 448 | * `include_remote_blogs` (required): is a boolean setting and will ask history4feed to ignore any feeds not on the same domain as the URL of the feed. Some feeds include remote posts from other sites (e.g. for a paid promotion). This setting (set to `false`) allows you to ignore remote posts that do not use the same domain as the `url` used. Generally you should set `include_remote_blogs` to `false`. The one exception is when things like feed aggregators (e.g. Feedburner) URLs are used, where the actual blog posts are not on the `feedburner.com` (or whatever) domain. In this case `include_remote_blogs` should be set to `true`. 449 | 450 | Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` (history4feed) and the value used is `<feed_id>+<link>+<pubdate>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`). 451 | 452 | **IMPORTANT:** this request will fail if run against a Skeleton type feed. Skeleton feeds can only be updated by adding posts to them manually using the Manually Add a Post to a Feed endpoint. 453 | 454 | **IMPORTANT:** this endpoint can miss updates that have happened to currently indexed posts (where the RSS or ATOM feed or search results do not report the updated date correctly -- which is actually very common). To solve this issue for currently indexed blog posts, use the Update a Post in a Feed endpoint directly. 
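As a minimal illustrative sketch (the host, port, API prefix and the use of the `requests` package are assumptions, not part of this codebase), a fetch request looks like:

```python
import requests  # illustrative client-side example only

resp = requests.patch(
    "http://127.0.0.1:8000/api/v1/feeds/d1d96b71-c687-50db-9d2b-d0092d1d163a/fetch/",
    json={"include_remote_blogs": False},
)
job_id = resp.json()["job_id"]  # track progress via the GET Jobs by ID endpoint
```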
455 | 456 | The response will return the Job information responsible for getting the requested data, which you can track using the `id` returned via the GET Jobs by ID endpoint. 457 | """ 458 | ), 459 | responses={ 460 | 201: FeedCreatedJobSerializer, 461 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 462 | (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]), 463 | }, 464 | ) 465 | @decorators.action(methods=["PATCH"], detail=True) 466 | def fetch(self, request, *args, **kwargs): 467 | job_obj = self.new_fetch_job(request) 468 | feed = self.serializer_class(self.get_object()).data.copy() 469 | feed.update( 470 | job_state=job_obj.state, 471 | job_id=job_obj.id, 472 | ) 473 | return Response(feed, status=status.HTTP_201_CREATED) 474 | 475 | def new_fetch_job(self, request): 476 | feed_obj: Feed = self.get_object() 477 | if feed_obj.feed_type == FeedType.SKELETON: 478 | raise validators.ValidationError(f"fetch not supported for feed of type {feed_obj.feed_type}") 479 | s = FeedFetchSerializer(feed_obj, data=request.data, partial=True) 480 | s.is_valid(raise_exception=True) 481 | s.save() 482 | return task_helper.new_job(feed_obj, s.validated_data.get('include_remote_blogs', False)) 483 | 484 | @extend_schema( 485 | summary="Search for Feeds", 486 | description=textwrap.dedent( 487 | """ 488 | Use this endpoint to get a list of all the feeds you are currently subscribed to. This endpoint is usually used to get the ID of the Feed you want to get blog post data for in a follow up request to the GET Feed Posts endpoints, or to get the status of a job related to the Feed in a follow up request to the GET Job endpoint. If you already know the id of the Feed, you can use the GET Feeds by ID endpoint. 489 | """ 490 | ), 491 | responses={ 492 | 200: FeedSerializer, 493 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 494 | }, 495 | ) 496 | def list(self, request, *args, **kwargs): 497 | return super().list(request, *args, **kwargs) 498 | 499 | @extend_schema( 500 | parameters=[FEED_ID_PARAM], 501 | summary="Get a Feed", 502 | description=textwrap.dedent( 503 | """ 504 | Use this endpoint to get information about a specific feed using its ID. You can search for a Feed ID using the GET Feeds endpoint, if required. 505 | """ 506 | ), 507 | responses={ 508 | 200: FeedSerializer, 509 | 404: OpenApiResponse(CommonErrorSerializer, "Not found", examples=[HTTP404_EXAMPLE]), 510 | }, 511 | ) 512 | def retrieve(self, request, *args, **kwargs): 513 | return super().retrieve(request, *args, **kwargs) 514 | 515 | @extend_schema( 516 | parameters=[FEED_ID_PARAM], 517 | summary="Delete a Feed", 518 | description=textwrap.dedent( 519 | """ 520 | Use this endpoint to delete a feed using its ID. This will delete all posts (items) that belong to the feed and cannot be reversed. 
521 | """ 522 | ), 523 | responses={ 524 | 204: {}, 525 | 404: OpenApiResponse( 526 | CommonErrorSerializer, 527 | "Feed does not exist", 528 | examples=[HTTP404_EXAMPLE], 529 | ), 530 | }, 531 | ) 532 | def destroy(self, request, *args, **kwargs): 533 | return super().destroy(request, *args, **kwargs) 534 | 535 | class RSSView(viewsets.GenericViewSet): 536 | class filterset_class(PostOnlyView.filterset_class): 537 | feed_id = None 538 | openapi_tags = ["Feeds"] 539 | renderer_classes=[RSSRenderer] 540 | lookup_url_kwarg = 'feed_id' 541 | 542 | @extend_schema( 543 | parameters=[FEED_ID_PARAM], 544 | filters=True, 545 | summary="RSS Feed for Feed", 546 | description=textwrap.dedent( 547 | """ 548 | Use this endpoint with your feed reader. The response of this endpoint is valid RSS XML for the Posts in the Feed. If you want more flexibility (perhaps to build a custom integration) use the JSON version of this endpoint. 549 | """ 550 | ), 551 | responses={ 552 | (200, RSSRenderer.media_type): XML_RESPONSE, 553 | (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]), 554 | (400, "application/json"): OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 555 | }, 556 | ) 557 | @decorators.action( 558 | methods=["get"], 559 | detail=True, 560 | pagination_class=XMLPostPagination("xml_posts"), 561 | ) 562 | def rss(self, request: request.Request, *args, feed_id=None, **kwargs): 563 | feed_obj = get_object_or_404(Feed, id=feed_id) 564 | queryset = self.filter_queryset(self.get_queryset()) 565 | page = self.paginate_queryset(queryset) 566 | body = build_rss.build_rss(feed_obj, page) 567 | return self.paginator.get_paginated_response(body) 568 | 569 | def get_queryset(self): 570 | return PostOnlyView.get_queryset(self).filter(feed_id=self.kwargs.get("feed_id")) 571 | 572 | 573 | 574 | @extend_schema_view( 575 | retrieve=extend_schema( 576 | parameters=[FEED_ID_PARAM, POST_ID_PARAM], 577 | summary="Get a Post in a Feed", 578 | description=textwrap.dedent( 579 | """ 580 | This will return a single Post in a Feed using its ID. It is useful if you only want to get the data for a single entry. 581 | """ 582 | ), 583 | responses={ 584 | 200: PostSerializer, 585 | 404: OpenApiResponse(CommonErrorSerializer, "Feed or post not found", examples=[HTTP404_EXAMPLE]), 586 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 587 | }, 588 | ), 589 | list=extend_schema( 590 | summary="Search for Posts in a Feed (JSON)", 591 | description=textwrap.dedent( 592 | """ 593 | Use this endpoint if you want to search through all Posts in a Feed. The response of this endpoint is JSON, and is useful if you're building a custom integration to a downstream tool. If you just want to import the data for this blog into your feed reader use the RSS version of this endpoint. 
594 | """ 595 | ), 596 | responses={ 597 | 200: PostSerializer, 598 | 404: OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]), 599 | 400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]), 600 | }, 601 | ), 602 | ) 603 | 604 | class feed_post_view( 605 | mixins.CreateModelMixin, 606 | viewsets.GenericViewSet 607 | ): 608 | 609 | openapi_tags = ["Feeds"] 610 | serializer_class = PostSerializer 611 | 612 | class filterset_class(PostOnlyView.filterset_class): 613 | feed_id = None 614 | 615 | 616 | def get_queryset(self): 617 | return PostOnlyView.get_queryset(self).filter(feed_id=self.kwargs.get("feed_id")) 618 | 619 | 620 | @extend_schema( 621 | parameters=[FEED_ID_PARAM], 622 | summary="Manually Add a Post to A Feed", 623 | description=textwrap.dedent( 624 | """ 625 | Sometimes historic posts are missed when a feed is indexed (typically when no Wayback Machine archive exists). 626 | 627 | This endpoint allows you to add Posts manually to a Feed. 628 | 629 | If the feed you want to add a post to does not already exist, you should first add it using the POST Feed or POST skeleton feed endpoints. 630 | 631 | The following key/values are accepted in the body of the request: 632 | 633 | * `link` (required - must be unique): The URL of the blog post. This is where the content of the post is found. It cannot be the same as the `url` of a post already in this feed. If you want to update the post, use the PATCH post endpoint. 634 | * `pubdate` (required): The date of the blog post in the format `YYYY-MM-DDTHH:MM:SS.sssZ`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually. 635 | * `title` (required): history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually. 636 | * `author` (optional): the value to be stored for the author of the post. 637 | * `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`. 638 | 639 | The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint. 640 | 641 | Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used `++` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`). 
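As an illustrative sketch (the URL and values below are made up, not taken from a real blog), the request body has the following shape, matching `CreatePostsSerializer`:

```python
# Hypothetical request body for this endpoint; values are illustrative only.
body = {
    "posts": [
        {
            "title": "Example post",
            "link": "https://example.com/blog/example-post/",
            "pubdate": "2024-08-20T10:00:00.000Z",
            "author": "Example Author",        # optional
            "categories": ["tag1", "tag2"],    # optional
        }
    ]
}
```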
642 | 643 | _Note: We do have a proof-of-concept to scrape a site for all blog post urls, titles, and pubdate called [sitemap2posts](https://github.com/muchdogesec/sitemap2posts) which can help form the request body needed for this endpoint._ 644 | """ 645 | ), 646 | responses={ 647 | 201: PostJobSerializer, 648 | 404: OpenApiResponse(CommonErrorSerializer, "Feed does not exist", examples=[HTTP404_EXAMPLE]), 649 | }, 650 | request=CreatePostsSerializer, 651 | ) 652 | def create(self, request, *args, feed_id=None, **kwargs): 653 | job_obj = self.new_create_post_job(request, feed_id) 654 | job_resp = JobSerializer(job_obj).data.copy() 655 | # job_resp.update(post_id=post.id) 656 | return Response(job_resp, status=status.HTTP_201_CREATED) 657 | 658 | def new_create_post_job(self, request, feed_id): 659 | feed_obj = get_object_or_404(Feed, id=feed_id) 660 | data = dict(request.data) #, feed_id=feed_id, feed=feed_id) 661 | 662 | s = CreatePostsSerializer(data=data, context=dict(feed_id=feed_id)) 663 | s.is_valid(raise_exception=True) 664 | 665 | posts = s.save(added_manually=True, deleted_manually=False) 666 | 667 | job_obj = task_helper.new_patch_posts_job(feed_obj, posts) 668 | return job_obj 669 | 670 | 671 | @extend_schema( 672 | summary="Update all Posts in a Feed", 673 | description=textwrap.dedent( 674 | """ 675 | This endpoint will reindex the Post content (`description`) for all Post IDs currently listed in the Feed. 676 | 677 | This request will only change the content (`description`) stored for the Post ID. It will not update the title, pubdate, author, or categories. If you need to update these properties you can use the Update Post Metadata endpoint. 678 | 679 | Note, if you only want to update the content of a single post, it is much more effecient to use the Update a Post in a Feed endpoint. 680 | """ 681 | ), 682 | responses={ 683 | 201: PostJobSerializer, 684 | 404: OpenApiResponse(CommonErrorSerializer, "Feed does not exist", examples=[HTTP404_EXAMPLE]), 685 | }, 686 | request={}, 687 | ) 688 | @decorators.action(methods=["PATCH"], detail=False, url_path='reindex') 689 | def reindex_feed(self, request, *args, feed_id=None, **kwargs): 690 | job_obj = self.new_reindex_feed_job(feed_id) 691 | job_resp = JobSerializer(job_obj).data.copy() 692 | # job_resp.update(post_id=post.id) 693 | return Response(job_resp, status=status.HTTP_201_CREATED) 694 | 695 | def new_reindex_feed_job(self, feed_id): 696 | posts = self.get_queryset().all() 697 | feed_obj = get_object_or_404(Feed, id=feed_id) 698 | 699 | job_obj = task_helper.new_patch_posts_job(feed_obj, posts) 700 | return job_obj 701 | 702 | 703 | class FeedPostView( 704 | feed_post_view 705 | ): 706 | pass 707 | 708 | class JobView( 709 | mixins.RetrieveModelMixin, mixins.ListModelMixin, viewsets.GenericViewSet 710 | ): 711 | serializer_class = JobSerializer 712 | pagination_class = Pagination("jobs") 713 | filter_backends = [DjangoFilterBackend, Ordering] 714 | ordering_fields = ["run_datetime", "state"] 715 | ordering = "run_datetime_descending" 716 | openapi_tags = ["Jobs"] 717 | lookup_url_kwarg = "job_id" 718 | lookup_field = "id" 719 | 720 | class filterset_class(FilterSet): 721 | feed_id = Filter( 722 | help_text="Filter Jobs by the ID of the Feed they belong to. You can search for Feed IDs using the GET Feeds endpoints. Note a Feed can have multiple jobs associated with it where a PATCH request has been run to update the Feed. e.g. 
`6c6e6448-04d4-42a3-9214-4f0f7d02694e`" 723 | ) 724 | state = Filter(help_text="Filter by the status of a Job") 725 | post_id = UUIDFilter(help_text="Filter Jobs by the ID of the Post they belong to. You can search for Post IDs using the GET Posts endpoint. Note a Post can have multiple jobs associated with it where a PATCH request has been run to update a Feed or a Post. e.g `797e94b1-efdc-4e66-a748-f2b6a5896a89`", field_name="fulltext_jobs__post_id") 726 | 727 | def get_queryset(self): 728 | return Job.objects.all().annotate(count_of_items=Count("fulltext_jobs")) 729 | 730 | def filter_queryset(self, queryset): 731 | return super().filter_queryset(queryset) 732 | 733 | @extend_schema( 734 | summary="Search Jobs", 735 | description=textwrap.dedent( 736 | """ 737 | Jobs track the status of the request to get posts for Feeds. For every new Feed added and every update to a Feed requested a job will be created. The `id` of a job is printed in the POST and PATCH responses respectively, but you can use this endpoint to search for the id again, if required. 738 | """ 739 | ), 740 | responses={ 741 | 200: JobSerializer, 742 | 400: OpenApiResponse( 743 | CommonErrorSerializer, 744 | "Request not understood", 745 | [HTTP400_EXAMPLE], 746 | ), 747 | }, 748 | ) 749 | def list(self, request, *args, **kwargs): 750 | return super().list(request, *args, **kwargs) 751 | 752 | @extend_schema( 753 | parameters=[JOB_ID_PARAM], 754 | summary="Get a Job", 755 | description=textwrap.dedent( 756 | """ 757 | Using a Job ID you can retrieve information about its state via this endpoint. This is useful to see if a Job to get data is complete, how many posts were imported in the job, or if an error has occurred. 758 | """ 759 | ), 760 | responses={ 761 | 200: JobSerializer, 762 | 404: OpenApiResponse( 763 | CommonErrorSerializer, 764 | "Job not found", 765 | [HTTP404_EXAMPLE], 766 | ), 767 | }, 768 | ) 769 | def retrieve(self, request, *args, **kwargs): 770 | return super().retrieve(request, *args, **kwargs) 771 | 772 | 773 | @extend_schema( 774 | parameters=[JOB_ID_PARAM], 775 | summary="Kill a running Job that is indexing Posts", 776 | description=textwrap.dedent( 777 | """ 778 | Using a Job ID you can kill it whilst it is still in `running` or `pending` state. 779 | 780 | If any posts have already been downloaded before the job is complete, they will still remain and you will need to delete them using the delete endpoints manually. 781 | 782 | The job will enter `cancelled` state when cancelled. 783 | """ 784 | ), 785 | responses={ 786 | 204: {}, 787 | 404: OpenApiResponse( 788 | CommonErrorSerializer, 789 | "Job not found", 790 | [HTTP404_EXAMPLE], 791 | ), 792 | }, 793 | ) 794 | @decorators.action(methods=['DELETE'], detail=True, url_path="kill") 795 | def cancel_job(self, request, *args, **kwargs): 796 | obj: Job = self.get_object() 797 | obj.cancel() 798 | return Response(status=status.HTTP_204_NO_CONTENT) 799 | -------------------------------------------------------------------------------- /history4feed/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for history4feed project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /history4feed/h4fscripts/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | LOG_PRINT = 105 3 | 4 | def newLogger(name: str) -> logging.Logger: 5 | # Configure logging 6 | logging.addLevelName(LOG_PRINT, "LOG") 7 | stream_handler = logging.StreamHandler() # Log to stdout and stderr 8 | stream_handler.setLevel(logging.INFO) 9 | logging.basicConfig( 10 | level=logging.INFO, 11 | format=f"%(asctime)s [%(levelname)s] %(message)s", 12 | handlers=[stream_handler], 13 | datefmt='%d-%b-%y %H:%M:%S' 14 | ) 15 | logger = logging.getLogger("history4feed") 16 | logger.print = lambda msg: logger.log(LOG_PRINT, msg) 17 | logger.print("=====================history4feed======================") 18 | 19 | return logger 20 | 21 | logger = newLogger("h4f-logger") -------------------------------------------------------------------------------- /history4feed/h4fscripts/build_rss.py: -------------------------------------------------------------------------------- 1 | from .xml_utils import createRSSHeader, createCDataElement, createTextElement 2 | from ..app.models import Feed, Post 3 | from django.db.models.manager import BaseManager 4 | from xml.dom.minidom import Document 5 | 6 | 7 | def build_rss(feed_obj: Feed, posts_set: BaseManager[Post]): 8 | document, channel = createRSSHeader(feed_obj.title, feed_obj.description, feed_obj.url, feed_obj.latest_item_pubdate) 9 | for post in posts_set: 10 | channel.appendChild(build_entry_element(post, document)) 11 | 12 | return document.toprettyxml() 13 | 14 | def build_entry_element(post: Post, d: Document): 15 | element = d.createElement('item') 16 | element.appendChild(createTextElement(d, "title", post.title)) 17 | 18 | link = createTextElement(d, "link", post.link) 19 | link.setAttribute("href", post.link) 20 | element.appendChild(link) 21 | element.appendChild(createTextElement(d, "pubDate", post.pubdate.isoformat())) 22 | if post.description: 23 | description = post.description 24 | description = description 25 | element.appendChild(createTextElement(d, "description", description)) 26 | 27 | for category in post.categories.all(): 28 | element.appendChild(createTextElement(d, "category", category.name)) 29 | 30 | if post.author: 31 | author = d.createElement('author') 32 | author.appendChild(createTextElement(d, "name", post.author)) 33 | element.appendChild(author) 34 | return element -------------------------------------------------------------------------------- /history4feed/h4fscripts/celery.py: -------------------------------------------------------------------------------- 1 | import os 2 | from celery import Celery 3 | # Set the default Django settings module for the 'celery' program. 4 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings') 5 | 6 | app = Celery('history4feed') 7 | 8 | 9 | app.config_from_object('os:environ', namespace='CELERY') 10 | 11 | # Load task modules from all registered Django apps. 
12 | app.autodiscover_tasks() -------------------------------------------------------------------------------- /history4feed/h4fscripts/exceptions.py: -------------------------------------------------------------------------------- 1 | class history4feedException(Exception): 2 | pass 3 | class UnknownFeedtypeException(history4feedException): 4 | pass 5 | class ParseArgumentException(history4feedException): 6 | pass 7 | class FetchRedirect(history4feedException): 8 | pass 9 | 10 | class ScrapflyError(Exception): 11 | pass -------------------------------------------------------------------------------- /history4feed/h4fscripts/h4f.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import time 3 | from io import BytesIO, StringIO 4 | from xml.dom.minidom import Element, parse 5 | import os 6 | from history4feed.app.settings import history4feed_server_settings as settings 7 | import requests 8 | from dateutil.parser import parse as parse_date 9 | from readability import Document as ReadabilityDocument 10 | import brotli 11 | from types import SimpleNamespace 12 | from .import logger 13 | from .xml_utils import getAtomLink, getFirstChildByTag, getFirstElementByTag, getText 14 | from .exceptions import history4feedException, UnknownFeedtypeException, FetchRedirect, ScrapflyError 15 | import fake_useragent 16 | from urllib.parse import urljoin 17 | 18 | def fetch_page_with_retries(url, retry_count=3, sleep_seconds=settings.WAYBACK_SLEEP_SECONDS, **kwargs): 19 | ua = fake_useragent.UserAgent() 20 | session = requests.Session() 21 | session.max_redirects = 3 22 | headers = kwargs.get('headers', {}) 23 | headers.update({ 24 | "User-Agent": ua.random, 25 | }) 26 | kwargs.update(headers=headers) 27 | error = None 28 | for i in range(retry_count+1): 29 | try: 30 | if i > 0: 31 | time.sleep(sleep_seconds * 1.5 ** (i-1)) 32 | return fetch_page(session, url, **kwargs) 33 | except FatalError: 34 | raise 35 | except BaseException as e: 36 | error = e 37 | print(error) 38 | raise ConnectionError(f"could not fetch page after {retry_count} retries") from error 39 | 40 | class FatalError(Exception): 41 | pass 42 | 43 | def fetch_page(session, url, headers=None) -> tuple[bytes, str, str]: 44 | proxy_apikey = os.getenv("SCRAPFLY_APIKEY") 45 | headers = headers or {} 46 | 47 | if proxy_apikey: 48 | logger.info(f"Fetching `{url}` via scrapfly.io") 49 | headers = dict((f"headers[{k}]", v) for k, v in headers.items()) 50 | resp = session.get("https://api.scrapfly.io/scrape", params=dict(**headers, key=proxy_apikey, url=url, country="us,ca,mx,gb,fr,de,au,at,be,hr,cz,dk,ee,fi,ie,se,es,pt,nl")) 51 | json_data = resp.json() 52 | if resp.status_code != 200: 53 | raise ScrapflyError(json_data) 54 | result = SimpleNamespace(**json_data['result']) 55 | if result.status_code > 499: 56 | raise FatalError(f"Got server error {result.status_code}, stopping") 57 | if result.status_code > 399: 58 | raise history4feedException(f"PROXY_GET Request failed for `{url}`, status: {result.status_code}, reason: {result.status}") 59 | elif result.status_code > 299: 60 | raise FetchRedirect(f"PROXY_GET for `{url}` redirected, status: {result.status_code}, reason: {result.status}") 61 | return result.content.encode(), result.content_type, result.url 62 | 63 | logger.info(f"Fetching `{url}`") 64 | resp: requests.Response = session.get(url, headers=headers) 65 | content = resp.content 66 | if not resp.ok: 67 | raise history4feedException(f"GET Request failed for `{url}`, 
status: {resp.status_code}, reason: {resp.reason}") 68 | 69 | # some times, wayback returns br encoding, try decompressing 70 | try: 71 | content = brotli.decompress(content) 72 | except Exception as err: 73 | logger.print(f"brotli decompress fail: {err}") 74 | return content, resp.headers.get("content-type"), resp.url 75 | 76 | def parse_feed_from_url(url): 77 | data, content_type, url = fetch_page_with_retries(url, retry_count=0) 78 | return parse_feed_from_content(data, url) 79 | 80 | 81 | @dataclass 82 | class PostDict: 83 | link: str 84 | title: str 85 | pubdate: str 86 | author: str = None 87 | categories: list[str] = None 88 | description: str = "EMPTY BODY" 89 | content_type: str = "text/html" 90 | 91 | def parse_feed_from_content(data: bytes, url: str): 92 | feed_data = {} 93 | try: 94 | if isinstance(data, str): 95 | document = parse(StringIO(data)) 96 | else: 97 | document = parse(BytesIO(data)) 98 | # check if it's atom or rss 99 | if rss := getFirstElementByTag(document, "rss"): 100 | channel = getFirstElementByTag(rss, "channel") 101 | feed_data['description'] = getText(getFirstElementByTag(channel, "description")) 102 | feed_data['title'] = getText(getFirstElementByTag(channel, "title")) 103 | # feed_data['rel'] = getText(getFirstElementByTag(channel, "link")) 104 | 105 | feed_data["feed_type"] = "rss" 106 | elif feed := getFirstElementByTag(document, "feed"): 107 | feed_data['description'] = getText(getFirstElementByTag(feed, "subtitle")) 108 | feed_data['title'] = getText(getFirstElementByTag(feed, "title")) 109 | # feed_data['rel'] = getAtomLink(feed) 110 | 111 | feed_data["feed_type"] = "atom" 112 | else: 113 | raise UnknownFeedtypeException("feed is neither RSS or ATOM") 114 | feed_data["url"] = url 115 | return feed_data 116 | except BaseException as e: 117 | raise UnknownFeedtypeException(f"Failed to parse feed from `{url}`") from e 118 | 119 | def get_publish_date(item): 120 | published = getFirstElementByTag(item, "published") 121 | if not published: 122 | published = getFirstElementByTag(item, "pubDate") 123 | return parse_date(getText(published)) 124 | 125 | def get_categories(entry: Element) -> list[str]: 126 | categories = [] 127 | for category in entry.getElementsByTagName('category'): 128 | cat = category.getAttribute('term') or getText(category) 129 | if not cat: 130 | cat = category 131 | categories.append(cat) 132 | return categories 133 | 134 | def get_author(item): 135 | author = getFirstElementByTag(item, "dc:creator") 136 | if not author: 137 | author = getFirstElementByTag(item, "author") 138 | author = getFirstElementByTag(author, "name") or author 139 | return getText(author) 140 | 141 | 142 | def parse_items(elem, link): 143 | return PostDict( 144 | # element = elem, 145 | link = link, 146 | title = getText(getFirstElementByTag(elem, "title")), 147 | pubdate = get_publish_date(elem), 148 | author = get_author(elem), 149 | categories = get_categories(elem), 150 | description="", 151 | content_type="plain/text", 152 | ) 153 | 154 | def parse_posts_from_rss_feed(base_url, data) -> dict[str, PostDict]: 155 | entries = {} 156 | document = parse(BytesIO(data)) 157 | channel = getFirstElementByTag(document, "channel") 158 | 159 | for item in channel.getElementsByTagName("item"): 160 | link = urljoin(base_url, getText(getFirstElementByTag(item, "link")).strip()) 161 | entries[link] = parse_items(item, link) 162 | entries[link].description = parse_rss_description(item) 163 | return entries 164 | 165 | def parse_posts_from_atom_feed(base_url, data): 166 | 
entries = {} 167 | document = parse(BytesIO(data)) 168 | 169 | for item in document.getElementsByTagName("entry"): 170 | link = urljoin(base_url, getAtomLink(item, rel='alternate')) 171 | entries[link] = parse_items(item, link) 172 | entries[link].description, content_type = parse_atom_description(item) 173 | if content_type: 174 | entries[link].content_type = content_type 175 | return entries 176 | 177 | def parse_atom_description(item: Element): 178 | description = "" 179 | if summary := getFirstChildByTag(item, "summary"): 180 | description = getText(summary) 181 | if content := getFirstChildByTag(item, "content"): 182 | description = getText(content) 183 | return description, None 184 | 185 | def parse_rss_description(item: Element): 186 | return getText(getFirstChildByTag(item, "description")) 187 | 188 | 189 | def is_valid_atom_feed(xml): 190 | pass 191 | 192 | def is_valid_atom_feed(xml): 193 | pass 194 | 195 | def get_full_text(link): 196 | try: 197 | page, content_type, url = fetch_page_with_retries(link) 198 | doc = ReadabilityDocument(page, url=url) 199 | return doc.summary(), content_type 200 | except BaseException as e: 201 | raise history4feedException(f"Error processing fulltext: {e}") from e 202 | 203 | -------------------------------------------------------------------------------- /history4feed/h4fscripts/sitemap_helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import time 5 | from collections import namedtuple 6 | from urllib.parse import urlencode 7 | from .h4f import FatalError, PostDict, fetch_page_with_retries 8 | from history4feed.app.settings import history4feed_server_settings as settings 9 | import requests 10 | from datetime import UTC, datetime as dt 11 | import time 12 | import requests 13 | from datetime import datetime as dt, date, timedelta 14 | from dateparser import parse as parse_date 15 | DEFAULT_USER_AGENT = "curl" 16 | 17 | class SearchIndexError(FatalError): 18 | pass 19 | 20 | def fetch_posts_links_with_serper(site, from_time: dt, to_time: dt = None) -> dict[str, PostDict]: 21 | s = requests.Session() 22 | s.headers.update({ 23 | 'X-API-KEY': os.getenv("SERPER_API_KEY"), 24 | 'Content-Type': 'application/json' 25 | }) 26 | 27 | params = dict(num=100, page=1) 28 | entries: dict[str, PostDict] = {} 29 | to_time = to_time or dt.now(UTC) 30 | if not to_time.tzinfo: 31 | to_time = to_time.replace(tzinfo=UTC) 32 | 33 | frame_start = from_time - timedelta(days=1) 34 | credits_used = 0 35 | 36 | while frame_start < to_time: 37 | frame_end = frame_start + timedelta(days=100) 38 | params.update(q=f"site:{site} after:{frame_start.date().isoformat()} before:{frame_end.date().isoformat()}", page=1) 39 | while True: 40 | resp = s.get("https://google.serper.dev/search", params=params) 41 | if not resp.ok: 42 | raise SearchIndexError(f"Serper Request GOT {resp.status_code}: {resp.text}") 43 | data = resp.json() 44 | credits_used += data['credits'] 45 | for d in data['organic']: 46 | date = d.get('date') 47 | if date: 48 | date = parse_date(date) 49 | else: 50 | date = min(frame_end, to_time) 51 | post = PostDict(link=d['link'], title=d['title'], pubdate=date, categories=[]) 52 | entries[post.link] = post 53 | params['page'] += 1 54 | if len(data['organic']) < params['num']: 55 | break 56 | frame_start = frame_end - timedelta(days=1) 57 | logging.info(f"got {len(entries)} posts between {from_time} and {to_time}, used {credits_used} credits") 58 | return entries 59 | 60 | 
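# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It assumes
# SERPER_API_KEY is exported and uses "example.com" as a stand-in domain; in
# history4feed itself this function is called by
# task_helper.retrieve_posts_from_links for feeds of type search_index.
# Each page of results consumes Serper credits, as tracked above.
if __name__ == "__main__":
    found = fetch_posts_links_with_serper("example.com", from_time=dt(2024, 1, 1, tzinfo=UTC))
    for link, post in found.items():
        print(post.pubdate, post.title, link)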
-------------------------------------------------------------------------------- /history4feed/h4fscripts/task_helper.py: -------------------------------------------------------------------------------- 1 | import time 2 | from celery import shared_task, Task as CeleryTask 3 | import celery 4 | from celery.result import ResultSet, AsyncResult 5 | import redis 6 | 7 | from history4feed.h4fscripts.sitemap_helpers import fetch_posts_links_with_serper 8 | 9 | from ..app import models 10 | from . import h4f, wayback_helpers, logger, exceptions 11 | from datetime import UTC, datetime 12 | from history4feed.app.settings import history4feed_server_settings as settings 13 | 14 | from urllib.parse import urlparse 15 | from contextlib import contextmanager 16 | from django.core.cache import cache 17 | from rest_framework.exceptions import APIException, Throttled 18 | from django.db import transaction 19 | 20 | LOCK_EXPIRE = 60 * 60 21 | 22 | def get_lock_id(feed: models.Feed): 23 | lock_id = f"feed-lock-{feed.id}" 24 | logger.debug("using lock id %s", lock_id) 25 | return lock_id 26 | 27 | def queue_lock(feed: models.Feed, job=None): 28 | lock_value = dict(feed_id=str(feed.id)) 29 | if job: 30 | lock_value["job_id"] = str(job.id) 31 | 32 | status = cache.add(get_lock_id(feed), lock_value, timeout=LOCK_EXPIRE) 33 | return status 34 | 35 | 36 | 37 | def new_job(feed: models.Feed, include_remote_blogs): 38 | with transaction.atomic(): 39 | job_obj = models.Job.objects.create( 40 | feed=feed, 41 | earliest_item_requested=feed.latest_item_pubdate or settings.EARLIEST_SEARCH_DATE, 42 | latest_item_requested=datetime.now(UTC), 43 | include_remote_blogs=include_remote_blogs, 44 | ) 45 | if not queue_lock(feed, job_obj): 46 | raise Throttled(detail={"message": "A job is already running for this feed", **cache.get(get_lock_id(feed))}) 47 | 48 | (start_job.s(job_obj.pk)| retrieve_posts_from_links.s(job_obj.pk) | wait_for_all_with_retry.s() | collect_and_schedule_removal.si(job_obj.pk)).apply_async(countdown=5, link_error=error_handler.s(job_obj.pk)) 49 | return job_obj 50 | 51 | def new_patch_posts_job(feed: models.Feed, posts: list[models.Post], include_remote_blogs=True): 52 | job_obj = models.Job.objects.create( 53 | feed=posts[0].feed, 54 | state=models.JobState.PENDING, 55 | include_remote_blogs=include_remote_blogs, 56 | ) 57 | ft_jobs = [models.FulltextJob.objects.create( 58 | job_id=job_obj.id, 59 | post_id=post.id, 60 | link=post.link, 61 | ) for post in posts] 62 | chain = celery.chain([retrieve_full_text.si(ft_job.pk) for ft_job in ft_jobs]) 63 | ( start_post_job.si(job_obj.id) | chain | collect_and_schedule_removal.si(job_obj.pk)).apply_async(link_error=error_handler.s(job_obj.pk), countdown=5) 64 | return job_obj 65 | 66 | @shared_task(bind=True, default_retry_delay=10) 67 | def start_post_job(self: CeleryTask, job_id): 68 | job = models.Job.objects.get(pk=job_id) 69 | if job.is_cancelled(): 70 | job.info = "job cancelled while in queue" 71 | job.save() 72 | return False 73 | if not queue_lock(job.feed, job): 74 | return self.retry(max_retries=360) 75 | job.state = models.JobState.RUNNING 76 | job.save() 77 | return True 78 | 79 | @shared_task 80 | def start_job(job_id): 81 | job = models.Job.objects.get(pk=job_id) 82 | feed = job.feed 83 | job.state = models.JobState.RUNNING 84 | job.save() 85 | try: 86 | if feed.feed_type == models.FeedType.SEARCH_INDEX: 87 | return [feed.url] 88 | return wayback_helpers.get_wayback_urls(feed.url, job.earliest_item_requested, job.latest_item_requested) 89 | 
except BaseException as e: 90 | job.state = models.JobState.FAILED 91 | job.info = str(e) 92 | job.save() 93 | return [] 94 | 95 | @shared_task(bind=True, default_retry_delay=10) 96 | def wait_for_all_with_retry(self, result_ids): 97 | if not result_ids: 98 | return [] 99 | result_set = ResultSet([AsyncResult(task_id) for task_id in result_ids]) 100 | if not result_set.ready(): 101 | return self.retry(max_retries=360) 102 | return result_ids 103 | 104 | @shared_task 105 | def retrieve_posts_from_links(urls, job_id): 106 | if not urls: 107 | return [] 108 | full_text_chain = models.Job.objects.get(pk=job_id) 109 | feed = full_text_chain.feed 110 | chains = [] 111 | parsed_feed = {} 112 | job = models.Job.objects.get(id=job_id) 113 | for index, url in enumerate(urls): 114 | if job.is_cancelled(): 115 | break 116 | error = None 117 | if feed.feed_type == models.FeedType.SEARCH_INDEX: 118 | start_time = feed.freshness or settings.EARLIEST_SEARCH_DATE 119 | if not start_time.tzinfo: 120 | start_time = start_time.replace(tzinfo=UTC) 121 | crawled_posts = fetch_posts_links_with_serper(url, from_time=start_time, to_time=job.run_datetime) 122 | posts = [add_new_post(feed, job, post_dict) for post_dict in crawled_posts.values()] 123 | else: 124 | parsed_feed, posts, error = retrieve_posts_from_url(url, feed, job) 125 | if error: 126 | logger.exception(error) 127 | continue 128 | if not posts: 129 | logger.warning('no new post in `%s`', url) 130 | continue 131 | 132 | chain_tasks = [] 133 | for post in posts: 134 | ftjob_entry = models.FulltextJob.objects.create( 135 | job_id=job_id, 136 | post_id=post.id, 137 | link=post.link, 138 | ) 139 | chain_tasks.append(retrieve_full_text.si(ftjob_entry.pk)) 140 | full_text_chain = celery.chain(chain_tasks) 141 | chains.append(full_text_chain.apply_async()) 142 | 143 | if parsed_feed: 144 | feed.set_description(parsed_feed['description']) 145 | feed.set_title(parsed_feed['title']) 146 | feed.freshness = job.run_datetime 147 | 148 | feed.save() 149 | logger.info("====\n"*5) 150 | return [result.id for result in chains] 151 | 152 | class JobCancelled(Exception): 153 | pass 154 | 155 | @shared_task(bind=True) 156 | def collect_and_schedule_removal(sender, job_id): 157 | logger.print(f"===> {sender=}, {job_id=} ") 158 | job = models.Job.objects.get(pk=job_id) 159 | remove_lock(job) 160 | if job.state == models.JobState.RUNNING: 161 | job.state = models.JobState.SUCCESS 162 | job.save() 163 | 164 | def remove_lock(job): 165 | if cache.delete(get_lock_id(job.feed)): 166 | logger.debug("lock deleted") 167 | else: 168 | logger.debug("Failed to remove lock") 169 | 170 | def retrieve_posts_from_url(url, db_feed: models.Feed, job: models.Job): 171 | back_off_seconds = settings.WAYBACK_SLEEP_SECONDS 172 | all_posts: list[models.Post] = [] 173 | error = None 174 | parsed_feed = {} 175 | for i in range(settings.REQUEST_RETRY_COUNT): 176 | if i != 0: 177 | time.sleep(back_off_seconds) 178 | try: 179 | if job.is_cancelled(): 180 | raise JobCancelled("job was terminated by user") 181 | data, content_type, url = h4f.fetch_page_with_retries(url) 182 | parsed_feed = h4f.parse_feed_from_content(data, url) 183 | if parsed_feed['feed_type'] == models.FeedType.ATOM: 184 | posts = h4f.parse_posts_from_atom_feed(url, data) 185 | elif parsed_feed['feed_type'] == models.FeedType.RSS: 186 | posts = h4f.parse_posts_from_rss_feed(url, data) 187 | else: 188 | raise exceptions.UnknownFeedtypeException("unknown feed type `{}` at {}".format(parsed_feed['feed_type'], url)) 189 | for post_dict in 
posts.values(): 190 | # make sure that post and feed share the same domain 191 | post = add_new_post(db_feed, job, post_dict) 192 | if not post: 193 | continue 194 | all_posts.append(post) 195 | db_feed.save() 196 | logger.info(f"saved {len(posts)} posts for {url}") 197 | break 198 | except ConnectionError as e: 199 | logger.error(e, exc_info=True) 200 | error = e 201 | logger.info(f"job with url {url} ran into an issue {e}, backing off for {back_off_seconds} seconds") 202 | back_off_seconds *= 1.2 203 | except BaseException as e: 204 | logger.error(e, exc_info=True) 205 | error = e 206 | break 207 | return parsed_feed, all_posts, error 208 | 209 | def add_new_post(db_feed: models.Feed, job: models.Job, post_dict: h4f.PostDict): 210 | # make sure that post and feed share the same domain 211 | if job.should_skip_post(post_dict.link): 212 | models.FulltextJob.objects.create( 213 | job_id=job.id, 214 | status=models.FullTextState.SKIPPED, 215 | link=post_dict.link, 216 | ) 217 | return None 218 | categories = post_dict.categories 219 | del post_dict.categories 220 | post, created = models.Post.objects.get_or_create(defaults=post_dict.__dict__, feed=db_feed, link=post_dict.link) 221 | if not created or post.deleted_manually: 222 | return None 223 | 224 | post.save() 225 | post.add_categories(categories) 226 | return post 227 | 228 | @shared_task(bind=True) 229 | def retrieve_full_text(self, ftjob_pk): 230 | fulltext_job = models.FulltextJob.objects.get(pk=ftjob_pk) 231 | try: 232 | if fulltext_job.is_cancelled(): 233 | raise JobCancelled() 234 | else: 235 | fulltext_job.post.description, fulltext_job.post.content_type = h4f.get_full_text(fulltext_job.post.link) 236 | fulltext_job.status = models.FullTextState.RETRIEVED 237 | fulltext_job.error_str = "" 238 | fulltext_job.post.is_full_text = True 239 | except JobCancelled: 240 | fulltext_job.status = models.FullTextState.CANCELLED 241 | fulltext_job.error_str = "job cancelled while retrieving fulltext" 242 | except BaseException as e: 243 | fulltext_job.error_str = str(e) 244 | fulltext_job.status = models.FullTextState.FAILED 245 | fulltext_job.save() 246 | fulltext_job.post.save() 247 | logger.print(f"{self}") 248 | 249 | 250 | 251 | from celery import signals 252 | @signals.worker_ready.connect 253 | def mark_old_jobs_as_failed(**kwargs): 254 | models.Job.objects.filter(state__in=[models.JobState.PENDING, models.JobState.RUNNING]).update(state=models.JobState.CANCELLED, info="job cancelled automatically on server startup") 255 | 256 | @shared_task 257 | def error_handler(request, exc: Exception, traceback, job_id): 258 | job = models.Job.objects.get(pk=job_id) 259 | job.state = models.JobState.FAILED 260 | job.info = f"job failed: {exc}" 261 | job.save() 262 | remove_lock(job) 263 | logger.error('Job {3} with task_id {0} raised exception: {1!r}\n{2!r}'.format( 264 | request.id, exc, traceback, job_id)) -------------------------------------------------------------------------------- /history4feed/h4fscripts/wayback_helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from datetime import datetime as dt, UTC 4 | from collections import namedtuple 5 | from urllib.parse import urlencode 6 | from .h4f import FatalError, fetch_page_with_retries 7 | from history4feed.app.settings import history4feed_server_settings as settings 8 | 9 | DEFAULT_USER_AGENT = "curl" 10 | 11 | 12 | CDXSearchResult = namedtuple("CDXSearchResult", ["urlkey", "timestamp", "original_url", 
"mimetype", "statuscode", "digest", "length"]) 13 | 14 | def cdx_search(url, earliest: dt, latest: dt=None, retry_count=3, sleep_seconds=settings.WAYBACK_SLEEP_SECONDS, user_agent="curl") -> list[CDXSearchResult]: 15 | latest = latest or dt.now(UTC) 16 | query = urlencode([ 17 | ("from", as_wayback_date(earliest)), 18 | ("to", as_wayback_date(latest)), 19 | ("url", url), 20 | ("filter", "statuscode:200"), 21 | ("output", "json"), 22 | ("collapse", "digest"), 23 | ]) 24 | 25 | headers = {} 26 | 27 | error = None 28 | 29 | for i in range(retry_count+1): 30 | if i > 0: 31 | time.sleep(sleep_seconds * 1.5**(i-1)) 32 | try: 33 | res, content_type, _ = fetch_page_with_retries(f"http://web.archive.org/cdx/search/cdx?{query}", headers=headers) 34 | res_json = json.loads(res) 35 | error = None 36 | break 37 | except FatalError: 38 | return [] 39 | except BaseException as e: 40 | error = e 41 | continue 42 | if error: 43 | raise error 44 | out = {} 45 | for v in res_json[1:]: 46 | try: 47 | v[6] = int(v[6]) 48 | v[4] = int(v[4]) 49 | v = CDXSearchResult(*v) 50 | out[v.digest] = v 51 | except: 52 | pass 53 | return list(out.values()) 54 | 55 | def as_wayback_date(date: dt) -> str: 56 | return date.strftime('%Y%m%d') 57 | 58 | def get_wayback_urls(url, from_date, to_date=None): 59 | to_date = to_date or dt.now(UTC) 60 | urls = [] 61 | results = cdx_search(url, from_date, to_date) 62 | for result in results: 63 | urls.append(f"https://web.archive.org/web/{result.timestamp}id_/{result.original_url}") 64 | urls.append(url) 65 | return urls -------------------------------------------------------------------------------- /history4feed/h4fscripts/xml_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timezone 2 | from xml.dom.minidom import Document, Element 3 | 4 | 5 | def createTextElement(document: Document, tagName, text): 6 | el = document.createElement(tagName) 7 | txtNode = document.createTextNode(text or "") 8 | el.appendChild(txtNode) 9 | return el 10 | 11 | def createCDataElement(document: Document, tagName, text): 12 | el = document.createElement(tagName) 13 | txtNode = document.createCDATASection(text or "") 14 | el.appendChild(txtNode) 15 | return el 16 | 17 | def createRSSHeader(title, description, url, last_build_date=None): 18 | last_build_date = last_build_date or datetime.now(timezone.utc) 19 | d = Document() 20 | rss = d.createElement("rss") 21 | d.appendChild(rss) 22 | rss.setAttribute("version", "2.0") 23 | channel = d.createElement("channel") 24 | rss.appendChild(channel) 25 | channel.appendChild(createTextElement(d, "title", title)) 26 | channel.appendChild(createTextElement(d, "description", description)) 27 | channel.appendChild(createTextElement(d, "link", url)) 28 | channel.appendChild(createTextElement(d, "lastBuildDate", last_build_date.isoformat())) 29 | # channel.appendChild(createTextElement(d, "generator", LINK_TO_SELF)) 30 | return d, channel 31 | 32 | 33 | def getText(nodelist: list[Element]): 34 | if not nodelist: 35 | return '' 36 | if not isinstance(nodelist, list): 37 | nodelist = nodelist.childNodes 38 | rc = [] 39 | for node in nodelist: 40 | if node.nodeType == node.TEXT_NODE or node.nodeType == node.CDATA_SECTION_NODE: 41 | rc.append(node.data) 42 | return ''.join(rc) 43 | 44 | def getFirstElementByTag(node, tag): 45 | if not node: 46 | return None 47 | elems = node.getElementsByTagName(tag) 48 | return (elems or None) and elems[0] 49 | 50 | def getFirstChildByTag(node: Element, tag): 51 | 
child = None 52 | for c in node.childNodes: 53 | if c.nodeName == tag: 54 | child = c 55 | break 56 | return child 57 | 58 | 59 | def getAtomLink(node: Element, rel='self'): 60 | links = [child for child in node.childNodes if child.nodeType == child.ELEMENT_NODE and child.tagName in ['link', 'atom:link']] 61 | 62 | link = links[0] 63 | for l in links: 64 | r = l.attributes.get('rel') 65 | if r and r.value == rel: 66 | link = l 67 | break 68 | return link.attributes['href'].value -------------------------------------------------------------------------------- /history4feed/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for history4feed project. 3 | 4 | Generated by 'django-admin startproject' using Django 5.0.6. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/5.0/ref/settings/ 11 | """ 12 | 13 | import os 14 | from pathlib import Path 15 | from dotenv import load_dotenv 16 | from datetime import UTC, datetime 17 | from textwrap import dedent 18 | 19 | load_dotenv() 20 | 21 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 22 | BASE_DIR = Path(__file__).resolve().parent.parent 23 | 24 | 25 | # Quick-start development settings - unsuitable for production 26 | # See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ 27 | 28 | # SECURITY WARNING: keep the secret key used in production secret! 29 | SECRET_KEY = os.environ.get('DJANGO_SECRET', "insecure_django_secret") 30 | 31 | # SECURITY WARNING: don't run with debug turned on in production! 32 | DEBUG = os.getenv('DJANGO_DEBUG', False) 33 | 34 | ALLOWED_HOSTS = os.getenv('DJANGO_ALLOWED_HOSTS', "localhost 127.0.0.1 [::1]").split() 35 | 36 | CELERY_BROKER_URL = os.environ["CELERY_BROKER_URL"] 37 | 38 | CACHES = { 39 | 'default': { 40 | 'BACKEND': 'django.core.cache.backends.redis.RedisCache', 41 | 'LOCATION': CELERY_BROKER_URL, # Use the appropriate Redis server URL 42 | 'OPTIONS': { 43 | # 'CLIENT_CLASS': 'django.core.cache.backends.redis.RedisCacheClient', 44 | } 45 | } 46 | } 47 | 48 | #CORS_ALLOW_ALL_ORIGINS = os.environ.get('DJANGO_CORS_ALLOW_ALL_ORIGINS', True) 49 | #CORS_ALLOWED_ORIGINS = [os.environ.get('DJANGO_CORS_ALLOWED_ORIGINS', "http://127.0.0.1:8002")] 50 | 51 | # Application definition 52 | 53 | INSTALLED_APPS = [ 54 | 'django.contrib.admin', 55 | 'django.contrib.auth', 56 | 'django.contrib.contenttypes', 57 | 'django.contrib.sessions', 58 | 'django.contrib.messages', 59 | 'django.contrib.staticfiles', 60 | 'rest_framework', 61 | 'drf_spectacular', 62 | 'django.contrib.postgres', 63 | 'history4feed.app', 64 | ] 65 | 66 | MIDDLEWARE = [ 67 | 'django.middleware.security.SecurityMiddleware', 68 | 'whitenoise.middleware.WhiteNoiseMiddleware', 69 | 'django.contrib.sessions.middleware.SessionMiddleware', 70 | 'django.middleware.common.CommonMiddleware', 71 | 'django.middleware.csrf.CsrfViewMiddleware', 72 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 73 | 'django.contrib.messages.middleware.MessageMiddleware', 74 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 75 | ] 76 | 77 | ROOT_URLCONF = 'history4feed.urls' 78 | 79 | TEMPLATES = [ 80 | { 81 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 82 | 'DIRS': [], 83 | 'APP_DIRS': True, 84 | 'OPTIONS': { 85 | 'context_processors': [ 86 | 'django.template.context_processors.debug', 87 | 
'django.template.context_processors.request', 88 | 'django.contrib.auth.context_processors.auth', 89 | 'django.contrib.messages.context_processors.messages', 90 | ], 91 | }, 92 | }, 93 | ] 94 | 95 | WSGI_APPLICATION = 'history4feed.wsgi.application' 96 | 97 | 98 | # Database 99 | # https://docs.djangoproject.com/en/5.0/ref/settings/#databases 100 | 101 | DATABASES = { 102 | 'default': { 103 | 'ENGINE': 'django.db.backends.postgresql', 104 | 'NAME': os.getenv('POSTGRES_DB'), # Database name 105 | 'USER': os.getenv('POSTGRES_USER'), # Database user 106 | 'PASSWORD': os.getenv('POSTGRES_PASSWORD'), # Database password 107 | 'HOST': os.getenv('POSTGRES_HOST'), # PostgreSQL service name in Docker Compose 108 | 'PORT': os.getenv('POSTGRES_PORT'), # PostgreSQL default port 109 | }, 110 | 'sqlite': { 111 | 'ENGINE': 'django.db.backends.sqlite3', 112 | 'NAME': BASE_DIR / 'db.sqlite3', 113 | }, 114 | } 115 | 116 | 117 | # Password validation 118 | # https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators 119 | 120 | AUTH_PASSWORD_VALIDATORS = [ 121 | { 122 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 123 | }, 124 | { 125 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 126 | }, 127 | { 128 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 129 | }, 130 | { 131 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 132 | }, 133 | ] 134 | 135 | 136 | # Internationalization 137 | # https://docs.djangoproject.com/en/5.0/topics/i18n/ 138 | 139 | LANGUAGE_CODE = 'en-us' 140 | 141 | TIME_ZONE = 'UTC' 142 | 143 | USE_I18N = True 144 | 145 | USE_TZ = True 146 | 147 | 148 | # Static files (CSS, JavaScript, Images) 149 | # https://docs.djangoproject.com/en/5.0/howto/static-files/ 150 | 151 | STATIC_URL = 'static/' 152 | STATIC_ROOT = BASE_DIR / "staticfiles" 153 | 154 | # Default primary key field type 155 | # https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field 156 | 157 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' 158 | 159 | 160 | REST_FRAMEWORK = { 161 | # YOUR SETTINGS 162 | 'DEFAULT_SCHEMA_CLASS': 'history4feed.app.autoschema.H4FSchema', 163 | 'DEFAULT_FILTER_BACKENDS': ['django_filters.rest_framework.DjangoFilterBackend'], 164 | 'DEFAULT_AUTHENTICATION_CLASSES': [], 165 | 'EXCEPTION_HANDLER': "dogesec_commons.utils.custom_exception_handler", 166 | 'DATETIME_FORMAT': '%Y-%m-%dT%H:%M:%SZ', 167 | } 168 | 169 | SPECTACULAR_SETTINGS = { 170 | 'TITLE': "history4feed API", 171 | 'DESCRIPTION': dedent(""" 172 | history4feed can be used to create a complete history for a blog and output it as an RSS feed. 
173 | """), 174 | 'VERSION': '1.0.0', 175 | 'CONTACT': { 176 | 'email': 'noreply@dogesec.com', 177 | 'url': 'https://github.com/muchdogesec/history4feed', 178 | }, 179 | 'TAGS': [ 180 | { 181 | "name": "Feeds", 182 | "description": "Subscribe and retrieve Feeds" 183 | }, 184 | { 185 | "name": "Posts", 186 | "description": "Retrieve Posts in Feeds" 187 | }, 188 | { 189 | "name": "Jobs", 190 | "description": "Check the status of data retrieval from Feeds" 191 | }, 192 | ], 193 | 194 | } 195 | 196 | DEFAULT_PAGE_SIZE = int(os.getenv("DEFAULT_PAGE_SIZE", 50)) 197 | MAXIMUM_PAGE_SIZE = int(os.getenv("MAX_PAGE_SIZE", 50)) 198 | 199 | HISTORY4FEED_SETTINGS = { 200 | 'WAYBACK_SLEEP_SECONDS': int(os.getenv("WAYBACK_SLEEP_SECONDS", 20)), 201 | 'EARLIEST_SEARCH_DATE': datetime.strptime(os.environ.get("EARLIEST_SEARCH_DATE", "2024-01-01T00:00:00Z"), "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=UTC), 202 | 'REQUEST_RETRY_COUNT': int(os.getenv("REQUEST_RETRY_COUNT", 3)), 203 | } -------------------------------------------------------------------------------- /history4feed/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL configuration for history4feed project. 3 | 4 | The `urlpatterns` list routes URLs to views. For more information please see: 5 | https://docs.djangoproject.com/en/5.0/topics/http/urls/ 6 | Examples: 7 | Function views 8 | 1. Add an import: from my_app import views 9 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 10 | Class-based views 11 | 1. Add an import: from other_app.views import Home 12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 13 | Including another URLconf 14 | 1. Import the include() function: from django.urls import include, path 15 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 16 | """ 17 | from django.contrib import admin 18 | from django.urls import include, path 19 | from .app import views 20 | from rest_framework import routers 21 | from drf_spectacular.views import SpectacularAPIView, SpectacularRedocView, SpectacularSwaggerView 22 | 23 | 24 | from django.http import JsonResponse 25 | def handler404(*args, **kwargs): 26 | return JsonResponse(dict(code=404, message='non-existent page'), status=404) 27 | 28 | def handler500(*args, **kwargs): 29 | return JsonResponse(dict(code=500, message='internal server error'), status=500) 30 | 31 | 32 | API_VERSION = "v1" 33 | 34 | router = routers.SimpleRouter(use_regex_path=False) 35 | router.register("feeds", views.FeedView, "feed-view") 36 | router.register("feeds//posts", views.FeedPostView, "feed-post-view") 37 | router.register("feeds", views.RSSView, "feed-rss-view") 38 | router.register("posts", views.PostOnlyView, "post-view") 39 | router.register("jobs", views.JobView, "job-view") 40 | 41 | 42 | urlpatterns = [ 43 | path(f'api/{API_VERSION}/', include(router.urls)), 44 | path('admin/', admin.site.urls), 45 | 46 | # YOUR PATTERNS 47 | path('api/schema/', views.SchemaViewCached.as_view(), name='schema'), 48 | # Optional UI: 49 | path('api/schema/swagger-ui/', SpectacularSwaggerView.as_view(url_name='schema'), name='swagger-ui'), 50 | ] 51 | -------------------------------------------------------------------------------- /history4feed/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for history4feed project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "history4feed" 7 | version = "0.0.1-pre" 8 | authors = [ 9 | { name="DOGESEC", email="support@dogesec.com" }, 10 | ] 11 | description = "History4Feed" 12 | readme = "README.md" 13 | requires-python = ">=3.9" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "djangorestframework>=3.15.2", 21 | "drf-spectacular>=0.27.2", 22 | "celery>=5.4.0; python_version >= '3.8'", 23 | "psycopg2-binary>=2.9.10", 24 | "redis", 25 | "brotlipy>=0.7.0", 26 | "lxml-html-clean>=0.4.1", 27 | "fake-useragent>=1.5.1", 28 | "hyperlink", 29 | "django-filter>=24.2", 30 | "dateparser>=1.2.1", 31 | ] 32 | [project.urls] 33 | Homepage = "https://github.com/muchdogesec/history4feed" 34 | Issues = "https://github.com/muchdogesec/history4feed/issues" 35 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -i https://pypi.org/simple 2 | amqp==5.2.0; python_version >= '3.6' 3 | asgiref==3.8.1; python_version >= '3.8' 4 | attrs==23.2.0; python_version >= '3.7' 5 | billiard==4.2.0; python_version >= '3.7' 6 | brotlipy==0.7.0 7 | celery==5.4.0; python_version >= '3.8' 8 | certifi==2025.4.26; python_version >= '3.6' 9 | chardet==5.2.0; python_version >= '3.7' 10 | charset-normalizer==3.3.2; python_full_version >= '3.7.0' 11 | click==8.1.7; python_version >= '3.7' 12 | click-didyoumean==0.3.1; python_full_version >= '3.6.2' 13 | click-plugins==1.1.1 14 | click-repl==0.3.0; python_version >= '3.6' 15 | cssselect==1.2.0; python_version >= '3.7' 16 | django==5.1.7; python_version >= '3.10' 17 | django-filter==24.2; python_version >= '3.8' 18 | djangorestframework==3.15.2; python_version >= '3.6' 19 | drf-spectacular==0.27.2; python_version >= '3.7' 20 | gunicorn==23.0.0; python_version >= '3.7' 21 | idna==3.7; python_version >= '3.5' 22 | inflection==0.5.1; python_version >= '3.5' 23 | 
jsonschema==4.22.0; python_version >= '3.8' 24 | jsonschema-specifications==2023.12.1; python_version >= '3.8' 25 | kombu==5.3.7; python_version >= '3.8' 26 | lxml==5.2.2; python_version >= '3.6' 27 | lxml-html-clean==0.4.1 28 | packaging==24.0; python_version >= '3.7' 29 | prompt-toolkit==3.0.45; python_full_version >= '3.7.0' 30 | psycopg2-binary==2.9.10; python_version >= '3.7' 31 | pycparser==2.22; python_version >= '3.8' 32 | python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' 33 | python-dotenv==1.0.1; python_version >= '3.8' 34 | pyyaml==6.0.1; python_version >= '3.6' 35 | readability-lxml==0.8.1 36 | redis==5.0.4; python_version >= '3.7' 37 | referencing==0.35.1; python_version >= '3.8' 38 | requests==2.32.3; python_version >= '3.8' 39 | six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' 40 | sqlparse==0.5.0; python_version >= '3.8' 41 | tzdata==2024.1; python_version >= '2' 42 | uritemplate==4.1.1; python_version >= '3.6' 43 | urllib3==2.2.2; python_version >= '3.8' 44 | vine==5.1.0; python_version >= '3.6' 45 | wcwidth==0.2.13 46 | fake-useragent==1.5.1 47 | whitenoise==6.7.0 48 | hyperlink==21.0.0 49 | dateparser==1.2.1 50 | stix2arango @ https://github.com/muchdogesec/stix2arango/releases/download/main-2025-02-04-14-14-39/stix2arango-0.0.4rc0-py3-none-any.whl 51 | dogesec_commons @ https://github.com/muchdogesec/dogesec_commons/releases/download/main-2025-05-26-13-08-23/dogesec_commons-0.0.7rc1-py3-none-any.whl -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | python manage.py migrate 2 | #gunicorn history4feed.wsgi:application --bind 0.0.0.0:8002 --reload 3 | python manage.py runserver 0.0.0.0:8002 -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | ## Environment setup 4 | 5 | ```shell 6 | python3 -m venv history4feed-venv && \ 7 | source history4feed-venv/bin/activate && \ 8 | pip3 install -r requirements.txt 9 | ```` 10 | 11 | ## API schema tests 12 | 13 | ```shell 14 | st run --checks all http://127.0.0.1:8002/api/schema --generation-allow-x00 true 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/tests/__init__.py -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv==1.0.1 2 | parameterized==0.9.0 3 | pytest==8.3.4 4 | requests==2.32.2 5 | python-dateutil==2.9.0.post0 6 | pytest-subtests 7 | schemathesis==3.38.7; python_version >= '3.8' -------------------------------------------------------------------------------- /tests/st/.env.schemathesis: -------------------------------------------------------------------------------- 1 | DJANGO_ALLOWED_HOSTS=* 2 | DJANGO_CORS_ALLOW_ALL_ORIGINS=* 3 | DJANGO_CORS_ALLOWED_ORIGINS=* 4 | DJANGO_DEBUG= 5 | DEFAULT_PAGE_SIZE=5000 6 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=1 7 | POSTGRES_HOST=pgdb 8 | POSTGRES_DB=postgres 9 | POSTGRES_USER=postgres 10 | POSTGRES_PASSWORD=postgres 
-------------------------------------------------------------------------------- /tests/st/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/tests/st/__init__.py -------------------------------------------------------------------------------- /tests/st/hooks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import schemathesis, schemathesis.schemas 3 | from schemathesis.specs.openapi.schemas import BaseOpenAPISchema 4 | from schemathesis import Case 5 | from schemathesis.transports.responses import GenericResponse 6 | 7 | @schemathesis.hook 8 | def after_load_schema( 9 | context: schemathesis.hooks.HookContext, 10 | schema: BaseOpenAPISchema, 11 | ) -> None: 12 | 13 | schema.add_link( 14 | source=schema["/api/v1/jobs/"]['GET'], 15 | target=schema["/api/v1/jobs/{job_id}/"]['GET'], 16 | status_code=200, 17 | parameters={"path.job_id": '$response.body#/jobs/0/id'} 18 | ) 19 | for method in ['GET', 'PATCH', 'DELETE']: 20 | schema.add_link( 21 | source=schema['/api/v1/feeds/']['GET'], 22 | target=schema['/api/v1/feeds/{feed_id}/'][method], 23 | status_code=200, 24 | parameters={"path.feed_id": "$response.body#/feeds/0/id"} 25 | ) 26 | 27 | for method in ['GET', 'PATCH', 'DELETE']: 28 | schema.add_link( 29 | source=schema['/api/v1/posts/']['GET'], 30 | target=schema['/api/v1/posts/{post_id}/'][method], 31 | status_code=200, 32 | parameters={"path.post_id": "$response.body#/posts/0/id"} 33 | ) -------------------------------------------------------------------------------- /tests/st/st.py: -------------------------------------------------------------------------------- 1 | from hypothesis.stateful import initialize 2 | import schemathesis 3 | import hooks 4 | 5 | schema = schemathesis.from_uri("http://localhost:8006/api/schema/") 6 | 7 | 8 | 9 | 10 | BaseAPIWorkflow = schema.as_state_machine() 11 | BaseAPIWorkflow.run() -------------------------------------------------------------------------------- /tests/test_01_add_feeds.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from types import SimpleNamespace 4 | import unittest, pytest 5 | from urllib.parse import urljoin 6 | 7 | from tests.utils import remove_unknown_keys, wait_for_jobs 8 | 9 | base_url = os.environ["SERVICE_BASE_URL"] 10 | import requests 11 | 12 | 13 | DATA = [ 14 | { 15 | "id": "d1d96b71-c687-50db-9d2b-d0092d1d163a", 16 | "feed_type": "rss", 17 | "include_remote_blogs": False, 18 | "url": "https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-encoded.xml", 19 | }, 20 | { 21 | "id": "cb0ba709-b841-521a-a3f2-5e1429f4d366", 22 | "feed_type": "atom", 23 | "pretty_url": "https://muchdogesec.github.io/fakeblog123/", 24 | "title": "Custom Title", 25 | "description": "Custom description", 26 | "include_remote_blogs": False, 27 | "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-decoded.xml", 28 | }, 29 | { 30 | "id": "121e5557-7277-5aa3-945d-e466c6bf92d5", 31 | "title": "Custom Title 2", 32 | "feed_type": "atom", 33 | "include_remote_blogs": False, 34 | "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-cdata.xml", 35 | }, 36 | { 37 | "id": "8f89731d-b9de-5931-9182-5460af59ca84", 38 | "description": "Custom description 2", 39 | "feed_type": "rss", 40 | "include_remote_blogs": False, 41 | "url": 
"https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-decoded.xml", 42 | }, 43 | { 44 | "id": "9c04d319-a949-52df-bcb6-5a73a1458fe5", 45 | "feed_type": "atom", 46 | "include_remote_blogs": False, 47 | "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-decoded-partial.xml", 48 | }, 49 | { 50 | "id": "d63dad15-8e23-57eb-80f7-715cedf85f33", # not passed in request 51 | "feed_type": "skeleton", # not passed in request 52 | "pretty_url": "https://muchdogesec.github.io/fakeblog123/about/", 53 | "url": "https://muchdogesec.github.io/fakeblog123/", 54 | "title": "Skeleton custom Title", 55 | "description": "Skeleton custom description" 56 | } 57 | ] 58 | 59 | def all_blog_parameters(): 60 | return [ 61 | pytest.param(k["url"], k, k.get("should_fail", False)) 62 | for k in DATA 63 | ] 64 | 65 | @pytest.mark.parametrize( 66 | ["url", "blog_data", "should_fail"], 67 | all_blog_parameters(), 68 | ) 69 | def test_add_blog(url, blog_data: dict, should_fail): 70 | payload = remove_unknown_keys(blog_data, ["pretty_url", "title", "description", "include_remote_blogs", "url"]) 71 | 72 | endpoint = urljoin(base_url, "api/v1/feeds/") 73 | 74 | if blog_data["feed_type"] == "skeleton": 75 | post_resp = requests.post(urljoin(endpoint, "skeleton/"), json=payload) 76 | else: 77 | post_resp = requests.post(endpoint, json=payload) 78 | 79 | if should_fail: 80 | assert not post_resp.ok, "add feed request expected to fail" 81 | return 82 | 83 | assert post_resp.status_code == 201, f"request failed: {post_resp.text}" 84 | post_resp_data = post_resp.json() 85 | job_id = post_resp_data.get("job_id") 86 | feed_id = post_resp_data["id"] 87 | if job_id: 88 | wait_for_jobs(job_id) 89 | 90 | feed_resp = requests.get(urljoin(base_url, f"api/v1/feeds/{feed_id}/")) 91 | resp_data = feed_resp.json() 92 | 93 | assert resp_data["id"] == blog_data["id"] 94 | 95 | if expected_pretty_url := blog_data.get("pretty_url"): 96 | assert resp_data["pretty_url"] == expected_pretty_url 97 | 98 | if expected_title := blog_data.get("title"): 99 | assert resp_data["title"] == expected_title 100 | 101 | if expected_description := blog_data.get("description"): 102 | assert resp_data["description"] == expected_description 103 | 104 | if expected_feed_type := blog_data.get("feed_type"): 105 | assert resp_data["feed_type"] == expected_feed_type 106 | 107 | if payload.get('use_search_index'): 108 | assert resp_data["feed_type"] == "search_index" 109 | -------------------------------------------------------------------------------- /tests/test_02_add_post.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from types import SimpleNamespace 4 | import unittest, pytest 5 | from urllib.parse import urljoin 6 | 7 | from tests.utils import remove_unknown_keys, wait_for_jobs 8 | 9 | base_url = os.environ["SERVICE_BASE_URL"] 10 | import requests 11 | 12 | def all_posts(): 13 | DATA = [ 14 | { 15 | "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33", 16 | "title": "Example COM", 17 | "id": "223565cd-dd4f-54c2-9bbd-63019f39554f", 18 | "link": "https://example.com/", 19 | "pubdate": "2024-08-11T16:12:03Z", 20 | "author": "test", 21 | "categories": [ 22 | "test", 23 | "test2" 24 | ] 25 | }, 26 | { 27 | "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33", 28 | "title": "Example ORG", 29 | "id": "a378c839-0940-56fb-b52c-e5b78d34ec94", 30 | "link": "https://example.org/", 31 | "pubdate": "2024-03-22T16:11:03Z", 32 | "author": "test", 33 | "categories": [ 34 | "test", 35 | "test2" 36 | ] 
37 | }, 38 | { 39 | "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33", 40 | "title": "Example COM under real", 41 | "id": "223565cd-dd4f-54c2-9bbd-63019f39554f", 42 | "link": "https://example.com/", 43 | "pubdate": "2024-08-11T16:12:03Z", 44 | "author": "test", 45 | "categories": [ 46 | "test", 47 | "test2" 48 | ], 49 | "should_fail": True, #already added 50 | }, 51 | { 52 | "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33", 53 | "title": "Example ORG under real", 54 | "id": "a378c839-0940-56fb-b52c-e5b78d34ec94", 55 | "link": "https://example.org/", 56 | "pubdate": "2024-03-22T16:11:03Z", 57 | "author": "test", 58 | "categories": [ 59 | "test", 60 | "test2" 61 | ], 62 | "should_fail": True, #already added 63 | }, 64 | ] 65 | return [ 66 | [d["feed_id"], d["link"], d, d.get("should_fail")] 67 | for d in DATA 68 | ] 69 | 70 | @pytest.mark.parametrize( 71 | ["feed_id", "post_url", "post_data", "should_fail"], 72 | all_posts() 73 | ) 74 | def test_add_post(feed_id, post_url, post_data, should_fail): 75 | payload = remove_unknown_keys(post_data, ["link", "title", "pubdate", "author", "categories"]) 76 | post_job_resp = requests.post(urljoin(base_url, f"api/v1/feeds/{feed_id}/posts/"), json=dict(posts=[payload])) 77 | 78 | if should_fail: 79 | assert post_job_resp.status_code == 400, "add feed request expected to fail" 80 | return 81 | 82 | assert post_job_resp.status_code == 201, f"request failed: {post_job_resp.text}" 83 | post_job_resp_data = post_job_resp.json() 84 | assert post_job_resp_data["feed_id"] == feed_id, "wrong feed id" 85 | assert len(post_job_resp_data["urls"]["retrieving"]) == 1, "one post expected" 86 | post_id = post_job_resp_data["urls"]["retrieving"][0]["id"] 87 | expected_id = post_data["id"] 88 | assert post_id == expected_id 89 | job_id = post_job_resp_data['id'] 90 | 91 | job_data = wait_for_jobs(job_id) 92 | post_data_resp = requests.get(urljoin(base_url, f"api/v1/posts/{post_id}/")) 93 | post_data_resp_data = post_data_resp.json() 94 | assert post_data_resp_data["title"] == post_data["title"] 95 | assert post_data_resp_data["pubdate"] == post_data["pubdate"] 96 | assert set(post_data_resp_data["categories"]) == set(post_data.get("categories", [])) 97 | -------------------------------------------------------------------------------- /tests/test_03_delete_post.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | import time 5 | from types import SimpleNamespace 6 | import unittest, pytest 7 | from urllib.parse import urljoin 8 | 9 | from tests.utils import remove_unknown_keys, wait_for_jobs 10 | 11 | base_url = os.environ["SERVICE_BASE_URL"] 12 | import requests 13 | 14 | @pytest.mark.parametrize( 15 | ["post_id", "should_fail"], 16 | [ 17 | ["9c04d319-a949-52df-bcb6-5a73a1458fe5", True], #post does not exist 18 | ["4aa844cb-18e6-58cc-bed1-4c22abf3b977", False], 19 | ["4aa844cb-18e6-58cc-bed1-4c22abf3b977", True], #post already deleted 20 | ] 21 | ) 22 | def test_delete_post(post_id, should_fail): 23 | post_url = urljoin(base_url, f"api/v1/posts/{post_id}/") 24 | delete_resp = requests.delete(post_url) 25 | 26 | if should_fail: 27 | assert delete_resp.status_code == 404, f"delete post request expected to fail: {delete_resp.text}" 28 | return 29 | assert delete_resp.status_code == 204, f"unexpected status, body: {delete_resp.text}" 30 | 31 | 32 | get_resp = requests.get(post_url) 33 | assert get_resp.status_code == 404, f"post should already be deleted" 34 | 
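# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original suite): how a client could poll
# the Jobs API that these tests drive through tests.utils.wait_for_jobs. The
# URL follows history4feed/urls.py; "pending"/"running" are the non-terminal
# states named in the API docs, and treating anything else as terminal
# (success, failed, cancelled) is an assumption based on the JobState values
# used in task_helper.py.
def _poll_job_state(job_id, timeout=300, interval=5):
    deadline = time.time() + timeout
    while time.time() < deadline:
        resp = requests.get(urljoin(base_url, f"api/v1/jobs/{job_id}/"))
        resp.raise_for_status()
        job = resp.json()
        if job["state"] not in ("pending", "running"):
            return job
        time.sleep(interval)
    raise TimeoutError(f"job {job_id} did not finish within {timeout} seconds")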
-------------------------------------------------------------------------------- /tests/test_04_delete_feed.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import os 4 | import time 5 | from types import SimpleNamespace 6 | import unittest, pytest 7 | from urllib.parse import urljoin 8 | 9 | from tests.utils import remove_unknown_keys, wait_for_jobs 10 | 11 | base_url = os.environ["SERVICE_BASE_URL"] 12 | import requests 13 | 14 | @pytest.mark.parametrize( 15 | ["feed_id", "should_fail"], 16 | [ 17 | ["c2fe0594-f463-5362-afe7-6950bda94bc6", True], #feed does not exist 18 | ["9c04d319-a949-52df-bcb6-5a73a1458fe5", False], 19 | ["9c04d319-a949-52df-bcb6-5a73a1458fe5", True], #feed already deleted 20 | ] 21 | ) 22 | def test_delete_feed(feed_id, should_fail): 23 | feed_url = urljoin(base_url, f"api/v1/feeds/{feed_id}/") 24 | delete_resp = requests.delete(feed_url) 25 | 26 | if should_fail: 27 | assert delete_resp.status_code == 404, f"delete feed request expected to fail: {delete_resp.text}" 28 | return 29 | assert delete_resp.status_code == 204, f"unexpected status, body: {delete_resp.text}" 30 | 31 | 32 | get_resp = requests.get(feed_url) 33 | assert get_resp.status_code == 404, f"feed should already be deleted" 34 | -------------------------------------------------------------------------------- /tests/test_05_post_filters.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import time 4 | from types import SimpleNamespace 5 | import unittest, pytest 6 | from urllib.parse import urljoin 7 | 8 | from tests.utils import get_post_ids_for_job, is_sorted, remove_unknown_keys, wait_for_jobs 9 | 10 | base_url = os.environ["SERVICE_BASE_URL"] 11 | import requests 12 | 13 | 14 | @pytest.mark.parametrize( 15 | ["filters", "expected_ids"], 16 | [ 17 | [ 18 | dict(feed_id="d1d96b71-c687-50db-9d2b-d0092d1d163a"), 19 | [ 20 | "f8c75694-a834-5e35-b0a3-52034a1d9f6d", 21 | "85a762c9-00f9-5c0c-9858-498883e13ea1", 22 | "29be2407-d5d1-5b47-bbb5-1c51a84d48eb", 23 | "84a8ff1c-c463-5a97-b0c4-93daf7102b5f", 24 | "cfdb68b8-3d80-572d-9350-58baf57eabfb", 25 | "8f16d2be-7b06-5f3c-a851-9cce31b4fec8", 26 | ], 27 | ], # feed does not exist 28 | [ 29 | dict(link="test2/2024/08/07"), 30 | [ 31 | "afef9ebd-2dee-5ab9-be0b-96c2ad83a1bb", 32 | "48310096-d1f3-5e30-9910-5d7d0fd400be", 33 | "d8aa9854-43fc-5816-b7ef-fc93810b29a5", 34 | "f8c75694-a834-5e35-b0a3-52034a1d9f6d", 35 | ], 36 | ], 37 | [ 38 | dict(title="uPdATe this Post"), 39 | [ 40 | "58514345-4e10-54c9-8f2c-d81507088079", 41 | "8c72f15c-abeb-5c90-b239-6429f53696f9", 42 | "8f16d2be-7b06-5f3c-a851-9cce31b4fec8", 43 | "f214c1fd-5370-5dff-bd49-fd74bf32c7fe", 44 | ], 45 | ], 46 | [ 47 | dict(title="example org"), 48 | [ 49 | "a378c839-0940-56fb-b52c-e5b78d34ec94", 50 | ], 51 | ], 52 | [ 53 | dict(description="example domain"), 54 | [ 55 | "223565cd-dd4f-54c2-9bbd-63019f39554f", 56 | "a378c839-0940-56fb-b52c-e5b78d34ec94", 57 | ], 58 | ], 59 | ], 60 | ) 61 | def test_filters_generic(filters: dict, expected_ids: list[str]): 62 | expected_ids = set(expected_ids) 63 | url = urljoin(base_url, "api/v1/posts/") 64 | resp = requests.get(url, params=filters) 65 | resp_data = resp.json() 66 | assert resp_data["total_results_count"] == len(expected_ids) 67 | assert {post["id"] for post in resp_data["posts"]} == expected_ids 68 | 69 | 70 | def random_posts_values(key, count): 71 | url = urljoin(base_url, "api/v1/posts/") 72 | resp = requests.get(url) 73 | data = 
resp.json() 74 | return [post[key] for post in random.choices(data["posts"], k=count)] 75 | 76 | 77 | def more_pubdate_filters(count): 78 | filters = [] 79 | pubdates = random_posts_values("pubdate", 50) 80 | for i in range(count): 81 | mmin = mmax = None 82 | if random.random() > 0.7: 83 | mmax = random.choice(pubdates) 84 | if random.random() < 0.3: 85 | mmin = random.choice(pubdates) 86 | if mmin or mmax: 87 | filters.append([mmin, mmax]) 88 | return filters 89 | 90 | 91 | @pytest.mark.parametrize( 92 | ["pubdate_min", "pubdate_max"], 93 | [ 94 | ["2024-03-22T16:11:03Z", "2024-08-11T16:12:03Z"], 95 | ["2025-03-22T16:11:03Z", "2024-08-11T16:12:03Z"], 96 | ], 97 | ) 98 | def test_pubdate_minmax(pubdate_min, pubdate_max): 99 | filters = {} 100 | if pubdate_min: 101 | filters.update(pubdate_min=pubdate_min) 102 | if pubdate_max: 103 | filters.update(pubdate_max=pubdate_max) 104 | 105 | assert pubdate_max or pubdate_min, "at least one of two filters required" 106 | 107 | url = urljoin(base_url, "api/v1/posts/") 108 | resp = requests.get(url, params=filters) 109 | assert resp.status_code == 200 110 | resp_data = resp.json() 111 | for d in resp_data["posts"]: 112 | if pubdate_min: 113 | assert ( 114 | d["pubdate"] >= pubdate_min 115 | ), "pubdate must not be less than pubdate_min" 116 | if pubdate_max: 117 | assert ( 118 | d["pubdate"] <= pubdate_max 119 | ), "pubdate must not be greater than pubdate_max" 120 | 121 | 122 | @pytest.mark.parametrize( 123 | "updated_after", ["2024-03-22T16:11:03Z", "2030-03-22T16:11:03Z"] 124 | ) 125 | def test_updated_after(updated_after): 126 | assert updated_after, "value cannot be None" 127 | 128 | url = urljoin(base_url, "api/v1/posts/") 129 | resp = requests.get(url, params=dict(pubdate_min=updated_after)) 130 | assert resp.status_code == 200 131 | resp_data = resp.json() 132 | for d in resp_data["posts"]: 133 | assert ( 134 | d["datetime_updated"] >= updated_after 135 | ), "datetime_updated must not be greater than updated_after" 136 | 137 | 138 | def test_extra_updated_after(subtests): 139 | for datetime_updated in random_posts_values("datetime_updated", 12): 140 | with subtests.test( 141 | "randomly_generated updated_after query", updated_after=datetime_updated 142 | ): 143 | test_updated_after(datetime_updated) 144 | 145 | 146 | def test_extra_pubdate_filters(subtests): 147 | for dmin, dmax in more_pubdate_filters(22): 148 | with subtests.test( 149 | "randomly_generated pubdate_* query", pubdate_min=dmin, pubdate_max=dmax 150 | ): 151 | test_pubdate_minmax(dmin, dmax) 152 | 153 | 154 | def test_job_filter(subtests): 155 | def test_job_id_filter(job_id, post_ids): 156 | url = urljoin(base_url, "api/v1/posts/") 157 | resp = requests.get(url, params=dict(job_id=job_id)) 158 | data = resp.json() 159 | for post in data["posts"]: 160 | assert post['id'] in post_ids, "post does not belong to job" 161 | assert data['total_results_count'] == len(post_ids) 162 | 163 | jobs_resp = requests.get(urljoin(base_url, "api/v1/jobs/")) 164 | for job in jobs_resp.json()['jobs']: 165 | with subtests.test("test_job_id_filter", job_id=job['id']): 166 | test_job_id_filter(job['id'], [x[0] for x in get_post_ids_for_job(job)]) 167 | 168 | 169 | @pytest.mark.parametrize( 170 | ["sort_filter", "expected_sort"], 171 | [ 172 | ("", "pubdate_descending"), #default filter 173 | ("pubdate_descending", "pubdate_descending"), 174 | ("pubdate_ascending", "pubdate_ascending"), 175 | ("title_descending", "title_descending"), 176 | ("title_ascending", "title_ascending"), 177 | 
("datetime_updated_descending", "datetime_updated_descending"), 178 | ("datetime_updated_ascending", "datetime_updated_ascending"), 179 | ("datetime_added_descending", "datetime_added_descending"), 180 | ("datetime_added_ascending", "datetime_added_ascending"), 181 | ] 182 | ) 183 | def test_list_posts_sort(sort_filter: str, expected_sort: str): 184 | reports_url = urljoin(base_url, f"api/v1/posts/") 185 | filters = dict(sort=sort_filter) if sort_filter else None 186 | get_resp = requests.get(reports_url, params=filters) 187 | assert get_resp.status_code == 200, f"response: {get_resp.text}" 188 | posts = get_resp.json()["posts"] 189 | property, _, direction = expected_sort.rpartition('_') 190 | def sort_fn(obj): 191 | retval = obj[property] 192 | print(retval) 193 | return retval 194 | assert is_sorted(posts, key=sort_fn, reverse=direction == 'descending'), f"expected posts to be sorted by {property} in {direction} order" 195 | -------------------------------------------------------------------------------- /tests/test_06_patch_feed.py: -------------------------------------------------------------------------------- 1 | from datetime import UTC, datetime 2 | import os 3 | import time 4 | from types import SimpleNamespace 5 | import unittest, pytest 6 | from urllib.parse import urljoin 7 | from dateutil.parser import parse as parse_date 8 | 9 | from tests.utils import remove_unknown_keys, wait_for_jobs 10 | 11 | base_url = os.environ["SERVICE_BASE_URL"] 12 | import requests 13 | @pytest.mark.parametrize( 14 | ["feed_id", "metadata"], 15 | [ 16 | ["d1d96b71-c687-50db-9d2b-d0092d1d163a", dict(title="updated title")], 17 | ["d63dad15-8e23-57eb-80f7-715cedf85f33", dict(title="updated title", description="new description")], 18 | ["d1d96b71-c687-50db-9d2b-d0092d1d163a", dict(pretty_url="https://muchdogesec.github.io/fakeblog123/?added_later=true")], 19 | ] 20 | ) 21 | def test_update_feed_metadata(feed_id, metadata): 22 | resp = requests.patch(urljoin(base_url, f"api/v1/feeds/{feed_id}/"), json=metadata) 23 | assert resp.status_code == 201 24 | resp_data = resp.json() 25 | 26 | if expected_pretty_url := metadata.get("pretty_url"): 27 | assert resp_data["pretty_url"] == expected_pretty_url 28 | 29 | if expected_title := metadata.get("title"): 30 | assert resp_data["title"] == expected_title 31 | 32 | if expected_description := metadata.get("description"): 33 | assert resp_data["description"] == expected_description 34 | 35 | # def test_feed_reindex(feed_id): 36 | # start_time = datetime.now(UTC) 37 | # resp = requests.patch(urljoin(base_url, f"api/v1/feeds/{feed_id}/")) 38 | # assert resp.status_code == 201 39 | # resp_data = resp.json() 40 | -------------------------------------------------------------------------------- /tests/test_07_patch_post.py: -------------------------------------------------------------------------------- 1 | from datetime import UTC, datetime 2 | import os 3 | import time 4 | from types import SimpleNamespace 5 | import unittest, pytest 6 | from urllib.parse import urljoin 7 | from dateutil.parser import parse as parse_date 8 | 9 | from tests.utils import remove_unknown_keys, wait_for_jobs 10 | 11 | base_url = os.environ["SERVICE_BASE_URL"] 12 | import requests 13 | @pytest.mark.parametrize( 14 | ["post_id", "metadata"], 15 | [ 16 | ["58514345-4e10-54c9-8f2c-d81507088079", dict(title="updated post title")], 17 | ["a378c839-0940-56fb-b52c-e5b78d34ec94", dict(title="updated title", author="new post author")], 18 | ["58514345-4e10-54c9-8f2c-d81507088079", 
dict(pubdate="2009-03-04T14:56:07Z")], 19 | ] 20 | ) 21 | def test_update_post_metadata(post_id, metadata): 22 | resp = requests.patch(urljoin(base_url, f"api/v1/posts/{post_id}/"), json=metadata) 23 | assert resp.status_code == 201 24 | resp_data = resp.json() 25 | 26 | if expected_pretty_url := metadata.get("pretty_url"): 27 | assert resp_data["pretty_url"] == expected_pretty_url 28 | 29 | if expected_categories := metadata.get("categories"): 30 | assert resp_data["categories"] == expected_categories 31 | 32 | if expected_author := metadata.get("author"): 33 | assert resp_data["author"] == expected_author 34 | 35 | 36 | if expected_pubdate := metadata.get("pubdate"): 37 | assert resp_data["pubdate"] == expected_pubdate -------------------------------------------------------------------------------- /tests/test_99_delete_all_feeds.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from types import SimpleNamespace 4 | import unittest, pytest 5 | from urllib.parse import urljoin 6 | 7 | base_url = os.environ["SERVICE_BASE_URL"] 8 | import requests 9 | 10 | 11 | def get_all_feeds(): 12 | if not os.getenv('DELETE_ALL_FEEDS'): 13 | return [] 14 | resp = requests.get(urljoin(base_url, "api/v1/feeds/")) 15 | return [[feed["id"]] for feed in resp.json()["feeds"]] 16 | 17 | @pytest.mark.parametrize( 18 | ["feed_id"], 19 | get_all_feeds(), 20 | ) 21 | def test_delete_blog(feed_id): 22 | resp = requests.delete(urljoin(base_url, f"api/v1/feeds/{feed_id}/")) 23 | assert resp.status_code == 204, "unexpected status code" 24 | resp = requests.get(urljoin(base_url, f"api/v1/feeds/{feed_id}/")) 25 | assert resp.status_code == 404, "feed should not exist after deletion" 26 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from itertools import tee 2 | from operator import lt 3 | import os 4 | import time 5 | from types import SimpleNamespace 6 | import unittest, pytest 7 | from urllib.parse import urljoin 8 | 9 | base_url = os.environ["SERVICE_BASE_URL"] 10 | import requests 11 | 12 | 13 | 14 | def remove_unknown_keys(data: dict, known_keys: list): 15 | payload = data.copy() 16 | for k in list(payload.keys()): 17 | if k not in known_keys: 18 | payload.pop(k, None) 19 | return payload 20 | 21 | 22 | def wait_for_jobs(job_id): 23 | try_count = 0 24 | while True: 25 | job_data = requests.get(f"{base_url}/api/v1/jobs/{job_id}/").json() 26 | job_status = job_data["state"] 27 | if job_status in ["success", "failed"]: 28 | assert job_status == "success", f"response: {job_data}" 29 | return job_data 30 | try_count += 1 31 | assert try_count < 30, "stopped after 30 retries" 32 | time.sleep(3) 33 | 34 | 35 | def get_post_ids_for_job(job:dict): 36 | retval = [] 37 | for type, d in job['urls'].items(): 38 | if type == 'skipped': 39 | continue 40 | for p in d: 41 | retval.append((p['id'], type)) 42 | return retval 43 | 44 | 45 | def is_sorted(iterable, key=None, reverse=False): 46 | it = iterable if (key is None) else map(key, iterable) 47 | a, b = tee(it) 48 | next(b, None) 49 | if reverse: 50 | b, a = a, b 51 | return not any(map(lt, b, a)) --------------------------------------------------------------------------------
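The test modules above all follow the same pattern against the history4feed API: create a resource, poll the returned job at `api/v1/jobs/{id}/` until its `state` becomes `success` (as `wait_for_jobs()` in `tests/utils.py` does), then read the resource back and compare fields. The snippet below is a minimal standalone sketch of that add-feed flow, not part of the test suite: it assumes a running history4feed instance reachable via the `SERVICE_BASE_URL` environment variable, reuses the fakeblog123 feed URL from the test data, and keeps only the fields the tests themselves send.

# Illustrative sketch only: add a feed, poll its job, then fetch the stored feed.
# Assumes SERVICE_BASE_URL points at a running history4feed instance.
import os
import time
from urllib.parse import urljoin

import requests

base_url = os.environ["SERVICE_BASE_URL"]


def add_feed_and_wait(feed_url: str, timeout_seconds: int = 90) -> dict:
    # POST the feed, then poll api/v1/jobs/{id}/ until success or failure,
    # mirroring wait_for_jobs() in tests/utils.py.
    resp = requests.post(
        urljoin(base_url, "api/v1/feeds/"),
        json={"url": feed_url, "include_remote_blogs": False},
    )
    resp.raise_for_status()
    feed = resp.json()

    job_id = feed.get("job_id")
    deadline = time.time() + timeout_seconds
    while job_id and time.time() < deadline:
        job = requests.get(urljoin(base_url, f"api/v1/jobs/{job_id}/")).json()
        if job["state"] in ("success", "failed"):
            assert job["state"] == "success", f"job failed: {job}"
            break
        time.sleep(3)

    # Read the feed back the same way test_01_add_feeds.py verifies it.
    return requests.get(urljoin(base_url, f"api/v1/feeds/{feed['id']}/")).json()


if __name__ == "__main__":
    feed = add_feed_and_wait(
        "https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-decoded.xml"
    )
    print(feed["title"], feed["feed_type"])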